Unverified commit f1b19a6b, authored by Minjie Wang and committed by GitHub

[CUDA] Many CUDA operators; Prepare for DGLGraph on CUDA (#1660)

* add cuda utils; change g.to; add g.device

* split array.h into several headers

* cuda index select

* file

* three cuda kernels

* add cuda elementwise arith and several others

* cuda CSRIsNonZero

* fix lint

* lint

* lint

* fix bug in changing ctx to property

* address comments

* remove unused codes

* address comments
parent 42b0c38f
......@@ -12,14 +12,7 @@
#include <utility>
#include "../../c_api_common.h"
#define CHECK_SAME_DTYPE(VAR1, VAR2) \
CHECK(VAR1->dtype == VAR2->dtype) \
<< "Expected " << (#VAR2) << " to be the same type as " << (#VAR1) << "(" \
<< (VAR1)->dtype << ")" \
<< ". But got " << (VAR2)->dtype;
namespace dgl {
namespace aten {
/*!
......@@ -125,8 +118,7 @@ struct PairHash {
}
};
}; // namespace aten
}; // namespace dgl
} // namespace aten
} // namespace dgl
#endif // DGL_ARRAY_CPU_ARRAY_UTILS_H_
......@@ -8,6 +8,7 @@
#include <dgl/array.h>
#include <functional>
#include <algorithm>
namespace dgl {
namespace aten {
......
......@@ -20,8 +20,6 @@ namespace impl {
template <DLDeviceType XPU, typename IdType>
bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) {
CHECK(row >= 0 && row < csr.num_rows) << "Invalid row index: " << row;
CHECK(col >= 0 && col < csr.num_cols) << "Invalid col index: " << col;
const IdType* indptr_data = static_cast<IdType*>(csr.indptr->data);
const IdType* indices_data = static_cast<IdType*>(csr.indices->data);
if (csr.sorted) {
......@@ -43,8 +41,6 @@ template bool CSRIsNonZero<kDLCPU, int64_t>(CSRMatrix, int64_t, int64_t);
template <DLDeviceType XPU, typename IdType>
NDArray CSRIsNonZero(CSRMatrix csr, NDArray row, NDArray col) {
CHECK_SAME_DTYPE(csr.indices, row);
CHECK_SAME_DTYPE(csr.indices, col);
const auto rowlen = row->shape[0];
const auto collen = col->shape[0];
const auto rstlen = std::max(rowlen, collen);
......@@ -90,7 +86,6 @@ template bool CSRHasDuplicate<kDLCPU, int64_t>(CSRMatrix csr);
template <DLDeviceType XPU, typename IdType>
int64_t CSRGetRowNNZ(CSRMatrix csr, int64_t row) {
CHECK(row >= 0 && row < csr.num_rows) << "Invalid row index: " << row;
const IdType* indptr_data = static_cast<IdType*>(csr.indptr->data);
return indptr_data[row + 1] - indptr_data[row];
}
......@@ -120,7 +115,6 @@ template NDArray CSRGetRowNNZ<kDLCPU, int64_t>(CSRMatrix, NDArray);
template <DLDeviceType XPU, typename IdType>
NDArray CSRGetRowColumnIndices(CSRMatrix csr, int64_t row) {
CHECK(row >= 0 && row < csr.num_rows) << "Invalid row index: " << row;
const int64_t len = impl::CSRGetRowNNZ<XPU, IdType>(csr, row);
const IdType* indptr_data = static_cast<IdType*>(csr.indptr->data);
const int64_t offset = indptr_data[row] * sizeof(IdType);
......@@ -134,7 +128,6 @@ template NDArray CSRGetRowColumnIndices<kDLCPU, int64_t>(CSRMatrix, int64_t);
template <DLDeviceType XPU, typename IdType>
NDArray CSRGetRowData(CSRMatrix csr, int64_t row) {
CHECK(row >= 0 && row < csr.num_rows) << "Invalid row index: " << row;
const int64_t len = impl::CSRGetRowNNZ<XPU, IdType>(csr, row);
const IdType* indptr_data = static_cast<IdType*>(csr.indptr->data);
const int64_t offset = indptr_data[row] * sizeof(IdType);
......@@ -172,8 +165,6 @@ void CollectDataFromSorted(const IdType *indices_data, const IdType *data,
template <DLDeviceType XPU, typename IdType>
NDArray CSRGetData(CSRMatrix csr, int64_t row, int64_t col) {
CHECK(row >= 0 && row < csr.num_rows) << "Invalid row index: " << row;
CHECK(col >= 0 && col < csr.num_cols) << "Invalid col index: " << col;
std::vector<IdType> ret_vec;
const IdType* indptr_data = static_cast<IdType*>(csr.indptr->data);
const IdType* indices_data = static_cast<IdType*>(csr.indices->data);
......@@ -197,8 +188,6 @@ template NDArray CSRGetData<kDLCPU, int64_t>(CSRMatrix, int64_t, int64_t);
template <DLDeviceType XPU, typename IdType>
NDArray CSRGetData(CSRMatrix csr, NDArray rows, NDArray cols) {
CHECK_SAME_DTYPE(csr.indices, rows);
CHECK_SAME_DTYPE(csr.indices, cols);
const int64_t rowlen = rows->shape[0];
const int64_t collen = cols->shape[0];
......@@ -266,8 +255,6 @@ void CollectDataIndicesFromSorted(const IdType *indices_data, const IdType *data
template <DLDeviceType XPU, typename IdType>
std::vector<NDArray> CSRGetDataAndIndices(CSRMatrix csr, NDArray rows, NDArray cols) {
CHECK_SAME_DTYPE(csr.indices, rows);
CHECK_SAME_DTYPE(csr.indices, cols);
// TODO(minjie): more efficient implementation for matrix without duplicate entries
const int64_t rowlen = rows->shape[0];
const int64_t collen = cols->shape[0];
......
/*!
* Copyright (c) 2019 by Contributors
* \file array/cuda/array_index_select.cu
* \brief Array index select GPU implementation
*/
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "../../cuda_utils.h"
namespace dgl {
using runtime::NDArray;
namespace aten {
namespace impl {
template <typename DType, typename IdType>
__global__ void _IndexSelectKernel(const DType* array, const IdType* index,
int64_t length, DType* out) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
int stride_x = gridDim.x * blockDim.x;
while (tx < length) {
out[tx] = array[index[tx]];
tx += stride_x;
}
}
template<DLDeviceType XPU, typename DType, typename IdType>
NDArray IndexSelect(NDArray array, IdArray index) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
const DType* array_data = static_cast<DType*>(array->data);
const IdType* idx_data = static_cast<IdType*>(index->data);
const int64_t arr_len = array->shape[0];
const int64_t len = index->shape[0];
NDArray ret = NDArray::Empty({len}, array->dtype, array->ctx);
if (len == 0)
return ret;
DType* ret_data = static_cast<DType*>(ret->data);
const int nt = cuda::FindNumThreads(len);
const int nb = (len + nt - 1) / nt;
_IndexSelectKernel<<<nb, nt, 0, thr_entry->stream>>>(array_data, idx_data, len, ret_data);
return ret;
}
template NDArray IndexSelect<kDLGPU, int32_t, int32_t>(NDArray, IdArray);
template NDArray IndexSelect<kDLGPU, int32_t, int64_t>(NDArray, IdArray);
template NDArray IndexSelect<kDLGPU, int64_t, int32_t>(NDArray, IdArray);
template NDArray IndexSelect<kDLGPU, int64_t, int64_t>(NDArray, IdArray);
template NDArray IndexSelect<kDLGPU, float, int32_t>(NDArray, IdArray);
template NDArray IndexSelect<kDLGPU, float, int64_t>(NDArray, IdArray);
template NDArray IndexSelect<kDLGPU, double, int32_t>(NDArray, IdArray);
template NDArray IndexSelect<kDLGPU, double, int64_t>(NDArray, IdArray);
template <DLDeviceType XPU, typename DType>
DType IndexSelect(NDArray array, uint64_t index) {
auto device = runtime::DeviceAPI::Get(array->ctx);
DType ret = 0;
device->CopyDataFromTo(
static_cast<DType*>(array->data) + index, 0, &ret, 0,
sizeof(DType), array->ctx, DLContext{kDLCPU, 0},
array->dtype, nullptr);
return ret;
}
template int32_t IndexSelect<kDLGPU, int32_t>(NDArray array, uint64_t index);
template int64_t IndexSelect<kDLGPU, int64_t>(NDArray array, uint64_t index);
template uint32_t IndexSelect<kDLGPU, uint32_t>(NDArray array, uint64_t index);
template uint64_t IndexSelect<kDLGPU, uint64_t>(NDArray array, uint64_t index);
template float IndexSelect<kDLGPU, float>(NDArray array, uint64_t index);
template double IndexSelect<kDLGPU, double>(NDArray array, uint64_t index);
} // namespace impl
} // namespace aten
} // namespace dgl
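For orientation, the two overloads above are a device-side gather and a single-element device-to-host read. Below is a minimal sequential sketch of the gather that _IndexSelectKernel parallelizes with its grid-stride loop; the helper name is illustrative and not part of this patch.
template <typename DType, typename IdType>
void IndexSelectReference(const DType* array, const IdType* index,
                          int64_t length, DType* out) {
  // out[i] = array[index[i]] for every i in [0, length); in the kernel each
  // CUDA thread covers the positions i, i + stride_x, i + 2 * stride_x, ...
  for (int64_t i = 0; i < length; ++i)
    out[i] = array[index[i]];
}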
/*!
* Copyright (c) 2019 by Contributors
* Copyright (c) 2020 by Contributors
* \file array/cuda/array_op_impl.cu
* \brief Array operator GPU implementation
*/
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "../../cuda_utils.h"
#include "../arith.h"
namespace dgl {
using runtime::NDArray;
namespace aten {
namespace impl {
int FindNumThreads(int dim, int max_nthrs) {
int ret = max_nthrs;
while (ret > dim) {
ret = ret >> 1;
}
return ret;
}
///////////////////////////// BinaryElewise /////////////////////////////
template <typename IdType, typename Op>
__global__ void _BinaryElewiseKernel(
const IdType* lhs, const IdType* rhs, IdType* out, int64_t length) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
int stride_x = gridDim.x * blockDim.x;
while (tx < length) {
out[tx] = Op::Call(lhs[tx], rhs[tx]);
tx += stride_x;
}
}
template <DLDeviceType XPU, typename IdType, typename Op>
IdArray BinaryElewise(IdArray lhs, IdArray rhs) {
const int64_t len = lhs->shape[0];
IdArray ret = NewIdArray(lhs->shape[0], lhs->ctx, lhs->dtype.bits);
const IdType* lhs_data = static_cast<IdType*>(lhs->data);
const IdType* rhs_data = static_cast<IdType*>(rhs->data);
IdType* ret_data = static_cast<IdType*>(ret->data);
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = cuda::FindNumThreads(len);
int nb = (len + nt - 1) / nt;
_BinaryElewiseKernel<IdType, Op><<<nb, nt, 0, thr_entry->stream>>>(
lhs_data, rhs_data, ret_data, len);
return ret;
}
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Add>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Sub>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Mul>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Div>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::GT>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::LT>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::GE>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::LE>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::EQ>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::NE>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Add>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Sub>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Mul>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Div>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::GT>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::LT>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::GE>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::LE>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::EQ>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::NE>(IdArray lhs, IdArray rhs);
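The Op template parameter is one of the functors from ../arith.h (Add, Sub, GT, ...). A hypothetical functor in that style, shown only to illustrate the static Call() contract the kernels rely on (this is not the actual arith.h definition):
struct ExampleAdd {
  template <typename T>
  static __device__ __forceinline__ T Call(T lhs, T rhs) {
    // with such an Op, _BinaryElewiseKernel computes out[i] = Op::Call(lhs[i], rhs[i])
    return lhs + rhs;
  }
};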
template <typename IdType, typename Op>
__global__ void _BinaryElewiseKernel(
const IdType* lhs, IdType rhs, IdType* out, int64_t length) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
int stride_x = gridDim.x * blockDim.x;
while (tx < length) {
out[tx] = Op::Call(lhs[tx], rhs);
tx += stride_x;
}
}
template <DLDeviceType XPU, typename IdType, typename Op>
IdArray BinaryElewise(IdArray lhs, IdType rhs) {
const int64_t len = lhs->shape[0];
IdArray ret = NewIdArray(lhs->shape[0], lhs->ctx, lhs->dtype.bits);
const IdType* lhs_data = static_cast<IdType*>(lhs->data);
IdType* ret_data = static_cast<IdType*>(ret->data);
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = cuda::FindNumThreads(len);
int nb = (len + nt - 1) / nt;
_BinaryElewiseKernel<IdType, Op><<<nb, nt, 0, thr_entry->stream>>>(
lhs_data, rhs, ret_data, len);
return ret;
}
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Add>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Sub>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Mul>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Div>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::GT>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::LT>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::GE>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::LE>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::EQ>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::NE>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Add>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Sub>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Mul>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Div>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::GT>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::LT>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::GE>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::LE>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::EQ>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::NE>(IdArray lhs, int64_t rhs);
template <typename IdType, typename Op>
__global__ void _BinaryElewiseKernel(
IdType lhs, const IdType* rhs, IdType* out, int64_t length) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
int stride_x = gridDim.x * blockDim.x;
while (tx < length) {
out[tx] = Op::Call(lhs, rhs[tx]);
tx += stride_x;
}
}
template <DLDeviceType XPU, typename IdType, typename Op>
IdArray BinaryElewise(IdType lhs, IdArray rhs) {
const int64_t len = rhs->shape[0];
IdArray ret = NewIdArray(rhs->shape[0], rhs->ctx, rhs->dtype.bits);
const IdType* rhs_data = static_cast<IdType*>(rhs->data);
IdType* ret_data = static_cast<IdType*>(ret->data);
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = cuda::FindNumThreads(len);
int nb = (len + nt - 1) / nt;
_BinaryElewiseKernel<IdType, Op><<<nb, nt, 0, thr_entry->stream>>>(
lhs, rhs_data, ret_data, len);
return ret;
}
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Add>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Sub>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Mul>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Div>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::GT>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::LT>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::GE>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::LE>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::EQ>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::NE>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Add>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Sub>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Mul>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Div>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::GT>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::LT>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::GE>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::LE>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::EQ>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::NE>(int64_t lhs, IdArray rhs);
template <typename IdType, typename Op>
__global__ void _UnaryElewiseKernel(
const IdType* lhs, IdType* out, int64_t length) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
int stride_x = gridDim.x * blockDim.x;
while (tx < length) {
out[tx] = Op::Call(lhs[tx]);
tx += stride_x;
}
}
template <DLDeviceType XPU, typename IdType, typename Op>
IdArray UnaryElewise(IdArray lhs) {
const int64_t len = lhs->shape[0];
IdArray ret = NewIdArray(lhs->shape[0], lhs->ctx, lhs->dtype.bits);
const IdType* lhs_data = static_cast<IdType*>(lhs->data);
IdType* ret_data = static_cast<IdType*>(ret->data);
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = cuda::FindNumThreads(len);
int nb = (len + nt - 1) / nt;
_UnaryElewiseKernel<IdType, Op><<<nb, nt, 0, thr_entry->stream>>>(
lhs_data, ret_data, len);
return ret;
}
template IdArray UnaryElewise<kDLGPU, int32_t, arith::Neg>(IdArray lhs);
template IdArray UnaryElewise<kDLGPU, int64_t, arith::Neg>(IdArray lhs);
///////////////////////////// Full /////////////////////////////
template <typename IdType>
__global__ void _FullKernel(
IdType* out, int64_t length, IdType val) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
int stride_x = gridDim.x * blockDim.x;
while (tx < length) {
out[tx] = val;
tx += stride_x;
}
}
template <DLDeviceType XPU, typename IdType>
IdArray Full(IdType val, int64_t length, DLContext ctx) {
IdArray ret = NewIdArray(length, ctx, sizeof(IdType) * 8);
IdType* ret_data = static_cast<IdType*>(ret->data);
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = cuda::FindNumThreads(length);
int nb = (length + nt - 1) / nt;
_FullKernel<IdType><<<nb, nt, 0, thr_entry->stream>>>(ret_data, length, val);
return ret;
}
template IdArray Full<kDLGPU, int32_t>(int32_t val, int64_t length, DLContext ctx);
template IdArray Full<kDLGPU, int64_t>(int64_t val, int64_t length, DLContext ctx);
///////////////////////////// Range /////////////////////////////
......@@ -41,7 +234,7 @@ IdArray Range(IdType low, IdType high, DLContext ctx) {
return ret;
IdType* ret_data = static_cast<IdType*>(ret->data);
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = FindNumThreads(length, 1024);
int nt = cuda::FindNumThreads(length);
int nb = (length + nt - 1) / nt;
_RangeKernel<IdType><<<nb, nt, 0, thr_entry->stream>>>(ret_data, low, length);
return ret;
......@@ -68,7 +261,7 @@ IdArray AsNumBits(IdArray arr, uint8_t bits) {
IdArray ret = IdArray::Empty(shape, DLDataType{kDLInt, bits, 1}, arr->ctx);
const int64_t length = ret.NumElements();
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
int nt = FindNumThreads(length, 1024);
int nt = cuda::FindNumThreads(length);
int nb = (length + nt - 1) / nt;
if (bits == 32) {
_CastKernel<IdType, int32_t><<<nb, nt, 0, thr_entry->stream>>>(
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/spmat_op_impl.cu
* \brief Sparse matrix operator GPU implementation
*/
#include <dgl/array.h>
#include <vector>
#include <unordered_set>
#include <numeric>
#include "../../runtime/cuda/cuda_common.h"
#include "../../cuda_utils.h"
namespace dgl {
using runtime::NDArray;
namespace aten {
namespace impl {
///////////////////////////// CSRIsNonZero /////////////////////////////
/*!
* \brief Search adjacency list linearly for each (row, col) pair and
* write the matched position in the indices array to the output.
*
* If there is no match, -1 is written.
* If there are multiple matches, only the first match is written.
*/
template <typename IdType>
__global__ void _LinearSearchKernel(
const IdType* indptr, const IdType* indices,
const IdType* row, const IdType* col,
int64_t row_stride, int64_t col_stride,
int64_t length, IdType* out) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
int stride_x = gridDim.x * blockDim.x;
int64_t rpos = tx * row_stride, cpos = tx * col_stride;
while (tx < length) {
out[tx] = -1;
const IdType r = row[rpos], c = col[cpos];
for (IdType i = indptr[r]; i < indptr[r + 1]; ++i) {
if (indices[i] == c) {
out[tx] = i;
break;
}
}
rpos += stride_x * row_stride;
cpos += stride_x * col_stride;
tx += stride_x;
}
}
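A sequential reference for the semantics documented above, where a stride of 0 broadcasts a length-1 row or col array across all queries; this helper is illustrative only and not part of the patch.
template <typename IdType>
void LinearSearchReference(const IdType* indptr, const IdType* indices,
                           const IdType* row, const IdType* col,
                           int64_t row_stride, int64_t col_stride,
                           int64_t length, IdType* out) {
  for (int64_t i = 0; i < length; ++i) {
    const IdType r = row[i * row_stride], c = col[i * col_stride];
    out[i] = -1;  // default when (r, c) has no entry
    for (IdType k = indptr[r]; k < indptr[r + 1]; ++k) {
      if (indices[k] == c) { out[i] = k; break; }  // first match wins
    }
  }
}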
template <DLDeviceType XPU, typename IdType>
bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
const auto& ctx = csr.indptr->ctx;
IdArray rows = aten::VecToIdArray<int64_t>({row}, sizeof(IdType) * 8, ctx);
IdArray cols = aten::VecToIdArray<int64_t>({col}, sizeof(IdType) * 8, ctx);
rows = rows.CopyTo(ctx);
cols = cols.CopyTo(ctx);
IdArray out = aten::NewIdArray(1, ctx, sizeof(IdType) * 8);
// TODO(minjie): use binary search for sorted csr
_LinearSearchKernel<<<1, 1, 0, thr_entry->stream>>>(
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
rows.Ptr<IdType>(), cols.Ptr<IdType>(),
1, 1, 1,
out.Ptr<IdType>());
out = out.CopyTo(DLContext{kDLCPU, 0});
return *out.Ptr<IdType>() != -1;
}
template bool CSRIsNonZero<kDLGPU, int32_t>(CSRMatrix, int64_t, int64_t);
template bool CSRIsNonZero<kDLGPU, int64_t>(CSRMatrix, int64_t, int64_t);
template <DLDeviceType XPU, typename IdType>
NDArray CSRIsNonZero(CSRMatrix csr, NDArray row, NDArray col) {
const auto rowlen = row->shape[0];
const auto collen = col->shape[0];
const auto rstlen = std::max(rowlen, collen);
NDArray rst = NDArray::Empty({rstlen}, row->dtype, row->ctx);
if (rstlen == 0)
return rst;
const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
const int nt = cuda::FindNumThreads(rstlen);
const int nb = (rstlen + nt - 1) / nt;
// TODO(minjie): use binary search for sorted csr
_LinearSearchKernel<<<nb, nt, 0, thr_entry->stream>>>(
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
row.Ptr<IdType>(), col.Ptr<IdType>(),
row_stride, col_stride, rstlen,
rst.Ptr<IdType>());
return rst != -1;
}
template NDArray CSRIsNonZero<kDLGPU, int32_t>(CSRMatrix, NDArray, NDArray);
template NDArray CSRIsNonZero<kDLGPU, int64_t>(CSRMatrix, NDArray, NDArray);
///////////////////////////// CSRGetRowNNZ /////////////////////////////
template <DLDeviceType XPU, typename IdType>
int64_t CSRGetRowNNZ(CSRMatrix csr, int64_t row) {
const IdType cur = aten::IndexSelect<IdType>(csr.indptr, row);
const IdType next = aten::IndexSelect<IdType>(csr.indptr, row + 1);
return next - cur;
}
template int64_t CSRGetRowNNZ<kDLGPU, int32_t>(CSRMatrix, int64_t);
template int64_t CSRGetRowNNZ<kDLGPU, int64_t>(CSRMatrix, int64_t);
template <typename IdType>
__global__ void _CSRGetRowNNZKernel(
const IdType* vid,
const IdType* indptr,
IdType* out,
int64_t length) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
int stride_x = gridDim.x * blockDim.x;
while (tx < length) {
const IdType vv = vid[tx];
out[tx] = indptr[vv + 1] - indptr[vv];
tx += stride_x;
}
}
template <DLDeviceType XPU, typename IdType>
NDArray CSRGetRowNNZ(CSRMatrix csr, NDArray rows) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
const auto len = rows->shape[0];
const IdType* vid_data = static_cast<IdType*>(rows->data);
const IdType* indptr_data = static_cast<IdType*>(csr.indptr->data);
NDArray rst = NDArray::Empty({len}, rows->dtype, rows->ctx);
IdType* rst_data = static_cast<IdType*>(rst->data);
const int nt = cuda::FindNumThreads(len);
const int nb = (len + nt - 1) / nt;
_CSRGetRowNNZKernel<<<nb, nt, 0, thr_entry->stream>>>(
vid_data, indptr_data, rst_data, len);
return rst;
}
template NDArray CSRGetRowNNZ<kDLGPU, int32_t>(CSRMatrix, NDArray);
template NDArray CSRGetRowNNZ<kDLGPU, int64_t>(CSRMatrix, NDArray);
///////////////////////////// CSRGetRowColumnIndices /////////////////////////////
template <DLDeviceType XPU, typename IdType>
NDArray CSRGetRowColumnIndices(CSRMatrix csr, int64_t row) {
const int64_t len = impl::CSRGetRowNNZ<XPU, IdType>(csr, row);
const int64_t offset = aten::IndexSelect<IdType>(csr.indptr, row) * sizeof(IdType);
return csr.indices.CreateView({len}, csr.indices->dtype, offset);
}
template NDArray CSRGetRowColumnIndices<kDLGPU, int32_t>(CSRMatrix, int64_t);
template NDArray CSRGetRowColumnIndices<kDLGPU, int64_t>(CSRMatrix, int64_t);
///////////////////////////// CSRGetRowData /////////////////////////////
template <DLDeviceType XPU, typename IdType>
NDArray CSRGetRowData(CSRMatrix csr, int64_t row) {
const int64_t len = impl::CSRGetRowNNZ<XPU, IdType>(csr, row);
const IdType start = aten::IndexSelect<IdType>(csr.indptr, row);
if (aten::CSRHasData(csr))
return csr.data.CreateView({len}, csr.data->dtype, start * sizeof(IdType));
else
return aten::Range(start, start + len, csr.indptr->dtype.bits, csr.indptr->ctx);
}
template NDArray CSRGetRowData<kDLGPU, int32_t>(CSRMatrix, int64_t);
template NDArray CSRGetRowData<kDLGPU, int64_t>(CSRMatrix, int64_t);
} // namespace impl
} // namespace aten
} // namespace dgl
/*!
* Copyright (c) 2020 by Contributors
* \file cuda_utils.h
* \brief Utilities for CUDA kernels.
*/
#ifndef DGL_CUDA_UTILS_H_
#define DGL_CUDA_UTILS_H_
#include <dmlc/logging.h>
namespace dgl {
namespace cuda {
#define CUDA_MAX_NUM_BLOCKS_X 0x7FFFFFFF
#define CUDA_MAX_NUM_BLOCKS_Y 0xFFFF
#define CUDA_MAX_NUM_BLOCKS_Z 0xFFFF
#define CUDA_MAX_NUM_THREADS 1024
/*! \brief Calculate the number of threads needed given the dimension length.
*
* It finds the largest power of two that does not exceed min(dim, max_nthrs),
* assuming max_nthrs is itself a power of two.
*/
inline int FindNumThreads(int dim, int max_nthrs = CUDA_MAX_NUM_THREADS) {
CHECK_NE(dim, 0);
int ret = max_nthrs;
while (ret > dim) {
ret = ret >> 1;
}
return ret;
}
} // namespace cuda
} // namespace dgl
#endif // DGL_CUDA_UTILS_H_
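As a usage sketch, the launch configurations in the .cu files above all follow the same pattern: a power-of-two block size from FindNumThreads and enough blocks to cover the array. The values in the comments assume a hypothetical length of 1000 elements.
const int64_t len = 1000;                        // hypothetical array length
const int nt = dgl::cuda::FindNumThreads(len);   // 512: largest power of two <= min(1000, 1024)
const int nb = (len + nt - 1) / nt;              // 2 blocks cover all 1000 elements
// _SomeKernel<<<nb, nt, 0, stream>>>(...);      // as in IndexSelect, BinaryElewise, etc.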
......@@ -753,10 +753,13 @@ def test_flatten(index_dtype):
@unittest.skipIf(F._default_context_str == 'cpu', reason="Need gpu for this test")
@parametrize_dtype
def test_to_device(index_dtype):
hg = create_test_heterograph(index_dtype)
g = create_test_heterograph(index_dtype)
g.nodes['user'].data['h'] = F.copy_to(F.ones((3, 5)), F.cpu())
g.nodes['game'].data['i'] = F.copy_to(F.ones((2, 5)), F.cpu())
g.edges['plays'].data['e'] = F.copy_to(F.ones((4, 4)), F.cpu())
if F.is_cuda_available():
hg = hg.to(F.cuda())
assert hg is not None
g1 = g.to(F.cuda())
assert g1 is not None
@parametrize_dtype
def test_convert_bound(index_dtype):
......
......@@ -86,59 +86,106 @@ TEST(ArrayTest, TestAsNumBits) {
};
template <typename IDX>
void _TestArith() {
void _TestArith(DLContext ctx) {
const int N = 100;
IdArray a = aten::Full(-10, N, sizeof(IDX)*8, CTX);
IdArray b = aten::Full(7, N, sizeof(IDX)*8, CTX);
IdArray a = aten::Full(-10, N, sizeof(IDX)*8, ctx);
IdArray b = aten::Full(7, N, sizeof(IDX)*8, ctx);
IdArray c = aten::Add(a, b);
IdArray c = a + b;
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], -3);
c = aten::Sub(a, b);
c = a - b;
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], -17);
c = aten::Mul(a, b);
c = a * b;
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], -70);
c = aten::Div(a, b);
c = a / b;
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], -1);
c = -a;
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], 10);
const int val = -3;
c = aten::Add(a, val);
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], -13);
c = aten::Sub(a, val);
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], -7);
c = aten::Mul(a, val);
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], 30);
c = aten::Div(a, val);
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], 3);
c = aten::Add(val, b);
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], 4);
c = aten::Sub(val, b);
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], -10);
c = aten::Mul(val, b);
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], -21);
c = aten::Div(val, b);
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], 0);
a = aten::Range(0, N, sizeof(IDX)*8, CTX);
c = aten::LT(a, 50);
a = aten::Range(0, N, sizeof(IDX)*8, ctx);
c = a < 50;
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], (int)(i < 50));
c = a > 50;
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], (int)(i > 50));
c = a >= 50;
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], (int)(i >= 50));
c = a <= 50;
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], (int)(i <= 50));
c = a == 50;
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], (int)(i == 50));
c = a != 50;
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], (int)(i != 50));
}
TEST(ArrayTest, TestArith) {
_TestArith<int32_t>();
_TestArith<int64_t>();
_TestArith<int32_t>(CPU);
_TestArith<int64_t>(CPU);
#ifdef DGL_USE_CUDA
_TestArith<int32_t>(GPU);
_TestArith<int64_t>(GPU);
#endif
};
template <typename IDX>
......@@ -158,17 +205,21 @@ TEST(ArrayTest, TestHStack) {
}
template <typename IDX>
void _TestIndexSelect() {
IdArray a = aten::Range(0, 100, sizeof(IDX)*8, CTX);
void _TestIndexSelect(DLContext ctx) {
IdArray a = aten::Range(0, 100, sizeof(IDX)*8, ctx);
ASSERT_EQ(aten::IndexSelect<int>(a, 50), 50);
IdArray b = aten::VecToIdArray(std::vector<IDX>({0, 20, 10}), sizeof(IDX)*8, CTX);
IdArray b = aten::VecToIdArray(std::vector<IDX>({0, 20, 10}), sizeof(IDX)*8, ctx);
IdArray c = aten::IndexSelect(a, b);
ASSERT_TRUE(ArrayEQ<IDX>(b, c));
}
TEST(ArrayTest, TestIndexSelect) {
_TestIndexSelect<int32_t>();
_TestIndexSelect<int64_t>();
_TestIndexSelect<int32_t>(CPU);
_TestIndexSelect<int64_t>(CPU);
#ifdef DGL_USE_CUDA
_TestIndexSelect<int32_t>(GPU);
_TestIndexSelect<int64_t>(GPU);
#endif
}
template <typename IDX>
......
......@@ -115,74 +115,90 @@ aten::COOMatrix COO3(DLContext ctx) {
} // namespace
template <typename IDX>
void _TestCSRIsNonZero() {
auto csr = CSR1<IDX>();
void _TestCSRIsNonZero(DLContext ctx) {
auto csr = CSR1<IDX>(ctx);
ASSERT_TRUE(aten::CSRIsNonZero(csr, 0, 1));
ASSERT_FALSE(aten::CSRIsNonZero(csr, 0, 0));
IdArray r = aten::VecToIdArray(std::vector<IDX>({2, 2, 0, 0}), sizeof(IDX)*8, CTX);
IdArray c = aten::VecToIdArray(std::vector<IDX>({1, 1, 1, 3}), sizeof(IDX)*8, CTX);
IdArray r = aten::VecToIdArray(std::vector<IDX>({2, 2, 0, 0}), sizeof(IDX)*8, ctx);
IdArray c = aten::VecToIdArray(std::vector<IDX>({1, 1, 1, 3}), sizeof(IDX)*8, ctx);
IdArray x = aten::CSRIsNonZero(csr, r, c);
IdArray tx = aten::VecToIdArray(std::vector<IDX>({0, 0, 1, 0}), sizeof(IDX)*8, CTX);
IdArray tx = aten::VecToIdArray(std::vector<IDX>({0, 0, 1, 0}), sizeof(IDX)*8, ctx);
ASSERT_TRUE(ArrayEQ<IDX>(x, tx));
}
TEST(SpmatTest, TestCSRIsNonZero) {
_TestCSRIsNonZero<int32_t>();
_TestCSRIsNonZero<int64_t>();
_TestCSRIsNonZero<int32_t>(CPU);
_TestCSRIsNonZero<int64_t>(CPU);
#ifdef DGL_USE_CUDA
_TestCSRIsNonZero<int32_t>(GPU);
_TestCSRIsNonZero<int64_t>(GPU);
#endif
}
template <typename IDX>
void _TestCSRGetRowNNZ() {
auto csr = CSR2<IDX>();
void _TestCSRGetRowNNZ(DLContext ctx) {
auto csr = CSR2<IDX>(ctx);
ASSERT_EQ(aten::CSRGetRowNNZ(csr, 0), 3);
ASSERT_EQ(aten::CSRGetRowNNZ(csr, 3), 0);
IdArray r = aten::VecToIdArray(std::vector<IDX>({0, 3}), sizeof(IDX)*8, CTX);
IdArray r = aten::VecToIdArray(std::vector<IDX>({0, 3}), sizeof(IDX)*8, ctx);
IdArray x = aten::CSRGetRowNNZ(csr, r);
IdArray tx = aten::VecToIdArray(std::vector<IDX>({3, 0}), sizeof(IDX)*8, CTX);
IdArray tx = aten::VecToIdArray(std::vector<IDX>({3, 0}), sizeof(IDX)*8, ctx);
ASSERT_TRUE(ArrayEQ<IDX>(x, tx));
}
TEST(SpmatTest, TestCSRGetRowNNZ) {
_TestCSRGetRowNNZ<int32_t>();
_TestCSRGetRowNNZ<int64_t>();
_TestCSRGetRowNNZ<int32_t>(CPU);
_TestCSRGetRowNNZ<int64_t>(CPU);
#ifdef DGL_USE_CUDA
_TestCSRGetRowNNZ<int32_t>(GPU);
_TestCSRGetRowNNZ<int64_t>(GPU);
#endif
}
template <typename IDX>
void _TestCSRGetRowColumnIndices() {
auto csr = CSR2<IDX>();
void _TestCSRGetRowColumnIndices(DLContext ctx) {
auto csr = CSR2<IDX>(ctx);
auto x = aten::CSRGetRowColumnIndices(csr, 0);
auto tx = aten::VecToIdArray(std::vector<IDX>({1, 2, 2}), sizeof(IDX)*8, CTX);
auto tx = aten::VecToIdArray(std::vector<IDX>({1, 2, 2}), sizeof(IDX)*8, ctx);
ASSERT_TRUE(ArrayEQ<IDX>(x, tx));
x = aten::CSRGetRowColumnIndices(csr, 1);
tx = aten::VecToIdArray(std::vector<IDX>({0}), sizeof(IDX)*8, CTX);
tx = aten::VecToIdArray(std::vector<IDX>({0}), sizeof(IDX)*8, ctx);
ASSERT_TRUE(ArrayEQ<IDX>(x, tx));
x = aten::CSRGetRowColumnIndices(csr, 3);
tx = aten::VecToIdArray(std::vector<IDX>({}), sizeof(IDX)*8, CTX);
tx = aten::VecToIdArray(std::vector<IDX>({}), sizeof(IDX)*8, ctx);
ASSERT_TRUE(ArrayEQ<IDX>(x, tx));
}
TEST(SpmatTest, TestCSRGetRowColumnIndices) {
_TestCSRGetRowColumnIndices<int32_t>();
_TestCSRGetRowColumnIndices<int64_t>();
_TestCSRGetRowColumnIndices<int32_t>(CPU);
_TestCSRGetRowColumnIndices<int64_t>(CPU);
#ifdef DGL_USE_CUDA
_TestCSRGetRowColumnIndices<int32_t>(GPU);
_TestCSRGetRowColumnIndices<int64_t>(GPU);
#endif
}
template <typename IDX>
void _TestCSRGetRowData() {
auto csr = CSR2<IDX>();
void _TestCSRGetRowData(DLContext ctx) {
auto csr = CSR2<IDX>(ctx);
auto x = aten::CSRGetRowData(csr, 0);
auto tx = aten::VecToIdArray(std::vector<IDX>({0, 2, 5}), sizeof(IDX)*8, CTX);
auto tx = aten::VecToIdArray(std::vector<IDX>({0, 2, 5}), sizeof(IDX)*8, ctx);
ASSERT_TRUE(ArrayEQ<IDX>(x, tx));
x = aten::CSRGetRowData(csr, 1);
tx = aten::VecToIdArray(std::vector<IDX>({3}), sizeof(IDX)*8, CTX);
tx = aten::VecToIdArray(std::vector<IDX>({3}), sizeof(IDX)*8, ctx);
ASSERT_TRUE(ArrayEQ<IDX>(x, tx));
x = aten::CSRGetRowData(csr, 3);
tx = aten::VecToIdArray(std::vector<IDX>({}), sizeof(IDX)*8, CTX);
tx = aten::VecToIdArray(std::vector<IDX>({}), sizeof(IDX)*8, ctx);
ASSERT_TRUE(ArrayEQ<IDX>(x, tx));
}
TEST(SpmatTest, TestCSRGetRowData) {
_TestCSRGetRowData<int32_t>();
_TestCSRGetRowData<int64_t>();
_TestCSRGetRowData<int32_t>(CPU);
_TestCSRGetRowData<int64_t>(CPU);
#ifdef DGL_USE_CUDA
_TestCSRGetRowData<int32_t>(GPU);
_TestCSRGetRowData<int64_t>(GPU);
#endif
}
template <typename IDX>
......