"src/vscode:/vscode.git/clone" did not exist on "fb9dcc51ff14e7cec8fc453cf7bfe44ee9349858"
Unverified commit 870da747, authored by Minjie Wang and committed by GitHub

[CUDA][Kernel] More CUDA kernels; Standardize the behavior for sorted COO/CSR (#1704)

* add cub; array cumsum

* CSRSliceRows

* fix warning

* operator << for ndarray; CSRSliceRows

* add CSRIsSorted

* add csr_sort

* inplace coosort and outplace csrsort

* WIP: coo is sorted

* mv cuda_utils

* add AllTrue utility

* csr sort

* coo sort

* coo2csr for sorted coo arrays

* CSRToCOO from sorted

* pass tests for the new kernel changes

* cannot use inplace sort

* lint

* try fix msvc error

* Fix g.copy_to and g.asnumbits; ToBlock no longer uses CSC

* stash

* revert some hack

* revert some changes

* address comments

* fix

* fix to_block unittest

* add todo note
parent da8632ca
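The diff below standardizes what a "sorted" COO/CSR matrix means across the CPU and CUDA backends: a COO matrix is row-sorted when its row indices are non-decreasing, and additionally column-sorted when column indices are non-decreasing within each run of equal rows. As a rough, self-contained illustration of that contract (plain C++ with illustrative names, not DGL's actual structs or API):

#include <cstdint>
#include <utility>
#include <vector>

// Returns {row_sorted, col_sorted} for a COO matrix given as parallel arrays.
std::pair<bool, bool> CheckCooSorted(const std::vector<int64_t>& row,
                                     const std::vector<int64_t>& col) {
  bool row_sorted = true, col_sorted = true;
  for (size_t i = 1; i < row.size() && row_sorted; ++i) {
    row_sorted = row[i - 1] <= row[i];
    col_sorted = col_sorted && (row[i - 1] < row[i] || col[i - 1] <= col[i]);
  }
  // Column order is only meaningful when the rows themselves are sorted.
  return {row_sorted, row_sorted && col_sorted};
}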
@@ -10,37 +10,181 @@
#include <numeric>
#include <algorithm>
#include <vector>
#include <iterator>
#include <tuple>
namespace {
template <typename IdType>
struct TupleRef {
TupleRef() = delete;
TupleRef(const TupleRef& other) = default;
TupleRef(TupleRef&& other) = default;
TupleRef(IdType *const r, IdType *const c, IdType *const d)
: row(r), col(c), data(d) {}
TupleRef& operator=(const TupleRef& other) {
*row = *other.row;
*col = *other.col;
*data = *other.data;
return *this;
}
TupleRef& operator=(const std::tuple<IdType, IdType, IdType>& val) {
*row = std::get<0>(val);
*col = std::get<1>(val);
*data = std::get<2>(val);
return *this;
}
operator std::tuple<IdType, IdType, IdType>() const {
return std::make_tuple(*row, *col, *data);
}
void Swap(const TupleRef& other) const {
std::swap(*row, *other.row);
std::swap(*col, *other.col);
std::swap(*data, *other.data);
}
IdType *row, *col, *data;
};
using std::swap;
template <typename IdType>
void swap(const TupleRef<IdType>& r1, const TupleRef<IdType>& r2) {
r1.Swap(r2);
}
template <typename IdType>
struct CooIterator : public std::iterator<std::random_access_iterator_tag,
std::tuple<IdType, IdType, IdType>,
std::ptrdiff_t,
std::tuple<IdType*, IdType*, IdType*>,
TupleRef<IdType>> {
CooIterator() = default;
CooIterator(const CooIterator& other) = default;
CooIterator(CooIterator&& other) = default;
CooIterator(IdType *r, IdType *c, IdType *d): row(r), col(c), data(d) {}
CooIterator& operator=(const CooIterator& other) = default;
CooIterator& operator=(CooIterator&& other) = default;
~CooIterator() = default;
bool operator==(const CooIterator& other) const {
return row == other.row;
}
bool operator!=(const CooIterator& other) const {
return row != other.row;
}
bool operator<(const CooIterator& other) const {
return row < other.row;
}
bool operator>(const CooIterator& other) const {
return row > other.row;
}
bool operator<=(const CooIterator& other) const {
return row <= other.row;
}
bool operator>=(const CooIterator& other) const {
return row >= other.row;
}
CooIterator& operator+=(const std::ptrdiff_t& movement) {
row += movement;
col += movement;
data += movement;
return *this;
}
CooIterator& operator-=(const std::ptrdiff_t& movement) {
row -= movement;
col -= movement;
data -= movement;
return *this;
}
CooIterator& operator++() {
return operator+=(1);
}
CooIterator& operator--() {
return operator-=(1);
}
CooIterator operator++(int) {
CooIterator ret(*this);
operator++();
return ret;
}
CooIterator operator--(int) {
CooIterator ret(*this);
operator--();
return ret;
}
CooIterator operator+(const std::ptrdiff_t& movement) const {
CooIterator ret(*this);
ret += movement;
return ret;
}
CooIterator operator-(const std::ptrdiff_t& movement) const {
CooIterator ret(*this);
ret -= movement;
return ret;
}
std::ptrdiff_t operator-(const CooIterator& other) const {
return row - other.row;
}
TupleRef<IdType> operator*() const {
return TupleRef<IdType>(row, col, data);
}
TupleRef<IdType> operator*() {
return TupleRef<IdType>(row, col, data);
}
IdType *row, *col, *data;
};
} // namespace
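For context, the anonymous-namespace helpers above implement a "zip iterator" over the three parallel COO arrays: dereferencing yields a TupleRef proxy whose assignment and swap write through to row, col and data at once, which is what lets std::sort reorder all three arrays in place. A rough usage sketch with illustrative values:

int64_t row[]  = {2, 0, 1};
int64_t col[]  = {5, 7, 6};
int64_t data[] = {0, 1, 2};
// Sort the three parallel arrays together by row index.
std::sort(CooIterator<int64_t>(row, col, data),
          CooIterator<int64_t>(row, col, data) + 3,
          [](const std::tuple<int64_t, int64_t, int64_t>& a,
             const std::tuple<int64_t, int64_t, int64_t>& b) {
            return std::get<0>(a) < std::get<0>(b);
          });
// row is now {0, 1, 2}, col is {7, 6, 5}, data is {1, 2, 0}.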
namespace dgl {
namespace aten {
namespace impl {
///////////////////////////// COOSort_ /////////////////////////////
template <DLDeviceType XPU, typename IdType>
void COOSort_(COOMatrix* coo, bool sort_column) {
const int64_t nnz = coo->row->shape[0];
IdType* coo_row = coo->row.Ptr<IdType>();
IdType* coo_col = coo->col.Ptr<IdType>();
if (!COOHasData(*coo))
coo->data = aten::Range(0, nnz, coo->row->dtype.bits, coo->row->ctx);
IdType* coo_data = coo->data.Ptr<IdType>();
typedef std::tuple<IdType, IdType, IdType> Tuple;
// Arg sort
if (sort_column) {
#ifdef PARALLEL_ALGORITHMS
__gnu_parallel::sort(
#else
std::sort(
#endif
CooIterator<IdType>(coo_row, coo_col, coo_data),
CooIterator<IdType>(coo_row, coo_col, coo_data) + nnz,
[](const Tuple& a, const Tuple& b) {
return (std::get<0>(a) != std::get<0>(b)) ?
(std::get<0>(a) < std::get<0>(b)) : (std::get<1>(a) < std::get<1>(b));
});
} else {
#ifdef PARALLEL_ALGORITHMS
@@ -48,39 +192,41 @@ COOMatrix COOSort(COOMatrix coo, bool sort_column) {
#else
std::sort(
#endif
CooIterator<IdType>(coo_row, coo_col, coo_data),
CooIterator<IdType>(coo_row, coo_col, coo_data) + nnz,
[](const Tuple& a, const Tuple& b) {
return std::get<0>(a) < std::get<0>(b);
});
}
coo->row_sorted = true;
coo->col_sorted = sort_column;
}
template void COOSort_<kDLCPU, int32_t>(COOMatrix*, bool);
template void COOSort_<kDLCPU, int64_t>(COOMatrix*, bool);
///////////////////////////// COOIsSorted /////////////////////////////
template <DLDeviceType XPU, typename IdType>
std::pair<bool, bool> COOIsSorted(COOMatrix coo) {
const int64_t nnz = coo.row->shape[0];
IdType* row = coo.row.Ptr<IdType>();
IdType* col = coo.col.Ptr<IdType>();
bool row_sorted = true;
bool col_sorted = true;
for (int64_t i = 1; row_sorted && i < nnz; ++i) {
row_sorted = (row[i - 1] <= row[i]);
col_sorted = col_sorted && (row[i - 1] < row[i] || col[i - 1] <= col[i]);
}
if (!row_sorted)
col_sorted = false;
return {row_sorted, col_sorted};
}
template std::pair<bool, bool> COOIsSorted<kDLCPU, int32_t>(COOMatrix coo);
template std::pair<bool, bool> COOIsSorted<kDLCPU, int64_t>(COOMatrix coo);
} // namespace impl
} // namespace aten
......
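A small worked example of the COOIsSorted scan above, with illustrative values:

// row = {0, 0, 1, 1}
// col = {3, 1, 0, 2}
// Rows never decrease                       -> row_sorted == true
// Row 0 has columns 3 then 1, so the test
// (row[i-1] < row[i] || col[i-1] <= col[i]) fails at i == 1
//                                            -> col_sorted == false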
/*!
* Copyright (c) 2020 by Contributors
* \file array/cpu/csr_sort.cc
* \brief CSR sorting
*/
#include <dgl/array.h>
#include <numeric>
#include <algorithm>
#include <vector>
namespace dgl {
namespace aten {
namespace impl {
///////////////////////////// CSRIsSorted /////////////////////////////
template <DLDeviceType XPU, typename IdType>
bool CSRIsSorted(CSRMatrix csr) {
const IdType* indptr = csr.indptr.Ptr<IdType>();
const IdType* indices = csr.indices.Ptr<IdType>();
bool ret = true;
#pragma omp parallel for shared(ret)
for (int64_t row = 0; row < csr.num_rows; ++row) {
if (!ret)
continue;
for (IdType i = indptr[row] + 1; i < indptr[row + 1]; ++i) {
if (indices[i - 1] > indices[i]) {
ret = false;
break;
}
}
}
return ret;
}
template bool CSRIsSorted<kDLCPU, int64_t>(CSRMatrix csr);
template bool CSRIsSorted<kDLCPU, int32_t>(CSRMatrix csr);
///////////////////////////// CSRSort /////////////////////////////
template <DLDeviceType XPU, typename IdType>
void CSRSort_(CSRMatrix* csr) {
typedef std::pair<IdType, IdType> ShufflePair;
const int64_t num_rows = csr->num_rows;
const int64_t nnz = csr->indices->shape[0];
const IdType* indptr_data = static_cast<IdType*>(csr->indptr->data);
IdType* indices_data = static_cast<IdType*>(csr->indices->data);
if (!CSRHasData(*csr)) {
csr->data = aten::Range(0, nnz, csr->indptr->dtype.bits, csr->indptr->ctx);
}
IdType* eid_data = static_cast<IdType*>(csr->data->data);
#pragma omp parallel
{
std::vector<ShufflePair> reorder_vec;
#pragma omp for
for (int64_t row = 0; row < num_rows; row++) {
const int64_t num_cols = indptr_data[row + 1] - indptr_data[row];
IdType *col = indices_data + indptr_data[row];
IdType *eid = eid_data + indptr_data[row];
reorder_vec.resize(num_cols);
for (int64_t i = 0; i < num_cols; i++) {
reorder_vec[i].first = col[i];
reorder_vec[i].second = eid[i];
}
std::sort(reorder_vec.begin(), reorder_vec.end(),
[](const ShufflePair &e1, const ShufflePair &e2) {
return e1.first < e2.first;
});
for (int64_t i = 0; i < num_cols; i++) {
col[i] = reorder_vec[i].first;
eid[i] = reorder_vec[i].second;
}
}
}
csr->sorted = true;
}
template void CSRSort_<kDLCPU, int64_t>(CSRMatrix* csr);
template void CSRSort_<kDLCPU, int32_t>(CSRMatrix* csr);
} // namespace impl
} // namespace aten
} // namespace dgl
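To make the per-row behavior of CSRSort_ above concrete, here is a minimal self-contained sketch in plain C++ (standard containers instead of DGL's CSRMatrix, illustrative values): each row's (column, edge-id) pairs are sorted by column, and the edge ids travel with their columns.

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

int main() {
  std::vector<int64_t> indptr  = {0, 3, 5};
  std::vector<int64_t> indices = {2, 0, 1, 4, 3};   // columns, unsorted per row
  std::vector<int64_t> eid     = {0, 1, 2, 3, 4};   // edge ids follow the columns
  for (size_t r = 0; r + 1 < indptr.size(); ++r) {
    std::vector<std::pair<int64_t, int64_t>> buf;
    for (int64_t i = indptr[r]; i < indptr[r + 1]; ++i)
      buf.emplace_back(indices[i], eid[i]);
    std::sort(buf.begin(), buf.end());          // sort by column within the row
    for (int64_t i = indptr[r]; i < indptr[r + 1]; ++i) {
      indices[i] = buf[i - indptr[r]].first;
      eid[i] = buf[i - indptr[r]].second;
    }
  }
  // indices is now {0, 1, 2, 3, 4}; eid is {1, 2, 0, 4, 3}.
  return 0;
}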
@@ -377,7 +377,9 @@ COOMatrix CSRToCOO(CSRMatrix csr) {
ret_row_data + indptr_data[i + 1],
i);
}
return COOMatrix(csr.num_rows, csr.num_cols,
ret_row, csr.indices, csr.data,
true, csr.sorted);
}
template COOMatrix CSRToCOO<kDLCPU, int32_t>(CSRMatrix csr);
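The new return value above passes true for the row-sorted flag because CSRToCOO expands indptr into row indices in ascending row order, so the produced row array is non-decreasing by construction; only the column-sorted flag depends on whether the input CSR was sorted. For example (illustrative values):

// indptr = {0, 2, 3}  ->  row = {0, 0, 1}   (always non-decreasing)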
@@ -543,49 +545,6 @@ template CSRMatrix CSRSliceMatrix<kDLCPU, int32_t>(
template CSRMatrix CSRSliceMatrix<kDLCPU, int64_t>(
CSRMatrix csr, runtime::NDArray rows, runtime::NDArray cols);
///////////////////////////// CSRSort /////////////////////////////
template <DLDeviceType XPU, typename IdType>
void CSRSort_(CSRMatrix* csr) {
typedef std::pair<IdType, IdType> ShufflePair;
const int64_t num_rows = csr->num_rows;
const int64_t nnz = csr->indices->shape[0];
const IdType* indptr_data = static_cast<IdType*>(csr->indptr->data);
IdType* indices_data = static_cast<IdType*>(csr->indices->data);
if (!CSRHasData(*csr)) {
csr->data = aten::Range(0, nnz, csr->indptr->dtype.bits, csr->indptr->ctx);
}
IdType* eid_data = static_cast<IdType*>(csr->data->data);
#pragma omp parallel
{
std::vector<ShufflePair> reorder_vec;
#pragma omp for
for (int64_t row = 0; row < num_rows; row++) {
const int64_t num_cols = indptr_data[row + 1] - indptr_data[row];
IdType *col = indices_data + indptr_data[row];
IdType *eid = eid_data + indptr_data[row];
reorder_vec.resize(num_cols);
for (int64_t i = 0; i < num_cols; i++) {
reorder_vec[i].first = col[i];
reorder_vec[i].second = eid[i];
}
std::sort(reorder_vec.begin(), reorder_vec.end(),
[](const ShufflePair &e1, const ShufflePair &e2) {
return e1.first < e2.first;
});
for (int64_t i = 0; i < num_cols; i++) {
col[i] = reorder_vec[i].first;
eid[i] = reorder_vec[i].second;
}
}
}
csr->sorted = true;
}
template void CSRSort_<kDLCPU, int64_t>(CSRMatrix* csr);
template void CSRSort_<kDLCPU, int32_t>(CSRMatrix* csr);
///////////////////////////// CSRReorder /////////////////////////////
template <DLDeviceType XPU, typename IdType>
......
@@ -3,10 +3,10 @@
 * \file array/cpu/spmat_op_impl.cc
 * \brief CPU implementation of COO sparse matrix operators
 */
#include <dgl/array.h>
#include <vector>
#include <unordered_set>
#include <unordered_map>
#include <tuple>
#include "array_utils.h"
namespace dgl {
@@ -266,12 +266,43 @@ CSRMatrix COOToCSR(COOMatrix coo) {
const IdType* row_data = static_cast<IdType*>(coo.row->data);
const IdType* col_data = static_cast<IdType*>(coo.col->data);
const IdType* data = COOHasData(coo)? static_cast<IdType*>(coo.data->data) : nullptr;
NDArray ret_indptr = NDArray::Empty({N + 1}, coo.row->dtype, coo.row->ctx);
NDArray ret_indices;
NDArray ret_data;
bool row_sorted = coo.row_sorted;
bool col_sorted = coo.col_sorted;
if (!row_sorted) {
// It is possible that the flag is simply not set (default value is false),
// so we still perform a linear scan to check the flag.
std::tie(row_sorted, col_sorted) = COOIsSorted(coo);
}
if (row_sorted) {
// compute indptr
IdType* Bp = static_cast<IdType*>(ret_indptr->data);
Bp[0] = 0;
int64_t j = 0;
for (int64_t i = 0; i < N; ++i) {
const int64_t k = j;
for (; j < NNZ && row_data[j] == i; ++j) {}
Bp[i + 1] = Bp[i] + j - k;
}
// TODO(minjie): Many of our current implementation assumes that CSR must have
// a data array. This is a temporary workaround. Remove this after:
// - The old immutable graph implementation is deprecated.
// - The old binary reduce kernel is deprecated.
if (!COOHasData(coo))
coo.data = aten::Range(0, NNZ, coo.row->dtype.bits, coo.row->ctx);
// compute indices and data
ret_indices = coo.col;
ret_data = coo.data;
} else {
// compute indptr
IdType* Bp = static_cast<IdType*>(ret_indptr->data);
std::fill(Bp, Bp + N, 0);
for (int64_t i = 0; i < NNZ; ++i) {
Bp[row_data[i]]++;
@@ -285,10 +316,7 @@ CSRMatrix COOToCSR(COOMatrix coo) {
}
Bp[N] = NNZ;
// compute indices and data
ret_indices = NDArray::Empty({NNZ}, coo.row->dtype, coo.row->ctx);
ret_data = NDArray::Empty({NNZ}, coo.row->dtype, coo.row->ctx);
IdType* Bi = static_cast<IdType*>(ret_indices->data);
@@ -311,7 +339,7 @@ CSRMatrix COOToCSR(COOMatrix coo) {
return CSRMatrix(coo.num_rows, coo.num_cols,
ret_indptr, ret_indices, ret_data,
col_sorted);
}
template CSRMatrix COOToCSR<kDLCPU, int32_t>(COOMatrix coo);
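The row-sorted fast path above only needs a single forward scan to build indptr and can reuse the column and data arrays untouched. A self-contained sketch of that scan in plain C++ (illustrative names, not DGL's API):

#include <cstdint>
#include <vector>

std::vector<int64_t> IndptrFromSortedRows(const std::vector<int64_t>& row,
                                          int64_t num_rows) {
  std::vector<int64_t> indptr(num_rows + 1, 0);
  int64_t j = 0;
  for (int64_t i = 0; i < num_rows; ++i) {
    const int64_t k = j;
    // Advance over the run of entries belonging to row i.
    while (j < static_cast<int64_t>(row.size()) && row[j] == i) ++j;
    indptr[i + 1] = indptr[i] + (j - k);
  }
  return indptr;
}
// E.g. row = {0, 0, 2} with num_rows = 3 gives indptr = {0, 2, 2, 3}.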
@@ -439,7 +467,6 @@ COOMatrix COOReorder(COOMatrix coo, runtime::NDArray new_row_id_arr,
// Input COO
const IdType* in_rows = static_cast<IdType*>(coo.row->data);
const IdType* in_cols = static_cast<IdType*>(coo.col->data);
const IdType* in_data = COOHasData(coo) ? static_cast<IdType*>(coo.data->data) : nullptr;
int64_t num_rows = coo.num_rows;
int64_t num_cols = coo.num_cols;
int64_t nnz = coo.row->shape[0];
......
/*!
* Copyright (c) 2020 by Contributors
 * \file array/cuda/array_cumsum.cu
* \brief Array cumsum GPU implementation
*/
#include <dgl/array.h>
#include <cub/cub.cuh>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
namespace dgl {
using runtime::NDArray;
namespace aten {
namespace impl {
template <DLDeviceType XPU, typename IdType>
IdArray CumSum(IdArray array, bool prepend_zero) {
const int64_t len = array.NumElements();
if (len == 0)
return array;
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
auto device = runtime::DeviceAPI::Get(array->ctx);
const IdType* in_d = array.Ptr<IdType>();
IdArray ret;
IdType* out_d = nullptr;
if (prepend_zero) {
ret = aten::Full(0, len + 1, array->dtype.bits, array->ctx);
out_d = ret.Ptr<IdType>() + 1;
} else {
ret = aten::NewIdArray(len, array->ctx, array->dtype.bits);
out_d = ret.Ptr<IdType>();
}
// Allocate workspace
size_t workspace_size = 0;
cub::DeviceScan::InclusiveSum(nullptr, workspace_size, in_d, out_d, len, thr_entry->stream);
void* workspace = device->AllocWorkspace(array->ctx, workspace_size);
// Compute cumsum
cub::DeviceScan::InclusiveSum(workspace, workspace_size, in_d, out_d, len, thr_entry->stream);
device->FreeWorkspace(array->ctx, workspace);
return ret;
}
template IdArray CumSum<kDLGPU, int32_t>(IdArray, bool);
template IdArray CumSum<kDLGPU, int64_t>(IdArray, bool);
} // namespace impl
} // namespace aten
} // namespace dgl
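A rough usage note for CumSum above: with prepend_zero set, an inclusive scan over per-row nonzero counts directly yields a CSR indptr, which is how the CUDA CSRSliceRows later in this commit builds its output indptr from CSRGetRowNNZ. Illustrative values:

// counts = {2, 0, 3}
// CumSum(counts, /*prepend_zero=*/true)  ->  {0, 2, 2, 5}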
@@ -5,7 +5,7 @@
 */
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
namespace dgl {
using runtime::NDArray;
@@ -50,7 +50,7 @@ template NDArray IndexSelect<kDLGPU, double, int32_t>(NDArray, IdArray);
template NDArray IndexSelect<kDLGPU, double, int64_t>(NDArray, IdArray);
template <DLDeviceType XPU, typename DType>
DType IndexSelect(NDArray array, int64_t index) {
auto device = runtime::DeviceAPI::Get(array->ctx);
DType ret = 0;
device->CopyDataFromTo(
@@ -60,12 +60,12 @@ DType IndexSelect(NDArray array, uint64_t index) {
return ret;
}
template int32_t IndexSelect<kDLGPU, int32_t>(NDArray array, int64_t index);
template int64_t IndexSelect<kDLGPU, int64_t>(NDArray array, int64_t index);
template uint32_t IndexSelect<kDLGPU, uint32_t>(NDArray array, int64_t index);
template uint64_t IndexSelect<kDLGPU, uint64_t>(NDArray array, int64_t index);
template float IndexSelect<kDLGPU, float>(NDArray array, int64_t index);
template double IndexSelect<kDLGPU, double>(NDArray array, int64_t index);
} // namespace impl
} // namespace aten
......
@@ -5,7 +5,7 @@
 */
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "../arith.h"
namespace dgl {
......
@@ -17,63 +17,43 @@ template <DLDeviceType XPU, typename IdType>
CSRMatrix COOToCSR(COOMatrix coo) {
CHECK(sizeof(IdType) == 4) << "CUDA COOToCSR does not support int64.";
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
// allocate cusparse handle if needed
if (!thr_entry->cusparse_handle) {
CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
}
CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, thr_entry->stream));
bool row_sorted = coo.row_sorted;
bool col_sorted = coo.col_sorted;
if (!row_sorted) {
// It is possible that the flag is simply not set (default value is false),
// so we still perform a linear scan to check the flag.
std::tie(row_sorted, col_sorted) = COOIsSorted(coo);
}
if (!row_sorted) {
coo = COOSort(coo);
}
const int64_t nnz = coo.row->shape[0];
// TODO(minjie): Many of our current implementation assumes that CSR must have
// a data array. This is a temporary workaround. Remove this after:
// - The old immutable graph implementation is deprecated.
// - The old binary reduce kernel is deprecated.
if (!COOHasData(coo))
coo.data = aten::Range(0, nnz, coo.row->dtype.bits, coo.row->ctx);
NDArray indptr = aten::NewIdArray(coo.num_rows + 1, coo.row->ctx, coo.row->dtype.bits);
int32_t* indptr_ptr = static_cast<int32_t*>(indptr->data);
CUSPARSE_CALL(cusparseXcoo2csr(
thr_entry->cusparse_handle,
coo.row.Ptr<int32_t>(),
nnz,
coo.num_rows,
indptr_ptr,
CUSPARSE_INDEX_BASE_ZERO));
return CSRMatrix(coo.num_rows, coo.num_cols,
indptr, coo.col, coo.data, col_sorted);
}
template CSRMatrix COOToCSR<kDLGPU, int32_t>(COOMatrix coo);
......
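What the cusparseXcoo2csr call above computes, on a toy example (illustrative values), and why the rows must be sorted first:

// sorted row indices = {0, 0, 1, 3}, num_rows = 4
// resulting indptr   = {0, 2, 3, 3, 4}
// indptr[r] is the offset of the first entry whose row index is >= r,
// which is only well defined when the row indices are non-decreasing.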
@@ -5,6 +5,7 @@
 */
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
namespace dgl {
@@ -13,11 +14,16 @@ using runtime::NDArray;
namespace aten {
namespace impl {
///////////////////////////// COOSort_ /////////////////////////////
template <DLDeviceType XPU, typename IdType>
void COOSort_(COOMatrix* coo, bool sort_column) {
// TODO(minjie): Current implementation is based on cusparse which only supports
// int32_t. To support int64_t, we could use the Radix sort algorithm provided
// by CUB.
CHECK(sizeof(IdType) == 4) << "CUDA COOSort does not support int64.";
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
auto device = runtime::DeviceAPI::Get(coo->row->ctx);
// allocate cusparse handle if needed
if (!thr_entry->cusparse_handle) {
CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
@@ -25,15 +31,11 @@ COOMatrix COOSort(COOMatrix coo, bool sort_column) {
CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, thr_entry->stream));
NDArray row = coo->row;
NDArray col = coo->col;
if (!aten::COOHasData(*coo))
coo->data = aten::Range(0, row->shape[0], row->dtype.bits, row->ctx);
NDArray data = coo->data;
int32_t* row_ptr = static_cast<int32_t*>(row->data);
int32_t* col_ptr = static_cast<int32_t*>(col->data);
int32_t* data_ptr = static_cast<int32_t*>(data->data);
@@ -42,7 +44,7 @@ COOMatrix COOSort(COOMatrix coo, bool sort_column) {
size_t workspace_size = 0;
CUSPARSE_CALL(cusparseXcoosort_bufferSizeExt(
thr_entry->cusparse_handle,
coo->num_rows, coo->num_cols,
row->shape[0],
row_ptr,
col_ptr,
@@ -50,7 +52,7 @@ COOMatrix COOSort(COOMatrix coo, bool sort_column) {
void* workspace = device->AllocWorkspace(row->ctx, workspace_size);
CUSPARSE_CALL(cusparseXcoosortByRow(
thr_entry->cusparse_handle,
coo->num_rows, coo->num_cols,
row->shape[0],
row_ptr,
col_ptr,
@@ -61,18 +63,18 @@ COOMatrix COOSort(COOMatrix coo, bool sort_column) {
if (sort_column) {
// First create a row indptr array and then call csrsort
int32_t* indptr = static_cast<int32_t*>(
device->AllocWorkspace(row->ctx, (coo->num_rows + 1) * sizeof(IdType)));
CUSPARSE_CALL(cusparseXcoo2csr(
thr_entry->cusparse_handle,
row_ptr,
row->shape[0],
coo->num_rows,
indptr,
CUSPARSE_INDEX_BASE_ZERO));
CUSPARSE_CALL(cusparseXcsrsort_bufferSizeExt(
thr_entry->cusparse_handle,
coo->num_rows,
coo->num_cols,
row->shape[0],
indptr,
col_ptr,
@@ -82,8 +84,8 @@ COOMatrix COOSort(COOMatrix coo, bool sort_column) {
CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
CUSPARSE_CALL(cusparseXcsrsort(
thr_entry->cusparse_handle,
coo->num_rows,
coo->num_cols,
row->shape[0],
descr,
indptr,
@@ -95,13 +97,61 @@ COOMatrix COOSort(COOMatrix coo, bool sort_column) {
device->FreeWorkspace(row->ctx, indptr);
}
coo->row_sorted = true;
coo->col_sorted = sort_column;
}
template void COOSort_<kDLGPU, int32_t>(COOMatrix* coo, bool sort_column);
template void COOSort_<kDLGPU, int64_t>(COOMatrix* coo, bool sort_column);
///////////////////////////// COOIsSorted /////////////////////////////
template <typename IdType>
__global__ void _COOIsSortedKernel(
const IdType* row, const IdType* col,
int64_t nnz, int8_t* row_sorted, int8_t* col_sorted) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x;
while (tx < nnz) {
if (tx == 0) {
row_sorted[0] = 1;
col_sorted[0] = 1;
} else {
row_sorted[tx] = static_cast<int8_t>(row[tx - 1] <= row[tx]);
col_sorted[tx] = static_cast<int8_t>(
row[tx - 1] < row[tx] || col[tx - 1] <= col[tx]);
}
tx += stride_x;
}
} }
template COOMatrix COOSort<kDLGPU, int32_t>(COOMatrix coo, bool sort_column); template <DLDeviceType XPU, typename IdType>
template COOMatrix COOSort<kDLGPU, int64_t>(COOMatrix coo, bool sort_column); std::pair<bool, bool> COOIsSorted(COOMatrix coo) {
const int64_t nnz = coo.row->shape[0];
const auto& ctx = coo.row->ctx;
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
auto device = runtime::DeviceAPI::Get(ctx);
// We allocate a workspace of 2*nnz bytes. It wastes a little bit memory but should
// be fine.
int8_t* row_flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, nnz));
int8_t* col_flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, nnz));
const int nt = cuda::FindNumThreads(nnz);
const int nb = (nnz + nt - 1) / nt;
_COOIsSortedKernel<<<nb, nt, 0, thr_entry->stream>>>(
coo.row.Ptr<IdType>(), coo.col.Ptr<IdType>(),
nnz, row_flags, col_flags);
const bool row_sorted = cuda::AllTrue(row_flags, nnz, ctx);
const bool col_sorted = row_sorted? cuda::AllTrue(col_flags, nnz, ctx) : false;
device->FreeWorkspace(ctx, row_flags);
device->FreeWorkspace(ctx, col_flags);
return {row_sorted, col_sorted};
}
template std::pair<bool, bool> COOIsSorted<kDLGPU, int32_t>(COOMatrix coo);
template std::pair<bool, bool> COOIsSorted<kDLGPU, int64_t>(COOMatrix coo);
} // namespace impl
} // namespace aten
......
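How the flag arrays written by _COOIsSortedKernel above combine with the AllTrue reduction, on a toy example (illustrative values):

// row = {0, 0, 1},  col = {4, 2, 0}
// row_flags = {1, 1, 1}   (rows never decrease)
// col_flags = {1, 0, 1}   (4 -> 2 inside row 0 breaks column order)
// AllTrue() min-reduces each flag array, so the returned pair is
// row_sorted = true, col_sorted = false.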
/*!
* Copyright (c) 2020 by Contributors
 * \file array/cuda/csr_sort.cu
 * \brief Sort CSR index
*/
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
namespace dgl {
using runtime::NDArray;
namespace aten {
namespace impl {
/*!
* \brief Check whether each row is sorted.
*/
template <typename IdType>
__global__ void _SegmentIsSorted(
const IdType* indptr, const IdType* indices,
int64_t num_rows, int8_t* flags) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x;
while (tx < num_rows) {
bool f = true;
for (IdType i = indptr[tx] + 1; f && i < indptr[tx + 1]; ++i) {
f = (indices[i - 1] <= indices[i]);
}
flags[tx] = static_cast<int8_t>(f);
tx += stride_x;
}
}
template <DLDeviceType XPU, typename IdType>
bool CSRIsSorted(CSRMatrix csr) {
const auto& ctx = csr.indptr->ctx;
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
auto device = runtime::DeviceAPI::Get(ctx);
// We allocate a workspace of num_rows bytes. It wastes a little bit memory but should
// be fine.
int8_t* flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, csr.num_rows));
const int nt = cuda::FindNumThreads(csr.num_rows);
const int nb = (csr.num_rows + nt - 1) / nt;
_SegmentIsSorted<<<nb, nt, 0, thr_entry->stream>>>(
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
csr.num_rows, flags);
bool ret = cuda::AllTrue(flags, csr.num_rows, ctx);
device->FreeWorkspace(ctx, flags);
return ret;
}
template bool CSRIsSorted<kDLGPU, int32_t>(CSRMatrix csr);
template bool CSRIsSorted<kDLGPU, int64_t>(CSRMatrix csr);
template <DLDeviceType XPU, typename IdType>
void CSRSort_(CSRMatrix* csr) {
CHECK(sizeof(IdType) == 4) << "CUDA CSRSort_ does not support int64.";
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
auto device = runtime::DeviceAPI::Get(csr->indptr->ctx);
// allocate cusparse handle if needed
if (!thr_entry->cusparse_handle) {
CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
}
CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, thr_entry->stream));
NDArray indptr = csr->indptr;
NDArray indices = csr->indices;
const auto& ctx = indptr->ctx;
const int64_t nnz = indices->shape[0];
if (!aten::CSRHasData(*csr))
csr->data = aten::Range(0, nnz, indices->dtype.bits, ctx);
NDArray data = csr->data;
size_t workspace_size = 0;
CUSPARSE_CALL(cusparseXcsrsort_bufferSizeExt(
thr_entry->cusparse_handle,
csr->num_rows, csr->num_cols, nnz,
indptr.Ptr<int32_t>(), indices.Ptr<int32_t>(),
&workspace_size));
void* workspace = device->AllocWorkspace(ctx, workspace_size);
cusparseMatDescr_t descr;
CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
CUSPARSE_CALL(cusparseXcsrsort(
thr_entry->cusparse_handle,
csr->num_rows, csr->num_cols, nnz,
descr,
indptr.Ptr<int32_t>(), indices.Ptr<int32_t>(),
data.Ptr<int32_t>(),
workspace));
csr->sorted = true;
// free resources
CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
device->FreeWorkspace(ctx, workspace);
}
template void CSRSort_<kDLGPU, int32_t>(CSRMatrix* csr);
template void CSRSort_<kDLGPU, int64_t>(CSRMatrix* csr);
} // namespace impl
} // namespace aten
} // namespace dgl
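The cuSPARSE usage above follows the usual two-phase pattern: query the temporary buffer size, allocate a workspace, run the sort, then free the workspace. Sketched below with condensed, illustrative variable names (handle, perm); this simply restates the calls already made in CSRSort_:

size_t bytes = 0;
CUSPARSE_CALL(cusparseXcsrsort_bufferSizeExt(
    handle, num_rows, num_cols, nnz, indptr, indices, &bytes));
void* buf = device->AllocWorkspace(ctx, bytes);   // temporary storage only
CUSPARSE_CALL(cusparseXcsrsort(
    handle, num_rows, num_cols, nnz, descr, indptr, indices, perm, buf));
device->FreeWorkspace(ctx, buf);
// perm is csr->data here, so edge ids are permuted together with the columns.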
@@ -10,7 +10,7 @@
#include "macro.cuh"
#include "atomic.cuh"
#include "functor.cuh"
#include "./utils.h"
#include "../../runtime/cuda/cuda_common.h"
namespace dgl {
......
@@ -8,7 +8,7 @@
#include <unordered_set>
#include <numeric>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
namespace dgl {
@@ -17,8 +17,6 @@ using runtime::NDArray;
namespace aten {
namespace impl {
/*!
 * \brief Search adjacency list linearly for each (row, col) pair and
 * write the matched position in the indices array to the output.
@@ -33,7 +31,7 @@ __global__ void _LinearSearchKernel(
int64_t row_stride, int64_t col_stride,
int64_t length, IdType* out) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x;
int rpos = tx, cpos = tx;
while (tx < length) {
out[tx] = -1;
@@ -50,6 +48,8 @@ __global__ void _LinearSearchKernel(
}
}
///////////////////////////// CSRIsNonZero /////////////////////////////
template <DLDeviceType XPU, typename IdType>
bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
@@ -169,6 +169,88 @@ NDArray CSRGetRowData(CSRMatrix csr, int64_t row) {
template NDArray CSRGetRowData<kDLGPU, int32_t>(CSRMatrix, int64_t);
template NDArray CSRGetRowData<kDLGPU, int64_t>(CSRMatrix, int64_t);
///////////////////////////// CSRSliceRows /////////////////////////////
template <DLDeviceType XPU, typename IdType>
CSRMatrix CSRSliceRows(CSRMatrix csr, int64_t start, int64_t end) {
const int64_t num_rows = end - start;
const IdType st_pos = aten::IndexSelect<IdType>(csr.indptr, start);
const IdType ed_pos = aten::IndexSelect<IdType>(csr.indptr, end);
const IdType nnz = ed_pos - st_pos;
IdArray ret_indptr = aten::IndexSelect(csr.indptr, start, end + 1) - st_pos;
// indices and data can be view arrays
IdArray ret_indices = csr.indices.CreateView(
{nnz}, csr.indices->dtype, st_pos * sizeof(IdType));
IdArray ret_data;
if (CSRHasData(csr))
ret_data = csr.data.CreateView({nnz}, csr.data->dtype, st_pos * sizeof(IdType));
else
ret_data = aten::Range(st_pos, ed_pos,
csr.indptr->dtype.bits, csr.indptr->ctx);
return CSRMatrix(num_rows, csr.num_cols,
ret_indptr, ret_indices, ret_data,
csr.sorted);
}
template CSRMatrix CSRSliceRows<kDLGPU, int32_t>(CSRMatrix, int64_t, int64_t);
template CSRMatrix CSRSliceRows<kDLGPU, int64_t>(CSRMatrix, int64_t, int64_t);
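A worked example for the contiguous-row slice above (illustrative values):

// indptr     = {0, 2, 5, 6},  slice rows [1, 3)
// st_pos     = indptr[1] = 2,  ed_pos = indptr[3] = 6,  nnz = 4
// ret_indptr = {2, 5, 6} - 2 = {0, 3, 4}
// ret_indices and ret_data are zero-copy views starting at element offset 2
// (or, when the CSR has no data array, a fresh Range(2, 6) so the edge ids
// still refer to positions in the original matrix).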
/*!
* \brief Copy data segment to output buffers
*
* For the i^th row r = row[i], copy the data from indptr[r] ~ indptr[r+1]
* to the out_data from out_indptr[i] ~ out_indptr[i+1]
*
* If the provided `data` array is nullptr, write the read index to the out_data.
*
*/
template <typename IdType, typename DType>
__global__ void _SegmentCopyKernel(
const IdType* indptr, const DType* data,
const IdType* row, int64_t row_stride, int64_t length,
const IdType* out_indptr, DType* out_data) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x;
int rpos = tx;
while (tx < length) {
const IdType r = row[rpos];
DType* out_buf = out_data + out_indptr[tx];
for (IdType i = indptr[r]; i < indptr[r + 1]; ++i) {
*(out_buf++) = data? data[i] : i;
}
rpos += row_stride;
tx += stride_x;
}
}
template <DLDeviceType XPU, typename IdType>
CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
const int64_t len = rows->shape[0];
IdArray ret_indptr = aten::CumSum(aten::CSRGetRowNNZ(csr, rows), true);
const int64_t nnz = aten::IndexSelect<IdType>(ret_indptr, len);
const int nt = cuda::FindNumThreads(len);
const int nb = (len + nt - 1) / nt;
// Copy indices.
IdArray ret_indices = NDArray::Empty({nnz}, csr.indptr->dtype, csr.indptr->ctx);
_SegmentCopyKernel<<<nb, nt, 0, thr_entry->stream>>>(
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
rows.Ptr<IdType>(), 1, len,
ret_indptr.Ptr<IdType>(), ret_indices.Ptr<IdType>());
// Copy data.
IdArray ret_data = NDArray::Empty({nnz}, csr.indptr->dtype, csr.indptr->ctx);
_SegmentCopyKernel<<<nb, nt, 0, thr_entry->stream>>>(
csr.indptr.Ptr<IdType>(), CSRHasData(csr)? csr.data.Ptr<IdType>() : nullptr,
rows.Ptr<IdType>(), 1, len,
ret_indptr.Ptr<IdType>(), ret_data.Ptr<IdType>());
return CSRMatrix(len, csr.num_cols,
ret_indptr, ret_indices, ret_data,
csr.sorted);
}
template CSRMatrix CSRSliceRows<kDLGPU, int32_t>(CSRMatrix , NDArray);
template CSRMatrix CSRSliceRows<kDLGPU, int64_t>(CSRMatrix , NDArray);
} // namespace impl
} // namespace aten
......
@@ -140,6 +140,7 @@ void CusparseCsrmm2(
static_cast<int32_t*>(csr.indptr->data),
static_cast<int32_t*>(csr.indices->data),
B_data, n, &beta, trans_out, m));
CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
if (valptr)
device->FreeWorkspace(ctx, valptr);
// transpose the output matrix
......
@@ -9,8 +9,8 @@
#include <dgl/bcast.h>
#include "macro.cuh"
#include "atomic.cuh"
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
namespace dgl {
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/utils.cu
* \brief Utilities for CUDA kernels.
*/
#include "./utils.h"
#include <cub/cub.cuh>
#include "../../runtime/cuda/cuda_common.h"
namespace dgl {
namespace cuda {
bool AllTrue(int8_t* flags, int64_t length, const DLContext& ctx) {
auto device = runtime::DeviceAPI::Get(ctx);
int8_t* rst = static_cast<int8_t*>(device->AllocWorkspace(ctx, 1));
// Call CUB's reduction
size_t workspace_size = 0;
CUDA_CALL(cub::DeviceReduce::Min(nullptr, workspace_size, flags, rst, length));
void* workspace = device->AllocWorkspace(ctx, workspace_size);
CUDA_CALL(cub::DeviceReduce::Min(workspace, workspace_size, flags, rst, length));
int8_t cpu_rst = 0;
CUDA_CALL(cudaMemcpy(&cpu_rst, rst, 1, cudaMemcpyDeviceToHost));
device->FreeWorkspace(ctx, workspace);
device->FreeWorkspace(ctx, rst);
return cpu_rst == 1;
}
} // namespace cuda
} // namespace dgl
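Usage note for AllTrue above: the int8_t array is treated as an array of 0/1 flags and reduced with cub::DeviceReduce::Min, so a single zero makes the whole result false. Illustrative values:

// flags = {1, 1, 1}  ->  true
// flags = {1, 0, 1}  ->  false   (the minimum is 0)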
/*!
 * Copyright (c) 2020 by Contributors
 * \file array/cuda/utils.h
 * \brief Utilities for CUDA kernels.
 */
#ifndef DGL_ARRAY_CUDA_UTILS_H_
#define DGL_ARRAY_CUDA_UTILS_H_
#include <dmlc/logging.h>
#include <dlpack/dlpack.h>
namespace dgl {
namespace cuda {
@@ -68,7 +69,18 @@ __device__ __forceinline__ T _ldg(T* addr) {
#endif
}
/*!
* \brief Return true if the given bool flag array is all true.
* The input bool array is in int8_t type so it is aligned with byte address.
*
* \param flags The bool array.
* \param length The length.
* \param ctx Device context.
* \return True if all the flags are true.
*/
bool AllTrue(int8_t* flags, int64_t length, const DLContext& ctx);
} // namespace cuda
} // namespace dgl
#endif  // DGL_ARRAY_CUDA_UTILS_H_
@@ -3,7 +3,6 @@
 * \file array/kernel.cc
 * \brief New kernels
 */
#include <dgl/array.h>
#include <dgl/packed_func_ext.h>
#include <dgl/base_heterograph.h>
......
@@ -6,9 +6,9 @@
#ifndef DGL_ARRAY_KERNEL_DECL_H_
#define DGL_ARRAY_KERNEL_DECL_H_
#include <dgl/array.h>
#include <dgl/bcast.h>
#include <dgl/base_heterograph.h>
#include <dgl/runtime/ndarray.h>
#include <string>
#include <vector>
......
@@ -14,6 +14,7 @@
#include <algorithm>
#include <vector>
#include <string>
#include <utility>
namespace dgl {
......
@@ -804,7 +804,7 @@ DGL_REGISTER_GLOBAL("network._CAPI_FastPull")
}
}
int msg_count = 0;
for (size_t i = 0; i < remote_ids.size(); ++i) {
if (remote_ids[i].size() != 0) {
KVStoreMsg kv_msg;
kv_msg.msg_type = MessageType::kPullMsg;
@@ -827,9 +827,10 @@
}
}
char *return_data = new char[ID_size*row_size];
const int64_t local_ids_size = local_ids.size();
// Copy local data
#pragma omp parallel for
for (int64_t i = 0; i < local_ids_size; ++i) {
CHECK_GE(ID_size*row_size, local_ids_orginal[i] * row_size + row_size);
CHECK_GE(data_size, local_ids[i] * row_size + row_size);
CHECK_GE(local_ids[i], 0);
@@ -843,7 +844,7 @@
int64_t id_size = kv_msg->id.GetSize() / sizeof(int64_t);
int part_id = kv_msg->rank / group_count;
char* data_char = static_cast<char*>(kv_msg->data->data);
for (int64_t n = 0; n < id_size; ++n) {
memcpy(return_data + remote_ids_original[part_id][n] * row_size,
data_char + n * row_size,
row_size);
......
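The loop changes above follow a common OpenMP-portability pattern: hoist the container size into a signed variable so the parallel loop can use a signed index (MSVC's OpenMP 2.0 support requires a signed loop counter) and no signed/unsigned comparison warning is produced. In isolation, with an illustrative vector name:

const int64_t n = static_cast<int64_t>(vec.size());
#pragma omp parallel for
for (int64_t i = 0; i < n; ++i) {
  // ... loop body ...
}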