Unverified Commit 870da747 authored by Minjie Wang's avatar Minjie Wang Committed by GitHub
Browse files

[CUDA][Kernel] More CUDA kernels; Standardize the behavior for sorted COO/CSR (#1704)

* add cub; array cumsum

* CSRSliceRows

* fix warning

* operator << for ndarray; CSRSliceRows

* add CSRIsSorted

* add csr_sort

* inplace coosort and outplace csrsort

* WIP: coo is sorted

* mv cuda_utils

* add AllTrue utility

* csr sort

* coo sort

* coo2csr for sorted coo arrays

* CSRToCOO from sorted

* pass tests for the new kernel changes

* cannot use inplace sort

* lint

* try fix msvc error

* Fix g.copy_to and g.asnumbits; ToBlock no longer uses CSC

* stash

* revert some hack

* revert some changes

* address comments

* fix

* fix to_block unittest

* add todo note
parent da8632ca
......@@ -13,6 +13,10 @@
[submodule "third_party/METIS"]
path = third_party/METIS
url = https://github.com/KarypisLab/METIS.git
[submodule "third_party/cub"]
path = third_party/cub
url = https://github.com/NVlabs/cub.git
branch = 1.8.0
[submodule "third_party/phmap"]
path = third_party/phmap
url = https://github.com/greg7mdp/parallel-hashmap.git
......@@ -44,6 +44,8 @@ include_directories("third_party/METIS/include/")
include_directories("third_party/dmlc-core/include")
include_directories("third_party/minigun/minigun")
include_directories("third_party/minigun/third_party/moderngpu/src")
include_directories("third_party/cub/")
include_directories("third_party/phmap/")
# initial variables
set(DGL_LINKER_LIBS "")
......
......@@ -13,6 +13,7 @@
#include <utility>
#include <vector>
#include <tuple>
#include <string>
#include "./types.h"
namespace dgl {
......@@ -131,9 +132,18 @@ IdArray HStack(IdArray arr1, IdArray arr2);
* \tparam ValueType The type of return value.
*/
template<typename ValueType>
ValueType IndexSelect(NDArray array, uint64_t index);
ValueType IndexSelect(NDArray array, int64_t index);
/*!
* \brief Return the data under the index. In numpy notation, A[I]
*/
NDArray IndexSelect(NDArray array, IdArray index);
/*!
* \brief Return the data from `start` (inclusive) to `end` (exclusive).
*/
NDArray IndexSelect(NDArray array, int64_t start, int64_t end);
/*!
* \brief Permute the elements of an array according to given indices.
*
......@@ -238,6 +248,27 @@ std::tuple<NDArray, IdArray, IdArray> Pack(NDArray array, ValueType pad_value);
*/
std::pair<NDArray, IdArray> ConcatSlices(NDArray array, IdArray lengths);
/*!
* \brief Return the cumulative summation (or inclusive sum) of the input array.
*
* The first element out[0] is equal to the first element of the input array
 * array[0]. The remaining elements are defined recursively: out[i] = out[i-1] + array[i].
* Hence, the result array length is the same as the input array length.
*
* If prepend_zero is true, then the first element is zero and the result array
* length is the input array length plus one. This is useful for creating
* an indptr array over a count array.
*
* \param array The 1D input array.
* \return Array after cumsum.
*/
IdArray CumSum(IdArray array, bool prepend_zero = false);
/*!
* \brief Return a string that prints out some debug information.
*/
std::string ToDebugString(NDArray array);
// inline implementations
template <typename T>
IdArray VecToIdArray(const std::vector<T>& vec,
......
......@@ -116,6 +116,16 @@ struct COOMatrix {
CHECK_NO_OVERFLOW(row->dtype, num_rows);
CHECK_NO_OVERFLOW(row->dtype, num_cols);
}
/*!
 * \brief Return a copy of this matrix on the given device context.
 *
 * If the matrix already lives on the target context, no copy is made and
 * the matrix itself is returned.
 */
inline COOMatrix CopyTo(const DLContext& ctx) const {
  if (ctx == row->ctx)
    return *this;
  // A null (absent) data array stays null instead of being copied.
  return COOMatrix(num_rows, num_cols,
                   row.CopyTo(ctx), col.CopyTo(ctx),
                   aten::IsNullArray(data)? data : data.CopyTo(ctx),
                   row_sorted, col_sorted);
}
};
///////////////////////// COO routines //////////////////////////
......@@ -141,6 +151,17 @@ inline bool COOHasData(COOMatrix csr) {
return !IsNullArray(csr.data);
}
/*!
* \brief Check whether the COO is sorted.
*
 * It returns two flags: one for whether the rows are sorted;
 * the other for whether the columns within each row are sorted
 * (only meaningful when the first flag is true).
*
* Complexity: O(NNZ)
*/
std::pair<bool, bool> COOIsSorted(COOMatrix coo);
/*! \brief Get data. The return type is an ndarray due to possible duplicate entries. */
runtime::NDArray COOGetData(COOMatrix , int64_t row, int64_t col);
......@@ -161,6 +182,20 @@ COOMatrix COOTranspose(COOMatrix coo);
* the result CSR matrix stores a shuffle index for how the entries
* will be reordered in CSR. The i^th entry in the result CSR corresponds
* to the CSR.data[i] th entry in the input COO.
*
* Conversion complexity: O(nnz)
*
 * - The function first checks whether the input COO matrix is sorted
* using a linear scan.
* - If the COO matrix is row sorted, the conversion can be done very
* efficiently in a sequential scan. The result indices and data arrays
* are directly equal to the column and data arrays from the input.
* - If the COO matrix is further column sorted, the result CSR is
* also column sorted.
* - Otherwise, the conversion is more costly but still is O(nnz).
*
* \param coo Input COO matrix.
* \return CSR matrix.
*/
CSRMatrix COOToCSR(COOMatrix coo);
......@@ -195,6 +230,21 @@ bool COOHasDuplicate(COOMatrix coo);
*/
std::pair<COOMatrix, IdArray> COOCoalesce(COOMatrix coo);
/*!
* \brief Sort the indices of a COO matrix in-place.
*
 * The function sorts row indices in ascending order. If sort_column is true,
 * col indices are sorted in ascending order too. After sorting, the data array
 * of the input COOMatrix stores the shuffle index, which can be used to fetch edge data.
*
* Complexity: O(N*log(N)) time and O(1) space, where N is the number of nonzeros.
* TODO(minjie): The time complexity could be improved to O(N) by using a O(N) space.
*
* \param mat The coo matrix to sort.
* \param sort_column True if column index should be sorted too.
*/
void COOSort_(COOMatrix* mat, bool sort_column = false);
/*!
* \brief Sort the indices of a COO matrix.
*
......@@ -202,11 +252,23 @@ std::pair<COOMatrix, IdArray> COOCoalesce(COOMatrix coo);
* col indices are sorted in ascending order too. The data array of the returned COOMatrix
* stores the shuffled index which could be used to fetch edge data.
*
* Complexity: O(N*log(N)) time and O(1) space, where N is the number of nonzeros.
* TODO(minjie): The time complexity could be improved to O(N) by using a O(N) space.
*
* \param mat The input coo matrix
* \param sort_column True if column index should be sorted too.
* \return COO matrix with index sorted.
*/
COOMatrix COOSort(COOMatrix mat, bool sort_column = false);
inline COOMatrix COOSort(COOMatrix mat, bool sort_column = false) {
  // Nothing to do when the matrix already satisfies the requested order.
  const bool already_ordered =
      mat.col_sorted || (mat.row_sorted && !sort_column);
  if (already_ordered)
    return mat;
  // Sort copies of the arrays so the input matrix is left untouched;
  // a null data array is passed through as-is.
  IdArray data_copy = COOHasData(mat) ? mat.data.Clone() : mat.data;
  COOMatrix result(mat.num_rows, mat.num_cols,
                   mat.row.Clone(), mat.col.Clone(),
                   data_copy,
                   mat.row_sorted, mat.col_sorted);
  COOSort_(&result, sort_column);
  return result;
}
/*!
* \brief Remove entries from COO matrix by entry indices (data indices)
......
......@@ -106,6 +106,17 @@ struct CSRMatrix {
}
CHECK_NO_OVERFLOW(indptr->dtype, num_rows);
CHECK_NO_OVERFLOW(indptr->dtype, num_cols);
CHECK_EQ(indptr->shape[0], num_rows + 1);
}
/*!
 * \brief Return a copy of this matrix on the given device context.
 *
 * If the matrix already lives on the target context, no copy is made and
 * the matrix itself is returned.
 */
inline CSRMatrix CopyTo(const DLContext& ctx) const {
  if (ctx == indptr->ctx)
    return *this;
  // A null (absent) data array stays null instead of being copied.
  return CSRMatrix(num_rows, num_cols,
                   indptr.CopyTo(ctx), indices.CopyTo(ctx),
                   aten::IsNullArray(data)? data : data.CopyTo(ctx),
                   sorted);
}
};
......@@ -134,6 +145,9 @@ inline bool CSRHasData(CSRMatrix csr) {
return !IsNullArray(csr.data);
}
/*! \brief Whether the column indices of each row is sorted. */
bool CSRIsSorted(CSRMatrix csr);
/* \brief Get data. The return type is an ndarray due to possible duplicate entries. */
runtime::NDArray CSRGetData(CSRMatrix , int64_t row, int64_t col);
/*!
......@@ -155,6 +169,15 @@ CSRMatrix CSRTranspose(CSRMatrix csr);
/*!
* \brief Convert CSR matrix to COO matrix.
*
* Complexity: O(nnz)
*
* - If data_as_order is false, the column and data arrays of the
* result COO are equal to the indices and data arrays of the
* input CSR. The result COO is also row sorted.
* - If the input CSR is further sorted, the result COO is also
* column sorted.
*
* \param csr Input csr matrix
* \param data_as_order If true, the data array in the input csr matrix contains the order
* by which the resulting COO tuples are stored. In this case, the
......@@ -166,9 +189,8 @@ COOMatrix CSRToCOO(CSRMatrix csr, bool data_as_order);
/*!
* \brief Slice rows of the given matrix and return.
* \param csr CSR matrix
* \param start Start row id (inclusive)
* \param end End row id (exclusive)
*
* The sliced row IDs are relabeled to starting from zero.
*
* Examples:
* num_rows = 4
......@@ -182,6 +204,11 @@ COOMatrix CSRToCOO(CSRMatrix csr, bool data_as_order);
* num_cols = 4
* indptr = [0, 1, 1]
* indices = [2]
*
* \param csr CSR matrix
* \param start Start row id (inclusive)
* \param end End row id (exclusive)
* \return sliced rows stored in a CSR matrix
*/
CSRMatrix CSRSliceRows(CSRMatrix csr, int64_t start, int64_t end);
CSRMatrix CSRSliceRows(CSRMatrix csr, runtime::NDArray rows);
......@@ -192,6 +219,8 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, runtime::NDArray rows);
* In numpy notation, given matrix M, row index array I, col index array J
* This function returns the submatrix M[I, J].
*
* The sliced row and column IDs are relabeled to starting from zero.
*
* \param csr The input csr matrix
* \param rows The row index to select
* \param cols The col index to select
......@@ -203,7 +232,10 @@ CSRMatrix CSRSliceMatrix(CSRMatrix csr, runtime::NDArray rows, runtime::NDArray
bool CSRHasDuplicate(CSRMatrix csr);
/*!
* \brief Sort the column index at each row in the ascending order.
* \brief Sort the column index at each row in ascending order in-place.
*
* Only the indices and data arrays (if available) will be mutated. The indptr array
* stays the same.
*
* Examples:
* num_rows = 4
......@@ -218,6 +250,22 @@ bool CSRHasDuplicate(CSRMatrix csr);
*/
void CSRSort_(CSRMatrix* csr);
/*!
 * \brief Sort the column index at each row in ascending order.
 *
 * Return a new CSR matrix with sorted column indices and data arrays.
 */
inline CSRMatrix CSRSort(CSRMatrix csr) {
  // Already sorted: hand the input back unchanged.
  if (csr.sorted)
    return csr;
  // Clone only the arrays the in-place sort mutates (indices and data);
  // indptr is shared because CSRSort_ leaves it untouched.
  IdArray data_copy = CSRHasData(csr) ? csr.data.Clone() : csr.data;
  CSRMatrix result(csr.num_rows, csr.num_cols,
                   csr.indptr, csr.indices.Clone(),
                   data_copy,
                   csr.sorted);
  CSRSort_(&result);
  return result;
}
/*!
* \brief Reorder the rows and colmns according to the new row and column order.
* \param csr The input csr matrix.
......
......@@ -252,4 +252,8 @@
CHECK_LE((val), 0x7FFFFFFFL) << "int32 overflow for argument " << (#val) << "."; \
} while (0);
#define CHECK_IS_ID_ARRAY(VAR) \
CHECK((VAR)->ndim == 1 && (IS_INT32(VAR) || IS_INT64(VAR))) \
<< "Expected argument " << (#VAR) << " to be an 1D integer array.";
#endif // DGL_ATEN_MACRO_H_
......@@ -10,6 +10,7 @@
#include <vector>
#include <utility>
#include <algorithm>
#include <memory>
#include "./runtime/object.h"
#include "array.h"
......
......@@ -12,6 +12,7 @@
#include <utility>
#include <tuple>
#include <algorithm>
#include <memory>
#include "runtime/ndarray.h"
#include "graph_interface.h"
#include "lazy.h"
......
......@@ -8,6 +8,7 @@
#include <vector>
#include <string>
#include <memory>
#include "./runtime/object.h"
#include "graph_interface.h"
......
......@@ -11,6 +11,7 @@
#include <string>
#include <utility>
#include <vector>
#include <memory>
#include "c_runtime_api.h"
#include "dlpack/dlpack.h"
......@@ -157,6 +158,10 @@ class NDArray {
* \return The array under another context.
*/
inline NDArray CopyTo(const DLContext& ctx) const;
/*!
* \brief Return a new array with a copy of the content.
*/
inline NDArray Clone() const;
/*!
* \brief Load NDArray from stream
* \param stream The input data stream
......@@ -410,6 +415,12 @@ inline NDArray NDArray::CopyTo(const DLContext& ctx) const {
return ret;
}
// Deep copy: returns a new array with the same content on the same context.
// NOTE(review): correctness relies on CopyTo allocating a fresh buffer even
// when the destination context equals the source context -- confirm.
inline NDArray NDArray::Clone() const {
  CHECK(data_ != nullptr);  // cloning an empty handle is a programming error
  const DLTensor* dptr = operator->();
  return this->CopyTo(dptr->ctx);
}
inline int NDArray::use_count() const {
if (data_ == nullptr) return 0;
return data_->ref_counter_.load(std::memory_order_relaxed);
......@@ -627,6 +638,8 @@ dgl::runtime::NDArray operator <= (int64_t lhs, const dgl::runtime::NDArray& a2)
dgl::runtime::NDArray operator == (int64_t lhs, const dgl::runtime::NDArray& a2);
dgl::runtime::NDArray operator != (int64_t lhs, const dgl::runtime::NDArray& a2);
std::ostream& operator << (std::ostream& os, dgl::runtime::NDArray array);
///////////////// Operator overloading for DLDataType /////////////////
/*! \brief Check whether two data types are the same.*/
......
......@@ -13,6 +13,7 @@
#include <string>
#include <limits>
#include <memory>
#include <utility>
#include <type_traits>
#include "c_runtime_api.h"
#include "module.h"
......
......@@ -10,6 +10,7 @@
#include <dgl/graph_serializer.h>
#include <dmlc/io.h>
#include <dmlc/serializer.h>
#include <memory>
namespace dmlc {
namespace serializer {
......
......@@ -17,31 +17,36 @@
#include <tuple>
#include <utility>
#include <vector>
#include <memory>
#include "dmlc/logging.h"
namespace dgl {
/* StreamWithBuffer is backed up by dmlc::MemoryFixedSizeStream or
dmlc::MemoryStringStream. This class supports serializing and deserializing
NDArrays stored in shared memory. If the stream is created for
sending/recving data through network, the data pointer of the NDArray will be
transmitted directly without and copy. Otherwise, the stream is for
sending/recving data to another process on the same machine, so if an NDArray
is stored in shared memory, it will just record the shared memory name
instead of the actual data buffer.
For example:
std::string blob;
// Send to local
StreamWithBuffer strm(&blob, false);
// Send to remote
StreamWithBuffer strm(&blob, true);
// Receive from local
StreamWithBuffer strm(&blob, false);
// Receive from remote
std::vector<void*> ptr_list
StreamWithBuffer strm(&blob, ptr_list);
*/
/*!
*
* StreamWithBuffer is backed up by dmlc::MemoryFixedSizeStream or
* dmlc::MemoryStringStream. This class supports serializing and deserializing
* NDArrays stored in shared memory. If the stream is created for
* sending/recving data through network, the data pointer of the NDArray will be
 * transmitted directly without an extra copy. Otherwise, the stream is for
* sending/recving data to another process on the same machine, so if an NDArray
* is stored in shared memory, it will just record the shared memory name
* instead of the actual data buffer.
*
* For example:
*
* std::string blob;
* // Send to local
* StreamWithBuffer strm(&blob, false);
* // Send to remote
* StreamWithBuffer strm(&blob, true);
* // Receive from local
* StreamWithBuffer strm(&blob, false);
* // Receive from remote
* std::vector<void*> ptr_list
* StreamWithBuffer strm(&blob, ptr_list);
*/
class StreamWithBuffer : public dmlc::SeekStream {
public:
// Buffer type. Storing NDArray to maintain the reference counting to ensure
......
......@@ -8,6 +8,8 @@
#include <dgl/packed_func_ext.h>
#include <dgl/runtime/container.h>
#include <dgl/runtime/shared_mem.h>
#include <dgl/runtime/device_api.h>
#include <sstream>
#include "../c_api_common.h"
#include "./array_op.h"
#include "./arith.h"
......@@ -100,8 +102,10 @@ NDArray IndexSelect(NDArray array, IdArray index) {
}
template<typename ValueType>
ValueType IndexSelect(NDArray array, uint64_t index) {
ValueType IndexSelect(NDArray array, int64_t index) {
CHECK_EQ(array->ndim, 1) << "Only support select values from 1D array.";
CHECK(index >= 0 && index < array.NumElements())
<< "Index " << index << " is out of bound.";
ValueType ret = 0;
ATEN_XPU_SWITCH_CUDA(array->ctx.device_type, XPU, "IndexSelect", {
ATEN_DTYPE_SWITCH(array->dtype, DType, "values", {
......@@ -110,12 +114,30 @@ ValueType IndexSelect(NDArray array, uint64_t index) {
});
return ret;
}
template int32_t IndexSelect<int32_t>(NDArray array, uint64_t index);
template int64_t IndexSelect<int64_t>(NDArray array, uint64_t index);
template uint32_t IndexSelect<uint32_t>(NDArray array, uint64_t index);
template uint64_t IndexSelect<uint64_t>(NDArray array, uint64_t index);
template float IndexSelect<float>(NDArray array, uint64_t index);
template double IndexSelect<double>(NDArray array, uint64_t index);
template int32_t IndexSelect<int32_t>(NDArray array, int64_t index);
template int64_t IndexSelect<int64_t>(NDArray array, int64_t index);
template uint32_t IndexSelect<uint32_t>(NDArray array, int64_t index);
template uint64_t IndexSelect<uint64_t>(NDArray array, int64_t index);
template float IndexSelect<float>(NDArray array, int64_t index);
template double IndexSelect<double>(NDArray array, int64_t index);
/*!
 * \brief Return the data from `start` (inclusive) to `end` (exclusive)
 *        of a 1D array as a new array on the same context.
 *
 * Allows the empty slice (start == end), including start == end ==
 * NumElements() and slices of an empty array. The original check
 * `start < NumElements()` rejected these valid boundary cases even though
 * `end <= NumElements()` permitted them.
 */
NDArray IndexSelect(NDArray array, int64_t start, int64_t end) {
  CHECK_EQ(array->ndim, 1) << "Only support select values from 1D array.";
  // start may equal NumElements() so that an empty tail slice is valid.
  CHECK(start >= 0 && start <= array.NumElements())
    << "Index " << start << " is out of bound.";
  CHECK(end >= 0 && end <= array.NumElements())
    << "Index " << end << " is out of bound.";
  CHECK_LE(start, end);
  auto device = runtime::DeviceAPI::Get(array->ctx);
  const int64_t len = end - start;
  NDArray ret = NDArray::Empty({len}, array->dtype, array->ctx);
  if (len > 0) {
    // Byte offsets are computed from the element size of the array dtype.
    ATEN_DTYPE_SWITCH(array->dtype, DType, "values", {
      device->CopyDataFromTo(array->data, start * sizeof(DType),
                             ret->data, 0, len * sizeof(DType),
                             array->ctx, ret->ctx, array->dtype, nullptr);
    });
  }
  return ret;
}
NDArray Scatter(NDArray array, IdArray indices) {
NDArray ret;
......@@ -181,6 +203,31 @@ std::pair<NDArray, IdArray> ConcatSlices(NDArray array, IdArray lengths) {
return ret;
}
// Device (CPU/CUDA) and integer-dtype dispatcher for CumSum.
// Semantics (see the CumSum declaration docs): inclusive prefix sum; if
// prepend_zero is true the result has one extra leading zero element.
IdArray CumSum(IdArray array, bool prepend_zero) {
  IdArray ret;
  ATEN_XPU_SWITCH_CUDA(array->ctx.device_type, XPU, "CumSum", {
    ATEN_ID_TYPE_SWITCH(array->dtype, IdType, {
      ret = impl::CumSum<XPU, IdType>(array, prepend_zero);
    });
  });
  return ret;
}
// Build a short human-readable summary of an NDArray: at most the first ten
// elements, followed by "..." if truncated, plus the dtype and context.
std::string ToDebugString(NDArray array) {
  constexpr int64_t kMaxShown = 10;
  std::ostringstream oss;
  // Always print from host memory; copies the array to CPU first.
  NDArray host = array.CopyTo(DLContext{kDLCPU, 0});
  const int64_t num = host.NumElements();
  oss << "array([";
  ATEN_DTYPE_SWITCH(host->dtype, DType, "array", {
    const DType* vals = host.Ptr<DType>();
    const int64_t shown = std::min<int64_t>(num, kMaxShown);
    for (int64_t i = 0; i < shown; ++i)
      oss << vals[i] << ", ";
  });
  if (num > kMaxShown)
    oss << "...";
  oss << "], dtype=" << array->dtype << ", ctx=" << array->ctx << ")";
  return oss.str();
}
///////////////////////// CSR routines //////////////////////////
bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) {
......@@ -250,6 +297,16 @@ NDArray CSRGetRowData(CSRMatrix csr, int64_t row) {
return ret;
}
// Returns whether the column indices within each row are sorted.
bool CSRIsSorted(CSRMatrix csr) {
  // A matrix with at most one nonzero entry is trivially sorted.
  if (csr.indices->shape[0] <= 1)
    return true;
  bool ret = false;
  ATEN_CSR_SWITCH_CUDA(csr, XPU, IdType, "CSRIsSorted", {
    ret = impl::CSRIsSorted<XPU, IdType>(csr);
  });
  return ret;
}
NDArray CSRGetData(CSRMatrix csr, int64_t row, int64_t col) {
CHECK(row >= 0 && row < csr.num_rows) << "Invalid row index: " << row;
CHECK(col >= 0 && col < csr.num_cols) << "Invalid col index: " << col;
......@@ -318,7 +375,7 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, int64_t start, int64_t end) {
CHECK(end >= 0 && end <= csr.num_rows) << "Invalid end index: " << end;
CHECK_GE(end, start);
CSRMatrix ret;
ATEN_CSR_SWITCH(csr, XPU, IdType, "CSRSliceRows", {
ATEN_CSR_SWITCH_CUDA(csr, XPU, IdType, "CSRSliceRows", {
ret = impl::CSRSliceRows<XPU, IdType>(csr, start, end);
});
return ret;
......@@ -328,7 +385,7 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) {
CHECK_SAME_DTYPE(csr.indices, rows);
CHECK_SAME_CONTEXT(csr.indices, rows);
CSRMatrix ret;
ATEN_CSR_SWITCH(csr, XPU, IdType, "CSRSliceRows", {
ATEN_CSR_SWITCH_CUDA(csr, XPU, IdType, "CSRSliceRows", {
ret = impl::CSRSliceRows<XPU, IdType>(csr, rows);
});
return ret;
......@@ -347,7 +404,9 @@ CSRMatrix CSRSliceMatrix(CSRMatrix csr, NDArray rows, NDArray cols) {
}
void CSRSort_(CSRMatrix* csr) {
ATEN_CSR_SWITCH(*csr, XPU, IdType, "CSRSort_", {
if (csr->sorted)
return;
ATEN_CSR_SWITCH_CUDA(*csr, XPU, IdType, "CSRSort_", {
impl::CSRSort_<XPU, IdType>(csr);
});
}
......@@ -509,13 +568,23 @@ COOMatrix COOSliceMatrix(COOMatrix coo, NDArray rows, NDArray cols) {
return ret;
}
COOMatrix COOSort(COOMatrix mat, bool sort_column) {
COOMatrix ret;
ATEN_XPU_SWITCH_CUDA(mat.row->ctx.device_type, XPU, "COOSort", {
ATEN_ID_TYPE_SWITCH(mat.row->dtype, IdType, {
ret = impl::COOSort<XPU, IdType>(mat, sort_column);
// In-place sort of a COO matrix; see the COOSort_ declaration for semantics.
void COOSort_(COOMatrix* mat, bool sort_column) {
  // Skip when the matrix already satisfies the requested order.
  // NOTE(review): the col_sorted shortcut assumes col_sorted implies
  // row_sorted as a struct invariant -- confirm.
  if ((mat->row_sorted && !sort_column) || mat->col_sorted)
    return;
  ATEN_XPU_SWITCH_CUDA(mat->row->ctx.device_type, XPU, "COOSort_", {
    ATEN_ID_TYPE_SWITCH(mat->row->dtype, IdType, {
      impl::COOSort_<XPU, IdType>(mat, sort_column);
    });
  });
}
// Returns a pair of flags {rows sorted, columns sorted within rows}.
std::pair<bool, bool> COOIsSorted(COOMatrix coo) {
  // A matrix with at most one entry is trivially sorted in both senses.
  if (coo.row->shape[0] <= 1)
    return {true, true};
  std::pair<bool, bool> ret;
  ATEN_COO_SWITCH_CUDA(coo, XPU, IdType, "COOIsSorted", {
    ret = impl::COOIsSorted<XPU, IdType>(coo);
  });
  return ret;
}
......@@ -709,3 +778,7 @@ DGL_REGISTER_GLOBAL("ndarray._CAPI_DGLExistSharedMemArray")
} // namespace aten
} // namespace dgl
// Stream insertion for NDArray; delegates formatting to the debug helper.
std::ostream& operator << (std::ostream& os, dgl::runtime::NDArray array) {
  os << dgl::aten::ToDebugString(array);
  return os;
}
......@@ -3,8 +3,8 @@
* \file array/array_aritch.cc
* \brief DGL array arithmetic operations
*/
#include <dgl/array.h>
#include <dgl/packed_func_ext.h>
#include <dgl/runtime/ndarray.h>
#include <dgl/runtime/container.h>
#include "../c_api_common.h"
#include "./array_op.h"
......
......@@ -44,7 +44,7 @@ template <DLDeviceType XPU, typename DType, typename IdType>
NDArray IndexSelect(NDArray array, IdArray index);
template <DLDeviceType XPU, typename DType>
DType IndexSelect(NDArray array, uint64_t index);
DType IndexSelect(NDArray array, int64_t index);
template <DLDeviceType XPU, typename DType, typename IdType>
NDArray Scatter(NDArray array, IdArray indices);
......@@ -61,6 +61,9 @@ std::tuple<NDArray, IdArray, IdArray> Pack(NDArray array, DType pad_value);
template <DLDeviceType XPU, typename DType, typename IdType>
std::pair<NDArray, IdArray> ConcatSlices(NDArray array, IdArray lengths);
template <DLDeviceType XPU, typename IdType>
IdArray CumSum(IdArray array, bool prepend_zero);
// sparse arrays
template <DLDeviceType XPU, typename IdType>
......@@ -84,6 +87,9 @@ runtime::NDArray CSRGetRowColumnIndices(CSRMatrix csr, int64_t row);
template <DLDeviceType XPU, typename IdType>
runtime::NDArray CSRGetRowData(CSRMatrix csr, int64_t row);
template <DLDeviceType XPU, typename IdType>
bool CSRIsSorted(CSRMatrix csr);
template <DLDeviceType XPU, typename IdType>
runtime::NDArray CSRGetData(CSRMatrix csr, int64_t row, int64_t col);
......@@ -187,7 +193,10 @@ template <DLDeviceType XPU, typename IdType>
std::pair<COOMatrix, IdArray> COOCoalesce(COOMatrix coo);
template <DLDeviceType XPU, typename IdType>
COOMatrix COOSort(COOMatrix mat, bool sort_column);
void COOSort_(COOMatrix* mat, bool sort_column);
template <DLDeviceType XPU, typename IdType>
std::pair<bool, bool> COOIsSorted(COOMatrix coo);
template <DLDeviceType XPU, typename IdType>
COOMatrix COORemove(COOMatrix coo, IdArray entries);
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cpu/array_cumsum.cc
* \brief Array cumsum CPU implementation
*/
#include <dgl/array.h>
namespace dgl {
using runtime::NDArray;
namespace aten {
namespace impl {
template <DLDeviceType XPU, typename IdType>
IdArray CumSum(IdArray array, bool prepend_zero) {
  const int64_t len = array.NumElements();
  // An empty input without the prepended zero yields an empty result;
  // reuse the input array directly.
  //
  // When prepend_zero is true we must NOT early-return: the documented
  // contract is a result of length len + 1 with a leading zero, so an
  // empty input yields [0] (e.g. the indptr of an empty count array).
  // The previous code returned the empty input in that case.
  if (len == 0 && !prepend_zero)
    return array;
  const IdType* in_d = array.Ptr<IdType>();
  if (prepend_zero) {
    // out[0] = 0; out[i + 1] = out[i] + in[i]  (length len + 1)
    IdArray ret = aten::NewIdArray(len + 1, array->ctx, array->dtype.bits);
    IdType* out_d = ret.Ptr<IdType>();
    out_d[0] = 0;
    for (int64_t i = 0; i < len; ++i)
      out_d[i + 1] = out_d[i] + in_d[i];
    return ret;
  } else {
    // out[0] = in[0]; out[i] = out[i - 1] + in[i]  (length len)
    IdArray ret = aten::NewIdArray(len, array->ctx, array->dtype.bits);
    IdType* out_d = ret.Ptr<IdType>();
    out_d[0] = in_d[0];
    for (int64_t i = 1; i < len; ++i)
      out_d[i] = out_d[i - 1] + in_d[i];
    return ret;
  }
}
template IdArray CumSum<kDLCPU, int32_t>(IdArray, bool);
template IdArray CumSum<kDLCPU, int64_t>(IdArray, bool);
} // namespace impl
} // namespace aten
} // namespace dgl
......@@ -35,20 +35,16 @@ template NDArray IndexSelect<kDLCPU, double, int32_t>(NDArray, IdArray);
template NDArray IndexSelect<kDLCPU, double, int64_t>(NDArray, IdArray);
template <DLDeviceType XPU, typename DType>
DType IndexSelect(NDArray array, uint64_t index) {
DType IndexSelect(NDArray array, int64_t index) {
const DType* data = static_cast<DType*>(array->data);
return data[index];
}
template int32_t IndexSelect<kDLCPU, int32_t>(NDArray array, uint64_t index);
template int64_t IndexSelect<kDLCPU, int64_t>(NDArray array, uint64_t index);
template uint32_t IndexSelect<kDLCPU, uint32_t>(NDArray array, uint64_t index);
template uint64_t IndexSelect<kDLCPU, uint64_t>(NDArray array, uint64_t index);
template float IndexSelect<kDLCPU, float>(NDArray array, uint64_t index);
template double IndexSelect<kDLCPU, double>(NDArray array, uint64_t index);
template int32_t IndexSelect<kDLCPU, int32_t>(NDArray array, int64_t index);
template int64_t IndexSelect<kDLCPU, int64_t>(NDArray array, int64_t index);
template float IndexSelect<kDLCPU, float>(NDArray array, int64_t index);
template double IndexSelect<kDLCPU, double>(NDArray array, int64_t index);
}; // namespace impl
}; // namespace aten
}; // namespace dgl
} // namespace impl
} // namespace aten
} // namespace dgl
......@@ -76,8 +76,6 @@ template std::tuple<NDArray, IdArray, IdArray> Pack<kDLCPU, int64_t>(NDArray, in
template std::tuple<NDArray, IdArray, IdArray> Pack<kDLCPU, float>(NDArray, float);
template std::tuple<NDArray, IdArray, IdArray> Pack<kDLCPU, double>(NDArray, double);
}; // namespace impl
}; // namespace aten
}; // namespace dgl
} // namespace impl
} // namespace aten
} // namespace dgl
......@@ -6,12 +6,12 @@
#ifndef DGL_ARRAY_CPU_ARRAY_UTILS_H_
#define DGL_ARRAY_CPU_ARRAY_UTILS_H_
#include <dgl/array.h>
#include <dgl/aten/types.h>
#include <parallel_hashmap/phmap.h>
#include <vector>
#include <unordered_map>
#include <utility>
#include "../../c_api_common.h"
#include "../third_party/phmap/parallel_hashmap/phmap.h"
namespace dgl {
namespace aten {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment