Unverified Commit 870da747 authored by Minjie Wang's avatar Minjie Wang Committed by GitHub
Browse files

[CUDA][Kernel] More CUDA kernels; Standardize the behavior for sorted COO/CSR (#1704)

* add cub; array cumsum

* CSRSliceRows

* fix warning

* operator << for ndarray; CSRSliceRows

* add CSRIsSorted

* add csr_sort

* inplace coosort and outplace csrsort

* WIP: coo is sorted

* mv cuda_utils

* add AllTrue utility

* csr sort

* coo sort

* coo2csr for sorted coo arrays

* CSRToCOO from sorted

* pass tests for the new kernel changes

* cannot use inplace sort

* lint

* try fix msvc error

* Fix g.copy_to and g.asnumbits; ToBlock no longer uses CSC

* stash

* revert some hack

* revert some changes

* address comments

* fix

* fix to_block unittest

* add todo note
parent da8632ca
...@@ -13,6 +13,10 @@ ...@@ -13,6 +13,10 @@
[submodule "third_party/METIS"] [submodule "third_party/METIS"]
path = third_party/METIS path = third_party/METIS
url = https://github.com/KarypisLab/METIS.git url = https://github.com/KarypisLab/METIS.git
[submodule "third_party/cub"]
path = third_party/cub
url = https://github.com/NVlabs/cub.git
branch = 1.8.0
[submodule "third_party/phmap"] [submodule "third_party/phmap"]
path = third_party/phmap path = third_party/phmap
url = https://github.com/greg7mdp/parallel-hashmap.git url = https://github.com/greg7mdp/parallel-hashmap.git
...@@ -44,6 +44,8 @@ include_directories("third_party/METIS/include/") ...@@ -44,6 +44,8 @@ include_directories("third_party/METIS/include/")
include_directories("third_party/dmlc-core/include") include_directories("third_party/dmlc-core/include")
include_directories("third_party/minigun/minigun") include_directories("third_party/minigun/minigun")
include_directories("third_party/minigun/third_party/moderngpu/src") include_directories("third_party/minigun/third_party/moderngpu/src")
include_directories("third_party/cub/")
include_directories("third_party/phmap/")
# initial variables # initial variables
set(DGL_LINKER_LIBS "") set(DGL_LINKER_LIBS "")
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
#include <utility> #include <utility>
#include <vector> #include <vector>
#include <tuple> #include <tuple>
#include <string>
#include "./types.h" #include "./types.h"
namespace dgl { namespace dgl {
...@@ -131,9 +132,18 @@ IdArray HStack(IdArray arr1, IdArray arr2); ...@@ -131,9 +132,18 @@ IdArray HStack(IdArray arr1, IdArray arr2);
* \tparam ValueType The type of return value. * \tparam ValueType The type of return value.
*/ */
template<typename ValueType> template<typename ValueType>
ValueType IndexSelect(NDArray array, uint64_t index); ValueType IndexSelect(NDArray array, int64_t index);
/*!
* \brief Return the data under the index. In numpy notation, A[I]
*/
NDArray IndexSelect(NDArray array, IdArray index); NDArray IndexSelect(NDArray array, IdArray index);
/*!
* \brief Return the data from `start` (inclusive) to `end` (exclusive).
*/
NDArray IndexSelect(NDArray array, int64_t start, int64_t end);
/*! /*!
* \brief Permute the elements of an array according to given indices. * \brief Permute the elements of an array according to given indices.
* *
...@@ -238,6 +248,27 @@ std::tuple<NDArray, IdArray, IdArray> Pack(NDArray array, ValueType pad_value); ...@@ -238,6 +248,27 @@ std::tuple<NDArray, IdArray, IdArray> Pack(NDArray array, ValueType pad_value);
*/ */
std::pair<NDArray, IdArray> ConcatSlices(NDArray array, IdArray lengths); std::pair<NDArray, IdArray> ConcatSlices(NDArray array, IdArray lengths);
/*!
* \brief Return the cumulative summation (or inclusive sum) of the input array.
*
* The first element out[0] is equal to the first element of the input array
* array[0]. The rest elements are defined recursively, out[i] = out[i-1] + array[i].
* Hence, the result array length is the same as the input array length.
*
* If prepend_zero is true, then the first element is zero and the result array
* length is the input array length plus one. This is useful for creating
* an indptr array over a count array.
*
* \param array The 1D input array.
* \param prepend_zero If true, put a leading zero in front of the running sums
*                     (the result is then one element longer than the input).
*                     Defaults to false.
* \return Array after cumsum.
*/
IdArray CumSum(IdArray array, bool prepend_zero = false);
/*!
* \brief Return a string that prints out some debug information.
*/
std::string ToDebugString(NDArray array);
// inline implementations // inline implementations
template <typename T> template <typename T>
IdArray VecToIdArray(const std::vector<T>& vec, IdArray VecToIdArray(const std::vector<T>& vec,
......
...@@ -116,6 +116,16 @@ struct COOMatrix { ...@@ -116,6 +116,16 @@ struct COOMatrix {
CHECK_NO_OVERFLOW(row->dtype, num_rows); CHECK_NO_OVERFLOW(row->dtype, num_rows);
CHECK_NO_OVERFLOW(row->dtype, num_cols); CHECK_NO_OVERFLOW(row->dtype, num_cols);
} }
/*!
 * \brief Return a copy of this matrix on the given device context.
 *
 * If the row array already lives on \a ctx the matrix itself is returned
 * (no arrays are copied). Otherwise the row and col arrays are copied to
 * \a ctx; the data array is copied as well unless it is a null array, in
 * which case it is passed through unchanged. The sorted flags are preserved.
 *
 * \param ctx The target device context.
 * \return The matrix on the target context.
 */
inline COOMatrix CopyTo(const DLContext& ctx) const {
  const bool already_on_ctx = (ctx == row->ctx);
  if (already_on_ctx) {
    return *this;
  }
  // A null data array has nothing to transfer, so keep it as is.
  auto moved_data = data;
  if (!aten::IsNullArray(data)) {
    moved_data = data.CopyTo(ctx);
  }
  return COOMatrix(num_rows, num_cols,
                   row.CopyTo(ctx), col.CopyTo(ctx),
                   moved_data,
                   row_sorted, col_sorted);
}
}; };
///////////////////////// COO routines ////////////////////////// ///////////////////////// COO routines //////////////////////////
...@@ -141,6 +151,17 @@ inline bool COOHasData(COOMatrix csr) { ...@@ -141,6 +151,17 @@ inline bool COOHasData(COOMatrix csr) {
return !IsNullArray(csr.data); return !IsNullArray(csr.data);
} }
/*!
* \brief Check whether the COO is sorted.
*
* It returns two flags: one for whether the row is sorted;
* the other for whether the columns of each row are sorted
* if the first flag is true.
*
* Complexity: O(NNZ)
*/
std::pair<bool, bool> COOIsSorted(COOMatrix coo);
/*! \brief Get data. The return type is an ndarray due to possible duplicate entries. */ /*! \brief Get data. The return type is an ndarray due to possible duplicate entries. */
runtime::NDArray COOGetData(COOMatrix , int64_t row, int64_t col); runtime::NDArray COOGetData(COOMatrix , int64_t row, int64_t col);
...@@ -161,6 +182,20 @@ COOMatrix COOTranspose(COOMatrix coo); ...@@ -161,6 +182,20 @@ COOMatrix COOTranspose(COOMatrix coo);
* the result CSR matrix stores a shuffle index for how the entries * the result CSR matrix stores a shuffle index for how the entries
* will be reordered in CSR. The i^th entry in the result CSR corresponds * will be reordered in CSR. The i^th entry in the result CSR corresponds
* to the CSR.data[i] th entry in the input COO. * to the CSR.data[i] th entry in the input COO.
*
* Conversion complexity: O(nnz)
*
* - The function first checks whether the input COO matrix is sorted
* using a linear scan.
* - If the COO matrix is row sorted, the conversion can be done very
* efficiently in a sequential scan. The result indices and data arrays
* are directly equal to the column and data arrays from the input.
* - If the COO matrix is further column sorted, the result CSR is
* also column sorted.
* - Otherwise, the conversion is more costly but still is O(nnz).
*
* \param coo Input COO matrix.
* \return CSR matrix.
*/ */
CSRMatrix COOToCSR(COOMatrix coo); CSRMatrix COOToCSR(COOMatrix coo);
...@@ -195,6 +230,21 @@ bool COOHasDuplicate(COOMatrix coo); ...@@ -195,6 +230,21 @@ bool COOHasDuplicate(COOMatrix coo);
*/ */
std::pair<COOMatrix, IdArray> COOCoalesce(COOMatrix coo); std::pair<COOMatrix, IdArray> COOCoalesce(COOMatrix coo);
/*!
* \brief Sort the indices of a COO matrix in-place.
*
* The function sorts row indices in ascending order. If sort_column is true,
* col indices are sorted in ascending order too. The data array of the returned COOMatrix
* stores the shuffled index which could be used to fetch edge data.
*
* Complexity: O(N*log(N)) time and O(1) space, where N is the number of nonzeros.
* TODO(minjie): The time complexity could be improved to O(N) by using a O(N) space.
*
* \param mat The coo matrix to sort.
* \param sort_column True if column index should be sorted too.
*/
void COOSort_(COOMatrix* mat, bool sort_column = false);
/*! /*!
* \brief Sort the indices of a COO matrix. * \brief Sort the indices of a COO matrix.
* *
...@@ -202,11 +252,23 @@ std::pair<COOMatrix, IdArray> COOCoalesce(COOMatrix coo); ...@@ -202,11 +252,23 @@ std::pair<COOMatrix, IdArray> COOCoalesce(COOMatrix coo);
* col indices are sorted in ascending order too. The data array of the returned COOMatrix * col indices are sorted in ascending order too. The data array of the returned COOMatrix
* stores the shuffled index which could be used to fetch edge data. * stores the shuffled index which could be used to fetch edge data.
* *
* Complexity: O(N*log(N)) time and O(1) space, where N is the number of nonzeros.
* TODO(minjie): The time complexity could be improved to O(N) by using a O(N) space.
*
* \param mat The input coo matrix * \param mat The input coo matrix
* \param sort_column True if column index should be sorted too. * \param sort_column True if column index should be sorted too.
* \return COO matrix with index sorted. * \return COO matrix with index sorted.
*/ */
COOMatrix COOSort(COOMatrix mat, bool sort_column = false); inline COOMatrix COOSort(COOMatrix mat, bool sort_column = false) {
if ((mat.row_sorted && !sort_column) || mat.col_sorted)
return mat;
COOMatrix ret(mat.num_rows, mat.num_cols,
mat.row.Clone(), mat.col.Clone(),
COOHasData(mat)? mat.data.Clone() : mat.data,
mat.row_sorted, mat.col_sorted);
COOSort_(&ret, sort_column);
return ret;
}
/*! /*!
* \brief Remove entries from COO matrix by entry indices (data indices) * \brief Remove entries from COO matrix by entry indices (data indices)
......
...@@ -106,6 +106,17 @@ struct CSRMatrix { ...@@ -106,6 +106,17 @@ struct CSRMatrix {
} }
CHECK_NO_OVERFLOW(indptr->dtype, num_rows); CHECK_NO_OVERFLOW(indptr->dtype, num_rows);
CHECK_NO_OVERFLOW(indptr->dtype, num_cols); CHECK_NO_OVERFLOW(indptr->dtype, num_cols);
CHECK_EQ(indptr->shape[0], num_rows + 1);
}
/*! \brief Return a copy of this matrix on the given device context. */
inline CSRMatrix CopyTo(const DLContext& ctx) const {
if (ctx == indptr->ctx)
return *this;
return CSRMatrix(num_rows, num_cols,
indptr.CopyTo(ctx), indices.CopyTo(ctx),
aten::IsNullArray(data)? data : data.CopyTo(ctx),
sorted);
} }
}; };
...@@ -134,6 +145,9 @@ inline bool CSRHasData(CSRMatrix csr) { ...@@ -134,6 +145,9 @@ inline bool CSRHasData(CSRMatrix csr) {
return !IsNullArray(csr.data); return !IsNullArray(csr.data);
} }
/*! \brief Whether the column indices of each row are sorted. */
bool CSRIsSorted(CSRMatrix csr);
/* \brief Get data. The return type is an ndarray due to possible duplicate entries. */ /* \brief Get data. The return type is an ndarray due to possible duplicate entries. */
runtime::NDArray CSRGetData(CSRMatrix , int64_t row, int64_t col); runtime::NDArray CSRGetData(CSRMatrix , int64_t row, int64_t col);
/*! /*!
...@@ -155,6 +169,15 @@ CSRMatrix CSRTranspose(CSRMatrix csr); ...@@ -155,6 +169,15 @@ CSRMatrix CSRTranspose(CSRMatrix csr);
/*! /*!
* \brief Convert CSR matrix to COO matrix. * \brief Convert CSR matrix to COO matrix.
*
* Complexity: O(nnz)
*
* - If data_as_order is false, the column and data arrays of the
* result COO are equal to the indices and data arrays of the
* input CSR. The result COO is also row sorted.
* - If the input CSR is further sorted, the result COO is also
* column sorted.
*
* \param csr Input csr matrix * \param csr Input csr matrix
* \param data_as_order If true, the data array in the input csr matrix contains the order * \param data_as_order If true, the data array in the input csr matrix contains the order
* by which the resulting COO tuples are stored. In this case, the * by which the resulting COO tuples are stored. In this case, the
...@@ -166,9 +189,8 @@ COOMatrix CSRToCOO(CSRMatrix csr, bool data_as_order); ...@@ -166,9 +189,8 @@ COOMatrix CSRToCOO(CSRMatrix csr, bool data_as_order);
/*! /*!
* \brief Slice rows of the given matrix and return. * \brief Slice rows of the given matrix and return.
* \param csr CSR matrix *
* \param start Start row id (inclusive) * The sliced row IDs are relabeled to starting from zero.
* \param end End row id (exclusive)
* *
* Examples: * Examples:
* num_rows = 4 * num_rows = 4
...@@ -182,6 +204,11 @@ COOMatrix CSRToCOO(CSRMatrix csr, bool data_as_order); ...@@ -182,6 +204,11 @@ COOMatrix CSRToCOO(CSRMatrix csr, bool data_as_order);
* num_cols = 4 * num_cols = 4
* indptr = [0, 1, 1] * indptr = [0, 1, 1]
* indices = [2] * indices = [2]
*
* \param csr CSR matrix
* \param start Start row id (inclusive)
* \param end End row id (exclusive)
* \return sliced rows stored in a CSR matrix
*/ */
CSRMatrix CSRSliceRows(CSRMatrix csr, int64_t start, int64_t end); CSRMatrix CSRSliceRows(CSRMatrix csr, int64_t start, int64_t end);
CSRMatrix CSRSliceRows(CSRMatrix csr, runtime::NDArray rows); CSRMatrix CSRSliceRows(CSRMatrix csr, runtime::NDArray rows);
...@@ -192,6 +219,8 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, runtime::NDArray rows); ...@@ -192,6 +219,8 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, runtime::NDArray rows);
* In numpy notation, given matrix M, row index array I, col index array J * In numpy notation, given matrix M, row index array I, col index array J
* This function returns the submatrix M[I, J]. * This function returns the submatrix M[I, J].
* *
* The sliced row and column IDs are relabeled to starting from zero.
*
* \param csr The input csr matrix * \param csr The input csr matrix
* \param rows The row index to select * \param rows The row index to select
* \param cols The col index to select * \param cols The col index to select
...@@ -203,7 +232,10 @@ CSRMatrix CSRSliceMatrix(CSRMatrix csr, runtime::NDArray rows, runtime::NDArray ...@@ -203,7 +232,10 @@ CSRMatrix CSRSliceMatrix(CSRMatrix csr, runtime::NDArray rows, runtime::NDArray
bool CSRHasDuplicate(CSRMatrix csr); bool CSRHasDuplicate(CSRMatrix csr);
/*! /*!
* \brief Sort the column index at each row in the ascending order. * \brief Sort the column index at each row in ascending order in-place.
*
* Only the indices and data arrays (if available) will be mutated. The indptr array
* stays the same.
* *
* Examples: * Examples:
* num_rows = 4 * num_rows = 4
...@@ -218,6 +250,22 @@ bool CSRHasDuplicate(CSRMatrix csr); ...@@ -218,6 +250,22 @@ bool CSRHasDuplicate(CSRMatrix csr);
*/ */
void CSRSort_(CSRMatrix* csr); void CSRSort_(CSRMatrix* csr);
/*!
 * \brief Out-of-place variant of CSRSort_: sort the column index of each row
 *        in ascending order.
 *
 * If the input is already flagged as sorted it is returned unchanged.
 * Otherwise a new matrix is built that shares the indptr array with the
 * input but owns cloned indices (and cloned data, when the input has data),
 * and that new matrix is sorted in place with CSRSort_.
 *
 * \param csr The input csr matrix.
 * \return A CSR matrix with sorted column indices and data arrays.
 */
inline CSRMatrix CSRSort(CSRMatrix csr) {
  if (!csr.sorted) {
    // Only clone the data array when there is one; otherwise pass it through.
    auto data_copy = csr.data;
    if (CSRHasData(csr)) {
      data_copy = csr.data.Clone();
    }
    CSRMatrix result(csr.num_rows, csr.num_cols,
                     csr.indptr, csr.indices.Clone(),
                     data_copy,
                     csr.sorted);
    CSRSort_(&result);
    return result;
  }
  return csr;
}
/*! /*!
* \brief Reorder the rows and colmns according to the new row and column order. * \brief Reorder the rows and colmns according to the new row and column order.
* \param csr The input csr matrix. * \param csr The input csr matrix.
......
...@@ -252,4 +252,8 @@ ...@@ -252,4 +252,8 @@
CHECK_LE((val), 0x7FFFFFFFL) << "int32 overflow for argument " << (#val) << "."; \ CHECK_LE((val), 0x7FFFFFFFL) << "int32 overflow for argument " << (#val) << "."; \
} while (0); } while (0);
/*
 * Assert that VAR is a one-dimensional array whose dtype satisfies IS_INT32
 * or IS_INT64; on failure the message names the offending argument via #VAR.
 * NOTE(review): the expansion already ends with ';', so a use site written as
 * `CHECK_IS_ID_ARRAY(x);` produces an extra empty statement.
 */
#define CHECK_IS_ID_ARRAY(VAR) \
CHECK((VAR)->ndim == 1 && (IS_INT32(VAR) || IS_INT64(VAR))) \
<< "Expected argument " << (#VAR) << " to be an 1D integer array.";
#endif // DGL_ATEN_MACRO_H_ #endif // DGL_ATEN_MACRO_H_
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
#include <vector> #include <vector>
#include <utility> #include <utility>
#include <algorithm> #include <algorithm>
#include <memory>
#include "./runtime/object.h" #include "./runtime/object.h"
#include "array.h" #include "array.h"
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
#include <utility> #include <utility>
#include <tuple> #include <tuple>
#include <algorithm> #include <algorithm>
#include <memory>
#include "runtime/ndarray.h" #include "runtime/ndarray.h"
#include "graph_interface.h" #include "graph_interface.h"
#include "lazy.h" #include "lazy.h"
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
#include <vector> #include <vector>
#include <string> #include <string>
#include <memory>
#include "./runtime/object.h" #include "./runtime/object.h"
#include "graph_interface.h" #include "graph_interface.h"
......
...@@ -11,6 +11,7 @@ ...@@ -11,6 +11,7 @@
#include <string> #include <string>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include <memory>
#include "c_runtime_api.h" #include "c_runtime_api.h"
#include "dlpack/dlpack.h" #include "dlpack/dlpack.h"
...@@ -157,6 +158,10 @@ class NDArray { ...@@ -157,6 +158,10 @@ class NDArray {
* \return The array under another context. * \return The array under another context.
*/ */
inline NDArray CopyTo(const DLContext& ctx) const; inline NDArray CopyTo(const DLContext& ctx) const;
/*!
* \brief Return a new array with a copy of the content.
*/
inline NDArray Clone() const;
/*! /*!
* \brief Load NDArray from stream * \brief Load NDArray from stream
* \param stream The input data stream * \param stream The input data stream
...@@ -410,6 +415,12 @@ inline NDArray NDArray::CopyTo(const DLContext& ctx) const { ...@@ -410,6 +415,12 @@ inline NDArray NDArray::CopyTo(const DLContext& ctx) const {
return ret; return ret;
} }
/*!
 * \brief Return a new array holding a copy of this array's content.
 *
 * Implemented as a CopyTo onto the array's own context. The array must be
 * defined (non-null data_), which is enforced with a CHECK.
 */
inline NDArray NDArray::Clone() const {
  CHECK(data_ != nullptr);
  // Copy onto the very context this array already resides on.
  return CopyTo((*this)->ctx);
}
inline int NDArray::use_count() const { inline int NDArray::use_count() const {
if (data_ == nullptr) return 0; if (data_ == nullptr) return 0;
return data_->ref_counter_.load(std::memory_order_relaxed); return data_->ref_counter_.load(std::memory_order_relaxed);
...@@ -627,6 +638,8 @@ dgl::runtime::NDArray operator <= (int64_t lhs, const dgl::runtime::NDArray& a2) ...@@ -627,6 +638,8 @@ dgl::runtime::NDArray operator <= (int64_t lhs, const dgl::runtime::NDArray& a2)
dgl::runtime::NDArray operator == (int64_t lhs, const dgl::runtime::NDArray& a2); dgl::runtime::NDArray operator == (int64_t lhs, const dgl::runtime::NDArray& a2);
dgl::runtime::NDArray operator != (int64_t lhs, const dgl::runtime::NDArray& a2); dgl::runtime::NDArray operator != (int64_t lhs, const dgl::runtime::NDArray& a2);
std::ostream& operator << (std::ostream& os, dgl::runtime::NDArray array);
///////////////// Operator overloading for DLDataType ///////////////// ///////////////// Operator overloading for DLDataType /////////////////
/*! \brief Check whether two data types are the same.*/ /*! \brief Check whether two data types are the same.*/
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
#include <string> #include <string>
#include <limits> #include <limits>
#include <memory> #include <memory>
#include <utility>
#include <type_traits> #include <type_traits>
#include "c_runtime_api.h" #include "c_runtime_api.h"
#include "module.h" #include "module.h"
......
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
#include <dgl/graph_serializer.h> #include <dgl/graph_serializer.h>
#include <dmlc/io.h> #include <dmlc/io.h>
#include <dmlc/serializer.h> #include <dmlc/serializer.h>
#include <memory>
namespace dmlc { namespace dmlc {
namespace serializer { namespace serializer {
......
...@@ -17,31 +17,36 @@ ...@@ -17,31 +17,36 @@
#include <tuple> #include <tuple>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include <memory>
#include "dmlc/logging.h" #include "dmlc/logging.h"
namespace dgl { namespace dgl {
/* StreamWithBuffer is backed up by dmlc::MemoryFixedSizeStream or /*!
dmlc::MemoryStringStream. This class supports serializing and deserializing *
NDArrays stored in shared memory. If the stream is created for * StreamWithBuffer is backed up by dmlc::MemoryFixedSizeStream or
sending/recving data through network, the data pointer of the NDArray will be * dmlc::MemoryStringStream. This class supports serializing and deserializing
transmitted directly without and copy. Otherwise, the stream is for * NDArrays stored in shared memory. If the stream is created for
sending/recving data to another process on the same machine, so if an NDArray * sending/recving data through network, the data pointer of the NDArray will be
is stored in shared memory, it will just record the shared memory name * transmitted directly without and copy. Otherwise, the stream is for
instead of the actual data buffer. * sending/recving data to another process on the same machine, so if an NDArray
For example: * is stored in shared memory, it will just record the shared memory name
std::string blob; * instead of the actual data buffer.
// Send to local *
StreamWithBuffer strm(&blob, false); * For example:
// Send to remote *
StreamWithBuffer strm(&blob, true); * std::string blob;
// Receive from local * // Send to local
StreamWithBuffer strm(&blob, false); * StreamWithBuffer strm(&blob, false);
// Receive from remote * // Send to remote
std::vector<void*> ptr_list * StreamWithBuffer strm(&blob, true);
StreamWithBuffer strm(&blob, ptr_list); * // Receive from local
*/ * StreamWithBuffer strm(&blob, false);
* // Receive from remote
* std::vector<void*> ptr_list
* StreamWithBuffer strm(&blob, ptr_list);
*/
class StreamWithBuffer : public dmlc::SeekStream { class StreamWithBuffer : public dmlc::SeekStream {
public: public:
// Buffer type. Storing NDArray to maintain the reference counting to ensure // Buffer type. Storing NDArray to maintain the reference counting to ensure
......
...@@ -8,6 +8,8 @@ ...@@ -8,6 +8,8 @@
#include <dgl/packed_func_ext.h> #include <dgl/packed_func_ext.h>
#include <dgl/runtime/container.h> #include <dgl/runtime/container.h>
#include <dgl/runtime/shared_mem.h> #include <dgl/runtime/shared_mem.h>
#include <dgl/runtime/device_api.h>
#include <sstream>
#include "../c_api_common.h" #include "../c_api_common.h"
#include "./array_op.h" #include "./array_op.h"
#include "./arith.h" #include "./arith.h"
...@@ -100,8 +102,10 @@ NDArray IndexSelect(NDArray array, IdArray index) { ...@@ -100,8 +102,10 @@ NDArray IndexSelect(NDArray array, IdArray index) {
} }
template<typename ValueType> template<typename ValueType>
ValueType IndexSelect(NDArray array, uint64_t index) { ValueType IndexSelect(NDArray array, int64_t index) {
CHECK_EQ(array->ndim, 1) << "Only support select values from 1D array."; CHECK_EQ(array->ndim, 1) << "Only support select values from 1D array.";
CHECK(index >= 0 && index < array.NumElements())
<< "Index " << index << " is out of bound.";
ValueType ret = 0; ValueType ret = 0;
ATEN_XPU_SWITCH_CUDA(array->ctx.device_type, XPU, "IndexSelect", { ATEN_XPU_SWITCH_CUDA(array->ctx.device_type, XPU, "IndexSelect", {
ATEN_DTYPE_SWITCH(array->dtype, DType, "values", { ATEN_DTYPE_SWITCH(array->dtype, DType, "values", {
...@@ -110,12 +114,30 @@ ValueType IndexSelect(NDArray array, uint64_t index) { ...@@ -110,12 +114,30 @@ ValueType IndexSelect(NDArray array, uint64_t index) {
}); });
return ret; return ret;
} }
template int32_t IndexSelect<int32_t>(NDArray array, uint64_t index); template int32_t IndexSelect<int32_t>(NDArray array, int64_t index);
template int64_t IndexSelect<int64_t>(NDArray array, uint64_t index); template int64_t IndexSelect<int64_t>(NDArray array, int64_t index);
template uint32_t IndexSelect<uint32_t>(NDArray array, uint64_t index); template uint32_t IndexSelect<uint32_t>(NDArray array, int64_t index);
template uint64_t IndexSelect<uint64_t>(NDArray array, uint64_t index); template uint64_t IndexSelect<uint64_t>(NDArray array, int64_t index);
template float IndexSelect<float>(NDArray array, uint64_t index); template float IndexSelect<float>(NDArray array, int64_t index);
template double IndexSelect<double>(NDArray array, uint64_t index); template double IndexSelect<double>(NDArray array, int64_t index);
/*!
 * \brief Return a copy of the elements of a 1D array in the half-open range
 *        [start, end).
 *
 * The result is a freshly allocated array of length (end - start) with the
 * same dtype and context as the input.
 *
 * Both bounds may be equal to the number of elements, so the empty slice
 * [len, len) (including any slice of an empty array) is accepted and yields
 * an empty array instead of failing the bound check.
 *
 * \param array The 1D input array.
 * \param start First index to copy (inclusive), 0 <= start <= end.
 * \param end One past the last index to copy (exclusive), end <= array length.
 * \return The selected elements.
 */
NDArray IndexSelect(NDArray array, int64_t start, int64_t end) {
  CHECK_EQ(array->ndim, 1) << "Only support select values from 1D array.";
  const int64_t num_elems = array.NumElements();
  // `start` is an inclusive bound of a half-open range, so start == num_elems
  // is legal as long as the slice is empty (guaranteed by start <= end below).
  CHECK(start >= 0 && start <= num_elems)
    << "Index " << start << " is out of bound.";
  CHECK(end >= 0 && end <= num_elems)
    << "Index " << end << " is out of bound.";
  CHECK_LE(start, end);
  const int64_t len = end - start;
  NDArray ret = NDArray::Empty({len}, array->dtype, array->ctx);
  if (len == 0) {
    // Nothing to transfer; skip the zero-byte device copy.
    return ret;
  }
  auto device = runtime::DeviceAPI::Get(array->ctx);
  // The dtype switch is only needed to turn element counts into byte offsets.
  ATEN_DTYPE_SWITCH(array->dtype, DType, "values", {
    device->CopyDataFromTo(array->data, start * sizeof(DType),
                           ret->data, 0, len * sizeof(DType),
                           array->ctx, ret->ctx, array->dtype, nullptr);
  });
  return ret;
}
NDArray Scatter(NDArray array, IdArray indices) { NDArray Scatter(NDArray array, IdArray indices) {
NDArray ret; NDArray ret;
...@@ -181,6 +203,31 @@ std::pair<NDArray, IdArray> ConcatSlices(NDArray array, IdArray lengths) { ...@@ -181,6 +203,31 @@ std::pair<NDArray, IdArray> ConcatSlices(NDArray array, IdArray lengths) {
return ret; return ret;
} }
/*!
 * \brief Device/dtype dispatcher for the cumulative sum of an id array.
 *
 * Selects the implementation from the device type of array->ctx and the
 * id dtype of the array, then forwards both arguments unchanged to
 * impl::CumSum<XPU, IdType>.
 *
 * \param array The input id array.
 * \param prepend_zero Forwarded to the implementation.
 * \return Whatever the selected implementation returns.
 */
IdArray CumSum(IdArray array, bool prepend_zero) {
IdArray ret;
// NOTE(review): presumably the *_CUDA switch enables both CPU and GPU
// implementations -- confirm against the macro definition.
ATEN_XPU_SWITCH_CUDA(array->ctx.device_type, XPU, "CumSum", {
ATEN_ID_TYPE_SWITCH(array->dtype, IdType, {
ret = impl::CumSum<XPU, IdType>(array, prepend_zero);
});
});
return ret;
}
/*!
 * \brief Build a short, human-readable description of an array.
 *
 * The array is first copied to the CPU context so its elements can be read.
 * At most the first 10 elements are printed, each followed by ", "; a "..."
 * marker is appended when the array holds more than 10 elements. The dtype
 * and context of the original array are appended at the end.
 *
 * \param array The array to describe.
 * \return The debug string.
 */
std::string ToDebugString(NDArray array) {
  // Element access below goes through a CPU-side copy of the array.
  NDArray host = array.CopyTo(DLContext{kDLCPU, 0});
  const int64_t total = host.NumElements();
  const int64_t shown = std::min<int64_t>(total, 10L);

  std::ostringstream out;
  out << "array([";
  ATEN_DTYPE_SWITCH(host->dtype, DType, "array", {
    const DType* values = host.Ptr<DType>();
    for (int64_t k = 0; k < shown; ++k)
      out << values[k] << ", ";
  });
  // Signal that the listing was truncated.
  if (total > 10) {
    out << "...";
  }
  out << "], dtype=" << array->dtype << ", ctx=" << array->ctx << ")";
  return out.str();
}
///////////////////////// CSR routines ////////////////////////// ///////////////////////// CSR routines //////////////////////////
bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) { bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) {
...@@ -250,6 +297,16 @@ NDArray CSRGetRowData(CSRMatrix csr, int64_t row) { ...@@ -250,6 +297,16 @@ NDArray CSRGetRowData(CSRMatrix csr, int64_t row) {
return ret; return ret;
} }
/*!
 * \brief Dispatcher that checks whether the column indices of each row of a
 *        CSR matrix are sorted.
 *
 * \param csr The matrix to inspect.
 * \return true when the indices array has at most one entry; otherwise the
 *         result of impl::CSRIsSorted<XPU, IdType>.
 */
bool CSRIsSorted(CSRMatrix csr) {
// Zero or one stored index is trivially sorted; answer without dispatching.
if (csr.indices->shape[0] <= 1)
return true;
bool ret = false;
ATEN_CSR_SWITCH_CUDA(csr, XPU, IdType, "CSRIsSorted", {
ret = impl::CSRIsSorted<XPU, IdType>(csr);
});
return ret;
}
NDArray CSRGetData(CSRMatrix csr, int64_t row, int64_t col) { NDArray CSRGetData(CSRMatrix csr, int64_t row, int64_t col) {
CHECK(row >= 0 && row < csr.num_rows) << "Invalid row index: " << row; CHECK(row >= 0 && row < csr.num_rows) << "Invalid row index: " << row;
CHECK(col >= 0 && col < csr.num_cols) << "Invalid col index: " << col; CHECK(col >= 0 && col < csr.num_cols) << "Invalid col index: " << col;
...@@ -318,7 +375,7 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, int64_t start, int64_t end) { ...@@ -318,7 +375,7 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, int64_t start, int64_t end) {
CHECK(end >= 0 && end <= csr.num_rows) << "Invalid end index: " << end; CHECK(end >= 0 && end <= csr.num_rows) << "Invalid end index: " << end;
CHECK_GE(end, start); CHECK_GE(end, start);
CSRMatrix ret; CSRMatrix ret;
ATEN_CSR_SWITCH(csr, XPU, IdType, "CSRSliceRows", { ATEN_CSR_SWITCH_CUDA(csr, XPU, IdType, "CSRSliceRows", {
ret = impl::CSRSliceRows<XPU, IdType>(csr, start, end); ret = impl::CSRSliceRows<XPU, IdType>(csr, start, end);
}); });
return ret; return ret;
...@@ -328,7 +385,7 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) { ...@@ -328,7 +385,7 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) {
CHECK_SAME_DTYPE(csr.indices, rows); CHECK_SAME_DTYPE(csr.indices, rows);
CHECK_SAME_CONTEXT(csr.indices, rows); CHECK_SAME_CONTEXT(csr.indices, rows);
CSRMatrix ret; CSRMatrix ret;
ATEN_CSR_SWITCH(csr, XPU, IdType, "CSRSliceRows", { ATEN_CSR_SWITCH_CUDA(csr, XPU, IdType, "CSRSliceRows", {
ret = impl::CSRSliceRows<XPU, IdType>(csr, rows); ret = impl::CSRSliceRows<XPU, IdType>(csr, rows);
}); });
return ret; return ret;
...@@ -347,7 +404,9 @@ CSRMatrix CSRSliceMatrix(CSRMatrix csr, NDArray rows, NDArray cols) { ...@@ -347,7 +404,9 @@ CSRMatrix CSRSliceMatrix(CSRMatrix csr, NDArray rows, NDArray cols) {
} }
void CSRSort_(CSRMatrix* csr) { void CSRSort_(CSRMatrix* csr) {
ATEN_CSR_SWITCH(*csr, XPU, IdType, "CSRSort_", { if (csr->sorted)
return;
ATEN_CSR_SWITCH_CUDA(*csr, XPU, IdType, "CSRSort_", {
impl::CSRSort_<XPU, IdType>(csr); impl::CSRSort_<XPU, IdType>(csr);
}); });
} }
...@@ -509,13 +568,23 @@ COOMatrix COOSliceMatrix(COOMatrix coo, NDArray rows, NDArray cols) { ...@@ -509,13 +568,23 @@ COOMatrix COOSliceMatrix(COOMatrix coo, NDArray rows, NDArray cols) {
return ret; return ret;
} }
COOMatrix COOSort(COOMatrix mat, bool sort_column) { void COOSort_(COOMatrix* mat, bool sort_column) {
COOMatrix ret; if ((mat->row_sorted && !sort_column) || mat->col_sorted)
ATEN_XPU_SWITCH_CUDA(mat.row->ctx.device_type, XPU, "COOSort", { return;
ATEN_ID_TYPE_SWITCH(mat.row->dtype, IdType, { ATEN_XPU_SWITCH_CUDA(mat->row->ctx.device_type, XPU, "COOSort_", {
ret = impl::COOSort<XPU, IdType>(mat, sort_column); ATEN_ID_TYPE_SWITCH(mat->row->dtype, IdType, {
impl::COOSort_<XPU, IdType>(mat, sort_column);
}); });
}); });
}
std::pair<bool, bool> COOIsSorted(COOMatrix coo) {
if (coo.row->shape[0] <= 1)
return {true, true};
std::pair<bool, bool> ret;
ATEN_COO_SWITCH_CUDA(coo, XPU, IdType, "COOIsSorted", {
ret = impl::COOIsSorted<XPU, IdType>(coo);
});
return ret; return ret;
} }
...@@ -709,3 +778,7 @@ DGL_REGISTER_GLOBAL("ndarray._CAPI_DGLExistSharedMemArray") ...@@ -709,3 +778,7 @@ DGL_REGISTER_GLOBAL("ndarray._CAPI_DGLExistSharedMemArray")
} // namespace aten } // namespace aten
} // namespace dgl } // namespace dgl
/*!
 * \brief Stream an NDArray using its debug-string representation
 *        (see dgl::aten::ToDebugString).
 */
std::ostream& operator << (std::ostream& os, dgl::runtime::NDArray array) {
  os << dgl::aten::ToDebugString(array);
  return os;
}
...@@ -3,8 +3,8 @@ ...@@ -3,8 +3,8 @@
* \file array/array_aritch.cc * \file array/array_aritch.cc
* \brief DGL array arithmetic operations * \brief DGL array arithmetic operations
*/ */
#include <dgl/array.h>
#include <dgl/packed_func_ext.h> #include <dgl/packed_func_ext.h>
#include <dgl/runtime/ndarray.h>
#include <dgl/runtime/container.h> #include <dgl/runtime/container.h>
#include "../c_api_common.h" #include "../c_api_common.h"
#include "./array_op.h" #include "./array_op.h"
......
...@@ -44,7 +44,7 @@ template <DLDeviceType XPU, typename DType, typename IdType> ...@@ -44,7 +44,7 @@ template <DLDeviceType XPU, typename DType, typename IdType>
NDArray IndexSelect(NDArray array, IdArray index); NDArray IndexSelect(NDArray array, IdArray index);
template <DLDeviceType XPU, typename DType> template <DLDeviceType XPU, typename DType>
DType IndexSelect(NDArray array, uint64_t index); DType IndexSelect(NDArray array, int64_t index);
template <DLDeviceType XPU, typename DType, typename IdType> template <DLDeviceType XPU, typename DType, typename IdType>
NDArray Scatter(NDArray array, IdArray indices); NDArray Scatter(NDArray array, IdArray indices);
...@@ -61,6 +61,9 @@ std::tuple<NDArray, IdArray, IdArray> Pack(NDArray array, DType pad_value); ...@@ -61,6 +61,9 @@ std::tuple<NDArray, IdArray, IdArray> Pack(NDArray array, DType pad_value);
template <DLDeviceType XPU, typename DType, typename IdType> template <DLDeviceType XPU, typename DType, typename IdType>
std::pair<NDArray, IdArray> ConcatSlices(NDArray array, IdArray lengths); std::pair<NDArray, IdArray> ConcatSlices(NDArray array, IdArray lengths);
template <DLDeviceType XPU, typename IdType>
IdArray CumSum(IdArray array, bool prepend_zero);
// sparse arrays // sparse arrays
template <DLDeviceType XPU, typename IdType> template <DLDeviceType XPU, typename IdType>
...@@ -84,6 +87,9 @@ runtime::NDArray CSRGetRowColumnIndices(CSRMatrix csr, int64_t row); ...@@ -84,6 +87,9 @@ runtime::NDArray CSRGetRowColumnIndices(CSRMatrix csr, int64_t row);
template <DLDeviceType XPU, typename IdType> template <DLDeviceType XPU, typename IdType>
runtime::NDArray CSRGetRowData(CSRMatrix csr, int64_t row); runtime::NDArray CSRGetRowData(CSRMatrix csr, int64_t row);
template <DLDeviceType XPU, typename IdType>
bool CSRIsSorted(CSRMatrix csr);
template <DLDeviceType XPU, typename IdType> template <DLDeviceType XPU, typename IdType>
runtime::NDArray CSRGetData(CSRMatrix csr, int64_t row, int64_t col); runtime::NDArray CSRGetData(CSRMatrix csr, int64_t row, int64_t col);
...@@ -187,7 +193,10 @@ template <DLDeviceType XPU, typename IdType> ...@@ -187,7 +193,10 @@ template <DLDeviceType XPU, typename IdType>
std::pair<COOMatrix, IdArray> COOCoalesce(COOMatrix coo); std::pair<COOMatrix, IdArray> COOCoalesce(COOMatrix coo);
template <DLDeviceType XPU, typename IdType> template <DLDeviceType XPU, typename IdType>
COOMatrix COOSort(COOMatrix mat, bool sort_column); void COOSort_(COOMatrix* mat, bool sort_column);
template <DLDeviceType XPU, typename IdType>
std::pair<bool, bool> COOIsSorted(COOMatrix coo);
template <DLDeviceType XPU, typename IdType> template <DLDeviceType XPU, typename IdType>
COOMatrix COORemove(COOMatrix coo, IdArray entries); COOMatrix COORemove(COOMatrix coo, IdArray entries);
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cpu/array_cumsum.cc
* \brief Array cumsum CPU implementation
*/
#include <dgl/array.h>
namespace dgl {
using runtime::NDArray;
namespace aten {
namespace impl {
/*!
 * \brief Compute the inclusive cumulative sum of an id array on CPU.
 *
 * \param array Input id array; its elements are read as IdType.
 * \param prepend_zero If true, the output gets one extra leading element set
 *        to zero, so that output[i + 1] equals array[0] + ... + array[i].
 * \return An array created with aten::NewIdArray on the same context and with
 *         the same bit width as the input. Its length is len when
 *         prepend_zero is false and len + 1 when it is true. An empty input
 *         with prepend_zero == false is returned unchanged.
 */
template <DLDeviceType XPU, typename IdType>
IdArray CumSum(IdArray array, bool prepend_zero) {
  const int64_t len = array.NumElements();
  // Only the non-prepending case may hand back the (empty) input as is.
  // With prepend_zero the result must still hold the leading zero, so an
  // empty input yields a one-element array [0] rather than an empty one.
  if (len == 0 && !prepend_zero)
    return array;

  // Number of output slots that precede the first accumulated value.
  const int64_t offset = prepend_zero ? 1 : 0;
  IdArray ret = aten::NewIdArray(len + offset, array->ctx, array->dtype.bits);
  const IdType* in_d = array.Ptr<IdType>();
  IdType* out_d = ret.Ptr<IdType>();

  if (prepend_zero)
    out_d[0] = 0;

  // Single pass keeping the running total; in_d is never dereferenced when
  // len == 0.
  IdType running = 0;
  for (int64_t i = 0; i < len; ++i) {
    running += in_d[i];
    out_d[i + offset] = running;
  }
  return ret;
}
// Explicit instantiations of the CPU CumSum kernel for the 32-bit and 64-bit
// id types.
template IdArray CumSum<kDLCPU, int32_t>(IdArray, bool);
template IdArray CumSum<kDLCPU, int64_t>(IdArray, bool);
} // namespace impl
} // namespace aten
} // namespace dgl
...@@ -35,20 +35,16 @@ template NDArray IndexSelect<kDLCPU, double, int32_t>(NDArray, IdArray); ...@@ -35,20 +35,16 @@ template NDArray IndexSelect<kDLCPU, double, int32_t>(NDArray, IdArray);
template NDArray IndexSelect<kDLCPU, double, int64_t>(NDArray, IdArray); template NDArray IndexSelect<kDLCPU, double, int64_t>(NDArray, IdArray);
template <DLDeviceType XPU, typename DType> template <DLDeviceType XPU, typename DType>
DType IndexSelect(NDArray array, uint64_t index) { DType IndexSelect(NDArray array, int64_t index) {
const DType* data = static_cast<DType*>(array->data); const DType* data = static_cast<DType*>(array->data);
return data[index]; return data[index];
} }
template int32_t IndexSelect<kDLCPU, int32_t>(NDArray array, uint64_t index); template int32_t IndexSelect<kDLCPU, int32_t>(NDArray array, int64_t index);
template int64_t IndexSelect<kDLCPU, int64_t>(NDArray array, uint64_t index); template int64_t IndexSelect<kDLCPU, int64_t>(NDArray array, int64_t index);
template uint32_t IndexSelect<kDLCPU, uint32_t>(NDArray array, uint64_t index); template float IndexSelect<kDLCPU, float>(NDArray array, int64_t index);
template uint64_t IndexSelect<kDLCPU, uint64_t>(NDArray array, uint64_t index); template double IndexSelect<kDLCPU, double>(NDArray array, int64_t index);
template float IndexSelect<kDLCPU, float>(NDArray array, uint64_t index);
template double IndexSelect<kDLCPU, double>(NDArray array, uint64_t index);
}; // namespace impl } // namespace impl
} // namespace aten
}; // namespace aten } // namespace dgl
}; // namespace dgl
...@@ -76,8 +76,6 @@ template std::tuple<NDArray, IdArray, IdArray> Pack<kDLCPU, int64_t>(NDArray, in ...@@ -76,8 +76,6 @@ template std::tuple<NDArray, IdArray, IdArray> Pack<kDLCPU, int64_t>(NDArray, in
template std::tuple<NDArray, IdArray, IdArray> Pack<kDLCPU, float>(NDArray, float); template std::tuple<NDArray, IdArray, IdArray> Pack<kDLCPU, float>(NDArray, float);
template std::tuple<NDArray, IdArray, IdArray> Pack<kDLCPU, double>(NDArray, double); template std::tuple<NDArray, IdArray, IdArray> Pack<kDLCPU, double>(NDArray, double);
}; // namespace impl } // namespace impl
} // namespace aten
}; // namespace aten } // namespace dgl
}; // namespace dgl
...@@ -6,12 +6,12 @@ ...@@ -6,12 +6,12 @@
#ifndef DGL_ARRAY_CPU_ARRAY_UTILS_H_ #ifndef DGL_ARRAY_CPU_ARRAY_UTILS_H_
#define DGL_ARRAY_CPU_ARRAY_UTILS_H_ #define DGL_ARRAY_CPU_ARRAY_UTILS_H_
#include <dgl/array.h> #include <dgl/aten/types.h>
#include <parallel_hashmap/phmap.h>
#include <vector> #include <vector>
#include <unordered_map> #include <unordered_map>
#include <utility> #include <utility>
#include "../../c_api_common.h" #include "../../c_api_common.h"
#include "../third_party/phmap/parallel_hashmap/phmap.h"
namespace dgl { namespace dgl {
namespace aten { namespace aten {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment