Unverified Commit e0d2250e authored by Andrzej Kotłowski, committed by GitHub

[Performance] Improve COOToCSR implementation (#5508)


Co-authored-by: Hongzhi (Steve), Chen <chenhongzhi.nkcs@gmail.com>
parent 484fabdc
......@@ -9,7 +9,9 @@ from .. import utils
@utils.benchmark("time", timeout=600)
@utils.parametrize_cpu("graph_name", ["cora", "livejournal", "friendster"])
@utils.parametrize_cpu(
"graph_name", ["cora", "pubmed", "ogbn-arxiv", "livejournal", "friendster"]
)
@utils.parametrize_gpu("graph_name", ["cora", "livejournal"])
@utils.parametrize(
"format",
......@@ -27,6 +29,10 @@ def track_time(graph_name, format):
device = utils.get_bench_device()
graph = utils.get_graph(graph_name, from_format)
graph = graph.to(device)
if format == ("coo", "csr") and graph_name == "friendster":
# Mark the graph as sorted to check performance for a COO matrix marked
# as sorted. Note that the friendster dataset is already sorted.
graph = dgl.graph(graph.edges(), row_sorted=True)
graph = graph.formats([from_format])
# dry run
graph.formats([to_format])
......
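For context on the fast path this benchmark targets: when coo.row is already sorted, the column and data arrays can be reused as-is and only indptr has to be computed, which is why the sorted variant is the cheapest. A minimal sequential sketch of that idea (illustrative only, not DGL's actual SortedCOOToCSR, which also parallelizes the scan):

#include <cstdint>
#include <vector>

// Build a CSR indptr from COO row indices. When the rows are sorted, the
// COO column/data arrays are already in CSR order, so this is the whole
// conversion.
std::vector<int64_t> RowsToIndptr(
    const std::vector<int64_t> &row, int64_t num_rows) {
  std::vector<int64_t> indptr(num_rows + 1, 0);
  for (const int64_t r : row) ++indptr[r + 1];  // count entries per row
  for (int64_t i = 0; i < num_rows; ++i)        // inclusive prefix sum
    indptr[i + 1] += indptr[i];
  return indptr;
}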
......@@ -412,18 +412,23 @@ CSRMatrix SortedCOOToCSR(const COOMatrix &coo) {
template <class IdType>
CSRMatrix UnSortedSparseCOOToCSR(const COOMatrix &coo) {
const int64_t N = coo.num_rows;
// Unsigned version of the original integer index data type.
// It avoids overflow in (N + num_threads) and (n_start + n_chunk) below.
typedef typename std::make_unsigned<IdType>::type UIdType;
const UIdType N = coo.num_rows;
const int64_t NNZ = coo.row->shape[0];
const IdType *const row_data = static_cast<IdType *>(coo.row->data);
const IdType *const col_data = static_cast<IdType *>(coo.col->data);
const IdType *const data =
COOHasData(coo) ? static_cast<IdType *>(coo.data->data) : nullptr;
NDArray ret_indptr = NDArray::Empty({N + 1}, coo.row->dtype, coo.row->ctx);
NDArray ret_indptr = NDArray::Empty(
{static_cast<int64_t>(N) + 1}, coo.row->dtype, coo.row->ctx);
NDArray ret_indices = NDArray::Empty({NNZ}, coo.row->dtype, coo.row->ctx);
NDArray ret_data = NDArray::Empty({NNZ}, coo.row->dtype, coo.row->ctx);
IdType *const Bp = static_cast<IdType *>(ret_indptr->data);
Bp[0] = 0;
Bp[N] = 0;
IdType *const Bi = static_cast<IdType *>(ret_indices->data);
IdType *const Bx = static_cast<IdType *>(ret_data->data);
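The UIdType change above exists because signed integer overflow is undefined behavior in C++: with a 32-bit IdType and N close to INT32_MAX, expressions such as (N + num_threads - 1) can overflow, whereas unsigned arithmetic has well-defined wrap-around. A small self-contained sketch of the pattern, with a hypothetical ChunkSize helper:

#include <cstdint>
#include <type_traits>

template <class IdType>
IdType ChunkSize(IdType n, int num_threads) {
  // For IdType = int32_t and n near INT32_MAX, n + num_threads - 1 would be
  // signed overflow (UB). The unsigned counterpart is well defined.
  using UIdType = typename std::make_unsigned<IdType>::type;
  const UIdType un = static_cast<UIdType>(n);
  const UIdType p = static_cast<UIdType>(num_threads);
  return static_cast<IdType>((un + p - 1) / p);
}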
......@@ -433,10 +438,18 @@ CSRMatrix UnSortedSparseCOOToCSR(const COOMatrix &coo) {
IdType *const Sx = static_cast<IdType *>(sorted_data->data);
IdType *const Si = static_cast<IdType *>(sorted_data_pos->data);
// Lower the number of threads if the cost of parallelization is greater
// than the gain from running the computation in parallel.
const int64_t min_chunk_size = 1000;
const int64_t num_threads_for_batch = 2 + (NNZ + N) / min_chunk_size;
const int num_threads_required = std::min(
static_cast<int64_t>(omp_get_max_threads()), num_threads_for_batch);
// record row_idx in each thread.
std::vector<std::vector<int64_t>> p_sum;
std::vector<std::vector<int64_t>> p_sum(
num_threads_required, std::vector<int64_t>(num_threads_required));
#pragma omp parallel
#pragma omp parallel num_threads(num_threads_required)
{
const int num_threads = omp_get_num_threads();
const int thread_id = omp_get_thread_num();
......@@ -446,25 +459,18 @@ CSRMatrix UnSortedSparseCOOToCSR(const COOMatrix &coo) {
const int64_t nz_start = thread_id * nz_chunk;
const int64_t nz_end = std::min(NNZ, nz_start + nz_chunk);
const int64_t n_chunk = (N + num_threads - 1) / num_threads;
const int64_t n_start = thread_id * n_chunk;
const int64_t n_end = std::min(N, n_start + n_chunk);
const UIdType n_chunk = (N + num_threads - 1) / num_threads;
const UIdType n_start = thread_id * n_chunk;
const UIdType n_end = std::min(N, n_start + n_chunk);
// Init Bp to zero; a shift of one is always applied when accessing Bp, as
// its length is N+1.
for (auto i = n_start; i < n_end; ++i) {
Bp[i + 1] = 0;
Bp[i] = 0;
}
#pragma omp master
{ p_sum.resize(num_threads); }
#pragma omp barrier
// iterate on NNZ data and count row_idx.
p_sum[thread_id].resize(num_threads, 0);
for (auto i = nz_start; i < nz_end; ++i) {
const int64_t row_idx = row_data[i];
const int64_t row_thread_id = row_idx / n_chunk;
const IdType row_idx = row_data[i];
const IdType row_thread_id = row_idx / n_chunk;
++p_sum[thread_id][row_thread_id];
}
......@@ -473,8 +479,8 @@ CSRMatrix UnSortedSparseCOOToCSR(const COOMatrix &coo) {
// accumulate row_idx.
{
int64_t cum = 0;
for (size_t j = 0; j < p_sum.size(); ++j) {
for (size_t i = 0; i < p_sum.size(); ++i) {
for (int j = 0; j < num_threads; ++j) {
for (int i = 0; i < num_threads; ++i) {
auto tmp = p_sum[i][j];
p_sum[i][j] = cum;
cum += tmp;
......@@ -483,12 +489,16 @@ CSRMatrix UnSortedSparseCOOToCSR(const COOMatrix &coo) {
CHECK_EQ(cum, NNZ);
}
#pragma omp barrier
const int64_t i_start = p_sum[0][thread_id];
const int64_t i_end =
thread_id + 1 == num_threads ? NNZ : p_sum[0][thread_id + 1];
#pragma omp barrier
// sort data by row_idx and place into Sx/Si.
std::vector<int64_t> data_pos(p_sum[thread_id]);
auto &data_pos = p_sum[thread_id];
for (auto i = nz_start; i < nz_end; ++i) {
const int64_t row_idx = row_data[i];
const int64_t row_thread_id = row_idx / n_chunk;
const IdType row_idx = row_data[i];
const IdType row_thread_id = row_idx / n_chunk;
const int64_t pos = data_pos[row_thread_id]++;
Sx[pos] = data == nullptr ? i : data[i];
Si[pos] = i;
......@@ -498,32 +508,26 @@ CSRMatrix UnSortedSparseCOOToCSR(const COOMatrix &coo) {
// Now we're able to do coo2csr on sorted data in each thread in parallel.
// compute data number on each row_idx.
const int64_t i_start = p_sum[0][thread_id];
const int64_t i_end =
thread_id + 1 == num_threads ? NNZ : p_sum[0][thread_id + 1];
for (auto i = i_start; i < i_end; ++i) {
const int64_t row_idx = row_data[Si[i]];
const UIdType row_idx = row_data[Si[i]];
++Bp[row_idx + 1];
}
// accumulate on each row
IdType cumsum = 0;
for (auto i = n_start; i < n_end; ++i) {
const auto tmp = Bp[i + 1];
Bp[i + 1] = cumsum;
IdType cumsum = i_start;
for (auto i = n_start + 1; i <= n_end; ++i) {
const auto tmp = Bp[i];
Bp[i] = cumsum;
cumsum += tmp;
}
// update Bi/Bp/Bx
for (auto i = i_start; i < i_end; ++i) {
const int64_t row_idx = row_data[Si[i]];
const int64_t dest = (Bp[row_idx + 1]++) + i_start;
const UIdType row_idx = row_data[Si[i]];
const int64_t dest = (Bp[row_idx + 1]++);
Bi[dest] = col_data[Si[i]];
Bx[dest] = Sx[i];
}
for (auto i = n_start; i < n_end; ++i) {
Bp[i + 1] += i_start;
}
}
return CSRMatrix(
coo.num_rows, coo.num_cols, ret_indptr, ret_indices, ret_data,
......@@ -532,14 +536,19 @@ CSRMatrix UnSortedSparseCOOToCSR(const COOMatrix &coo) {
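The core of the sparse variant is the P x P counter matrix p_sum: thread t first counts how many of its NNZ entries fall into each thread-owned row bucket, and a bucket-major exclusive scan then turns those counts into disjoint scatter offsets, so every thread can write its entries into a globally bucket-sorted buffer without locks. A hedged standalone sketch of just that scan (names are illustrative, not DGL's):

#include <cstdint>
#include <vector>

// counts[t][b] = number of entries thread t holds for row bucket b. After
// the scan, counts[t][b] is the first output slot for those entries: all of
// bucket 0 precedes bucket 1, and within a bucket the threads stay in order.
void CountsToOffsets(std::vector<std::vector<int64_t>> *counts) {
  const size_t P = counts->size();
  int64_t cum = 0;
  for (size_t b = 0; b < P; ++b) {  // bucket-major, matching the code above
    for (size_t t = 0; t < P; ++t) {
      const int64_t tmp = (*counts)[t][b];
      (*counts)[t][b] = cum;
      cum += tmp;
    }
  }
}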
template <class IdType>
CSRMatrix UnSortedDenseCOOToCSR(const COOMatrix &coo) {
const int64_t N = coo.num_rows;
// Unsigned version of the original integer index data type.
// It avoids overflow in (N + num_threads) and (n_start + n_chunk) below.
typedef typename std::make_unsigned<IdType>::type UIdType;
const UIdType N = coo.num_rows;
const int64_t NNZ = coo.row->shape[0];
const IdType *const row_data = static_cast<IdType *>(coo.row->data);
const IdType *const col_data = static_cast<IdType *>(coo.col->data);
const IdType *const data =
COOHasData(coo) ? static_cast<IdType *>(coo.data->data) : nullptr;
NDArray ret_indptr = NDArray::Empty({N + 1}, coo.row->dtype, coo.row->ctx);
NDArray ret_indptr = NDArray::Empty(
{static_cast<int64_t>(N) + 1}, coo.row->dtype, coo.row->ctx);
NDArray ret_indices = NDArray::Empty({NNZ}, coo.row->dtype, coo.row->ctx);
NDArray ret_data = NDArray::Empty({NNZ}, coo.row->dtype, coo.row->ctx);
IdType *const Bp = static_cast<IdType *>(ret_indptr->data);
......@@ -561,9 +570,9 @@ CSRMatrix UnSortedDenseCOOToCSR(const COOMatrix &coo) {
const int64_t nz_start = thread_id * nz_chunk;
const int64_t nz_end = std::min(NNZ, nz_start + nz_chunk);
const int64_t n_chunk = (N + num_threads - 1) / num_threads;
const int64_t n_start = thread_id * n_chunk;
const int64_t n_end = std::min(N, n_start + n_chunk);
const UIdType n_chunk = (N + num_threads - 1) / num_threads;
const UIdType n_start = thread_id * n_chunk;
const UIdType n_end = std::min(N, n_start + n_chunk);
#pragma omp master
{
......@@ -581,11 +590,12 @@ CSRMatrix UnSortedDenseCOOToCSR(const COOMatrix &coo) {
#pragma omp barrier
// compute prefixsum in parallel
int64_t sum = 0;
for (int64_t i = n_start; i < n_end; ++i) {
for (UIdType i = n_start; i < n_end; ++i) {
IdType tmp = 0;
for (int j = 0; j < num_threads; ++j) {
std::swap(tmp, local_ptrs[j][i]);
tmp += local_ptrs[j][i];
auto previous = local_ptrs[j][i];
local_ptrs[j][i] = tmp;
tmp += previous;
}
sum += tmp;
Bp[i + 1] = sum;
......@@ -595,7 +605,7 @@ CSRMatrix UnSortedDenseCOOToCSR(const COOMatrix &coo) {
#pragma omp barrier
#pragma omp master
{
for (int64_t i = 0; i < num_threads; ++i) {
for (int i = 0; i < num_threads; ++i) {
thread_prefixsum[i + 1] += thread_prefixsum[i];
}
CHECK_EQ(thread_prefixsum[num_threads], NNZ);
......@@ -603,7 +613,7 @@ CSRMatrix UnSortedDenseCOOToCSR(const COOMatrix &coo) {
#pragma omp barrier
sum = thread_prefixsum[thread_id];
for (int64_t i = n_start; i < n_end; ++i) {
for (UIdType i = n_start; i < n_end; ++i) {
Bp[i + 1] += sum;
}
......@@ -622,35 +632,131 @@ CSRMatrix UnSortedDenseCOOToCSR(const COOMatrix &coo) {
coo.col_sorted);
}
} // namespace
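The dense variant takes the complementary approach: each thread histograms its slice of the NNZ entries into a private length-N array (local_ptrs), and a per-row scan across threads then yields both the global indptr and each thread's private scatter offset for every row. A sequential sketch of that combining step under assumed names; the real code additionally parallelizes this scan over row chunks and stitches the chunks together via thread_prefixsum:

#include <cstdint>
#include <vector>

std::vector<int64_t> CombineRowHistograms(
    std::vector<std::vector<int64_t>> *local, int64_t num_rows) {
  std::vector<int64_t> indptr(num_rows + 1, 0);
  int64_t sum = 0;
  for (int64_t i = 0; i < num_rows; ++i) {
    int64_t tmp = 0;
    for (auto &l : *local) {  // exclusive scan across threads for row i
      const int64_t prev = l[i];
      l[i] = tmp;             // becomes this thread's offset within row i
      tmp += prev;
    }
    sum += tmp;               // total number of entries in rows 0..i
    indptr[i + 1] = sum;
  }
  return indptr;
}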
// complexity: time O(NNZ), space O(1)
template <typename IdType>
CSRMatrix UnSortedSmallCOOToCSR(COOMatrix coo) {
const int64_t N = coo.num_rows;
const int64_t NNZ = coo.row->shape[0];
const IdType *row_data = static_cast<IdType *>(coo.row->data);
const IdType *col_data = static_cast<IdType *>(coo.col->data);
const IdType *data =
COOHasData(coo) ? static_cast<IdType *>(coo.data->data) : nullptr;
NDArray ret_indptr = NDArray::Empty({N + 1}, coo.row->dtype, coo.row->ctx);
NDArray ret_indices = NDArray::Empty({NNZ}, coo.row->dtype, coo.row->ctx);
NDArray ret_data = NDArray::Empty({NNZ}, coo.row->dtype, coo.row->ctx);
IdType *Bp = static_cast<IdType *>(ret_indptr->data);
IdType *Bi = static_cast<IdType *>(ret_indices->data);
IdType *Bx = static_cast<IdType *>(ret_data->data);
// Count elements in each row
std::fill(Bp, Bp + N, 0);
for (int64_t i = 0; i < NNZ; ++i) {
Bp[row_data[i]]++;
}
// Convert counts to row start indices
for (IdType i = 0, cumsum = 0; i < N; ++i) {
const IdType temp = Bp[i];
Bp[i] = cumsum;
cumsum += temp;
}
for (int64_t i = 0; i < NNZ; ++i) {
const IdType r = row_data[i];
Bi[Bp[r]] = col_data[i];
Bx[Bp[r]] = data ? data[i] : i;
Bp[r]++;
}
// Restore the indptr
for (int64_t i = N; i > 0; --i) {
Bp[i] = Bp[i - 1];
}
Bp[0] = 0;
return CSRMatrix(
coo.num_rows, coo.num_cols, ret_indptr, ret_indices, ret_data,
coo.col_sorted);
}
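A worked trace of the three passes above on a tiny input may help (values chosen purely for illustration):

// COO input: rows = [1, 0, 2, 0], cols = [3, 1, 0, 2], N = 3, NNZ = 4,
// data == nullptr (so Bx records the original edge positions).
// After counting:        Bp = [2, 1, 1, _]   (entries per row)
// After the index pass:  Bp = [0, 2, 3, _]   (start of each row)
// After the scatter:     Bi = [1, 2, 3, 0], Bx = [1, 3, 0, 2],
//                        Bp = [2, 3, 4, _]   (end of each row)
// After the final shift: Bp = [0, 2, 3, 4]   (a valid CSR indptr)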
enum class COOToCSRAlg {
sorted = 0,
unsortedSmall,
unsortedSparse,
unsortedDense
};
/**
Implementation and Complexity details. N: num_nodes, NNZ: num_edges, P:
num_threads.
1. If row is sorted in COO, SortedCOOToCSR<> is applied. Time: O(NNZ/P).
Space: O(1).
2. If row is NOT sorted in COO and graph is sparse (low average degree),
UnSortedSparseCOOToCSR<> is applied. Time: O(NNZ/P + N/P + P^2), space O(NNZ +
P^2).
3. If row is NOT sorted in COO and graph is dense (medium/high average
degree), UnSortedDenseCOOToCSR<> is applied. Time: O(NNZ/P + N/P), space O(NNZ +
N*P).
*/
* Choose the COO to CSR format conversion algorithm for a given COO matrix
* according to a heuristic based on measured performance.
*
* Implementation and complexity details. N: num_nodes, NNZ: num_edges, P:
* num_threads.
* 1. If row is sorted in COO, SortedCOOToCSR<> is applied. Time: O(NNZ/P),
* space: O(1).
* 2. If row is NOT sorted in COO and graph is small (small number of NNZ),
* UnSortedSmallCOOToCSR<> is applied. Time: O(NNZ), space O(N).
* 3. If row is NOT sorted in COO and graph is sparse (low average degree),
* UnSortedSparseCOOToCSR<> is applied. Time: O(NNZ/P + N/P + P^2),
* space O(NNZ + P^2).
* 4. If row is NOT sorted in COO and graph is dense (medium/high average
* degree), UnSortedDenseCOOToCSR<> is applied. Time: O(NNZ/P + N/P),
* space O(NNZ + N*P).
*
* Note:
* If you change this function, also update _TestCOOToCSRAlgs in
* tests/cpp/test_spmat_coo.cc
*/
template <typename IdType>
inline COOToCSRAlg WhichCOOToCSR(const COOMatrix &coo) {
if (coo.row_sorted) {
return COOToCSRAlg::sorted;
} else {
#ifdef _WIN32
// On Windows, omp_get_max_threads() can report more threads than OMP will
// actually spawn later.
int64_t num_threads;
#pragma omp parallel
#pragma omp master
{ num_threads = omp_get_num_threads(); }
#else
const int64_t num_threads = omp_get_max_threads();
#endif
const int64_t N = coo.num_rows;
const int64_t NNZ = coo.row->shape[0];
// Parameters below are heuristically chosen according to measured
// performance.
const int64_t type_scale = sizeof(IdType) >> 1;
const int64_t small = 50 * num_threads * type_scale * type_scale;
if (NNZ < small || num_threads == 1) {
// For a relatively small number of non-zero elements, the cost of
// spreading the algorithm across threads outweighs the gain from
// using many cores.
return COOToCSRAlg::unsortedSmall;
} else if (type_scale * NNZ < num_threads * N) {
// When NNZ is small relative to num_threads * N (low average degree),
// the sparse parallel version of the algorithm is more efficient than
// the dense one.
return COOToCSRAlg::unsortedSparse;
}
return COOToCSRAlg::unsortedDense;
}
}
} // namespace
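To make the thresholds concrete, the selection can be traced for a few assumed inputs (IdType = int64_t, so type_scale = 8 >> 1 = 4, with 8 OMP threads; the numbers are illustrative only):

// small = 50 * num_threads * type_scale * type_scale = 50 * 8 * 4 * 4 = 6400
// NNZ = 5000               -> NNZ < small                -> unsortedSmall
// NNZ = 100000, N = 400000 -> 4 * NNZ = 400000 <  8 * N = 3200000
//                                                        -> unsortedSparse
// NNZ = 100000, N = 10000  -> 4 * NNZ = 400000 >= 8 * N = 80000
//                                                        -> unsortedDense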
template <DGLDeviceType XPU, typename IdType>
CSRMatrix COOToCSR(COOMatrix coo) {
if (!coo.row_sorted) {
const int64_t num_threads = omp_get_max_threads();
const int64_t num_nodes = coo.num_rows;
const int64_t num_edges = coo.row->shape[0];
// Besides graph density, num_threads is also taken into account. Below
// criteria is set-up according to the time/space complexity difference
// between these 2 algorithms.
if (num_threads * num_nodes > 4 * num_edges) {
switch (WhichCOOToCSR<IdType>(coo)) {
case COOToCSRAlg::sorted:
return SortedCOOToCSR<IdType>(coo);
case COOToCSRAlg::unsortedSmall:
default:
return UnSortedSmallCOOToCSR<IdType>(coo);
case COOToCSRAlg::unsortedSparse:
return UnSortedSparseCOOToCSR<IdType>(coo);
}
return UnSortedDenseCOOToCSR<IdType>(coo);
case COOToCSRAlg::unsortedDense:
return UnSortedDenseCOOToCSR<IdType>(coo);
}
return SortedCOOToCSR<IdType>(coo);
}
template CSRMatrix COOToCSR<kDGLCPU, int32_t>(COOMatrix coo);
......
#include <dgl/array.h>
#include <dmlc/omp.h>
#include <gtest/gtest.h>
#include <omp.h>
#include <random>
#include "./common.h"
......@@ -132,6 +135,41 @@ aten::COOMatrix COO3(DGLContext ctx) {
std::vector<IDX>({2, 2, 1, 0, 3, 2}), sizeof(IDX) * 8, ctx));
}
template <typename IDX>
aten::COOMatrix COORandomized(IDX rows_and_cols, int64_t nnz, int seed) {
std::vector<IDX> vec_rows(nnz);
std::vector<IDX> vec_cols(nnz);
std::vector<IDX> vec_data(nnz);
#pragma omp parallel
{
const int64_t num_threads = omp_get_num_threads();
const int64_t thread_id = omp_get_thread_num();
const int64_t chunk = nnz / num_threads;
const int64_t size = (thread_id == num_threads - 1)
? nnz - chunk * (num_threads - 1)
: chunk;
auto rows = vec_rows.data() + thread_id * chunk;
auto cols = vec_cols.data() + thread_id * chunk;
auto data = vec_data.data() + thread_id * chunk;
std::mt19937_64 gen64(seed + thread_id);
std::mt19937 gen32(seed + thread_id);
for (int64_t i = 0; i < size; ++i) {
rows[i] = gen64() % rows_and_cols;
cols[i] = gen64() % rows_and_cols;
data[i] = gen32() % 90 + 1;
}
}
return aten::COOMatrix(
rows_and_cols, rows_and_cols,
aten::VecToIdArray(vec_rows, sizeof(IDX) * 8, CTX),
aten::VecToIdArray(vec_cols, sizeof(IDX) * 8, CTX),
aten::VecToIdArray(vec_data, sizeof(IDX) * 8, CTX), false, false);
}
struct SparseCOOCSR {
static constexpr uint64_t NUM_ROWS = 100;
static constexpr uint64_t NUM_COLS = 150;
......@@ -163,13 +201,6 @@ struct SparseCOOCSR {
}
};
bool isSparseCOO(
const int64_t &num_threads, const int64_t &num_nodes,
const int64_t &num_edges) {
// refer to COOToCSR<>() in ~dgl/src/array/cpu/spmat_op_impl_coo for details.
return num_threads * num_nodes > 4 * num_edges;
}
template <typename IDX>
aten::COOMatrix RowSorted_NullData_COO(DGLContext ctx = CTX) {
// [[0, 1, 1, 0, 0],
......@@ -212,8 +243,6 @@ void _TestCOOToCSR(DGLContext ctx) {
auto csr = CSR1<IDX>(ctx);
auto tcsr = aten::COOToCSR(coo);
ASSERT_FALSE(coo.row_sorted);
ASSERT_FALSE(
isSparseCOO(omp_get_num_threads(), coo.num_rows, coo.row->shape[0]));
ASSERT_EQ(csr.num_rows, tcsr.num_rows);
ASSERT_EQ(csr.num_cols, tcsr.num_cols);
ASSERT_TRUE(ArrayEQ<IDX>(csr.indptr, tcsr.indptr));
......@@ -289,8 +318,6 @@ void _TestCOOToCSR(DGLContext ctx) {
csr = SparseCOOCSR::CSRSparse<IDX>(ctx);
tcsr = aten::COOToCSR(coo);
ASSERT_FALSE(coo.row_sorted);
ASSERT_TRUE(
isSparseCOO(omp_get_num_threads(), coo.num_rows, coo.row->shape[0]));
ASSERT_EQ(csr.num_rows, tcsr.num_rows);
ASSERT_EQ(csr.num_cols, tcsr.num_cols);
ASSERT_TRUE(ArrayEQ<IDX>(csr.indptr, tcsr.indptr));
......@@ -448,10 +475,10 @@ void _TestCOOGetData(DGLContext ctx) {
TEST(SpmatTest, COOGetData) {
_TestCOOGetData<int32_t>(CPU);
_TestCOOGetData<int64_t>(CPU);
//#ifdef DGL_USE_CUDA
// #ifdef DGL_USE_CUDA
//_TestCOOGetData<int32_t>(GPU);
//_TestCOOGetData<int64_t>(GPU);
//#endif
// #endif
}
template <typename IDX>
......@@ -477,3 +504,69 @@ TEST(SpmatTest, COOGetDataAndIndices) {
_TestCOOGetDataAndIndices<int32_t>();
_TestCOOGetDataAndIndices<int64_t>();
}
template <typename IDX>
void _TestCOOToCSRAlgs() {
// Compare results between different CPU COOToCSR implementations.
// NNZ is chosen to be bigger than the limit for the "small" matrix algorithm.
// N is set to lie on the border between the "sparse" and "dense" choice.
const int64_t num_threads = std::min(256, omp_get_max_threads());
const int64_t min_num_threads = 3;
if (num_threads < min_num_threads) {
std::cerr << "[ ] [ INFO ] "
<< "This test requires at least 3 OMP threads to work properly"
<< std::endl;
GTEST_SKIP();
return;
}
// Select N and NNZ for the COO matrix so that, depending on the number of
// threads, a different algorithm will be used.
// See WhichCOOToCSR in src/array/cpu/spmat_op_impl_coo.cc for details
const int64_t type_scale = sizeof(IDX) >> 1;
const int64_t small = 50 * num_threads * type_scale * type_scale;
// NNZ should be bigger than the limit for the small-matrix algorithm
const int64_t nnz = small + 1234;
// N is chosen to lie on the sparse/dense border
const int64_t n = type_scale * nnz / num_threads;
const IDX rows_and_cols = n + 1;  // should be bigger than sparse/dense border
// Note: it would be better to set the seed to a random value once gtest
// allows using --gtest_random_seed without --gtest_shuffle, and to report
// that value for reproduction. This way we could find unforeseen
// situations and potential bugs.
const auto seed = 123321;
auto coo = COORandomized<IDX>(rows_and_cols, nnz, seed);
omp_set_num_threads(1);
// UnSortedSmallCOOToCSR will be used
auto tcsr_small = aten::COOToCSR(coo);
ASSERT_EQ(coo.num_rows, tcsr_small.num_rows);
ASSERT_EQ(coo.num_cols, tcsr_small.num_cols);
omp_set_num_threads(num_threads - 1);
// UnSortedDenseCOOToCSR will be used
auto tcsr_dense = aten::COOToCSR(coo);
ASSERT_EQ(tcsr_small.num_rows, tcsr_dense.num_rows);
ASSERT_EQ(tcsr_small.num_cols, tcsr_dense.num_cols);
ASSERT_TRUE(ArrayEQ<IDX>(tcsr_small.indptr, tcsr_dense.indptr));
ASSERT_TRUE(ArrayEQ<IDX>(tcsr_small.indices, tcsr_dense.indices));
ASSERT_TRUE(ArrayEQ<IDX>(tcsr_small.data, tcsr_dense.data));
omp_set_num_threads(num_threads);
// UnSortedSparseCOOToCSR will be used
auto tcsr_sparse = aten::COOToCSR(coo);
ASSERT_EQ(tcsr_small.num_rows, tcsr_sparse.num_rows);
ASSERT_EQ(tcsr_small.num_cols, tcsr_sparse.num_cols);
ASSERT_TRUE(ArrayEQ<IDX>(tcsr_small.indptr, tcsr_sparse.indptr));
ASSERT_TRUE(ArrayEQ<IDX>(tcsr_small.indices, tcsr_sparse.indices));
ASSERT_TRUE(ArrayEQ<IDX>(tcsr_small.data, tcsr_sparse.data));
return;
}
TEST(SpmatTest, COOToCSRAlgs) {
_TestCOOToCSRAlgs<int32_t>();
_TestCOOToCSRAlgs<int64_t>();
}