Unverified commit f4c79f7f, authored by Rhett Ying, committed by GitHub

[Performance] improve coo2csr space complexity when row is not sorted (#3326)



* [Performance] improve coo2csr space complexity when row is not sorted

* [Perf] replace std::vector<> by NDArray

* keep both impl of unsorted coo to csr and choose according to graph density dynamically

* refine criteria to choose btw Unsorted algos
Co-authored-by: Ubuntu <ubuntu@ip-172-31-34-27.us-west-2.compute.internal>
parent a609b4f0
@@ -302,189 +302,335 @@ template COOMatrix COOTranspose<kDLCPU, int32_t>(COOMatrix coo);
template COOMatrix COOTranspose<kDLCPU, int64_t>(COOMatrix coo);
///////////////////////////// COOToCSR /////////////////////////////
namespace {

template <class IdType> CSRMatrix SortedCOOToCSR(const COOMatrix &coo) {
  const int64_t N = coo.num_rows;
  const int64_t NNZ = coo.row->shape[0];
  const IdType *const row_data = static_cast<IdType *>(coo.row->data);
  const IdType *const col_data = static_cast<IdType *>(coo.col->data);
  const IdType *const data =
      COOHasData(coo) ? static_cast<IdType *>(coo.data->data) : nullptr;

  NDArray ret_indptr = NDArray::Empty({N + 1}, coo.row->dtype, coo.row->ctx);
  NDArray ret_indices = coo.col;
  NDArray ret_data = data == nullptr
                         ? NDArray::Empty({NNZ}, coo.row->dtype, coo.row->ctx)
                         : coo.data;

  // compute indptr
  IdType *const Bp = static_cast<IdType *>(ret_indptr->data);
  Bp[0] = 0;

  IdType *const fill_data =
      data ? nullptr : static_cast<IdType *>(ret_data->data);

  if (NNZ > 0) {
    auto num_threads = omp_get_max_threads();
    parallel_for(0, num_threads, [&](int b, int e) {
      for (auto thread_id = b; thread_id < e; ++thread_id) {
        // We partition the set of non-zeros among the threads.
        const int64_t nz_chunk = (NNZ + num_threads - 1) / num_threads;
        const int64_t nz_start = thread_id * nz_chunk;
        const int64_t nz_end = std::min(NNZ, nz_start + nz_chunk);

        // Each thread searches the row array for a change, and marks its
        // location in Bp. Threads other than the first start at the last
        // index covered by the previous thread, in order to detect changes
        // in the row array between thread partitions. This means that each
        // thread after the first searches the range [nz_start-1, nz_end).
        // That is, if we had 10 non-zeros and 4 threads, the indexes
        // searched by each thread would be:
        // 0: [0, 1, 2]
        // 1: [2, 3, 4, 5]
        // 2: [5, 6, 7, 8]
        // 3: [8, 9]
        //
        // That way, if the row array were [0, 0, 1, 2, 2, 2, 4, 5, 5, 6],
        // each change in row would be captured by one thread:
        //
        // 0: [0, 0, 1] - row 0
        // 1: [1, 2, 2, 2] - row 1
        // 2: [2, 4, 5, 5] - rows 2, 3, and 4
        // 3: [5, 6] - rows 5 and 6
        //
        int64_t row = 0;
        if (nz_start < nz_end) {
          row = nz_start == 0 ? 0 : row_data[nz_start - 1];
          for (int64_t i = nz_start; i < nz_end; ++i) {
            while (row != row_data[i]) {
              ++row;
              Bp[row] = i;
            }
          }

          // We will not detect the row change for the last row, nor any
          // empty rows at the end of the matrix, so the last active thread
          // needs to mark all remaining rows in Bp with NNZ.
          if (nz_end == NNZ) {
            while (row < N) {
              ++row;
              Bp[row] = NNZ;
            }
          }

          if (fill_data) {
            // TODO(minjie): Many of our current implementations assume that
            // CSR must have a data array. This is a temporary workaround.
            // Remove this after:
            //   - The old immutable graph implementation is deprecated.
            //   - The old binary reduce kernel is deprecated.
            std::iota(fill_data + nz_start, fill_data + nz_end, nz_start);
          }
        }
      }
    });
  } else {
    std::fill(Bp, Bp + N + 1, 0);
  }

  return CSRMatrix(coo.num_rows, coo.num_cols, ret_indptr, ret_indices,
                   ret_data, coo.col_sorted);
}
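
// Illustrative sketch (not part of the commit): the parallel row-change
// search in SortedCOOToCSR computes the same indptr as this single-threaded
// reference, shown only to clarify the invariant that Bp[r] holds the first
// non-zero index whose row is >= r when the COO rows are sorted. The names
// and the std::vector interface are assumptions made for the example.
inline std::vector<int64_t> SortedRowsToIndptrSketch(
    const std::vector<int64_t> &row, int64_t num_rows) {
  std::vector<int64_t> indptr(num_rows + 1, 0);
  int64_t r = 0;
  for (int64_t i = 0; i < static_cast<int64_t>(row.size()); ++i) {
    while (r != row[i]) {  // a row change closes every row smaller than row[i]
      ++r;
      indptr[r] = i;
    }
  }
  while (r < num_rows) {  // close the last row and any trailing empty rows
    ++r;
    indptr[r] = static_cast<int64_t>(row.size());
  }
  return indptr;
}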

template <class IdType> CSRMatrix UnSortedSparseCOOToCSR(const COOMatrix &coo) {
  const int64_t N = coo.num_rows;
  const int64_t NNZ = coo.row->shape[0];
  const IdType *const row_data = static_cast<IdType *>(coo.row->data);
  const IdType *const col_data = static_cast<IdType *>(coo.col->data);
  const IdType *const data =
      COOHasData(coo) ? static_cast<IdType *>(coo.data->data) : nullptr;

  NDArray ret_indptr = NDArray::Empty({N + 1}, coo.row->dtype, coo.row->ctx);
  NDArray ret_indices = NDArray::Empty({NNZ}, coo.row->dtype, coo.row->ctx);
  NDArray ret_data = NDArray::Empty({NNZ}, coo.row->dtype, coo.row->ctx);
  IdType *const Bp = static_cast<IdType *>(ret_indptr->data);
  Bp[0] = 0;
  IdType *const Bi = static_cast<IdType *>(ret_indices->data);
  IdType *const Bx = static_cast<IdType *>(ret_data->data);

  // store sorted data and original index.
  NDArray sorted_data = NDArray::Empty({NNZ}, coo.row->dtype, coo.row->ctx);
  NDArray sorted_data_pos = NDArray::Empty({NNZ}, coo.row->dtype, coo.row->ctx);
  IdType *const Sx = static_cast<IdType *>(sorted_data->data);
  IdType *const Si = static_cast<IdType *>(sorted_data_pos->data);

  // record row_idx in each thread.
  std::vector<std::vector<int64_t>> p_sum;

#pragma omp parallel
  {
    const int num_threads = omp_get_num_threads();
    const int thread_id = omp_get_thread_num();
    CHECK_LT(thread_id, num_threads);

    const int64_t nz_chunk = (NNZ + num_threads - 1) / num_threads;
    const int64_t nz_start = thread_id * nz_chunk;
    const int64_t nz_end = std::min(NNZ, nz_start + nz_chunk);

    const int64_t n_chunk = (N + num_threads - 1) / num_threads;
    const int64_t n_start = thread_id * n_chunk;
    const int64_t n_end = std::min(N, n_start + n_chunk);

    // init Bp as zero; one shift is always applied when accessing Bp as
    // its length is N+1.
    for (auto i = n_start; i < n_end; ++i) {
      Bp[i + 1] = 0;
    }

#pragma omp master
    { p_sum.resize(num_threads); }
#pragma omp barrier

    // iterate on NNZ data and count row_idx.
    p_sum[thread_id].resize(num_threads, 0);
    for (auto i = nz_start; i < nz_end; ++i) {
      const int64_t row_idx = row_data[i];
      const int64_t row_thread_id = row_idx / n_chunk;
      ++p_sum[thread_id][row_thread_id];
    }

#pragma omp barrier
#pragma omp master
    // accumulate row_idx.
    {
      int64_t cum = 0;
      for (size_t j = 0; j < p_sum.size(); ++j) {
        for (size_t i = 0; i < p_sum.size(); ++i) {
          auto tmp = p_sum[i][j];
          p_sum[i][j] = cum;
          cum += tmp;
        }
      }
      CHECK_EQ(cum, NNZ);
    }
#pragma omp barrier

    // sort data by row_idx and place into Sx/Si.
    std::vector<int64_t> data_pos(p_sum[thread_id]);
    for (auto i = nz_start; i < nz_end; ++i) {
      const int64_t row_idx = row_data[i];
      const int64_t row_thread_id = row_idx / n_chunk;
      const int64_t pos = data_pos[row_thread_id]++;
      Sx[pos] = data == nullptr ? i : data[i];
      Si[pos] = i;
    }
#pragma omp barrier

    // Now we're able to do coo2csr on sorted data in each thread in parallel.
    // compute data number on each row_idx.
    const int64_t i_start = p_sum[0][thread_id];
    const int64_t i_end =
        thread_id + 1 == num_threads ? NNZ : p_sum[0][thread_id + 1];
    for (auto i = i_start; i < i_end; ++i) {
      const int64_t row_idx = row_data[Si[i]];
      ++Bp[row_idx + 1];
    }
    // accumulate on each row
    IdType cumsum = 0;
    for (auto i = n_start; i < n_end; ++i) {
      const auto tmp = Bp[i + 1];
      Bp[i + 1] = cumsum;
      cumsum += tmp;
    }
    // update Bi/Bp/Bx
    for (auto i = i_start; i < i_end; ++i) {
      const int64_t row_idx = row_data[Si[i]];
      const int64_t dest = (Bp[row_idx + 1]++) + i_start;
      Bi[dest] = col_data[Si[i]];
      Bx[dest] = Sx[i];
    }
    for (auto i = n_start; i < n_end; ++i) {
      Bp[i + 1] += i_start;
    }
  }

  return CSRMatrix(coo.num_rows, coo.num_cols, ret_indptr, ret_indices,
                   ret_data, coo.col_sorted);
}
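
// Illustrative sketch (not part of the commit): the p_sum bookkeeping in
// UnSortedSparseCOOToCSR is an exclusive prefix sum over the
// (counting-thread, owner-thread) count matrix, taken in owner-major order,
// so every counting thread learns where to write the entries it found for
// each owner thread. A sequential version of that scan, with assumed names:
inline void ExclusiveScanOwnerMajorSketch(
    std::vector<std::vector<int64_t>> *counts) {
  // (*counts)[t][b]: before the scan, how many entries counting thread t
  // found for owner thread b; after the scan, the global offset at which
  // thread t starts writing those entries.
  int64_t cum = 0;
  for (size_t b = 0; b < counts->size(); ++b) {
    for (size_t t = 0; t < counts->size(); ++t) {
      const int64_t tmp = (*counts)[t][b];
      (*counts)[t][b] = cum;
      cum += tmp;
    }
  }
}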

template <class IdType> CSRMatrix UnSortedDenseCOOToCSR(const COOMatrix &coo) {
  const int64_t N = coo.num_rows;
  const int64_t NNZ = coo.row->shape[0];
  const IdType *const row_data = static_cast<IdType *>(coo.row->data);
  const IdType *const col_data = static_cast<IdType *>(coo.col->data);
  const IdType *const data =
      COOHasData(coo) ? static_cast<IdType *>(coo.data->data) : nullptr;

  NDArray ret_indptr = NDArray::Empty({N + 1}, coo.row->dtype, coo.row->ctx);
  NDArray ret_indices = NDArray::Empty({NNZ}, coo.row->dtype, coo.row->ctx);
  NDArray ret_data = NDArray::Empty({NNZ}, coo.row->dtype, coo.row->ctx);
  IdType *const Bp = static_cast<IdType *>(ret_indptr->data);
  Bp[0] = 0;
  IdType *const Bi = static_cast<IdType *>(ret_indices->data);
  IdType *const Bx = static_cast<IdType *>(ret_data->data);

  // the offset within each row that each thread will write to
  std::vector<std::vector<IdType>> local_ptrs;
  std::vector<int64_t> thread_prefixsum;

#pragma omp parallel
  {
    const int num_threads = omp_get_num_threads();
    const int thread_id = omp_get_thread_num();
    CHECK_LT(thread_id, num_threads);

    const int64_t nz_chunk = (NNZ + num_threads - 1) / num_threads;
    const int64_t nz_start = thread_id * nz_chunk;
    const int64_t nz_end = std::min(NNZ, nz_start + nz_chunk);

    const int64_t n_chunk = (N + num_threads - 1) / num_threads;
    const int64_t n_start = thread_id * n_chunk;
    const int64_t n_end = std::min(N, n_start + n_chunk);

#pragma omp master
    {
      local_ptrs.resize(num_threads);
      thread_prefixsum.resize(num_threads + 1);
    }
#pragma omp barrier
    local_ptrs[thread_id].resize(N, 0);

    for (int64_t i = nz_start; i < nz_end; ++i) {
      ++local_ptrs[thread_id][row_data[i]];
    }

#pragma omp barrier
    // compute prefixsum in parallel
    int64_t sum = 0;
    for (int64_t i = n_start; i < n_end; ++i) {
      IdType tmp = 0;
      for (int j = 0; j < num_threads; ++j) {
        std::swap(tmp, local_ptrs[j][i]);
        tmp += local_ptrs[j][i];
      }
      sum += tmp;
      Bp[i + 1] = sum;
    }
    thread_prefixsum[thread_id + 1] = sum;

#pragma omp barrier
#pragma omp master
    {
      for (int64_t i = 0; i < num_threads; ++i) {
        thread_prefixsum[i + 1] += thread_prefixsum[i];
      }
      CHECK_EQ(thread_prefixsum[num_threads], NNZ);
    }
#pragma omp barrier

    sum = thread_prefixsum[thread_id];
    for (int64_t i = n_start; i < n_end; ++i) {
      Bp[i + 1] += sum;
    }

#pragma omp barrier
    for (int64_t i = nz_start; i < nz_end; ++i) {
      const IdType r = row_data[i];
      const int64_t index = Bp[r] + local_ptrs[thread_id][r]++;
      Bi[index] = col_data[i];
      Bx[index] = data ? data[i] : i;
    }
  }
  CHECK_EQ(Bp[N], NNZ);

  return CSRMatrix(coo.num_rows, coo.num_cols, ret_indptr, ret_indices,
                   ret_data, coo.col_sorted);
}
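
// Illustrative sketch (not part of the commit): UnSortedDenseCOOToCSR is a
// parallel counting sort. A sequential reference with the same three phases
// (count per row, exclusive prefix sum, scatter), using std::vector instead
// of NDArray and assumed names:
inline void CooToCsrCountingSortSketch(const std::vector<int64_t> &row,
                                       const std::vector<int64_t> &col,
                                       int64_t num_rows,
                                       std::vector<int64_t> *indptr,
                                       std::vector<int64_t> *indices) {
  indptr->assign(num_rows + 1, 0);
  for (const int64_t r : row) ++(*indptr)[r + 1];  // count entries per row
  for (int64_t i = 0; i < num_rows; ++i)           // exclusive prefix sum
    (*indptr)[i + 1] += (*indptr)[i];
  indices->assign(row.size(), 0);
  std::vector<int64_t> offset(num_rows, 0);        // next free slot per row
  for (size_t i = 0; i < row.size(); ++i) {
    const int64_t dest = (*indptr)[row[i]] + offset[row[i]]++;
    (*indices)[dest] = col[i];
  }
}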
} // namespace
/*
  Implementation and complexity details. N: num_nodes, NNZ: num_edges,
  P: num_threads.
  1. If the COO rows are sorted, SortedCOOToCSR<> is applied.
     Time: O(NNZ/P), space: O(1).
  2. If the COO rows are NOT sorted and the graph is sparse (low average
     degree), UnSortedSparseCOOToCSR<> is applied.
     Time: O(NNZ/P + N/P + P^2), space: O(NNZ + P^2).
  3. If the COO rows are NOT sorted and the graph is dense (medium/high
     average degree), UnSortedDenseCOOToCSR<> is applied.
     Time: O(NNZ/P + N/P), space: O(NNZ + N*P).
*/

template <DLDeviceType XPU, typename IdType>
CSRMatrix COOToCSR(COOMatrix coo) {
  if (!coo.row_sorted) {
    const int64_t num_threads = omp_get_num_threads();
    const int64_t num_nodes = coo.num_rows;
    const int64_t num_edges = coo.row->shape[0];
    // Besides graph density, num_threads is also taken into account. The
    // criterion below follows from the time/space complexity difference
    // between the two unsorted algorithms.
    if (num_threads * num_nodes > 4 * num_edges) {
      return UnSortedSparseCOOToCSR<IdType>(coo);
    }
    return UnSortedDenseCOOToCSR<IdType>(coo);
  }
  return SortedCOOToCSR<IdType>(coo);
}
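
// Illustrative check (not part of the commit): with 16 threads, a graph of
// 1M nodes and 2M edges satisfies num_threads * num_nodes > 4 * num_edges,
// so it takes the UnSortedSparseCOOToCSR path, trading the dense variant's
// O(N * P) scratch space (about 16M counters here) for O(NNZ + P^2).
static_assert(16LL * 1000000 > 4LL * 2000000,
              "example sizes that select the sparse unsorted conversion");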
template CSRMatrix COOToCSR<kDLCPU, int32_t>(COOMatrix coo);
......
#include <gtest/gtest.h>
#include <dmlc/omp.h>
#include <dgl/array.h>
#include "./common.h"
@@ -17,7 +18,7 @@ aten::CSRMatrix CSR1(DLContext ctx = CTX) {
return aten::CSRMatrix(
4, 5,
aten::VecToIdArray(std::vector<IDX>({0, 2, 3, 5, 5}), sizeof(IDX)*8, ctx),
aten::VecToIdArray(std::vector<IDX>({1, 2, 0, 2, 3}), sizeof(IDX)*8, ctx),
aten::VecToIdArray(std::vector<IDX>({0, 2, 3, 4, 1}), sizeof(IDX)*8, ctx),
false);
}
@@ -112,6 +113,41 @@ aten::COOMatrix COO3(DLContext ctx) {
aten::VecToIdArray(std::vector<IDX>({2, 2, 1, 0, 3, 2}), sizeof(IDX)*8, ctx));
}
struct SparseCOOCSR {
static constexpr uint64_t NUM_ROWS = 100;
static constexpr uint64_t NUM_COLS = 150;
static constexpr uint64_t NUM_NZ = 5;
template <typename IDX>
static aten::COOMatrix COOSparse(const DLContext &ctx = CTX) {
return aten::COOMatrix(NUM_ROWS, NUM_COLS,
aten::VecToIdArray(std::vector<IDX>({0, 1, 2, 3, 4}),
sizeof(IDX) * 8, ctx),
aten::VecToIdArray(std::vector<IDX>({1, 2, 3, 4, 5}),
sizeof(IDX) * 8, ctx));
}
template <typename IDX>
static aten::CSRMatrix CSRSparse(const DLContext &ctx = CTX) {
auto &&indptr = std::vector<IDX>(NUM_ROWS + 1, NUM_NZ);
for (size_t i = 0; i < NUM_NZ; ++i) {
indptr[i + 1] = static_cast<IDX>(i + 1);
}
indptr[0] = 0;
return aten::CSRMatrix(NUM_ROWS, NUM_COLS,
aten::VecToIdArray(indptr, sizeof(IDX) * 8, ctx),
aten::VecToIdArray(std::vector<IDX>({1, 2, 3, 4, 5}),
sizeof(IDX) * 8, ctx),
aten::VecToIdArray(std::vector<IDX>({1, 1, 1, 1, 1}),
sizeof(IDX) * 8, ctx),
false);
}
};
bool isSparseCOO(const int64_t &num_threads, const int64_t &num_nodes,
const int64_t &num_edges) {
// refer to COOToCSR<>() in ~dgl/src/array/cpu/spmat_op_impl_coo for details.
return num_threads * num_nodes > 4 * num_edges;
}
} // namespace
template <typename IDX>
@@ -119,9 +155,13 @@ void _TestCOOToCSR(DLContext ctx) {
auto coo = COO1<IDX>(ctx);
auto csr = CSR1<IDX>(ctx);
auto tcsr = aten::COOToCSR(coo);
ASSERT_EQ(coo.num_rows, csr.num_rows);
ASSERT_EQ(coo.num_cols, csr.num_cols);
ASSERT_FALSE(coo.row_sorted);
ASSERT_FALSE(
isSparseCOO(omp_get_num_threads(), coo.num_rows, coo.row->shape[0]));
ASSERT_EQ(csr.num_rows, tcsr.num_rows);
ASSERT_EQ(csr.num_cols, tcsr.num_cols);
ASSERT_TRUE(ArrayEQ<IDX>(csr.indptr, tcsr.indptr));
ASSERT_TRUE(ArrayEQ<IDX>(csr.indices, tcsr.indices));
coo = COO2<IDX>(ctx);
csr = CSR2<IDX>(ctx);
@@ -135,6 +175,7 @@ void _TestCOOToCSR(DLContext ctx) {
auto rs_coo = aten::COOSort(coo, false);
auto rs_csr = CSR1<IDX>(ctx);
auto rs_tcsr = aten::COOToCSR(rs_coo);
ASSERT_TRUE(rs_coo.row_sorted);
ASSERT_EQ(coo.num_rows, rs_tcsr.num_rows);
ASSERT_EQ(coo.num_cols, rs_tcsr.num_cols);
ASSERT_TRUE(ArrayEQ<IDX>(rs_csr.indptr, rs_tcsr.indptr));
@@ -173,6 +214,17 @@ void _TestCOOToCSR(DLContext ctx) {
ASSERT_TRUE(ArrayEQ<IDX>(src_tcsr.indptr, src_csr.indptr));
ASSERT_TRUE(ArrayEQ<IDX>(src_tcsr.indices, src_coo.col));
ASSERT_TRUE(ArrayEQ<IDX>(src_tcsr.data, src_coo.data));
coo = SparseCOOCSR::COOSparse<IDX>(ctx);
csr = SparseCOOCSR::CSRSparse<IDX>(ctx);
tcsr = aten::COOToCSR(coo);
ASSERT_FALSE(coo.row_sorted);
ASSERT_TRUE(
isSparseCOO(omp_get_num_threads(), coo.num_rows, coo.row->shape[0]));
ASSERT_EQ(csr.num_rows, tcsr.num_rows);
ASSERT_EQ(csr.num_cols, tcsr.num_cols);
ASSERT_TRUE(ArrayEQ<IDX>(csr.indptr, tcsr.indptr));
ASSERT_TRUE(ArrayEQ<IDX>(csr.indices, tcsr.indices));
}
TEST(SpmatTest, COOToCSR) {
......