Unverified commit f4608c22, authored by Minjie Wang, committed by GitHub

[CUDA][Kernel] A bunch of int64 kernels for COO and CSR (#1883)

* COO sort

* COOToCSR

* CSR2COO

* CSRSort; CSRTranspose

* pass all CSR tests

* lint

* remove int32 conversion

* fix tensorflow nn tests

* turn on CI

* fix

* address comments
parent 5b515cf6
@@ -375,7 +375,9 @@ __global__ void _SegmentMaskKernel(
  * of each needle so that the insertion still gives sorted order.
  *
  * It essentially perform binary search to find lower bound for each needle
- * elements.
+ * elements. Requires that the largest element in the hay be larger than the
+ * given needle elements. Commonly used to search for the row IDs of a given
+ * set of coordinates.
  */
 template <typename IdType>
 __global__ void _SortedSearchKernel(
@@ -435,7 +437,7 @@ std::vector<NDArray> CSRGetDataAndIndices(CSRMatrix csr, NDArray row, NDArray co
   IdArray ret_row = NewIdArray(idx->shape[0], ctx, nbits);
   const int nt2 = cuda::FindNumThreads(idx->shape[0]);
   const int nb2 = (idx->shape[0] + nt - 1) / nt;
-  _SortedSearchKernel<<<nb, nt, 0, thr_entry->stream>>>(
+  _SortedSearchKernel<<<nb2, nt2, 0, thr_entry->stream>>>(
       csr.indptr.Ptr<IdType>(), csr.num_rows,
       idx.Ptr<IdType>(), idx->shape[0],
       ret_row.Ptr<IdType>());
...
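[Editor's note] The hunk above also fixes a latent launch-configuration bug: the kernel was launched with the grid/block sizes (nb, nt) computed for an earlier kernel rather than the sizes (nb2, nt2) derived from the number of needles, so some needles could go unprocessed. For orientation, here is a minimal sketch of such a sorted-search kernel and its launch; the names are hypothetical and this is not the exact DGL implementation:

// Sketch: for each needle, binary-search the sorted hay for the last
// position whose value is <= the needle. With hay = CSR indptr and
// needles = nonzero indices, pos[i] is the row ID of nonzero i.
template <typename IdType>
__global__ void SortedSearchSketch(
    const IdType* hay, int64_t hay_size,
    const IdType* needles, int64_t num_needles,
    IdType* pos) {
  int64_t tx = blockIdx.x * (int64_t)blockDim.x + threadIdx.x;
  const int64_t stride = (int64_t)gridDim.x * blockDim.x;
  for (; tx < num_needles; tx += stride) {   // grid-stride loop
    const IdType x = needles[tx];
    IdType lo = 0, hi = hay_size;            // search window [lo, hi)
    while (lo < hi) {                        // upper_bound(hay, x)
      const IdType mid = lo + (hi - lo) / 2;
      if (hay[mid] <= x) lo = mid + 1; else hi = mid;
    }
    pos[tx] = lo - 1;                        // last index with hay[i] <= x
  }
}

// Launch: size the grid by the number of needles, mirroring the nb2/nt2 fix.
// const int nt2 = 256;
// const int nb2 = (num_needles + nt2 - 1) / nt2;
// SortedSearchSketch<<<nb2, nt2, 0, stream>>>(hay, n, needles, m, pos);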
@@ -15,6 +15,7 @@
 #include <vector>
 #include <set>
 #include <tuple>
+#include <memory>
 #include "./unit_graph.h"
 #include "shared_mem_manager.h"
...
@@ -5,7 +5,7 @@ if F._default_context_str == 'cpu':
     parametrize_dtype = pytest.mark.parametrize("idtype", [F.int32, F.int64])
 else:
     # only test int32 on GPU because many graph operators are not supported for int64.
-    parametrize_dtype = pytest.mark.parametrize("idtype", [F.int32])
+    parametrize_dtype = pytest.mark.parametrize("idtype", [F.int32, F.int64])
 def check_fail(fn, *args, **kwargs):
     try:
...
@@ -111,6 +111,10 @@ void _TestArith(DLContext ctx) {
   c = c.CopyTo(CPU);
   for (int i = 0; i < N; ++i)
     ASSERT_EQ(Ptr<IDX>(c)[i], 10);
+  c = (-a) % b;
+  c = c.CopyTo(CPU);
+  for (int i = 0; i < N; ++i)
+    ASSERT_EQ(Ptr<IDX>(c)[i], 3);
 
   const int val = -3;
   c = aten::Add(a, val);
@@ -129,6 +133,11 @@ void _TestArith(DLContext ctx) {
   c = c.CopyTo(CPU);
   for (int i = 0; i < N; ++i)
     ASSERT_EQ(Ptr<IDX>(c)[i], 3);
+  c = b % 3;
+  c = c.CopyTo(CPU);
+  for (int i = 0; i < N; ++i)
+    ASSERT_EQ(Ptr<IDX>(c)[i], 1);
+
   c = aten::Add(val, b);
   c = c.CopyTo(CPU);
   for (int i = 0; i < N; ++i)
@@ -145,6 +154,10 @@ void _TestArith(DLContext ctx) {
   c = c.CopyTo(CPU);
   for (int i = 0; i < N; ++i)
     ASSERT_EQ(Ptr<IDX>(c)[i], 0);
+  c = 3 % b;
+  c = c.CopyTo(CPU);
+  for (int i = 0; i < N; ++i)
+    ASSERT_EQ(Ptr<IDX>(c)[i], 3);
   a = aten::Range(0, N, sizeof(IDX)*8, ctx);
   c = a < 50;
@@ -179,7 +192,7 @@ void _TestArith(DLContext ctx) {
 }
-TEST(ArrayTest, TestArith) {
+TEST(ArrayTest, Arith) {
   _TestArith<int32_t>(CPU);
   _TestArith<int64_t>(CPU);
 #ifdef DGL_USE_CUDA
@@ -1327,17 +1340,17 @@ void _TestLineGraphCOO(DLContext ctx) {
    *  [0, 0, 0, 0, 0, 0]]
    */
   IdArray a_row =
-    aten::VecToIdArray(std::vector<IdType>({0, 1, 1, 2, 2, 3}), sizeof(IdType)*8, CTX);
+    aten::VecToIdArray(std::vector<IdType>({0, 1, 1, 2, 2, 3}), sizeof(IdType)*8, ctx);
   IdArray a_col =
-    aten::VecToIdArray(std::vector<IdType>({2, 0, 2, 0, 1, 3}), sizeof(IdType)*8, CTX);
+    aten::VecToIdArray(std::vector<IdType>({2, 0, 2, 0, 1, 3}), sizeof(IdType)*8, ctx);
   IdArray b_row =
-    aten::VecToIdArray(std::vector<IdType>({0, 1, 2, 4}), sizeof(IdType)*8, CTX);
+    aten::VecToIdArray(std::vector<IdType>({0, 1, 2, 4}), sizeof(IdType)*8, ctx);
   IdArray b_col =
-    aten::VecToIdArray(std::vector<IdType>({4, 0, 3, 1}), sizeof(IdType)*8, CTX);
+    aten::VecToIdArray(std::vector<IdType>({4, 0, 3, 1}), sizeof(IdType)*8, ctx);
   IdArray c_row =
-    aten::VecToIdArray(std::vector<IdType>({0, 0, 1, 2, 2, 3, 4, 4}), sizeof(IdType)*8, CTX);
+    aten::VecToIdArray(std::vector<IdType>({0, 0, 1, 2, 2, 3, 4, 4}), sizeof(IdType)*8, ctx);
   IdArray c_col =
-    aten::VecToIdArray(std::vector<IdType>({3, 4, 0, 3, 4, 0, 1, 2}), sizeof(IdType)*8, CTX);
+    aten::VecToIdArray(std::vector<IdType>({3, 4, 0, 3, 4, 0, 1, 2}), sizeof(IdType)*8, ctx);
   const aten::COOMatrix &coo_a = aten::COOMatrix(
     4, 4,
@@ -1365,15 +1378,15 @@ void _TestLineGraphCOO(DLContext ctx) {
   ASSERT_FALSE(l_coo2.col_sorted);
   IdArray a_data =
-    aten::VecToIdArray(std::vector<IdType>({4, 5, 0, 1, 2, 3}), sizeof(IdType)*8, CTX);
+    aten::VecToIdArray(std::vector<IdType>({4, 5, 0, 1, 2, 3}), sizeof(IdType)*8, ctx);
   b_row =
-    aten::VecToIdArray(std::vector<IdType>({4, 5, 0, 2}), sizeof(IdType)*8, CTX);
+    aten::VecToIdArray(std::vector<IdType>({4, 5, 0, 2}), sizeof(IdType)*8, ctx);
   b_col =
-    aten::VecToIdArray(std::vector<IdType>({2, 4, 1, 5}), sizeof(IdType)*8, CTX);
+    aten::VecToIdArray(std::vector<IdType>({2, 4, 1, 5}), sizeof(IdType)*8, ctx);
   c_row =
-    aten::VecToIdArray(std::vector<IdType>({4, 4, 5, 0, 0, 1, 2, 2}), sizeof(IdType)*8, CTX);
+    aten::VecToIdArray(std::vector<IdType>({4, 4, 5, 0, 0, 1, 2, 2}), sizeof(IdType)*8, ctx);
   c_col =
-    aten::VecToIdArray(std::vector<IdType>({1, 2, 4, 1, 2, 4, 5, 0}), sizeof(IdType)*8, CTX);
+    aten::VecToIdArray(std::vector<IdType>({1, 2, 4, 1, 2, 4, 5, 0}), sizeof(IdType)*8, ctx);
   const aten::COOMatrix &coo_ad = aten::COOMatrix(
     4, 4,
@@ -1403,3 +1416,44 @@ TEST(LineGraphTest, LineGraphCOO) {
   _TestLineGraphCOO<int32_t>(CPU);
   _TestLineGraphCOO<int64_t>(CPU);
 }
+
+template <typename IDX>
+void _TestSort(DLContext ctx) {
+  // case 1
+  IdArray a =
+    aten::VecToIdArray(std::vector<IDX>({8, 6, 7, 5, 3, 0, 9}), sizeof(IDX)*8, ctx);
+  IdArray sorted_a =
+    aten::VecToIdArray(std::vector<IDX>({0, 3, 5, 6, 7, 8, 9}), sizeof(IDX)*8, ctx);
+  IdArray sorted_idx =
+    aten::VecToIdArray(std::vector<IDX>({5, 4, 3, 1, 2, 0, 6}), 64, ctx);
+  IdArray sorted, idx;
+  std::tie(sorted, idx) = aten::Sort(a);
+  ASSERT_TRUE(ArrayEQ<IDX>(sorted, sorted_a));
+  ASSERT_TRUE(ArrayEQ<IDX>(idx, sorted_idx));
+
+  // case 2: empty array
+  a = aten::VecToIdArray(std::vector<IDX>({}), sizeof(IDX)*8, ctx);
+  sorted_a = aten::VecToIdArray(std::vector<IDX>({}), sizeof(IDX)*8, ctx);
+  sorted_idx = aten::VecToIdArray(std::vector<IDX>({}), 64, ctx);
+  std::tie(sorted, idx) = aten::Sort(a);
+  ASSERT_TRUE(ArrayEQ<IDX>(sorted, sorted_a));
+  ASSERT_TRUE(ArrayEQ<IDX>(idx, sorted_idx));
+
+  // case 3: array with one element
+  a = aten::VecToIdArray(std::vector<IDX>({2}), sizeof(IDX)*8, ctx);
+  sorted_a = aten::VecToIdArray(std::vector<IDX>({2}), sizeof(IDX)*8, ctx);
+  sorted_idx = aten::VecToIdArray(std::vector<IDX>({0}), 64, ctx);
+  std::tie(sorted, idx) = aten::Sort(a);
+  ASSERT_TRUE(ArrayEQ<IDX>(sorted, sorted_a));
+  ASSERT_TRUE(ArrayEQ<IDX>(idx, sorted_idx));
+}
+
+TEST(ArrayTest, Sort) {
+  _TestSort<int32_t>(CPU);
+  _TestSort<int64_t>(CPU);
+#ifdef DGL_USE_CUDA
+  _TestSort<int32_t>(GPU);
+  _TestSort<int64_t>(GPU);
+#endif
+}
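[Editor's note] Note that the returned index array is always 64-bit regardless of key width, as the `64` passed when building `sorted_idx` shows. On the GPU, a key/index pair sort of this shape is commonly built on CUB's radix sort; the following is a minimal sketch under that assumption (hypothetical helper, not code from this commit):

#include <cub/cub.cuh>

// Sort device-resident keys and carry their original positions along,
// mirroring the (sorted, idx) pair that aten::Sort returns.
// d_idx_in is assumed pre-filled with 0..n-1 (e.g. via thrust::sequence).
template <typename IdType>
void SortPairsSketch(const IdType* d_keys_in, IdType* d_keys_out,
                     const int64_t* d_idx_in, int64_t* d_idx_out,
                     int64_t n, cudaStream_t stream) {
  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  // First call only computes the required temporary-buffer size.
  cub::DeviceRadixSort::SortPairs(d_temp, temp_bytes,
      d_keys_in, d_keys_out, d_idx_in, d_idx_out, n,
      0, sizeof(IdType) * 8, stream);
  cudaMalloc(&d_temp, temp_bytes);
  // Second call performs the actual sort.
  cub::DeviceRadixSort::SortPairs(d_temp, temp_bytes,
      d_keys_in, d_keys_out, d_idx_in, d_idx_out, n,
      0, sizeof(IdType) * 8, stream);
  cudaFree(d_temp);
}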
@@ -180,6 +180,7 @@ TEST(SpmatTest, COOToCSR) {
   _TestCOOToCSR<int64_t>(CPU);
 #ifdef DGL_USE_CUDA
   _TestCOOToCSR<int32_t>(GPU);
+  _TestCOOToCSR<int64_t>(GPU);
 #endif
 }
@@ -265,6 +266,7 @@ TEST(SpmatTest, COOSort) {
   _TestCOOSort<int64_t>(CPU);
 #ifdef DGL_USE_CUDA
   _TestCOOSort<int32_t>(GPU);
+  _TestCOOSort<int64_t>(GPU);
 #endif
 }
...
@@ -241,6 +241,7 @@ TEST(SpmatTest, CSRGetData) {
   _TestCSRGetData<int64_t>(CPU);
 #ifdef DGL_USE_CUDA
   _TestCSRGetData<int32_t>(GPU);
+  _TestCSRGetData<int64_t>(GPU);
 #endif
 }
@@ -287,11 +288,12 @@ void _TestCSRTranspose(DLContext ctx) {
   ASSERT_TRUE(ArrayEQ<IDX>(csr_t.data, td));
 }
-TEST(SpmatTest, TestCSRTranspose) {
+TEST(SpmatTest, CSRTranspose) {
   _TestCSRTranspose<int32_t>(CPU);
   _TestCSRTranspose<int64_t>(CPU);
 #ifdef DGL_USE_CUDA
   _TestCSRTranspose<int32_t>(GPU);
+  _TestCSRTranspose<int64_t>(GPU);
 #endif
 }
@@ -335,6 +337,7 @@ TEST(SpmatTest, CSRToCOO) {
   _TestCSRToCOO<int64_t>(CPU);
 #if DGL_USE_CUDA
   _TestCSRToCOO<int32_t>(GPU);
+  _TestCSRToCOO<int64_t>(GPU);
 #endif
 }
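[Editor's note] Converting CSR to COO amounts to expanding indptr into an explicit row-index array. One straightforward way to do that on the GPU is a row-parallel kernel like the sketch below (hypothetical names, not necessarily the kernel this commit adds; load-balanced variants exist for heavily skewed rows):

// Sketch: expand a CSR indptr array into COO row indices.
// Each thread handles one row and writes its row ID into every
// nonzero slot of that row.
template <typename IdType>
__global__ void CSRToCOORowsSketch(const IdType* indptr, int64_t num_rows,
                                   IdType* row_out) {
  int64_t r = blockIdx.x * (int64_t)blockDim.x + threadIdx.x;
  const int64_t stride = (int64_t)gridDim.x * blockDim.x;
  for (; r < num_rows; r += stride) {
    for (IdType i = indptr[r]; i < indptr[r + 1]; ++i)
      row_out[i] = static_cast<IdType>(r);
  }
}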
@@ -441,6 +444,7 @@ TEST(SpmatTest, CSRSliceMatrix) {
   _TestCSRSliceMatrix<int64_t>(CPU);
 #ifdef DGL_USE_CUDA
   _TestCSRSliceMatrix<int32_t>(GPU);
+  _TestCSRSliceMatrix<int64_t>(GPU);
 #endif
 }
@@ -457,6 +461,7 @@ TEST(SpmatTest, CSRHasDuplicate) {
   _TestCSRHasDuplicate<int64_t>(CPU);
 #ifdef DGL_USE_CUDA
   _TestCSRHasDuplicate<int32_t>(GPU);
+  _TestCSRHasDuplicate<int64_t>(GPU);
 #endif
 }
@@ -480,6 +485,7 @@ TEST(SpmatTest, CSRSort) {
   _TestCSRSort<int64_t>(CPU);
 #ifdef DGL_USE_CUDA
   _TestCSRSort<int32_t>(GPU);
+  _TestCSRSort<int64_t>(GPU);
 #endif
 }
...
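[Editor's note] CSRSort sorts the column indices within each row, which maps naturally onto a segmented sort where the indptr array doubles as the segment offsets. A minimal sketch of that pattern using CUB (an assumption about the implementation, not code from this commit):

#include <cub/cub.cuh>

// Sketch: sort column indices within each CSR row, keeping the data
// array aligned. Segment i spans [indptr[i], indptr[i+1]).
template <typename IdType>
void CSRSortSketch(const IdType* d_indptr, int64_t num_rows, int64_t nnz,
                   const IdType* d_cols_in, IdType* d_cols_out,
                   const IdType* d_data_in, IdType* d_data_out,
                   cudaStream_t stream) {
  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  // First call computes the temporary-buffer size; second call sorts.
  cub::DeviceSegmentedRadixSort::SortPairs(d_temp, temp_bytes,
      d_cols_in, d_cols_out, d_data_in, d_data_out,
      static_cast<int>(nnz), static_cast<int>(num_rows),
      d_indptr, d_indptr + 1, 0, sizeof(IdType) * 8, stream);
  cudaMalloc(&d_temp, temp_bytes);
  cub::DeviceSegmentedRadixSort::SortPairs(d_temp, temp_bytes,
      d_cols_in, d_cols_out, d_data_in, d_data_out,
      static_cast<int>(nnz), static_cast<int>(num_rows),
      d_indptr, d_indptr + 1, 0, sizeof(IdType) * 8, stream);
  cudaFree(d_temp);
}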
@@ -247,7 +247,7 @@ def test_partial_edge_softmax():
     grad = F.randn((300, 1))
     import numpy as np
     eids = np.random.choice(900, 300, replace=False).astype('int64')
-    eids = F.zerocopy_from_numpy(eids)
+    eids = F.tensor(eids)
     # compute partial edge softmax
     with tf.GradientTape() as tape:
         tape.watch(score)
@@ -255,7 +255,7 @@ def test_partial_edge_softmax():
     grads = tape.gradient(y_1, [score])
     grad_1 = grads[0]
     # compute edge softmax on edge subgraph
-    subg = g.edge_subgraph(eids)
+    subg = g.edge_subgraph(eids, preserve_nodes=True)
     with tf.GradientTape() as tape:
         tape.watch(score)
         y_2 = nn.edge_softmax(subg, score)
@@ -348,8 +348,8 @@ def test_rgcn():
     rgc_basis_low = nn.RelGraphConv(I, O, R, "basis", B, low_mem=True)
     rgc_basis_low.weight = rgc_basis.weight
     rgc_basis_low.w_comp = rgc_basis.w_comp
-    h = tf.constant(np.random.randint(0, I, (100,)))
-    r = tf.constant(etype)
+    h = tf.constant(np.random.randint(0, I, (100,))) * 1
+    r = tf.constant(etype) * 1
     h_new = rgc_basis(g, h, r)
     h_new_low = rgc_basis_low(g, h, r)
     assert list(h_new.shape) == [100, O]
...
@@ -5,7 +5,7 @@ if F._default_context_str == 'cpu':
     parametrize_dtype = pytest.mark.parametrize("idtype", [F.int32, F.int64])
 else:
     # only test int32 on GPU because many graph operators are not supported for int64.
-    parametrize_dtype = pytest.mark.parametrize("idtype", [F.int32])
+    parametrize_dtype = pytest.mark.parametrize("idtype", [F.int32, F.int64])
 from .checks import *
 from .graph_cases import get_cases