Unverified commit f4608c22, authored by Minjie Wang, committed by GitHub
Browse files

[CUDA][Kernel] A bunch of int64 kernels for COO and CSR (#1883)

* COO sort

* COOToCSR

* CSR2COO

* CSRSort; CSRTranspose

* pass all CSR tests

* lint

* remove int32 conversion

* fix tensorflow nn tests

* turn on CI

* fix

* address comments
parent 5b515cf6
......@@ -375,7 +375,9 @@ __global__ void _SegmentMaskKernel(
* of each needle so that the insertion still gives sorted order.
*
* It essentially perform binary search to find lower bound for each needle
* elements.
* elements. Require the largest elements in the hay is larger than the given
* needle elements. Commonly used in searching for row IDs of a given set of
* coordinates.
*/
template <typename IdType>
__global__ void _SortedSearchKernel(
......@@ -435,7 +437,7 @@ std::vector<NDArray> CSRGetDataAndIndices(CSRMatrix csr, NDArray row, NDArray co
IdArray ret_row = NewIdArray(idx->shape[0], ctx, nbits);
const int nt2 = cuda::FindNumThreads(idx->shape[0]);
const int nb2 = (idx->shape[0] + nt - 1) / nt;
_SortedSearchKernel<<<nb, nt, 0, thr_entry->stream>>>(
_SortedSearchKernel<<<nb2, nt2, 0, thr_entry->stream>>>(
csr.indptr.Ptr<IdType>(), csr.num_rows,
idx.Ptr<IdType>(), idx->shape[0],
ret_row.Ptr<IdType>());
......
......@@ -15,6 +15,7 @@
#include <vector>
#include <set>
#include <tuple>
#include <memory>
#include "./unit_graph.h"
#include "shared_mem_manager.h"
......
......@@ -5,7 +5,7 @@ if F._default_context_str == 'cpu':
parametrize_dtype = pytest.mark.parametrize("idtype", [F.int32, F.int64])
else:
# test both int32 and int64 on GPU now that int64 kernels are supported.
parametrize_dtype = pytest.mark.parametrize("idtype", [F.int32])
parametrize_dtype = pytest.mark.parametrize("idtype", [F.int32, F.int64])
def check_fail(fn, *args, **kwargs):
try:
......
......@@ -111,6 +111,10 @@ void _TestArith(DLContext ctx) {
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], 10);
c = (-a) % b;
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], 3);
const int val = -3;
c = aten::Add(a, val);
......@@ -129,6 +133,11 @@ void _TestArith(DLContext ctx) {
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], 3);
c = b % 3;
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], 1);
c = aten::Add(val, b);
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
......@@ -145,6 +154,10 @@ void _TestArith(DLContext ctx) {
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], 0);
c = 3 % b;
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], 3);
a = aten::Range(0, N, sizeof(IDX)*8, ctx);
c = a < 50;
......@@ -179,7 +192,7 @@ void _TestArith(DLContext ctx) {
}
TEST(ArrayTest, TestArith) {
TEST(ArrayTest, Arith) {
_TestArith<int32_t>(CPU);
_TestArith<int64_t>(CPU);
#ifdef DGL_USE_CUDA
......@@ -1327,17 +1340,17 @@ void _TestLineGraphCOO(DLContext ctx) {
* [0, 0, 0, 0, 0, 0]]
*/
IdArray a_row =
aten::VecToIdArray(std::vector<IdType>({0, 1, 1, 2, 2, 3}), sizeof(IdType)*8, CTX);
aten::VecToIdArray(std::vector<IdType>({0, 1, 1, 2, 2, 3}), sizeof(IdType)*8, ctx);
IdArray a_col =
aten::VecToIdArray(std::vector<IdType>({2, 0, 2, 0, 1, 3}), sizeof(IdType)*8, CTX);
aten::VecToIdArray(std::vector<IdType>({2, 0, 2, 0, 1, 3}), sizeof(IdType)*8, ctx);
IdArray b_row =
aten::VecToIdArray(std::vector<IdType>({0, 1, 2, 4}), sizeof(IdType)*8, CTX);
aten::VecToIdArray(std::vector<IdType>({0, 1, 2, 4}), sizeof(IdType)*8, ctx);
IdArray b_col =
aten::VecToIdArray(std::vector<IdType>({4, 0, 3, 1}), sizeof(IdType)*8, CTX);
aten::VecToIdArray(std::vector<IdType>({4, 0, 3, 1}), sizeof(IdType)*8, ctx);
IdArray c_row =
aten::VecToIdArray(std::vector<IdType>({0, 0, 1, 2, 2, 3, 4, 4}), sizeof(IdType)*8, CTX);
aten::VecToIdArray(std::vector<IdType>({0, 0, 1, 2, 2, 3, 4, 4}), sizeof(IdType)*8, ctx);
IdArray c_col =
aten::VecToIdArray(std::vector<IdType>({3, 4, 0, 3, 4, 0, 1, 2}), sizeof(IdType)*8, CTX);
aten::VecToIdArray(std::vector<IdType>({3, 4, 0, 3, 4, 0, 1, 2}), sizeof(IdType)*8, ctx);
const aten::COOMatrix &coo_a = aten::COOMatrix(
4,
......@@ -1365,15 +1378,15 @@ void _TestLineGraphCOO(DLContext ctx) {
ASSERT_FALSE(l_coo2.col_sorted);
IdArray a_data =
aten::VecToIdArray(std::vector<IdType>({4, 5, 0, 1, 2, 3}), sizeof(IdType)*8, CTX);
aten::VecToIdArray(std::vector<IdType>({4, 5, 0, 1, 2, 3}), sizeof(IdType)*8, ctx);
b_row =
aten::VecToIdArray(std::vector<IdType>({4, 5, 0, 2}), sizeof(IdType)*8, CTX);
aten::VecToIdArray(std::vector<IdType>({4, 5, 0, 2}), sizeof(IdType)*8, ctx);
b_col =
aten::VecToIdArray(std::vector<IdType>({2, 4, 1, 5}), sizeof(IdType)*8, CTX);
aten::VecToIdArray(std::vector<IdType>({2, 4, 1, 5}), sizeof(IdType)*8, ctx);
c_row =
aten::VecToIdArray(std::vector<IdType>({4, 4, 5, 0, 0, 1, 2, 2}), sizeof(IdType)*8, CTX);
aten::VecToIdArray(std::vector<IdType>({4, 4, 5, 0, 0, 1, 2, 2}), sizeof(IdType)*8, ctx);
c_col =
aten::VecToIdArray(std::vector<IdType>({1, 2, 4, 1, 2, 4, 5, 0}), sizeof(IdType)*8, CTX);
aten::VecToIdArray(std::vector<IdType>({1, 2, 4, 1, 2, 4, 5, 0}), sizeof(IdType)*8, ctx);
const aten::COOMatrix &coo_ad = aten::COOMatrix(
4,
4,
......@@ -1403,3 +1416,44 @@ TEST(LineGraphTest, LineGraphCOO) {
_TestLineGraphCOO<int32_t>(CPU);
_TestLineGraphCOO<int64_t>(CPU);
}
template <typename IDX>
void _TestSort(DLContext ctx) {
  // Exercises aten::Sort on a general array, an empty array, and a
  // single-element array. For each input we expect the values in
  // ascending order plus an index array (always 64-bit, regardless of
  // IDX) mapping each sorted position back to its original position.
  const uint8_t nbits = sizeof(IDX) * 8;
  auto check = [&](const std::vector<IDX>& input,
                   const std::vector<IDX>& exp_vals,
                   const std::vector<IDX>& exp_idx) {
    IdArray arr = aten::VecToIdArray(input, nbits, ctx);
    IdArray sorted, idx;
    std::tie(sorted, idx) = aten::Sort(arr);
    ASSERT_TRUE(ArrayEQ<IDX>(sorted, aten::VecToIdArray(exp_vals, nbits, ctx)));
    // NOTE: index array is compared against a 64-bit expected array.
    ASSERT_TRUE(ArrayEQ<IDX>(idx, aten::VecToIdArray(exp_idx, 64, ctx)));
  };
  // case 1: general unsorted input
  check({8, 6, 7, 5, 3, 0, 9}, {0, 3, 5, 6, 7, 8, 9}, {5, 4, 3, 1, 2, 0, 6});
  // case 2: empty array
  check({}, {}, {});
  // case 3: array with one element
  check({2}, {2}, {0});
}
// Registers the aten::Sort unit test for both 32- and 64-bit index
// types; runs on CPU always and additionally on GPU when built with
// CUDA support (matching the CPU-first, int32-first ordering used by
// the sibling TEST cases in this file).
TEST(ArrayTest, Sort) {
_TestSort<int32_t>(CPU);
_TestSort<int64_t>(CPU);
#ifdef DGL_USE_CUDA
_TestSort<int32_t>(GPU);
_TestSort<int64_t>(GPU);
#endif
}
......@@ -180,6 +180,7 @@ TEST(SpmatTest, COOToCSR) {
_TestCOOToCSR<int64_t>(CPU);
#ifdef DGL_USE_CUDA
_TestCOOToCSR<int32_t>(GPU);
_TestCOOToCSR<int64_t>(GPU);
#endif
}
......@@ -265,6 +266,7 @@ TEST(SpmatTest, COOSort) {
_TestCOOSort<int64_t>(CPU);
#ifdef DGL_USE_CUDA
_TestCOOSort<int32_t>(GPU);
_TestCOOSort<int64_t>(GPU);
#endif
}
......
......@@ -241,6 +241,7 @@ TEST(SpmatTest, CSRGetData) {
_TestCSRGetData<int64_t>(CPU);
#ifdef DGL_USE_CUDA
_TestCSRGetData<int32_t>(GPU);
_TestCSRGetData<int64_t>(GPU);
#endif
}
......@@ -287,11 +288,12 @@ void _TestCSRTranspose(DLContext ctx) {
ASSERT_TRUE(ArrayEQ<IDX>(csr_t.data, td));
}
TEST(SpmatTest, TestCSRTranspose) {
TEST(SpmatTest, CSRTranspose) {
_TestCSRTranspose<int32_t>(CPU);
_TestCSRTranspose<int64_t>(CPU);
#ifdef DGL_USE_CUDA
_TestCSRTranspose<int32_t>(GPU);
_TestCSRTranspose<int64_t>(GPU);
#endif
}
......@@ -335,6 +337,7 @@ TEST(SpmatTest, CSRToCOO) {
_TestCSRToCOO<int64_t>(CPU);
#if DGL_USE_CUDA
_TestCSRToCOO<int32_t>(GPU);
_TestCSRToCOO<int64_t>(GPU);
#endif
}
......@@ -441,6 +444,7 @@ TEST(SpmatTest, CSRSliceMatrix) {
_TestCSRSliceMatrix<int64_t>(CPU);
#ifdef DGL_USE_CUDA
_TestCSRSliceMatrix<int32_t>(GPU);
_TestCSRSliceMatrix<int64_t>(GPU);
#endif
}
......@@ -457,6 +461,7 @@ TEST(SpmatTest, CSRHasDuplicate) {
_TestCSRHasDuplicate<int64_t>(CPU);
#ifdef DGL_USE_CUDA
_TestCSRHasDuplicate<int32_t>(GPU);
_TestCSRHasDuplicate<int64_t>(GPU);
#endif
}
......@@ -480,6 +485,7 @@ TEST(SpmatTest, CSRSort) {
_TestCSRSort<int64_t>(CPU);
#ifdef DGL_USE_CUDA
_TestCSRSort<int32_t>(GPU);
_TestCSRSort<int64_t>(GPU);
#endif
}
......
......@@ -247,7 +247,7 @@ def test_partial_edge_softmax():
grad = F.randn((300, 1))
import numpy as np
eids = np.random.choice(900, 300, replace=False).astype('int64')
eids = F.zerocopy_from_numpy(eids)
eids = F.tensor(eids)
# compute partial edge softmax
with tf.GradientTape() as tape:
tape.watch(score)
......@@ -255,7 +255,7 @@ def test_partial_edge_softmax():
grads = tape.gradient(y_1, [score])
grad_1 = grads[0]
# compute edge softmax on edge subgraph
subg = g.edge_subgraph(eids)
subg = g.edge_subgraph(eids, preserve_nodes=True)
with tf.GradientTape() as tape:
tape.watch(score)
y_2 = nn.edge_softmax(subg, score)
......@@ -348,8 +348,8 @@ def test_rgcn():
rgc_basis_low = nn.RelGraphConv(I, O, R, "basis", B, low_mem=True)
rgc_basis_low.weight = rgc_basis.weight
rgc_basis_low.w_comp = rgc_basis.w_comp
h = tf.constant(np.random.randint(0, I, (100,)))
r = tf.constant(etype)
h = tf.constant(np.random.randint(0, I, (100,))) * 1
r = tf.constant(etype) * 1
h_new = rgc_basis(g, h, r)
h_new_low = rgc_basis_low(g, h, r)
assert list(h_new.shape) == [100, O]
......
......@@ -5,7 +5,7 @@ if F._default_context_str == 'cpu':
parametrize_dtype = pytest.mark.parametrize("idtype", [F.int32, F.int64])
else:
# test both int32 and int64 on GPU now that int64 kernels are supported.
parametrize_dtype = pytest.mark.parametrize("idtype", [F.int32])
parametrize_dtype = pytest.mark.parametrize("idtype", [F.int32, F.int64])
from .checks import *
from .graph_cases import get_cases
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment