Unverified commit f4608c22, authored by Minjie Wang, committed by GitHub
Browse files

[CUDA][Kernel] A bunch of int64 kernels for COO and CSR (#1883)

* COO sort

* COOToCSR

* CSR2COO

* CSRSort; CSRTranspose

* pass all CSR tests

* lint

* remove int32 conversion

* fix tensorflow nn tests

* turn on CI

* fix

* address comments
parent 5b515cf6
......@@ -375,7 +375,9 @@ __global__ void _SegmentMaskKernel(
* of each needle so that the insertion still gives sorted order.
*
* It essentially perform binary search to find lower bound for each needle
* elements.
* elements. Require the largest elements in the hay is larger than the given
* needle elements. Commonly used in searching for row IDs of a given set of
* coordinates.
*/
template <typename IdType>
__global__ void _SortedSearchKernel(
......@@ -435,7 +437,7 @@ std::vector<NDArray> CSRGetDataAndIndices(CSRMatrix csr, NDArray row, NDArray co
IdArray ret_row = NewIdArray(idx->shape[0], ctx, nbits);
const int nt2 = cuda::FindNumThreads(idx->shape[0]);
const int nb2 = (idx->shape[0] + nt - 1) / nt;
_SortedSearchKernel<<<nb, nt, 0, thr_entry->stream>>>(
_SortedSearchKernel<<<nb2, nt2, 0, thr_entry->stream>>>(
csr.indptr.Ptr<IdType>(), csr.num_rows,
idx.Ptr<IdType>(), idx->shape[0],
ret_row.Ptr<IdType>());
......
......@@ -15,6 +15,7 @@
#include <vector>
#include <set>
#include <tuple>
#include <memory>
#include "./unit_graph.h"
#include "shared_mem_manager.h"
......
......@@ -5,7 +5,7 @@ if F._default_context_str == 'cpu':
parametrize_dtype = pytest.mark.parametrize("idtype", [F.int32, F.int64])
else:
# test both int32 and int64 on GPU now that int64 kernels are supported.
parametrize_dtype = pytest.mark.parametrize("idtype", [F.int32])
parametrize_dtype = pytest.mark.parametrize("idtype", [F.int32, F.int64])
def check_fail(fn, *args, **kwargs):
try:
......
......@@ -111,6 +111,10 @@ void _TestArith(DLContext ctx) {
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], 10);
c = (-a) % b;
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], 3);
const int val = -3;
c = aten::Add(a, val);
......@@ -129,6 +133,11 @@ void _TestArith(DLContext ctx) {
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], 3);
c = b % 3;
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], 1);
c = aten::Add(val, b);
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
......@@ -145,6 +154,10 @@ void _TestArith(DLContext ctx) {
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], 0);
c = 3 % b;
c = c.CopyTo(CPU);
for (int i = 0; i < N; ++i)
ASSERT_EQ(Ptr<IDX>(c)[i], 3);
a = aten::Range(0, N, sizeof(IDX)*8, ctx);
c = a < 50;
......@@ -179,7 +192,7 @@ void _TestArith(DLContext ctx) {
}
TEST(ArrayTest, TestArith) {
TEST(ArrayTest, Arith) {
_TestArith<int32_t>(CPU);
_TestArith<int64_t>(CPU);
#ifdef DGL_USE_CUDA
......@@ -1327,17 +1340,17 @@ void _TestLineGraphCOO(DLContext ctx) {
* [0, 0, 0, 0, 0, 0]]
*/
IdArray a_row =
aten::VecToIdArray(std::vector<IdType>({0, 1, 1, 2, 2, 3}), sizeof(IdType)*8, CTX);
aten::VecToIdArray(std::vector<IdType>({0, 1, 1, 2, 2, 3}), sizeof(IdType)*8, ctx);
IdArray a_col =
aten::VecToIdArray(std::vector<IdType>({2, 0, 2, 0, 1, 3}), sizeof(IdType)*8, CTX);
aten::VecToIdArray(std::vector<IdType>({2, 0, 2, 0, 1, 3}), sizeof(IdType)*8, ctx);
IdArray b_row =
aten::VecToIdArray(std::vector<IdType>({0, 1, 2, 4}), sizeof(IdType)*8, CTX);
aten::VecToIdArray(std::vector<IdType>({0, 1, 2, 4}), sizeof(IdType)*8, ctx);
IdArray b_col =
aten::VecToIdArray(std::vector<IdType>({4, 0, 3, 1}), sizeof(IdType)*8, CTX);
aten::VecToIdArray(std::vector<IdType>({4, 0, 3, 1}), sizeof(IdType)*8, ctx);
IdArray c_row =
aten::VecToIdArray(std::vector<IdType>({0, 0, 1, 2, 2, 3, 4, 4}), sizeof(IdType)*8, CTX);
aten::VecToIdArray(std::vector<IdType>({0, 0, 1, 2, 2, 3, 4, 4}), sizeof(IdType)*8, ctx);
IdArray c_col =
aten::VecToIdArray(std::vector<IdType>({3, 4, 0, 3, 4, 0, 1, 2}), sizeof(IdType)*8, CTX);
aten::VecToIdArray(std::vector<IdType>({3, 4, 0, 3, 4, 0, 1, 2}), sizeof(IdType)*8, ctx);
const aten::COOMatrix &coo_a = aten::COOMatrix(
4,
......@@ -1365,15 +1378,15 @@ void _TestLineGraphCOO(DLContext ctx) {
ASSERT_FALSE(l_coo2.col_sorted);
IdArray a_data =
aten::VecToIdArray(std::vector<IdType>({4, 5, 0, 1, 2, 3}), sizeof(IdType)*8, CTX);
aten::VecToIdArray(std::vector<IdType>({4, 5, 0, 1, 2, 3}), sizeof(IdType)*8, ctx);
b_row =
aten::VecToIdArray(std::vector<IdType>({4, 5, 0, 2}), sizeof(IdType)*8, CTX);
aten::VecToIdArray(std::vector<IdType>({4, 5, 0, 2}), sizeof(IdType)*8, ctx);
b_col =
aten::VecToIdArray(std::vector<IdType>({2, 4, 1, 5}), sizeof(IdType)*8, CTX);
aten::VecToIdArray(std::vector<IdType>({2, 4, 1, 5}), sizeof(IdType)*8, ctx);
c_row =
aten::VecToIdArray(std::vector<IdType>({4, 4, 5, 0, 0, 1, 2, 2}), sizeof(IdType)*8, CTX);
aten::VecToIdArray(std::vector<IdType>({4, 4, 5, 0, 0, 1, 2, 2}), sizeof(IdType)*8, ctx);
c_col =
aten::VecToIdArray(std::vector<IdType>({1, 2, 4, 1, 2, 4, 5, 0}), sizeof(IdType)*8, CTX);
aten::VecToIdArray(std::vector<IdType>({1, 2, 4, 1, 2, 4, 5, 0}), sizeof(IdType)*8, ctx);
const aten::COOMatrix &coo_ad = aten::COOMatrix(
4,
4,
......@@ -1403,3 +1416,44 @@ TEST(LineGraphTest, LineGraphCOO) {
_TestLineGraphCOO<int32_t>(CPU);
_TestLineGraphCOO<int64_t>(CPU);
}
template <typename IDX>
void _TestSort(DLContext ctx) {
  // Exercises aten::Sort on a general array, an empty array, and a
  // single-element array. For each input we expect the values in
  // ascending order plus an index array (always 64-bit, regardless of
  // IDX) mapping each sorted position back to its original position.
  const uint8_t nbits = sizeof(IDX) * 8;
  auto check = [&](const std::vector<IDX>& input,
                   const std::vector<IDX>& exp_vals,
                   const std::vector<IDX>& exp_idx) {
    IdArray arr = aten::VecToIdArray(input, nbits, ctx);
    IdArray sorted, idx;
    std::tie(sorted, idx) = aten::Sort(arr);
    ASSERT_TRUE(ArrayEQ<IDX>(sorted, aten::VecToIdArray(exp_vals, nbits, ctx)));
    // NOTE: index array is compared against a 64-bit expected array.
    ASSERT_TRUE(ArrayEQ<IDX>(idx, aten::VecToIdArray(exp_idx, 64, ctx)));
  };
  // case 1: general unsorted input
  check({8, 6, 7, 5, 3, 0, 9}, {0, 3, 5, 6, 7, 8, 9}, {5, 4, 3, 1, 2, 0, 6});
  // case 2: empty array
  check({}, {}, {});
  // case 3: array with one element
  check({2}, {2}, {0});
}
// Registers the aten::Sort unit test for both 32- and 64-bit index
// types; runs on CPU always and additionally on GPU when built with
// CUDA support (matching the CPU-first, int32-first ordering used by
// the sibling TEST cases in this file).
TEST(ArrayTest, Sort) {
_TestSort<int32_t>(CPU);
_TestSort<int64_t>(CPU);
#ifdef DGL_USE_CUDA
_TestSort<int32_t>(GPU);
_TestSort<int64_t>(GPU);
#endif
}
......@@ -180,6 +180,7 @@ TEST(SpmatTest, COOToCSR) {
_TestCOOToCSR<int64_t>(CPU);
#ifdef DGL_USE_CUDA
_TestCOOToCSR<int32_t>(GPU);
_TestCOOToCSR<int64_t>(GPU);
#endif
}
......@@ -265,6 +266,7 @@ TEST(SpmatTest, COOSort) {
_TestCOOSort<int64_t>(CPU);
#ifdef DGL_USE_CUDA
_TestCOOSort<int32_t>(GPU);
_TestCOOSort<int64_t>(GPU);
#endif
}
......
......@@ -241,6 +241,7 @@ TEST(SpmatTest, CSRGetData) {
_TestCSRGetData<int64_t>(CPU);
#ifdef DGL_USE_CUDA
_TestCSRGetData<int32_t>(GPU);
_TestCSRGetData<int64_t>(GPU);
#endif
}
......@@ -287,11 +288,12 @@ void _TestCSRTranspose(DLContext ctx) {
ASSERT_TRUE(ArrayEQ<IDX>(csr_t.data, td));
}
TEST(SpmatTest, TestCSRTranspose) {
TEST(SpmatTest, CSRTranspose) {
_TestCSRTranspose<int32_t>(CPU);
_TestCSRTranspose<int64_t>(CPU);
#ifdef DGL_USE_CUDA
_TestCSRTranspose<int32_t>(GPU);
_TestCSRTranspose<int64_t>(GPU);
#endif
}
......@@ -335,6 +337,7 @@ TEST(SpmatTest, CSRToCOO) {
_TestCSRToCOO<int64_t>(CPU);
#if DGL_USE_CUDA
_TestCSRToCOO<int32_t>(GPU);
_TestCSRToCOO<int64_t>(GPU);
#endif
}
......@@ -441,6 +444,7 @@ TEST(SpmatTest, CSRSliceMatrix) {
_TestCSRSliceMatrix<int64_t>(CPU);
#ifdef DGL_USE_CUDA
_TestCSRSliceMatrix<int32_t>(GPU);
_TestCSRSliceMatrix<int64_t>(GPU);
#endif
}
......@@ -457,6 +461,7 @@ TEST(SpmatTest, CSRHasDuplicate) {
_TestCSRHasDuplicate<int64_t>(CPU);
#ifdef DGL_USE_CUDA
_TestCSRHasDuplicate<int32_t>(GPU);
_TestCSRHasDuplicate<int64_t>(GPU);
#endif
}
......@@ -480,6 +485,7 @@ TEST(SpmatTest, CSRSort) {
_TestCSRSort<int64_t>(CPU);
#ifdef DGL_USE_CUDA
_TestCSRSort<int32_t>(GPU);
_TestCSRSort<int64_t>(GPU);
#endif
}
......
......@@ -247,7 +247,7 @@ def test_partial_edge_softmax():
grad = F.randn((300, 1))
import numpy as np
eids = np.random.choice(900, 300, replace=False).astype('int64')
eids = F.zerocopy_from_numpy(eids)
eids = F.tensor(eids)
# compute partial edge softmax
with tf.GradientTape() as tape:
tape.watch(score)
......@@ -255,7 +255,7 @@ def test_partial_edge_softmax():
grads = tape.gradient(y_1, [score])
grad_1 = grads[0]
# compute edge softmax on edge subgraph
subg = g.edge_subgraph(eids)
subg = g.edge_subgraph(eids, preserve_nodes=True)
with tf.GradientTape() as tape:
tape.watch(score)
y_2 = nn.edge_softmax(subg, score)
......@@ -348,8 +348,8 @@ def test_rgcn():
rgc_basis_low = nn.RelGraphConv(I, O, R, "basis", B, low_mem=True)
rgc_basis_low.weight = rgc_basis.weight
rgc_basis_low.w_comp = rgc_basis.w_comp
h = tf.constant(np.random.randint(0, I, (100,)))
r = tf.constant(etype)
h = tf.constant(np.random.randint(0, I, (100,))) * 1
r = tf.constant(etype) * 1
h_new = rgc_basis(g, h, r)
h_new_low = rgc_basis_low(g, h, r)
assert list(h_new.shape) == [100, O]
......
......@@ -5,7 +5,7 @@ if F._default_context_str == 'cpu':
parametrize_dtype = pytest.mark.parametrize("idtype", [F.int32, F.int64])
else:
# test both int32 and int64 on GPU now that int64 kernels are supported.
parametrize_dtype = pytest.mark.parametrize("idtype", [F.int32])
parametrize_dtype = pytest.mark.parametrize("idtype", [F.int32, F.int64])
from .checks import *
from .graph_cases import get_cases
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment