[Feature] Improve sampling speed; Better pickle/unpickle; other fixes (#1299)

* improve performance of sample_neighbors
* some more improve
* test script
* benchmarks
* multi process
* update more tests
* WIP
* adding two API for state saving
* add create from state
* upd test
* missing file
* wip: pickle/unpickle
* more c apis
* find the problem of empty data array
* add null array; pickling speed is bad
* still bad perf
* still bad perf
* wip
* fix the pickle speed test; now everything looks good
* minor fix
* bugfix
* some lint fix
* address comments
* more fix
* fix lint
* add utest for random.choice
* add utest for dgl.rand_graph
* fix cpp utests
* try fix ci
* fix bug in TF backend
* upd choice docstring
* address comments
* upd
* try fix compile
* add comment

[Feature] Improve sampling speed; Better pickle/unpickle; other fixes (#1299)
* improve performance of sample_neighbors
* some more improve
* test script
* benchmarks
* multi process
* update more tests
* WIP
* adding two API for state saving
* add create from state
* upd test
* missing file
* wip: pickle/unpickle
* more c apis
* find the problem of empty data array
* add null array; pickling speed is bad
* still bad perf
* still bad perf
* wip
* fix the pickle speed test; now everything looks good
* minor fix
* bugfix
* some lint fix
* address comments
* more fix
* fix lint
* add utest for random.choice
* add utest for dgl.rand_graph
* fix cpp utests
* try fix ci
* fix bug in TF backend
* upd choice docstring
* address comments
* upd
* try fix compile
* add comment
5dd35580 · Minjie Wang · GitHub · 00ba4094 · 5dd35580 · 5dd35580
Unverified Commit 5dd35580 authored Mar 02, 2020 by Minjie Wang Committed by GitHub Mar 02, 2020
5 changed files
--- a/tests/compute/test_pickle.py
+++ b/tests/compute/test_pickle.py
@@ -268,3 +268,4 @@ if __name__ == '__main__':
    test_pickling_graph()
    test_pickling_nodeflow()
    test_pickling_batched_graph()
+    test_pickling_heterograph()
--- a/tests/compute/test_random.py
+++ b/tests/compute/test_random.py
+import dgl
+import backend as F
+import numpy as np
+import unittest
+@unittest.skipIf(F._default_context_str == 'gpu', reason="GPU random choice not implemented")
+def test_random_choice():
+    # test 1: draw 10 values from arange(0, 100) with replacement and no prob; expect 10 results, each in [0, 100)
+    a = F.arange(0, 100)
+    x = dgl.random.choice(a, 10, replace=True, prob=None)
+    assert len(x) == 10
+    for i in range(len(x)):
+        assert x[i] >= 0 and x[i] < 100
+    # test 2, replace=False, small num: same length and range checks as test 1, drawing 10 of 100 without replacement
+    a = F.arange(0, 100)
+    x = dgl.random.choice(a, 10, replace=False, prob=None)
+    assert len(x) == 10
+    for i in range(len(x)):
+        assert x[i] >= 0 and x[i] < 100
+    # test 3, replace=False, large num: drawing all 100 without replacement must yield a permutation of a (sorted result equals a)
+    a = F.arange(0, 100)
+    x = dgl.random.choice(a, 100, replace=False, prob=None)
+    assert len(x) == 100
+    assert np.array_equal(np.sort(F.asnumpy(x)), F.asnumpy(a))
+    # test 4, first arg is integer: pass 100 instead of a tensor; result is compared against a, still bound from test 3
+    x = dgl.random.choice(100, 100, replace=False, prob=None)
+    assert len(x) == 100
+    assert np.array_equal(np.sort(F.asnumpy(x)), F.asnumpy(a))
+    # test 5, with prob: indices 37..39 get probability 0 and the rest is normalized to sum to 1; 97 draws without replacement must never return 37, 38 or 39
+    prob = np.ones((100,))
+    prob[37:40] = 0.
+    prob -= prob.min()
+    prob /= prob.sum()
+    prob = F.tensor(prob)
+    x = dgl.random.choice(100, 97, replace=False, prob=prob)
+    assert len(x) == 97
+    for i in range(len(x)):
+        assert x[i] < 37 or x[i] >= 40
+if __name__ == '__main__':
+    test_random_choice()
--- a/tests/compute/test_sampling.py
+++ b/tests/compute/test_sampling.py
@@ -271,7 +271,7 @@ def _test_sample_neighbors(hypersparse):
    # test different fanouts for different relations
    for i in range(10):
-        subg = dgl.sampling.sample_neighbors(hg, {'user' : [0,1], 'game' : 0}, [1, 2, 0, 2])
+        subg = dgl.sampling.sample_neighbors(hg, {'user' : [0,1], 'game' : 0}, [1, 2, 0, 2], replace=True)
        assert len(subg.ntypes) == 3
        assert len(subg.etypes) == 4
        assert subg['follow'].number_of_edges() == 2

--- a/tests/cpp/test_rowwise.cc
+++ b/tests/cpp/test_rowwise.cc
@@ -137,7 +137,7 @@ TEST(RowwiseTest, TestCSRSampling) {
 template <typename Idx, typename FloatType>
 void _TestCSRSamplingUniform(bool has_data) {
  auto mat = CSR<Idx>(has_data);
-  FloatArray prob;
+  FloatArray prob = aten::NullArray();
  IdArray rows = NDArray::FromVector(std::vector<Idx>({0, 3}));
  for (int k = 0; k < 10; ++k) {
    auto rst = CSRRowWiseSampling(mat, rows, 2, prob, true);
@@ -229,7 +229,7 @@ TEST(RowwiseTest, TestCOOSampling) {
 template <typename Idx, typename FloatType>
 void _TestCOOSamplingUniform(bool has_data) {
  auto mat = COO<Idx>(has_data);
-  FloatArray prob;
+  FloatArray prob = aten::NullArray();
  IdArray rows = NDArray::FromVector(std::vector<Idx>({0, 3}));
  for (int k = 0; k < 10; ++k) {
    auto rst = COORowWiseSampling(mat, rows, 2, prob, true);

--- a/tests/cpp/test_spmat.cc
+++ b/tests/cpp/test_spmat.cc
@@ -14,14 +14,12 @@ aten::CSRMatrix CSR1() {
  //  [0, 0, 1, 1, 0],
  //  [0, 0, 0, 0, 0]]
  // data: [0, 2, 3, 1, 4]
-  aten::CSRMatrix csr;
+  return aten::CSRMatrix(
-  csr.num_rows = 4;
+      4, 5,
-  csr.num_cols = 5;
+      aten::VecToIdArray(std::vector<IDX>({0, 2, 3, 5, 5}), sizeof(IDX)*8, CTX),
-  csr.indptr = aten::VecToIdArray(std::vector<IDX>({0, 2, 3, 5, 5}), sizeof(IDX)*8, CTX);
+      aten::VecToIdArray(std::vector<IDX>({1, 2, 0, 2, 3}), sizeof(IDX)*8, CTX),
-  csr.indices = aten::VecToIdArray(std::vector<IDX>({1, 2, 0, 2, 3}), sizeof(IDX)*8, CTX);
+      aten::VecToIdArray(std::vector<IDX>({0, 2, 3, 1, 4}), sizeof(IDX)*8, CTX),
-  csr.data = aten::VecToIdArray(std::vector<IDX>({0, 2, 3, 1, 4}), sizeof(IDX)*8, CTX);
+      false);
-  csr.sorted = false;
-  return csr;
 }
 template <typename IDX>
@@ -32,14 +30,12 @@ aten::CSRMatrix CSR2() {
  //  [0, 0, 1, 1, 0],
  //  [0, 0, 0, 0, 0]]
  // data: [0, 2, 5, 3, 1, 4]
-  aten::CSRMatrix csr;
+  return aten::CSRMatrix(
-  csr.num_rows = 4;
+      4, 5,
-  csr.num_cols = 5;
+      aten::VecToIdArray(std::vector<IDX>({0, 3, 4, 6, 6}), sizeof(IDX)*8, CTX),
-  csr.indptr = aten::VecToIdArray(std::vector<IDX>({0, 3, 4, 6, 6}), sizeof(IDX)*8, CTX);
+      aten::VecToIdArray(std::vector<IDX>({1, 2, 2, 0, 2, 3}), sizeof(IDX)*8, CTX),
-  csr.indices = aten::VecToIdArray(std::vector<IDX>({1, 2, 2, 0, 2, 3}), sizeof(IDX)*8, CTX);
+      aten::VecToIdArray(std::vector<IDX>({0, 2, 5, 3, 1, 4}), sizeof(IDX)*8, CTX),
-  csr.data = aten::VecToIdArray(std::vector<IDX>({0, 2, 5, 3, 1, 4}), sizeof(IDX)*8, CTX);
+      false);
-  csr.sorted = false;
-  return csr;
 }
 template <typename IDX>
@@ -51,12 +47,10 @@ aten::COOMatrix COO1() {
  // data: [0, 2, 3, 1, 4]
  // row : [0, 2, 0, 1, 2]
  // col : [1, 2, 2, 0, 3]
-  aten::COOMatrix coo;
+  return aten::COOMatrix(
-  coo.num_rows = 4;
+      4, 5,
-  coo.num_cols = 5;
+      aten::VecToIdArray(std::vector<IDX>({0, 2, 0, 1, 2}), sizeof(IDX)*8, CTX),
-  coo.row = aten::VecToIdArray(std::vector<IDX>({0, 2, 0, 1, 2}), sizeof(IDX)*8, CTX);
+      aten::VecToIdArray(std::vector<IDX>({1, 2, 2, 0, 3}), sizeof(IDX)*8, CTX));
-  coo.col = aten::VecToIdArray(std::vector<IDX>({1, 2, 2, 0, 3}), sizeof(IDX)*8, CTX);
-  return coo;
 }
 template <typename IDX>
@@ -69,12 +63,10 @@ aten::COOMatrix COO2() {
  // data: [0, 2, 5, 3, 1, 4]
  // row : [0, 2, 0, 1, 2, 0]
  // col : [1, 2, 2, 0, 3, 2]
-  aten::COOMatrix coo;
+  return aten::COOMatrix(
-  coo.num_rows = 4;
+      4, 5,
-  coo.num_cols = 5;
+      aten::VecToIdArray(std::vector<IDX>({0, 2, 0, 1, 2, 0}), sizeof(IDX)*8, CTX),
-  coo.row = aten::VecToIdArray(std::vector<IDX>({0, 2, 0, 1, 2, 0}), sizeof(IDX)*8, CTX);
+      aten::VecToIdArray(std::vector<IDX>({1, 2, 2, 0, 3, 2}), sizeof(IDX)*8, CTX));
-  coo.col = aten::VecToIdArray(std::vector<IDX>({1, 2, 2, 0, 3, 2}), sizeof(IDX)*8, CTX);
-  return coo;
 }
 }