Commit 632a9af8 authored by xiang song(charlie.song)'s avatar xiang song(charlie.song) Committed by Da Zheng
Browse files

[Feature] Non-Uniform Edge Sampler (#1087)

* Add weight based edge sampler

* Can run, edge weight work.
TODO: test node weight

* Fix node weight sample

* Fix y

* Update doc

* Fix syntax

* Fix

* Fix GPU test for sampler

* Fix test

* Fix

* Refactor EdgeSampler to act as class object not function that it
can record its own private states.

* clean

* Fix

* Fix

* Fix run bug on kg app

* update

* update test

* test

* Simply python API and fix some C code

* Fix

* Fix

* Fix syntax

* Fix

* Update API description
parent dd65ee21
......@@ -7,6 +7,7 @@ from numbers import Integral
import traceback
from ..._ffi.function import _init_api
from ..._ffi.object import register_object, ObjectBase
from ..._ffi.ndarray import empty
from ... import utils
from ...nodeflow import NodeFlow
......@@ -509,6 +510,16 @@ class EdgeSampler(object):
The sampler returns EdgeSubgraph, where a user can access the unique head nodes
and tail nodes directly.
This sampler supports non-uniform sampling of both positive and negative edges.
To sample positive edges non-uniformly, users need to provide an array of m
elements (m is the number of edges), i.e. edge_weight, each of which represents
the sampling probability of the corresponding edge. To sample negative edges
non-uniformly, users need to provide an array of n elements, i.e. node_weight,
and the sampler draws the nodes used to corrupt a positive edge according to
these probabilities. If neither edge_weight nor node_weight is provided, a
uniform sampler is used. If only edge_weight is provided, the sampler samples
nodes uniformly when corrupting positive edges.
When the flag `return_false_neg` is turned on, the sampler will also check
if the generated negative edges are true negative edges and will return
a vector that indicates false negative edges. The vector is stored in
......@@ -519,6 +530,11 @@ class EdgeSampler(object):
edge only if the triple (source node, destination node and relation)
matches one of the edges in the graph.
For uniform sampling, the sampler generates only num_of_edges/batch_size
samples.
For non-uniform (weighted) sampling, the sampler generates samples infinitely.
Parameters
----------
g : DGLGraph
......@@ -527,6 +543,11 @@ class EdgeSampler(object):
The batch size (i.e, the number of edges from the graph)
seed_edges : tensor, optional
A list of edges where we sample from.
edge_weight : tensor, optional
The weight of each edge, which determines the chance of that edge being sampled.
node_weight : tensor, optional
The weight of each node, which determines the chance of that node being sampled.
Used in negative sampling. If not provided, uniform node sampling is used.
shuffle : bool, optional
whether randomly shuffle the list of edges where we sample from.
num_workers : int, optional
......@@ -564,6 +585,8 @@ class EdgeSampler(object):
g,
batch_size,
seed_edges=None,
edge_weight=None,
node_weight=None,
shuffle=False,
num_workers=1,
prefetch=False,
......@@ -596,6 +619,16 @@ class EdgeSampler(object):
self._seed_edges = seed_edges
if shuffle:
self._seed_edges = F.rand_shuffle(self._seed_edges)
if edge_weight is None:
self._is_uniform = True
else:
self._is_uniform = False
self._edge_weight = F.zerocopy_to_dgl_ndarray(edge_weight[self._seed_edges])
if node_weight is None:
self._node_weight = empty((0,), 'float32')
else:
self._node_weight = F.zerocopy_to_dgl_ndarray(node_weight)
self._seed_edges = utils.toindex(self._seed_edges)
if prefetch:
......@@ -606,6 +639,30 @@ class EdgeSampler(object):
self._negative_mode = negative_mode
self._neg_sample_size = neg_sample_size
self._exclude_positive = exclude_positive
if self._is_uniform:
self._sampler = _CAPI_CreateUniformEdgeSampler(
self.g._graph,
self.seed_edges.todgltensor(),
self.batch_size, # batch size
self._num_workers, # num batches
self._negative_mode,
self._neg_sample_size,
self._exclude_positive,
self._return_false_neg,
self._relations)
else:
self._sampler = _CAPI_CreateWeightedEdgeSampler(
self.g._graph,
self._seed_edges.todgltensor(),
self._edge_weight,
self._node_weight,
self._batch_size, # batch size
self._num_workers, # num batches
self._negative_mode,
self._neg_sample_size,
self._exclude_positive,
self._return_false_neg,
self._relations)
def fetch(self, current_index):
'''
......@@ -616,24 +673,19 @@ class EdgeSampler(object):
Parameters
----------
current_index : int
How many batches the sampler has generated so far.
Deprecated; this argument is ignored.
Returns
-------
list[GraphIndex] or list[(GraphIndex, GraphIndex)]
Next "bunch" of edges to be processed.
'''
subgs = _CAPI_UniformEdgeSampling(
self.g._graph,
self.seed_edges.todgltensor(),
current_index, # start batch id
self.batch_size, # batch size
self._num_workers, # num batches
self._negative_mode,
self._neg_sample_size,
self._exclude_positive,
self._return_false_neg,
self._relations)
if self._is_uniform:
subgs = _CAPI_FetchUniformEdgeSample(
self._sampler)
else:
subgs = _CAPI_FetchWeightedEdgeSample(
self._sampler)
if len(subgs) == 0:
return []
......@@ -673,7 +725,6 @@ class EdgeSampler(object):
def batch_size(self):
return self._batch_size
def create_full_nodeflow(g, num_layers, add_self_loop=False):
"""Convert a full graph to NodeFlow to run a L-layer GNN model.
......
This diff is collapsed.
......@@ -237,6 +237,7 @@ def check_head_tail(g):
def check_negative_sampler(mode, exclude_positive, neg_size):
g = generate_rand_graph(100)
num_edges = g.number_of_edges()
etype = np.random.randint(0, 10, size=g.number_of_edges(), dtype=np.int64)
g.edata['etype'] = F.copy_to(F.tensor(etype), F.cpu())
......@@ -249,7 +250,10 @@ def check_negative_sampler(mode, exclude_positive, neg_size):
EdgeSampler = getattr(dgl.contrib.sampling, 'EdgeSampler')
# Test the homogeneous graph.
for pos_edges, neg_edges in EdgeSampler(g, 50,
total_samples = 0
batch_size = 50
max_samples = num_edges
for pos_edges, neg_edges in EdgeSampler(g, batch_size,
negative_mode=mode,
neg_sample_size=neg_size,
exclude_positive=exclude_positive,
......@@ -284,8 +288,13 @@ def check_negative_sampler(mode, exclude_positive, neg_size):
else:
assert F.array_equal(g.has_edges_between(neg_src, neg_dst), exist)
total_samples += batch_size
if (total_samples >= max_samples):
break
# Test the knowledge graph.
for _, neg_edges in EdgeSampler(g, 50,
total_samples = 0
for _, neg_edges in EdgeSampler(g, batch_size,
negative_mode=mode,
neg_sample_size=neg_size,
exclude_positive=exclude_positive,
......@@ -304,12 +313,223 @@ def check_negative_sampler(mode, exclude_positive, neg_size):
etype = g.edata['etype'][eid]
exist = neg_edges.edata['etype'][i] == etype
assert F.asnumpy(exists[i]) == F.asnumpy(exist)
total_samples += batch_size
if (total_samples >= max_samples):
break
def check_weighted_negative_sampler(mode, exclude_positive, neg_size):
    """Exercise the weighted EdgeSampler.

    Runs three correctness passes (homogeneous graph, knowledge graph with
    edge weight, knowledge graph with edge and node weight) and then checks
    that skewed weights actually skew the empirical sampling rates.

    Parameters
    ----------
    mode : str
        Negative-sampling mode passed to EdgeSampler (e.g. 'head', 'PBG-head').
    exclude_positive : bool
        Whether true edges are excluded from the negative samples.
    neg_size : int
        Number of negative samples per positive edge.
    """
    g = generate_rand_graph(100)
    num_edges = g.number_of_edges()
    num_nodes = g.number_of_nodes()
    # All-ones weights: the weighted sampler must pass the same correctness
    # checks as the uniform one.
    edge_weight = F.copy_to(F.tensor(np.full((num_edges,), 1, dtype=np.float32)), F.cpu())
    node_weight = F.copy_to(F.tensor(np.full((num_nodes,), 1, dtype=np.float32)), F.cpu())
    etype = np.random.randint(0, 10, size=num_edges, dtype=np.int64)
    g.edata['etype'] = F.copy_to(F.tensor(etype), F.cpu())

    # Map (destination node, edge id) -> source node for every positive edge,
    # so each negative edge can be traced back to the edge it corrupts.
    pos_gsrc, pos_gdst, pos_geid = g.all_edges(form='all', order='eid')
    pos_map = {}
    for k in range(len(pos_geid)):
        dst_id = int(F.asnumpy(pos_gdst[k]))
        edge_id = int(F.asnumpy(pos_geid[k]))
        pos_map[(dst_id, edge_id)] = int(F.asnumpy(pos_gsrc[k]))

    EdgeSampler = getattr(dgl.contrib.sampling, 'EdgeSampler')

    # ---- Correctness check: homogeneous graph ----
    batch_size = 50
    seen = 0
    max_samples = num_edges
    for pos_edges, neg_edges in EdgeSampler(g, batch_size,
                                            edge_weight=edge_weight,
                                            negative_mode=mode,
                                            neg_sample_size=neg_size,
                                            exclude_positive=exclude_positive,
                                            return_false_neg=True):
        pos_lsrc, pos_ldst, pos_leid = pos_edges.all_edges(form='all', order='eid')
        # Every sampled positive edge must exist in the parent graph.
        assert_array_equal(F.asnumpy(pos_edges.parent_eid[pos_leid]),
                           F.asnumpy(g.edge_ids(pos_edges.parent_nid[pos_lsrc],
                                                pos_edges.parent_nid[pos_ldst])))

        neg_lsrc, neg_ldst, neg_leid = neg_edges.all_edges(form='all', order='eid')
        neg_src = neg_edges.parent_nid[neg_lsrc]
        neg_dst = neg_edges.parent_nid[neg_ldst]
        neg_eid = neg_edges.parent_eid[neg_leid]
        for k in range(len(neg_eid)):
            neg_d = int(F.asnumpy(neg_dst[k]))
            neg_e = int(F.asnumpy(neg_eid[k]))
            # Each negative edge corresponds to some positive edge ...
            assert (neg_d, neg_e) in pos_map
            if exclude_positive:
                # ... but must not reproduce its true source node.
                assert int(F.asnumpy(neg_src[k])) != pos_map[(neg_d, neg_e)]

        check_head_tail(neg_edges)
        pos_tails = np.sort(F.asnumpy(pos_edges.parent_nid[pos_edges.tail_nid]))
        neg_tails = np.sort(F.asnumpy(neg_edges.parent_nid[neg_edges.tail_nid]))
        np.testing.assert_equal(pos_tails, neg_tails)

        exist = neg_edges.edata['false_neg']
        if exclude_positive:
            # No false negative can remain when positives are excluded.
            assert np.sum(F.asnumpy(exist) == 0) == len(exist)
        else:
            assert F.array_equal(g.has_edges_between(neg_src, neg_dst), exist)
        seen += batch_size
        if seen >= max_samples:
            break

    # ---- Correctness check: knowledge graph with edge weight provided ----
    seen = 0
    for pos_edges, neg_edges in EdgeSampler(g, batch_size,
                                            edge_weight=edge_weight,
                                            negative_mode=mode,
                                            neg_sample_size=neg_size,
                                            exclude_positive=exclude_positive,
                                            relations=g.edata['etype'],
                                            return_false_neg=True):
        neg_lsrc, neg_ldst, neg_leid = neg_edges.all_edges(form='all', order='eid')
        neg_src = neg_edges.parent_nid[neg_lsrc]
        neg_dst = neg_edges.parent_nid[neg_ldst]
        neg_eid = neg_edges.parent_eid[neg_leid]
        exists = neg_edges.edata['false_neg']
        neg_edges.edata['etype'] = g.edata['etype'][neg_eid]
        for k in range(len(neg_eid)):
            u, v = F.asnumpy(neg_src[k]), F.asnumpy(neg_dst[k])
            if g.has_edge_between(u, v):
                # With relations, an edge is a false negative only when the
                # relation type matches as well.
                eid = g.edge_id(u, v)
                etype = g.edata['etype'][eid]
                exist = neg_edges.edata['etype'][k] == etype
                assert F.asnumpy(exists[k]) == F.asnumpy(exist)
        seen += batch_size
        if seen >= max_samples:
            break

    # ---- Correctness check: knowledge graph with edge/node weight provided ----
    seen = 0
    for pos_edges, neg_edges in EdgeSampler(g, batch_size,
                                            edge_weight=edge_weight,
                                            node_weight=node_weight,
                                            negative_mode=mode,
                                            neg_sample_size=neg_size,
                                            exclude_positive=exclude_positive,
                                            relations=g.edata['etype'],
                                            return_false_neg=True):
        neg_lsrc, neg_ldst, neg_leid = neg_edges.all_edges(form='all', order='eid')
        neg_src = neg_edges.parent_nid[neg_lsrc]
        neg_dst = neg_edges.parent_nid[neg_ldst]
        neg_eid = neg_edges.parent_eid[neg_leid]
        exists = neg_edges.edata['false_neg']
        neg_edges.edata['etype'] = g.edata['etype'][neg_eid]
        for k in range(len(neg_eid)):
            u, v = F.asnumpy(neg_src[k]), F.asnumpy(neg_dst[k])
            if g.has_edge_between(u, v):
                eid = g.edge_id(u, v)
                etype = g.edata['etype'][eid]
                exist = neg_edges.edata['etype'][k] == etype
                assert F.asnumpy(exists[k]) == F.asnumpy(exist)
        seen += batch_size
        if seen >= max_samples:
            break

    # ---- Sampling-rate check ----
    dgl.random.seed(0)
    g = generate_rand_graph(1000)
    num_edges = g.number_of_edges()
    num_nodes = g.number_of_nodes()
    # Edge 0 carries as much weight as all other edges combined (~0.5 of the
    # mass); the last node carries num_nodes/200 of the total node weight.
    edge_weight = F.copy_to(F.tensor(np.full((num_edges,), 1, dtype=np.float32)), F.cpu())
    edge_weight[0] = F.sum(edge_weight, dim=0)
    node_weight = F.copy_to(F.tensor(np.full((num_nodes,), 1, dtype=np.float32)), F.cpu())
    node_weight[-1] = F.sum(node_weight, dim=0) / 200
    etype = np.random.randint(0, 20, size=num_edges, dtype=np.int64)
    g.edata['etype'] = F.copy_to(F.tensor(etype), F.cpu())

    # Test w/o node weight: knowledge graph with edge weight provided only.
    max_samples = num_edges / 5
    seen = 0
    edge_sampled = np.full((num_edges,), 0, dtype=np.int32)
    node_sampled = np.full((num_nodes,), 0, dtype=np.int32)
    for pos_edges, neg_edges in EdgeSampler(g, batch_size,
                                            edge_weight=edge_weight,
                                            negative_mode=mode,
                                            neg_sample_size=neg_size,
                                            exclude_positive=False,
                                            relations=g.edata['etype'],
                                            return_false_neg=True):
        _, _, pos_leid = pos_edges.all_edges(form='all', order='eid')
        neg_lsrc, neg_ldst, _ = neg_edges.all_edges(form='all', order='eid')
        # Only the corrupted end of each triple counts as a sampled node.
        if 'head' in mode:
            np.add.at(node_sampled, F.asnumpy(neg_edges.parent_nid[neg_lsrc]), 1)
        else:
            np.add.at(node_sampled, F.asnumpy(neg_edges.parent_nid[neg_ldst]), 1)
        np.add.at(edge_sampled, F.asnumpy(pos_edges.parent_eid[pos_leid]), 1)
        seen += batch_size
        if seen >= max_samples:
            break

    # Edge 0 should receive ~half of all samples; the tail half of the edge
    # ids should receive ~a quarter in total. Nodes are drawn uniformly here.
    edge_total = edge_sampled.sum()
    assert np.allclose(edge_sampled[0] / edge_total, 0.5, atol=0.05)
    assert np.allclose(edge_sampled[edge_sampled.shape[0] // 2:-1].sum() / edge_total,
                       0.25, atol=0.05)
    node_total = node_sampled.sum()
    assert node_sampled[0] / node_total < 0.02
    assert np.allclose(node_sampled[node_sampled.shape[0] // 2:-1].sum() / node_total,
                       0.5, atol=0.02)

    # Test the knowledge graph with edge/node weight provided.
    seen = 0
    edge_sampled = np.full((num_edges,), 0, dtype=np.int32)
    node_sampled = np.full((num_nodes,), 0, dtype=np.int32)
    for pos_edges, neg_edges in EdgeSampler(g, batch_size,
                                            edge_weight=edge_weight,
                                            node_weight=node_weight,
                                            negative_mode=mode,
                                            neg_sample_size=neg_size,
                                            exclude_positive=False,
                                            relations=g.edata['etype'],
                                            return_false_neg=True):
        _, _, pos_leid = pos_edges.all_edges(form='all', order='eid')
        neg_lsrc, neg_ldst, _ = neg_edges.all_edges(form='all', order='eid')
        if 'head' in mode:
            np.add.at(node_sampled, F.asnumpy(neg_edges.parent_nid[neg_lsrc]), 1)
        else:
            np.add.at(node_sampled, F.asnumpy(neg_edges.parent_nid[neg_ldst]), 1)
        np.add.at(edge_sampled, F.asnumpy(pos_edges.parent_eid[pos_leid]), 1)
        seen += batch_size
        if seen >= max_samples:
            break

    edge_total = edge_sampled.sum()
    assert np.allclose(edge_sampled[0] / edge_total, 0.5, atol=0.05)
    assert np.allclose(edge_sampled[edge_sampled.shape[0] // 2:-1].sum() / edge_total,
                       0.25, atol=0.05)
    node_total = node_sampled.sum()
    node_rate = node_sampled[-1] / node_total
    node_rate_a = np.average(node_sampled[:50]) / node_total
    node_rate_b = np.average(node_sampled[50:100]) / node_total
    # As neg sampling does not contain duplicate nodes,
    # this test takes some acceptable variation on the sample rate.
    assert np.allclose(node_rate, node_rate_a * 5, atol=0.002)
    assert np.allclose(node_rate_a, node_rate_b, atol=0.0002)
@unittest.skipIf(dgl.backend.backend_name == "tensorflow", reason="Core dump")
def test_negative_sampler():
    """Run the uniform and weighted negative-sampler checks."""
    # Uniform sampler first, then the weighted variant, over the same modes.
    for checker in (check_negative_sampler, check_weighted_negative_sampler):
        checker('PBG-head', False, 10)
        checker('head', True, 10)
        checker('head', False, 10)
    # Disabled for now: it might take too long.
    # check_negative_sampler('head', False, 100)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment