[GraphBolt][CUDA] Pipelined sampling accuracy fix (#7088)

a2e1c796 · Muhammed Fatih BALIN · GitHub · 4ee0a8bd · a2e1c796 · a2e1c796
Unverified commit a2e1c796, authored Feb 05, 2024 by Muhammed Fatih BALIN; committed by GitHub on Feb 05, 2024.
2 changed files
--- a/python/dgl/graphbolt/impl/neighbor_sampler.py
+++ b/python/dgl/graphbolt/impl/neighbor_sampler.py
@@ -48,22 +48,33 @@ class FetchInsubgraphData(Mapper):
        with torch.cuda.stream(self.stream):
            index = minibatch._seed_nodes
            if isinstance(index, dict):
+                for idx in index.values():
+                    idx.record_stream(torch.cuda.current_stream())
                index = self.graph._convert_to_homogeneous_nodes(index)
+            else:
+                index.record_stream(torch.cuda.current_stream())
+            def record_stream(tensor):
+                if stream is not None and tensor.is_cuda:
+                    tensor.record_stream(stream)
+                return tensor
+            if self.graph.node_type_offset is None:
+                # sorting not needed.
+                minibatch._subgraph_seed_nodes = None
+            else:
                index, original_positions = index.sort()
-            if (original_positions.diff() == 1).all().item():  # is_sorted
+                if (original_positions.diff() == 1).all().item():
+                    # already sorted.
                    minibatch._subgraph_seed_nodes = None
                else:
-                minibatch._subgraph_seed_nodes = original_positions
+                    minibatch._subgraph_seed_nodes = record_stream(
-            index.record_stream(torch.cuda.current_stream())
+                        original_positions.sort()[1]
+                    )
            index_select_csc_with_indptr = partial(
                torch.ops.graphbolt.index_select_csc, self.graph.csc_indptr
            )
-            def record_stream(tensor):
-                if stream is not None and tensor.is_cuda:
-                    tensor.record_stream(stream)
            indptr, indices = index_select_csc_with_indptr(
                self.graph.indices, index, None
            )

--- a/tests/python/pytorch/graphbolt/impl/test_neighbor_sampler.py
+++ b/tests/python/pytorch/graphbolt/impl/test_neighbor_sampler.py
@@ -41,8 +41,12 @@ def get_hetero_graph():
 @unittest.skipIf(F._default_context_str != "gpu", reason="Enabled only on GPU.")
 @pytest.mark.parametrize("hetero", [False, True])
 @pytest.mark.parametrize("prob_name", [None, "weight", "mask"])
-def test_NeighborSampler_GraphFetch(hetero, prob_name):
+@pytest.mark.parametrize("sorted", [False, True])
+def test_NeighborSampler_GraphFetch(hetero, prob_name, sorted):
+    if sorted:
        items = torch.arange(3)
+    else:
+        items = torch.tensor([2, 0, 1])
    names = "seed_nodes"
    itemset = gb.ItemSet(items, names=names)
    graph = get_hetero_graph().to(F.ctx())