we need to sync for mxnet. (#648)

16ec2a8b · Da Zheng · GitHub · d7062980 · 16ec2a8b · 16ec2a8b
Unverified Commit 16ec2a8b authored Jun 11, 2019 by Da Zheng Committed by GitHub Jun 11, 2019
4 changed files
--- a/python/dgl/backend/backend.py
+++ b/python/dgl/backend/backend.py
@@ -937,3 +937,12 @@ def copy_reduce(reducer, graph, target, in_data, out_size, in_map, out_map):
 # ----------------
 # These are not related to tensors. Some of them are temporary workarounds that
 # should be included in DGL in the future.
+
+def sync():
+    """Synchronize computation.
+
+    In DL frameworks such as MXNet and TensorFlow, the computation in operators
+    is done asynchronously. This function synchronizes computation and makes
+    sure that all computation is complete after this function call.
+    """
+    pass
--- a/python/dgl/backend/mxnet/tensor.py
+++ b/python/dgl/backend/mxnet/tensor.py
@@ -393,3 +393,12 @@ def _reduce_grad(grad, shape):
    reduce_idx += 1  # skip batch dim
    grad = grad.sum(axis=tuple(reduce_idx), keepdims=True)
    return grad.reshape(shape)
+
+def sync():
+    """Synchronize computation.
+
+    In DL frameworks such as MXNet and TensorFlow, the computation in operators
+    is done asynchronously. This function synchronizes computation and makes
+    sure that all computation is complete after this function call.
+    """
+    mx.nd.waitall()
--- a/python/dgl/backend/pytorch/tensor.py
+++ b/python/dgl/backend/pytorch/tensor.py
@@ -308,3 +308,7 @@ def _reduce_grad(grad, shape):
    reduce_idx += 1  # skip batch dim
    grad = grad.sum(dim=tuple(reduce_idx), keepdim=True)
    return grad.view(shape)
+
+def sync():
+    # PyTorch performs computation synchronously, so no synchronization is needed.
+    pass
--- a/python/dgl/contrib/graph_store.py
+++ b/python/dgl/contrib/graph_store.py
@@ -363,6 +363,7 @@ class SharedMemoryStoreServer(object):
            init = self._init_manager.deserialize(init)
            data = init(shape, dtype, _get_ndata_path(graph_name, ndata_name))
            self._graph.ndata[ndata_name] = data
+            F.sync()
            return 0

        # RPC command: initialize edge embedding in the server.
@@ -375,6 +376,7 @@ class SharedMemoryStoreServer(object):
            assert self._graph.number_of_edges() == shape[0]
            init = self._init_manager.deserialize(init)
            data = init(shape, dtype, _get_edata_path(graph_name, edata_name))
+            F.sync()
            self._graph.edata[edata_name] = data
            return 0

@@ -636,6 +638,10 @@ class SharedMemoryDGLGraph(BaseGraphStore):
        timeout: int
            time out in seconds.
        """
+        # Before entering the barrier, we need to make sure all computation in the local
+        # process has completed.
+        F.sync()
+
        # Here I manually implement multi-processing barrier with RPC.
        # It uses busy wait with RPC. Whenever, all_enter is called, there is
        # a context switch, so it doesn't burn CPUs so badly.