Unverified Commit 16ec2a8b authored by Da Zheng, committed by GitHub

We need to sync for MXNet. (#648)

parent d7062980
@@ -937,3 +937,12 @@ def copy_reduce(reducer, graph, target, in_data, out_size, in_map, out_map):
# ----------------
# These are not related to tensors. Some of them are temporary workarounds that
# should be included in DGL in the future.
def sync():
    """Synchronize computation.

    In DL frameworks such as MXNet and TensorFlow, the computation in operators
    is done asynchronously. This function synchronizes computation and makes sure
    that all computation is complete after this function call.
    """
    pass
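For context, here is a minimal, self-contained sketch (not DGL's actual backend loader) of how a backend-neutral sync() like the one above can be dispatched: the frontend calls one name and each framework supplies its own implementation or a no-op. The load_sync() helper and its backend_name parameter are hypothetical.

import importlib

def load_sync(backend_name):
    """Return a sync() callable for the requested backend, falling back to a no-op."""
    if backend_name == "mxnet":
        mx = importlib.import_module("mxnet")
        # mx.nd.waitall blocks until every operator queued on the async engine has run.
        return mx.nd.waitall
    # Eager, synchronous backends (e.g. PyTorch on CPU) need no extra barrier here.
    return lambda: None

sync = load_sync("mxnet")  # hypothetical choice; pick the framework actually in use
sync()                     # after this call it is safe to read results or enter a barrier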
@@ -393,3 +393,12 @@ def _reduce_grad(grad, shape):
reduce_idx += 1 # skip batch dim
grad = grad.sum(axis=tuple(reduce_idx), keepdims=True)
return grad.reshape(shape)
def sync():
    """Synchronize computation.

    In DL frameworks such as MXNet and TensorFlow, the computation in operators
    is done asynchronously. This function synchronizes computation and makes sure
    that all computation is complete after this function call.
    """
    mx.nd.waitall()
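A small illustration of why the MXNet backend needs mx.nd.waitall(): operators are queued on MXNet's asynchronous engine and the Python call returns almost immediately, so without an explicit wait another process (or a timer) can observe work that has not actually finished. The shapes below are arbitrary illustration values.

import time
import mxnet as mx

a = mx.nd.random.uniform(shape=(2000, 2000))
start = time.time()
b = mx.nd.dot(a, a)      # enqueued on the async engine; returns almost instantly
enqueue_time = time.time() - start
mx.nd.waitall()          # block until the matrix multiply has really completed
finish_time = time.time() - start
print("enqueue: %.4fs, after waitall: %.4fs" % (enqueue_time, finish_time))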
@@ -308,3 +308,7 @@ def _reduce_grad(grad, shape):
reduce_idx += 1 # skip batch dim
grad = grad.sum(dim=tuple(reduce_idx), keepdim=True)
return grad.view(shape)
def sync():
    # PyTorch performs computation synchronously, so there is no need for synchronization.
    pass
@@ -363,6 +363,7 @@ class SharedMemoryStoreServer(object):
init = self._init_manager.deserialize(init)
data = init(shape, dtype, _get_ndata_path(graph_name, ndata_name))
self._graph.ndata[ndata_name] = data
F.sync()
return 0
# RPC command: initialize edge embedding in the server.
@@ -375,6 +376,7 @@ class SharedMemoryStoreServer(object):
assert self._graph.number_of_edges() == shape[0]
init = self._init_manager.deserialize(init)
data = init(shape, dtype, _get_edata_path(graph_name, edata_name))
F.sync()
self._graph.edata[edata_name] = data
return 0
@@ -636,6 +638,10 @@ class SharedMemoryDGLGraph(BaseGraphStore):
timeout: int
time out in seconds.
"""
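The two F.sync() calls added above guard against a race: init() may only have queued the writes that fill the shared-memory tensor, so the RPC reply (return 0) could reach a client before the data is actually there. Syncing first makes the reply mean "the data is ready". The handler below is a simplified, hypothetical stand-in for the server-side RPC handlers, not DGL's actual code.

def init_data_handler(store, name, shape, dtype, init, sync):
    # init() may run asynchronously inside the framework (e.g. on MXNet's engine).
    data = init(shape, dtype)
    # Wait until the shared-memory buffer is fully written before acknowledging.
    sync()
    store[name] = data
    return 0   # clients receiving this reply may now safely attach and read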
# Before entering the barrier, we need to make sure all computation in the local
# process has completed.
F.sync()
# Here I manually implement a multi-processing barrier with RPC.
# It uses busy wait with RPC. Whenever all_enter is called, there is
# a context switch, so it doesn't burn CPUs so badly.
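A simplified, hypothetical sketch of the busy-wait barrier pattern the comment above describes; rpc_all_entered stands in for the store's actual RPC call and this is not the real implementation.

import time

def barrier(rpc_all_entered, timeout):
    """Block until every worker has entered the barrier or `timeout` seconds pass."""
    start = time.time()
    while not rpc_all_entered():           # each poll goes through the RPC layer,
        if time.time() - start > timeout:  # which yields the CPU (a context switch),
            raise TimeoutError("barrier timed out after %d seconds" % timeout)
        time.sleep(0)                      # so the busy wait does not pin a core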
...