Unverified commit d1cf5c38, authored by Da Zheng and committed by GitHub

[Distributed] adjust various APIs. (#1993)

* rename get_data_size.

* remove g from DistTensor.

* remove g from DistEmbedding.

* clean up API of graph partition book.

* fix DistGraph

* fix lint.

* collect all part policies.

* fix.

* fix.

* support distributed sampler.

* remove partition.py
parent a6b44e72
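
Taken together, the user-visible change is that DistTensor, DistEmbedding, and DistGraph no longer take the graph (or an IP config path) as their first argument; they find the kvstore and partition policies through the context set up by dgl.distributed.initialize(). A minimal before/after sketch, where 'mygraph' and the config paths are placeholders:

    import dgl
    import torch as th

    # Old API (before this commit): every object needed g, and DistGraph an ip_config path.
    #   g = dgl.distributed.DistGraph('ip_config.txt', 'mygraph', part_config='data/mygraph.json')
    #   y = dgl.distributed.DistTensor(g, (g.number_of_nodes(), 16), th.float32, 'h')
    #   emb = dgl.distributed.DistEmbedding(g, g.number_of_nodes(), 16)

    # New API: initialize() wires up the kvstore once, after which the
    # constructors need only their own arguments.
    dgl.distributed.initialize('ip_config.txt', 1)
    g = dgl.distributed.DistGraph('mygraph', part_config='data/mygraph.json')
    y = dgl.distributed.DistTensor((g.number_of_nodes(), 16), th.float32, 'h')
    emb = dgl.distributed.DistEmbedding(g.number_of_nodes(), 16)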
@@ -97,11 +97,11 @@ class DistSAGE(nn.Module):
         # TODO: can we standardize this?
         nodes = dgl.distributed.node_split(np.arange(g.number_of_nodes()),
                                            g.get_partition_book(), force_even=True)
-        y = dgl.distributed.DistTensor(g, (g.number_of_nodes(), self.n_hidden), th.float32, 'h',
+        y = dgl.distributed.DistTensor((g.number_of_nodes(), self.n_hidden), th.float32, 'h',
                                        persistent=True)
         for l, layer in enumerate(self.layers):
             if l == len(self.layers) - 1:
-                y = dgl.distributed.DistTensor(g, (g.number_of_nodes(), self.n_classes),
+                y = dgl.distributed.DistTensor((g.number_of_nodes(), self.n_classes),
                                                th.float32, 'h_last', persistent=True)
             sampler = NeighborSampler(g, [-1], dgl.distributed.sample_neighbors, device)
@@ -263,7 +263,7 @@ def main(args):
     dgl.distributed.initialize(args.ip_config, args.num_servers, num_workers=args.num_workers)
     if not args.standalone:
         th.distributed.init_process_group(backend='gloo')
-    g = dgl.distributed.DistGraph(args.ip_config, args.graph_name, part_config=args.part_config)
+    g = dgl.distributed.DistGraph(args.graph_name, part_config=args.part_config)
     print('rank:', g.rank())
     pb = g.get_partition_book()
@@ -195,11 +195,11 @@ class DistSAGE(SAGE):
         # TODO: can we standardize this?
         nodes = dgl.distributed.node_split(np.arange(g.number_of_nodes()),
                                            g.get_partition_book(), force_even=True)
-        y = dgl.distributed.DistTensor(g, (g.number_of_nodes(), self.n_hidden), th.float32, 'h',
+        y = dgl.distributed.DistTensor((g.number_of_nodes(), self.n_hidden), th.float32, 'h',
                                        persistent=True)
         for l, layer in enumerate(self.layers):
             if l == len(self.layers) - 1:
-                y = dgl.distributed.DistTensor(g, (g.number_of_nodes(), self.n_classes),
+                y = dgl.distributed.DistTensor((g.number_of_nodes(), self.n_classes),
                                                th.float32, 'h_last', persistent=True)
             sampler = PosNeighborSampler(g, [-1], dgl.distributed.sample_neighbors)
@@ -421,7 +421,7 @@ def main(args):
     dgl.distributed.initialize(args.ip_config, args.num_servers, num_workers=args.num_workers)
     if not args.standalone:
         th.distributed.init_process_group(backend='gloo')
-    g = dgl.distributed.DistGraph(args.ip_config, args.graph_name, part_config=args.part_config)
+    g = dgl.distributed.DistGraph(args.graph_name, part_config=args.part_config)
     print('rank:', g.rank())
     print('number of edges', g.number_of_edges())
"""Data loading components for neighbor sampling""" """Data loading components for neighbor sampling"""
from .dataloader import BlockSampler from .dataloader import BlockSampler
from .. import sampling, subgraph from .. import sampling, subgraph, distributed
class MultiLayerNeighborSampler(BlockSampler): class MultiLayerNeighborSampler(BlockSampler):
"""Sampler that builds computational dependency of node representations via """Sampler that builds computational dependency of node representations via
...@@ -59,6 +59,12 @@ class MultiLayerNeighborSampler(BlockSampler): ...@@ -59,6 +59,12 @@ class MultiLayerNeighborSampler(BlockSampler):
def sample_frontier(self, block_id, g, seed_nodes): def sample_frontier(self, block_id, g, seed_nodes):
fanout = self.fanouts[block_id] fanout = self.fanouts[block_id]
if isinstance(g, distributed.DistGraph):
if fanout is None:
frontier = distributed.in_subgraph(g, seed_nodes)
else:
frontier = distributed.sample_neighbors(g, seed_nodes, fanout, replace=self.replace)
else:
if fanout is None: if fanout is None:
frontier = subgraph.in_subgraph(g, seed_nodes) frontier = subgraph.in_subgraph(g, seed_nodes)
else: else:
......
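
With the branch added above, one sampler definition now serves both local and distributed graphs: sample_frontier routes a DistGraph to dgl.distributed.sample_neighbors (or dgl.distributed.in_subgraph when the fanout is None) and everything else to the local sampling ops. A small sketch of the dispatch, assuming g may be either graph type:

    import dgl

    # Two-layer sampler with fanouts 5 and 10.
    sampler = dgl.dataloading.MultiLayerNeighborSampler([5, 10])
    seeds = [0, 1, 2]
    # On a DGLGraph this calls dgl.sampling.sample_neighbors; on a DistGraph it
    # now calls dgl.distributed.sample_neighbors, with no change to caller code.
    frontier = sampler.sample_frontier(0, g, seeds)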
@@ -9,7 +9,6 @@ from .. import heterograph_index
 from .. import backend as F
 from ..base import NID, EID
 from .kvstore import KVServer, get_kvstore
-from .standalone_kvstore import KVClient as SA_KVClient
 from .._ffi.ndarray import empty_shared_mem
 from ..frame import infer_scheme
 from .partition import load_partition, load_partition_book
@@ -142,7 +141,7 @@ class NodeDataView(MutableMapping):
         name1 = _get_data_name(name, policy.policy_str)
         dtype, shape, _ = g._client.get_data_meta(name1)
         # We create a wrapper on the existing tensor in the kvstore.
-        self._data[name] = DistTensor(g, shape, dtype, name, part_policy=policy)
+        self._data[name] = DistTensor(shape, dtype, name, part_policy=policy)

     def _get_names(self):
         return list(self._data.keys())
@@ -188,7 +187,7 @@ class EdgeDataView(MutableMapping):
         name1 = _get_data_name(name, policy.policy_str)
         dtype, shape, _ = g._client.get_data_meta(name1)
         # We create a wrapper on the existing tensor in the kvstore.
-        self._data[name] = DistTensor(g, shape, dtype, name, part_policy=policy)
+        self._data[name] = DistTensor(shape, dtype, name, part_policy=policy)

     def _get_names(self):
         return list(self._data.keys())
@@ -321,8 +320,6 @@ class DistGraph:
     Parameters
     ----------
-    ip_config : str
-        Path of IP configuration file.
     graph_name : str
         The name of the graph. This name has to be the same as the one used in DistGraphServer.
     gpb : PartitionBook
@@ -330,14 +327,13 @@ class DistGraph:
     part_config : str
         The partition config file. It's used in the standalone mode.
     '''
-    def __init__(self, ip_config, graph_name, gpb=None, part_config=None):
-        self.ip_config = ip_config
+    def __init__(self, graph_name, gpb=None, part_config=None):
         self.graph_name = graph_name
         self._gpb_input = gpb
         if os.environ.get('DGL_DIST_MODE', 'standalone') == 'standalone':
             assert part_config is not None, \
                 'When running in the standalone model, the partition config file is required'
-            self._client = SA_KVClient()
+            self._client = get_kvstore()
             # Load graph partition data.
             g, node_feats, edge_feats, self._gpb, _ = load_partition(part_config, 0)
             assert self._gpb.num_partitions() == 1, \
@@ -349,6 +345,7 @@ class DistGraph:
                 self._client.add_data(_get_data_name(name, NODE_PART_POLICY), node_feats[name])
             for name in edge_feats:
                 self._client.add_data(_get_data_name(name, EDGE_PART_POLICY), edge_feats[name])
+            self._client.map_shared_data(self._gpb)
             rpc.set_num_client(1)
         else:
             self._init()
@@ -377,10 +374,10 @@ class DistGraph:
         self._client.map_shared_data(self._gpb)

     def __getstate__(self):
-        return self.ip_config, self.graph_name, self._gpb
+        return self.graph_name, self._gpb

     def __setstate__(self, state):
-        self.ip_config, self.graph_name, self._gpb_input = state
+        self.graph_name, self._gpb_input = state
         self._init()

         self._ndata = NodeDataView(self)
@@ -428,6 +425,43 @@ class DistGraph:
         """
         return self._edata

+    @property
+    def idtype(self):
+        """The dtype of graph index
+
+        Returns
+        -------
+        backend dtype object
+            th.int32/th.int64 or tf.int32/tf.int64 etc.
+
+        See Also
+        --------
+        long
+        int
+        """
+        return self._g.idtype
+
+    @property
+    def device(self):
+        """Get the device context of this graph.
+
+        Examples
+        --------
+        The following example uses PyTorch backend.
+
+        >>> g = dgl.bipartite(([0, 1, 1, 2], [0, 0, 2, 1]), 'user', 'plays', 'game')
+        >>> print(g.device)
+        device(type='cpu')
+        >>> g = g.to('cuda:0')
+        >>> print(g.device)
+        device(type='cuda', index=0)
+
+        Returns
+        -------
+        Device context object
+        """
+        return self._g.device
+
     @property
     def ntypes(self):
         """Return the list of node types of this graph.
@@ -439,7 +473,7 @@ class DistGraph:
         Examples
         --------
-        >>> g = DistGraph("ip_config.txt", "test")
+        >>> g = DistGraph("test")
         >>> g.ntypes
         ['_U']
         """
@@ -457,7 +491,7 @@ class DistGraph:
         Examples
         --------
-        >>> g = DistGraph("ip_config.txt", "test")
+        >>> g = DistGraph("test")
         >>> g.etypes
         ['_E']
         """
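
For quick reference, the standalone-mode flow from the tests below exercises the new constructor and the added properties; 'test' and the /tmp path are placeholders:

    import os
    import dgl

    os.environ['DGL_DIST_MODE'] = 'standalone'
    dgl.distributed.initialize('ip_config.txt', 1)
    # No ip_config argument anymore; the name must match the partitioned graph.
    g = dgl.distributed.DistGraph('test', part_config='/tmp/dist_graph/test.json')
    print(g.idtype)   # forwarded from the local partition, e.g. torch.int64
    print(g.device)   # e.g. device(type='cpu')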
@@ -2,9 +2,8 @@
 import os

-from .graph_partition_book import PartitionPolicy, NODE_PART_POLICY, EDGE_PART_POLICY
 from .dist_context import is_initialized
-from ..base import DGLError
+from .kvstore import get_kvstore
 from .. import utils
 from .. import backend as F
@@ -35,8 +34,6 @@ class DistTensor:
     Parameters
     ----------
-    g : DistGraph
-        The distributed graph object.
     shape : tuple
         The shape of the tensor
     dtype : dtype
@@ -50,27 +47,34 @@ class DistTensor:
     persistent : bool
         Whether the created tensor is persistent.
     '''
-    def __init__(self, g, shape, dtype, name=None, init_func=None, part_policy=None,
+    def __init__(self, shape, dtype, name=None, init_func=None, part_policy=None,
                  persistent=False):
-        self.kvstore = g._client
+        self.kvstore = get_kvstore()
         self._shape = shape
         self._dtype = dtype
+
+        part_policies = self.kvstore.all_possible_part_policy
+        # If a user doesn't provide a partition policy, we should find one based on
+        # the input shape.
         if part_policy is None:
-            assert shape[0] != g.number_of_nodes() or shape[0] != g.number_of_edges(), \
-                'Cannot determine the partition policy. Please provide it.'
-            if shape[0] == g.number_of_nodes():
-                part_policy = PartitionPolicy(NODE_PART_POLICY, g.get_partition_book())
-            elif shape[0] == g.number_of_edges():
-                part_policy = PartitionPolicy(EDGE_PART_POLICY, g.get_partition_book())
-            else:
-                raise DGLError('Cannot determine the partition policy. Please provide it.')
+            for policy_name in part_policies:
+                policy = part_policies[policy_name]
+                if policy.get_size() == shape[0]:
+                    # If multiple partition policies match the input shape, we cannot
+                    # decide which is the right one automatically. We should ask users
+                    # to provide one.
+                    assert part_policy is None, \
+                        'Multiple partition policies match the input shape. ' \
+                        + 'Please provide a partition policy explicitly.'
+                    part_policy = policy
+            assert part_policy is not None, \
+                'Cannot determine the partition policy. Please provide it.'
         self._part_policy = part_policy

         if init_func is None:
             init_func = _default_init_data
-        exist_names = g._client.data_name_list()
+        exist_names = self.kvstore.data_name_list()
         # If a user doesn't provide a name, we generate a name ourselves.
         # We need to generate the name in a deterministic way.
         if name is None:
@@ -79,11 +83,11 @@ class DistTensor:
         self._name = _get_data_name(name, part_policy.policy_str)
         self._persistent = persistent
         if self._name not in exist_names:
-            g._client.init_data(self._name, shape, dtype, part_policy, init_func)
+            self.kvstore.init_data(self._name, shape, dtype, part_policy, init_func)
             self._owner = True
         else:
             self._owner = False
-            dtype1, shape1, _ = g._client.get_data_meta(self._name)
+            dtype1, shape1, _ = self.kvstore.get_data_meta(self._name)
             assert dtype == dtype1, 'The dtype does not match with the existing tensor'
             assert shape == shape1, 'The shape does not match with the existing tensor'
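
The inference loop above makes the common cases automatic: a first dimension equal to the number of nodes selects the node policy, one equal to the number of edges selects the edge policy, and a graph whose node and edge counts coincide trips the 'Multiple partition policies match' assertion, at which point the policy must be passed explicitly. A sketch, assuming an initialized context and a DistGraph g:

    import torch as th
    import dgl

    h = dgl.distributed.DistTensor((g.number_of_nodes(), 16), th.float32, 'h')  # node policy inferred
    w = dgl.distributed.DistTensor((g.number_of_edges(), 1), th.float32, 'w')   # edge policy inferred

    # Ambiguous or non-matching shapes need an explicit policy:
    policy = dgl.distributed.PartitionPolicy('node', g.get_partition_book())
    s = dgl.distributed.DistTensor((g.number_of_nodes(),), th.float32, 's', part_policy=policy)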
@@ -306,41 +306,6 @@ class GraphPartitionBook:
                             getting remote tensor of eid2localeid.')
         return F.gather_row(self._eidg2l[partid], eids)

-    def get_partition(self, partid):
-        """Get the graph of one partition.
-
-        Parameters
-        ----------
-        partid : int
-            Partition ID.
-
-        Returns
-        -------
-        DGLGraph
-            The graph of the partition.
-        """
-        #TODO(zhengda) add implementation later.
-
-    def get_node_size(self):
-        """Get the number of nodes in the current partition.
-
-        Return
-        ------
-        int
-            The number of nodes in current partition
-        """
-        return self._node_size
-
-    def get_edge_size(self):
-        """Get the number of edges in the current partition.
-
-        Return
-        ------
-        int
-            The number of edges in current partition
-        """
-        return self._edge_size
-
     @property
     def partid(self):
         """Get the current partition id
@@ -573,45 +538,6 @@ class RangePartitionBook:
         return eids - int(start)

-    def get_partition(self, partid):
-        """Get the graph of one partition.
-
-        Parameters
-        ----------
-        partid : int
-            Partition ID.
-
-        Returns
-        -------
-        DGLGraph
-            The graph of the partition.
-        """
-        #TODO(zhengda) add implementation later.
-
-    def get_node_size(self):
-        """Get the number of nodes in the current partition.
-
-        Return
-        ------
-        int
-            The number of nodes in current partition
-        """
-        range_start = self._node_map[self._partid - 1] if self._partid > 0 else 0
-        range_end = self._node_map[self._partid]
-        return range_end - range_start
-
-    def get_edge_size(self):
-        """Get the number of edges in the current partition.
-
-        Return
-        ------
-        int
-            The number of edges in current partition
-        """
-        range_start = self._edge_map[self._partid - 1] if self._partid > 0 else 0
-        range_end = self._edge_map[self._partid]
-        return range_end - range_start
-
     @property
     def partid(self):
         """Get the current partition id
@@ -701,7 +627,7 @@ class PartitionPolicy(object):
         else:
             raise RuntimeError('Cannot support policy: %s ' % self._policy_str)

-    def get_data_size(self):
+    def get_part_size(self):
         """Get data size of current partition.

         Returns
@@ -715,3 +641,18 @@ class PartitionPolicy(object):
             return len(self._partition_book.partid2nids(self._part_id))
         else:
             raise RuntimeError('Cannot support policy: %s ' % self._policy_str)
+
+    def get_size(self):
+        """Get the full size of the data.
+
+        Returns
+        -------
+        int
+            data size
+        """
+        if self._policy_str == EDGE_PART_POLICY:
+            return self._partition_book._num_edges()
+        elif self._policy_str == NODE_PART_POLICY:
+            return self._partition_book._num_nodes()
+        else:
+            raise RuntimeError('Cannot support policy: %s ' % self._policy_str)
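
The rename separates two sizes that the old name conflated: get_part_size() (formerly get_data_size()) is the number of rows owned by the current partition, while the new get_size() is the full first-dimension size across all partitions, which is what DistTensor matches shapes against. A sketch, assuming a partition book gpb:

    from dgl.distributed import PartitionPolicy

    node_policy = PartitionPolicy('node', gpb)
    local_rows = node_policy.get_part_size()  # rows held by this partition
    total_rows = node_policy.get_size()       # total nodes across all partitions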
@@ -738,11 +738,11 @@ class KVServer(object):
         dlpack = shared_data.to_dlpack()
         self._data_store[name] = F.zerocopy_from_dlpack(dlpack)
         self._data_store[name][:] = data_tensor[:]
-        assert self._part_policy[name].get_data_size() == data_tensor.shape[0], \
+        assert self._part_policy[name].get_part_size() == data_tensor.shape[0], \
             'kvserver expect partition {} for {} has {} rows, but gets {} rows'.format(
                 self._part_policy[name].part_id,
                 policy_str,
-                self._part_policy[name].get_data_size(),
+                self._part_policy[name].get_part_size(),
                 data_tensor.shape[0])
         self._pull_handlers[name] = default_pull_handler
         self._push_handlers[name] = default_push_handler
@@ -821,6 +821,8 @@ class KVClient(object):
         self._data_store = {}
         # Store the partition information with specified data name
         self._part_policy = {}
+        # This stores all unique partition policies in the kvstore. The key is the policy name.
+        self._all_possible_part_policy = {}
         # Store the full data shape across kvserver
         self._full_data_shape = {}
         # Store all the data name
@@ -840,6 +842,11 @@ class KVClient(object):
         # register role on server-0
         self._role = role

+    @property
+    def all_possible_part_policy(self):
+        """Get all possible partition policies"""
+        return self._all_possible_part_policy
+
     @property
     def client_id(self):
         """Get client ID"""
@@ -961,7 +968,7 @@ class KVClient(object):
         # Send request to the servers to initialize data.
         # The servers may handle the duplicated initializations.
         part_shape = shape.copy()
-        part_shape[0] = part_policy.get_data_size()
+        part_shape[0] = part_policy.get_part_size()
         request = InitDataRequest(name,
                                   tuple(part_shape),
                                   F.reverse_data_type_dict[dtype],
@@ -978,7 +985,7 @@ class KVClient(object):
         self.barrier()
         # Create local shared-data
         local_shape = shape.copy()
-        local_shape[0] = part_policy.get_data_size()
+        local_shape[0] = part_policy.get_part_size()
         if name in self._part_policy:
             raise RuntimeError("Policy %s has already exists!" % name)
         if name in self._data_store:
@@ -986,6 +993,7 @@ class KVClient(object):
         if name in self._full_data_shape:
             raise RuntimeError("Data shape %s has already exists!" % name)
         self._part_policy[name] = part_policy
+        self._all_possible_part_policy[part_policy.policy_str] = part_policy
         shared_data = empty_shared_mem(name+'-kvdata-', False, \
                                        local_shape, F.reverse_data_type_dict[dtype])
         dlpack = shared_data.to_dlpack()
@@ -1062,6 +1070,7 @@ class KVClient(object):
             dlpack = shared_data.to_dlpack()
             self._data_store[name] = F.zerocopy_from_dlpack(dlpack)
             self._part_policy[name] = PartitionPolicy(policy_str, partition_book)
+            self._all_possible_part_policy[policy_str] = self._part_policy[name]
             self._pull_handlers[name] = default_pull_handler
             self._push_handlers[name] = default_push_handler
         # Get full data shape across servers
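
This registry is the hook that lets DistTensor work without g: every init_data and map_shared_data call files its policy under policy_str, and the DistTensor constructor enumerates the registry to match shapes via get_size(). A sketch of the lookup it enables, assuming an initialized client:

    from dgl.distributed.kvstore import get_kvstore

    kv = get_kvstore()
    for policy_str, policy in kv.all_possible_part_policy.items():
        print(policy_str, policy.get_size())  # e.g. 'node' <num nodes>, 'edge' <num edges>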
@@ -3,17 +3,12 @@
 from .. import backend as F
 from .. import utils
 from .dist_tensor import DistTensor
-from .graph_partition_book import PartitionPolicy, NODE_PART_POLICY

 class DistEmbedding:
     '''Embeddings in the distributed training.

-    By default, the embeddings are created for nodes in the graph.
-
     Parameters
     ----------
-    g : DistGraph
-        The distributed graph object.
     num_embeddings : int
         The number of embeddings
     embedding_dim : int
@@ -28,7 +23,7 @@ class DistEmbedding:
     Examples
     --------
     >>> emb_init = lambda shape, dtype: F.zeros(shape, dtype, F.cpu())
-    >>> emb = dgl.distributed.DistEmbedding(g, g.number_of_nodes(), 10)
+    >>> emb = dgl.distributed.DistEmbedding(g.number_of_nodes(), 10)
     >>> optimizer = dgl.distributed.SparseAdagrad([emb], lr=0.001)
     >>> for blocks in dataloader:
     >>>     feats = emb(nids)
@@ -36,12 +31,9 @@ class DistEmbedding:
     >>>     loss.backward()
     >>>     optimizer.step()
     '''
-    def __init__(self, g, num_embeddings, embedding_dim, name=None,
+    def __init__(self, num_embeddings, embedding_dim, name=None,
                  init_func=None, part_policy=None):
-        if part_policy is None:
-            part_policy = PartitionPolicy(NODE_PART_POLICY, g.get_partition_book())
-        self._tensor = DistTensor(g, (num_embeddings, embedding_dim), F.float32, name,
+        self._tensor = DistTensor((num_embeddings, embedding_dim), F.float32, name,
                                   init_func, part_policy)
         self._trace = []
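
Since DistEmbedding now just wraps a DistTensor, the node policy is inferred whenever num_embeddings equals the node count, and anything else needs an explicit part_policy. A sketch matching the docstring example above, assuming a DistGraph g:

    import dgl
    from dgl import backend as F

    emb_init = lambda shape, dtype: F.zeros(shape, dtype, F.cpu())
    emb = dgl.distributed.DistEmbedding(g.number_of_nodes(), 10, 'emb', init_func=emb_init)
    optimizer = dgl.distributed.SparseAdagrad([emb], lr=0.001)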
@@ -4,6 +4,7 @@
 This kvstore is used when running in the standalone mode
 """
 from .. import backend as F
+from .graph_partition_book import PartitionPolicy, NODE_PART_POLICY, EDGE_PART_POLICY

 class KVClient(object):
     ''' The fake KVStore client.
@@ -13,9 +14,15 @@ class KVClient(object):
     '''
     def __init__(self):
         self._data = {}
+        self._all_possible_part_policy = {}
         self._push_handlers = {}
         self._pull_handlers = {}

+    @property
+    def all_possible_part_policy(self):
+        """Get all possible partition policies"""
+        return self._all_possible_part_policy
+
     def barrier(self):
         '''barrier'''
@@ -31,9 +38,11 @@ class KVClient(object):
         '''add data to the client'''
         self._data[name] = tensor

-    def init_data(self, name, shape, dtype, _, init_func):
+    def init_data(self, name, shape, dtype, part_policy, init_func):
         '''add new data to the client'''
         self._data[name] = init_func(shape, dtype)
+        if part_policy.policy_str not in self._all_possible_part_policy:
+            self._all_possible_part_policy[part_policy.policy_str] = part_policy

     def delete_data(self, name):
         '''delete the data'''
@@ -63,3 +72,7 @@ class KVClient(object):
     def map_shared_data(self, partition_book):
         '''Mapping shared-memory tensor from server to client.'''
+        self._all_possible_part_policy[NODE_PART_POLICY] = PartitionPolicy(NODE_PART_POLICY,
+                                                                           partition_book)
+        self._all_possible_part_policy[EDGE_PART_POLICY] = PartitionPolicy(EDGE_PART_POLICY,
+                                                                           partition_book)
@@ -71,7 +71,7 @@ def run_client(graph_name, part_id, server_count, num_clients, num_nodes, num_ed
     dgl.distributed.initialize("kv_ip_config.txt", server_count)
     gpb, graph_name = load_partition_book('/tmp/dist_graph/{}.json'.format(graph_name),
                                           part_id, None)
-    g = DistGraph("kv_ip_config.txt", graph_name, gpb=gpb)
+    g = DistGraph(graph_name, gpb=gpb)
     check_dist_graph(g, num_clients, num_nodes, num_edges)

 def check_dist_graph(g, num_clients, num_nodes, num_edges):
@@ -93,34 +93,34 @@ def check_dist_graph(g, num_clients, num_nodes, num_edges):
     # Test init node data
     new_shape = (g.number_of_nodes(), 2)
-    g.ndata['test1'] = dgl.distributed.DistTensor(g, new_shape, F.int32)
+    g.ndata['test1'] = dgl.distributed.DistTensor(new_shape, F.int32)
     feats = g.ndata['test1'][nids]
     assert np.all(F.asnumpy(feats) == 0)

     # reference to a one that exists
-    test2 = dgl.distributed.DistTensor(g, new_shape, F.float32, 'test2', init_func=rand_init)
-    test3 = dgl.distributed.DistTensor(g, new_shape, F.float32, 'test2')
+    test2 = dgl.distributed.DistTensor(new_shape, F.float32, 'test2', init_func=rand_init)
+    test3 = dgl.distributed.DistTensor(new_shape, F.float32, 'test2')
     assert np.all(F.asnumpy(test2[nids]) == F.asnumpy(test3[nids]))

     # create a tensor and destroy a tensor and create it again.
-    test3 = dgl.distributed.DistTensor(g, new_shape, F.float32, 'test3', init_func=rand_init)
+    test3 = dgl.distributed.DistTensor(new_shape, F.float32, 'test3', init_func=rand_init)
     del test3
-    test3 = dgl.distributed.DistTensor(g, (g.number_of_nodes(), 3), F.float32, 'test3')
+    test3 = dgl.distributed.DistTensor((g.number_of_nodes(), 3), F.float32, 'test3')
     del test3

     # test a persistent tesnor
-    test4 = dgl.distributed.DistTensor(g, new_shape, F.float32, 'test4', init_func=rand_init,
+    test4 = dgl.distributed.DistTensor(new_shape, F.float32, 'test4', init_func=rand_init,
                                        persistent=True)
     del test4
     try:
-        test4 = dgl.distributed.DistTensor(g, (g.number_of_nodes(), 3), F.float32, 'test4')
+        test4 = dgl.distributed.DistTensor((g.number_of_nodes(), 3), F.float32, 'test4')
         raise Exception('')
     except:
         pass

     # Test sparse emb
     try:
-        emb = DistEmbedding(g, g.number_of_nodes(), 1, 'emb1', emb_init)
+        emb = DistEmbedding(g.number_of_nodes(), 1, 'emb1', emb_init)
         lr = 0.001
         optimizer = SparseAdagrad([emb], lr=lr)
         with F.record_grad():
@@ -137,13 +137,13 @@ def check_dist_graph(g, num_clients, num_nodes, num_edges):
         assert np.all(F.asnumpy(feats1) == np.zeros((len(rest), 1)))

         policy = dgl.distributed.PartitionPolicy('node', g.get_partition_book())
-        grad_sum = dgl.distributed.DistTensor(g, (g.number_of_nodes(),), F.float32,
+        grad_sum = dgl.distributed.DistTensor((g.number_of_nodes(),), F.float32,
                                               'emb1_sum', policy)
         if num_clients == 1:
             assert np.all(F.asnumpy(grad_sum[nids]) == np.ones((len(nids), 1)) * num_clients)
         assert np.all(F.asnumpy(grad_sum[rest]) == np.zeros((len(rest), 1)))

-        emb = DistEmbedding(g, g.number_of_nodes(), 1, 'emb2', emb_init)
+        emb = DistEmbedding(g.number_of_nodes(), 1, 'emb2', emb_init)
         with F.no_grad():
             feats1 = emb(nids)
         assert np.all(F.asnumpy(feats1) == 0)
@@ -251,8 +251,7 @@ def test_standalone():
     partition_graph(g, graph_name, num_parts, '/tmp/dist_graph')

     dgl.distributed.initialize("kv_ip_config.txt")
-    dist_g = DistGraph("kv_ip_config.txt", graph_name,
-                       part_config='/tmp/dist_graph/{}.json'.format(graph_name))
+    dist_g = DistGraph(graph_name, part_config='/tmp/dist_graph/{}.json'.format(graph_name))
     check_dist_graph(dist_g, 1, g.number_of_nodes(), g.number_of_edges())
     dgl.distributed.exit_client() # this is needed since there's two test here in one process
@@ -26,7 +26,7 @@ def start_sample_client(rank, tmpdir, disable_shared_mem):
     if disable_shared_mem:
         _, _, _, gpb, _ = load_partition(tmpdir / 'test_sampling.json', rank)
     dgl.distributed.initialize("rpc_ip_config.txt", 1)
-    dist_graph = DistGraph("rpc_ip_config.txt", "test_sampling", gpb=gpb)
+    dist_graph = DistGraph("test_sampling", gpb=gpb)
     sampled_graph = sample_neighbors(dist_graph, [0, 10, 99, 66, 1024, 2008], 3)
     dgl.distributed.exit_client()
     return sampled_graph
@@ -36,7 +36,7 @@ def start_find_edges_client(rank, tmpdir, disable_shared_mem, eids):
     if disable_shared_mem:
         _, _, _, gpb, _ = load_partition(tmpdir / 'test_find_edges.json', rank)
     dgl.distributed.initialize("rpc_ip_config.txt", 1)
-    dist_graph = DistGraph("rpc_ip_config.txt", "test_find_edges", gpb=gpb)
+    dist_graph = DistGraph("test_find_edges", gpb=gpb)
     u, v = find_edges(dist_graph, eids)
     dgl.distributed.exit_client()
     return u, v
@@ -176,7 +176,7 @@ def check_standalone_sampling(tmpdir):
     os.environ['DGL_DIST_MODE'] = 'standalone'
     dgl.distributed.initialize("rpc_ip_config.txt", 1)
-    dist_graph = DistGraph(None, "test_sampling", part_config=tmpdir / 'test_sampling.json')
+    dist_graph = DistGraph("test_sampling", part_config=tmpdir / 'test_sampling.json')
     sampled_graph = sample_neighbors(dist_graph, [0, 10, 99, 66, 1024, 2008], 3)
     src, dst = sampled_graph.edges()
@@ -200,7 +200,7 @@ def start_in_subgraph_client(rank, tmpdir, disable_shared_mem, nodes):
     dgl.distributed.initialize("rpc_ip_config.txt", 1)
     if disable_shared_mem:
         _, _, _, gpb, _ = load_partition(tmpdir / 'test_in_subgraph.json', rank)
-    dist_graph = DistGraph("rpc_ip_config.txt", "test_in_subgraph", gpb=gpb)
+    dist_graph = DistGraph("test_in_subgraph", gpb=gpb)
     sampled_graph = dgl.distributed.in_subgraph(dist_graph, nodes)
     dgl.distributed.exit_client()
     return sampled_graph
@@ -55,8 +55,7 @@ def start_client(rank, tmpdir, disable_shared_mem, num_workers, drop_last):
     num_nodes_to_sample = 202
     batch_size = 32
     train_nid = th.arange(num_nodes_to_sample)
-    dist_graph = DistGraph("mp_ip_config.txt", "test_mp", gpb=gpb,
-                           part_config=tmpdir / 'test_sampling.json')
+    dist_graph = DistGraph("test_mp", gpb=gpb, part_config=tmpdir / 'test_sampling.json')

     # Create sampler
     sampler = NeighborSampler(dist_graph, [5, 10],
@@ -100,8 +100,8 @@ def test_partition_policy():
     eid_partid = edge_policy.to_partid(F.tensor([0,1,2,3,4,5,6], F.int64))
     assert_array_equal(F.asnumpy(nid_partid), F.asnumpy(F.tensor([0,0,0,0,0,0], F.int64)))
     assert_array_equal(F.asnumpy(eid_partid), F.asnumpy(F.tensor([0,0,0,0,0,0,0], F.int64)))
-    assert node_policy.get_data_size() == len(node_map)
-    assert edge_policy.get_data_size() == len(edge_map)
+    assert node_policy.get_part_size() == len(node_map)
+    assert edge_policy.get_part_size() == len(edge_map)

 def start_server(server_id, num_clients, num_servers):
     # Init kvserver
deleted file: partition.py
-import numpy as np
-import argparse
-import signal
-
-import dgl
-from dgl import backend as F
-from dgl.data.utils import load_graphs, save_graphs
-from dgl.contrib.dist_graph import partition_graph
-import pickle
-
-def main():
-    parser = argparse.ArgumentParser(description='Partition a graph')
-    parser.add_argument('--data', required=True, type=str,
-                        help='The file path of the input graph in the DGL format.')
-    parser.add_argument('--graph-name', required=True, type=str,
-                        help='The graph name')
-    parser.add_argument('-k', '--num-parts', required=True, type=int,
-                        help='The number of partitions')
-    parser.add_argument('--num-hops', type=int, default=1,
-                        help='The number of hops of HALO nodes we include in a partition')
-    parser.add_argument('-m', '--method', required=True, type=str,
-                        help='The partitioning method: random, metis')
-    parser.add_argument('-o', '--output', required=True, type=str,
-                        help='The output directory of the partitioned results')
-    args = parser.parse_args()
-
-    glist, _ = load_graphs(args.data)
-    g = glist[0]
-
-    partition_graph(g, args.graph_name, args.num_parts, args.output,
-                    num_hops=args.num_hops, part_method=args.method)
-
-if __name__ == '__main__':
-    main()
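
With the script gone, partitioning goes through the library call the tests above already use (partition_graph(g, graph_name, num_parts, out_dir)). A trimmed equivalent of the deleted script, assuming partition_graph is now exposed under dgl.distributed (that import path is an assumption; the deleted file pulled it from dgl.contrib.dist_graph):

    from dgl.data.utils import load_graphs
    from dgl.distributed import partition_graph  # assumed new location

    glist, _ = load_graphs('graph.bin')  # placeholder input path
    g = glist[0]
    partition_graph(g, 'mygraph', 4, '/tmp/dist_graph',
                    num_hops=1, part_method='metis')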