"git@developer.sourcefind.cn:OpenDAS/dgl.git" did not exist on "548c85fff6b0a5b96f6064c86397e15477283f95"
Unverified Commit 0b3a447b authored by Hongzhi (Steve), Chen, committed by GitHub

auto format distributed (#5317)


Co-authored-by: Ubuntu <ubuntu@ip-172-31-28-63.ap-northeast-1.compute.internal>
parent 74c9d27d
@@ -127,7 +127,7 @@ class CustomPool:
         # should be able to take infinite elements to avoid dead lock.
         self.queue_size = 0
         self.result_queue = ctx.Queue(self.queue_size)
-        self.results = {} # key is dataloader name, value is fetched batch.
+        self.results = {}  # key is dataloader name, value is fetched batch.
         self.task_queues = []
         self.process_list = []
         self.current_proc_id = 0
...
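Note on the hunk above: `CustomPool` creates its result queue with `queue_size = 0`. In Python's `multiprocessing`, a maxsize of zero or less means the queue is unbounded, which is what the comment relies on to avoid deadlock. A minimal standalone sketch (not DGL code):

import multiprocessing as mp

ctx = mp.get_context("spawn")
result_queue = ctx.Queue(0)  # maxsize <= 0 means "infinite" capacity

for i in range(1000):
    result_queue.put(i)  # never blocks, regardless of consumer progress
print(result_queue.get())  # -> 0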
[This file's diff is collapsed and not shown.]
@@ -2,21 +2,24 @@
 import os
 
+from .. import backend as F, utils
 from .dist_context import is_initialized
 from .kvstore import get_kvstore
 from .role import get_role
-from .. import utils
-from .. import backend as F
 from .rpc import get_group_id
 
 
 def _default_init_data(shape, dtype):
     return F.zeros(shape, dtype, F.cpu())
 
 
 # These IDs can identify the anonymous distributed tensors.
 DIST_TENSOR_ID = 0
 
 
 class DistTensor:
-    ''' Distributed tensor.
+    """Distributed tensor.
 
     ``DistTensor`` references to a distributed tensor sharded and stored in a cluster of machines.
     It has the same interface as Pytorch Tensor to access its metadata (e.g., shape and data type).
@@ -103,12 +106,23 @@ class DistTensor:
     The creation of ``DistTensor`` is a synchronized operation. When a trainer process tries to
     create a ``DistTensor`` object, the creation succeeds only when all trainer processes
     do the same.
-    '''
-    def __init__(self, shape, dtype, name=None, init_func=None, part_policy=None,
-                 persistent=False, is_gdata=True, attach=True):
+    """
+
+    def __init__(
+        self,
+        shape,
+        dtype,
+        name=None,
+        init_func=None,
+        part_policy=None,
+        persistent=False,
+        is_gdata=True,
+        attach=True,
+    ):
         self.kvstore = get_kvstore()
-        assert self.kvstore is not None, \
-            'Distributed module is not initialized. Please call dgl.distributed.initialize.'
+        assert (
+            self.kvstore is not None
+        ), "Distributed module is not initialized. Please call dgl.distributed.initialize."
         self._shape = shape
         self._dtype = dtype
         self._attach = attach
@@ -124,18 +138,21 @@ class DistTensor:
             # If multiple partition policies match the input shape, we cannot
             # decide which is the right one automatically. We should ask users
             # to provide one.
-            assert part_policy is None, \
-                'Multiple partition policies match the input shape. ' \
-                + 'Please provide a partition policy explicitly.'
+            assert part_policy is None, (
+                "Multiple partition policies match the input shape. "
+                + "Please provide a partition policy explicitly."
+            )
             part_policy = policy
-        assert part_policy is not None, \
-            'Cannot find a right partition policy. It is either because ' \
-            + 'its first dimension does not match the number of nodes or edges ' \
-            + 'of a distributed graph or there does not exist a distributed graph.'
+        assert part_policy is not None, (
+            "Cannot find a right partition policy. It is either because "
+            + "its first dimension does not match the number of nodes or edges "
+            + "of a distributed graph or there does not exist a distributed graph."
+        )
         self._part_policy = part_policy
-        assert part_policy.get_size() == shape[0], \
-            'The partition policy does not match the input shape.'
+        assert (
+            part_policy.get_size() == shape[0]
+        ), "The partition policy does not match the input shape."
 
         if init_func is None:
             init_func = _default_init_data
@@ -143,13 +160,17 @@ class DistTensor:
         # If a user doesn't provide a name, we generate a name ourselves.
         # We need to generate the name in a deterministic way.
         if name is None:
-            assert not persistent, 'We cannot generate anonymous persistent distributed tensors'
+            assert (
+                not persistent
+            ), "We cannot generate anonymous persistent distributed tensors"
             global DIST_TENSOR_ID
             # All processes of the same role should create DistTensor synchronously.
             # Thus, all of them should have the same IDs.
-            name = 'anonymous-' + get_role() + '-' + str(DIST_TENSOR_ID)
+            name = "anonymous-" + get_role() + "-" + str(DIST_TENSOR_ID)
             DIST_TENSOR_ID += 1
-        assert isinstance(name, str), 'name {} is type {}'.format(name, type(name))
+        assert isinstance(name, str), "name {} is type {}".format(
+            name, type(name)
+        )
         name = self._attach_group_id(name)
         self._tensor_name = name
         data_name = part_policy.get_data_name(name)
@@ -157,16 +178,24 @@ class DistTensor:
         self._persistent = persistent
         if self._name not in exist_names:
             self._owner = True
-            self.kvstore.init_data(self._name, shape, dtype, part_policy, init_func, is_gdata)
+            self.kvstore.init_data(
+                self._name, shape, dtype, part_policy, init_func, is_gdata
+            )
         else:
             self._owner = False
             dtype1, shape1, _ = self.kvstore.get_data_meta(self._name)
-            assert dtype == dtype1, 'The dtype does not match with the existing tensor'
-            assert shape == shape1, 'The shape does not match with the existing tensor'
+            assert (
+                dtype == dtype1
+            ), "The dtype does not match with the existing tensor"
+            assert (
+                shape == shape1
+            ), "The shape does not match with the existing tensor"
 
     def __del__(self):
-        initialized = os.environ.get('DGL_DIST_MODE', 'standalone') == 'standalone' \
-            or is_initialized()
+        initialized = (
+            os.environ.get("DGL_DIST_MODE", "standalone") == "standalone"
+            or is_initialized()
+        )
         if not self._persistent and self._owner and initialized:
             self.kvstore.delete_data(self._name)
@@ -193,12 +222,12 @@ class DistTensor:
     def __or__(self, other):
         new_dist_tensor = DistTensor(
             self._shape,
             self._dtype,
             part_policy=self._part_policy,
             persistent=self._persistent,
             is_gdata=self._is_gdata,
-            attach=self._attach
+            attach=self._attach,
         )
         kvstore = self.kvstore
         kvstore.union(self._name, other._name, new_dist_tensor._name)
@@ -209,67 +238,67 @@ class DistTensor:
     @property
     def part_policy(self):
-        '''Return the partition policy
+        """Return the partition policy
 
         Returns
         -------
         PartitionPolicy
             The partition policy of the distributed tensor.
-        '''
+        """
         return self._part_policy
 
     @property
     def shape(self):
-        '''Return the shape of the distributed tensor.
+        """Return the shape of the distributed tensor.
 
         Returns
         -------
         tuple
             The shape of the distributed tensor.
-        '''
+        """
         return self._shape
 
     @property
     def dtype(self):
-        '''Return the data type of the distributed tensor.
+        """Return the data type of the distributed tensor.
 
         Returns
         ------
         dtype
             The data type of the tensor.
-        '''
+        """
         return self._dtype
 
     @property
     def name(self):
-        '''Return the name of the distributed tensor
+        """Return the name of the distributed tensor
 
         Returns
         -------
         str
             The name of the tensor.
-        '''
+        """
         return self._detach_group_id(self._name)
 
     @property
     def tensor_name(self):
-        '''Return the tensor name
+        """Return the tensor name
 
         Returns
         -------
         str
             The name of the tensor.
-        '''
+        """
         return self._detach_group_id(self._tensor_name)
 
     def count_nonzero(self):
-        '''Count and return the number of nonzero value
+        """Count and return the number of nonzero value
 
         Returns
         -------
         int
             the number of nonzero value
-        '''
+        """
         return self.kvstore.count_nonzero(name=self._name)
 
     def _attach_group_id(self, name):
@@ -295,4 +324,4 @@ class DistTensor:
         if not self._attach:
             return name
         suffix = "_{}".format(get_group_id())
-        return name[:-len(suffix)]
+        return name[: -len(suffix)]
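For context on the `DistTensor` API reformatted above, here is a hedged usage sketch based on its docstring; the IP config, graph name, and partition-config path are illustrative, not taken from this commit:

import dgl
import torch as th
from dgl.distributed import DistGraph, DistTensor

dgl.distributed.initialize("ip_config.txt")  # must be called first
g = DistGraph("graph_name", part_config="data/graph_name.json")

# The first dimension must match the number of nodes (or edges) so that a
# partition policy can be inferred; creation is synchronized across trainers.
feat = DistTensor((g.num_nodes(), 16), th.float32, name="my_feat")

# Reads and writes go through the KVStore; slicing returns a local tensor.
feat[th.tensor([0, 1, 2])] = th.ones(3, 16)
print(feat[th.tensor([0, 1, 2])].shape)  # (3, 16)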
@@ -5,8 +5,7 @@ from abc import ABC
 
 import numpy as np
 
-from .. import backend as F
-from .. import utils
+from .. import backend as F, utils
 from .._ffi.ndarray import empty_shared_mem
 from ..base import DGLError
 from ..ndarray import exist_shared_mem_array
@@ -14,16 +13,17 @@ from ..partition import NDArrayPartition
 from .constants import DEFAULT_ETYPE, DEFAULT_NTYPE
 from .id_map import IdMap
 from .shared_mem_utils import (
+    DTYPE_DICT,
     _get_edata_path,
     _get_ndata_path,
     _to_shared_mem,
-    DTYPE_DICT,
 )
 
 CANONICAL_ETYPE_DELIMITER = ":"
 
+
 def _etype_tuple_to_str(c_etype):
-    '''Convert canonical etype from tuple to string.
+    """Convert canonical etype from tuple to string.
 
     Examples
     --------
@@ -32,14 +32,16 @@ def _etype_tuple_to_str(c_etype):
     >>> print(c_etype_str)
     'user:like:item'
-    '''
-    assert isinstance(c_etype, tuple) and len(c_etype) == 3, \
-        "Passed-in canonical etype should be in format of (str, str, str). " \
-        f"But got {c_etype}."
+    """
+    assert isinstance(c_etype, tuple) and len(c_etype) == 3, (
+        "Passed-in canonical etype should be in format of (str, str, str). "
+        f"But got {c_etype}."
+    )
     return CANONICAL_ETYPE_DELIMITER.join(c_etype)
 
 
 def _etype_str_to_tuple(c_etype):
-    '''Convert canonical etype from tuple to string.
+    """Convert canonical etype from tuple to string.
 
     Examples
     --------
@@ -48,13 +50,15 @@ def _etype_str_to_tuple(c_etype):
     >>> print(c_etype)
     ('user', 'like', 'item')
-    '''
+    """
     ret = tuple(c_etype.split(CANONICAL_ETYPE_DELIMITER))
-    assert len(ret) == 3, \
-        "Passed-in canonical etype should be in format of 'str:str:str'. " \
-        f"But got {c_etype}."
+    assert len(ret) == 3, (
+        "Passed-in canonical etype should be in format of 'str:str:str'. "
+        f"But got {c_etype}."
+    )
     return ret
 
 
 def _move_metadata_to_shared_mem(
     graph_name,
     num_nodes,
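The two helpers above serialize a canonical edge type as a ":"-joined string and back. A standalone sketch of the round trip they implement (names simplified, not the DGL module itself):

CANONICAL_ETYPE_DELIMITER = ":"

def etype_tuple_to_str(c_etype):
    # ("user", "like", "item") -> "user:like:item"
    assert isinstance(c_etype, tuple) and len(c_etype) == 3
    return CANONICAL_ETYPE_DELIMITER.join(c_etype)

def etype_str_to_tuple(c_etype):
    # "user:like:item" -> ("user", "like", "item")
    ret = tuple(c_etype.split(CANONICAL_ETYPE_DELIMITER))
    assert len(ret) == 3
    return ret

assert etype_str_to_tuple(etype_tuple_to_str(("user", "like", "item"))) == ("user", "like", "item")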
@@ -533,6 +537,7 @@ class GraphPartitionBook(ABC):
             Homogeneous edge IDs.
         """
 
+
 class RangePartitionBook(GraphPartitionBook):
     """This partition book supports more efficient storage of partition information.
@@ -582,9 +587,10 @@ class RangePartitionBook(GraphPartitionBook):
             ntype is not None for ntype in self._ntypes
         ), "The node types have invalid IDs."
         for c_etype, etype_id in etypes.items():
-            assert isinstance(c_etype, tuple) and len(c_etype) == 3, \
-                "Expect canonical edge type in a triplet of string, but got " \
-                f"{c_etype}."
+            assert isinstance(c_etype, tuple) and len(c_etype) == 3, (
+                "Expect canonical edge type in a triplet of string, but got "
+                f"{c_etype}."
+            )
             etype = c_etype[1]
             self._etypes[etype_id] = etype
             self._canonical_etypes[etype_id] = c_etype
@@ -660,13 +666,19 @@ class RangePartitionBook(GraphPartitionBook):
         # to local heterogenized node/edge IDs. One can do the mapping by binary search
         # on these arrays.
         self._local_ntype_offset = np.cumsum(
-            [0] + [
-                v[self._partid, 1] - v[self._partid, 0]
-                for v in self._typed_nid_range.values()]).tolist()
+            [0]
+            + [
+                v[self._partid, 1] - v[self._partid, 0]
+                for v in self._typed_nid_range.values()
+            ]
+        ).tolist()
         self._local_etype_offset = np.cumsum(
-            [0] + [
-                v[self._partid, 1] - v[self._partid, 0]
-                for v in self._typed_eid_range.values()]).tolist()
+            [0]
+            + [
+                v[self._partid, 1] - v[self._partid, 0]
+                for v in self._typed_eid_range.values()
+            ]
+        ).tolist()
 
         # Get meta data of the partition book
         self._partition_meta_data = []
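The reformatted block above builds cumulative per-type offsets so that local homogeneous IDs can be mapped back by binary search, as the comment says. A numpy sketch of that pattern, with made-up per-type sizes:

import numpy as np

# Sizes of each node type stored in this partition, in type order.
sizes = [4, 6, 3]                   # illustrative
offset = np.cumsum([0] + sizes)     # [0, 4, 10, 13]

local_ids = np.array([0, 5, 12])
type_ids = np.searchsorted(offset, local_ids, side="right") - 1
per_type_ids = local_ids - offset[type_ids]
print(type_ids)      # [0 1 2]
print(per_type_ids)  # [0 1 2]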
@@ -945,7 +957,7 @@ class RangePartitionBook(GraphPartitionBook):
 
 NODE_PART_POLICY = "node"
 EDGE_PART_POLICY = "edge"
-POLICY_DELIMITER = '~'
+POLICY_DELIMITER = "~"
 
 
 class PartitionPolicy(object):
@@ -967,11 +979,12 @@ class PartitionPolicy(object):
     """
 
     def __init__(self, policy_str, partition_book):
-        assert (policy_str.startswith(NODE_PART_POLICY) or
-                policy_str.startswith(EDGE_PART_POLICY)), (
-            f"policy_str must start with {NODE_PART_POLICY} or "
-            f"{EDGE_PART_POLICY}, but got {policy_str}."
-        )
+        assert policy_str.startswith(NODE_PART_POLICY) or policy_str.startswith(
+            EDGE_PART_POLICY
+        ), (
+            f"policy_str must start with {NODE_PART_POLICY} or "
+            f"{EDGE_PART_POLICY}, but got {policy_str}."
+        )
         if NODE_PART_POLICY == policy_str:
             policy_str = NODE_PART_POLICY + POLICY_DELIMITER + DEFAULT_NTYPE
         if EDGE_PART_POLICY == policy_str:
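For reference, the assert above enforces the policy-string convention: a bare "node" or "edge" is expanded with the default type name via POLICY_DELIMITER. An illustrative sketch (DEFAULT_NTYPE is assumed here to be DGL's default node type name "_N"):

NODE_PART_POLICY = "node"
POLICY_DELIMITER = "~"
DEFAULT_NTYPE = "_N"  # assumed default node type name

policy_str = NODE_PART_POLICY
if policy_str == NODE_PART_POLICY:
    policy_str = NODE_PART_POLICY + POLICY_DELIMITER + DEFAULT_NTYPE
print(policy_str)  # node~_N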
@@ -1127,11 +1140,12 @@ class EdgePartitionPolicy(PartitionPolicy):
     """Partition policy for edges."""
 
     def __init__(self, partition_book, etype=DEFAULT_ETYPE):
-        assert isinstance(etype, tuple) and len(etype) == 3, \
-            f"Expect canonical edge type in a triplet of string, but got {etype}."
+        assert (
+            isinstance(etype, tuple) and len(etype) == 3
+        ), f"Expect canonical edge type in a triplet of string, but got {etype}."
         super(EdgePartitionPolicy, self).__init__(
             EDGE_PART_POLICY + POLICY_DELIMITER + _etype_tuple_to_str(etype),
-            partition_book
+            partition_book,
         )
@@ -1156,9 +1170,10 @@ class HeteroDataName(object):
     def __init__(self, is_node, entity_type, data_name):
         self._policy = NODE_PART_POLICY if is_node else EDGE_PART_POLICY
         if not is_node:
-            assert isinstance(entity_type, tuple) and len(entity_type) == 3, \
-                "Expect canonical edge type in a triplet of string, but got " \
-                f"{entity_type}."
+            assert isinstance(entity_type, tuple) and len(entity_type) == 3, (
+                "Expect canonical edge type in a triplet of string, but got "
+                f"{entity_type}."
+            )
         self._entity_type = entity_type
         self.data_name = data_name
@@ -1226,6 +1241,4 @@ def parse_hetero_data_name(name):
     entity_type = names[1]
     if not is_node:
         entity_type = _etype_str_to_tuple(entity_type)
-    return HeteroDataName(
-        is_node, entity_type, names[2]
-    )
+    return HeteroDataName(is_node, entity_type, names[2])
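`parse_hetero_data_name` above splits a serialized data name into its policy, entity type, and data name, decoding the entity type of edge data with `_etype_str_to_tuple`. An illustrative sketch with a made-up name:

# Name layout decoded above: "<node|edge>~<entity type>~<data name>",
# where edge data carries a canonical etype string as the entity type.
POLICY_DELIMITER = "~"

name = "edge~user:like:item~weight"  # made-up data name
policy, entity_type, data_name = name.split(POLICY_DELIMITER)
is_node = policy == "node"
if not is_node:
    entity_type = tuple(entity_type.split(":"))
print(is_node, entity_type, data_name)  # False ('user', 'like', 'item') weight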
@@ -6,16 +6,17 @@ import numpy as np
 from .. import backend as F
 from ..base import EID, NID
 from ..convert import graph, heterograph
-from ..sampling import sample_etype_neighbors as local_sample_etype_neighbors
-from ..sampling import sample_neighbors as local_sample_neighbors
+from ..sampling import (
+    sample_etype_neighbors as local_sample_etype_neighbors,
+    sample_neighbors as local_sample_neighbors,
+)
 from ..subgraph import in_subgraph as local_in_subgraph
 from ..utils import toindex
-from .. import backend as F
 from .rpc import (
+    Request,
+    Response,
     recv_responses,
     register_service,
-    Request,
-    Response,
     send_requests_to_machine,
 )
@@ -207,6 +208,7 @@ def _in_subgraph(local_g, partition_book, seed_nodes):
 # This is a limitation of the current DistDGL design. We should improve it
 # later.
 
+
 class SamplingRequest(Request):
     """Sampling Request"""
@@ -798,9 +800,7 @@ def sample_neighbors(g, nodes, fanout, edge_dir="in", prob=None, replace=False):
 
     def local_access(local_g, partition_book, local_nids):
         # See NOTE 1
-        _prob = (
-            [g.edata[prob].local_partition] if prob is not None else None
-        )
+        _prob = [g.edata[prob].local_partition] if prob is not None else None
         return _sample_neighbors(
             local_g,
             partition_book,
...
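The reformatted `local_access` above forwards the local partition of `g.edata[prob]` when probability-weighted sampling is requested. A hedged usage sketch of the public API (cluster setup, graph name, and config path are illustrative):

import dgl
import torch as th

dgl.distributed.initialize("ip_config.txt")
g = dgl.distributed.DistGraph("graph_name", part_config="data/graph_name.json")

seeds = th.tensor([0, 1, 2])
# Sample up to 5 in-edges per seed; passing prob="w" would weight sampling
# by an edge-data field, the case local_access handles above.
frontier = dgl.distributed.sample_neighbors(g, seeds, fanout=5, edge_dir="in")
print(frontier.num_edges())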
"""Module for mapping between node/edge IDs and node/edge types.""" """Module for mapping between node/edge IDs and node/edge types."""
import numpy as np import numpy as np
from .. import backend as F, utils
from .._ffi.function import _init_api from .._ffi.function import _init_api
from .. import backend as F
from .. import utils
class IdMap: class IdMap:
'''A map for converting node/edge IDs to their type IDs and type-wise IDs. """A map for converting node/edge IDs to their type IDs and type-wise IDs.
For a heterogeneous graph, DGL assigns an integer ID to each node/edge type; For a heterogeneous graph, DGL assigns an integer ID to each node/edge type;
node and edge of different types have independent IDs starting from zero. node and edge of different types have independent IDs starting from zero.
@@ -96,7 +97,8 @@ class IdMap:
         for a particular node type in a partition. For example, all nodes of type ``"T"`` in
         partition ``i`` has ID range ``id_ranges["T"][i][0]`` to ``id_ranges["T"][i][1]``.
         It is the same as the `node_map` argument in `RangePartitionBook`.
-    '''
+    """
+
     def __init__(self, id_ranges):
         self.num_parts = list(id_ranges.values())[0].shape[0]
         self.num_types = len(id_ranges)
@@ -105,7 +107,7 @@ class IdMap:
         id_ranges = list(id_ranges.values())
         id_ranges.sort(key=lambda a: a[0, 0])
         for i, id_range in enumerate(id_ranges):
-            ranges[i::self.num_types] = id_range
+            ranges[i :: self.num_types] = id_range
             map1 = np.cumsum(id_range[:, 1] - id_range[:, 0])
             typed_map.append(map1)
@@ -116,7 +118,7 @@ class IdMap:
         self.typed_map = utils.toindex(np.concatenate(typed_map))
 
     def __call__(self, ids):
-        '''Convert the homogeneous IDs to (type_id, type_wise_id).
+        """Convert the homogeneous IDs to (type_id, type_wise_id).
 
         Parameters
         ----------
@@ -129,19 +131,23 @@ class IdMap:
             Type IDs
         per_type_ids : Tensor
             Type-wise IDs
-        '''
+        """
         if self.num_types == 0:
             return F.zeros((len(ids),), F.dtype(ids), F.cpu()), ids
         if len(ids) == 0:
             return ids, ids
 
         ids = utils.toindex(ids)
-        ret = _CAPI_DGLHeteroMapIds(ids.todgltensor(),
-                                    self.range_start.todgltensor(),
-                                    self.range_end.todgltensor(),
-                                    self.typed_map.todgltensor(),
-                                    self.num_parts, self.num_types)
+        ret = _CAPI_DGLHeteroMapIds(
+            ids.todgltensor(),
+            self.range_start.todgltensor(),
+            self.range_end.todgltensor(),
+            self.typed_map.todgltensor(),
+            self.num_parts,
+            self.num_types,
+        )
         ret = utils.toindex(ret).tousertensor()
-        return ret[:len(ids)], ret[len(ids):]
+        return ret[: len(ids)], ret[len(ids) :]
 
 
 _init_api("dgl.distributed.id_map")
[This file's diff is collapsed and not shown.]
"""Define sparse embedding and optimizer.""" """Define sparse embedding and optimizer."""
import torch as th import torch as th
from .... import backend as F
from .... import utils from .... import backend as F, utils
from ...dist_tensor import DistTensor from ...dist_tensor import DistTensor
class DistEmbedding: class DistEmbedding:
'''Distributed node embeddings. """Distributed node embeddings.
DGL provides a distributed embedding to support models that require learnable embeddings. DGL provides a distributed embedding to support models that require learnable embeddings.
DGL's distributed embeddings are mainly used for learning node embeddings of graph models. DGL's distributed embeddings are mainly used for learning node embeddings of graph models.
@@ -63,11 +64,23 @@ class DistEmbedding:
     the forward computation, users have to invoke
     py:meth:`~dgl.distributed.optim.SparseAdagrad.step` afterwards. Otherwise, there will be
     some memory leak.
-    '''
-    def __init__(self, num_embeddings, embedding_dim, name=None,
-                 init_func=None, part_policy=None):
-        self._tensor = DistTensor((num_embeddings, embedding_dim), F.float32, name,
-                                  init_func=init_func, part_policy=part_policy)
+    """
+
+    def __init__(
+        self,
+        num_embeddings,
+        embedding_dim,
+        name=None,
+        init_func=None,
+        part_policy=None,
+    ):
+        self._tensor = DistTensor(
+            (num_embeddings, embedding_dim),
+            F.float32,
+            name,
+            init_func=init_func,
+            part_policy=part_policy,
+        )
         self._trace = []
         self._name = name
         self._num_embeddings = num_embeddings
@@ -81,10 +94,10 @@ class DistEmbedding:
         # actually fails unit test. ???
         # else:
         #     assert 'th.distributed should be initialized'
-        self._optm_state = None # track optimizer state
+        self._optm_state = None  # track optimizer state
         self._part_policy = part_policy
 
-    def __call__(self, idx, device=th.device('cpu')):
+    def __call__(self, idx, device=th.device("cpu")):
         """
         node_ids : th.tensor
             Index of the embeddings to collect.
@@ -104,8 +117,7 @@ class DistEmbedding:
         return emb
 
     def reset_trace(self):
-        '''Reset the traced data.
-        '''
+        """Reset the traced data."""
         self._trace = []
 
     @property
...
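Per the docstring above, `DistEmbedding` only records gradients in its forward trace, and the sparse optimizer's `step()` must be invoked afterwards to avoid the memory leak it warns about. A hedged sketch (cluster initialization elided, sizes illustrative):

import torch as th
from dgl.distributed import DistEmbedding
from dgl.distributed.optim import SparseAdagrad

emb = DistEmbedding(num_embeddings=1000, embedding_dim=16, name="node_emb")
optimizer = SparseAdagrad([emb], lr=0.01)

nids = th.tensor([0, 1, 2])
feats = emb(nids)   # forward: records the trace
loss = feats.sum()
loss.backward()
optimizer.step()    # must be called, or the trace leaks memory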
@@ -10,9 +10,9 @@ import dgl
 from .... import backend as F
 from ...dist_tensor import DistTensor
+from ...graph_partition_book import EDGE_PART_POLICY, NODE_PART_POLICY
 from ...nn.pytorch import DistEmbedding
 from .utils import alltoall_cpu, alltoallv_cpu
-from ...graph_partition_book import EDGE_PART_POLICY, NODE_PART_POLICY
 
 EMB_STATES = "emb_states"
 WORLD_SIZE = "world_size"
...
[This file's diff is collapsed and not shown.]
"""Define utility functions for shared memory.""" """Define utility functions for shared memory."""
from .. import backend as F from .. import backend as F, ndarray as nd
from .. import ndarray as nd
from .._ffi.ndarray import empty_shared_mem from .._ffi.ndarray import empty_shared_mem
DTYPE_DICT = F.data_type_dict DTYPE_DICT = F.data_type_dict
......
@@ -5,12 +5,14 @@ This kvstore is used when running in the standalone mode
 from .. import backend as F
 
 
 class KVClient(object):
-    ''' The fake KVStore client.
+    """The fake KVStore client.
 
     This is to mimic the distributed KVStore client. It's used for DistGraph
     in standalone mode.
-    '''
+    """
+
     def __init__(self):
         self._data = {}
         self._all_possible_part_policy = {}
@@ -30,25 +32,27 @@ class KVClient(object):
         return 1
 
     def barrier(self):
-        '''barrier'''
+        """barrier"""
 
     def register_push_handler(self, name, func):
-        '''register push handler'''
+        """register push handler"""
         self._push_handlers[name] = func
 
     def register_pull_handler(self, name, func):
-        '''register pull handler'''
+        """register pull handler"""
         self._pull_handlers[name] = func
 
     def add_data(self, name, tensor, part_policy):
-        '''add data to the client'''
+        """add data to the client"""
         self._data[name] = tensor
         self._gdata_name_list.add(name)
         if part_policy.policy_str not in self._all_possible_part_policy:
             self._all_possible_part_policy[part_policy.policy_str] = part_policy
 
-    def init_data(self, name, shape, dtype, part_policy, init_func, is_gdata=True):
-        '''add new data to the client'''
+    def init_data(
+        self, name, shape, dtype, part_policy, init_func, is_gdata=True
+    ):
+        """add new data to the client"""
         self._data[name] = init_func(shape, dtype)
         if part_policy.policy_str not in self._all_possible_part_policy:
             self._all_possible_part_policy[part_policy.policy_str] = part_policy
@@ -56,38 +60,38 @@ class KVClient(object):
         self._gdata_name_list.add(name)
 
     def delete_data(self, name):
-        '''delete the data'''
+        """delete the data"""
         del self._data[name]
         self._gdata_name_list.remove(name)
 
     def data_name_list(self):
-        '''get the names of all data'''
+        """get the names of all data"""
         return list(self._data.keys())
 
     def gdata_name_list(self):
-        '''get the names of graph data'''
+        """get the names of graph data"""
         return list(self._gdata_name_list)
 
     def get_data_meta(self, name):
-        '''get the metadata of data'''
+        """get the metadata of data"""
         return F.dtype(self._data[name]), F.shape(self._data[name]), None
 
     def push(self, name, id_tensor, data_tensor):
-        '''push data to kvstore'''
+        """push data to kvstore"""
         if name in self._push_handlers:
             self._push_handlers[name](self._data, name, id_tensor, data_tensor)
         else:
             F.scatter_row_inplace(self._data[name], id_tensor, data_tensor)
 
     def pull(self, name, id_tensor):
-        '''pull data from kvstore'''
+        """pull data from kvstore"""
         if name in self._pull_handlers:
             return self._pull_handlers[name](self._data, name, id_tensor)
         else:
             return F.gather_row(self._data[name], id_tensor)
 
     def map_shared_data(self, partition_book):
-        '''Mapping shared-memory tensor from server to client.'''
+        """Mapping shared-memory tensor from server to client."""
 
     def count_nonzero(self, name):
         """Count nonzero value by pull request from KVServers.
@@ -116,8 +120,7 @@ class KVClient(object):
         return self._data
 
     def union(self, operand1_name, operand2_name, output_name):
-        """Compute the union of two mask arrays in the KVStore.
-        """
+        """Compute the union of two mask arrays in the KVStore."""
         self._data[output_name][:] = (
             self._data[operand1_name] | self._data[operand2_name]
         )
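The fake `KVClient` above mimics the distributed KVStore with an in-process dict: `push` scatters rows into a tensor and `pull` gathers them. The same contract, sketched with torch in place of the backend wrappers:

import torch as th

data = {"feat": th.zeros(4, 2)}

def push(name, ids, rows):
    data[name][ids] = rows   # stands in for F.scatter_row_inplace

def pull(name, ids):
    return data[name][ids]   # stands in for F.gather_row

push("feat", th.tensor([1, 3]), th.ones(2, 2))
print(pull("feat", th.tensor([0, 1])))  # row 0 stays zeros, row 1 is ones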