"git@developer.sourcefind.cn:OpenDAS/ollama.git" did not exist on "23e1da778ddcb2a217fa965435d44df1d09cf4b2"
Unverified Commit 0b3a447b authored by Hongzhi (Steve), Chen; committed by GitHub

auto format distributed (#5317)


Co-authored-by: Ubuntu <ubuntu@ip-172-31-28-63.ap-northeast-1.compute.internal>
parent 74c9d27d
......@@ -127,7 +127,7 @@ class CustomPool:
# should be able to take infinite elements to avoid deadlock.
self.queue_size = 0
self.result_queue = ctx.Queue(self.queue_size)
self.results = {} # key is dataloader name, value is fetched batch.
self.results = {} # key is dataloader name, value is fetched batch.
self.task_queues = []
self.process_list = []
self.current_proc_id = 0
......
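The comment in this hunk notes that the result queue must accept an unbounded number of elements so that producer processes never block on put() and deadlock the pool. Below is a minimal, self-contained sketch of that pattern using the standard multiprocessing module; the worker function and loader names are hypothetical and not part of DGL.

```python
# Minimal sketch: an unbounded multiprocessing result queue so producers
# never block on put(), avoiding a potential deadlock when the consumer
# drains results slowly.  Worker and loader names are hypothetical.
import multiprocessing as mp


def _worker(result_queue, name, num_batches):
    for i in range(num_batches):
        # put() never blocks because the queue size is infinite (maxsize <= 0).
        result_queue.put((name, f"batch-{i}"))


if __name__ == "__main__":
    ctx = mp.get_context("spawn")
    result_queue = ctx.Queue(0)   # 0 => unbounded, mirrors CustomPool above
    results = {}                  # key: dataloader name, value: fetched batches
    procs = [
        ctx.Process(target=_worker, args=(result_queue, f"loader-{i}", 3))
        for i in range(2)
    ]
    for p in procs:
        p.start()
    for _ in range(2 * 3):
        name, batch = result_queue.get()
        results.setdefault(name, []).append(batch)
    for p in procs:
        p.join()
    print(results)
```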
......@@ -2,21 +2,24 @@
import os
from .. import backend as F, utils
from .dist_context import is_initialized
from .kvstore import get_kvstore
from .role import get_role
from .. import utils
from .. import backend as F
from .rpc import get_group_id
def _default_init_data(shape, dtype):
return F.zeros(shape, dtype, F.cpu())
# These IDs can identify the anonymous distributed tensors.
DIST_TENSOR_ID = 0
class DistTensor:
''' Distributed tensor.
"""Distributed tensor.
``DistTensor`` refers to a distributed tensor sharded and stored in a cluster of machines.
It has the same interface as a PyTorch Tensor for accessing its metadata (e.g., shape and data type).
......@@ -103,12 +106,23 @@ class DistTensor:
The creation of ``DistTensor`` is a synchronized operation. When a trainer process tries to
create a ``DistTensor`` object, the creation succeeds only when all trainer processes
do the same.
'''
def __init__(self, shape, dtype, name=None, init_func=None, part_policy=None,
persistent=False, is_gdata=True, attach=True):
"""
def __init__(
self,
shape,
dtype,
name=None,
init_func=None,
part_policy=None,
persistent=False,
is_gdata=True,
attach=True,
):
self.kvstore = get_kvstore()
assert self.kvstore is not None, \
'Distributed module is not initialized. Please call dgl.distributed.initialize.'
assert (
self.kvstore is not None
), "Distributed module is not initialized. Please call dgl.distributed.initialize."
self._shape = shape
self._dtype = dtype
self._attach = attach
......@@ -124,18 +138,21 @@ class DistTensor:
# If multiple partition policies match the input shape, we cannot
# decide which is the right one automatically. We should ask users
# to provide one.
assert part_policy is None, \
'Multiple partition policies match the input shape. ' \
+ 'Please provide a partition policy explicitly.'
assert part_policy is None, (
"Multiple partition policies match the input shape. "
+ "Please provide a partition policy explicitly."
)
part_policy = policy
assert part_policy is not None, \
'Cannot find a right partition policy. It is either because ' \
+ 'its first dimension does not match the number of nodes or edges ' \
+ 'of a distributed graph or there does not exist a distributed graph.'
assert part_policy is not None, (
"Cannot find a right partition policy. It is either because "
+ "its first dimension does not match the number of nodes or edges "
+ "of a distributed graph or there does not exist a distributed graph."
)
self._part_policy = part_policy
assert part_policy.get_size() == shape[0], \
'The partition policy does not match the input shape.'
assert (
part_policy.get_size() == shape[0]
), "The partition policy does not match the input shape."
if init_func is None:
init_func = _default_init_data
......@@ -143,13 +160,17 @@ class DistTensor:
# If a user doesn't provide a name, we generate a name ourselves.
# We need to generate the name in a deterministic way.
if name is None:
assert not persistent, 'We cannot generate anonymous persistent distributed tensors'
assert (
not persistent
), "We cannot generate anonymous persistent distributed tensors"
global DIST_TENSOR_ID
# All processes of the same role should create DistTensor synchronously.
# Thus, all of them should have the same IDs.
name = 'anonymous-' + get_role() + '-' + str(DIST_TENSOR_ID)
name = "anonymous-" + get_role() + "-" + str(DIST_TENSOR_ID)
DIST_TENSOR_ID += 1
assert isinstance(name, str), 'name {} is type {}'.format(name, type(name))
assert isinstance(name, str), "name {} is type {}".format(
name, type(name)
)
name = self._attach_group_id(name)
self._tensor_name = name
data_name = part_policy.get_data_name(name)
......@@ -157,16 +178,24 @@ class DistTensor:
self._persistent = persistent
if self._name not in exist_names:
self._owner = True
self.kvstore.init_data(self._name, shape, dtype, part_policy, init_func, is_gdata)
self.kvstore.init_data(
self._name, shape, dtype, part_policy, init_func, is_gdata
)
else:
self._owner = False
dtype1, shape1, _ = self.kvstore.get_data_meta(self._name)
assert dtype == dtype1, 'The dtype does not match with the existing tensor'
assert shape == shape1, 'The shape does not match with the existing tensor'
assert (
dtype == dtype1
), "The dtype does not match with the existing tensor"
assert (
shape == shape1
), "The shape does not match with the existing tensor"
def __del__(self):
initialized = os.environ.get('DGL_DIST_MODE', 'standalone') == 'standalone' \
or is_initialized()
initialized = (
os.environ.get("DGL_DIST_MODE", "standalone") == "standalone"
or is_initialized()
)
if not self._persistent and self._owner and initialized:
self.kvstore.delete_data(self._name)
......@@ -193,12 +222,12 @@ class DistTensor:
def __or__(self, other):
new_dist_tensor = DistTensor(
self._shape,
self._dtype,
part_policy=self._part_policy,
persistent=self._persistent,
is_gdata=self._is_gdata,
attach=self._attach
self._shape,
self._dtype,
part_policy=self._part_policy,
persistent=self._persistent,
is_gdata=self._is_gdata,
attach=self._attach,
)
kvstore = self.kvstore
kvstore.union(self._name, other._name, new_dist_tensor._name)
......@@ -209,67 +238,67 @@ class DistTensor:
@property
def part_policy(self):
'''Return the partition policy
"""Return the partition policy
Returns
-------
PartitionPolicy
The partition policy of the distributed tensor.
'''
"""
return self._part_policy
@property
def shape(self):
'''Return the shape of the distributed tensor.
"""Return the shape of the distributed tensor.
Returns
-------
tuple
The shape of the distributed tensor.
'''
"""
return self._shape
@property
def dtype(self):
'''Return the data type of the distributed tensor.
"""Return the data type of the distributed tensor.
Returns
-------
dtype
The data type of the tensor.
'''
"""
return self._dtype
@property
def name(self):
'''Return the name of the distributed tensor
"""Return the name of the distributed tensor
Returns
-------
str
The name of the tensor.
'''
"""
return self._detach_group_id(self._name)
@property
def tensor_name(self):
'''Return the tensor name
"""Return the tensor name
Returns
-------
str
The name of the tensor.
'''
"""
return self._detach_group_id(self._tensor_name)
def count_nonzero(self):
'''Count and return the number of nonzero values
"""Count and return the number of nonzero values
Returns
-------
int
the number of nonzero values
'''
"""
return self.kvstore.count_nonzero(name=self._name)
def _attach_group_id(self, name):
......@@ -295,4 +324,4 @@ class DistTensor:
if not self._attach:
return name
suffix = "_{}".format(get_group_id())
return name[:-len(suffix)]
return name[: -len(suffix)]
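For context on the reformatted constructor above, here is a hedged usage sketch of DistTensor from a trainer process. It assumes dgl.distributed.initialize() has been called, that a partitioned graph published under the name "graph_name" exists, and that every trainer runs the same code (DistTensor creation is synchronized); the tensor name "feat", the config file, and the sizes are illustrative only.

```python
# Hedged sketch, not a complete training script.  Every trainer process must
# execute this block, since DistTensor creation is a synchronized operation.
import torch as th
import dgl

dgl.distributed.initialize(ip_config="ip_config.txt")   # assumed config file
g = dgl.distributed.DistGraph("graph_name")              # assumed partitioned graph

# Create (or attach to) a distributed tensor with one row per node.
feat = dgl.distributed.DistTensor(
    (g.num_nodes(), 16),
    th.float32,
    name="feat",              # illustrative name
    persistent=False,
)

nids = th.arange(0, 10)
print(feat[nids].shape)           # read a slice: pulls rows from the kvstore
feat[nids] = th.zeros(10, 16)     # write a slice: pushes rows to the kvstore
```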
......@@ -5,8 +5,7 @@ from abc import ABC
import numpy as np
from .. import backend as F
from .. import utils
from .. import backend as F, utils
from .._ffi.ndarray import empty_shared_mem
from ..base import DGLError
from ..ndarray import exist_shared_mem_array
......@@ -14,16 +13,17 @@ from ..partition import NDArrayPartition
from .constants import DEFAULT_ETYPE, DEFAULT_NTYPE
from .id_map import IdMap
from .shared_mem_utils import (
DTYPE_DICT,
_get_edata_path,
_get_ndata_path,
_to_shared_mem,
DTYPE_DICT,
)
CANONICAL_ETYPE_DELIMITER = ":"
def _etype_tuple_to_str(c_etype):
'''Convert canonical etype from tuple to string.
"""Convert canonical etype from tuple to string.
Examples
--------
......@@ -32,14 +32,16 @@ def _etype_tuple_to_str(c_etype):
>>> print(c_etype_str)
'user:like:item'
'''
assert isinstance(c_etype, tuple) and len(c_etype) == 3, \
"Passed-in canonical etype should be in format of (str, str, str). " \
"""
assert isinstance(c_etype, tuple) and len(c_etype) == 3, (
"Passed-in canonical etype should be in format of (str, str, str). "
f"But got {c_etype}."
)
return CANONICAL_ETYPE_DELIMITER.join(c_etype)
def _etype_str_to_tuple(c_etype):
'''Convert canonical etype from string to tuple.
"""Convert canonical etype from string to tuple.
Examples
--------
......@@ -48,13 +50,15 @@ def _etype_str_to_tuple(c_etype):
>>> print(c_etype)
('user', 'like', 'item')
'''
"""
ret = tuple(c_etype.split(CANONICAL_ETYPE_DELIMITER))
assert len(ret) == 3, \
"Passed-in canonical etype should be in format of 'str:str:str'. " \
assert len(ret) == 3, (
"Passed-in canonical etype should be in format of 'str:str:str'. "
f"But got {c_etype}."
)
return ret
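_etype_tuple_to_str and _etype_str_to_tuple above are simple inverses built around the ":" delimiter. The stand-alone re-implementation below illustrates the same round trip with local helper names rather than the private DGL functions.

```python
# Stand-alone illustration of the ':'-delimited canonical etype encoding
# used by _etype_tuple_to_str / _etype_str_to_tuple above.
CANONICAL_ETYPE_DELIMITER = ":"


def etype_tuple_to_str(c_etype):
    assert isinstance(c_etype, tuple) and len(c_etype) == 3
    return CANONICAL_ETYPE_DELIMITER.join(c_etype)


def etype_str_to_tuple(c_etype):
    ret = tuple(c_etype.split(CANONICAL_ETYPE_DELIMITER))
    assert len(ret) == 3
    return ret


assert etype_tuple_to_str(("user", "like", "item")) == "user:like:item"
assert etype_str_to_tuple("user:like:item") == ("user", "like", "item")
```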
def _move_metadata_to_shared_mem(
graph_name,
num_nodes,
......@@ -533,6 +537,7 @@ class GraphPartitionBook(ABC):
Homogeneous edge IDs.
"""
class RangePartitionBook(GraphPartitionBook):
"""This partition book supports more efficient storage of partition information.
......@@ -582,9 +587,10 @@ class RangePartitionBook(GraphPartitionBook):
ntype is not None for ntype in self._ntypes
), "The node types have invalid IDs."
for c_etype, etype_id in etypes.items():
assert isinstance(c_etype, tuple) and len(c_etype) == 3, \
"Expect canonical edge type in a triplet of string, but got " \
assert isinstance(c_etype, tuple) and len(c_etype) == 3, (
"Expect canonical edge type in a triplet of string, but got "
f"{c_etype}."
)
etype = c_etype[1]
self._etypes[etype_id] = etype
self._canonical_etypes[etype_id] = c_etype
......@@ -660,13 +666,19 @@ class RangePartitionBook(GraphPartitionBook):
# to local heterogenized node/edge IDs. One can do the mapping by binary search
# on these arrays.
self._local_ntype_offset = np.cumsum(
[0] + [
v[self._partid, 1] - v[self._partid, 0]
for v in self._typed_nid_range.values()]).tolist()
[0]
+ [
v[self._partid, 1] - v[self._partid, 0]
for v in self._typed_nid_range.values()
]
).tolist()
self._local_etype_offset = np.cumsum(
[0] + [
v[self._partid, 1] - v[self._partid, 0]
for v in self._typed_eid_range.values()]).tolist()
[0]
+ [
v[self._partid, 1] - v[self._partid, 0]
for v in self._typed_eid_range.values()
]
).tolist()
# Get meta data of the partition book
self._partition_meta_data = []
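The comment above explains that these cumulative offsets let the partition book map a local heterogenized ID back to its type via binary search. A small NumPy sketch of that lookup follows; the per-type sizes are made up for illustration.

```python
import numpy as np

# Hypothetical per-type sizes owned by this partition: the first node type
# holds local IDs [0, 5) and the second holds [5, 12).
sizes = [5, 7]
local_ntype_offset = np.cumsum([0] + sizes).tolist()   # [0, 5, 12]


def local_id_to_type(local_id):
    # Binary search over the cumulative offsets, as the comment describes.
    type_id = np.searchsorted(local_ntype_offset, local_id, side="right") - 1
    type_wise_id = local_id - local_ntype_offset[type_id]
    return type_id, type_wise_id


assert local_id_to_type(3) == (0, 3)   # falls in the first type's range
assert local_id_to_type(7) == (1, 2)   # 7 - 5 = 2 within the second type
```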
......@@ -945,7 +957,7 @@ class RangePartitionBook(GraphPartitionBook):
NODE_PART_POLICY = "node"
EDGE_PART_POLICY = "edge"
POLICY_DELIMITER = '~'
POLICY_DELIMITER = "~"
class PartitionPolicy(object):
......@@ -967,11 +979,12 @@ class PartitionPolicy(object):
"""
def __init__(self, policy_str, partition_book):
assert (policy_str.startswith(NODE_PART_POLICY) or
policy_str.startswith(EDGE_PART_POLICY)), (
f"policy_str must start with {NODE_PART_POLICY} or "
f"{EDGE_PART_POLICY}, but got {policy_str}."
)
assert policy_str.startswith(NODE_PART_POLICY) or policy_str.startswith(
EDGE_PART_POLICY
), (
f"policy_str must start with {NODE_PART_POLICY} or "
f"{EDGE_PART_POLICY}, but got {policy_str}."
)
if NODE_PART_POLICY == policy_str:
policy_str = NODE_PART_POLICY + POLICY_DELIMITER + DEFAULT_NTYPE
if EDGE_PART_POLICY == policy_str:
......@@ -1127,11 +1140,12 @@ class EdgePartitionPolicy(PartitionPolicy):
"""Partition policy for edges."""
def __init__(self, partition_book, etype=DEFAULT_ETYPE):
assert isinstance(etype, tuple) and len(etype) == 3, \
f"Expect canonical edge type in a triplet of string, but got {etype}."
assert (
isinstance(etype, tuple) and len(etype) == 3
), f"Expect canonical edge type in a triplet of string, but got {etype}."
super(EdgePartitionPolicy, self).__init__(
EDGE_PART_POLICY + POLICY_DELIMITER + _etype_tuple_to_str(etype),
partition_book
partition_book,
)
......@@ -1156,9 +1170,10 @@ class HeteroDataName(object):
def __init__(self, is_node, entity_type, data_name):
self._policy = NODE_PART_POLICY if is_node else EDGE_PART_POLICY
if not is_node:
assert isinstance(entity_type, tuple) and len(entity_type) == 3, \
"Expect canonical edge type in a triplet of string, but got " \
assert isinstance(entity_type, tuple) and len(entity_type) == 3, (
"Expect canonical edge type in a triplet of string, but got "
f"{entity_type}."
)
self._entity_type = entity_type
self.data_name = data_name
......@@ -1226,6 +1241,4 @@ def parse_hetero_data_name(name):
entity_type = names[1]
if not is_node:
entity_type = _etype_str_to_tuple(entity_type)
return HeteroDataName(
is_node, entity_type, names[2]
)
return HeteroDataName(is_node, entity_type, names[2])
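Judging from the fields handled here (a node/edge policy prefix, the "~" delimiter, and the three components passed to HeteroDataName), the kvstore data names appear to follow a policy~entity_type~data_name layout. The round trip below is purely illustrative and treats that exact layout as an assumption rather than a guarantee; it does not use DGL's private classes.

```python
# Illustrative only: assumes the 'policy~entity~name' layout suggested by the
# diff; the real HeteroDataName / parse_hetero_data_name may differ in detail.
POLICY_DELIMITER = "~"
NODE_PART_POLICY = "node"
EDGE_PART_POLICY = "edge"


def make_data_name(is_node, entity_type, data_name):
    policy = NODE_PART_POLICY if is_node else EDGE_PART_POLICY
    if not is_node:
        entity_type = ":".join(entity_type)   # canonical etype tuple -> string
    return POLICY_DELIMITER.join([policy, entity_type, data_name])


def parse_data_name(name):
    policy, entity_type, data_name = name.split(POLICY_DELIMITER)
    is_node = policy == NODE_PART_POLICY
    if not is_node:
        entity_type = tuple(entity_type.split(":"))
    return is_node, entity_type, data_name


name = make_data_name(False, ("user", "like", "item"), "weight")
assert name == "edge~user:like:item~weight"
assert parse_data_name(name) == (False, ("user", "like", "item"), "weight")
```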
......@@ -6,16 +6,17 @@ import numpy as np
from .. import backend as F
from ..base import EID, NID
from ..convert import graph, heterograph
from ..sampling import sample_etype_neighbors as local_sample_etype_neighbors
from ..sampling import sample_neighbors as local_sample_neighbors
from ..sampling import (
sample_etype_neighbors as local_sample_etype_neighbors,
sample_neighbors as local_sample_neighbors,
)
from ..subgraph import in_subgraph as local_in_subgraph
from ..utils import toindex
from .. import backend as F
from .rpc import (
Request,
Response,
recv_responses,
register_service,
Request,
Response,
send_requests_to_machine,
)
......@@ -207,6 +208,7 @@ def _in_subgraph(local_g, partition_book, seed_nodes):
# This is a limitation of the current DistDGL design. We should improve it
# later.
class SamplingRequest(Request):
"""Sampling Request"""
......@@ -798,9 +800,7 @@ def sample_neighbors(g, nodes, fanout, edge_dir="in", prob=None, replace=False):
def local_access(local_g, partition_book, local_nids):
# See NOTE 1
_prob = (
[g.edata[prob].local_partition] if prob is not None else None
)
_prob = [g.edata[prob].local_partition] if prob is not None else None
return _sample_neighbors(
local_g,
partition_book,
......
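The hunk header above shows the public signature sample_neighbors(g, nodes, fanout, edge_dir="in", prob=None, replace=False). Here is a hedged usage sketch against that signature; the graph name, config file, and seed IDs are illustrative, and it assumes an initialized distributed context as in the DistTensor sketch earlier.

```python
# Hedged sketch: sample a fixed fan-out neighborhood from a DistGraph.
import torch as th
import dgl

dgl.distributed.initialize(ip_config="ip_config.txt")   # assumed config file
g = dgl.distributed.DistGraph("graph_name")              # assumed partitioned graph

seed_nodes = th.tensor([0, 1, 2])
# Matches the signature shown in the hunk header above.
frontier = dgl.distributed.sample_neighbors(g, seed_nodes, fanout=5, edge_dir="in")
print(frontier.num_edges())
```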
"""Module for mapping between node/edge IDs and node/edge types."""
import numpy as np
from .. import backend as F, utils
from .._ffi.function import _init_api
from .. import backend as F
from .. import utils
class IdMap:
'''A map for converting node/edge IDs to their type IDs and type-wise IDs.
"""A map for converting node/edge IDs to their type IDs and type-wise IDs.
For a heterogeneous graph, DGL assigns an integer ID to each node/edge type;
nodes and edges of different types have independent IDs starting from zero.
......@@ -96,7 +97,8 @@ class IdMap:
for a particular node type in a partition. For example, all nodes of type ``"T"`` in
partition ``i`` have ID range ``id_ranges["T"][i][0]`` to ``id_ranges["T"][i][1]``.
It is the same as the `node_map` argument in `RangePartitionBook`.
'''
"""
def __init__(self, id_ranges):
self.num_parts = list(id_ranges.values())[0].shape[0]
self.num_types = len(id_ranges)
......@@ -105,7 +107,7 @@ class IdMap:
id_ranges = list(id_ranges.values())
id_ranges.sort(key=lambda a: a[0, 0])
for i, id_range in enumerate(id_ranges):
ranges[i::self.num_types] = id_range
ranges[i :: self.num_types] = id_range
map1 = np.cumsum(id_range[:, 1] - id_range[:, 0])
typed_map.append(map1)
......@@ -116,7 +118,7 @@ class IdMap:
self.typed_map = utils.toindex(np.concatenate(typed_map))
def __call__(self, ids):
'''Convert the homogeneous IDs to (type_id, type_wise_id).
"""Convert the homogeneous IDs to (type_id, type_wise_id).
Parameters
----------
......@@ -129,19 +131,23 @@ class IdMap:
Type IDs
per_type_ids : Tensor
Type-wise IDs
'''
"""
if self.num_types == 0:
return F.zeros((len(ids),), F.dtype(ids), F.cpu()), ids
if len(ids) == 0:
return ids, ids
ids = utils.toindex(ids)
ret = _CAPI_DGLHeteroMapIds(ids.todgltensor(),
self.range_start.todgltensor(),
self.range_end.todgltensor(),
self.typed_map.todgltensor(),
self.num_parts, self.num_types)
ret = _CAPI_DGLHeteroMapIds(
ids.todgltensor(),
self.range_start.todgltensor(),
self.range_end.todgltensor(),
self.typed_map.todgltensor(),
self.num_parts,
self.num_types,
)
ret = utils.toindex(ret).tousertensor()
return ret[:len(ids)], ret[len(ids):]
return ret[: len(ids)], ret[len(ids) :]
_init_api("dgl.distributed.id_map")
"""Define sparse embedding and optimizer."""
import torch as th
from .... import backend as F
from .... import utils
from .... import backend as F, utils
from ...dist_tensor import DistTensor
class DistEmbedding:
'''Distributed node embeddings.
"""Distributed node embeddings.
DGL provides a distributed embedding to support models that require learnable embeddings.
DGL's distributed embeddings are mainly used for learning node embeddings of graph models.
......@@ -63,11 +64,23 @@ class DistEmbedding:
the forward computation, users have to invoke
:py:meth:`~dgl.distributed.optim.SparseAdagrad.step` afterwards. Otherwise, there will be
a memory leak.
'''
def __init__(self, num_embeddings, embedding_dim, name=None,
init_func=None, part_policy=None):
self._tensor = DistTensor((num_embeddings, embedding_dim), F.float32, name,
init_func=init_func, part_policy=part_policy)
"""
def __init__(
self,
num_embeddings,
embedding_dim,
name=None,
init_func=None,
part_policy=None,
):
self._tensor = DistTensor(
(num_embeddings, embedding_dim),
F.float32,
name,
init_func=init_func,
part_policy=part_policy,
)
self._trace = []
self._name = name
self._num_embeddings = num_embeddings
......@@ -81,10 +94,10 @@ class DistEmbedding:
# actually fails unit test. ???
# else:
# assert 'th.distributed should be initialized'
self._optm_state = None # track optimizer state
self._optm_state = None # track optimizer state
self._part_policy = part_policy
def __call__(self, idx, device=th.device('cpu')):
def __call__(self, idx, device=th.device("cpu")):
"""
node_ids : th.tensor
Index of the embeddings to collect.
......@@ -104,8 +117,7 @@ class DistEmbedding:
return emb
def reset_trace(self):
'''Reset the traced data.
'''
"""Reset the traced data."""
self._trace = []
@property
......
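The DistEmbedding docstring above stresses that the sparse optimizer's step() must be called after every forward pass that uses the embeddings, otherwise the recorded trace leaks memory. A hedged end-to-end sketch of that workflow follows; the embedding name, sizes, and config file are illustrative, and an initialized distributed context is assumed.

```python
# Hedged sketch of the DistEmbedding + sparse optimizer workflow.
import torch as th
import dgl
from dgl.distributed.optim import SparseAdagrad

dgl.distributed.initialize(ip_config="ip_config.txt")   # assumed config file
g = dgl.distributed.DistGraph("graph_name")              # assumed partitioned graph

emb = dgl.distributed.DistEmbedding(g.num_nodes(), 16, name="node_emb")
optimizer = SparseAdagrad([emb], lr=0.01)

nids = th.arange(0, 32)
h = emb(nids)        # forward: records the trace used for sparse updates
loss = h.sum()
loss.backward()
optimizer.step()     # required after every forward, per the docstring above
```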
......@@ -10,9 +10,9 @@ import dgl
from .... import backend as F
from ...dist_tensor import DistTensor
from ...graph_partition_book import EDGE_PART_POLICY, NODE_PART_POLICY
from ...nn.pytorch import DistEmbedding
from .utils import alltoall_cpu, alltoallv_cpu
from ...graph_partition_book import EDGE_PART_POLICY, NODE_PART_POLICY
EMB_STATES = "emb_states"
WORLD_SIZE = "world_size"
......
"""Define utility functions for shared memory."""
from .. import backend as F
from .. import ndarray as nd
from .. import backend as F, ndarray as nd
from .._ffi.ndarray import empty_shared_mem
DTYPE_DICT = F.data_type_dict
......
......@@ -5,12 +5,14 @@ This kvstore is used when running in the standalone mode
from .. import backend as F
class KVClient(object):
''' The fake KVStore client.
"""The fake KVStore client.
This is to mimic the distributed KVStore client. It's used for DistGraph
in standalone mode.
'''
"""
def __init__(self):
self._data = {}
self._all_possible_part_policy = {}
......@@ -30,25 +32,27 @@ class KVClient(object):
return 1
def barrier(self):
'''barrier'''
"""barrier"""
def register_push_handler(self, name, func):
'''register push handler'''
"""register push handler"""
self._push_handlers[name] = func
def register_pull_handler(self, name, func):
'''register pull handler'''
"""register pull handler"""
self._pull_handlers[name] = func
def add_data(self, name, tensor, part_policy):
'''add data to the client'''
"""add data to the client"""
self._data[name] = tensor
self._gdata_name_list.add(name)
if part_policy.policy_str not in self._all_possible_part_policy:
self._all_possible_part_policy[part_policy.policy_str] = part_policy
def init_data(self, name, shape, dtype, part_policy, init_func, is_gdata=True):
'''add new data to the client'''
def init_data(
self, name, shape, dtype, part_policy, init_func, is_gdata=True
):
"""add new data to the client"""
self._data[name] = init_func(shape, dtype)
if part_policy.policy_str not in self._all_possible_part_policy:
self._all_possible_part_policy[part_policy.policy_str] = part_policy
......@@ -56,38 +60,38 @@ class KVClient(object):
self._gdata_name_list.add(name)
def delete_data(self, name):
'''delete the data'''
"""delete the data"""
del self._data[name]
self._gdata_name_list.remove(name)
def data_name_list(self):
'''get the names of all data'''
"""get the names of all data"""
return list(self._data.keys())
def gdata_name_list(self):
'''get the names of graph data'''
"""get the names of graph data"""
return list(self._gdata_name_list)
def get_data_meta(self, name):
'''get the metadata of data'''
"""get the metadata of data"""
return F.dtype(self._data[name]), F.shape(self._data[name]), None
def push(self, name, id_tensor, data_tensor):
'''push data to kvstore'''
"""push data to kvstore"""
if name in self._push_handlers:
self._push_handlers[name](self._data, name, id_tensor, data_tensor)
else:
F.scatter_row_inplace(self._data[name], id_tensor, data_tensor)
def pull(self, name, id_tensor):
'''pull data from kvstore'''
"""pull data from kvstore"""
if name in self._pull_handlers:
return self._pull_handlers[name](self._data, name, id_tensor)
else:
return F.gather_row(self._data[name], id_tensor)
def map_shared_data(self, partition_book):
'''Mapping shared-memory tensor from server to client.'''
"""Mapping shared-memory tensor from server to client."""
def count_nonzero(self, name):
"""Count nonzero value by pull request from KVServers.
......@@ -116,8 +120,7 @@ class KVClient(object):
return self._data
def union(self, operand1_name, operand2_name, output_name):
"""Compute the union of two mask arrays in the KVStore.
"""
"""Compute the union of two mask arrays in the KVStore."""
self._data[output_name][:] = (
self._data[operand1_name] | self._data[operand2_name]
self._data[operand1_name] | self._data[operand2_name]
)
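In the fake standalone KVClient above, push reduces to an in-place row scatter, pull to a row gather, and union to an element-wise OR of two mask tensors. The stand-alone mimic below reproduces those semantics with plain PyTorch, deliberately not importing the private DGL module.

```python
# Stand-alone mimic of KVClient.push / pull / union semantics with torch:
# push scatters rows in place, pull gathers rows, union ORs two mask arrays.
import torch as th

data = {
    "feat": th.zeros(6, 2),
    "mask_a": th.tensor([1, 0, 1, 0, 0, 0]),
    "mask_b": th.tensor([0, 0, 1, 1, 0, 0]),
    "mask_out": th.zeros(6, dtype=th.long),
}


def push(name, ids, values):
    data[name][ids] = values          # analogous to F.scatter_row_inplace


def pull(name, ids):
    return data[name][ids]            # analogous to F.gather_row


def union(a, b, out):
    data[out][:] = data[a] | data[b]  # element-wise OR of two mask arrays


push("feat", th.tensor([1, 3]), th.ones(2, 2))
print(pull("feat", th.tensor([0, 1, 3])))
union("mask_a", "mask_b", "mask_out")
print(data["mask_out"])               # tensor([1, 0, 1, 1, 0, 0])
```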