Unverified Commit 701b4fcc authored by Quan (Andy) Gan, committed by GitHub

[Sampling] New sampling pipeline plus asynchronous prefetching (#3665)

* initial update

* more

* more

* multi-gpu example

* cluster gcn, finalize homogeneous

* more explanation

* fix

* bunch of fixes

* fix

* RGAT example and more fixes

* shadow-gnn sampler and some changes in unit test

* fix

* wth

* more fixes

* remove shadow+node/edge dataloader tests for possible ux changes

* lints

* add legacy dataloading import just in case

* fix

* update pylint for f-strings

* fix

* lint

* lint

* lint again

* cherry-picking commit fa9f494

* oops

* fix

* add sample_neighbors in dist_graph

* fix

* lint

* fix

* fix

* fix

* fix tutorial

* fix

* fix

* fix

* fix warning

* remove debug

* add get_foo_storage apis

* lint
parent 5152a879
@@ -101,4 +101,4 @@ class AsyncTransferer(object):
        return Transfer(transfer_id, self._handle)
-_init_api("dgl.dataloading.async_transferer")
+_init_api("dataloading.async_transferer", "dgl._dataloading.async_transferer")
"""Cluster-GCN subgraph iterators."""
import os
import pickle
import numpy as np
from ..transform import metis_partition_assignment
from .. import backend as F
from .dataloader import SubgraphIterator
class ClusterGCNSubgraphIterator(SubgraphIterator):
"""Subgraph sampler following that of ClusterGCN.
This sampler first partitions the graph with METIS partitioning, then it caches the nodes of
each partition to a file within the given cache directory.
This is used in conjunction with :class:`dgl.dataloading.pytorch.GraphDataLoader`.
Notes
-----
The graph must be homogeneous and on CPU.
Parameters
----------
g : DGLGraph
The original graph.
num_partitions : int
The number of partitions.
cache_directory : str
The path to the cache directory for storing the partition result.
refresh : bool
If True, recompute the partition.
Examples
--------
Assuming that you have a graph ``g``:
>>> sgiter = dgl.dataloading.ClusterGCNSubgraphIterator(
... g, num_partitions=100, cache_directory='.', refresh=True)
>>> dataloader = dgl.dataloading.GraphDataLoader(sgiter, batch_size=4, num_workers=0)
>>> for subgraph_batch in dataloader:
... train_on(subgraph_batch)
"""
def __init__(self, g, num_partitions, cache_directory, refresh=False):
if os.name == 'nt':
raise NotImplementedError("METIS partitioning is not supported on Windows yet.")
super().__init__(g)
# First see if the cache is already there. If so, directly read from cache.
if not refresh and self._load_parts(cache_directory):
return
# Otherwise, build the cache.
assignment = F.asnumpy(metis_partition_assignment(g, num_partitions))
self._save_parts(assignment, cache_directory)
def _cache_file_path(self, cache_directory):
return os.path.join(cache_directory, 'cluster_gcn_cache')
def _load_parts(self, cache_directory):
path = self._cache_file_path(cache_directory)
if not os.path.exists(path):
return False
with open(path, 'rb') as file_:
self.part_indptr, self.part_indices = pickle.load(file_)
return True
def _save_parts(self, assignment, cache_directory):
os.makedirs(cache_directory, exist_ok=True)
self.part_indices = np.argsort(assignment)
num_nodes_per_part = np.bincount(assignment)
self.part_indptr = np.insert(np.cumsum(num_nodes_per_part), 0, 0)
with open(self._cache_file_path(cache_directory), 'wb') as file_:
pickle.dump((self.part_indptr, self.part_indices), file_)
def __len__(self):
return self.part_indptr.shape[0] - 1
def __getitem__(self, i):
nodes = self.part_indices[self.part_indptr[i]:self.part_indptr[i+1]]
return self.g.subgraph(nodes)
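The caching above boils down to a CSR-like index over node IDs: ``argsort`` groups node IDs by partition and the cumulative ``bincount`` marks where each partition starts. A small self-contained NumPy sketch with a toy assignment (not taken from the diff):

import numpy as np

assignment = np.array([2, 0, 1, 0, 2, 1])                      # partition ID of each node
part_indices = np.argsort(assignment)                          # node IDs grouped by partition
num_nodes_per_part = np.bincount(assignment)                   # [2, 2, 2]
part_indptr = np.insert(np.cumsum(num_nodes_per_part), 0, 0)   # [0, 2, 4, 6]

# Nodes of partition i are part_indices[part_indptr[i]:part_indptr[i+1]],
# which is exactly what __getitem__ above slices out.
assert set(part_indices[part_indptr[0]:part_indptr[1]]) == {1, 3}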
"""Negative samplers"""
from collections.abc import Mapping
from .. import backend as F
from ..sampling import global_uniform_negative_sampling
class _BaseNegativeSampler(object):
def _generate(self, g, eids, canonical_etype):
raise NotImplementedError
def __call__(self, g, eids):
"""Returns negative samples.
Parameters
----------
g : DGLGraph
The graph.
eids : Tensor or dict[etype, Tensor]
The sampled edges in the minibatch.
Returns
-------
tuple[Tensor, Tensor] or dict[etype, tuple[Tensor, Tensor]]
The returned source-destination pairs as negative samples.
"""
if isinstance(eids, Mapping):
eids = {g.to_canonical_etype(k): v for k, v in eids.items()}
neg_pair = {k: self._generate(g, v, k) for k, v in eids.items()}
else:
assert len(g.etypes) == 1, \
'please specify a dict of etypes and ids for graphs with multiple edge types'
neg_pair = self._generate(g, eids, g.canonical_etypes[0])
return neg_pair
class PerSourceUniform(_BaseNegativeSampler):
"""Negative sampler that randomly chooses negative destination nodes
for each source node according to a uniform distribution.
For each edge ``(u, v)`` of type ``(srctype, etype, dsttype)``, DGL generates
:attr:`k` pairs of negative edges ``(u, v')``, where ``v'`` is chosen
uniformly from all the nodes of type ``dsttype``. The resulting edges will
also have type ``(srctype, etype, dsttype)``.
Parameters
----------
k : int
The number of negative samples per edge.
Examples
--------
>>> g = dgl.graph(([0, 1, 2], [1, 2, 3]))
>>> neg_sampler = dgl.dataloading.negative_sampler.PerSourceUniform(2)
>>> neg_sampler(g, torch.tensor([0, 1]))
(tensor([0, 0, 1, 1]), tensor([1, 0, 2, 3]))
"""
def __init__(self, k):
self.k = k
def _generate(self, g, eids, canonical_etype):
_, _, vtype = canonical_etype
shape = F.shape(eids)
dtype = F.dtype(eids)
ctx = F.context(eids)
shape = (shape[0] * self.k,)
src, _ = g.find_edges(eids, etype=canonical_etype)
src = F.repeat(src, self.k, 0)
dst = F.randint(shape, dtype, ctx, 0, g.number_of_nodes(vtype))
return src, dst
# Alias
Uniform = PerSourceUniform
class GlobalUniform(_BaseNegativeSampler):
"""Negative sampler that randomly chooses negative source-destination pairs according
to a uniform distribution.
For each edge ``(u, v)`` of type ``(srctype, etype, dsttype)``, DGL generates at most
:attr:`k` pairs of negative edges ``(u', v')``, where ``u'`` is chosen uniformly from
all the nodes of type ``srctype`` and ``v'`` is chosen uniformly from all the nodes
of type ``dsttype``. The resulting edges will also have type
``(srctype, etype, dsttype)``. DGL guarantees that the sampled pairs will not have
edges in between.
Parameters
----------
k : int
The desired number of negative samples to generate per edge.
exclude_self_loops : bool, optional
Whether to exclude self-loops from negative samples. (Default: True)
replace : bool, optional
Whether to sample with replacement. Setting it to True will make things
faster. (Default: True)
redundancy : float, optional
Indicates how many more negative samples to generate during rejection sampling
before finding the unique pairs.
Increasing it will increase the likelihood of getting :attr:`k` negative samples
per edge, but will also take more time and memory.
(Default: automatically determined by the density of graph)
Notes
-----
This negative sampler will try to generate as many negative samples as possible, but
it may rarely return less than :attr:`k` negative samples per edge.
This is more likely to happen if a graph is so small or dense that not many unique
negative samples exist.
Examples
--------
>>> g = dgl.graph(([0, 1, 2], [1, 2, 3]))
>>> neg_sampler = dgl.dataloading.negative_sampler.GlobalUniform(2, True)
>>> neg_sampler(g, torch.LongTensor([0, 1]))
(tensor([0, 1, 3, 2]), tensor([2, 0, 2, 1]))
"""
def __init__(self, k, exclude_self_loops=True, replace=False, redundancy=None):
self.k = k
self.exclude_self_loops = exclude_self_loops
self.replace = replace
self.redundancy = redundancy
def _generate(self, g, eids, canonical_etype):
return global_uniform_negative_sampling(
g, len(eids) * self.k, self.exclude_self_loops, self.replace,
canonical_etype, self.redundancy)
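The ``_generate`` hook above is the only thing a new negative sampler needs to implement. A minimal sketch of a per-destination variant, written as if it lived in this module (it reuses ``F`` and ``_BaseNegativeSampler`` from the code above; the class name is illustrative, not part of the diff):

class PerDestinationUniform(_BaseNegativeSampler):
    """For each edge (u, v), draw k negative pairs (u', v) with u' chosen uniformly."""
    def __init__(self, k):
        self.k = k

    def _generate(self, g, eids, canonical_etype):
        utype, _, _ = canonical_etype
        shape = (F.shape(eids)[0] * self.k,)
        dtype = F.dtype(eids)
        ctx = F.context(eids)
        _, dst = g.find_edges(eids, etype=canonical_etype)
        dst = F.repeat(dst, self.k, 0)
        src = F.randint(shape, dtype, ctx, 0, g.number_of_nodes(utype))
        return src, dst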
@@ -10,7 +10,6 @@ from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
from ..dataloader import NodeCollator, EdgeCollator, GraphCollator, SubgraphIterator
from ...distributed import DistGraph
-from ...distributed import DistDataLoader
from ...ndarray import NDArray as DGLNDArray
from ... import backend as F
from ...base import DGLError
@@ -26,6 +25,10 @@ PYTORCH_VER = LooseVersion(th.__version__)
PYTORCH_16 = PYTORCH_VER >= LooseVersion("1.6.0")
PYTORCH_17 = PYTORCH_VER >= LooseVersion("1.7.0")

+def _check_graph_type(g):
+    if isinstance(g, DistGraph):
+        raise TypeError("Please use DistNodeDataLoader or DistEdgeDataLoader for DistGraph")
+
def _create_dist_sampler(dataset, dataloader_kwargs, ddp_seed):
    # Note: will change the content of dataloader_kwargs
    dist_sampler_kwargs = {'shuffle': dataloader_kwargs['shuffle']}
@@ -166,14 +169,6 @@ class _ScalarDataBatcher(th.utils.data.IterableDataset):
        """Set epoch number for distributed training."""
        self.epoch = epoch

-def _remove_kwargs_dist(kwargs):
-    if 'num_workers' in kwargs:
-        del kwargs['num_workers']
-    if 'pin_memory' in kwargs:
-        del kwargs['pin_memory']
-        print('Distributed DataLoader does not support pin_memory')
-    return kwargs
-
# The following code is a fix to the PyTorch-specific issue in
# https://github.com/dmlc/dgl/issues/2137
#
@@ -290,14 +285,14 @@ def _restore_storages(subgs, g):
        _restore_subgraph_storage(subg, g)

class _NodeCollator(NodeCollator):
-    def collate(self, items):
+    def collate(self, items):  # pylint: disable=missing-docstring
        # input_nodes, output_nodes, blocks
        result = super().collate(items)
        _pop_storages(result[-1], self.g)
        return result

class _EdgeCollator(EdgeCollator):
-    def collate(self, items):
+    def collate(self, items):  # pylint: disable=missing-docstring
        if self.negative_sampler is None:
            # input_nodes, pair_graph, blocks
            result = super().collate(items)
@@ -381,10 +376,10 @@ def _background_node_dataloader(dl_iter, g, device, results, load_input, load_ou
class _NodeDataLoaderIter:
-    def __init__(self, node_dataloader):
+    def __init__(self, node_dataloader, iter_):
        self.device = node_dataloader.device
        self.node_dataloader = node_dataloader
-        self.iter_ = iter(node_dataloader.dataloader)
+        self.iter_ = iter_
        self.async_load = node_dataloader.async_load and (
            F.device_type(self.device) == 'cuda')
        if self.async_load:
@@ -418,10 +413,10 @@ class _NodeDataLoaderIter:
        return input_nodes, output_nodes, blocks

class _EdgeDataLoaderIter:
-    def __init__(self, edge_dataloader):
+    def __init__(self, edge_dataloader, iter_):
        self.device = edge_dataloader.device
        self.edge_dataloader = edge_dataloader
-        self.iter_ = iter(edge_dataloader.dataloader)
+        self.iter_ = iter_
    # Make this an iterator for PyTorch Lightning compatibility
    def __iter__(self):
@@ -441,9 +436,9 @@ class _EdgeDataLoaderIter:
        return result

class _GraphDataLoaderIter:
-    def __init__(self, graph_dataloader):
+    def __init__(self, graph_dataloader, iter_):
        self.dataloader = graph_dataloader
-        self.iter_ = iter(graph_dataloader.dataloader)
+        self.iter_ = iter_
    def __iter__(self):
        return self
@@ -490,14 +485,9 @@ def _init_dataloader(collator, device, dataloader_kwargs, use_ddp, ddp_seed):
    else:
        dist_sampler = None

-    dataloader = DataLoader(
-        dataset,
-        collate_fn=collator.collate,
-        **dataloader_kwargs)
-    return use_scalar_batcher, scalar_batcher, dataloader, dist_sampler
+    return use_scalar_batcher, scalar_batcher, dataset, collator, dist_sampler

-class NodeDataLoader:
+class NodeDataLoader(DataLoader):
    """PyTorch dataloader for batch-iterating over a set of nodes, generating the list
    of message flow graphs (MFGs) as computation dependency of the said minibatch.
@@ -600,6 +590,7 @@ class NodeDataLoader:
    def __init__(self, g, nids, graph_sampler, device=None, use_ddp=False, ddp_seed=0,
                 load_input=None, load_output=None, async_load=False, **kwargs):
+        _check_graph_type(g)
        collator_kwargs = {}
        dataloader_kwargs = {}
        for k, v in kwargs.items():
@@ -608,65 +599,42 @@ class NodeDataLoader:
            else:
                dataloader_kwargs[k] = v

-        if isinstance(g, DistGraph):
-            if device is None:
-                # for the distributed case default to the CPU
-                device = 'cpu'
-            assert device == 'cpu', 'Only cpu is supported in the case of a DistGraph.'
-            # Distributed DataLoader currently does not support heterogeneous graphs
-            # and does not copy features. Fallback to normal solution
-            self.collator = NodeCollator(g, nids, graph_sampler, **collator_kwargs)
-            _remove_kwargs_dist(dataloader_kwargs)
-            self.dataloader = DistDataLoader(self.collator.dataset,
-                                             collate_fn=self.collator.collate,
-                                             **dataloader_kwargs)
-            self.is_distributed = True
-        else:
-            if device is None:
-                # default to the same device the graph is on
-                device = th.device(g.device)
-            if not g.is_homogeneous:
-                if load_input or load_output:
-                    raise DGLError('load_input/load_output not supported for heterograph yet.')
-            self.load_input = {} if load_input is None else load_input
-            self.load_output = {} if load_output is None else load_output
-            self.async_load = async_load
-            # if the sampler supports it, tell it to output to the specified device.
-            # But if async_load is enabled, set_output_context should be skipped as
-            # we'd like to avoid any graph/data transfer graphs across devices in
-            # sampler. Such transfer will be handled in dataloader.
-            num_workers = dataloader_kwargs.get('num_workers', 0)
-            if ((not async_load) and
-                    callable(getattr(graph_sampler, "set_output_context", None)) and
-                    num_workers == 0):
-                graph_sampler.set_output_context(to_dgl_context(device))
-            self.collator = _NodeCollator(g, nids, graph_sampler, **collator_kwargs)
-            self.use_scalar_batcher, self.scalar_batcher, self.dataloader, self.dist_sampler = \
-                _init_dataloader(self.collator, device, dataloader_kwargs, use_ddp, ddp_seed)
+        if device is None:
+            # default to the same device the graph is on
+            device = th.device(g.device)
+
+        if not g.is_homogeneous:
+            if load_input or load_output:
+                raise DGLError('load_input/load_output not supported for heterograph yet.')
+        self.load_input = {} if load_input is None else load_input
+        self.load_output = {} if load_output is None else load_output
+        self.async_load = async_load
+
+        # if the sampler supports it, tell it to output to the specified device.
+        # But if async_load is enabled, set_output_context should be skipped as
+        # we'd like to avoid any graph/data transfer graphs across devices in
+        # sampler. Such transfer will be handled in dataloader.
+        num_workers = dataloader_kwargs.get('num_workers', 0)
+        if ((not async_load) and
+                callable(getattr(graph_sampler, "set_output_context", None)) and
+                num_workers == 0):
+            graph_sampler.set_output_context(to_dgl_context(device))
+
+        self.collator = _NodeCollator(g, nids, graph_sampler, **collator_kwargs)
+        self.use_scalar_batcher, self.scalar_batcher, self.dataloader, self.dist_sampler = \
+            _init_dataloader(self.collator, device, dataloader_kwargs, use_ddp, ddp_seed)

        self.use_ddp = use_ddp
        self.is_distributed = False

        # Precompute the CSR and CSC representations so each subprocess does not
        # duplicate.
        if num_workers > 0:
            g.create_formats_()

        self.device = device

    def __iter__(self):
-        """Return the iterator of the data loader."""
-        if self.is_distributed:
-            # Directly use the iterator of DistDataLoader, which doesn't copy features anyway.
-            return iter(self.dataloader)
-        else:
-            return _NodeDataLoaderIter(self)
-
-    def __len__(self):
-        """Return the number of batches of the data loader."""
-        return len(self.dataloader)
+        return _NodeDataLoaderIter(self, super().__iter__())

    def set_epoch(self, epoch):
        """Sets the epoch number for the underlying sampler which ensures all replicas
@@ -689,7 +657,7 @@ class NodeDataLoader:
        else:
            raise DGLError('set_epoch is only available when use_ddp is True.')

-class EdgeDataLoader:
+class EdgeDataLoader(DataLoader):
    """PyTorch dataloader for batch-iterating over a set of edges, generating the list
    of message flow graphs (MFGs) as computation dependency of the said minibatch for
    edge classification, edge regression, and link prediction.
@@ -897,8 +865,9 @@ class EdgeDataLoader:
    * Link prediction on heterogeneous graph: RGCN for link prediction.
    """
    collator_arglist = inspect.getfullargspec(EdgeCollator).args
-    def __init__(self, g, eids, graph_sampler, device='cpu', use_ddp=False, ddp_seed=0, **kwargs):
+    def __init__(self, g, eids, graph_sampler, device='cpu', use_ddp=False, ddp_seed=0,
+                 **kwargs):
+        _check_graph_type(g)
        collator_kwargs = {}
        dataloader_kwargs = {}
        for k, v in kwargs.items():
@@ -907,53 +876,30 @@ class EdgeDataLoader:
            else:
                dataloader_kwargs[k] = v

-        if isinstance(g, DistGraph):
-            if device is None:
-                # for the distributed case default to the CPU
-                device = 'cpu'
-            assert device == 'cpu', 'Only cpu is supported in the case of a DistGraph.'
-            # Distributed DataLoader currently does not support heterogeneous graphs
-            # and does not copy features. Fallback to normal solution
-            self.collator = EdgeCollator(g, eids, graph_sampler, **collator_kwargs)
-            _remove_kwargs_dist(dataloader_kwargs)
-            self.dataloader = DistDataLoader(self.collator.dataset,
-                                             collate_fn=self.collator.collate,
-                                             **dataloader_kwargs)
-            self.is_distributed = True
-        else:
        if device is None:
            # default to the same device the graph is on
            device = th.device(g.device)

        # if the sampler supports it, tell it to output to the
        # specified device
        num_workers = dataloader_kwargs.get('num_workers', 0)
        if callable(getattr(graph_sampler, "set_output_context", None)) and num_workers == 0:
            graph_sampler.set_output_context(to_dgl_context(device))

-        self.collator = _EdgeCollator(g, eids, graph_sampler, **collator_kwargs)
-        self.use_scalar_batcher, self.scalar_batcher, self.dataloader, self.dist_sampler = \
+        self.collator = EdgeCollator(g, eids, graph_sampler, **collator_kwargs)
+        self.use_scalar_batcher, self.scalar_batcher, dataset, collator, self.dist_sampler = \
            _init_dataloader(self.collator, device, dataloader_kwargs, use_ddp, ddp_seed)

        self.use_ddp = use_ddp
-        self.is_distributed = False
+        super().__init__(dataset, collate_fn=collator.collate, **dataloader_kwargs)

        # Precompute the CSR and CSC representations so each subprocess does not duplicate.
        if num_workers > 0:
            g.create_formats_()

        self.device = device

    def __iter__(self):
-        """Return the iterator of the data loader."""
-        if self.is_distributed:
-            # Directly use the iterator of DistDataLoader, which doesn't copy features anyway.
-            return iter(self.dataloader)
-        else:
-            return _EdgeDataLoaderIter(self)
-
-    def __len__(self):
-        """Return the number of batches of the data loader."""
-        return len(self.dataloader)
+        return _EdgeDataLoaderIter(self, super().__iter__())

    def set_epoch(self, epoch):
        """Sets the epoch number for the underlying sampler which ensures all replicas
@@ -976,7 +922,7 @@ class EdgeDataLoader:
        else:
            raise DGLError('set_epoch is only available when use_ddp is True.')

-class GraphDataLoader:
+class GraphDataLoader(DataLoader):
    """PyTorch dataloader for batch-iterating over a set of graphs, generating the batched
    graph and corresponding label tensor (if provided) of the said minibatch.
@@ -1023,7 +969,6 @@ class GraphDataLoader:
    ...     train_on(batched_graph, labels)
    """
    collator_arglist = inspect.getfullargspec(GraphCollator).args
    def __init__(self, dataset, collate_fn=None, use_ddp=False, ddp_seed=0, **kwargs):
        collator_kwargs = {}
        dataloader_kwargs = {}
@@ -1058,14 +1003,11 @@ class GraphDataLoader:
        if use_ddp:
            self.dist_sampler = _create_dist_sampler(dataset, dataloader_kwargs, ddp_seed)
            dataloader_kwargs['sampler'] = self.dist_sampler

-        self.dataloader = DataLoader(dataset=dataset,
-                                     collate_fn=self.collate,
-                                     **dataloader_kwargs)
+        super().__init__(dataset, collate_fn=self.collate, **dataloader_kwargs)

    def __iter__(self):
        """Return the iterator of the data loader."""
-        return _GraphDataLoaderIter(self)
+        return _GraphDataLoaderIter(self, super().__iter__())

    def __len__(self):
        """Return the number of batches of the data loader."""
"""ShaDow-GNN subgraph samplers."""
from ..utils import prepare_tensor_or_dict
from ..base import NID
from .. import transform
from ..sampling import sample_neighbors
from .neighbor import NeighborSamplingMixin
from .dataloader import exclude_edges, Sampler
class ShaDowKHopSampler(NeighborSamplingMixin, Sampler):
"""K-hop subgraph sampler used by
`ShaDow-GNN <https://arxiv.org/abs/2012.01380>`__.
It performs node-wise neighbor sampling, but instead of returning a list of
MFGs, it returns a single subgraph induced by all the sampled nodes. The
seed nodes from which the neighbors are sampled appear first among the
induced nodes of the subgraph.
This is used in conjunction with :class:`dgl.dataloading.pytorch.NodeDataLoader`
and :class:`dgl.dataloading.pytorch.EdgeDataLoader`.
Parameters
----------
fanouts : list[int] or list[dict[etype, int]]
List of neighbors to sample per edge type for each GNN layer, with the i-th
element being the fanout for the i-th GNN layer.
If only a single integer is provided, DGL assumes that every edge type
will have the same fanout.
If -1 is provided for one edge type on one layer, then all inbound edges
of that edge type will be included.
replace : bool, default True
Whether to sample with replacement
prob : str, optional
If given, the probability of each neighbor being sampled is proportional
to the edge feature value with the given name in ``g.edata``. The feature must be
a scalar on each edge.
Examples
--------
To train a 3-layer GNN for node classification on a set of nodes ``train_nid`` on
a homogeneous graph where each node takes messages from 5, 10, 15 neighbors for
the first, second, and third layer respectively (assuming the backend is PyTorch):
>>> g = dgl.data.CoraFullDataset()[0]
>>> sampler = dgl.dataloading.ShaDowKHopSampler([5, 10, 15])
>>> dataloader = dgl.dataloading.NodeDataLoader(
... g, torch.arange(g.num_nodes()), sampler,
... batch_size=5, shuffle=True, drop_last=False, num_workers=4)
>>> for input_nodes, output_nodes, (subgraph,) in dataloader:
... print(subgraph)
... assert torch.equal(input_nodes, subgraph.ndata[dgl.NID])
... assert torch.equal(input_nodes[:output_nodes.shape[0]], output_nodes)
... break
Graph(num_nodes=529, num_edges=3796,
ndata_schemes={'label': Scheme(shape=(), dtype=torch.int64),
'feat': Scheme(shape=(8710,), dtype=torch.float32),
'_ID': Scheme(shape=(), dtype=torch.int64)}
edata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)})
If training on a heterogeneous graph and you want different number of neighbors for each
edge type, one should instead provide a list of dicts. Each dict would specify the
number of neighbors to pick per edge type.
>>> sampler = dgl.dataloading.ShaDowKHopSampler([
... {('user', 'follows', 'user'): 5,
... ('user', 'plays', 'game'): 4,
... ('game', 'played-by', 'user'): 3}] * 3)
If you would like non-uniform neighbor sampling:
>>> g.edata['p'] = torch.rand(g.num_edges()) # any non-negative 1D vector works
>>> sampler = dgl.dataloading.MultiLayerNeighborSampler([5, 10, 15], prob='p')
"""
def __init__(self, fanouts, replace=False, prob=None, output_ctx=None):
super().__init__(output_ctx)
self.fanouts = fanouts
self.replace = replace
self.prob = prob
self.set_output_context(output_ctx)
def sample(self, g, seed_nodes, exclude_eids=None):
self._build_fanout(len(self.fanouts), g)
self._build_prob_arrays(g)
seed_nodes = prepare_tensor_or_dict(g, seed_nodes, 'seed nodes')
output_nodes = seed_nodes
for i in range(len(self.fanouts)):
fanout = self.fanouts[i]
frontier = sample_neighbors(
g, seed_nodes, fanout, replace=self.replace, prob=self.prob_arrays)
block = transform.to_block(frontier, seed_nodes)
seed_nodes = block.srcdata[NID]
subg = g.subgraph(seed_nodes, relabel_nodes=True)
subg = exclude_edges(subg, exclude_eids, self.output_device)
return seed_nodes, output_nodes, [subg]
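Because the seed nodes are guaranteed to come first among the induced nodes, the model output for the seeds can be recovered by slicing. A hedged usage sketch (``g``, ``train_nid``, ``model`` and ``loss_fn`` are placeholders; a PyTorch backend and a homogeneous graph with ``feat``/``label`` node data are assumed):

import dgl

sampler = dgl.dataloading.ShaDowKHopSampler([5, 10, 15])
dataloader = dgl.dataloading.NodeDataLoader(
    g, train_nid, sampler, batch_size=1024, shuffle=True)
for input_nodes, output_nodes, (subgraph,) in dataloader:
    h = model(subgraph, subgraph.ndata['feat'])      # one row per induced node
    h_seeds = h[:output_nodes.shape[0]]              # seeds occupy the first rows
    loss = loss_fn(h_seeds, subgraph.ndata['label'][:output_nodes.shape[0]])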
@@ -14,15 +14,12 @@ Read the user guide :ref:`guide-minibatch`.
This package is experimental and the interfaces may be subject
to changes in future releases. It currently only has implementations in PyTorch.
"""
-from .neighbor import *
-from .dataloader import *
+from .. import backend as F
+from .neighbor_sampler import *
from .cluster_gcn import *
from .shadow import *
+from .base import *
from . import negative_sampler
-from .async_transferer import AsyncTransferer
-from .. import backend as F

if F.get_preferred_backend() == 'pytorch':
-    from .pytorch import *
+    from .dataloader import *
+    from .dist_dataloader import *
"""Base classes and functionalities for dataloaders"""
from collections import Mapping
from ..base import NID, EID
from ..convert import heterograph
from .. import backend as F
from ..transform import compact_graphs
from ..frame import LazyFeature
from ..utils import recursive_apply
def _set_lazy_features(x, xdata, feature_names):
if feature_names is None:
return
if not isinstance(feature_names, Mapping):
xdata.update({k: LazyFeature(k) for k in feature_names})
else:
for type_, names in feature_names.items():
x[type_].data.update({k: LazyFeature(k) for k in names})
def set_node_lazy_features(g, feature_names):
"""Set lazy features for ``g.ndata`` if :attr:`feature_names` is a list of strings,
or ``g.nodes[ntype].data`` if :attr:`feature_names` is a dict of list of strings.
"""
return _set_lazy_features(g.nodes, g.ndata, feature_names)
def set_edge_lazy_features(g, feature_names):
"""Set lazy features for ``g.edata`` if :attr:`feature_names` is a list of strings,
or ``g.edges[etype].data`` if :attr:`feature_names` is a dict of list of strings.
"""
return _set_lazy_features(g.edges, g.edata, feature_names)
def set_src_lazy_features(g, feature_names):
"""Set lazy features for ``g.srcdata`` if :attr:`feature_names` is a list of strings,
or ``g.srcnodes[srctype].data`` if :attr:`feature_names` is a dict of list of strings.
"""
return _set_lazy_features(g.srcnodes, g.srcdata, feature_names)
def set_dst_lazy_features(g, feature_names):
"""Set lazy features for ``g.dstdata`` if :attr:`feature_names` is a list of strings,
or ``g.dstnodes[dsttype].data`` if :attr:`feature_names` is a dict of list of strings.
"""
return _set_lazy_features(g.dstnodes, g.dstdata, feature_names)
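A minimal sketch of how a sampler uses these helpers (the feature names ``'feat'`` and ``'weight'`` are illustrative): the calls only attach ``LazyFeature`` placeholders, and the dataloader later materializes them from the parent graph through the subgraph's ``dgl.NID``/``dgl.EID`` mapping.

def tag_for_prefetch(subgraph):
    # Placeholders only; the actual feature tensors are fetched by the dataloader.
    set_node_lazy_features(subgraph, ['feat'])     # ends up in subgraph.ndata['feat']
    set_edge_lazy_features(subgraph, ['weight'])   # ends up in subgraph.edata['weight']
    return subgraph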
class BlockSampler(object):
"""BlockSampler is an abstract class assuming to take in a set of nodes whose
outputs are to compute, and return a list of blocks.
Moreover, it assumes that the input node features will be put in the first block's
``srcdata``, the output node labels will be put in the last block's ``dstdata``, and
the edge data will be put in all the blocks' ``edata``.
"""
def __init__(self, prefetch_node_feats=None, prefetch_labels=None,
prefetch_edge_feats=None, output_device=None):
self.prefetch_node_feats = prefetch_node_feats or []
self.prefetch_labels = prefetch_labels or []
self.prefetch_edge_feats = prefetch_edge_feats or []
self.output_device = output_device
def sample_blocks(self, g, seed_nodes, exclude_eids=None):
"""Generates a list of blocks from the given seed nodes.
This function must return a triplet where the first element is the input node IDs
for the first GNN layer (a tensor or a dict of tensors for heterogeneous graphs),
the second element is the output node IDs for the last GNN layer, and the third
element is the said list of blocks.
"""
raise NotImplementedError
def assign_lazy_features(self, result):
"""Assign lazy features for prefetching."""
# A LazyFeature is a placeholder telling the dataloader where and which IDs
# to prefetch. It has the signature LazyFeature(name, id_). id_ can be None
# if the LazyFeature is set into one of the subgraph's ``xdata``, in which case the
# dataloader will infer from the subgraph's ``xdata[dgl.NID]`` (or ``xdata[dgl.EID]``
# if the LazyFeature is set as edge features).
#
# If you want to prefetch things other than ndata and edata, you can also
# return a LazyFeature(name, id_). If a LazyFeature is returned in places other than
# in a graph's ndata/edata/srcdata/dstdata, the DataLoader will prefetch it
# from its dictionary ``other_data``.
# For instance, you can run
#
# return blocks, LazyFeature('other_feat', id_)
#
# To make it work with the sampler returning the stuff above, your dataloader
# needs to have the following
#
# dataloader.attach_data('other_feat', tensor)
#
# Then you can run
#
# for blocks, other_feat in dataloader:
# train_on(blocks, other_feat)
input_nodes, output_nodes, blocks = result
set_src_lazy_features(blocks[0], self.prefetch_node_feats)
set_dst_lazy_features(blocks[-1], self.prefetch_labels)
for block in blocks:
set_edge_lazy_features(block, self.prefetch_edge_feats)
return input_nodes, output_nodes, blocks
def sample(self, g, seed_nodes):
"""Sample a list of blocks from the given seed nodes."""
result = self.sample_blocks(g, seed_nodes)
return self.assign_lazy_features(result)
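A hedged sketch of the ``sample_blocks`` contract: a custom sampler that takes every in-neighbor for a fixed number of hops. The class name and ``num_layers`` are illustrative; it uses the public ``dgl.in_subgraph``/``dgl.to_block`` APIs and ignores ``exclude_eids`` for brevity.

import dgl

class FullNeighborBlockSampler(BlockSampler):
    def __init__(self, num_layers, **kwargs):
        super().__init__(**kwargs)
        self.num_layers = num_layers

    def sample_blocks(self, g, seed_nodes, exclude_eids=None):
        output_nodes = seed_nodes
        blocks = []
        for _ in range(self.num_layers):
            frontier = dgl.in_subgraph(g, seed_nodes)   # all inbound edges of the seeds
            block = dgl.to_block(frontier, seed_nodes)  # bipartite message flow graph
            seed_nodes = block.srcdata[dgl.NID]
            blocks.insert(0, block)
        return seed_nodes, output_nodes, blocks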
def _find_exclude_eids_with_reverse_id(g, eids, reverse_eid_map):
if isinstance(eids, Mapping):
eids = {g.to_canonical_etype(k): v for k, v in eids.items()}
exclude_eids = {
k: F.cat([v, F.gather_row(reverse_eid_map[k], v)], 0)
for k, v in eids.items()}
else:
exclude_eids = F.cat([eids, F.gather_row(reverse_eid_map, eids)], 0)
return exclude_eids
def _find_exclude_eids_with_reverse_types(g, eids, reverse_etype_map):
exclude_eids = {g.to_canonical_etype(k): v for k, v in eids.items()}
reverse_etype_map = {
g.to_canonical_etype(k): g.to_canonical_etype(v)
for k, v in reverse_etype_map.items()}
exclude_eids.update({reverse_etype_map[k]: v for k, v in exclude_eids.items()})
return exclude_eids
def _find_exclude_eids(g, exclude_mode, eids, **kwargs):
if exclude_mode is None:
return None
elif F.is_tensor(exclude_mode) or (
isinstance(exclude_mode, Mapping) and
all(F.is_tensor(v) for v in exclude_mode.values())):
return exclude_mode
elif exclude_mode == 'self':
return eids
elif exclude_mode == 'reverse_id':
return _find_exclude_eids_with_reverse_id(g, eids, kwargs['reverse_eid_map'])
elif exclude_mode == 'reverse_types':
return _find_exclude_eids_with_reverse_types(g, eids, kwargs['reverse_etype_map'])
else:
raise ValueError('unsupported mode {}'.format(exclude_mode))
def find_exclude_eids(g, seed_edges, exclude, reverse_eids=None, reverse_etypes=None,
output_device=None):
"""Find all edge IDs to exclude according to :attr:`exclude_mode`.
Parameters
----------
g : DGLGraph
The graph.
exclude_mode : str, optional
Can be either of the following,
None (default)
Does not exclude any edge.
Tensor or dict[etype, Tensor]
Exclude the given edge IDs.
'self'
Exclude the given edges themselves but nothing else.
'reverse_id'
Exclude all edges specified in ``eids``, as well as their reverse edges
of the same edge type.
The mapping from each edge ID to its reverse edge ID is specified in
the keyword argument ``reverse_eid_map``.
This mode assumes that the reverse of an edge with ID ``e`` and type
``etype`` will have ID ``reverse_eid_map[e]`` and type ``etype``.
'reverse_types'
Exclude all edges specified in ``eids``, as well as their reverse
edges of the corresponding edge types.
The mapping from each edge type to its reverse edge type is specified
in the keyword argument ``reverse_etype_map``.
This mode assumes that the reverse of an edge with ID ``e`` and type ``etype``
will have ID ``e`` and type ``reverse_etype_map[etype]``.
eids : Tensor or dict[etype, Tensor]
The edge IDs.
reverse_eids : Tensor or dict[etype, Tensor]
The mapping from edge ID to its reverse edge ID.
reverse_etypes : dict[etype, etype]
The mapping from edge etype to its reverse edge type.
output_device : device
The device of the output edge IDs.
"""
exclude_eids = _find_exclude_eids(
g,
exclude,
seed_edges,
reverse_eid_map=reverse_eids,
reverse_etype_map=reverse_etypes)
if exclude_eids is not None:
exclude_eids = recursive_apply(
exclude_eids, lambda x: x.to(output_device))
return exclude_eids
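A hedged, self-contained sketch of the ``'reverse_id'`` mode. The toy graph stores each undirected edge as two directed edges so that edge ``i`` and edge ``i + 3`` are reverses of each other; ``find_exclude_eids`` is the function defined above and is assumed to be in scope.

import torch
import dgl

g = dgl.graph((torch.tensor([0, 1, 2, 1, 2, 0]),
               torch.tensor([1, 2, 0, 0, 1, 2])))
reverse_eids = torch.cat([torch.arange(3, 6), torch.arange(0, 3)])   # eid -> reverse eid

excluded = find_exclude_eids(g, torch.tensor([0, 5]), 'reverse_id',
                             reverse_eids=reverse_eids)
# tensor([0, 5, 3, 2]): the seed edges plus their reverse counterparts, ready to
# be passed to a block sampler as exclude_eids.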
class EdgeBlockSampler(object):
"""Adapts a :class:`BlockSampler` object's :attr:`sample` method for edge
classification and link prediction.
"""
def __init__(self, block_sampler, exclude=None, reverse_eids=None,
reverse_etypes=None, negative_sampler=None, prefetch_node_feats=None,
prefetch_labels=None, prefetch_edge_feats=None):
self.reverse_eids = reverse_eids
self.reverse_etypes = reverse_etypes
self.exclude = exclude
self.block_sampler = block_sampler
self.negative_sampler = negative_sampler
self.prefetch_node_feats = prefetch_node_feats or []
self.prefetch_labels = prefetch_labels or []
self.prefetch_edge_feats = prefetch_edge_feats or []
self.output_device = block_sampler.output_device
def _build_neg_graph(self, g, seed_edges):
neg_srcdst = self.negative_sampler(g, seed_edges)
if not isinstance(neg_srcdst, Mapping):
assert len(g.canonical_etypes) == 1, \
'graph has multiple or no edge types; '\
'please return a dict in negative sampler.'
neg_srcdst = {g.canonical_etypes[0]: neg_srcdst}
dtype = F.dtype(list(neg_srcdst.values())[0][0])
neg_edges = {
etype: neg_srcdst.get(etype, (F.tensor([], dtype), F.tensor([], dtype)))
for etype in g.canonical_etypes}
neg_pair_graph = heterograph(
neg_edges, {ntype: g.num_nodes(ntype) for ntype in g.ntypes})
return neg_pair_graph
def assign_lazy_features(self, result):
"""Assign lazy features for prefetching."""
pair_graph = result[1]
blocks = result[-1]
set_src_lazy_features(blocks[0], self.prefetch_node_feats)
set_edge_lazy_features(pair_graph, self.prefetch_labels)
for block in blocks:
set_edge_lazy_features(block, self.prefetch_edge_feats)
# In-place updates
return result
def sample(self, g, seed_edges):
"""Samples a list of blocks, as well as a subgraph containing the sampled
edges from the original graph.
If :attr:`negative_sampler` is given, also returns another graph containing the
negative pairs as edges.
"""
exclude = self.exclude
pair_graph = g.edge_subgraph(
seed_edges, relabel_nodes=False, output_device=self.output_device)
eids = pair_graph.edata[EID]
if self.negative_sampler is not None:
neg_graph = self._build_neg_graph(g, seed_edges)
pair_graph, neg_graph = compact_graphs([pair_graph, neg_graph])
else:
pair_graph = compact_graphs(pair_graph)
pair_graph.edata[EID] = eids
seed_nodes = pair_graph.ndata[NID]
exclude_eids = find_exclude_eids(
g, seed_edges, exclude, self.reverse_eids, self.reverse_etypes,
self.output_device)
input_nodes, _, blocks = self.block_sampler.sample_blocks(g, seed_nodes, exclude_eids)
if self.negative_sampler is None:
return self.assign_lazy_features((input_nodes, pair_graph, blocks))
else:
return self.assign_lazy_features((input_nodes, pair_graph, neg_graph, blocks))
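A hedged end-to-end sketch of the ``sample`` contract above for link prediction, written as if ``EdgeBlockSampler`` were in scope from this module. ``NeighborSampler`` comes from this PR's ``neighbor_sampler`` module; the toy graph, the batch of seed edge IDs, and the choice of 5 negatives per edge are illustrative.

import torch
import dgl
from dgl.dataloading.neighbor_sampler import NeighborSampler

g = dgl.rand_graph(100, 500)                        # toy homogeneous graph
sampler = EdgeBlockSampler(
    NeighborSampler([10, 10]),
    exclude='self',
    negative_sampler=dgl.dataloading.negative_sampler.Uniform(5))

seed_edges = torch.arange(0, 32)                    # one minibatch of edge IDs
input_nodes, pair_graph, neg_pair_graph, blocks = sampler.sample(g, seed_edges)
# pair_graph holds the positive edges, neg_pair_graph the sampled negatives, and
# blocks the MFGs that compute representations for input_nodes.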
"""Cluster-GCN subgraph iterators.""" """Cluster-GCN samplers."""
import os import os
import pickle import pickle
import numpy as np import numpy as np
from ..transform import metis_partition_assignment
from .. import backend as F from .. import backend as F
from .dataloader import SubgraphIterator from ..base import DGLError
from ..partition import metis_partition_assignment
from .base import set_node_lazy_features, set_edge_lazy_features
class ClusterGCNSubgraphIterator(SubgraphIterator): class ClusterGCNSampler(object):
"""Subgraph sampler following that of ClusterGCN. """Cluster-GCN sampler.
This sampler first partitions the graph with METIS partitioning, then it caches the nodes of This sampler first partitions the graph with METIS partitioning, then it caches the nodes of
each partition to a file within the given cache directory. each partition to a file within the given cache directory.
This is used in conjunction with :class:`dgl.dataloading.pytorch.GraphDataLoader`. This is used in conjunction with :class:`dgl.dataloading.DataLoader`.
Notes Notes
----- -----
...@@ -23,61 +24,53 @@ class ClusterGCNSubgraphIterator(SubgraphIterator): ...@@ -23,61 +24,53 @@ class ClusterGCNSubgraphIterator(SubgraphIterator):
---------- ----------
g : DGLGraph g : DGLGraph
The original graph. The original graph.
num_partitions : int k : int
The number of partitions. The number of partitions.
cache_directory : str cache_path : str
The path to the cache directory for storing the partition result. The path to the cache directory for storing the partition result.
refresh : bool
If True, recompute the partition.
Examples
--------
Assuming that you have a graph ``g``:
>>> sgiter = dgl.dataloading.ClusterGCNSubgraphIterator(
... g, num_partitions=100, cache_directory='.', refresh=True)
>>> dataloader = dgl.dataloading.GraphDataLoader(sgiter, batch_size=4, num_workers=0)
>>> for subgraph_batch in dataloader:
... train_on(subgraph_batch)
""" """
def __init__(self, g, num_partitions, cache_directory, refresh=False): def __init__(self, g, k, balance_ntypes=None, balance_edges=False, mode='k-way',
if os.name == 'nt': prefetch_node_feats=None, prefetch_edge_feats=None, output_device=None,
raise NotImplementedError("METIS partitioning is not supported on Windows yet.") cache_path='cluster_gcn.pkl'):
super().__init__(g) if os.path.exists(cache_path):
try:
# First see if the cache is already there. If so, directly read from cache. with open(cache_path, 'rb') as f:
if not refresh and self._load_parts(cache_directory): self.partition_offset, self.partition_node_ids = pickle.load(f)
return except (EOFError, TypeError, ValueError):
raise DGLError(
# Otherwise, build the cache. f'The contents in the cache file {cache_path} is invalid. '
assignment = F.asnumpy(metis_partition_assignment(g, num_partitions)) f'Please remove the cache file {cache_path} or specify another path.')
self._save_parts(assignment, cache_directory) if len(self.partition_offset) != k + 1:
raise DGLError(
def _cache_file_path(self, cache_directory): f'Number of partitions in the cache does not match the value of k. '
return os.path.join(cache_directory, 'cluster_gcn_cache') f'Please remove the cache file {cache_path} or specify another path.')
if len(self.partition_node_ids) != g.num_nodes():
def _load_parts(self, cache_directory): raise DGLError(
path = self._cache_file_path(cache_directory) f'Number of nodes in the cache does not match the given graph. '
if not os.path.exists(path): f'Please remove the cache file {cache_path} or specify another path.')
return False else:
partition_ids = metis_partition_assignment(
with open(path, 'rb') as file_: g, k, balance_ntypes=balance_ntypes, balance_edges=balance_edges, mode=mode)
self.part_indptr, self.part_indices = pickle.load(file_) partition_ids = F.asnumpy(partition_ids)
return True partition_node_ids = np.argsort(partition_ids)
partition_size = F.zerocopy_from_numpy(np.bincount(partition_ids, minlength=k))
def _save_parts(self, assignment, cache_directory): partition_offset = F.zerocopy_from_numpy(np.insert(np.cumsum(partition_size), 0, 0))
os.makedirs(cache_directory, exist_ok=True) partition_node_ids = F.zerocopy_from_numpy(partition_ids)
with open(cache_path, 'wb') as f:
self.part_indices = np.argsort(assignment) pickle.dump((partition_offset, partition_node_ids), f)
num_nodes_per_part = np.bincount(assignment) self.partition_offset = partition_offset
self.part_indptr = np.insert(np.cumsum(num_nodes_per_part), 0, 0) self.partition_node_ids = partition_node_ids
with open(self._cache_file_path(cache_directory), 'wb') as file_: self.prefetch_node_feats = prefetch_node_feats or []
pickle.dump((self.part_indptr, self.part_indices), file_) self.prefetch_edge_feats = prefetch_edge_feats or []
self.output_device = output_device
def __len__(self):
return self.part_indptr.shape[0] - 1 def sample(self, g, partition_ids):
"""Samples a subgraph given a list of partition IDs."""
def __getitem__(self, i): node_ids = F.cat([
nodes = self.part_indices[self.part_indptr[i]:self.part_indptr[i+1]] self.partition_node_ids[self.partition_offset[i]:self.partition_offset[i+1]]
return self.g.subgraph(nodes) for i in F.asnumpy(partition_ids)], 0)
sg = g.subgraph(node_ids, relabel_nodes=True, output_device=self.output_device)
set_node_lazy_features(sg, self.prefetch_node_feats)
set_edge_lazy_features(sg, self.prefetch_edge_feats)
return sg
"""Distributed dataloaders.
"""
import inspect
from ..distributed import DistDataLoader
# Still depends on the legacy NodeCollator...
from .._dataloading.dataloader import NodeCollator, EdgeCollator
def _remove_kwargs_dist(kwargs):
if 'num_workers' in kwargs:
del kwargs['num_workers']
if 'pin_memory' in kwargs:
del kwargs['pin_memory']
print('Distributed DataLoaders do not support pin_memory.')
return kwargs
class DistNodeDataLoader(DistDataLoader):
"""PyTorch dataloader for batch-iterating over a set of nodes, generating the list
of message flow graphs (MFGs) as computation dependency of the said minibatch, on
a distributed graph.
All the arguments have the same meaning as the single-machine counterpart
:class:`dgl.dataloading.pytorch.NodeDataLoader` except the first argument
:attr:`g` which must be a :class:`dgl.distributed.DistGraph`.
Parameters
----------
g : DistGraph
The distributed graph.
nids, graph_sampler, device, kwargs :
See :class:`dgl.dataloading.pytorch.NodeDataLoader`.
See also
--------
dgl.dataloading.pytorch.NodeDataLoader
"""
def __init__(self, g, nids, graph_sampler, device=None, **kwargs):
collator_kwargs = {}
dataloader_kwargs = {}
_collator_arglist = inspect.getfullargspec(NodeCollator).args
for k, v in kwargs.items():
if k in _collator_arglist:
collator_kwargs[k] = v
else:
dataloader_kwargs[k] = v
if device is None:
# for the distributed case default to the CPU
device = 'cpu'
assert device == 'cpu', 'Only cpu is supported in the case of a DistGraph.'
# Distributed DataLoader currently does not support heterogeneous graphs
# and does not copy features. Fallback to normal solution
self.collator = NodeCollator(g, nids, graph_sampler, **collator_kwargs)
_remove_kwargs_dist(dataloader_kwargs)
super().__init__(self.collator.dataset,
collate_fn=self.collator.collate,
**dataloader_kwargs)
self.device = device
class DistEdgeDataLoader(DistDataLoader):
"""PyTorch dataloader for batch-iterating over a set of edges, generating the list
of message flow graphs (MFGs) as computation dependency of the said minibatch for
edge classification, edge regression, and link prediction, on a distributed
graph.
All the arguments have the same meaning as the single-machine counterpart
:class:`dgl.dataloading.pytorch.EdgeDataLoader` except the first argument
:attr:`g` which must be a :class:`dgl.distributed.DistGraph`.
Parameters
----------
g : DistGraph
The distributed graph.
eids, graph_sampler, device, kwargs :
See :class:`dgl.dataloading.pytorch.EdgeDataLoader`.
See also
--------
dgl.dataloading.pytorch.EdgeDataLoader
"""
def __init__(self, g, eids, graph_sampler, device=None, **kwargs):
collator_kwargs = {}
dataloader_kwargs = {}
_collator_arglist = inspect.getfullargspec(EdgeCollator).args
for k, v in kwargs.items():
if k in _collator_arglist:
collator_kwargs[k] = v
else:
dataloader_kwargs[k] = v
if device is None:
# for the distributed case default to the CPU
device = 'cpu'
assert device == 'cpu', 'Only cpu is supported in the case of a DistGraph.'
# Distributed DataLoader currently does not support heterogeneous graphs
# and does not copy features. Fallback to normal solution
self.collator = EdgeCollator(g, eids, graph_sampler, **collator_kwargs)
_remove_kwargs_dist(dataloader_kwargs)
super().__init__(self.collator.dataset,
collate_fn=self.collator.collate,
**dataloader_kwargs)
self.device = device
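A hedged sketch of how these loaders are meant to be used in a distributed training script. The IP config file name, graph name, and sampler choice are placeholders; a graph that has already been partitioned with DGL's distributed partitioning tools is assumed.

import dgl

dgl.distributed.initialize('ip_config.txt')
g = dgl.distributed.DistGraph('my_graph')
train_nid = dgl.distributed.node_split(g.ndata['train_mask'], g.get_partition_book())
sampler = dgl.dataloading.MultiLayerNeighborSampler([10, 25])
dataloader = DistNodeDataLoader(g, train_nid, sampler, batch_size=1024, shuffle=True)

for input_nodes, output_nodes, blocks in dataloader:
    batch_feats = g.ndata['feat'][input_nodes]   # features are fetched explicitly
    # forward/backward pass goes here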
"""Negative samplers""" """Negative samplers"""
from collections.abc import Mapping from collections.abc import Mapping
from .. import backend as F from .. import backend as F
from ..sampling import global_uniform_negative_sampling
class _BaseNegativeSampler(object): class _BaseNegativeSampler(object):
def _generate(self, g, eids, canonical_etype): def _generate(self, g, eids, canonical_etype):
...@@ -26,7 +25,7 @@ class _BaseNegativeSampler(object): ...@@ -26,7 +25,7 @@ class _BaseNegativeSampler(object):
eids = {g.to_canonical_etype(k): v for k, v in eids.items()} eids = {g.to_canonical_etype(k): v for k, v in eids.items()}
neg_pair = {k: self._generate(g, v, k) for k, v in eids.items()} neg_pair = {k: self._generate(g, v, k) for k, v in eids.items()}
else: else:
assert len(g.etypes) == 1, \ assert len(g.canonical_etypes) == 1, \
'please specify a dict of etypes and ids for graphs with multiple edge types' 'please specify a dict of etypes and ids for graphs with multiple edge types'
neg_pair = self._generate(g, eids, g.canonical_etypes[0]) neg_pair = self._generate(g, eids, g.canonical_etypes[0])
...@@ -64,7 +63,7 @@ class PerSourceUniform(_BaseNegativeSampler): ...@@ -64,7 +63,7 @@ class PerSourceUniform(_BaseNegativeSampler):
shape = (shape[0] * self.k,) shape = (shape[0] * self.k,)
src, _ = g.find_edges(eids, etype=canonical_etype) src, _ = g.find_edges(eids, etype=canonical_etype)
src = F.repeat(src, self.k, 0) src = F.repeat(src, self.k, 0)
dst = F.randint(shape, dtype, ctx, 0, g.number_of_nodes(vtype)) dst = F.randint(shape, dtype, ctx, 0, g.num_nodes(vtype))
return src, dst return src, dst
# Alias # Alias
...@@ -90,14 +89,6 @@ class GlobalUniform(_BaseNegativeSampler): ...@@ -90,14 +89,6 @@ class GlobalUniform(_BaseNegativeSampler):
replace : bool, optional replace : bool, optional
Whether to sample with replacement. Setting it to True will make things Whether to sample with replacement. Setting it to True will make things
faster. (Default: True) faster. (Default: True)
redundancy : float, optional
Indicates how much more negative samples to actually generate during rejection sampling
before finding the unique pairs.
Increasing it will increase the likelihood of getting :attr:`k` negative samples
per edge, but will also take more time and memory.
(Default: automatically determined by the density of graph)
Notes Notes
----- -----
...@@ -113,13 +104,11 @@ class GlobalUniform(_BaseNegativeSampler): ...@@ -113,13 +104,11 @@ class GlobalUniform(_BaseNegativeSampler):
>>> neg_sampler(g, torch.LongTensor([0, 1])) >>> neg_sampler(g, torch.LongTensor([0, 1]))
(tensor([0, 1, 3, 2]), tensor([2, 0, 2, 1])) (tensor([0, 1, 3, 2]), tensor([2, 0, 2, 1]))
""" """
def __init__(self, k, exclude_self_loops=True, replace=False, redundancy=None): def __init__(self, k, exclude_self_loops=True, replace=False):
self.k = k self.k = k
self.exclude_self_loops = exclude_self_loops self.exclude_self_loops = exclude_self_loops
self.replace = replace self.replace = replace
self.redundancy = redundancy
def _generate(self, g, eids, canonical_etype): def _generate(self, g, eids, canonical_etype):
return global_uniform_negative_sampling( return g.global_uniform_negative_sampling(
g, len(eids) * self.k, self.exclude_self_loops, self.replace, len(eids) * self.k, self.exclude_self_loops, self.replace, canonical_etype)
canonical_etype, self.redundancy)
"""Data loading components for neighbor sampling"""
from ..base import NID, EID
from ..transform import to_block
from .base import BlockSampler
class NeighborSampler(BlockSampler):
"""Sampler that builds computational dependency of node representations via
neighbor sampling for multilayer GNN.
This sampler will make every node gather messages from a fixed number of neighbors
per edge type. The neighbors are picked uniformly.
Parameters
----------
fanouts : list[int] or list[dict[etype, int]]
List of neighbors to sample per edge type for each GNN layer, with the i-th
element being the fanout for the i-th GNN layer.
If only a single integer is provided, DGL assumes that every edge type
will have the same fanout.
If -1 is provided for one edge type on one layer, then all inbound edges
of that edge type will be included.
replace : bool, default False
Whether to sample with replacement
prob : str, optional
If given, the probability of each neighbor being sampled is proportional
to the edge feature value with the given name in ``g.edata``. The feature must be
a scalar on each edge.
Examples
--------
To train a 3-layer GNN for node classification on a set of nodes ``train_nid`` on
a homogeneous graph where each node takes messages from 5, 10, 15 neighbors for
the first, second, and third layer respectively (assuming the backend is PyTorch):
>>> sampler = dgl.dataloading.NeighborSampler([5, 10, 15])
>>> dataloader = dgl.dataloading.NodeDataLoader(
... g, train_nid, sampler,
... batch_size=1024, shuffle=True, drop_last=False, num_workers=4)
>>> for input_nodes, output_nodes, blocks in dataloader:
... train_on(blocks)
If training on a heterogeneous graph and you want different number of neighbors for each
edge type, one should instead provide a list of dicts. Each dict would specify the
number of neighbors to pick per edge type.
>>> sampler = dgl.dataloading.NeighborSampler([
... {('user', 'follows', 'user'): 5,
... ('user', 'plays', 'game'): 4,
... ('game', 'played-by', 'user'): 3}] * 3)
If you would like non-uniform neighbor sampling:
>>> g.edata['p'] = torch.rand(g.num_edges()) # any non-negative 1D vector works
>>> sampler = dgl.dataloading.NeighborSampler([5, 10, 15], prob='p')
Notes
-----
For the concept of MFGs, please refer to
:ref:`User Guide Section 6 <guide-minibatch>` and
:doc:`Minibatch Training Tutorials <tutorials/large/L0_neighbor_sampling_overview>`.
"""
def __init__(self, fanouts, edge_dir='in', prob=None, replace=False, **kwargs):
super().__init__(**kwargs)
self.fanouts = fanouts
self.edge_dir = edge_dir
self.prob = prob
self.replace = replace
def sample_blocks(self, g, seed_nodes, exclude_eids=None):
output_nodes = seed_nodes
blocks = []
for fanout in reversed(self.fanouts):
frontier = g.sample_neighbors(
seed_nodes, fanout, edge_dir=self.edge_dir, prob=self.prob,
replace=self.replace, output_device=self.output_device,
exclude_edges=exclude_eids)
eid = frontier.edata[EID]
block = to_block(frontier, seed_nodes)
block.edata[EID] = eid
seed_nodes = block.srcdata[NID]
blocks.insert(0, block)
return seed_nodes, output_nodes, blocks
MultiLayerNeighborSampler = NeighborSampler
class MultiLayerFullNeighborSampler(NeighborSampler):
"""Sampler that builds computational dependency of node representations by taking messages
from all neighbors for multilayer GNN.
This sampler will make every node gather messages from every single neighbor per edge type.
Parameters
----------
num_layers : int
The number of GNN layers to sample.
return_eids : bool, default False
Whether to return the edge IDs involved in message passing in the MFG.
If True, the edge IDs will be stored as an edge feature named ``dgl.EID``.
Examples
--------
To train a 3-layer GNN for node classification on a set of nodes ``train_nid`` on
a homogeneous graph where each node takes messages from all neighbors for the first,
second, and third layer respectively (assuming the backend is PyTorch):
>>> sampler = dgl.dataloading.MultiLayerFullNeighborSampler(3)
>>> dataloader = dgl.dataloading.NodeDataLoader(
... g, train_nid, sampler,
... batch_size=1024, shuffle=True, drop_last=False, num_workers=4)
>>> for input_nodes, output_nodes, blocks in dataloader:
... train_on(blocks)
Notes
-----
For the concept of MFGs, please refer to
:ref:`User Guide Section 6 <guide-minibatch>` and
:doc:`Minibatch Training Tutorials <tutorials/large/L0_neighbor_sampling_overview>`.
"""
def __init__(self, num_layers, edge_dir='in', prob=None, replace=False, **kwargs):
super().__init__([-1] * num_layers, edge_dir=edge_dir, prob=prob, replace=replace,
**kwargs)
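The ``__init__`` above makes the relationship explicit: a ``MultiLayerFullNeighborSampler(n)`` is just a ``NeighborSampler`` whose fanout is ``-1`` (take every neighbor) for each of the ``n`` layers. A tiny sketch using the two classes defined above:

full_sampler = MultiLayerFullNeighborSampler(2)
assert full_sampler.fanouts == [-1, -1]

explicit = NeighborSampler([-1, -1])   # equivalent construction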
"""ShaDow-GNN subgraph samplers.""" """ShaDow-GNN subgraph samplers."""
from ..utils import prepare_tensor_or_dict from ..sampling.utils import EidExcluder
from ..base import NID
from .. import transform from .. import transform
from ..sampling import sample_neighbors from ..base import NID
from .neighbor import NeighborSamplingMixin from .base import set_node_lazy_features, set_edge_lazy_features
from .dataloader import exclude_edges, Sampler
class ShaDowKHopSampler(NeighborSamplingMixin, Sampler): class ShaDowKHopSampler(object):
"""K-hop subgraph sampler used by """K-hop subgraph sampler used by
`ShaDow-GNN <https://arxiv.org/abs/2012.01380>`__. `ShaDow-GNN <https://arxiv.org/abs/2012.01380>`__.
...@@ -70,29 +68,32 @@ class ShaDowKHopSampler(NeighborSamplingMixin, Sampler): ...@@ -70,29 +68,32 @@ class ShaDowKHopSampler(NeighborSamplingMixin, Sampler):
If you would like non-uniform neighbor sampling: If you would like non-uniform neighbor sampling:
>>> g.edata['p'] = torch.rand(g.num_edges()) # any non-negative 1D vector works >>> g.edata['p'] = torch.rand(g.num_edges()) # any non-negative 1D vector works
>>> sampler = dgl.dataloading.MultiLayerNeighborSampler([5, 10, 15], prob='p') >>> sampler = dgl.dataloading.ShaDowKHopSampler([5, 10, 15], prob='p')
""" """
def __init__(self, fanouts, replace=False, prob=None, output_ctx=None): def __init__(self, fanouts, replace=False, prob=None, prefetch_node_feats=None,
super().__init__(output_ctx) prefetch_edge_feats=None, output_device=None):
self.fanouts = fanouts self.fanouts = fanouts
self.replace = replace self.replace = replace
self.prob = prob self.prob = prob
self.set_output_context(output_ctx) self.prefetch_node_feats = prefetch_node_feats
self.prefetch_edge_feats = prefetch_edge_feats
self.output_device = output_device
def sample(self, g, seed_nodes, exclude_eids=None): def sample(self, g, seed_nodes, exclude_edges=None):
self._build_fanout(len(self.fanouts), g) """Sample a subgraph given a tensor of seed nodes."""
self._build_prob_arrays(g)
seed_nodes = prepare_tensor_or_dict(g, seed_nodes, 'seed nodes')
output_nodes = seed_nodes output_nodes = seed_nodes
for fanout in reversed(self.fanouts):
for i in range(len(self.fanouts)): frontier = g.sample_neighbors(
fanout = self.fanouts[i] seed_nodes, fanout, output_device=self.output_device,
frontier = sample_neighbors( replace=self.replace, prob=self.prob, exclude_edges=exclude_edges)
g, seed_nodes, fanout, replace=self.replace, prob=self.prob_arrays)
block = transform.to_block(frontier, seed_nodes) block = transform.to_block(frontier, seed_nodes)
seed_nodes = block.srcdata[NID] seed_nodes = block.srcdata[NID]
subg = g.subgraph(seed_nodes, relabel_nodes=True) subg = g.subgraph(seed_nodes, relabel_nodes=True, output_device=self.output_device)
subg = exclude_edges(subg, exclude_eids, self.output_device) if exclude_edges is not None:
subg = EidExcluder(exclude_edges)(subg)
set_node_lazy_features(subg, self.prefetch_node_feats)
set_edge_lazy_features(subg, self.prefetch_edge_feats)
return seed_nodes, output_nodes, [subg] return seed_nodes, output_nodes, subg
@@ -26,6 +26,7 @@ from . import rpc
from . import role
from .server_state import ServerState
from .rpc_server import start_server
+from . import graph_services
from .graph_services import find_edges as dist_find_edges
from .graph_services import out_degrees as dist_out_degrees
from .graph_services import in_degrees as dist_in_degrees
@@ -1223,6 +1224,20 @@ class DistGraph:
        '''
        self._client.barrier()

+    def sample_neighbors(self, seed_nodes, fanout, edge_dir='in', prob=None,
+                         exclude_edges=None, replace=False,
+                         output_device=None):
+        # pylint: disable=unused-argument
+        """Sample neighbors from a distributed graph."""
+        # Currently prob, exclude_edges, output_device, and edge_dir are ignored.
+        if len(self.etypes) > 1:
+            frontier = graph_services.sample_etype_neighbors(
+                self, seed_nodes, ETYPE, fanout, replace=replace)
+        else:
+            frontier = graph_services.sample_neighbors(
+                self, seed_nodes, fanout, replace=replace)
+        return frontier
+
    def _get_ndata_names(self, ntype=None):
        ''' Get the names of all node data.
        '''
"""Python interfaces to DGL farthest point sampler.""" """Python interfaces to DGL farthest point sampler."""
from dgl._ffi.base import DGLError
import numpy as np import numpy as np
from .._ffi.base import DGLError
from .._ffi.function import _init_api from .._ffi.function import _init_api
from .. import backend as F from .. import backend as F
from .. import ndarray as nd from .. import ndarray as nd
......
@@ -1600,6 +1600,14 @@ class DGLHeteroGraph(object):
    # View
    #################################################################

+    def get_node_storage(self, key, ntype=None):
+        """Get storage object of node feature of type :attr:`ntype` and name :attr:`key`."""
+        return self._node_frames[self.get_ntype_id(ntype)]._columns[key]
+
+    def get_edge_storage(self, key, etype=None):
+        """Get storage object of edge feature of type :attr:`etype` and name :attr:`key`."""
+        return self._edge_frames[self.get_etype_id(etype)]._columns[key]
+
    @property
    def nodes(self):
        """Return a node view