Unverified Commit edfbee2c authored by Quan (Andy) Gan's avatar Quan (Andy) Gan Committed by GitHub
Browse files

[Feature] Make sample_neighbors copy features on demand (#2042)

* fix

* fix

* lint

* fix

* test

* fix

* fix
parent 09ec6020
...@@ -158,8 +158,10 @@ def run(args, device, data): ...@@ -158,8 +158,10 @@ def run(args, device, data):
tic_step = time.time() tic_step = time.time()
for step, (input_nodes, seeds, blocks) in enumerate(dataloader): for step, (input_nodes, seeds, blocks) in enumerate(dataloader):
# Load the input features as well as output labels # Load the input features as well as output labels
batch_inputs, batch_labels = load_subtensor(train_g, seeds, input_nodes, device) #batch_inputs, batch_labels = load_subtensor(train_g, seeds, input_nodes, device)
blocks = [block.int().to(device) for block in blocks] blocks = [block.int().to(device) for block in blocks]
batch_inputs = blocks[0].srcdata['features']
batch_labels = blocks[-1].dstdata['labels']
# Compute loss and prediction # Compute loss and prediction
batch_pred = model(blocks, batch_inputs) batch_pred = model(blocks, batch_inputs)
......
...@@ -56,7 +56,7 @@ def _sample_neighbors(local_g, partition_book, seed_nodes, fan_out, edge_dir, pr ...@@ -56,7 +56,7 @@ def _sample_neighbors(local_g, partition_book, seed_nodes, fan_out, edge_dir, pr
local_ids = F.astype(local_ids, local_g.idtype) local_ids = F.astype(local_ids, local_g.idtype)
# local_ids = self.seed_nodes # local_ids = self.seed_nodes
sampled_graph = local_sample_neighbors( sampled_graph = local_sample_neighbors(
local_g, local_ids, fan_out, edge_dir, prob, replace) local_g, local_ids, fan_out, edge_dir, prob, replace, _dist_training=True)
global_nid_mapping = local_g.ndata[NID] global_nid_mapping = local_g.ndata[NID]
src, dst = sampled_graph.edges() src, dst = sampled_graph.edges()
global_src, global_dst = global_nid_mapping[src], global_nid_mapping[dst] global_src, global_dst = global_nid_mapping[src], global_nid_mapping[dst]
......
...@@ -11,7 +11,8 @@ __all__ = [ ...@@ -11,7 +11,8 @@ __all__ = [
'sample_neighbors', 'sample_neighbors',
'select_topk'] 'select_topk']
def sample_neighbors(g, nodes, fanout, edge_dir='in', prob=None, replace=False): def sample_neighbors(g, nodes, fanout, edge_dir='in', prob=None, replace=False,
copy_ndata=True, copy_edata=True, _dist_training=False):
"""Sample neighboring edges of the given nodes and return the induced subgraph. """Sample neighboring edges of the given nodes and return the induced subgraph.
For each node, a number of inbound (or outbound when ``edge_dir == 'out'``) edges For each node, a number of inbound (or outbound when ``edge_dir == 'out'``) edges
...@@ -53,12 +54,35 @@ def sample_neighbors(g, nodes, fanout, edge_dir='in', prob=None, replace=False): ...@@ -53,12 +54,35 @@ def sample_neighbors(g, nodes, fanout, edge_dir='in', prob=None, replace=False):
to sum up to one). Otherwise, the result will be undefined. to sum up to one). Otherwise, the result will be undefined.
replace : bool, optional replace : bool, optional
If True, sample with replacement. If True, sample with replacement.
copy_ndata : bool, optional
If True, the node features of the new graph are copied from
the original graph. If False, the new graph will not have any
node features.
(Default: True)
copy_edata : bool, optional
If True, the edge features of the new graph are copied from
the original graph. If False, the new graph will not have any
edge features.
(Default: True)
_dist_training : bool, optional
Internal argument. Do not use.
(Default: False)
Returns Returns
------- -------
DGLGraph DGLGraph
A sampled subgraph containing only the sampled neighboring edges. It is on CPU. A sampled subgraph containing only the sampled neighboring edges. It is on CPU.
Notes
-----
If :attr:`copy_ndata` or :attr:`copy_edata` is True, the same tensors are used as
the node or edge features of both the original graph and the new graph.
As a result, users should avoid performing in-place operations
on the node or edge features of the new graph, to avoid corrupting the features
of the original graph.
Examples Examples
-------- --------
Assume that you have the following graph Assume that you have the following graph
...@@ -130,11 +154,30 @@ def sample_neighbors(g, nodes, fanout, edge_dir='in', prob=None, replace=False): ...@@ -130,11 +154,30 @@ def sample_neighbors(g, nodes, fanout, edge_dir='in', prob=None, replace=False):
edge_dir, prob_arrays, replace) edge_dir, prob_arrays, replace)
induced_edges = subgidx.induced_edges induced_edges = subgidx.induced_edges
ret = DGLHeteroGraph(subgidx.graph, g.ntypes, g.etypes) ret = DGLHeteroGraph(subgidx.graph, g.ntypes, g.etypes)
# handle features
# TODO(BarclayII): DGL distributed training fails with bus errors, freezes, or
# other hard-to-diagnose errors when features are copied lazily.
# So in the distributed training context, we fall back to the old behavior
# where we only set the edge IDs.
if not _dist_training:
if copy_ndata:
print(g, type(g))
node_frames = utils.extract_node_subframes(g, None)
utils.set_new_frames(ret, node_frames=node_frames)
if copy_edata:
print(g, type(g))
edge_frames = utils.extract_edge_subframes(g, induced_edges)
utils.set_new_frames(ret, edge_frames=edge_frames)
else:
for i, etype in enumerate(ret.canonical_etypes): for i, etype in enumerate(ret.canonical_etypes):
ret.edges[etype].data[EID] = induced_edges[i] ret.edges[etype].data[EID] = induced_edges[i]
return ret return ret
def select_topk(g, k, weight, nodes=None, edge_dir='in', ascending=False): def select_topk(g, k, weight, nodes=None, edge_dir='in', ascending=False,
copy_ndata=True, copy_edata=True):
"""Select the neighboring edges with k-largest (or k-smallest) weights of the given """Select the neighboring edges with k-largest (or k-smallest) weights of the given
nodes and return the induced subgraph. nodes and return the induced subgraph.
...@@ -176,12 +219,31 @@ def select_topk(g, k, weight, nodes=None, edge_dir='in', ascending=False): ...@@ -176,12 +219,31 @@ def select_topk(g, k, weight, nodes=None, edge_dir='in', ascending=False):
ascending : bool, optional ascending : bool, optional
If True, DGL will return edges with k-smallest weights instead of If True, DGL will return edges with k-smallest weights instead of
k-largest weights. k-largest weights.
copy_ndata : bool, optional
If True, the node features of the new graph are copied from
the original graph. If False, the new graph will not have any
node features.
(Default: True)
copy_edata : bool, optional
If True, the edge features of the new graph are copied from
the original graph. If False, the new graph will not have any
edge features.
(Default: True)
Returns Returns
------- -------
DGLGraph DGLGraph
A sampled subgraph containing only the sampled neighboring edges. It is on CPU. A sampled subgraph containing only the sampled neighboring edges. It is on CPU.
Notes
-----
If :attr:`copy_ndata` or :attr:`copy_edata` is True, the same tensors are used as
the node or edge features of both the original graph and the new graph.
As a result, users should avoid performing in-place operations
on the node or edge features of the new graph, to avoid corrupting the features
of the original graph.
Examples Examples
-------- --------
>>> g = dgl.graph(([0, 0, 1, 1, 2, 2], [1, 2, 0, 1, 2, 0])) >>> g = dgl.graph(([0, 0, 1, 1, 2, 2], [1, 2, 0, 1, 2, 0]))
...@@ -231,8 +293,17 @@ def select_topk(g, k, weight, nodes=None, edge_dir='in', ascending=False): ...@@ -231,8 +293,17 @@ def select_topk(g, k, weight, nodes=None, edge_dir='in', ascending=False):
g._graph, nodes_all_types, k_array, edge_dir, weight_arrays, bool(ascending)) g._graph, nodes_all_types, k_array, edge_dir, weight_arrays, bool(ascending))
induced_edges = subgidx.induced_edges induced_edges = subgidx.induced_edges
ret = DGLHeteroGraph(subgidx.graph, g.ntypes, g.etypes) ret = DGLHeteroGraph(subgidx.graph, g.ntypes, g.etypes)
for i, etype in enumerate(ret.canonical_etypes):
ret.edges[etype].data[EID] = induced_edges[i] # handle features
if copy_ndata:
print(g, type(g))
node_frames = utils.extract_node_subframes(g, None)
utils.set_new_frames(ret, node_frames=node_frames)
if copy_edata:
print(g, type(g))
edge_frames = utils.extract_edge_subframes(g, induced_edges)
utils.set_new_frames(ret, edge_frames=edge_frames)
return ret return ret
_init_api('dgl.sampling.neighbor', __name__) _init_api('dgl.sampling.neighbor', __name__)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment