Unverified Commit e9b624fe authored by Minjie Wang's avatar Minjie Wang Committed by GitHub

Merge branch 'master' into dist_part

parents 8086d1ed a88e7f7e
...@@ -360,6 +360,13 @@ class HeteroEmbedding(nn.Module): ...@@ -360,6 +360,13 @@ class HeteroEmbedding(nn.Module):
""" """
return {self.raw_keys[typ]: emb.weight for typ, emb in self.embeds.items()} return {self.raw_keys[typ]: emb.weight for typ, emb in self.embeds.items()}
def reset_parameters(self):
"""
Reinitialize the parameters with the Xavier uniform method from the ``nn.init`` module.
"""
for typ in self.embeds.keys():
nn.init.xavier_uniform_(self.embeds[typ].weight)
def forward(self, input_ids): def forward(self, input_ids):
"""Forward function """Forward function
......
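The new ``reset_parameters`` method above re-initializes every per-type embedding table with Xavier uniform. A minimal usage sketch (the type names and sizes are made up for illustration, assuming the constructor takes a dict of per-type counts and an embedding dimension):

import torch
from dgl.nn.pytorch import HeteroEmbedding

# Hypothetical sizes: 3 'user' and 4 'game' embeddings of dimension 16.
embed = HeteroEmbedding({'user': 3, 'game': 4}, 16)
embed.reset_parameters()                       # Xavier-uniform re-initialization per type
out = embed({'user': torch.tensor([0, 2])})    # look up a subset of one type
print(out['user'].shape)                       # torch.Size([2, 16])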
...@@ -135,6 +135,7 @@ def edge_softmax(graph, logits, eids=ALL, norm_by='dst'): ...@@ -135,6 +135,7 @@ def edge_softmax(graph, logits, eids=ALL, norm_by='dst'):
eids=eids, norm_by=norm_by) eids=eids, norm_by=norm_by)
else: else:
logits_list = [None] * graph._graph.number_of_etypes() logits_list = [None] * graph._graph.number_of_etypes()
logits = {graph.to_canonical_etype(k): v for k, v in logits.items()}
for rel in graph.canonical_etypes: for rel in graph.canonical_etypes:
etid = graph.get_etype_id(rel) etid = graph.get_etype_id(rel)
logits_list[etid] = logits[rel] logits_list[etid] = logits[rel]
......
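The hunk above converts the keys of a per-edge-type logits dict to canonical edge types, so plain edge-type names are accepted. A hedged sketch (graph and logits are illustrative):

import torch
import dgl
from dgl.ops import edge_softmax

g = dgl.heterograph({
    ('user', 'follows', 'user'): (torch.tensor([0, 1]), torch.tensor([1, 2])),
    ('user', 'plays', 'game'): (torch.tensor([0, 1]), torch.tensor([0, 0]))})
# Keys may be short edge-type names; they are canonicalized internally now.
logits = {'follows': torch.randn(2, 1), 'plays': torch.randn(2, 1)}
scores = edge_softmax(g, logits)   # softmax of logits over the incoming edges of each node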
"""Classes and functions for batching multiple graphs together.""" """Classes and functions for batching multiple graphs together."""
from __future__ import absolute_import from __future__ import absolute_import
from .base import DGLError from .base import DGLError, dgl_warning
from . import backend as F from . import backend as F
from .ops import segment from .ops import segment
...@@ -365,8 +365,9 @@ def broadcast_nodes(graph, graph_feat, *, ntype=None): ...@@ -365,8 +365,9 @@ def broadcast_nodes(graph, graph_feat, *, ntype=None):
graph : DGLGraph graph : DGLGraph
The graph. The graph.
graph_feat : tensor graph_feat : tensor
The feature to broadcast. Tensor shape is :math:`(*)` for single graph, and The feature to broadcast. Tensor shape is :math:`(B, *)` for batched graph,
:math:`(B, *)` for batched graph. where :math:`B` is the batch size.
ntype : str, optional ntype : str, optional
Node type. Can be omitted if there is only one node type. Node type. Can be omitted if there is only one node type.
...@@ -403,9 +404,11 @@ def broadcast_nodes(graph, graph_feat, *, ntype=None): ...@@ -403,9 +404,11 @@ def broadcast_nodes(graph, graph_feat, *, ntype=None):
[0.2721, 0.4629, 0.7269, 0.0724, 0.1014], [0.2721, 0.4629, 0.7269, 0.0724, 0.1014],
[0.2721, 0.4629, 0.7269, 0.0724, 0.1014]]) [0.2721, 0.4629, 0.7269, 0.0724, 0.1014]])
Broadcast feature to all nodes in the single graph. Broadcast feature to all nodes in the single graph (the feature tensor shape
to broadcast should be :math:`(1, *)`).
>>> dgl.broadcast_nodes(g1, feat[0]) >>> feat0 = th.unsqueeze(feat[0], 0)
>>> dgl.broadcast_nodes(g1, feat0)
tensor([[0.4325, 0.7710, 0.5541, 0.0544, 0.9368], tensor([[0.4325, 0.7710, 0.5541, 0.0544, 0.9368],
[0.4325, 0.7710, 0.5541, 0.0544, 0.9368]]) [0.4325, 0.7710, 0.5541, 0.0544, 0.9368]])
...@@ -413,7 +416,9 @@ def broadcast_nodes(graph, graph_feat, *, ntype=None): ...@@ -413,7 +416,9 @@ def broadcast_nodes(graph, graph_feat, *, ntype=None):
-------- --------
broadcast_edges broadcast_edges
""" """
if len(F.shape(graph_feat)) == 1: if (F.shape(graph_feat)[0] != graph.batch_size and graph.batch_size == 1):
dgl_warning('For a single graph, use a tensor of shape (1, *) for graph_feat.'
' The support of shape (*) will be deprecated.')
graph_feat = F.unsqueeze(graph_feat, dim=0) graph_feat = F.unsqueeze(graph_feat, dim=0)
return F.repeat(graph_feat, graph.batch_num_nodes(ntype), dim=0) return F.repeat(graph_feat, graph.batch_num_nodes(ntype), dim=0)
...@@ -434,8 +439,8 @@ def broadcast_edges(graph, graph_feat, *, etype=None): ...@@ -434,8 +439,8 @@ def broadcast_edges(graph, graph_feat, *, etype=None):
graph : DGLGraph graph : DGLGraph
The graph. The graph.
graph_feat : tensor graph_feat : tensor
The feature to broadcast. Tensor shape is :math:`(*)` for single graph, and The feature to broadcast. Tensor shape is :math:`(B, *)` for batched graph,
:math:`(B, *)` for batched graph. where :math:`B` is the batch size.
etype : str, tuple of str, optional etype : str, tuple of str, optional
Edge type. Can be omitted if there is only one edge type in the graph. Edge type. Can be omitted if there is only one edge type in the graph.
...@@ -470,9 +475,11 @@ def broadcast_edges(graph, graph_feat, *, etype=None): ...@@ -470,9 +475,11 @@ def broadcast_edges(graph, graph_feat, *, etype=None):
[0.2721, 0.4629, 0.7269, 0.0724, 0.1014], [0.2721, 0.4629, 0.7269, 0.0724, 0.1014],
[0.2721, 0.4629, 0.7269, 0.0724, 0.1014]]) [0.2721, 0.4629, 0.7269, 0.0724, 0.1014]])
Broadcast feature to all edges in the single graph. Broadcast feature to all edges in the single graph (the feature tensor shape
to broadcast should be :math:`(1, *)`).
>>> dgl.broadcast_edges(g2, feat[1]) >>> feat1 = th.unsqueeze(feat[1], 0)
>>> dgl.broadcast_edges(g2, feat1)
tensor([[0.2721, 0.4629, 0.7269, 0.0724, 0.1014], tensor([[0.2721, 0.4629, 0.7269, 0.0724, 0.1014],
[0.2721, 0.4629, 0.7269, 0.0724, 0.1014]]) [0.2721, 0.4629, 0.7269, 0.0724, 0.1014]])
...@@ -480,7 +487,9 @@ def broadcast_edges(graph, graph_feat, *, etype=None): ...@@ -480,7 +487,9 @@ def broadcast_edges(graph, graph_feat, *, etype=None):
-------- --------
broadcast_nodes broadcast_nodes
""" """
if len(F.shape(graph_feat)) == 1: if (F.shape(graph_feat)[0] != graph.batch_size and graph.batch_size == 1):
dgl_warning('For a single graph, use a tensor of shape (1, *) for graph_feat.'
' The support of shape (*) will be deprecated.')
graph_feat = F.unsqueeze(graph_feat, dim=0) graph_feat = F.unsqueeze(graph_feat, dim=0)
return F.repeat(graph_feat, graph.batch_num_edges(etype), dim=0) return F.repeat(graph_feat, graph.batch_num_edges(etype), dim=0)
......
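Per the revised docstrings, a single (non-batched) graph now expects the graph feature in shape :math:`(1, *)`; a 1-D tensor still works but triggers the new deprecation warning. A short sketch:

import torch
import dgl

g = dgl.graph((torch.tensor([0, 1]), torch.tensor([1, 2])))   # 3 nodes, 2 edges
graph_feat = torch.randn(1, 5)                    # shape (1, *) for a single graph
node_feat = dgl.broadcast_nodes(g, graph_feat)    # shape (3, 5)
edge_feat = dgl.broadcast_edges(g, graph_feat)    # shape (2, 5)
# dgl.broadcast_nodes(g, torch.randn(5)) still works but now emits a dgl_warning.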
...@@ -54,8 +54,6 @@ def sample_etype_neighbors(g, nodes, etype_field, fanout, edge_dir='in', prob=No ...@@ -54,8 +54,6 @@ def sample_etype_neighbors(g, nodes, etype_field, fanout, edge_dir='in', prob=No
The features must be non-negative floats, and the sum of the features of The features must be non-negative floats, and the sum of the features of
inbound/outbound edges for every node must be positive (though they don't have inbound/outbound edges for every node must be positive (though they don't have
to sum up to one). Otherwise, the result will be undefined. to sum up to one). Otherwise, the result will be undefined.
If :attr:`prob` is not None, GPU sampling is not supported.
replace : bool, optional replace : bool, optional
If True, sample with replacement. If True, sample with replacement.
copy_ndata: bool, optional copy_ndata: bool, optional
...@@ -163,6 +161,9 @@ def sample_neighbors(g, nodes, fanout, edge_dir='in', prob=None, replace=False, ...@@ -163,6 +161,9 @@ def sample_neighbors(g, nodes, fanout, edge_dir='in', prob=None, replace=False,
Node/edge features are not preserved. The original IDs of Node/edge features are not preserved. The original IDs of
the sampled edges are stored as the `dgl.EID` feature in the returned graph. the sampled edges are stored as the `dgl.EID` feature in the returned graph.
GPU sampling is supported for this function. Refer to :ref:`guide-minibatch-gpu-sampling`
for more details.
Parameters Parameters
---------- ----------
g : DGLGraph g : DGLGraph
...@@ -193,8 +194,6 @@ def sample_neighbors(g, nodes, fanout, edge_dir='in', prob=None, replace=False, ...@@ -193,8 +194,6 @@ def sample_neighbors(g, nodes, fanout, edge_dir='in', prob=None, replace=False,
The features must be non-negative floats, and the sum of the features of The features must be non-negative floats, and the sum of the features of
inbound/outbound edges for every node must be positive (though they don't have inbound/outbound edges for every node must be positive (though they don't have
to sum up to one). Otherwise, the result will be undefined. to sum up to one). Otherwise, the result will be undefined.
If :attr:`prob` is not None, GPU sampling is not supported.
exclude_edges: tensor or dict exclude_edges: tensor or dict
Edge IDs to exclude during sampling neighbors for the seed nodes. Edge IDs to exclude during sampling neighbors for the seed nodes.
......
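With the removal of the CPU-only caveat for :attr:`prob`, probability-weighted neighbor sampling can run on a CUDA graph. A hedged sketch, assuming a CUDA build of DGL:

import torch
import dgl

g = dgl.graph((torch.tensor([0, 0, 1, 2]), torch.tensor([1, 2, 2, 0])))
g.edata['p'] = torch.rand(g.num_edges())          # non-negative edge weights
g = g.to('cuda')                                  # GPU sampling
seeds = torch.tensor([2, 0], device='cuda')
sg = dgl.sampling.sample_neighbors(g, seeds, fanout=2, prob='p')
print(sg.edata[dgl.EID])                          # original IDs of the sampled edges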
...@@ -38,6 +38,9 @@ class RandomWalkNeighborSampler(object): ...@@ -38,6 +38,9 @@ class RandomWalkNeighborSampler(object):
This is a generalization of PinSAGE sampler which only works on bidirectional bipartite This is a generalization of PinSAGE sampler which only works on bidirectional bipartite
graphs. graphs.
UVA and GPU sampling are supported for this sampler.
Refer to :ref:`guide-minibatch-gpu-sampling` for more details.
Parameters Parameters
---------- ----------
G : DGLGraph G : DGLGraph
...@@ -104,13 +107,14 @@ class RandomWalkNeighborSampler(object): ...@@ -104,13 +107,14 @@ class RandomWalkNeighborSampler(object):
A tensor of given node IDs of node type ``ntype`` to generate neighbors from. The A tensor of given node IDs of node type ``ntype`` to generate neighbors from. The
node type ``ntype`` is the beginning and ending node type of the given metapath. node type ``ntype`` is the beginning and ending node type of the given metapath.
It must be on CPU and have the same dtype as the ID type of the graph. It must be on the same device as the graph and have the same dtype
as the ID type of the graph.
Returns Returns
------- -------
g : DGLGraph g : DGLGraph
A homogeneous graph constructed by selecting neighbors for each given node according A homogeneous graph constructed by selecting neighbors for each given node according
to the algorithm above. The returned graph is on CPU. to the algorithm above.
""" """
seed_nodes = utils.prepare_tensor(self.G, seed_nodes, 'seed_nodes') seed_nodes = utils.prepare_tensor(self.G, seed_nodes, 'seed_nodes')
self.restart_prob = F.copy_to(self.restart_prob, F.context(seed_nodes)) self.restart_prob = F.copy_to(self.restart_prob, F.context(seed_nodes))
...@@ -147,6 +151,9 @@ class PinSAGESampler(RandomWalkNeighborSampler): ...@@ -147,6 +151,9 @@ class PinSAGESampler(RandomWalkNeighborSampler):
The edges of the returned homogeneous graph will connect to the given nodes from their most The edges of the returned homogeneous graph will connect to the given nodes from their most
commonly visited nodes, with a feature indicating the number of visits. commonly visited nodes, with a feature indicating the number of visits.
UVA and GPU sampling are supported for this sampler.
Refer to :ref:`guide-minibatch-gpu-sampling` for more details.
Parameters Parameters
---------- ----------
G : DGLGraph G : DGLGraph
......
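A hedged sketch of the PinSAGE-style sampler with both the graph and the seed nodes on GPU, as the updated docstring now requires them to share a device (the bipartite graph below is illustrative; assumes a CUDA build):

import torch
import dgl

hg = dgl.heterograph({
    ('item', 'bought-by', 'user'): (torch.tensor([0, 1, 2]), torch.tensor([0, 0, 1])),
    ('user', 'bought', 'item'): (torch.tensor([0, 0, 1]), torch.tensor([0, 1, 2]))}).to('cuda')
sampler = dgl.sampling.PinSAGESampler(
    hg, 'item', 'user', num_traversals=3, termination_prob=0.5,
    num_random_walks=10, num_neighbors=2)
seeds = torch.tensor([0, 2], device='cuda')       # same device as the graph
frontier = sampler(seeds)                         # homogeneous 'item' graph of frequent co-visits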
...@@ -30,6 +30,8 @@ def random_walk(g, nodes, *, metapath=None, length=None, prob=None, restart_prob ...@@ -30,6 +30,8 @@ def random_walk(g, nodes, *, metapath=None, length=None, prob=None, restart_prob
If a random walk stops in advance, DGL pads the trace with -1 to have the same If a random walk stops in advance, DGL pads the trace with -1 to have the same
length. length.
This function supports graphs on GPU and UVA sampling.
Parameters Parameters
---------- ----------
g : DGLGraph g : DGLGraph
...@@ -37,8 +39,9 @@ def random_walk(g, nodes, *, metapath=None, length=None, prob=None, restart_prob ...@@ -37,8 +39,9 @@ def random_walk(g, nodes, *, metapath=None, length=None, prob=None, restart_prob
nodes : Tensor nodes : Tensor
Node ID tensor from which the random walk traces starts. Node ID tensor from which the random walk traces starts.
The tensor must have the same dtype as the ID type The tensor must have the same dtype as the ID type of the graph.
of the graph. The tensor must be on the same device as the graph or
on the GPU when the graph is pinned (UVA sampling).
metapath : list[str or tuple of str], optional metapath : list[str or tuple of str], optional
Metapath, specified as a list of edge types. Metapath, specified as a list of edge types.
...@@ -60,12 +63,15 @@ def random_walk(g, nodes, *, metapath=None, length=None, prob=None, restart_prob ...@@ -60,12 +63,15 @@ def random_walk(g, nodes, *, metapath=None, length=None, prob=None, restart_prob
must be positive for the outbound edges of all nodes (although they don't have must be positive for the outbound edges of all nodes (although they don't have
to sum up to one). The result will be undefined otherwise. to sum up to one). The result will be undefined otherwise.
The feature tensor must be on the same device as the graph.
If omitted, DGL assumes that the neighbors are picked uniformly. If omitted, DGL assumes that the neighbors are picked uniformly.
restart_prob : float or Tensor, optional restart_prob : float or Tensor, optional
Probability to terminate the current trace before each transition. Probability to terminate the current trace before each transition.
If a tensor is given, :attr:`restart_prob` should have the same length as If a tensor is given, :attr:`restart_prob` should be on the same device as the graph
:attr:`metapath` or :attr:`length`. or on the GPU when the graph is pinned (UVA sampling),
and have the same length as :attr:`metapath` or :attr:`length`.
return_eids : bool, optional return_eids : bool, optional
If True, additionally return the edge IDs traversed. If True, additionally return the edge IDs traversed.
...@@ -176,19 +182,16 @@ def random_walk(g, nodes, *, metapath=None, length=None, prob=None, restart_prob ...@@ -176,19 +182,16 @@ def random_walk(g, nodes, *, metapath=None, length=None, prob=None, restart_prob
metapath = F.to_dgl_nd(F.astype(F.tensor(metapath), g.idtype)) metapath = F.to_dgl_nd(F.astype(F.tensor(metapath), g.idtype))
# Load the probability tensor from the edge frames # Load the probability tensor from the edge frames
ctx = utils.to_dgl_context(g.device)
if prob is None: if prob is None:
p_nd = [nd.array([], ctx=nodes.ctx) for _ in g.canonical_etypes] p_nd = [nd.array([], ctx=ctx) for _ in g.canonical_etypes]
else: else:
p_nd = [] p_nd = []
for etype in g.canonical_etypes: for etype in g.canonical_etypes:
if prob in g.edges[etype].data: if prob in g.edges[etype].data:
prob_nd = F.to_dgl_nd(g.edges[etype].data[prob]) prob_nd = F.to_dgl_nd(g.edges[etype].data[prob])
if prob_nd.ctx != nodes.ctx:
raise ValueError(
'context of seed node array and edges[%s].data[%s] are different' %
(etype, prob))
else: else:
prob_nd = nd.array([], ctx=nodes.ctx) prob_nd = nd.array([], ctx=ctx)
p_nd.append(prob_nd) p_nd.append(prob_nd)
# Actual random walk # Actual random walk
...@@ -198,9 +201,11 @@ def random_walk(g, nodes, *, metapath=None, length=None, prob=None, restart_prob ...@@ -198,9 +201,11 @@ def random_walk(g, nodes, *, metapath=None, length=None, prob=None, restart_prob
restart_prob = F.to_dgl_nd(restart_prob) restart_prob = F.to_dgl_nd(restart_prob)
traces, eids, types = _CAPI_DGLSamplingRandomWalkWithStepwiseRestart( traces, eids, types = _CAPI_DGLSamplingRandomWalkWithStepwiseRestart(
gidx, nodes, metapath, p_nd, restart_prob) gidx, nodes, metapath, p_nd, restart_prob)
else: elif isinstance(restart_prob, float):
traces, eids, types = _CAPI_DGLSamplingRandomWalkWithRestart( traces, eids, types = _CAPI_DGLSamplingRandomWalkWithRestart(
gidx, nodes, metapath, p_nd, restart_prob) gidx, nodes, metapath, p_nd, restart_prob)
else:
raise TypeError("restart_prob should be float or Tensor.")
traces = F.from_dgl_nd(traces) traces = F.from_dgl_nd(traces)
types = F.from_dgl_nd(types) types = F.from_dgl_nd(types)
......
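A hedged sketch of the UVA path described above: the graph is pinned in host memory while the seed nodes live on GPU (assumes a CUDA build and that ``pin_memory_`` is available):

import torch
import dgl

g = dgl.graph((torch.tensor([0, 1, 1, 2, 3]), torch.tensor([1, 2, 3, 0, 0])))
g.create_formats_()                               # materialize sparse formats before pinning
g.pin_memory_()                                   # UVA: graph stays in pinned host memory
nodes = torch.tensor([0, 1, 2], device='cuda')    # seeds on GPU
traces, types = dgl.sampling.random_walk(g, nodes, length=4, restart_prob=0.2)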
...@@ -365,11 +365,11 @@ def _gspmm_hetero(gidx, op, reduce_op, u_len, u_and_e_tuple): ...@@ -365,11 +365,11 @@ def _gspmm_hetero(gidx, op, reduce_op, u_len, u_and_e_tuple):
for l, arg_u_nd in enumerate(list_arg_u_nd): for l, arg_u_nd in enumerate(list_arg_u_nd):
# TODO(Israt): l or src_id as index of lhs # TODO(Israt): l or src_id as index of lhs
list_arg_u[l] = None if list_arg_u[l] is None else F.zerocopy_from_dgl_ndarray(arg_u_nd) list_arg_u[l] = None if list_arg_u[l] is None else F.zerocopy_from_dgl_ndarray(arg_u_nd)
if expand_u and use_cmp: if list_arg_u[l] is not None and expand_u and use_cmp:
list_arg_u[l] = F.squeeze(list_arg_u[l], -1) list_arg_u[l] = F.squeeze(list_arg_u[l], -1)
for l, arg_e_nd in enumerate(list_arg_e_nd): for l, arg_e_nd in enumerate(list_arg_e_nd):
list_arg_e[l] = None if list_arg_e[l] is None else F.zerocopy_from_dgl_ndarray(arg_e_nd) list_arg_e[l] = None if list_arg_e[l] is None else F.zerocopy_from_dgl_ndarray(arg_e_nd)
if expand_e and use_cmp: if list_arg_e[l] is not None and expand_e and use_cmp:
list_arg_e[l] = F.squeeze(list_arg_e[l], -1) list_arg_e[l] = F.squeeze(list_arg_e[l], -1)
for l, arg_u_ntype_nd in enumerate(list_arg_u_ntype_nd): for l, arg_u_ntype_nd in enumerate(list_arg_u_ntype_nd):
list_arg_u_ntype[l] = None if arg_u_ntype_nd is None \ list_arg_u_ntype[l] = None if arg_u_ntype_nd is None \
...@@ -562,7 +562,7 @@ def _gsddmm_hetero(gidx, op, lhs_len, lhs_target='u', rhs_target='v', lhs_and_rh ...@@ -562,7 +562,7 @@ def _gsddmm_hetero(gidx, op, lhs_len, lhs_target='u', rhs_target='v', lhs_and_rh
e = out_list[l] e = out_list[l]
e = F.tensor([]) if e is None else e e = F.tensor([]) if e is None else e
if (expand_lhs or not use_lhs) and (expand_rhs or not use_rhs): if (expand_lhs or not use_lhs) and (expand_rhs or not use_rhs):
e = F.squeeze(v, -1) e = F.squeeze(e, -1)
out_list[l] = e out_list[l] = e
out = tuple(out_list) out = tuple(out_list)
return out return out
......
...@@ -40,6 +40,7 @@ from ..partition import metis_partition_assignment ...@@ -40,6 +40,7 @@ from ..partition import metis_partition_assignment
from ..partition import partition_graph_with_halo from ..partition import partition_graph_with_halo
from ..partition import metis_partition from ..partition import metis_partition
from .. import subgraph from .. import subgraph
from .. import function
# TO BE DEPRECATED # TO BE DEPRECATED
from .._deprecate.graph import DGLGraph as DGLGraphStale from .._deprecate.graph import DGLGraph as DGLGraphStale
...@@ -1764,13 +1765,24 @@ def remove_nodes(g, nids, ntype=None, store_ids=False): ...@@ -1764,13 +1765,24 @@ def remove_nodes(g, nids, ntype=None, store_ids=False):
g.remove_nodes(nids, ntype=ntype, store_ids=store_ids) g.remove_nodes(nids, ntype=ntype, store_ids=store_ids)
return g return g
def add_self_loop(g, etype=None): def add_self_loop(g, edge_feat_names=None, fill_data=1., etype=None):
r"""Add self-loops for each node in the graph and return a new graph. r"""Add self-loops for each node in the graph and return a new graph.
Parameters Parameters
---------- ----------
g : DGLGraph g : DGLGraph
The graph. The graph.
edge_feat_names : list[str], optional
The names of the self-loop features to which `fill_data` applies. If None, `fill_data` is
applied to all self-loop features. Default: None.
fill_data : int, float or str, optional
The value to fill the self-loop features. Default: 1.
* If ``fill_data`` is ``int`` or ``float``, self-loop features will be directly given by
``fill_data``.
* If ``fill_data`` is ``str``, self-loop features will be generated by aggregating the
features of the incoming edges of the corresponding nodes. The supported aggregations are:
``'mean'``, ``'sum'``, ``'max'``, ``'min'``.
etype : str or (str, str, str), optional etype : str or (str, str, str), optional
The type names of the edges. The allowed type name formats are: The type names of the edges. The allowed type name formats are:
...@@ -1792,7 +1804,6 @@ def add_self_loop(g, etype=None): ...@@ -1792,7 +1804,6 @@ def add_self_loop(g, etype=None):
* The function adds self-loops regardless of whether they already exist or not. * The function adds self-loops regardless of whether they already exist or not.
If one wishes to have exactly one self-loop for every node, If one wishes to have exactly one self-loop for every node,
call :func:`remove_self_loop` before invoking :func:`add_self_loop`. call :func:`remove_self_loop` before invoking :func:`add_self_loop`.
* Features of the new edges (self-loop edges) will be filled with zeros.
* This function discards the batch information. Please use * This function discards the batch information. Please use
:func:`dgl.DGLGraph.set_batch_num_nodes` :func:`dgl.DGLGraph.set_batch_num_nodes`
and :func:`dgl.DGLGraph.set_batch_num_edges` on the transformed graph and :func:`dgl.DGLGraph.set_batch_num_edges` on the transformed graph
...@@ -1808,7 +1819,7 @@ def add_self_loop(g, etype=None): ...@@ -1808,7 +1819,7 @@ def add_self_loop(g, etype=None):
>>> g = dgl.graph((torch.tensor([0, 0, 2]), torch.tensor([2, 1, 0]))) >>> g = dgl.graph((torch.tensor([0, 0, 2]), torch.tensor([2, 1, 0])))
>>> g.ndata['hv'] = torch.arange(3).float().reshape(-1, 1) >>> g.ndata['hv'] = torch.arange(3).float().reshape(-1, 1)
>>> g.edata['he'] = torch.arange(3).float().reshape(-1, 1) >>> g.edata['he'] = torch.arange(3).float().reshape(-1, 1)
>>> g = dgl.add_self_loop(g) >>> g = dgl.add_self_loop(g, fill_data='sum')
>>> g >>> g
Graph(num_nodes=3, num_edges=6, Graph(num_nodes=3, num_edges=6,
ndata_schemes={'hv': Scheme(shape=(1,), dtype=torch.float32)} ndata_schemes={'hv': Scheme(shape=(1,), dtype=torch.float32)}
...@@ -1817,8 +1828,8 @@ def add_self_loop(g, etype=None): ...@@ -1817,8 +1828,8 @@ def add_self_loop(g, etype=None):
tensor([[0.], tensor([[0.],
[1.], [1.],
[2.], [2.],
[0.], [2.],
[0.], [1.],
[0.]]) [0.]])
**Heterogeneous Graphs** **Heterogeneous Graphs**
...@@ -1831,17 +1842,49 @@ def add_self_loop(g, etype=None): ...@@ -1831,17 +1842,49 @@ def add_self_loop(g, etype=None):
>>> g = dgl.add_self_loop(g, etype='follows') >>> g = dgl.add_self_loop(g, etype='follows')
>>> g >>> g
Graph(num_nodes={'user': 3, 'game': 2}, Graph(num_nodes={'user': 3, 'game': 2},
num_edges={('user', 'plays', 'game'): 2, ('user', 'follows', 'user'): 5}, num_edges={('user', 'plays', 'game'): 2, ('user', 'follows', 'user'): 5},
metagraph=[('user', 'user'), ('user', 'game')]) metagraph=[('user', 'user'), ('user', 'game')])
""" """
etype = g.to_canonical_etype(etype) etype = g.to_canonical_etype(etype)
data = {}
reduce_funcs = {'sum': function.sum,
'mean': function.mean,
'max': function.max,
'min': function.min}
if edge_feat_names is None:
edge_feat_names = g.edges[etype].data.keys()
if etype[0] != etype[2]: if etype[0] != etype[2]:
raise DGLError( raise DGLError(
'add_self_loop does not support unidirectional bipartite graphs: {}.' \ 'add_self_loop does not support unidirectional bipartite graphs: {}.' \
'Please make sure the types of head node and tail node are identical.' \ 'Please make sure the types of head node and tail node are identical.' \
''.format(etype)) ''.format(etype))
for feat_name in edge_feat_names:
if isinstance(fill_data, (int, float)):
dtype = g.edges[etype].data[feat_name].dtype
dshape = g.edges[etype].data[feat_name].shape
tmp_fill_data = F.copy_to(F.astype(F.tensor([fill_data]), dtype), g.device)
if len(dshape) > 1:
data[feat_name] = F.zeros((g.num_nodes(etype[0]), *dshape[1:]), dtype,
g.device) + tmp_fill_data
else:
data[feat_name] = F.zeros((g.num_nodes(etype[0]),), dtype, g.device) + tmp_fill_data
elif isinstance(fill_data, str):
if fill_data not in reduce_funcs.keys():
raise DGLError('Unsupported aggregation: {}'.format(fill_data))
reducer = reduce_funcs[fill_data]
with g.local_scope():
g.update_all(function.copy_e(feat_name, "h"), reducer('h', 'h'), etype=etype)
data[feat_name] = g.nodes[etype[0]].data['h']
nodes = g.nodes(etype[0]) nodes = g.nodes(etype[0])
new_g = add_edges(g, nodes, nodes, etype=etype) if len(data):
new_g = add_edges(g, nodes, nodes, data=data, etype=etype)
else:
new_g = add_edges(g, nodes, nodes, etype=etype)
return new_g return new_g
DGLHeteroGraph.add_self_loop = utils.alias_func(add_self_loop) DGLHeteroGraph.add_self_loop = utils.alias_func(add_self_loop)
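Besides the ``fill_data='sum'`` example in the docstring, a numeric fill can be combined with ``edge_feat_names`` to restrict which features are filled; a short sketch (feature names are illustrative):

import torch
import dgl

g = dgl.graph((torch.tensor([0, 0, 2]), torch.tensor([2, 1, 0])))
g.edata['w'] = torch.ones(3, 1)
g.edata['e'] = torch.arange(3).float().reshape(-1, 1)
# Only 'w' is filled with 0.5 on the new self-loop edges; 'e' defaults to zeros there.
new_g = dgl.add_self_loop(g, edge_feat_names=['w'], fill_data=0.5)
print(new_g.edata['w'][-3:])   # the three self-loop rows, each equal to 0.5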
...@@ -2872,6 +2915,8 @@ def sort_csr_by_tag(g, tag, tag_offset_name='_TAG_OFFSET', tag_type='node'): ...@@ -2872,6 +2915,8 @@ def sort_csr_by_tag(g, tag, tag_offset_name='_TAG_OFFSET', tag_type='node'):
``tag_type`` is ``node``. ``tag_type`` is ``node``.
>>> import dgl >>> import dgl
>>> import torch
>>> g = dgl.graph(([0,0,0,0,0,1,1,1],[0,1,2,3,4,0,1,2])) >>> g = dgl.graph(([0,0,0,0,0,1,1,1],[0,1,2,3,4,0,1,2]))
>>> g.adjacency_matrix(scipy_fmt='csr').nonzero() >>> g.adjacency_matrix(scipy_fmt='csr').nonzero()
(array([0, 0, 0, 0, 0, 1, 1, 1], dtype=int32), (array([0, 0, 0, 0, 0, 1, 1, 1], dtype=int32),
...@@ -2890,11 +2935,10 @@ def sort_csr_by_tag(g, tag, tag_offset_name='_TAG_OFFSET', tag_type='node'): ...@@ -2890,11 +2935,10 @@ def sort_csr_by_tag(g, tag, tag_offset_name='_TAG_OFFSET', tag_type='node'):
``tag_type`` is ``edge``. ``tag_type`` is ``edge``.
>>> from dgl import backend as F
>>> g = dgl.graph(([0,0,0,0,0,1,1,1],[0,1,2,3,4,0,1,2])) >>> g = dgl.graph(([0,0,0,0,0,1,1,1],[0,1,2,3,4,0,1,2]))
>>> g.edges() >>> g.edges()
(tensor([0, 0, 0, 0, 0, 1, 1, 1]), tensor([0, 1, 2, 3, 4, 0, 1, 2])) (tensor([0, 0, 0, 0, 0, 1, 1, 1]), tensor([0, 1, 2, 3, 4, 0, 1, 2]))
>>> tag = F.tensor([1, 1, 0, 2, 0, 1, 1, 0]) >>> tag = torch.tensor([1, 1, 0, 2, 0, 1, 1, 0])
>>> g_sorted = dgl.sort_csr_by_tag(g, tag, tag_type='edge') >>> g_sorted = dgl.sort_csr_by_tag(g, tag, tag_type='edge')
>>> g_sorted.adj(scipy_fmt='csr').nonzero() >>> g_sorted.adj(scipy_fmt='csr').nonzero()
(array([0, 0, 0, 0, 0, 1, 1, 1], dtype=int32), array([2, 4, 0, 1, 3, 2, 0, 1], dtype=int32)) (array([0, 0, 0, 0, 0, 1, 1, 1], dtype=int32), array([2, 4, 0, 1, 3, 2, 0, 1], dtype=int32))
...@@ -2995,6 +3039,7 @@ def sort_csc_by_tag(g, tag, tag_offset_name='_TAG_OFFSET', tag_type='node'): ...@@ -2995,6 +3039,7 @@ def sort_csc_by_tag(g, tag, tag_offset_name='_TAG_OFFSET', tag_type='node'):
``tag_type`` is ``node``. ``tag_type`` is ``node``.
>>> import dgl >>> import dgl
>>> import torch
>>> g = dgl.graph(([0,1,2,3,4,0,1,2],[0,0,0,0,0,1,1,1])) >>> g = dgl.graph(([0,1,2,3,4,0,1,2],[0,0,0,0,0,1,1,1]))
>>> g.adjacency_matrix(scipy_fmt='csr', transpose=True).nonzero() >>> g.adjacency_matrix(scipy_fmt='csr', transpose=True).nonzero()
(array([0, 0, 0, 0, 0, 1, 1, 1], dtype=int32), (array([0, 0, 0, 0, 0, 1, 1, 1], dtype=int32),
...@@ -3013,9 +3058,8 @@ def sort_csc_by_tag(g, tag, tag_offset_name='_TAG_OFFSET', tag_type='node'): ...@@ -3013,9 +3058,8 @@ def sort_csc_by_tag(g, tag, tag_offset_name='_TAG_OFFSET', tag_type='node'):
``tag_type`` is ``edge``. ``tag_type`` is ``edge``.
>>> from dgl import backend as F
>>> g = dgl.graph(([0,1,2,3,4,0,1,2],[0,0,0,0,0,1,1,1])) >>> g = dgl.graph(([0,1,2,3,4,0,1,2],[0,0,0,0,0,1,1,1]))
>>> tag = F.tensor([1, 1, 0, 2, 0, 1, 1, 0]) >>> tag = torch.tensor([1, 1, 0, 2, 0, 1, 1, 0])
>>> g_sorted = dgl.sort_csc_by_tag(g, tag, tag_type='edge') >>> g_sorted = dgl.sort_csc_by_tag(g, tag, tag_type='edge')
>>> g_sorted.adj(scipy_fmt='csr', transpose=True).nonzero() >>> g_sorted.adj(scipy_fmt='csr', transpose=True).nonzero()
(array([0, 0, 0, 0, 0, 1, 1, 1], dtype=int32), array([2, 4, 0, 1, 3, 2, 0, 1], dtype=int32)) (array([0, 0, 0, 0, 0, 1, 1, 1], dtype=int32), array([2, 4, 0, 1, 3, 2, 0, 1], dtype=int32))
......
...@@ -415,6 +415,17 @@ class AddSelfLoop(BaseTransform): ...@@ -415,6 +415,17 @@ class AddSelfLoop(BaseTransform):
If False, it will first remove self-loops to prevent duplicate self-loops. If False, it will first remove self-loops to prevent duplicate self-loops.
new_etypes : bool, optional new_etypes : bool, optional
If True, it will add an edge type 'self' per node type, which holds self-loops. If True, it will add an edge type 'self' per node type, which holds self-loops.
edge_feat_names : list[str], optional
The names of the self-loop features to which `fill_data` applies. If None, `fill_data` is
applied to all self-loop features. Default: None.
fill_data : int, float or str, optional
The value to fill the self-loop features. Default: 1.
* If ``fill_data`` is ``int`` or ``float``, self-loop features will be directly given by
``fill_data``.
* If ``fill_data`` is ``str``, self-loop features will be generated by aggregating the
features of the incoming edges of the corresponding nodes. The supported aggregations are:
``'mean'``, ``'sum'``, ``'max'``, ``'min'``.
Example Example
------- -------
...@@ -424,23 +435,39 @@ class AddSelfLoop(BaseTransform): ...@@ -424,23 +435,39 @@ class AddSelfLoop(BaseTransform):
Case1: Add self-loops for a homogeneous graph Case1: Add self-loops for a homogeneous graph
>>> transform = AddSelfLoop() >>> transform = AddSelfLoop(fill_data='sum')
>>> g = dgl.graph(([1, 1], [1, 2])) >>> g = dgl.graph(([0, 0, 2], [2, 1, 0]))
>>> g.edata['he'] = torch.arange(3).float().reshape(-1, 1)
>>> new_g = transform(g) >>> new_g = transform(g)
>>> print(new_g.edges()) >>> print(new_g.edges())
(tensor([1, 0, 1, 2]), tensor([2, 0, 1, 2])) (tensor([0, 0, 2, 0, 1, 2]), tensor([2, 1, 0, 0, 1, 2]))
>>> print(new_g.edata['he'])
tensor([[0.],
[1.],
[2.],
[2.],
[1.],
[0.]])
Case2: Add self-loops for a heterogeneous graph Case2: Add self-loops for a heterogeneous graph
>>> transform = AddSelfLoop(fill_data='sum')
>>> g = dgl.heterograph({ >>> g = dgl.heterograph({
... ('user', 'plays', 'game'): ([0], [1]), ... ('user', 'follows', 'user'): (torch.tensor([1, 2]),
... ('user', 'follows', 'user'): ([1], [2]) ... torch.tensor([0, 1])),
... }) ... ('user', 'plays', 'game'): (torch.tensor([0, 1]),
... torch.tensor([0, 1]))})
>>> g.edata['feat'] = {('user', 'follows', 'user'): torch.randn(2, 5),
... ('user', 'plays', 'game'): torch.randn(2, 5)}
>>> g.edata['feat1'] = {('user', 'follows', 'user'): torch.randn(2, 15),
... ('user', 'plays', 'game'): torch.randn(2, 15)}
>>> new_g = transform(g) >>> new_g = transform(g)
>>> print(new_g.edges(etype='plays')) >>> print(new_g.edges(etype='plays'))
(tensor([0]), tensor([1])) (tensor([0, 1]), tensor([0, 1]))
>>> print(new_g.edges(etype='follows')) >>> print(new_g.edges(etype='follows'))
(tensor([1, 0, 1, 2]), tensor([2, 0, 1, 2])) (tensor([1, 2, 0, 1, 2]), tensor([0, 1, 0, 1, 2]))
>>> print(new_g.edata['feat'][('user', 'follows', 'user')].shape)
torch.Size([5, 5])
Case3: Add self-etypes for a heterogeneous graph Case3: Add self-etypes for a heterogeneous graph
...@@ -451,9 +478,12 @@ class AddSelfLoop(BaseTransform): ...@@ -451,9 +478,12 @@ class AddSelfLoop(BaseTransform):
>>> print(new_g.edges(etype=('game', 'self', 'game'))) >>> print(new_g.edges(etype=('game', 'self', 'game')))
(tensor([0, 1]), tensor([0, 1])) (tensor([0, 1]), tensor([0, 1]))
""" """
def __init__(self, allow_duplicate=False, new_etypes=False):
def __init__(self, allow_duplicate=False, new_etypes=False, edge_feat_names=None, fill_data=1.):
self.allow_duplicate = allow_duplicate self.allow_duplicate = allow_duplicate
self.new_etypes = new_etypes self.new_etypes = new_etypes
self.edge_feat_names = edge_feat_names
self.fill_data = fill_data
def transform_etype(self, c_etype, g): def transform_etype(self, c_etype, g):
r""" r"""
...@@ -480,7 +510,8 @@ class AddSelfLoop(BaseTransform): ...@@ -480,7 +510,8 @@ class AddSelfLoop(BaseTransform):
if not self.allow_duplicate: if not self.allow_duplicate:
g = functional.remove_self_loop(g, etype=c_etype) g = functional.remove_self_loop(g, etype=c_etype)
return functional.add_self_loop(g, etype=c_etype) return functional.add_self_loop(g, edge_feat_names=self.edge_feat_names,
fill_data=self.fill_data, etype=c_etype)
def __call__(self, g): def __call__(self, g):
for c_etype in g.canonical_etypes: for c_etype in g.canonical_etypes:
...@@ -501,6 +532,7 @@ class AddSelfLoop(BaseTransform): ...@@ -501,6 +532,7 @@ class AddSelfLoop(BaseTransform):
data_dict[c_etype] = g.edges(etype=c_etype) data_dict[c_etype] = g.edges(etype=c_etype)
g = update_graph_structure(g, data_dict) g = update_graph_structure(g, data_dict)
return g return g
class RemoveSelfLoop(BaseTransform): class RemoveSelfLoop(BaseTransform):
......
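A hedged usage sketch of the extended transform, forwarding ``edge_feat_names`` and ``fill_data`` to :func:`add_self_loop` (feature name is illustrative):

import torch
import dgl
from dgl.transforms import AddSelfLoop

g = dgl.graph((torch.tensor([0, 0, 2]), torch.tensor([2, 1, 0])))
g.edata['he'] = torch.arange(3).float().reshape(-1, 1)
# Aggregate each node's incoming 'he' values ('mean') onto its new self-loop edge.
transform = AddSelfLoop(edge_feat_names=['he'], fill_data='mean')
new_g = transform(g)
print(new_g.edata['he'])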
...@@ -549,11 +549,13 @@ COOMatrix CSRRowWiseSampling( ...@@ -549,11 +549,13 @@ COOMatrix CSRRowWiseSampling(
CSRMatrix mat, IdArray rows, int64_t num_samples, FloatArray prob, bool replace) { CSRMatrix mat, IdArray rows, int64_t num_samples, FloatArray prob, bool replace) {
COOMatrix ret; COOMatrix ret;
if (IsNullArray(prob)) { if (IsNullArray(prob)) {
ATEN_CSR_SWITCH_CUDA_UVA(mat, rows, XPU, IdType, "CSRRowWiseSampling", { ATEN_CSR_SWITCH_CUDA_UVA(mat, rows, XPU, IdType, "CSRRowWiseSamplingUniform", {
ret = impl::CSRRowWiseSamplingUniform<XPU, IdType>(mat, rows, num_samples, replace); ret = impl::CSRRowWiseSamplingUniform<XPU, IdType>(mat, rows, num_samples, replace);
}); });
} else { } else {
ATEN_CSR_SWITCH(mat, XPU, IdType, "CSRRowWiseSampling", { // prob is pinned and rows on GPU is valid
CHECK_VALID_CONTEXT(prob, rows);
ATEN_CSR_SWITCH_CUDA_UVA(mat, rows, XPU, IdType, "CSRRowWiseSampling", {
ATEN_FLOAT_TYPE_SWITCH(prob->dtype, FloatType, "probability", { ATEN_FLOAT_TYPE_SWITCH(prob->dtype, FloatType, "probability", {
ret = impl::CSRRowWiseSampling<XPU, IdType, FloatType>( ret = impl::CSRRowWiseSampling<XPU, IdType, FloatType>(
mat, rows, num_samples, prob, replace); mat, rows, num_samples, prob, replace);
......
...@@ -7,13 +7,11 @@ ...@@ -7,13 +7,11 @@
#ifndef DGL_ARRAY_CUDA_DGL_CUB_CUH_ #ifndef DGL_ARRAY_CUDA_DGL_CUB_CUH_
#define DGL_ARRAY_CUDA_DGL_CUB_CUH_ #define DGL_ARRAY_CUDA_DGL_CUB_CUH_
// include cub in a safe manner // This should be defined in CMakeLists.txt
#define CUB_NS_PREFIX namespace dgl { #ifndef THRUST_CUB_WRAPPED_NAMESPACE
#define CUB_NS_POSTFIX } static_assert(false, "THRUST_CUB_WRAPPED_NAMESPACE must be defined for DGL.");
#define CUB_NS_QUALIFIER ::dgl::cub #endif
#include "cub/cub.cuh" #include "cub/cub.cuh"
#undef CUB_NS_QUALIFIER
#undef CUB_NS_POSTFIX
#undef CUB_NS_PREFIX
#endif #endif
/*! /*!
* Copyright (c) 2021 by Contributors * Copyright (c) 2021 by Contributors
* \file array/cuda/rowwise_sampling.cu * \file array/cuda/rowwise_sampling.cu
* \brief rowwise sampling * \brief uniform rowwise sampling
*/ */
#include <dgl/random.h> #include <dgl/random.h>
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
#include "../../array/cuda/atomic.cuh" #include "../../array/cuda/atomic.cuh"
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
using namespace dgl::aten::cuda; using namespace dgl::aten::cuda;
namespace dgl { namespace dgl {
...@@ -21,7 +22,7 @@ namespace impl { ...@@ -21,7 +22,7 @@ namespace impl {
namespace { namespace {
constexpr int CTA_SIZE = 128; constexpr int BLOCK_SIZE = 128;
/** /**
* @brief Compute the size of each row in the sampled CSR, without replacement. * @brief Compute the size of each row in the sampled CSR, without replacement.
...@@ -41,14 +42,14 @@ __global__ void _CSRRowWiseSampleDegreeKernel( ...@@ -41,14 +42,14 @@ __global__ void _CSRRowWiseSampleDegreeKernel(
const IdType * const in_rows, const IdType * const in_rows,
const IdType * const in_ptr, const IdType * const in_ptr,
IdType * const out_deg) { IdType * const out_deg) {
const int tIdx = threadIdx.x + blockIdx.x*blockDim.x; const int tIdx = threadIdx.x + blockIdx.x * blockDim.x;
if (tIdx < num_rows) { if (tIdx < num_rows) {
const int in_row = in_rows[tIdx]; const int in_row = in_rows[tIdx];
const int out_row = tIdx; const int out_row = tIdx;
out_deg[out_row] = min(static_cast<IdType>(num_picks), in_ptr[in_row+1]-in_ptr[in_row]); out_deg[out_row] = min(static_cast<IdType>(num_picks), in_ptr[in_row + 1] - in_ptr[in_row]);
if (out_row == num_rows-1) { if (out_row == num_rows - 1) {
// make the prefixsum work // make the prefixsum work
out_deg[num_rows] = 0; out_deg[num_rows] = 0;
} }
...@@ -73,19 +74,19 @@ __global__ void _CSRRowWiseSampleDegreeReplaceKernel( ...@@ -73,19 +74,19 @@ __global__ void _CSRRowWiseSampleDegreeReplaceKernel(
const IdType * const in_rows, const IdType * const in_rows,
const IdType * const in_ptr, const IdType * const in_ptr,
IdType * const out_deg) { IdType * const out_deg) {
const int tIdx = threadIdx.x + blockIdx.x*blockDim.x; const int tIdx = threadIdx.x + blockIdx.x * blockDim.x;
if (tIdx < num_rows) { if (tIdx < num_rows) {
const int64_t in_row = in_rows[tIdx]; const int64_t in_row = in_rows[tIdx];
const int64_t out_row = tIdx; const int64_t out_row = tIdx;
if (in_ptr[in_row+1]-in_ptr[in_row] == 0) { if (in_ptr[in_row + 1] - in_ptr[in_row] == 0) {
out_deg[out_row] = 0; out_deg[out_row] = 0;
} else { } else {
out_deg[out_row] = static_cast<IdType>(num_picks); out_deg[out_row] = static_cast<IdType>(num_picks);
} }
if (out_row == num_rows-1) { if (out_row == num_rows - 1) {
// make the prefixsum work // make the prefixsum work
out_deg[num_rows] = 0; out_deg[num_rows] = 0;
} }
...@@ -93,11 +94,10 @@ __global__ void _CSRRowWiseSampleDegreeReplaceKernel( ...@@ -93,11 +94,10 @@ __global__ void _CSRRowWiseSampleDegreeReplaceKernel(
} }
/** /**
* @brief Perform row-wise sampling on a CSR matrix, and generate a COO matrix, * @brief Perform row-wise uniform sampling on a CSR matrix,
* without replacement. * and generate a COO matrix, without replacement.
* *
* @tparam IdType The ID type used for matrices. * @tparam IdType The ID type used for matrices.
* @tparam BLOCK_CTAS The number of rows each thread block runs in parallel.
* @tparam TILE_SIZE The number of rows covered by each threadblock. * @tparam TILE_SIZE The number of rows covered by each threadblock.
* @param rand_seed The random seed to use. * @param rand_seed The random seed to use.
* @param num_picks The number of non-zeros to pick per row. * @param num_picks The number of non-zeros to pick per row.
...@@ -111,8 +111,8 @@ __global__ void _CSRRowWiseSampleDegreeReplaceKernel( ...@@ -111,8 +111,8 @@ __global__ void _CSRRowWiseSampleDegreeReplaceKernel(
* @param out_cols The columns of the output COO (output). * @param out_cols The columns of the output COO (output).
* @param out_idxs The data array of the output COO (output). * @param out_idxs The data array of the output COO (output).
*/ */
template<typename IdType, int BLOCK_CTAS, int TILE_SIZE> template<typename IdType, int TILE_SIZE>
__global__ void _CSRRowWiseSampleKernel( __global__ void _CSRRowWiseSampleUniformKernel(
const uint64_t rand_seed, const uint64_t rand_seed,
const int64_t num_picks, const int64_t num_picks,
const int64_t num_rows, const int64_t num_rows,
...@@ -125,68 +125,62 @@ __global__ void _CSRRowWiseSampleKernel( ...@@ -125,68 +125,62 @@ __global__ void _CSRRowWiseSampleKernel(
IdType * const out_cols, IdType * const out_cols,
IdType * const out_idxs) { IdType * const out_idxs) {
// we assign one warp per row // we assign one warp per row
assert(blockDim.x == CTA_SIZE); assert(blockDim.x == BLOCK_SIZE);
int64_t out_row = blockIdx.x*TILE_SIZE+threadIdx.y; int64_t out_row = blockIdx.x * TILE_SIZE;
const int64_t last_row = min(static_cast<int64_t>(blockIdx.x+1)*TILE_SIZE, num_rows); const int64_t last_row = min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
curandStatePhilox4_32_10_t rng; curandStatePhilox4_32_10_t rng;
curand_init((rand_seed*gridDim.x+blockIdx.x)*blockDim.y+threadIdx.y, threadIdx.x, 0, &rng); curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
while (out_row < last_row) { while (out_row < last_row) {
const int64_t row = in_rows[out_row]; const int64_t row = in_rows[out_row];
const int64_t in_row_start = in_ptr[row]; const int64_t in_row_start = in_ptr[row];
const int64_t deg = in_ptr[row+1] - in_row_start; const int64_t deg = in_ptr[row + 1] - in_row_start;
const int64_t out_row_start = out_ptr[out_row]; const int64_t out_row_start = out_ptr[out_row];
if (deg <= num_picks) { if (deg <= num_picks) {
// just copy row // just copy the row when there are not enough nodes to sample.
for (int idx = threadIdx.x; idx < deg; idx += CTA_SIZE) { for (int idx = threadIdx.x; idx < deg; idx += BLOCK_SIZE) {
const IdType in_idx = in_row_start+idx; const IdType in_idx = in_row_start + idx;
out_rows[out_row_start+idx] = row; out_rows[out_row_start + idx] = row;
out_cols[out_row_start+idx] = in_index[in_idx]; out_cols[out_row_start + idx] = in_index[in_idx];
out_idxs[out_row_start+idx] = data ? data[in_idx] : in_idx; out_idxs[out_row_start + idx] = data ? data[in_idx] : in_idx;
} }
} else { } else {
// generate permutation list via reservoir algorithm // generate permutation list via reservoir algorithm
for (int idx = threadIdx.x; idx < num_picks; idx+=CTA_SIZE) { for (int idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
out_idxs[out_row_start+idx] = idx; out_idxs[out_row_start + idx] = idx;
} }
__syncthreads(); __syncthreads();
for (int idx = num_picks+threadIdx.x; idx < deg; idx+=CTA_SIZE) { for (int idx = num_picks + threadIdx.x; idx < deg; idx += BLOCK_SIZE) {
const int num = curand(&rng)%(idx+1); const int num = curand(&rng) % (idx + 1);
if (num < num_picks) { if (num < num_picks) {
// use max so as to achieve the replacement order the serial // use max so as to achieve the replacement order the serial
// algorithm would have // algorithm would have
AtomicMax(out_idxs+out_row_start+num, idx); AtomicMax(out_idxs + out_row_start + num, idx);
} }
} }
__syncthreads(); __syncthreads();
// copy permutation over // copy permutation over
for (int idx = threadIdx.x; idx < num_picks; idx += CTA_SIZE) { for (int idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
const IdType perm_idx = out_idxs[out_row_start+idx]+in_row_start; const IdType perm_idx = out_idxs[out_row_start + idx] + in_row_start;
out_rows[out_row_start+idx] = row; out_rows[out_row_start + idx] = row;
out_cols[out_row_start+idx] = in_index[perm_idx]; out_cols[out_row_start + idx] = in_index[perm_idx];
if (data) { out_idxs[out_row_start + idx] = data ? data[perm_idx] : perm_idx;
out_idxs[out_row_start+idx] = data[perm_idx];
}
} }
} }
out_row += 1;
out_row += BLOCK_CTAS;
} }
} }
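The without-replacement branch above builds the pick permutation with a reservoir scheme, using ``AtomicMax`` so that concurrent threads converge to the order a serial pass would produce. A serial Python reference of the same idea (illustration only, not DGL code):

import random

def reservoir_pick(deg, num_picks, rng=random):
    """Return num_picks positions out of range(deg), mirroring the kernel's reservoir step."""
    out_idxs = list(range(num_picks))          # start with the first num_picks positions
    for idx in range(num_picks, deg):
        num = rng.randrange(idx + 1)           # curand(&rng) % (idx + 1) in the kernel
        if num < num_picks:
            # The kernel uses AtomicMax here; in a serial pass the max equals
            # the latest idx that landed in slot `num`.
            out_idxs[num] = max(out_idxs[num], idx)
    return out_idxs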
/** /**
* @brief Perform row-wise sampling on a CSR matrix, and generate a COO matrix, * @brief Perform row-wise uniform sampling on a CSR matrix,
* with replacement. * and generate a COO matrix, with replacement.
* *
* @tparam IdType The ID type used for matrices. * @tparam IdType The ID type used for matrices.
* @tparam BLOCK_CTAS The number of rows each thread block runs in parallel.
* @tparam TILE_SIZE The number of rows covered by each threadblock. * @tparam TILE_SIZE The number of rows covered by each threadblock.
* @param rand_seed The random seed to use. * @param rand_seed The random seed to use.
* @param num_picks The number of non-zeros to pick per row. * @param num_picks The number of non-zeros to pick per row.
...@@ -200,8 +194,8 @@ __global__ void _CSRRowWiseSampleKernel( ...@@ -200,8 +194,8 @@ __global__ void _CSRRowWiseSampleKernel(
* @param out_cols The columns of the output COO (output). * @param out_cols The columns of the output COO (output).
* @param out_idxs The data array of the output COO (output). * @param out_idxs The data array of the output COO (output).
*/ */
template<typename IdType, int BLOCK_CTAS, int TILE_SIZE> template<typename IdType, int TILE_SIZE>
__global__ void _CSRRowWiseSampleReplaceKernel( __global__ void _CSRRowWiseSampleUniformReplaceKernel(
const uint64_t rand_seed, const uint64_t rand_seed,
const int64_t num_picks, const int64_t num_picks,
const int64_t num_rows, const int64_t num_rows,
...@@ -214,39 +208,37 @@ __global__ void _CSRRowWiseSampleReplaceKernel( ...@@ -214,39 +208,37 @@ __global__ void _CSRRowWiseSampleReplaceKernel(
IdType * const out_cols, IdType * const out_cols,
IdType * const out_idxs) { IdType * const out_idxs) {
// we assign one warp per row // we assign one warp per row
assert(blockDim.x == CTA_SIZE); assert(blockDim.x == BLOCK_SIZE);
int64_t out_row = blockIdx.x*TILE_SIZE+threadIdx.y; int64_t out_row = blockIdx.x * TILE_SIZE;
const int64_t last_row = min(static_cast<int64_t>(blockIdx.x+1)*TILE_SIZE, num_rows); const int64_t last_row = min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
curandStatePhilox4_32_10_t rng; curandStatePhilox4_32_10_t rng;
curand_init((rand_seed*gridDim.x+blockIdx.x)*blockDim.y+threadIdx.y, threadIdx.x, 0, &rng); curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
while (out_row < last_row) { while (out_row < last_row) {
const int64_t row = in_rows[out_row]; const int64_t row = in_rows[out_row];
const int64_t in_row_start = in_ptr[row]; const int64_t in_row_start = in_ptr[row];
const int64_t out_row_start = out_ptr[out_row]; const int64_t out_row_start = out_ptr[out_row];
const int64_t deg = in_ptr[row + 1] - in_row_start;
const int64_t deg = in_ptr[row+1] - in_row_start;
if (deg > 0) { if (deg > 0) {
// each thread then blindly copies in rows only if deg > 0. // each thread then blindly copies in rows only if deg > 0.
for (int idx = threadIdx.x; idx < num_picks; idx += CTA_SIZE) { for (int idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
const int64_t edge = curand(&rng) % deg; const int64_t edge = curand(&rng) % deg;
const int64_t out_idx = out_row_start+idx; const int64_t out_idx = out_row_start + idx;
out_rows[out_idx] = row; out_rows[out_idx] = row;
out_cols[out_idx] = in_index[in_row_start+edge]; out_cols[out_idx] = in_index[in_row_start + edge];
out_idxs[out_idx] = data ? data[in_row_start+edge] : in_row_start+edge; out_idxs[out_idx] = data ? data[in_row_start + edge] : in_row_start + edge;
} }
} }
out_row += BLOCK_CTAS; out_row += 1;
} }
} }
} // namespace } // namespace
/////////////////////////////// CSR ///////////////////////////////
///////////////////////////// CSR sampling //////////////////////////
template <DLDeviceType XPU, typename IdType> template <DLDeviceType XPU, typename IdType>
COOMatrix CSRRowWiseSamplingUniform(CSRMatrix mat, COOMatrix CSRRowWiseSamplingUniform(CSRMatrix mat,
...@@ -277,22 +269,26 @@ COOMatrix CSRRowWiseSamplingUniform(CSRMatrix mat, ...@@ -277,22 +269,26 @@ COOMatrix CSRRowWiseSamplingUniform(CSRMatrix mat,
// compute degree // compute degree
IdType * out_deg = static_cast<IdType*>( IdType * out_deg = static_cast<IdType*>(
device->AllocWorkspace(ctx, (num_rows+1)*sizeof(IdType))); device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
if (replace) { if (replace) {
const dim3 block(512); const dim3 block(512);
const dim3 grid((num_rows+block.x-1)/block.x); const dim3 grid((num_rows + block.x - 1) / block.x);
_CSRRowWiseSampleDegreeReplaceKernel<<<grid, block, 0, stream>>>( CUDA_KERNEL_CALL(
_CSRRowWiseSampleDegreeReplaceKernel,
grid, block, 0, stream,
num_picks, num_rows, slice_rows, in_ptr, out_deg); num_picks, num_rows, slice_rows, in_ptr, out_deg);
} else { } else {
const dim3 block(512); const dim3 block(512);
const dim3 grid((num_rows+block.x-1)/block.x); const dim3 grid((num_rows + block.x - 1) / block.x);
_CSRRowWiseSampleDegreeKernel<<<grid, block, 0, stream>>>( CUDA_KERNEL_CALL(
_CSRRowWiseSampleDegreeKernel,
grid, block, 0, stream,
num_picks, num_rows, slice_rows, in_ptr, out_deg); num_picks, num_rows, slice_rows, in_ptr, out_deg);
} }
// fill out_ptr // fill out_ptr
IdType * out_ptr = static_cast<IdType*>( IdType * out_ptr = static_cast<IdType*>(
device->AllocWorkspace(ctx, (num_rows+1)*sizeof(IdType))); device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
size_t prefix_temp_size = 0; size_t prefix_temp_size = 0;
CUDA_CALL(cub::DeviceScan::ExclusiveSum(nullptr, prefix_temp_size, CUDA_CALL(cub::DeviceScan::ExclusiveSum(nullptr, prefix_temp_size,
out_deg, out_deg,
...@@ -314,24 +310,25 @@ COOMatrix CSRRowWiseSamplingUniform(CSRMatrix mat, ...@@ -314,24 +310,25 @@ COOMatrix CSRRowWiseSamplingUniform(CSRMatrix mat,
// TODO(dlasalle): use pinned memory to overlap with the actual sampling, and wait on // TODO(dlasalle): use pinned memory to overlap with the actual sampling, and wait on
// a cudaevent // a cudaevent
IdType new_len; IdType new_len;
device->CopyDataFromTo(out_ptr, num_rows*sizeof(new_len), &new_len, 0, device->CopyDataFromTo(out_ptr, num_rows * sizeof(new_len), &new_len, 0,
sizeof(new_len), sizeof(new_len),
ctx, ctx,
DGLContext{kDLCPU, 0}, DGLContext{kDLCPU, 0},
mat.indptr->dtype, mat.indptr->dtype,
stream); stream);
CUDA_CALL(cudaEventRecord(copyEvent, stream)); CUDA_CALL(cudaEventRecord(copyEvent, stream));
const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000); const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
// select edges // select edges
if (replace) { // the number of rows each thread block will cover
constexpr int BLOCK_CTAS = 128/CTA_SIZE; constexpr int TILE_SIZE = 128 / BLOCK_SIZE;
// the number of rows each thread block will cover if (replace) { // with replacement
constexpr int TILE_SIZE = BLOCK_CTAS; const dim3 block(BLOCK_SIZE);
const dim3 block(CTA_SIZE, BLOCK_CTAS); const dim3 grid((num_rows + TILE_SIZE - 1) / TILE_SIZE);
const dim3 grid((num_rows+TILE_SIZE-1)/TILE_SIZE); CUDA_KERNEL_CALL(
_CSRRowWiseSampleReplaceKernel<IdType, BLOCK_CTAS, TILE_SIZE><<<grid, block, 0, stream>>>( (_CSRRowWiseSampleUniformReplaceKernel<IdType, TILE_SIZE>),
grid, block, 0, stream,
random_seed, random_seed,
num_picks, num_picks,
num_rows, num_rows,
...@@ -343,13 +340,12 @@ COOMatrix CSRRowWiseSamplingUniform(CSRMatrix mat, ...@@ -343,13 +340,12 @@ COOMatrix CSRRowWiseSamplingUniform(CSRMatrix mat,
out_rows, out_rows,
out_cols, out_cols,
out_idxs); out_idxs);
} else { } else { // without replacement
constexpr int BLOCK_CTAS = 128/CTA_SIZE; const dim3 block(BLOCK_SIZE);
// the number of rows each thread block will cover const dim3 grid((num_rows + TILE_SIZE - 1) / TILE_SIZE);
constexpr int TILE_SIZE = BLOCK_CTAS; CUDA_KERNEL_CALL(
const dim3 block(CTA_SIZE, BLOCK_CTAS); (_CSRRowWiseSampleUniformKernel<IdType, TILE_SIZE>),
const dim3 grid((num_rows+TILE_SIZE-1)/TILE_SIZE); grid, block, 0, stream,
_CSRRowWiseSampleKernel<IdType, BLOCK_CTAS, TILE_SIZE><<<grid, block, 0, stream>>>(
random_seed, random_seed,
num_picks, num_picks,
num_rows, num_rows,
......
/*!
* Copyright (c) 2022 by Contributors
* \file array/cuda/rowwise_sampling_prob.cu
* \brief weighted rowwise sampling. The degree computing kernels and
* host-side functions are partially borrowed from the uniform rowwise
* sampling code rowwise_sampling.cu.
* \author pengqirong (OPPO), dlasalle and Xin from Nvidia.
*/
#include <dgl/random.h>
#include <dgl/runtime/device_api.h>
#include <curand_kernel.h>
#include <numeric>
#include "./dgl_cub.cuh"
#include "../../array/cuda/atomic.cuh"
#include "../../runtime/cuda/cuda_common.h"
// require CUB 1.17 to use DeviceSegmentedSort
static_assert(CUB_VERSION >= 101700, "Require CUB >= 1.17 to use DeviceSegmentedSort");
using namespace dgl::aten::cuda;
namespace dgl {
namespace aten {
namespace impl {
namespace {
constexpr int BLOCK_SIZE = 128;
/**
* @brief Compute the size of each row in the sampled CSR, without replacement.
* temp_deg is calculated for rows with deg > num_picks.
* For these rows, we will calculate their A-Res values and sort them to get top-num_picks.
*
* @tparam IdType The type of node and edge indexes.
* @param num_picks The number of non-zero entries to pick per row.
* @param num_rows The number of rows to pick.
* @param in_rows The set of rows to pick.
* @param in_ptr The index where each row's edges start.
* @param out_deg The size of each row in the sampled matrix, as indexed by `in_rows` (output).
* @param temp_deg The size of each row in the input matrix, as indexed by `in_rows` (output).
*/
template<typename IdType>
__global__ void _CSRRowWiseSampleDegreeKernel(
const int64_t num_picks,
const int64_t num_rows,
const IdType * const in_rows,
const IdType * const in_ptr,
IdType * const out_deg,
IdType * const temp_deg) {
const int64_t tIdx = threadIdx.x + blockIdx.x * blockDim.x;
if (tIdx < num_rows) {
const int64_t in_row = in_rows[tIdx];
const int64_t out_row = tIdx;
const IdType deg = in_ptr[in_row + 1] - in_ptr[in_row];
// temp_deg is used to generate ares_ptr
temp_deg[out_row] = deg > static_cast<IdType>(num_picks) ? deg : 0;
out_deg[out_row] = min(static_cast<IdType>(num_picks), deg);
if (out_row == num_rows - 1) {
// make the prefixsum work
out_deg[num_rows] = 0;
temp_deg[num_rows] = 0;
}
}
}
/**
* @brief Compute the size of each row in the sampled CSR, with replacement.
* We need the actual in degree of each row to store CDF values.
*
* @tparam IdType The type of node and edge indexes.
* @param num_picks The number of non-zero entries to pick per row.
* @param num_rows The number of rows to pick.
* @param in_rows The set of rows to pick.
* @param in_ptr The index where each row's edges start.
* @param out_deg The size of each row in the sampled matrix, as indexed by `in_rows` (output).
* @param temp_deg The size of each row in the input matrix, as indexed by `in_rows` (output).
*/
template<typename IdType>
__global__ void _CSRRowWiseSampleDegreeReplaceKernel(
const int64_t num_picks,
const int64_t num_rows,
const IdType * const in_rows,
const IdType * const in_ptr,
IdType * const out_deg,
IdType * const temp_deg) {
const int64_t tIdx = threadIdx.x + blockIdx.x * blockDim.x;
if (tIdx < num_rows) {
const int64_t in_row = in_rows[tIdx];
const int64_t out_row = tIdx;
const IdType deg = in_ptr[in_row + 1] - in_ptr[in_row];
temp_deg[out_row] = deg;
out_deg[out_row] = deg == 0 ? 0 : static_cast<IdType>(num_picks);
if (out_row == num_rows - 1) {
// make the prefixsum work
out_deg[num_rows] = 0;
temp_deg[num_rows] = 0;
}
}
}
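Both degree kernels write a trailing zero at ``out_deg[num_rows]`` so that the subsequent exclusive prefix sum over ``num_rows + 1`` entries yields the per-row output offsets and, in its last slot, the total number of sampled entries. A NumPy sketch of that step (values are illustrative):

import numpy as np

out_deg = np.array([2, 0, 3, 2, 0], dtype=np.int64)       # 4 rows plus the padding zero
out_ptr = np.cumsum(np.concatenate(([0], out_deg[:-1])))  # exclusive prefix sum
# out_ptr == [0, 2, 2, 5, 7]; out_ptr[-1] is the total number of picked entries.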
/**
* @brief Equivalent to numpy expression: array[idx[off:off + len]]
*
* @tparam IdType The ID type used for indices.
* @tparam FloatType The float type used for array values.
* @param array The array to be selected.
* @param idx_data The index mapping array.
* @param idx The index of the value to be selected.
* @param offset The offset to start.
* @param out The selected value (output).
*/
template<typename IdType, typename FloatType>
__device__ void _DoubleSlice(
const FloatType * const array,
const IdType * const idx_data,
const IdType idx,
const IdType offset,
FloatType* const out) {
if (idx_data) {
*out = array[idx_data[offset + idx]];
} else {
*out = array[offset + idx];
}
}
/**
* @brief Compute A-Res value. A-Res value needs to be calculated only if deg
* is greater than num_picks in weighted rowwise sampling without replacement.
*
* @tparam IdType The ID type used for matrices.
* @tparam FloatType The Float type used for matrices.
* @tparam TILE_SIZE The number of rows covered by each threadblock.
* @param rand_seed The random seed to use.
* @param num_picks The number of non-zeros to pick per row.
* @param num_rows The number of rows to pick.
* @param in_rows The set of rows to pick.
* @param in_ptr The indptr array of the input CSR.
* @param data The data array of the input CSR.
* @param prob The probability array of the input CSR.
* @param ares_ptr The offset to write each row to in the A-res array.
* @param ares_idxs The A-Res value corresponding index array, the index of input CSR (output).
* @param ares The A-Res value array (output).
* @author pengqirong (OPPO)
*/
template<typename IdType, typename FloatType, int TILE_SIZE>
__global__ void _CSRAResValueKernel(
const uint64_t rand_seed,
const int64_t num_picks,
const int64_t num_rows,
const IdType * const in_rows,
const IdType * const in_ptr,
const IdType * const data,
const FloatType * const prob,
const IdType * const ares_ptr,
IdType * const ares_idxs,
FloatType * const ares) {
int64_t out_row = blockIdx.x * TILE_SIZE;
const int64_t last_row = min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
curandStatePhilox4_32_10_t rng;
curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
while (out_row < last_row) {
const int64_t row = in_rows[out_row];
const int64_t in_row_start = in_ptr[row];
const int64_t deg = in_ptr[row + 1] - in_row_start;
// A-Res value needs to be calculated only if deg is greater than num_picks
// in weighted rowwise sampling without replacement
if (deg > num_picks) {
const int64_t ares_row_start = ares_ptr[out_row];
for (int64_t idx = threadIdx.x; idx < deg; idx += BLOCK_SIZE) {
const int64_t in_idx = in_row_start + idx;
const int64_t ares_idx = ares_row_start + idx;
FloatType item_prob;
_DoubleSlice<IdType, FloatType>(prob, data, idx, in_row_start, &item_prob);
// compute A-Res value
ares[ares_idx] = static_cast<FloatType>(__powf(curand_uniform(&rng), 1.0f / item_prob));
ares_idxs[ares_idx] = static_cast<IdType>(in_idx);
}
}
out_row += 1;
}
}
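The kernel above computes A-Res keys (u ** (1 / w) for a uniform random u) only for rows whose degree exceeds ``num_picks``; sorting each row's keys and keeping the top ``num_picks`` gives a weighted sample without replacement. A NumPy reference of the per-row idea (illustration only, not DGL code):

import numpy as np

def a_res_row(prob, num_picks, rng=None):
    """Weighted sampling without replacement for one row with positive weights `prob`."""
    rng = rng or np.random.default_rng()
    if len(prob) <= num_picks:
        return np.arange(len(prob))            # degenerate case: keep the whole row
    u = rng.random(len(prob))
    keys = u ** (1.0 / np.asarray(prob, dtype=np.float64))   # A-Res key per neighbor
    return np.argsort(-keys)[:num_picks]       # indices of the top-num_picks keys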
/**
* @brief Perform weighted row-wise sampling on a CSR matrix, and generate a COO matrix,
* without replacement. After sorting, we select top-num_picks items.
*
* @tparam IdType The ID type used for matrices.
* @tparam FloatType The Float type used for matrices.
* @tparam TILE_SIZE The number of rows covered by each threadblock.
* @param num_picks The number of non-zeros to pick per row.
* @param num_rows The number of rows to pick.
* @param in_rows The set of rows to pick.
* @param in_ptr The indptr array of the input CSR.
* @param in_cols The columns array of the input CSR.
* @param data The data array of the input CSR.
* @param out_ptr The offset to write each row to in the output COO.
* @param ares_ptr The offset to write each row to in the ares array.
 * @param sort_ares_idxs The input-CSR indices sorted by descending A-Res value.
* @param out_rows The rows of the output COO (output).
* @param out_cols The columns of the output COO (output).
* @param out_idxs The data array of the output COO (output).
* @author pengqirong (OPPO)
*/
template<typename IdType, typename FloatType, int TILE_SIZE>
__global__ void _CSRRowWiseSampleKernel(
const int64_t num_picks,
const int64_t num_rows,
const IdType * const in_rows,
const IdType * const in_ptr,
const IdType * const in_cols,
const IdType * const data,
const IdType * const out_ptr,
const IdType * const ares_ptr,
const IdType * const sort_ares_idxs,
IdType * const out_rows,
IdType * const out_cols,
IdType * const out_idxs) {
// we assign one warp per row
assert(blockDim.x == BLOCK_SIZE);
int64_t out_row = blockIdx.x * TILE_SIZE;
const int64_t last_row = min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
while (out_row < last_row) {
const int64_t row = in_rows[out_row];
const int64_t in_row_start = in_ptr[row];
const int64_t out_row_start = out_ptr[out_row];
const int64_t deg = in_ptr[row + 1] - in_row_start;
if (deg > num_picks) {
const int64_t ares_row_start = ares_ptr[out_row];
for (int64_t idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
        // get in and out index; in_idx is the input-CSR index of one of the
        // top num_picks A-Res values.
const int64_t out_idx = out_row_start + idx;
const int64_t ares_idx = ares_row_start + idx;
const int64_t in_idx = sort_ares_idxs[ares_idx];
// copy permutation over
out_rows[out_idx] = static_cast<IdType>(row);
out_cols[out_idx] = in_cols[in_idx];
out_idxs[out_idx] = static_cast<IdType>(data ? data[in_idx] : in_idx);
}
} else {
for (int64_t idx = threadIdx.x; idx < deg; idx += BLOCK_SIZE) {
// get in and out index
const int64_t out_idx = out_row_start + idx;
const int64_t in_idx = in_row_start + idx;
// copy permutation over
out_rows[out_idx] = static_cast<IdType>(row);
out_cols[out_idx] = in_cols[in_idx];
out_idxs[out_idx] = static_cast<IdType>(data ? data[in_idx] : in_idx);
}
}
out_row += 1;
}
}
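/*
 * Illustration (hypothetical numbers): with num_picks = 2 and a row of degree 4 whose A-Res keys
 * sort to input-CSR indices {7, 3, 9, 6}, the kernel above emits the entries at indices 7 and 3
 * for that row; a row with deg <= num_picks is copied verbatim by the else-branch.
 */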
// A stateful callback functor that maintains a running prefix to be applied
// during consecutive scan operations.
template<typename FloatType>
struct BlockPrefixCallbackOp {
// Running prefix
FloatType running_total;
// Constructor
__device__ BlockPrefixCallbackOp(FloatType running_total) : running_total(running_total) {}
// Callback operator to be entered by the first warp of threads in the block.
// Thread-0 is responsible for returning a value for seeding the block-wide scan.
__device__ FloatType operator()(FloatType block_aggregate) {
FloatType old_prefix = running_total;
running_total += block_aggregate;
return old_prefix;
}
};
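// Illustration (hypothetical numbers): scanning {1, 2, 3, 4} with a 2-thread block takes two
// passes. Pass 1 computes the inclusive sums {1, 3}; the callback returns the initial prefix 0
// and records the block aggregate 3. Pass 2 over {3, 4} is seeded with that prefix and yields
// {6, 10}, i.e. the inclusive scan of the full sequence.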
/**
 * @brief Perform weighted row-wise sampling on a CSR matrix with replacement,
 * and generate a COO matrix. The unnormalized CDF of each row's neighbors is stored
 * in global memory, and a binary search over it (inverse transform sampling) picks
 * each selected item.
*
* @tparam IdType The ID type used for matrices.
* @tparam FloatType The Float type used for matrices.
* @tparam TILE_SIZE The number of rows covered by each threadblock.
* @param rand_seed The random seed to use.
* @param num_picks The number of non-zeros to pick per row.
* @param num_rows The number of rows to pick.
* @param in_rows The set of rows to pick.
* @param in_ptr The indptr array of the input CSR.
* @param in_cols The columns array of the input CSR.
* @param data The data array of the input CSR.
* @param prob The probability array of the input CSR.
* @param out_ptr The offset to write each row to in the output COO.
* @param cdf_ptr The offset of each cdf segment.
* @param cdf The global buffer to store cdf segments.
* @param out_rows The rows of the output COO (output).
* @param out_cols The columns of the output COO (output).
* @param out_idxs The data array of the output COO (output).
* @author pengqirong (OPPO)
*/
template<typename IdType, typename FloatType, int TILE_SIZE>
__global__ void _CSRRowWiseSampleReplaceKernel(
const uint64_t rand_seed,
const int64_t num_picks,
const int64_t num_rows,
const IdType * const in_rows,
const IdType * const in_ptr,
const IdType * const in_cols,
const IdType * const data,
const FloatType * const prob,
const IdType * const out_ptr,
const IdType * const cdf_ptr,
FloatType * const cdf,
IdType * const out_rows,
IdType * const out_cols,
IdType * const out_idxs
) {
// we assign one warp per row
assert(blockDim.x == BLOCK_SIZE);
int64_t out_row = blockIdx.x * TILE_SIZE;
const int64_t last_row = min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
curandStatePhilox4_32_10_t rng;
curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
while (out_row < last_row) {
const int64_t row = in_rows[out_row];
const int64_t in_row_start = in_ptr[row];
const int64_t out_row_start = out_ptr[out_row];
const int64_t cdf_row_start = cdf_ptr[out_row];
const int64_t deg = in_ptr[row + 1] - in_row_start;
const FloatType MIN_THREAD_DATA = static_cast<FloatType>(0.0f);
if (deg > 0) {
// Specialize BlockScan for a 1D block of BLOCK_SIZE threads
typedef cub::BlockScan<FloatType, BLOCK_SIZE> BlockScan;
// Allocate shared memory for BlockScan
__shared__ typename BlockScan::TempStorage temp_storage;
// Initialize running total
BlockPrefixCallbackOp<FloatType> prefix_op(MIN_THREAD_DATA);
int64_t max_iter = (1 + (deg - 1) / BLOCK_SIZE) * BLOCK_SIZE;
// Have the block iterate over segments of items
for (int64_t idx = threadIdx.x; idx < max_iter; idx += BLOCK_SIZE) {
// Load a segment of consecutive items that are blocked across threads
FloatType thread_data;
if (idx < deg)
_DoubleSlice<IdType, FloatType>(prob, data, idx, in_row_start, &thread_data);
else
thread_data = MIN_THREAD_DATA;
thread_data = max(thread_data, MIN_THREAD_DATA);
// Collectively compute the block-wide inclusive prefix sum
BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, prefix_op);
__syncthreads();
// Store scanned items to cdf array
if (idx < deg) {
cdf[cdf_row_start + idx] = thread_data;
}
}
__syncthreads();
for (int64_t idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
// get random value
FloatType sum = cdf[cdf_row_start + deg - 1];
FloatType rand = static_cast<FloatType>(curand_uniform(&rng) * sum);
// get the offset of the first value within cdf array which is greater than random value.
int64_t item = cub::UpperBound<FloatType*, int64_t, FloatType>(
&cdf[cdf_row_start], deg, rand);
item = min(item, deg - 1);
// get in and out index
const int64_t in_idx = in_row_start + item;
const int64_t out_idx = out_row_start + idx;
// copy permutation over
out_rows[out_idx] = static_cast<IdType>(row);
out_cols[out_idx] = in_cols[in_idx];
out_idxs[out_idx] = static_cast<IdType>(data ? data[in_idx] : in_idx);
}
}
out_row += 1;
}
}
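/*
 * A rough host-side sketch of the inverse transform sampling the kernel above performs with
 * cub::UpperBound on the per-row CDF (an illustration only, assuming plain C++/STL; not part of
 * this file's build):
 *
 *   #include <algorithm>
 *   #include <random>
 *   #include <vector>
 *
 *   size_t SampleFromCdf(const std::vector<double>& cdf, std::mt19937_64* rng) {
 *     // cdf is the inclusive prefix sum of the (unnormalized) weights, e.g. {0.2, 1.0, 1.5}.
 *     std::uniform_real_distribution<double> uni(0.0, cdf.back());
 *     const double r = uni(*rng);
 *     // first position whose CDF value exceeds r, clamped like the kernel above
 *     auto it = std::upper_bound(cdf.begin(), cdf.end(), r);
 *     return std::min<size_t>(it - cdf.begin(), cdf.size() - 1);
 *   }
 */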
} // namespace
/////////////////////////////// CSR ///////////////////////////////
/**
* @brief Perform weighted row-wise sampling on a CSR matrix, and generate a COO matrix.
* Use CDF sampling algorithm for with replacement:
* 1) Calculate the CDF of all neighbor's prob.
* 2) For each [0, num_picks), generate a rand ~ U(0, 1).
* Use binary search to find its index in the CDF array as a chosen item.
* Use A-Res sampling algorithm for without replacement:
* 1) For rows with deg > num_picks, calculate A-Res values for all neighbors.
* 2) Sort the A-Res array and select top-num_picks as chosen items.
*
* @tparam XPU The device type used for matrices.
* @tparam IdType The ID type used for matrices.
* @tparam FloatType The Float type used for matrices.
* @param mat The CSR matrix.
* @param rows The set of rows to pick.
* @param num_picks The number of non-zeros to pick per row.
* @param prob The probability array of the input CSR.
 * @param replace Whether to sample with replacement.
* @author pengqirong (OPPO), dlasalle and Xin from Nvidia.
*/
template <DLDeviceType XPU, typename IdType, typename FloatType>
COOMatrix CSRRowWiseSampling(CSRMatrix mat,
IdArray rows,
int64_t num_picks,
FloatArray prob,
bool replace) {
const auto& ctx = rows->ctx;
auto device = runtime::DeviceAPI::Get(ctx);
// TODO(dlasalle): Once the device api supports getting the stream from the
// context, that should be used instead of the default stream here.
cudaStream_t stream = 0;
const int64_t num_rows = rows->shape[0];
const IdType * const slice_rows = static_cast<const IdType*>(rows->data);
IdArray picked_row = NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8);
IdArray picked_col = NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8);
IdArray picked_idx = NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8);
const IdType * const in_ptr = static_cast<const IdType*>(mat.indptr->data);
const IdType * const in_cols = static_cast<const IdType*>(mat.indices->data);
IdType* const out_rows = static_cast<IdType*>(picked_row->data);
IdType* const out_cols = static_cast<IdType*>(picked_col->data);
IdType* const out_idxs = static_cast<IdType*>(picked_idx->data);
const IdType* const data = CSRHasData(mat) ?
static_cast<IdType*>(mat.data->data) : nullptr;
const FloatType* const prob_data = static_cast<const FloatType*>(prob->data);
// compute degree
// out_deg: the size of each row in the sampled matrix
// temp_deg: the size of each row we will manipulate in sampling
// 1) for w/o replacement: in degree if it's greater than num_picks else 0
// 2) for w/ replacement: in degree
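  // Illustration (hypothetical numbers): with num_picks = 2 and row degrees {1, 3, 5},
  //   w/o replacement: out_deg = {1, 2, 2}, temp_deg = {0, 3, 5};
  //   w/  replacement: out_deg = {2, 2, 2}, temp_deg = {1, 3, 5}.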
IdType * out_deg = static_cast<IdType*>(
device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
IdType * temp_deg = static_cast<IdType*>(
device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
if (replace) {
const dim3 block(512);
const dim3 grid((num_rows + block.x - 1) / block.x);
CUDA_KERNEL_CALL(
_CSRRowWiseSampleDegreeReplaceKernel,
grid, block, 0, stream,
num_picks, num_rows, slice_rows, in_ptr, out_deg, temp_deg);
} else {
const dim3 block(512);
const dim3 grid((num_rows + block.x - 1) / block.x);
CUDA_KERNEL_CALL(
_CSRRowWiseSampleDegreeKernel,
grid, block, 0, stream,
num_picks, num_rows, slice_rows, in_ptr, out_deg, temp_deg);
}
// fill temp_ptr
IdType * temp_ptr = static_cast<IdType*>(
device->AllocWorkspace(ctx, (num_rows + 1)*sizeof(IdType)));
size_t prefix_temp_size = 0;
CUDA_CALL(cub::DeviceScan::ExclusiveSum(nullptr, prefix_temp_size,
temp_deg,
temp_ptr,
num_rows + 1,
stream));
void * prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
CUDA_CALL(cub::DeviceScan::ExclusiveSum(prefix_temp, prefix_temp_size,
temp_deg,
temp_ptr,
num_rows + 1,
stream));
device->FreeWorkspace(ctx, prefix_temp);
device->FreeWorkspace(ctx, temp_deg);
// TODO(Xin): The copy here is too small, and the overhead of creating
// cuda events cannot be ignored. Just use synchronized copy.
IdType temp_len;
device->CopyDataFromTo(temp_ptr, num_rows * sizeof(temp_len), &temp_len, 0,
sizeof(temp_len),
ctx,
DGLContext{kDLCPU, 0},
mat.indptr->dtype,
stream);
device->StreamSync(ctx, stream);
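  // Illustration (hypothetical numbers): for the w/o-replacement case above, an exclusive sum of
  // temp_deg = {0, 3, 5, *} gives temp_ptr = {0, 0, 3, 8} and temp_len = temp_ptr[num_rows] = 8,
  // i.e. the total scratch space needed for the A-Res (or CDF) segments.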
// fill out_ptr
IdType * out_ptr = static_cast<IdType*>(
device->AllocWorkspace(ctx, (num_rows+1)*sizeof(IdType)));
prefix_temp_size = 0;
CUDA_CALL(cub::DeviceScan::ExclusiveSum(nullptr, prefix_temp_size,
out_deg,
out_ptr,
num_rows+1,
stream));
prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
CUDA_CALL(cub::DeviceScan::ExclusiveSum(prefix_temp, prefix_temp_size,
out_deg,
out_ptr,
num_rows+1,
stream));
device->FreeWorkspace(ctx, prefix_temp);
device->FreeWorkspace(ctx, out_deg);
cudaEvent_t copyEvent;
CUDA_CALL(cudaEventCreate(&copyEvent));
// TODO(dlasalle): use pinned memory to overlap with the actual sampling, and wait on
// a cudaevent
IdType new_len;
device->CopyDataFromTo(out_ptr, num_rows * sizeof(new_len), &new_len, 0,
sizeof(new_len),
ctx,
DGLContext{kDLCPU, 0},
mat.indptr->dtype,
stream);
CUDA_CALL(cudaEventRecord(copyEvent, stream));
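  // The `new_len` copy is issued asynchronously and only waited on via `copyEvent` near the end
  // of this function, after the sampling kernels have been launched, so the device-to-host
  // transfer can overlap with the remaining setup and kernel launches.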
// allocate workspace
// 1) for w/ replacement, it's a global buffer to store cdf segments (one segment for each row).
// 2) for w/o replacement, it's used to store a-res segments (one segment for
// each row with degree > num_picks)
FloatType * temp = static_cast<FloatType*>(
device->AllocWorkspace(ctx, temp_len * sizeof(FloatType)));
const uint64_t rand_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
// select edges
// the number of rows each thread block will cover
constexpr int TILE_SIZE = 128 / BLOCK_SIZE;
if (replace) { // with replacement.
const dim3 block(BLOCK_SIZE);
const dim3 grid((num_rows + TILE_SIZE - 1) / TILE_SIZE);
CUDA_KERNEL_CALL(
(_CSRRowWiseSampleReplaceKernel<IdType, FloatType, TILE_SIZE>),
grid, block, 0, stream,
rand_seed,
num_picks,
num_rows,
slice_rows,
in_ptr,
in_cols,
data,
prob_data,
out_ptr,
temp_ptr,
temp,
out_rows,
out_cols,
out_idxs);
device->FreeWorkspace(ctx, temp);
} else { // without replacement
IdType* temp_idxs = static_cast<IdType*>(
device->AllocWorkspace(ctx, (temp_len) * sizeof(IdType)));
// Compute A-Res value. A-Res value needs to be calculated only if deg
// is greater than num_picks in weighted rowwise sampling without replacement.
const dim3 block(BLOCK_SIZE);
const dim3 grid((num_rows + TILE_SIZE - 1) / TILE_SIZE);
CUDA_KERNEL_CALL(
(_CSRAResValueKernel<IdType, FloatType, TILE_SIZE>),
grid, block, 0, stream,
rand_seed,
num_picks,
num_rows,
slice_rows,
in_ptr,
data,
prob_data,
temp_ptr,
temp_idxs,
temp);
// sort A-Res value array.
FloatType* sort_temp = static_cast<FloatType*>(
device->AllocWorkspace(ctx, temp_len * sizeof(FloatType)));
IdType* sort_temp_idxs = static_cast<IdType*>(
device->AllocWorkspace(ctx, temp_len * sizeof(IdType)));
cub::DoubleBuffer<FloatType> sort_keys(temp, sort_temp);
cub::DoubleBuffer<IdType> sort_values(temp_idxs, sort_temp_idxs);
void *d_temp_storage = nullptr;
size_t temp_storage_bytes = 0;
CUDA_CALL(cub::DeviceSegmentedSort::SortPairsDescending(
d_temp_storage,
temp_storage_bytes,
sort_keys,
sort_values,
temp_len,
num_rows,
temp_ptr,
temp_ptr + 1));
d_temp_storage = device->AllocWorkspace(ctx, temp_storage_bytes);
CUDA_CALL(cub::DeviceSegmentedSort::SortPairsDescending(
d_temp_storage,
temp_storage_bytes,
sort_keys,
sort_values,
temp_len,
num_rows,
temp_ptr,
temp_ptr + 1));
device->FreeWorkspace(ctx, d_temp_storage);
device->FreeWorkspace(ctx, temp);
device->FreeWorkspace(ctx, temp_idxs);
device->FreeWorkspace(ctx, sort_temp);
device->FreeWorkspace(ctx, sort_temp_idxs);
    // select the top num_picks items of each row as results
CUDA_KERNEL_CALL(
(_CSRRowWiseSampleKernel<IdType, FloatType, TILE_SIZE>),
grid, block, 0, stream,
num_picks,
num_rows,
slice_rows,
in_ptr,
in_cols,
data,
out_ptr,
temp_ptr,
sort_values.Current(),
out_rows,
out_cols,
out_idxs);
}
device->FreeWorkspace(ctx, temp_ptr);
device->FreeWorkspace(ctx, out_ptr);
// wait for copying `new_len` to finish
CUDA_CALL(cudaEventSynchronize(copyEvent));
CUDA_CALL(cudaEventDestroy(copyEvent));
picked_row = picked_row.CreateView({new_len}, picked_row->dtype);
picked_col = picked_col.CreateView({new_len}, picked_col->dtype);
picked_idx = picked_idx.CreateView({new_len}, picked_idx->dtype);
return COOMatrix(mat.num_rows, mat.num_cols, picked_row,
picked_col, picked_idx);
}
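// Example call (a sketch; `csr`, `seed_rows` and `edge_prob` are hypothetical names and are
// assumed to already reside on the GPU with matching id/float types):
//   COOMatrix sub = CSRRowWiseSampling<kDLGPU, int64_t, float>(
//       csr, seed_rows, /*num_picks=*/5, edge_prob, /*replace=*/false);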
template COOMatrix CSRRowWiseSampling<kDLGPU, int32_t, float>(
CSRMatrix, IdArray, int64_t, FloatArray, bool);
template COOMatrix CSRRowWiseSampling<kDLGPU, int64_t, float>(
CSRMatrix, IdArray, int64_t, FloatArray, bool);
template COOMatrix CSRRowWiseSampling<kDLGPU, int32_t, double>(
CSRMatrix, IdArray, int64_t, FloatArray, bool);
template COOMatrix CSRRowWiseSampling<kDLGPU, int64_t, double>(
CSRMatrix, IdArray, int64_t, FloatArray, bool);
} // namespace impl
} // namespace aten
} // namespace dgl
...@@ -9,39 +9,6 @@ ...@@ -9,39 +9,6 @@
namespace dgl { namespace dgl {
namespace aten { namespace aten {
/*!
* \brief CUDA implementation of g-SDDMM on heterograph using
Csr format.
*/
template <int XPU, typename IdType, int bits>
void SDDMMCsrHetero(const std::string& op,
const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr,
const std::vector<NDArray>& vec_lhs,
const std::vector<NDArray>& vec_rhs,
std::vector<NDArray> vec_out,
int lhs_target,
int rhs_target,
const std::vector<dgl_type_t>& lhs_eid,
const std::vector<dgl_type_t>& rhs_eid) {
SWITCH_BITS(bits, DType, {
SWITCH_OP(op, Op, {
SWITCH_TARGET(lhs_target, rhs_target, LhsTarget, RhsTarget, {
/* Call SDDMM CUDA kernel for each relation type sequentially */
for (dgl_type_t etype = 0; etype < lhs_eid.size(); ++etype) {
CSRMatrix csr = vec_csr[etype];
NDArray lhs = vec_lhs[lhs_eid[etype]];
NDArray rhs = vec_rhs[rhs_eid[etype]];
NDArray out = vec_out[etype];
cuda::SDDMMCsr<IdType, DType, Op, LhsTarget, RhsTarget>(
bcast, csr, lhs, rhs, out);
}
});
});
});
}
/*! /*!
* \brief CUDA implementation of g-SDDMM on heterograph using * \brief CUDA implementation of g-SDDMM on heterograph using
Csr format. Csr format.
...@@ -76,49 +43,6 @@ void SDDMMCooHetero(const std::string& op, ...@@ -76,49 +43,6 @@ void SDDMMCooHetero(const std::string& op,
} }
template void SDDMMCsrHetero<kDLGPU, int32_t, 16>(
const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr,
const std::vector<NDArray>& lhs, const std::vector<NDArray>& rhs,
std::vector<NDArray> out, int lhs_target, int rhs_target,
const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
template void SDDMMCsrHetero<kDLGPU, int64_t, 16>(
const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr,
const std::vector<NDArray>& lhs, const std::vector<NDArray>& rhs,
std::vector<NDArray> out, int lhs_target, int rhs_target,
const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
template void SDDMMCsrHetero<kDLGPU, int32_t, 32>(
const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr,
const std::vector<NDArray>& lhs, const std::vector<NDArray>& rhs,
std::vector<NDArray> out, int lhs_target, int rhs_target,
const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
template void SDDMMCsrHetero<kDLGPU, int64_t, 32>(
const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr,
const std::vector<NDArray>& lhs, const std::vector<NDArray>& rhs,
std::vector<NDArray> out, int lhs_target, int rhs_target,
const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
template void SDDMMCsrHetero<kDLGPU, int32_t, 64>(
const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr,
const std::vector<NDArray>& lhs, const std::vector<NDArray>& rhs,
std::vector<NDArray> out, int lhs_target, int rhs_target,
const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
template void SDDMMCsrHetero<kDLGPU, int64_t, 64>(
const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr,
const std::vector<NDArray>& lhs, const std::vector<NDArray>& rhs,
std::vector<NDArray> out, int lhs_target, int rhs_target,
const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
template void SDDMMCooHetero<kDLGPU, int32_t, 16>( template void SDDMMCooHetero<kDLGPU, int32_t, 16>(
const std::string& op, const BcastOff& bcast, const std::string& op, const BcastOff& bcast,
const std::vector<COOMatrix>& vec_coo, const std::vector<COOMatrix>& vec_coo,
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/sddmm.cu
* \brief SDDMM C APIs and definitions.
*/
#include <dgl/array.h>
#include "./sddmm.cuh"
namespace dgl {
namespace aten {
/*!
* \brief CUDA implementation of g-SDDMM on heterograph using
Csr format.
*/
template <int XPU, typename IdType, int bits>
void SDDMMCsrHetero(const std::string& op,
const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr,
const std::vector<NDArray>& vec_lhs,
const std::vector<NDArray>& vec_rhs,
std::vector<NDArray> vec_out,
int lhs_target,
int rhs_target,
const std::vector<dgl_type_t>& lhs_eid,
const std::vector<dgl_type_t>& rhs_eid) {
SWITCH_BITS(bits, DType, {
SWITCH_OP(op, Op, {
SWITCH_TARGET(lhs_target, rhs_target, LhsTarget, RhsTarget, {
/* Call SDDMM CUDA kernel for each relation type sequentially */
for (dgl_type_t etype = 0; etype < lhs_eid.size(); ++etype) {
CSRMatrix csr = vec_csr[etype];
NDArray lhs = vec_lhs[lhs_eid[etype]];
NDArray rhs = vec_rhs[rhs_eid[etype]];
NDArray out = vec_out[etype];
cuda::SDDMMCsr<IdType, DType, Op, LhsTarget, RhsTarget>(
bcast, csr, lhs, rhs, out);
}
});
});
});
}
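/*
 * Illustration (hypothetical relation): for op = "dot" with lhs_target = src and
 * rhs_target = dst, relation r reads the source-node features vec_lhs[lhs_eid[r]] and the
 * destination-node features vec_rhs[rhs_eid[r]], and writes one edge-feature array vec_out[r];
 * the loop above simply repeats this per canonical edge type.
 */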
template void SDDMMCsrHetero<kDLGPU, int32_t, 16>(
const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr,
const std::vector<NDArray>& lhs, const std::vector<NDArray>& rhs,
std::vector<NDArray> out, int lhs_target, int rhs_target,
const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
template void SDDMMCsrHetero<kDLGPU, int64_t, 16>(
const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr,
const std::vector<NDArray>& lhs, const std::vector<NDArray>& rhs,
std::vector<NDArray> out, int lhs_target, int rhs_target,
const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
template void SDDMMCsrHetero<kDLGPU, int32_t, 32>(
const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr,
const std::vector<NDArray>& lhs, const std::vector<NDArray>& rhs,
std::vector<NDArray> out, int lhs_target, int rhs_target,
const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
template void SDDMMCsrHetero<kDLGPU, int64_t, 32>(
const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr,
const std::vector<NDArray>& lhs, const std::vector<NDArray>& rhs,
std::vector<NDArray> out, int lhs_target, int rhs_target,
const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
template void SDDMMCsrHetero<kDLGPU, int32_t, 64>(
const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr,
const std::vector<NDArray>& lhs, const std::vector<NDArray>& rhs,
std::vector<NDArray> out, int lhs_target, int rhs_target,
const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
template void SDDMMCsrHetero<kDLGPU, int64_t, 64>(
const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr,
const std::vector<NDArray>& lhs, const std::vector<NDArray>& rhs,
std::vector<NDArray> out, int lhs_target, int rhs_target,
const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
} // namespace aten
} // namespace dgl
/*! /*!
* Copyright (c) 2021 by Contributors * Copyright (c) 2021-2022 by Contributors
* \file graph/sampling/randomwalk_gpu.cu * \file graph/sampling/randomwalk_gpu.cu
 * \brief DGL sampler * \brief CUDA random walk sampling
*/ */
#include <dgl/array.h> #include <dgl/array.h>
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
#include <utility> #include <utility>
#include <tuple> #include <tuple>
#include "../../../array/cuda/dgl_cub.cuh"
#include "../../../runtime/cuda/cuda_common.h" #include "../../../runtime/cuda/cuda_common.h"
#include "frequency_hashmap.cuh" #include "frequency_hashmap.cuh"
...@@ -89,6 +90,81 @@ __global__ void _RandomWalkKernel( ...@@ -89,6 +90,81 @@ __global__ void _RandomWalkKernel(
} }
} }
template <typename IdType, typename FloatType, int BLOCK_SIZE, int TILE_SIZE>
__global__ void _RandomWalkBiasedKernel(
const uint64_t rand_seed,
const IdType *seed_data,
const int64_t num_seeds,
const IdType *metapath_data,
const uint64_t max_num_steps,
const GraphKernelData<IdType> *graphs,
const FloatType **probs,
const FloatType **prob_sums,
const FloatType *restart_prob_data,
const int64_t restart_prob_size,
const int64_t max_nodes,
IdType *out_traces_data,
IdType *out_eids_data) {
assert(BLOCK_SIZE == blockDim.x);
int64_t idx = blockIdx.x * TILE_SIZE + threadIdx.x;
int64_t last_idx = min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_seeds);
int64_t trace_length = (max_num_steps + 1);
curandState rng;
// reference:
// https://docs.nvidia.com/cuda/curand/device-api-overview.html#performance-notes
curand_init(rand_seed + idx, 0, 0, &rng);
while (idx < last_idx) {
IdType curr = seed_data[idx];
assert(curr < max_nodes);
IdType *traces_data_ptr = &out_traces_data[idx * trace_length];
IdType *eids_data_ptr = &out_eids_data[idx * max_num_steps];
*(traces_data_ptr++) = curr;
int64_t step_idx;
for (step_idx = 0; step_idx < max_num_steps; ++step_idx) {
IdType metapath_id = metapath_data[step_idx];
const GraphKernelData<IdType> &graph = graphs[metapath_id];
const int64_t in_row_start = graph.in_ptr[curr];
const int64_t deg = graph.in_ptr[curr + 1] - graph.in_ptr[curr];
      if (deg == 0) {  // dead end: no out-neighbors to continue the walk
break;
}
// randomly select by weight
const FloatType *prob_sum = prob_sums[metapath_id];
const FloatType *prob = probs[metapath_id];
int64_t num;
if (prob == nullptr) {
num = curand(&rng) % deg;
} else {
auto rnd_sum_w = prob_sum[curr] * curand_uniform(&rng);
FloatType sum_w{0.};
        for (num = 0; num < deg; ++num) {
          sum_w += prob[in_row_start + num];
          if (sum_w >= rnd_sum_w) break;
        }
        // clamp in case floating-point round-off keeps sum_w below rnd_sum_w
        if (num == deg) num = deg - 1;
}
IdType pick = graph.in_cols[in_row_start + num];
IdType eid = (graph.data? graph.data[in_row_start + num] : in_row_start + num);
*traces_data_ptr = pick;
*eids_data_ptr = eid;
if ((restart_prob_size > 1) && (curand_uniform(&rng) < restart_prob_data[step_idx])) {
break;
} else if ((restart_prob_size == 1) && (curand_uniform(&rng) < restart_prob_data[0])) {
break;
}
++traces_data_ptr; ++eids_data_ptr;
curr = pick;
}
for (; step_idx < max_num_steps; ++step_idx) {
*(traces_data_ptr++) = -1;
*(eids_data_ptr++) = -1;
}
idx += BLOCK_SIZE;
}
}
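/*
 * A rough host-side sketch of the weighted next-step choice in the kernel above (an illustration
 * only, assuming plain C++/STL; not part of this file's build): draw r ~ U(0, W), where W is the
 * precomputed sum of the neighbor weights, then walk the weights until the running sum reaches r.
 *
 *   #include <random>
 *   #include <vector>
 *
 *   size_t PickNeighbor(const std::vector<double>& w, double w_sum, std::mt19937_64* rng) {
 *     std::uniform_real_distribution<double> uni(0.0, w_sum);
 *     const double r = uni(*rng);
 *     double acc = 0.0;
 *     size_t i = 0;
 *     for (; i + 1 < w.size(); ++i) {   // the last neighbor is picked if r is never reached
 *       acc += w[i];
 *       if (acc >= r) break;
 *     }
 *     return i;
 *   }
 */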
} // namespace } // namespace
// random walk for uniform choice // random walk for uniform choice
...@@ -167,6 +243,143 @@ std::pair<IdArray, IdArray> RandomWalkUniform( ...@@ -167,6 +243,143 @@ std::pair<IdArray, IdArray> RandomWalkUniform(
return std::make_pair(traces, eids); return std::make_pair(traces, eids);
} }
/**
* \brief Random walk for biased choice. We use inverse transform sampling to
* choose the next step.
*/
template <DLDeviceType XPU, typename FloatType, typename IdType>
std::pair<IdArray, IdArray> RandomWalkBiased(
const HeteroGraphPtr hg,
const IdArray seeds,
const TypeArray metapath,
const std::vector<FloatArray> &prob,
FloatArray restart_prob) {
const int64_t max_num_steps = metapath->shape[0];
const IdType *metapath_data = static_cast<IdType *>(metapath->data);
const int64_t begin_ntype = hg->meta_graph()->FindEdge(metapath_data[0]).first;
const int64_t max_nodes = hg->NumVertices(begin_ntype);
int64_t num_etypes = hg->NumEdgeTypes();
auto ctx = seeds->ctx;
const IdType *seed_data = static_cast<const IdType*>(seeds->data);
CHECK(seeds->ndim == 1) << "seeds shape is not one dimension.";
const int64_t num_seeds = seeds->shape[0];
int64_t trace_length = max_num_steps + 1;
IdArray traces = IdArray::Empty({num_seeds, trace_length}, seeds->dtype, ctx);
IdArray eids = IdArray::Empty({num_seeds, max_num_steps}, seeds->dtype, ctx);
IdType *traces_data = traces.Ptr<IdType>();
IdType *eids_data = eids.Ptr<IdType>();
cudaStream_t stream = 0;
auto device = DeviceAPI::Get(ctx);
// new probs and prob sums pointers
assert(num_etypes == static_cast<int64_t>(prob.size()));
std::unique_ptr<FloatType *[]> probs(new FloatType *[prob.size()]);
std::unique_ptr<FloatType *[]> prob_sums(new FloatType *[prob.size()]);
std::vector<FloatArray> prob_sums_arr;
prob_sums_arr.reserve(prob.size());
// graphs
std::vector<GraphKernelData<IdType>> h_graphs(num_etypes);
for (int64_t etype = 0; etype < num_etypes; ++etype) {
const CSRMatrix &csr = hg->GetCSRMatrix(etype);
h_graphs[etype].in_ptr = static_cast<const IdType*>(csr.indptr->data);
h_graphs[etype].in_cols = static_cast<const IdType*>(csr.indices->data);
h_graphs[etype].data = (CSRHasData(csr) ? static_cast<const IdType*>(csr.data->data) : nullptr);
int64_t num_segments = csr.indptr->shape[0] - 1;
// will handle empty probs in the kernel
if (IsNullArray(prob[etype])) {
probs[etype] = nullptr;
prob_sums[etype] = nullptr;
continue;
}
probs[etype] = prob[etype].Ptr<FloatType>();
prob_sums_arr.push_back(FloatArray::Empty({num_segments}, prob[etype]->dtype, ctx));
    prob_sums[etype] = prob_sums_arr.back().Ptr<FloatType>();
// calculate the sum of the neighbor weights
const IdType *d_offsets = static_cast<const IdType*>(csr.indptr->data);
size_t temp_storage_size = 0;
CUDA_CALL(cub::DeviceSegmentedReduce::Sum(nullptr, temp_storage_size,
probs[etype],
prob_sums[etype],
num_segments,
d_offsets,
d_offsets + 1));
void *temp_storage = device->AllocWorkspace(ctx, temp_storage_size);
CUDA_CALL(cub::DeviceSegmentedReduce::Sum(temp_storage, temp_storage_size,
probs[etype],
prob_sums[etype],
num_segments,
d_offsets,
d_offsets + 1));
device->FreeWorkspace(ctx, temp_storage);
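    // Illustration (hypothetical numbers): with indptr = {0, 2, 5} and
    // prob = {0.2, 0.8, 0.1, 0.1, 0.3}, the segmented reduction yields
    // prob_sums = {1.0, 0.5}, i.e. one unnormalized total per source node.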
}
// copy graph metadata pointers to GPU
auto d_graphs = static_cast<GraphKernelData<IdType>*>(
device->AllocWorkspace(ctx, (num_etypes) * sizeof(GraphKernelData<IdType>)));
device->CopyDataFromTo(h_graphs.data(), 0, d_graphs, 0,
(num_etypes) * sizeof(GraphKernelData<IdType>),
DGLContext{kDLCPU, 0},
ctx,
hg->GetCSRMatrix(0).indptr->dtype,
stream);
// copy probs pointers to GPU
const FloatType **probs_dev = static_cast<const FloatType **>(
device->AllocWorkspace(ctx, num_etypes * sizeof(FloatType *)));
device->CopyDataFromTo(probs.get(), 0, probs_dev, 0,
(num_etypes) * sizeof(FloatType *),
DGLContext{kDLCPU, 0},
ctx,
prob[0]->dtype,
stream);
// copy probs_sum pointers to GPU
const FloatType **prob_sums_dev = static_cast<const FloatType **>(
device->AllocWorkspace(ctx, num_etypes * sizeof(FloatType *)));
device->CopyDataFromTo(prob_sums.get(), 0, prob_sums_dev, 0,
(num_etypes) * sizeof(FloatType *),
DGLContext{kDLCPU, 0},
ctx,
prob[0]->dtype,
stream);
// copy metapath to GPU
auto d_metapath = metapath.CopyTo(ctx);
const IdType *d_metapath_data = static_cast<IdType *>(d_metapath->data);
constexpr int BLOCK_SIZE = 256;
constexpr int TILE_SIZE = BLOCK_SIZE * 4;
  dim3 block(BLOCK_SIZE);
dim3 grid((num_seeds + TILE_SIZE - 1) / TILE_SIZE);
const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
  CHECK(restart_prob->ctx.device_type == kDLGPU) << "restart_prob should be on the GPU.";
CHECK(restart_prob->ndim == 1) << "restart prob dimension should be 1.";
const FloatType *restart_prob_data = restart_prob.Ptr<FloatType>();
const int64_t restart_prob_size = restart_prob->shape[0];
CUDA_KERNEL_CALL(
(_RandomWalkBiasedKernel<IdType, FloatType, BLOCK_SIZE, TILE_SIZE>),
grid, block, 0, stream,
random_seed,
seed_data,
num_seeds,
d_metapath_data,
max_num_steps,
d_graphs,
probs_dev,
prob_sums_dev,
restart_prob_data,
restart_prob_size,
max_nodes,
traces_data,
eids_data);
device->FreeWorkspace(ctx, d_graphs);
device->FreeWorkspace(ctx, probs_dev);
device->FreeWorkspace(ctx, prob_sums_dev);
return std::make_pair(traces, eids);
}
template<DLDeviceType XPU, typename IdType> template<DLDeviceType XPU, typename IdType>
std::pair<IdArray, IdArray> RandomWalk( std::pair<IdArray, IdArray> RandomWalk(
const HeteroGraphPtr hg, const HeteroGraphPtr hg,
...@@ -174,16 +387,25 @@ std::pair<IdArray, IdArray> RandomWalk( ...@@ -174,16 +387,25 @@ std::pair<IdArray, IdArray> RandomWalk(
const TypeArray metapath, const TypeArray metapath,
const std::vector<FloatArray> &prob) { const std::vector<FloatArray> &prob) {
// not support no-uniform choice now bool isUniform = true;
for (const auto &etype_prob : prob) { for (const auto &etype_prob : prob) {
if (!IsNullArray(etype_prob)) { if (!IsNullArray(etype_prob)) {
LOG(FATAL) << "Non-uniform choice is not supported in GPU."; isUniform = false;
break;
} }
} }
auto restart_prob = NDArray::Empty( auto restart_prob = NDArray::Empty(
{0}, DLDataType{kDLFloat, 32, 1}, DGLContext{XPU, 0}); {0}, DLDataType{kDLFloat, 32, 1}, DGLContext{XPU, 0});
return RandomWalkUniform<XPU, IdType>(hg, seeds, metapath, restart_prob); if (!isUniform) {
std::pair<IdArray, IdArray> ret;
ATEN_FLOAT_TYPE_SWITCH(prob[0]->dtype, FloatType, "probability", {
ret = RandomWalkBiased<XPU, FloatType, IdType>(hg, seeds, metapath, prob, restart_prob);
});
return ret;
} else {
return RandomWalkUniform<XPU, IdType>(hg, seeds, metapath, restart_prob);
}
} }
template<DLDeviceType XPU, typename IdType> template<DLDeviceType XPU, typename IdType>
...@@ -194,12 +416,14 @@ std::pair<IdArray, IdArray> RandomWalkWithRestart( ...@@ -194,12 +416,14 @@ std::pair<IdArray, IdArray> RandomWalkWithRestart(
const std::vector<FloatArray> &prob, const std::vector<FloatArray> &prob,
double restart_prob) { double restart_prob) {
// not support no-uniform choice now bool isUniform = true;
for (const auto &etype_prob : prob) { for (const auto &etype_prob : prob) {
if (!IsNullArray(etype_prob)) { if (!IsNullArray(etype_prob)) {
LOG(FATAL) << "Non-uniform choice is not supported in GPU."; isUniform = false;
break;
} }
} }
auto device_ctx = seeds->ctx; auto device_ctx = seeds->ctx;
auto restart_prob_array = NDArray::Empty( auto restart_prob_array = NDArray::Empty(
{1}, DLDataType{kDLFloat, 64, 1}, device_ctx); {1}, DLDataType{kDLFloat, 64, 1}, device_ctx);
...@@ -214,7 +438,16 @@ std::pair<IdArray, IdArray> RandomWalkWithRestart( ...@@ -214,7 +438,16 @@ std::pair<IdArray, IdArray> RandomWalkWithRestart(
restart_prob_array->dtype, stream); restart_prob_array->dtype, stream);
device->StreamSync(device_ctx, stream); device->StreamSync(device_ctx, stream);
return RandomWalkUniform<XPU, IdType>(hg, seeds, metapath, restart_prob_array); if (!isUniform) {
std::pair<IdArray, IdArray> ret;
ATEN_FLOAT_TYPE_SWITCH(prob[0]->dtype, FloatType, "probability", {
ret = RandomWalkBiased<XPU, FloatType, IdType>(
hg, seeds, metapath, prob, restart_prob_array);
});
return ret;
} else {
return RandomWalkUniform<XPU, IdType>(hg, seeds, metapath, restart_prob_array);
}
} }
template<DLDeviceType XPU, typename IdType> template<DLDeviceType XPU, typename IdType>
...@@ -225,14 +458,23 @@ std::pair<IdArray, IdArray> RandomWalkWithStepwiseRestart( ...@@ -225,14 +458,23 @@ std::pair<IdArray, IdArray> RandomWalkWithStepwiseRestart(
const std::vector<FloatArray> &prob, const std::vector<FloatArray> &prob,
FloatArray restart_prob) { FloatArray restart_prob) {
// not support no-uniform choice now bool isUniform = true;
for (const auto &etype_prob : prob) { for (const auto &etype_prob : prob) {
if (!IsNullArray(etype_prob)) { if (!IsNullArray(etype_prob)) {
LOG(FATAL) << "Non-uniform choice is not supported in GPU."; isUniform = false;
break;
} }
} }
return RandomWalkUniform<XPU, IdType>(hg, seeds, metapath, restart_prob); if (!isUniform) {
std::pair<IdArray, IdArray> ret;
ATEN_FLOAT_TYPE_SWITCH(prob[0]->dtype, FloatType, "probability", {
ret = RandomWalkBiased<XPU, FloatType, IdType>(hg, seeds, metapath, prob, restart_prob);
});
return ret;
} else {
return RandomWalkUniform<XPU, IdType>(hg, seeds, metapath, restart_prob);
}
} }
template<DLDeviceType XPU, typename IdxType> template<DLDeviceType XPU, typename IdxType>
......
...@@ -44,9 +44,14 @@ void CheckRandomWalkInputs( ...@@ -44,9 +44,14 @@ void CheckRandomWalkInputs(
} }
for (uint64_t i = 0; i < prob.size(); ++i) { for (uint64_t i = 0; i < prob.size(); ++i) {
FloatArray p = prob[i]; FloatArray p = prob[i];
CHECK_EQ(hg->Context(), p->ctx) << "Expected prob (" << p->ctx << ")" << " to have the same " \
<< "context as graph (" << hg->Context() << ").";
CHECK_FLOAT(p, "probability"); CHECK_FLOAT(p, "probability");
if (p.GetSize() != 0) if (p.GetSize() != 0) {
CHECK_EQ(hg->IsPinned(), p.IsPinned())
<< "The prob array should have the same pinning status as the graph";
CHECK_NDIM(p, 1, "probability"); CHECK_NDIM(p, 1, "probability");
}
} }
} }
......
...@@ -29,13 +29,15 @@ DGL_REGISTER_GLOBAL("rng._CAPI_SetSeed") ...@@ -29,13 +29,15 @@ DGL_REGISTER_GLOBAL("rng._CAPI_SetSeed")
} }
}); });
#ifdef DGL_USE_CUDA #ifdef DGL_USE_CUDA
auto* thr_entry = CUDAThreadEntry::ThreadLocal(); if (DeviceAPI::Get(kDLGPU)->IsAvailable()) {
if (!thr_entry->curand_gen) { auto* thr_entry = CUDAThreadEntry::ThreadLocal();
CURAND_CALL(curandCreateGenerator(&thr_entry->curand_gen, CURAND_RNG_PSEUDO_DEFAULT)); if (!thr_entry->curand_gen) {
CURAND_CALL(curandCreateGenerator(&thr_entry->curand_gen, CURAND_RNG_PSEUDO_DEFAULT));
}
CURAND_CALL(curandSetPseudoRandomGeneratorSeed(
thr_entry->curand_gen,
static_cast<uint64_t>(seed)));
} }
CURAND_CALL(curandSetPseudoRandomGeneratorSeed(
thr_entry->curand_gen,
static_cast<uint64_t>(seed)));
#endif // DGL_USE_CUDA #endif // DGL_USE_CUDA
}); });
......
...@@ -64,14 +64,22 @@ bool TCPSocket::Bind(const char * ip, int port) { ...@@ -64,14 +64,22 @@ bool TCPSocket::Bind(const char * ip, int port) {
SAI sa_server; SAI sa_server;
sa_server.sin_family = AF_INET; sa_server.sin_family = AF_INET;
sa_server.sin_port = htons(port); sa_server.sin_port = htons(port);
int retval = 0; int ret = 0;
ret = inet_pton(AF_INET, ip, &sa_server.sin_addr);
if (ret == 0) {
LOG(ERROR) << "Invalid IP: " << ip;
return false;
} else if (ret < 0) {
LOG(ERROR) << "Failed to convert [" << ip
<< "] to binary form, error: " << strerror(errno);
return false;
}
do { // retry if EINTR failure appears do { // retry if EINTR failure appears
if (0 < inet_pton(AF_INET, ip, &sa_server.sin_addr) && if (0 <= (ret = bind(socket_, reinterpret_cast<SA *>(&sa_server),
0 <= (retval = bind(socket_, reinterpret_cast<SA*>(&sa_server), sizeof(sa_server)))) {
sizeof(sa_server)))) {
return true; return true;
} }
} while (retval == -1 && errno == EINTR); } while (ret == -1 && errno == EINTR);
LOG(ERROR) << "Failed bind on " << ip << ":" << port << " , error: " << strerror(errno); LOG(ERROR) << "Failed bind on " << ip << ":" << port << " , error: " << strerror(errno);
return false; return false;
......
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
* \brief GPU specific API * \brief GPU specific API
*/ */
#include <dgl/runtime/device_api.h> #include <dgl/runtime/device_api.h>
#include <dgl/runtime/tensordispatch.h>
#include <dmlc/thread_local.h> #include <dmlc/thread_local.h>
#include <dgl/runtime/registry.h> #include <dgl/runtime/registry.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
...@@ -15,6 +15,23 @@ namespace runtime { ...@@ -15,6 +15,23 @@ namespace runtime {
class CUDADeviceAPI final : public DeviceAPI { class CUDADeviceAPI final : public DeviceAPI {
public: public:
CUDADeviceAPI() {
int count;
auto err = cudaGetDeviceCount(&count);
switch (err) {
case cudaSuccess:
break;
default:
count = 0;
cudaGetLastError();
}
is_available_ = count > 0;
}
bool IsAvailable() final {
return is_available_;
}
void SetDevice(DGLContext ctx) final { void SetDevice(DGLContext ctx) final {
CUDA_CALL(cudaSetDevice(ctx.device_id)); CUDA_CALL(cudaSetDevice(ctx.device_id));
} }
...@@ -224,11 +241,21 @@ class CUDADeviceAPI final : public DeviceAPI { ...@@ -224,11 +241,21 @@ class CUDADeviceAPI final : public DeviceAPI {
} }
void* AllocWorkspace(DGLContext ctx, size_t size, DGLType type_hint) final { void* AllocWorkspace(DGLContext ctx, size_t size, DGLType type_hint) final {
return CUDAThreadEntry::ThreadLocal()->pool.AllocWorkspace(ctx, size); // Redirect to PyTorch's allocator when available.
SetDevice(ctx);
TensorDispatcher* td = TensorDispatcher::Global();
if (td->IsAvailable())
return td->AllocWorkspace(size);
else
return CUDAThreadEntry::ThreadLocal()->pool.AllocWorkspace(ctx, size);
} }
void FreeWorkspace(DGLContext ctx, void* data) final { void FreeWorkspace(DGLContext ctx, void* data) final {
CUDAThreadEntry::ThreadLocal()->pool.FreeWorkspace(ctx, data); TensorDispatcher* td = TensorDispatcher::Global();
if (td->IsAvailable())
td->FreeWorkspace(data);
else
CUDAThreadEntry::ThreadLocal()->pool.FreeWorkspace(ctx, data);
} }
static const std::shared_ptr<CUDADeviceAPI>& Global() { static const std::shared_ptr<CUDADeviceAPI>& Global() {
......