OpenDAS / dgl / Commits / e9b624fe

Commit e9b624fe (unverified), authored Aug 11, 2022 by Minjie Wang, committed by GitHub, Aug 11, 2022.

    Merge branch 'master' into dist_part

Parents: 8086d1ed, a88e7f7e
Changes: 167

Showing 20 changed files with 1312 additions and 248 deletions (+1312, -248).
python/dgl/nn/pytorch/hetero.py (+7, -0)
python/dgl/ops/edge_softmax.py (+1, -0)
python/dgl/readout.py (+20, -11)
python/dgl/sampling/neighbor.py (+3, -4)
python/dgl/sampling/pinsage.py (+9, -2)
python/dgl/sampling/randomwalks.py (+16, -11)
python/dgl/sparse.py (+3, -3)
python/dgl/transforms/functional.py (+56, -12)
python/dgl/transforms/module.py (+41, -9)
src/array/array.cc (+4, -2)
src/array/cuda/dgl_cub.cuh (+5, -7)
src/array/cuda/rowwise_sampling.cu (+81, -85)
src/array/cuda/rowwise_sampling_prob.cu (+668, -0)
src/array/cuda/sddmm_hetero_coo.cu (+0, -76)
src/array/cuda/sddmm_hetero_csr.cu (+88, -0)
src/graph/sampling/randomwalks/randomwalk_gpu.cu (+253, -11)
src/graph/sampling/randomwalks/randomwalks.cc (+6, -1)
src/random/random.cc (+8, -6)
src/rpc/network/tcp_socket.cc (+13, -5)
src/runtime/cuda/cuda_device_api.cc (+30, -3)
python/dgl/nn/pytorch/hetero.py

@@ -360,6 +360,13 @@ class HeteroEmbedding(nn.Module):
         """
         return {self.raw_keys[typ]: emb.weight for typ, emb in self.embeds.items()}
 
+    def reset_parameters(self):
+        """
+        Use the xavier method in nn.init module to make the parameters uniformly distributed
+        """
+        for typ in self.embeds.keys():
+            nn.init.xavier_uniform_(self.embeds[typ].weight)
+
     def forward(self, input_ids):
         """Forward function
 ...
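For context, a minimal usage sketch of the new method; it assumes the `HeteroEmbedding(num_embeddings_dict, embedding_dim)` constructor from `dgl.nn`, and the node-type counts below are made up for illustration:

    # Hypothetical sketch: re-initialize all per-type embedding tables
    # with Xavier-uniform weights, as the new reset_parameters() does.
    from dgl.nn import HeteroEmbedding

    emb = HeteroEmbedding({'user': 100, 'game': 50}, 16)  # sizes are illustrative
    emb.reset_parameters()  # applies nn.init.xavier_uniform_ to each type's weight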
python/dgl/ops/edge_softmax.py

@@ -135,6 +135,7 @@ def edge_softmax(graph, logits, eids=ALL, norm_by='dst'):
                                  eids=eids, norm_by=norm_by)
     else:
         logits_list = [None] * graph._graph.number_of_etypes()
+        logits = {graph.to_canonical_etype(k): v for k, v in logits.items()}
         for rel in graph.canonical_etypes:
             etid = graph.get_etype_id(rel)
             logits_list[etid] = logits[rel]
 ...
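The added line canonicalizes the keys of a per-edge-type logits dict before indexing by canonical edge type, so callers can key the dict by short edge type names. A hedged sketch of that code path; the graph and shapes below are illustrative, not from the diff:

    import torch
    import dgl
    from dgl.ops import edge_softmax

    g = dgl.heterograph({
        ('user', 'follows', 'user'): ([0, 1], [1, 2]),
        ('user', 'plays', 'game'): ([0, 1], [0, 1]),
    })
    # short etype names should now work: edge_softmax canonicalizes them
    logits = {'follows': torch.randn(2, 1), 'plays': torch.randn(2, 1)}
    scores = edge_softmax(g, logits)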
python/dgl/readout.py

 """Classes and functions for batching multiple graphs together."""
 from __future__ import absolute_import
 
-from .base import DGLError
+from .base import DGLError, dgl_warning
 from . import backend as F
 from .ops import segment
 ...

@@ -365,8 +365,9 @@ def broadcast_nodes(graph, graph_feat, *, ntype=None):
     graph : DGLGraph
         The graph.
     graph_feat : tensor
-        The feature to broadcast. Tensor shape is :math:`(*)` for single graph, and
-        :math:`(B, *)` for batched graph.
+        The feature to broadcast. Tensor shape is :math:`(B, *)` for batched graph,
+        where :math:`B` is the batch size.
     ntype : str, optional
         Node type. Can be omitted if there is only one node type.
 ...

@@ -403,9 +404,11 @@ def broadcast_nodes(graph, graph_feat, *, ntype=None):
             [0.2721, 0.4629, 0.7269, 0.0724, 0.1014],
             [0.2721, 0.4629, 0.7269, 0.0724, 0.1014]])
 
-    Broadcast feature to all nodes in the single graph.
+    Broadcast feature to all nodes in the single graph (the feature tensor shape
+    to broadcast should be :math:`(1, *)`).
 
-    >>> dgl.broadcast_nodes(g1, feat[0])
+    >>> feat0 = th.unsqueeze(feat[0], 0)
+    >>> dgl.broadcast_nodes(g1, feat0)
     tensor([[0.4325, 0.7710, 0.5541, 0.0544, 0.9368],
             [0.4325, 0.7710, 0.5541, 0.0544, 0.9368]])
 ...

@@ -413,7 +416,9 @@ def broadcast_nodes(graph, graph_feat, *, ntype=None):
     --------
     broadcast_edges
     """
-    if len(F.shape(graph_feat)) == 1:
+    if F.shape(graph_feat)[0] != graph.batch_size and graph.batch_size == 1:
+        dgl_warning('For a single graph, use a tensor of shape (1, *) for graph_feat.'
+                    ' The support of shape (*) will be deprecated.')
         graph_feat = F.unsqueeze(graph_feat, dim=0)
     return F.repeat(graph_feat, graph.batch_num_nodes(ntype), dim=0)
 ...

@@ -434,8 +439,8 @@ def broadcast_edges(graph, graph_feat, *, etype=None):
     graph : DGLGraph
         The graph.
     graph_feat : tensor
-        The feature to broadcast. Tensor shape is :math:`(*)` for single graph, and
-        :math:`(B, *)` for batched graph.
+        The feature to broadcast. Tensor shape is :math:`(B, *)` for batched graph,
+        where :math:`B` is the batch size.
     etype : str, tuple of str, optional
         Edge type. Can be omitted if there is only one edge type in the graph.
 ...

@@ -470,9 +475,11 @@ def broadcast_edges(graph, graph_feat, *, etype=None):
             [0.2721, 0.4629, 0.7269, 0.0724, 0.1014],
             [0.2721, 0.4629, 0.7269, 0.0724, 0.1014]])
 
-    Broadcast feature to all edges in the single graph.
+    Broadcast feature to all edges in the single graph (the feature tensor shape
+    to broadcast should be :math:`(1, *)`).
 
-    >>> dgl.broadcast_edges(g2, feat[1])
+    >>> feat1 = th.unsqueeze(feat[1], 0)
+    >>> dgl.broadcast_edges(g2, feat1)
     tensor([[0.2721, 0.4629, 0.7269, 0.0724, 0.1014],
             [0.2721, 0.4629, 0.7269, 0.0724, 0.1014]])
 ...

@@ -480,7 +487,9 @@ def broadcast_edges(graph, graph_feat, *, etype=None):
     --------
     broadcast_nodes
     """
-    if len(F.shape(graph_feat)) == 1:
+    if F.shape(graph_feat)[0] != graph.batch_size and graph.batch_size == 1:
+        dgl_warning('For a single graph, use a tensor of shape (1, *) for graph_feat.'
+                    ' The support of shape (*) will be deprecated.')
        graph_feat = F.unsqueeze(graph_feat, dim=0)
     return F.repeat(graph_feat, graph.batch_num_edges(etype), dim=0)
 ...
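Taken together, these hunks deprecate passing an unbatched feature of shape (*): a single graph now counts as a batch of size 1. A minimal sketch of the new calling convention (the graph and feature below are illustrative):

    import torch as th
    import dgl

    g = dgl.graph(([0, 1], [1, 2]))      # a single graph: batch_size == 1
    feat = th.randn(1, 5)                # shape (1, *) rather than (*)
    out = dgl.broadcast_nodes(g, feat)   # one row per node
    assert out.shape == (g.num_nodes(), 5)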
python/dgl/sampling/neighbor.py

@@ -54,8 +54,6 @@ def sample_etype_neighbors(g, nodes, etype_field, fanout, edge_dir='in', prob=No
         The features must be non-negative floats, and the sum of the features of
         inbound/outbound edges for every node must be positive (though they don't have
         to sum up to one). Otherwise, the result will be undefined.
-
-        If :attr:`prob` is not None, GPU sampling is not supported.
     replace : bool, optional
         If True, sample with replacement.
     copy_ndata: bool, optional
 ...

@@ -163,6 +161,9 @@ def sample_neighbors(g, nodes, fanout, edge_dir='in', prob=None, replace=False,
     Node/edge features are not preserved. The original IDs of
     the sampled edges are stored as the `dgl.EID` feature in the returned graph.
 
+    GPU sampling is supported for this function. Refer to :ref:`guide-minibatch-gpu-sampling`
+    for more details.
+
     Parameters
     ----------
     g : DGLGraph
 ...

@@ -193,8 +194,6 @@ def sample_neighbors(g, nodes, fanout, edge_dir='in', prob=None, replace=False,
         The features must be non-negative floats, and the sum of the features of
         inbound/outbound edges for every node must be positive (though they don't have
         to sum up to one). Otherwise, the result will be undefined.
-
-        If :attr:`prob` is not None, GPU sampling is not supported.
     exclude_edges: tensor or dict
         Edge IDs to exclude during sampling neighbors for the seed nodes.
 ...
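A minimal sketch of GPU neighbor sampling as now documented; it assumes a CUDA build of DGL and a visible GPU:

    import torch
    import dgl

    g = dgl.graph(([0, 1, 2, 3], [1, 2, 3, 0])).to('cuda')
    seeds = torch.tensor([0, 2], device='cuda')
    # uniform fanout-2 sampling on the GPU; the weighted (prob != None)
    # GPU path is what rowwise_sampling_prob.cu below implements
    sg = dgl.sampling.sample_neighbors(g, seeds, 2)
    print(sg.edata[dgl.EID])  # original IDs of the sampled edges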
python/dgl/sampling/pinsage.py

@@ -38,6 +38,9 @@ class RandomWalkNeighborSampler(object):
     This is a generalization of PinSAGE sampler which only works on bidirectional bipartite
     graphs.
 
+    UVA and GPU sampling are supported for this sampler.
+    Refer to :ref:`guide-minibatch-gpu-sampling` for more details.
+
     Parameters
     ----------
     G : DGLGraph
 ...

@@ -104,13 +107,14 @@ class RandomWalkNeighborSampler(object):
             A tensor of given node IDs of node type ``ntype`` to generate neighbors from. The
             node type ``ntype`` is the beginning and ending node type of the given metapath.
-            It must be on CPU and have the same dtype as the ID type of the graph.
+            It must be on the same device as the graph and have the same dtype
+            as the ID type of the graph.
 
         Returns
         -------
         g : DGLGraph
             A homogeneous graph constructed by selecting neighbors for each given node according
             to the algorithm above.
+            The returned graph is on CPU.
         """
         seed_nodes = utils.prepare_tensor(self.G, seed_nodes, 'seed_nodes')
         self.restart_prob = F.copy_to(self.restart_prob, F.context(seed_nodes))
 ...

@@ -147,6 +151,9 @@ class PinSAGESampler(RandomWalkNeighborSampler):
     The edges of the returned homogeneous graph will connect to the given nodes from their most
     commonly visited nodes, with a feature indicating the number of visits.
 
+    UVA and GPU sampling are supported for this sampler.
+    Refer to :ref:`guide-minibatch-gpu-sampling` for more details.
+
     Parameters
     ----------
     G : DGLGraph
 ...
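A hedged sketch of the new device contract for the PinSAGE-style samplers: seed nodes follow the graph's device (or sit on the GPU when the graph is pinned for UVA). The bipartite graph and sampler parameters below are illustrative:

    import torch
    import dgl

    hg = dgl.heterograph({
        ('item', 'bought-by', 'user'): ([0, 1, 2], [0, 0, 1]),
        ('user', 'bought', 'item'): ([0, 0, 1], [0, 1, 2]),
    }).to('cuda')
    sampler = dgl.sampling.PinSAGESampler(
        hg, 'item', 'user', num_traversals=3, termination_prob=0.5,
        num_random_walks=10, num_neighbors=2)
    seeds = torch.tensor([0, 1], device='cuda')  # same device as the graph
    frontier = sampler(seeds)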
python/dgl/sampling/randomwalks.py

@@ -30,6 +30,8 @@ def random_walk(g, nodes, *, metapath=None, length=None, prob=None, restart_prob
     If a random walk stops in advance, DGL pads the trace with -1 to have the same
     length.
 
+    This function supports graphs on GPU and UVA sampling.
+
     Parameters
     ----------
     g : DGLGraph
 ...

@@ -37,8 +39,9 @@ def random_walk(g, nodes, *, metapath=None, length=None, prob=None, restart_prob
     nodes : Tensor
         Node ID tensor from which the random walk traces starts.
-        The tensor must have the same dtype as the ID type
-        of the graph.
+        The tensor must have the same dtype as the ID type of the graph.
+        The tensor must be on the same device as the graph or
+        on the GPU when the graph is pinned (UVA sampling).
     metapath : list[str or tuple of str], optional
         Metapath, specified as a list of edge types.
 ...

@@ -60,12 +63,15 @@ def random_walk(g, nodes, *, metapath=None, length=None, prob=None, restart_prob
         must be positive for the outbound edges of all nodes (although they don't have
         to sum up to one). The result will be undefined otherwise.
+        The feature tensor must be on the same device as the graph.
 
         If omitted, DGL assumes that the neighbors are picked uniformly.
     restart_prob : float or Tensor, optional
         Probability to terminate the current trace before each transition.
-        If a tensor is given, :attr:`restart_prob` should have the same length as
-        :attr:`metapath` or :attr:`length`.
+        If a tensor is given, :attr:`restart_prob` should be on the same device as the graph
+        or on the GPU when the graph is pinned (UVA sampling),
+        and have the same length as :attr:`metapath` or :attr:`length`.
     return_eids : bool, optional
         If True, additionally return the edge IDs traversed.
 ...

@@ -176,19 +182,16 @@ def random_walk(g, nodes, *, metapath=None, length=None, prob=None, restart_prob
         metapath = F.to_dgl_nd(F.astype(F.tensor(metapath), g.idtype))
 
     # Load the probability tensor from the edge frames
+    ctx = utils.to_dgl_context(g.device)
     if prob is None:
-        p_nd = [nd.array([], ctx=nodes.ctx) for _ in g.canonical_etypes]
+        p_nd = [nd.array([], ctx=ctx) for _ in g.canonical_etypes]
     else:
         p_nd = []
         for etype in g.canonical_etypes:
             if prob in g.edges[etype].data:
                 prob_nd = F.to_dgl_nd(g.edges[etype].data[prob])
-                if prob_nd.ctx != nodes.ctx:
-                    raise ValueError(
-                        'context of seed node array and edges[%s].data[%s] are different'
-                        % (etype, prob))
             else:
-                prob_nd = nd.array([], ctx=nodes.ctx)
+                prob_nd = nd.array([], ctx=ctx)
             p_nd.append(prob_nd)
 
     # Actual random walk
 ...

@@ -198,9 +201,11 @@ def random_walk(g, nodes, *, metapath=None, length=None, prob=None, restart_prob
             restart_prob = F.to_dgl_nd(restart_prob)
             traces, eids, types = _CAPI_DGLSamplingRandomWalkWithStepwiseRestart(
                 gidx, nodes, metapath, p_nd, restart_prob)
-        else:
+        elif isinstance(restart_prob, float):
             traces, eids, types = _CAPI_DGLSamplingRandomWalkWithRestart(
                 gidx, nodes, metapath, p_nd, restart_prob)
+        else:
+            raise TypeError("restart_prob should be float or Tensor.")
 
     traces = F.from_dgl_nd(traces)
     types = F.from_dgl_nd(types)
 ...
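A minimal sketch of the documented device rules: `nodes` and a tensor `restart_prob` live on the graph's device, and the tensor's length matches `length`. The graph below is illustrative and assumes a CUDA build:

    import torch
    import dgl

    g = dgl.graph(([0, 1, 1, 2], [1, 2, 0, 0])).to('cuda')
    nodes = torch.tensor([0, 1], device='cuda')
    restart_prob = torch.full((3,), 0.1, device='cuda')  # len == length
    traces, types = dgl.sampling.random_walk(
        g, nodes, length=3, restart_prob=restart_prob)
    # anything other than a float or Tensor for restart_prob now raises TypeError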
python/dgl/sparse.py

@@ -365,11 +365,11 @@ def _gspmm_hetero(gidx, op, reduce_op, u_len, u_and_e_tuple):
     for l, arg_u_nd in enumerate(list_arg_u_nd):
         # TODO(Israt): l or src_id as index of lhs
         list_arg_u[l] = None if list_arg_u[l] is None else F.zerocopy_from_dgl_ndarray(arg_u_nd)
-        if expand_u and use_cmp:
+        if list_arg_u[l] is not None and expand_u and use_cmp:
             list_arg_u[l] = F.squeeze(list_arg_u[l], -1)
     for l, arg_e_nd in enumerate(list_arg_e_nd):
         list_arg_e[l] = None if list_arg_e[l] is None else F.zerocopy_from_dgl_ndarray(arg_e_nd)
-        if expand_e and use_cmp:
+        if list_arg_e[l] is not None and expand_e and use_cmp:
             list_arg_e[l] = F.squeeze(list_arg_e[l], -1)
     for l, arg_u_ntype_nd in enumerate(list_arg_u_ntype_nd):
         list_arg_u_ntype[l] = None if arg_u_ntype_nd is None \
 ...

@@ -562,7 +562,7 @@ def _gsddmm_hetero(gidx, op, lhs_len, lhs_target='u', rhs_target='v', lhs_and_rh
         e = out_list[l]
         e = F.tensor([]) if e is None else e
         if (expand_lhs or not use_lhs) and (expand_rhs or not use_rhs):
-            e = F.squeeze(v, -1)
+            e = F.squeeze(e, -1)
         out_list[l] = e
     out = tuple(out_list)
     return out
 ...
python/dgl/transforms/functional.py

@@ -40,6 +40,7 @@ from ..partition import metis_partition_assignment
 from ..partition import partition_graph_with_halo
 from ..partition import metis_partition
 from .. import subgraph
+from .. import function
 
 # TO BE DEPRECATED
 from .._deprecate.graph import DGLGraph as DGLGraphStale
 ...

@@ -1764,13 +1765,24 @@ def remove_nodes(g, nids, ntype=None, store_ids=False):
     g.remove_nodes(nids, ntype=ntype, store_ids=store_ids)
     return g
 
-def add_self_loop(g, etype=None):
+def add_self_loop(g, edge_feat_names=None, fill_data=1., etype=None):
     r"""Add self-loops for each node in the graph and return a new graph.
 
     Parameters
     ----------
     g : DGLGraph
         The graph.
+    edge_feat_names : list[str], optional
+        The names of the self-loop features to apply `fill_data`. If None, it will apply `fill_data`
+        to all self-loop features. Default: None.
+    fill_data : int, float or str, optional
+        The value to fill the self-loop features. Default: 1.
+
+        * If ``fill_data`` is ``int`` or ``float``, self-loop features will be directly given by
+          ``fill_data``.
+        * If ``fill_data`` is ``str``, self-loop features will be generated by aggregating the
+          features of the incoming edges of the corresponding nodes. The supported aggregations
+          are: ``'mean'``, ``'sum'``, ``'max'``, ``'min'``.
     etype : str or (str, str, str), optional
         The type names of the edges. The allowed type name formats are:
 ...

@@ -1792,7 +1804,6 @@ def add_self_loop(g, etype=None):
     * The function adds self-loops regardless of whether they already exist or not.
       If one wishes to have exactly one self-loop for every node,
       call :func:`remove_self_loop` before invoking :func:`add_self_loop`.
-    * Features of the new edges (self-loop edges) will be filled with zeros.
     * This function discards the batch information. Please use
       :func:`dgl.DGLGraph.set_batch_num_nodes`
       and :func:`dgl.DGLGraph.set_batch_num_edges` on the transformed graph
 ...

@@ -1808,7 +1819,7 @@ def add_self_loop(g, etype=None):
     >>> g = dgl.graph((torch.tensor([0, 0, 2]), torch.tensor([2, 1, 0])))
     >>> g.ndata['hv'] = torch.arange(3).float().reshape(-1, 1)
     >>> g.edata['he'] = torch.arange(3).float().reshape(-1, 1)
-    >>> g = dgl.add_self_loop(g)
+    >>> g = dgl.add_self_loop(g, fill_data='sum')
     >>> g
     Graph(num_nodes=3, num_edges=6,
           ndata_schemes={'hv': Scheme(shape=(1,), dtype=torch.float32)}
 ...

@@ -1817,8 +1828,8 @@ def add_self_loop(g, etype=None):
     tensor([[0.],
             [1.],
             [2.],
-            [0.],
-            [0.],
+            [2.],
+            [1.],
             [0.]])
 
     **Heterogeneous Graphs**
 ...

@@ -1831,17 +1842,49 @@ def add_self_loop(g, etype=None):
     >>> g = dgl.add_self_loop(g, etype='follows')
     >>> g
     Graph(num_nodes={'user': 3, 'game': 2},
           num_edges={('user', 'plays', 'game'): 2, ('user', 'follows', 'user'): 5},
           metagraph=[('user', 'user'), ('user', 'game')])
     """
     etype = g.to_canonical_etype(etype)
+    data = {}
+    reduce_funcs = {'sum': function.sum,
+                    'mean': function.mean,
+                    'max': function.max,
+                    'min': function.min}
+
+    if edge_feat_names is None:
+        edge_feat_names = g.edges[etype].data.keys()
+
     if etype[0] != etype[2]:
         raise DGLError(
             'add_self_loop does not support unidirectional bipartite graphs: {}.' \
             'Please make sure the types of head node and tail node are identical.' \
             ''.format(etype))
+
+    for feat_name in edge_feat_names:
+        if isinstance(fill_data, (int, float)):
+            dtype = g.edges[etype].data[feat_name].dtype
+            dshape = g.edges[etype].data[feat_name].shape
+            tmp_fill_data = F.copy_to(F.astype(F.tensor([fill_data]), dtype), g.device)
+            if len(dshape) > 1:
+                data[feat_name] = F.zeros((g.num_nodes(etype[0]), *dshape[1:]),
+                                          dtype, g.device) + tmp_fill_data
+            else:
+                data[feat_name] = F.zeros((g.num_nodes(etype[0]),), dtype, g.device) + tmp_fill_data
+        elif isinstance(fill_data, str):
+            if fill_data not in reduce_funcs.keys():
+                raise DGLError('Unsupported aggregation: {}'.format(fill_data))
+            reducer = reduce_funcs[fill_data]
+            with g.local_scope():
+                g.update_all(function.copy_e(feat_name, "h"), reducer('h', 'h'), etype=etype)
+                data[feat_name] = g.nodes[etype[0]].data['h']
+
     nodes = g.nodes(etype[0])
-    new_g = add_edges(g, nodes, nodes, etype=etype)
+    if len(data):
+        new_g = add_edges(g, nodes, nodes, data=data, etype=etype)
+    else:
+        new_g = add_edges(g, nodes, nodes, etype=etype)
 
     return new_g
 
 DGLHeteroGraph.add_self_loop = utils.alias_func(add_self_loop)
 ...

@@ -2872,6 +2915,8 @@ def sort_csr_by_tag(g, tag, tag_offset_name='_TAG_OFFSET', tag_type='node'):
     ``tag_type`` is ``node``.
 
     >>> import dgl
+    >>> import torch
+
     >>> g = dgl.graph(([0,0,0,0,0,1,1,1],[0,1,2,3,4,0,1,2]))
     >>> g.adjacency_matrix(scipy_fmt='csr').nonzero()
     (array([0, 0, 0, 0, 0, 1, 1, 1], dtype=int32),
 ...

@@ -2890,11 +2935,10 @@ def sort_csr_by_tag(g, tag, tag_offset_name='_TAG_OFFSET', tag_type='node'):
     ``tag_type`` is ``edge``.
 
-    >>> from dgl import backend as F
     >>> g = dgl.graph(([0,0,0,0,0,1,1,1],[0,1,2,3,4,0,1,2]))
     >>> g.edges()
     (tensor([0, 0, 0, 0, 0, 1, 1, 1]), tensor([0, 1, 2, 3, 4, 0, 1, 2]))
-    >>> tag = F.tensor([1, 1, 0, 2, 0, 1, 1, 0])
+    >>> tag = torch.tensor([1, 1, 0, 2, 0, 1, 1, 0])
     >>> g_sorted = dgl.sort_csr_by_tag(g, tag, tag_type='edge')
     >>> g_sorted.adj(scipy_fmt='csr').nonzero()
     (array([0, 0, 0, 0, 0, 1, 1, 1], dtype=int32), array([2, 4, 0, 1, 3, 2, 0, 1], dtype=int32))
 ...

@@ -2995,6 +3039,7 @@ def sort_csc_by_tag(g, tag, tag_offset_name='_TAG_OFFSET', tag_type='node'):
     ``tag_type`` is ``node``.
 
     >>> import dgl
+    >>> import torch
     >>> g = dgl.graph(([0,1,2,3,4,0,1,2],[0,0,0,0,0,1,1,1]))
     >>> g.adjacency_matrix(scipy_fmt='csr', transpose=True).nonzero()
     (array([0, 0, 0, 0, 0, 1, 1, 1], dtype=int32),
 ...

@@ -3013,9 +3058,8 @@ def sort_csc_by_tag(g, tag, tag_offset_name='_TAG_OFFSET', tag_type='node'):
     ``tag_type`` is ``edge``.
 
-    >>> from dgl import backend as F
     >>> g = dgl.graph(([0,1,2,3,4,0,1,2],[0,0,0,0,0,1,1,1]))
-    >>> tag = F.tensor([1, 1, 0, 2, 0, 1, 1, 0])
+    >>> tag = torch.tensor([1, 1, 0, 2, 0, 1, 1, 0])
     >>> g_sorted = dgl.sort_csc_by_tag(g, tag, tag_type='edge')
     >>> g_sorted.adj(scipy_fmt='csr', transpose=True).nonzero()
     (array([0, 0, 0, 0, 0, 1, 1, 1], dtype=int32), array([2, 4, 0, 1, 3, 2, 0, 1], dtype=int32))
 ...
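To make the new `fill_data` aggregation concrete, here is a standalone sketch of what a string reducer computes: the self-loop feature of each node is the reduction of its incoming edge features, produced by the same copy_e/reducer message pass used in the new code path above. The graph and feature name are the ones from the docstring example:

    import torch
    import dgl
    import dgl.function as fn

    g = dgl.graph(([0, 0, 2], [2, 1, 0]))
    g.edata['he'] = torch.arange(3).float().reshape(-1, 1)

    # what add_self_loop(g, fill_data='mean') assigns to the new self-loops:
    with g.local_scope():
        g.update_all(fn.copy_e('he', 'h'), fn.mean('h', 'h'))
        self_loop_feat = g.ndata['h']  # nodes without in-edges get 0

    new_g = dgl.add_self_loop(g, fill_data='mean')
    # self-loops are appended in node order, so the last num_nodes edges
    # carry the aggregated values
    assert torch.equal(new_g.edata['he'][-g.num_nodes():], self_loop_feat)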
python/dgl/transforms/module.py

@@ -415,6 +415,17 @@ class AddSelfLoop(BaseTransform):
         If False, it will first remove self-loops to prevent duplicate self-loops.
     new_etypes : bool, optional
         If True, it will add an edge type 'self' per node type, which holds self-loops.
+    edge_feat_names : list[str], optional
+        The names of the self-loop features to apply `fill_data`. If None, it will apply `fill_data`
+        to all self-loop features. Default: None.
+    fill_data : int, float or str, optional
+        The value to fill the self-loop features. Default: 1.
+
+        * If ``fill_data`` is ``int`` or ``float``, self-loop features will be directly given by
+          ``fill_data``.
+        * If ``fill_data`` is ``str``, self-loop features will be generated by aggregating the
+          features of the incoming edges of the corresponding nodes. The supported aggregations
+          are: ``'mean'``, ``'sum'``, ``'max'``, ``'min'``.
 
     Example
     -------
 ...

@@ -424,23 +435,39 @@ class AddSelfLoop(BaseTransform):
     Case1: Add self-loops for a homogeneous graph
 
-    >>> transform = AddSelfLoop()
-    >>> g = dgl.graph(([1, 1], [1, 2]))
+    >>> transform = AddSelfLoop(fill_data='sum')
+    >>> g = dgl.graph(([0, 0, 2], [2, 1, 0]))
+    >>> g.edata['he'] = torch.arange(3).float().reshape(-1, 1)
     >>> new_g = transform(g)
     >>> print(new_g.edges())
     (tensor([1, 0, 1, 2]), tensor([2, 0, 1, 2]))
+    >>> print(new_g.edata['he'])
+    tensor([[0.],
+            [1.],
+            [2.],
+            [2.],
+            [1.],
+            [0.]])
 
     Case2: Add self-loops for a heterogeneous graph
 
+    >>> transform = AddSelfLoop(fill_data='sum')
     >>> g = dgl.heterograph({
-    ...     ('user', 'plays', 'game'): ([0], [1]),
-    ...     ('user', 'follows', 'user'): ([1], [2])
-    ... })
+    ...     ('user', 'follows', 'user'): (torch.tensor([1, 2]),
+    ...                                   torch.tensor([0, 1])),
+    ...     ('user', 'plays', 'game'): (torch.tensor([0, 1]),
+    ...                                 torch.tensor([0, 1]))})
+    >>> g.edata['feat'] = {('user', 'follows', 'user'): torch.randn(2, 5),
+    ...                    ('user', 'plays', 'game'): torch.randn(2, 5)}
+    >>> g.edata['feat1'] = {('user', 'follows', 'user'): torch.randn(2, 15),
+    ...                     ('user', 'plays', 'game'): torch.randn(2, 15)}
     >>> new_g = transform(g)
     >>> print(new_g.edges(etype='plays'))
-    (tensor([0]), tensor([1]))
+    (tensor([0, 1]), tensor([0, 1]))
     >>> print(new_g.edges(etype='follows'))
-    (tensor([1, 0, 1, 2]), tensor([2, 0, 1, 2]))
+    (tensor([1, 2]), tensor([0, 1]))
+    >>> print(new_g.edata['feat'][('user', 'follows', 'user')].shape)
+    torch.Size([5, 5])
 
     Case3: Add self-etypes for a heterogeneous graph
 ...

@@ -451,9 +478,12 @@ class AddSelfLoop(BaseTransform):
     >>> print(new_g.edges(etype=('game', 'self', 'game')))
     (tensor([0, 1]), tensor([0, 1]))
     """
-    def __init__(self, allow_duplicate=False, new_etypes=False):
+    def __init__(self, allow_duplicate=False, new_etypes=False, edge_feat_names=None,
+                 fill_data=1.):
         self.allow_duplicate = allow_duplicate
         self.new_etypes = new_etypes
+        self.edge_feat_names = edge_feat_names
+        self.fill_data = fill_data
 
     def transform_etype(self, c_etype, g):
         r"""
 ...

@@ -480,7 +510,8 @@ class AddSelfLoop(BaseTransform):
         if not self.allow_duplicate:
             g = functional.remove_self_loop(g, etype=c_etype)
-        return functional.add_self_loop(g, etype=c_etype)
+        return functional.add_self_loop(g, edge_feat_names=self.edge_feat_names,
+                                        fill_data=self.fill_data, etype=c_etype)
 
     def __call__(self, g):
         for c_etype in g.canonical_etypes:
 ...

@@ -501,6 +532,7 @@ class AddSelfLoop(BaseTransform):
             data_dict[c_etype] = g.edges(etype=c_etype)
 
         g = update_graph_structure(g, data_dict)
         return g
 
 class RemoveSelfLoop(BaseTransform):
 ...
src/array/array.cc

@@ -549,11 +549,13 @@ COOMatrix CSRRowWiseSampling(
     CSRMatrix mat, IdArray rows, int64_t num_samples, FloatArray prob, bool replace) {
   COOMatrix ret;
   if (IsNullArray(prob)) {
-    ATEN_CSR_SWITCH_CUDA_UVA(mat, rows, XPU, IdType, "CSRRowWiseSampling", {
+    ATEN_CSR_SWITCH_CUDA_UVA(mat, rows, XPU, IdType, "CSRRowWiseSamplingUniform", {
       ret = impl::CSRRowWiseSamplingUniform<XPU, IdType>(mat, rows, num_samples, replace);
     });
   } else {
-    ATEN_CSR_SWITCH(mat, XPU, IdType, "CSRRowWiseSampling", {
+    // prob is pinned and rows on GPU is valid
+    CHECK_VALID_CONTEXT(prob, rows);
+    ATEN_CSR_SWITCH_CUDA_UVA(mat, rows, XPU, IdType, "CSRRowWiseSampling", {
       ATEN_FLOAT_TYPE_SWITCH(prob->dtype, FloatType, "probability", {
         ret = impl::CSRRowWiseSampling<XPU, IdType, FloatType>(
             mat, rows, num_samples, prob, replace);
 ...
src/array/cuda/dgl_cub.cuh

@@ -7,13 +7,11 @@
 #ifndef DGL_ARRAY_CUDA_DGL_CUB_CUH_
 #define DGL_ARRAY_CUDA_DGL_CUB_CUH_
 
-// include cub in a safe manner
-#define CUB_NS_PREFIX namespace dgl {
-#define CUB_NS_POSTFIX }
-#define CUB_NS_QUALIFIER ::dgl::cub
+// This should be defined in CMakeLists.txt
+#ifndef THRUST_CUB_WRAPPED_NAMESPACE
+static_assert(false,
+              "THRUST_CUB_WRAPPED_NAMESPACE must be defined for DGL.");
+#endif
 
 #include "cub/cub.cuh"
 
-#undef CUB_NS_QUALIFIER
-#undef CUB_NS_POSTFIX
-#undef CUB_NS_PREFIX
 
 #endif
src/array/cuda/rowwise_sampling.cu

 /*!
  * Copyright (c) 2021 by Contributors
  * \file array/cuda/rowwise_sampling.cu
- * \brief rowwise sampling
+ * \brief uniform rowwise sampling
  */
 #include <dgl/random.h>
 ...

@@ -13,6 +13,7 @@
 #include "../../array/cuda/atomic.cuh"
 #include "../../runtime/cuda/cuda_common.h"
 
 using namespace dgl::aten::cuda;
 
 namespace dgl {
 ...

@@ -21,7 +22,7 @@ namespace impl {
 namespace {
 
-constexpr int CTA_SIZE = 128;
+constexpr int BLOCK_SIZE = 128;
 
 /**
  * @brief Compute the size of each row in the sampled CSR, without replacement.
 ...

@@ -41,14 +42,14 @@ __global__ void _CSRRowWiseSampleDegreeKernel(
     const IdType* const in_rows,
     const IdType* const in_ptr,
     IdType* const out_deg) {
   const int tIdx = threadIdx.x + blockIdx.x * blockDim.x;
   if (tIdx < num_rows) {
     const int in_row = in_rows[tIdx];
     const int out_row = tIdx;
     out_deg[out_row] = min(static_cast<IdType>(num_picks), in_ptr[in_row + 1] - in_ptr[in_row]);
 
     if (out_row == num_rows - 1) {
       // make the prefixsum work
       out_deg[num_rows] = 0;
     }
 ...

@@ -73,19 +74,19 @@ __global__ void _CSRRowWiseSampleDegreeReplaceKernel(
     const IdType* const in_rows,
     const IdType* const in_ptr,
     IdType* const out_deg) {
   const int tIdx = threadIdx.x + blockIdx.x * blockDim.x;
   if (tIdx < num_rows) {
     const int64_t in_row = in_rows[tIdx];
     const int64_t out_row = tIdx;
     if (in_ptr[in_row + 1] - in_ptr[in_row] == 0) {
       out_deg[out_row] = 0;
     } else {
       out_deg[out_row] = static_cast<IdType>(num_picks);
     }
     if (out_row == num_rows - 1) {
       // make the prefixsum work
       out_deg[num_rows] = 0;
     }
 ...

@@ -93,11 +94,10 @@ __global__ void _CSRRowWiseSampleDegreeReplaceKernel(
 }
 
 /**
- * @brief Perform row-wise sampling on a CSR matrix, and generate a COO matrix,
- * without replacement.
+ * @brief Perform row-wise uniform sampling on a CSR matrix,
+ * and generate a COO matrix, without replacement.
  *
  * @tparam IdType The ID type used for matrices.
- * @tparam BLOCK_CTAS The number of rows each thread block runs in parallel.
  * @tparam TILE_SIZE The number of rows covered by each threadblock.
  * @param rand_seed The random seed to use.
  * @param num_picks The number of non-zeros to pick per row.
 ...

@@ -111,8 +111,8 @@ __global__ void _CSRRowWiseSampleDegreeReplaceKernel(
  * @param out_cols The columns of the output COO (output).
  * @param out_idxs The data array of the output COO (output).
  */
-template <typename IdType, int BLOCK_CTAS, int TILE_SIZE>
-__global__ void _CSRRowWiseSampleKernel(
+template <typename IdType, int TILE_SIZE>
+__global__ void _CSRRowWiseSampleUniformKernel(
     const uint64_t rand_seed,
     const int64_t num_picks,
     const int64_t num_rows,
 ...

@@ -125,68 +125,62 @@ __global__ void _CSRRowWiseSampleKernel(
     IdType* const out_cols,
     IdType* const out_idxs) {
   // we assign one warp per row
-  assert(blockDim.x == CTA_SIZE);
+  assert(blockDim.x == BLOCK_SIZE);
 
-  int64_t out_row = blockIdx.x * TILE_SIZE + threadIdx.y;
+  int64_t out_row = blockIdx.x * TILE_SIZE;
   const int64_t last_row = min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
 
   curandStatePhilox4_32_10_t rng;
-  curand_init((rand_seed * gridDim.x + blockIdx.x) * blockDim.y + threadIdx.y,
-              threadIdx.x, 0, &rng);
+  curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
 
   while (out_row < last_row) {
     const int64_t row = in_rows[out_row];
     const int64_t in_row_start = in_ptr[row];
     const int64_t deg = in_ptr[row + 1] - in_row_start;
     const int64_t out_row_start = out_ptr[out_row];
 
     if (deg <= num_picks) {
-      // just copy row
-      for (int idx = threadIdx.x; idx < deg; idx += CTA_SIZE) {
+      // just copy row when there is not enough nodes to sample.
+      for (int idx = threadIdx.x; idx < deg; idx += BLOCK_SIZE) {
         const IdType in_idx = in_row_start + idx;
         out_rows[out_row_start + idx] = row;
         out_cols[out_row_start + idx] = in_index[in_idx];
         out_idxs[out_row_start + idx] = data ? data[in_idx] : in_idx;
       }
     } else {
       // generate permutation list via reservoir algorithm
-      for (int idx = threadIdx.x; idx < num_picks; idx += CTA_SIZE) {
+      for (int idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
         out_idxs[out_row_start + idx] = idx;
       }
       __syncthreads();
 
-      for (int idx = num_picks + threadIdx.x; idx < deg; idx += CTA_SIZE) {
+      for (int idx = num_picks + threadIdx.x; idx < deg; idx += BLOCK_SIZE) {
         const int num = curand(&rng) % (idx + 1);
         if (num < num_picks) {
           // use max so as to achieve the replacement order the serial
           // algorithm would have
           AtomicMax(out_idxs + out_row_start + num, idx);
         }
       }
       __syncthreads();
 
       // copy permutation over
-      for (int idx = threadIdx.x; idx < num_picks; idx += CTA_SIZE) {
+      for (int idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
         const IdType perm_idx = out_idxs[out_row_start + idx] + in_row_start;
         out_rows[out_row_start + idx] = row;
         out_cols[out_row_start + idx] = in_index[perm_idx];
-        out_idxs[out_row_start + idx] = data ? data[perm_idx] : perm_idx;
+        if (data) {
+          out_idxs[out_row_start + idx] = data[perm_idx];
+        }
       }
     }
-    out_row += BLOCK_CTAS;
+    out_row += 1;
   }
 }
 
 /**
- * @brief Perform row-wise sampling on a CSR matrix, and generate a COO matrix,
- * with replacement.
+ * @brief Perform row-wise uniform sampling on a CSR matrix,
+ * and generate a COO matrix, with replacement.
  *
  * @tparam IdType The ID type used for matrices.
- * @tparam BLOCK_CTAS The number of rows each thread block runs in parallel.
  * @tparam TILE_SIZE The number of rows covered by each threadblock.
  * @param rand_seed The random seed to use.
 * @param num_picks The number of non-zeros to pick per row.
 ...

@@ -200,8 +194,8 @@ __global__ void _CSRRowWiseSampleKernel(
  * @param out_cols The columns of the output COO (output).
  * @param out_idxs The data array of the output COO (output).
  */
-template <typename IdType, int BLOCK_CTAS, int TILE_SIZE>
-__global__ void _CSRRowWiseSampleReplaceKernel(
+template <typename IdType, int TILE_SIZE>
+__global__ void _CSRRowWiseSampleUniformReplaceKernel(
     const uint64_t rand_seed,
     const int64_t num_picks,
     const int64_t num_rows,
 ...

@@ -214,39 +208,37 @@ __global__ void _CSRRowWiseSampleReplaceKernel(
     IdType* const out_cols,
     IdType* const out_idxs) {
   // we assign one warp per row
-  assert(blockDim.x == CTA_SIZE);
+  assert(blockDim.x == BLOCK_SIZE);
 
-  int64_t out_row = blockIdx.x * TILE_SIZE + threadIdx.y;
+  int64_t out_row = blockIdx.x * TILE_SIZE;
   const int64_t last_row = min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
 
   curandStatePhilox4_32_10_t rng;
-  curand_init((rand_seed * gridDim.x + blockIdx.x) * blockDim.y + threadIdx.y,
-              threadIdx.x, 0, &rng);
+  curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
 
   while (out_row < last_row) {
     const int64_t row = in_rows[out_row];
     const int64_t in_row_start = in_ptr[row];
     const int64_t out_row_start = out_ptr[out_row];
     const int64_t deg = in_ptr[row + 1] - in_row_start;
 
     if (deg > 0) {
       // each thread then blindly copies in rows only if deg > 0.
-      for (int idx = threadIdx.x; idx < num_picks; idx += CTA_SIZE) {
+      for (int idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
         const int64_t edge = curand(&rng) % deg;
         const int64_t out_idx = out_row_start + idx;
         out_rows[out_idx] = row;
         out_cols[out_idx] = in_index[in_row_start + edge];
         out_idxs[out_idx] = data ? data[in_row_start + edge] : in_row_start + edge;
       }
     }
-    out_row += BLOCK_CTAS;
+    out_row += 1;
   }
 }
 
 }  // namespace
 
-/////////////////////////////// CSR ///////////////////////////////
+///////////////////////////// CSR sampling //////////////////////////
 
 template <DLDeviceType XPU, typename IdType>
 COOMatrix CSRRowWiseSamplingUniform(CSRMatrix mat,
 ...

@@ -277,22 +269,26 @@ COOMatrix CSRRowWiseSamplingUniform(CSRMatrix mat,
   // compute degree
   IdType* out_deg = static_cast<IdType*>(
       device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
   if (replace) {
     const dim3 block(512);
     const dim3 grid((num_rows + block.x - 1) / block.x);
-    _CSRRowWiseSampleDegreeReplaceKernel<<<grid, block, 0, stream>>>(
-        num_picks, num_rows, slice_rows, in_ptr, out_deg);
+    CUDA_KERNEL_CALL(_CSRRowWiseSampleDegreeReplaceKernel,
+        grid, block, 0, stream,
+        num_picks, num_rows, slice_rows, in_ptr, out_deg);
   } else {
     const dim3 block(512);
     const dim3 grid((num_rows + block.x - 1) / block.x);
-    _CSRRowWiseSampleDegreeKernel<<<grid, block, 0, stream>>>(
-        num_picks, num_rows, slice_rows, in_ptr, out_deg);
+    CUDA_KERNEL_CALL(_CSRRowWiseSampleDegreeKernel,
+        grid, block, 0, stream,
+        num_picks, num_rows, slice_rows, in_ptr, out_deg);
   }
 
   // fill out_ptr
   IdType* out_ptr = static_cast<IdType*>(
       device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
   size_t prefix_temp_size = 0;
   CUDA_CALL(cub::DeviceScan::ExclusiveSum(nullptr, prefix_temp_size,
       out_deg,
 ...

@@ -314,24 +310,25 @@ COOMatrix CSRRowWiseSamplingUniform(CSRMatrix mat,
   // TODO(dlasalle): use pinned memory to overlap with the actual sampling, and wait on
   // a cudaevent
   IdType new_len;
   device->CopyDataFromTo(out_ptr, num_rows * sizeof(new_len), &new_len, 0,
       sizeof(new_len),
       ctx,
       DGLContext{kDLCPU, 0},
       mat.indptr->dtype,
       stream);
   CUDA_CALL(cudaEventRecord(copyEvent, stream));
 
   const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
 
   // select edges
+  // the number of rows each thread block will cover
+  constexpr int TILE_SIZE = 128 / BLOCK_SIZE;
   if (replace) {  // with replacement
-    // the number of rows each thread block will cover
-    constexpr int BLOCK_CTAS = 128 / CTA_SIZE;
-    constexpr int TILE_SIZE = BLOCK_CTAS;
-    const dim3 block(CTA_SIZE, BLOCK_CTAS);
+    const dim3 block(BLOCK_SIZE);
     const dim3 grid((num_rows + TILE_SIZE - 1) / TILE_SIZE);
-    _CSRRowWiseSampleReplaceKernel<IdType, BLOCK_CTAS, TILE_SIZE><<<grid, block, 0, stream>>>(
+    CUDA_KERNEL_CALL(
+        (_CSRRowWiseSampleUniformReplaceKernel<IdType, TILE_SIZE>),
+        grid, block, 0, stream,
         random_seed,
         num_picks,
         num_rows,
 ...

@@ -343,13 +340,12 @@ COOMatrix CSRRowWiseSamplingUniform(CSRMatrix mat,
         out_rows,
         out_cols,
         out_idxs);
   } else {  // without replacement
-    constexpr int BLOCK_CTAS = 128 / CTA_SIZE;
-    // the number of rows each thread block will cover
-    constexpr int TILE_SIZE = BLOCK_CTAS;
-    const dim3 block(CTA_SIZE, BLOCK_CTAS);
+    const dim3 block(BLOCK_SIZE);
     const dim3 grid((num_rows + TILE_SIZE - 1) / TILE_SIZE);
-    _CSRRowWiseSampleKernel<IdType, BLOCK_CTAS, TILE_SIZE><<<grid, block, 0, stream>>>(
+    CUDA_KERNEL_CALL(
+        (_CSRRowWiseSampleUniformKernel<IdType, TILE_SIZE>),
+        grid, block, 0, stream,
         random_seed,
         num_picks,
         num_rows,
 ...
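For readers following the kernel logic, here is a serial NumPy sketch of the reservoir technique that `_CSRRowWiseSampleUniformKernel` parallelizes: seed the output with the first num_picks indices, then give each later index a num_picks/(idx+1) chance to displace a random slot. The kernel resolves concurrent displacements with AtomicMax so the parallel result matches this serial order; the function name below is hypothetical.

    import numpy as np

    def reservoir_permutation(deg, num_picks, seed=0):
        """Pick num_picks of range(deg) without replacement (Algorithm R)."""
        rng = np.random.default_rng(seed)
        perm = np.arange(num_picks)
        for idx in range(num_picks, deg):
            num = rng.integers(0, idx + 1)   # curand(&rng) % (idx + 1)
            if num < num_picks:
                perm[num] = idx              # AtomicMax in the CUDA kernel
        return perm

    print(reservoir_permutation(deg=10, num_picks=4))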
src/array/cuda/rowwise_sampling_prob.cu
0 → 100644
View file @
e9b624fe
/*!
* Copyright (c) 2022 by Contributors
* \file array/cuda/rowwise_sampling_prob.cu
* \brief weighted rowwise sampling. The degree computing kernels and
* host-side functions are partially borrowed from the uniform rowwise
* sampling code rowwise_sampling.cu.
* \author pengqirong (OPPO), dlasalle and Xin from Nvidia.
*/
#include <dgl/random.h>
#include <dgl/runtime/device_api.h>
#include <curand_kernel.h>
#include <numeric>
#include "./dgl_cub.cuh"
#include "../../array/cuda/atomic.cuh"
#include "../../runtime/cuda/cuda_common.h"
// require CUB 1.17 to use DeviceSegmentedSort
static_assert
(
CUB_VERSION
>=
101700
,
"Require CUB >= 1.17 to use DeviceSegmentedSort"
);
using
namespace
dgl
::
aten
::
cuda
;
namespace
dgl
{
namespace
aten
{
namespace
impl
{
namespace
{
constexpr
int
BLOCK_SIZE
=
128
;
/**
 * @brief Compute the size of each row in the sampled CSR, without replacement.
 *   temp_deg is calculated for rows with deg > num_picks. For these rows, we
 *   will calculate their A-Res values and sort them to get top-num_picks.
 *
 * @tparam IdType The type of node and edge indexes.
 * @param num_picks The number of non-zero entries to pick per row.
 * @param num_rows The number of rows to pick.
 * @param in_rows The set of rows to pick.
 * @param in_ptr The index where each row's edges start.
 * @param out_deg The size of each row in the sampled matrix, as indexed by `in_rows` (output).
 * @param temp_deg The size of each row in the input matrix, as indexed by `in_rows` (output).
 */
template <typename IdType>
__global__ void _CSRRowWiseSampleDegreeKernel(
    const int64_t num_picks,
    const int64_t num_rows,
    const IdType* const in_rows,
    const IdType* const in_ptr,
    IdType* const out_deg,
    IdType* const temp_deg) {
  const int64_t tIdx = threadIdx.x + blockIdx.x * blockDim.x;

  if (tIdx < num_rows) {
    const int64_t in_row = in_rows[tIdx];
    const int64_t out_row = tIdx;
    const IdType deg = in_ptr[in_row + 1] - in_ptr[in_row];
    // temp_deg is used to generate ares_ptr
    temp_deg[out_row] = deg > static_cast<IdType>(num_picks) ? deg : 0;
    out_deg[out_row] = min(static_cast<IdType>(num_picks), deg);

    if (out_row == num_rows - 1) {
      // make the prefixsum work
      out_deg[num_rows] = 0;
      temp_deg[num_rows] = 0;
    }
  }
}
/**
 * @brief Compute the size of each row in the sampled CSR, with replacement.
 *   We need the actual in degree of each row to store CDF values.
 *
 * @tparam IdType The type of node and edge indexes.
 * @param num_picks The number of non-zero entries to pick per row.
 * @param num_rows The number of rows to pick.
 * @param in_rows The set of rows to pick.
 * @param in_ptr The index where each row's edges start.
 * @param out_deg The size of each row in the sampled matrix, as indexed by `in_rows` (output).
 * @param temp_deg The size of each row in the input matrix, as indexed by `in_rows` (output).
 */
template <typename IdType>
__global__ void _CSRRowWiseSampleDegreeReplaceKernel(
    const int64_t num_picks,
    const int64_t num_rows,
    const IdType* const in_rows,
    const IdType* const in_ptr,
    IdType* const out_deg,
    IdType* const temp_deg) {
  const int64_t tIdx = threadIdx.x + blockIdx.x * blockDim.x;

  if (tIdx < num_rows) {
    const int64_t in_row = in_rows[tIdx];
    const int64_t out_row = tIdx;
    const IdType deg = in_ptr[in_row + 1] - in_ptr[in_row];
    temp_deg[out_row] = deg;
    out_deg[out_row] = deg == 0 ? 0 : static_cast<IdType>(num_picks);

    if (out_row == num_rows - 1) {
      // make the prefixsum work
      out_deg[num_rows] = 0;
      temp_deg[num_rows] = 0;
    }
  }
}
/**
 * @brief Equivalent to numpy expression: array[idx[off:off + len]]
 *
 * @tparam IdType The ID type used for indices.
 * @tparam FloatType The float type used for array values.
 * @param array The array to be selected.
 * @param idx_data The index mapping array.
 * @param idx The index of value to be selected.
 * @param offset The offset to start.
 * @param out The selected value (output).
 */
template <typename IdType, typename FloatType>
__device__ void _DoubleSlice(
    const FloatType* const array,
    const IdType* const idx_data,
    const IdType idx,
    const IdType offset,
    FloatType* const out) {
  if (idx_data) {
    *out = array[idx_data[offset + idx]];
  } else {
    *out = array[offset + idx];
  }
}
/**
 * @brief Compute A-Res value. A-Res value needs to be calculated only if deg
 *   is greater than num_picks in weighted rowwise sampling without replacement.
 *
 * @tparam IdType The ID type used for matrices.
 * @tparam FloatType The Float type used for matrices.
 * @tparam TILE_SIZE The number of rows covered by each threadblock.
 * @param rand_seed The random seed to use.
 * @param num_picks The number of non-zeros to pick per row.
 * @param num_rows The number of rows to pick.
 * @param in_rows The set of rows to pick.
 * @param in_ptr The indptr array of the input CSR.
 * @param data The data array of the input CSR.
 * @param prob The probability array of the input CSR.
 * @param ares_ptr The offset to write each row to in the A-res array.
 * @param ares_idxs The A-Res value corresponding index array, the index of input CSR (output).
 * @param ares The A-Res value array (output).
 * @author pengqirong (OPPO)
 */
template <typename IdType, typename FloatType, int TILE_SIZE>
__global__ void _CSRAResValueKernel(
    const uint64_t rand_seed,
    const int64_t num_picks,
    const int64_t num_rows,
    const IdType* const in_rows,
    const IdType* const in_ptr,
    const IdType* const data,
    const FloatType* const prob,
    const IdType* const ares_ptr,
    IdType* const ares_idxs,
    FloatType* const ares) {
  int64_t out_row = blockIdx.x * TILE_SIZE;
  const int64_t last_row =
      min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);

  curandStatePhilox4_32_10_t rng;
  curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);

  while (out_row < last_row) {
    const int64_t row = in_rows[out_row];
    const int64_t in_row_start = in_ptr[row];
    const int64_t deg = in_ptr[row + 1] - in_row_start;
    // A-Res value needs to be calculated only if deg is greater than num_picks
    // in weighted rowwise sampling without replacement
    if (deg > num_picks) {
      const int64_t ares_row_start = ares_ptr[out_row];
      for (int64_t idx = threadIdx.x; idx < deg; idx += BLOCK_SIZE) {
        const int64_t in_idx = in_row_start + idx;
        const int64_t ares_idx = ares_row_start + idx;
        FloatType item_prob;
        _DoubleSlice<IdType, FloatType>(prob, data, idx, in_row_start, &item_prob);
        // compute A-Res value
        ares[ares_idx] = static_cast<FloatType>(
            __powf(curand_uniform(&rng), 1.0f / item_prob));
        ares_idxs[ares_idx] = static_cast<IdType>(in_idx);
      }
    }
    out_row += 1;
  }
}
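For intuition, the scoring above is the Efraimidis–Spirakis A-Res scheme: each edge with weight w draws u ~ U(0, 1) and gets key u^(1/w), and keeping the num_picks largest keys yields a weighted sample without replacement. A minimal host-side sketch of the same idea (function name and types are illustrative, not part of DGL):

#include <algorithm>
#include <cmath>
#include <random>
#include <vector>

// Host-side reference of the A-Res scoring in _CSRAResValueKernel: item i
// with weight w[i] gets key u^(1 / w[i]), u ~ U(0, 1); the k largest keys
// form a weighted sample without replacement.
std::vector<int> ARes(const std::vector<double>& w, int k, std::mt19937* rng) {
  std::uniform_real_distribution<double> uni(0.0, 1.0);
  std::vector<int> idx(w.size());
  std::vector<double> key(w.size());
  for (size_t i = 0; i < w.size(); ++i) {
    idx[i] = static_cast<int>(i);
    key[i] = std::pow(uni(*rng), 1.0 / w[i]);  // the A-Res key
  }
  // sort indices by key, descending, and keep the top-k; the (ares,
  // ares_idxs) pair plus DeviceSegmentedSort plays the same role on device
  std::sort(idx.begin(), idx.end(),
            [&key](int a, int b) { return key[a] > key[b]; });
  if (idx.size() > static_cast<size_t>(k)) idx.resize(k);
  return idx;
}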
/**
 * @brief Perform weighted row-wise sampling on a CSR matrix, and generate a
 *   COO matrix, without replacement. After sorting, we select top-num_picks items.
 *
 * @tparam IdType The ID type used for matrices.
 * @tparam FloatType The Float type used for matrices.
 * @tparam TILE_SIZE The number of rows covered by each threadblock.
 * @param num_picks The number of non-zeros to pick per row.
 * @param num_rows The number of rows to pick.
 * @param in_rows The set of rows to pick.
 * @param in_ptr The indptr array of the input CSR.
 * @param in_cols The columns array of the input CSR.
 * @param data The data array of the input CSR.
 * @param out_ptr The offset to write each row to in the output COO.
 * @param ares_ptr The offset to write each row to in the ares array.
 * @param sort_ares_idxs The sorted A-Res value corresponding index array, the index of input CSR.
 * @param out_rows The rows of the output COO (output).
 * @param out_cols The columns of the output COO (output).
 * @param out_idxs The data array of the output COO (output).
 * @author pengqirong (OPPO)
 */
template <typename IdType, typename FloatType, int TILE_SIZE>
__global__ void _CSRRowWiseSampleKernel(
    const int64_t num_picks,
    const int64_t num_rows,
    const IdType* const in_rows,
    const IdType* const in_ptr,
    const IdType* const in_cols,
    const IdType* const data,
    const IdType* const out_ptr,
    const IdType* const ares_ptr,
    const IdType* const sort_ares_idxs,
    IdType* const out_rows,
    IdType* const out_cols,
    IdType* const out_idxs) {
  // we assign one warp per row
  assert(blockDim.x == BLOCK_SIZE);

  int64_t out_row = blockIdx.x * TILE_SIZE;
  const int64_t last_row =
      min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);

  while (out_row < last_row) {
    const int64_t row = in_rows[out_row];
    const int64_t in_row_start = in_ptr[row];
    const int64_t out_row_start = out_ptr[out_row];
    const int64_t deg = in_ptr[row + 1] - in_row_start;

    if (deg > num_picks) {
      const int64_t ares_row_start = ares_ptr[out_row];
      for (int64_t idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
        // get in and out index, the in_idx is one of top num_picks A-Res value
        // corresponding index in input CSR.
        const int64_t out_idx = out_row_start + idx;
        const int64_t ares_idx = ares_row_start + idx;
        const int64_t in_idx = sort_ares_idxs[ares_idx];
        // copy permutation over
        out_rows[out_idx] = static_cast<IdType>(row);
        out_cols[out_idx] = in_cols[in_idx];
        out_idxs[out_idx] = static_cast<IdType>(data ? data[in_idx] : in_idx);
      }
    } else {
      for (int64_t idx = threadIdx.x; idx < deg; idx += BLOCK_SIZE) {
        // get in and out index
        const int64_t out_idx = out_row_start + idx;
        const int64_t in_idx = in_row_start + idx;
        // copy permutation over
        out_rows[out_idx] = static_cast<IdType>(row);
        out_cols[out_idx] = in_cols[in_idx];
        out_idxs[out_idx] = static_cast<IdType>(data ? data[in_idx] : in_idx);
      }
    }
    out_row += 1;
  }
}
// A stateful callback functor that maintains a running prefix to be applied
// during consecutive scan operations.
template <typename FloatType>
struct BlockPrefixCallbackOp {
  // Running prefix
  FloatType running_total;
  // Constructor
  __device__ BlockPrefixCallbackOp(FloatType running_total)
      : running_total(running_total) {}

  // Callback operator to be entered by the first warp of threads in the block.
  // Thread-0 is responsible for returning a value for seeding the block-wide scan.
  __device__ FloatType operator()(FloatType block_aggregate) {
    FloatType old_prefix = running_total;
    running_total += block_aggregate;
    return old_prefix;
  }
};
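Sequentially, BlockScan plus this callback is nothing more than a prefix sum computed chunk by chunk, where each chunk is seeded with the running total of all previous chunks. A host-side sketch of that equivalence (illustrative only):

#include <algorithm>
#include <vector>

// Host-side analogue of BlockScan + BlockPrefixCallbackOp: scan `data` in
// fixed-size chunks, carrying the running total across chunk boundaries,
// exactly what the callback's running_total does between tiles.
void ChunkedInclusiveScan(std::vector<float>* data, size_t chunk) {
  float running_total = 0.0f;  // the callback's carried state
  for (size_t base = 0; base < data->size(); base += chunk) {
    size_t end = std::min(base + chunk, data->size());
    for (size_t i = base; i < end; ++i) {
      running_total += (*data)[i];
      (*data)[i] = running_total;  // inclusive prefix sum
    }
  }
}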
/**
 * @brief Perform weighted row-wise sampling on a CSR matrix, and generate a
 *   COO matrix, with replacement. We store the CDF (unnormalized) of all
 *   neighbors of a row in global memory and use binary search to find inverse
 *   indices as selected items.
 *
 * @tparam IdType The ID type used for matrices.
 * @tparam FloatType The Float type used for matrices.
 * @tparam TILE_SIZE The number of rows covered by each threadblock.
 * @param rand_seed The random seed to use.
 * @param num_picks The number of non-zeros to pick per row.
 * @param num_rows The number of rows to pick.
 * @param in_rows The set of rows to pick.
 * @param in_ptr The indptr array of the input CSR.
 * @param in_cols The columns array of the input CSR.
 * @param data The data array of the input CSR.
 * @param prob The probability array of the input CSR.
 * @param out_ptr The offset to write each row to in the output COO.
 * @param cdf_ptr The offset of each cdf segment.
 * @param cdf The global buffer to store cdf segments.
 * @param out_rows The rows of the output COO (output).
 * @param out_cols The columns of the output COO (output).
 * @param out_idxs The data array of the output COO (output).
 * @author pengqirong (OPPO)
 */
template <typename IdType, typename FloatType, int TILE_SIZE>
__global__ void _CSRRowWiseSampleReplaceKernel(
    const uint64_t rand_seed,
    const int64_t num_picks,
    const int64_t num_rows,
    const IdType* const in_rows,
    const IdType* const in_ptr,
    const IdType* const in_cols,
    const IdType* const data,
    const FloatType* const prob,
    const IdType* const out_ptr,
    const IdType* const cdf_ptr,
    FloatType* const cdf,
    IdType* const out_rows,
    IdType* const out_cols,
    IdType* const out_idxs) {
  // we assign one warp per row
  assert(blockDim.x == BLOCK_SIZE);

  int64_t out_row = blockIdx.x * TILE_SIZE;
  const int64_t last_row =
      min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);

  curandStatePhilox4_32_10_t rng;
  curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);

  while (out_row < last_row) {
    const int64_t row = in_rows[out_row];
    const int64_t in_row_start = in_ptr[row];
    const int64_t out_row_start = out_ptr[out_row];
    const int64_t cdf_row_start = cdf_ptr[out_row];
    const int64_t deg = in_ptr[row + 1] - in_row_start;
    const FloatType MIN_THREAD_DATA = static_cast<FloatType>(0.0f);

    if (deg > 0) {
      // Specialize BlockScan for a 1D block of BLOCK_SIZE threads
      typedef cub::BlockScan<FloatType, BLOCK_SIZE> BlockScan;
      // Allocate shared memory for BlockScan
      __shared__ typename BlockScan::TempStorage temp_storage;
      // Initialize running total
      BlockPrefixCallbackOp<FloatType> prefix_op(MIN_THREAD_DATA);

      int64_t max_iter = (1 + (deg - 1) / BLOCK_SIZE) * BLOCK_SIZE;
      // Have the block iterate over segments of items
      for (int64_t idx = threadIdx.x; idx < max_iter; idx += BLOCK_SIZE) {
        // Load a segment of consecutive items that are blocked across threads
        FloatType thread_data;
        if (idx < deg)
          _DoubleSlice<IdType, FloatType>(prob, data, idx, in_row_start, &thread_data);
        else
          thread_data = MIN_THREAD_DATA;
        thread_data = max(thread_data, MIN_THREAD_DATA);

        // Collectively compute the block-wide inclusive prefix sum
        BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, prefix_op);
        __syncthreads();

        // Store scanned items to cdf array
        if (idx < deg) {
          cdf[cdf_row_start + idx] = thread_data;
        }
      }
      __syncthreads();

      for (int64_t idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
        // get random value
        FloatType sum = cdf[cdf_row_start + deg - 1];
        FloatType rand = static_cast<FloatType>(curand_uniform(&rng) * sum);
        // get the offset of the first value within cdf array which is greater
        // than random value.
        int64_t item = cub::UpperBound<FloatType*, int64_t, FloatType>(
            &cdf[cdf_row_start], deg, rand);
        item = min(item, deg - 1);
        // get in and out index
        const int64_t in_idx = in_row_start + item;
        const int64_t out_idx = out_row_start + idx;
        // copy permutation over
        out_rows[out_idx] = static_cast<IdType>(row);
        out_cols[out_idx] = in_cols[in_idx];
        out_idxs[out_idx] = static_cast<IdType>(data ? data[in_idx] : in_idx);
      }
    }
    out_row += 1;
  }
}
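The kernel above is inverse-transform sampling over an unnormalized CDF: draw r uniformly in [0, total) and binary-search for the first CDF entry exceeding r. A host-side sketch of the same step using std::upper_bound (names are illustrative):

#include <algorithm>
#include <random>
#include <vector>

// Host-side analogue of the cdf + cub::UpperBound step: sample one index in
// [0, cdf.size()) with probability proportional to the underlying weights.
int SampleFromCdf(const std::vector<double>& cdf, std::mt19937* rng) {
  std::uniform_real_distribution<double> uni(0.0, 1.0);
  double r = uni(*rng) * cdf.back();  // cdf.back() is the unnormalized total
  auto it = std::upper_bound(cdf.begin(), cdf.end(), r);
  if (it == cdf.end()) --it;  // clamp, mirroring `item = min(item, deg - 1)`
  return static_cast<int>(it - cdf.begin());
}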
}  // namespace

/////////////////////////////// CSR ///////////////////////////////
/**
 * @brief Perform weighted row-wise sampling on a CSR matrix, and generate a COO matrix.
 *   Use CDF sampling algorithm for with replacement:
 *     1) Calculate the CDF of all neighbor's prob.
 *     2) For each [0, num_picks), generate a rand ~ U(0, 1). Use binary
 *        search to find its index in the CDF array as a chosen item.
 *   Use A-Res sampling algorithm for without replacement:
 *     1) For rows with deg > num_picks, calculate A-Res values for all neighbors.
 *     2) Sort the A-Res array and select top-num_picks as chosen items.
 *
 * @tparam XPU The device type used for matrices.
 * @tparam IdType The ID type used for matrices.
 * @tparam FloatType The Float type used for matrices.
 * @param mat The CSR matrix.
 * @param rows The set of rows to pick.
 * @param num_picks The number of non-zeros to pick per row.
 * @param prob The probability array of the input CSR.
 * @param replace Is replacement sampling?
 * @author pengqirong (OPPO), dlasalle and Xin from Nvidia.
 */
template <DLDeviceType XPU, typename IdType, typename FloatType>
COOMatrix CSRRowWiseSampling(CSRMatrix mat, IdArray rows, int64_t num_picks,
                             FloatArray prob, bool replace) {
  const auto& ctx = rows->ctx;
  auto device = runtime::DeviceAPI::Get(ctx);
  // TODO(dlasalle): Once the device api supports getting the stream from the
  // context, that should be used instead of the default stream here.
  cudaStream_t stream = 0;

  const int64_t num_rows = rows->shape[0];
  const IdType* const slice_rows = static_cast<const IdType*>(rows->data);

  IdArray picked_row = NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8);
  IdArray picked_col = NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8);
  IdArray picked_idx = NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8);
  const IdType* const in_ptr = static_cast<const IdType*>(mat.indptr->data);
  const IdType* const in_cols = static_cast<const IdType*>(mat.indices->data);
  IdType* const out_rows = static_cast<IdType*>(picked_row->data);
  IdType* const out_cols = static_cast<IdType*>(picked_col->data);
  IdType* const out_idxs = static_cast<IdType*>(picked_idx->data);
  const IdType* const data = CSRHasData(mat) ?
      static_cast<IdType*>(mat.data->data) : nullptr;
  const FloatType* const prob_data = static_cast<const FloatType*>(prob->data);

  // compute degree
  // out_deg: the size of each row in the sampled matrix
  // temp_deg: the size of each row we will manipulate in sampling
  //   1) for w/o replacement: in degree if it's greater than num_picks else 0
  //   2) for w/ replacement: in degree
  IdType* out_deg = static_cast<IdType*>(
      device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
  IdType* temp_deg = static_cast<IdType*>(
      device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
  if (replace) {
    const dim3 block(512);
    const dim3 grid((num_rows + block.x - 1) / block.x);
    CUDA_KERNEL_CALL(
        _CSRRowWiseSampleDegreeReplaceKernel,
        grid, block, 0, stream,
        num_picks, num_rows, slice_rows, in_ptr, out_deg, temp_deg);
  } else {
    const dim3 block(512);
    const dim3 grid((num_rows + block.x - 1) / block.x);
    CUDA_KERNEL_CALL(
        _CSRRowWiseSampleDegreeKernel,
        grid, block, 0, stream,
        num_picks, num_rows, slice_rows, in_ptr, out_deg, temp_deg);
  }

  // fill temp_ptr
  IdType* temp_ptr = static_cast<IdType*>(
      device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
  size_t prefix_temp_size = 0;
  CUDA_CALL(cub::DeviceScan::ExclusiveSum(nullptr, prefix_temp_size,
      temp_deg, temp_ptr, num_rows + 1, stream));
  void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
  CUDA_CALL(cub::DeviceScan::ExclusiveSum(prefix_temp, prefix_temp_size,
      temp_deg, temp_ptr, num_rows + 1, stream));
  device->FreeWorkspace(ctx, prefix_temp);
  device->FreeWorkspace(ctx, temp_deg);

  // TODO(Xin): The copy here is too small, and the overhead of creating
  // cuda events cannot be ignored. Just use synchronized copy.
  IdType temp_len;
  device->CopyDataFromTo(temp_ptr, num_rows * sizeof(temp_len), &temp_len, 0,
      sizeof(temp_len), ctx, DGLContext{kDLCPU, 0}, mat.indptr->dtype, stream);
  device->StreamSync(ctx, stream);

  // fill out_ptr
  IdType* out_ptr = static_cast<IdType*>(
      device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
  prefix_temp_size = 0;
  CUDA_CALL(cub::DeviceScan::ExclusiveSum(nullptr, prefix_temp_size,
      out_deg, out_ptr, num_rows + 1, stream));
  prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
  CUDA_CALL(cub::DeviceScan::ExclusiveSum(prefix_temp, prefix_temp_size,
      out_deg, out_ptr, num_rows + 1, stream));
  device->FreeWorkspace(ctx, prefix_temp);
  device->FreeWorkspace(ctx, out_deg);

  cudaEvent_t copyEvent;
  CUDA_CALL(cudaEventCreate(&copyEvent));
  // TODO(dlasalle): use pinned memory to overlap with the actual sampling,
  // and wait on a cudaevent
  IdType new_len;
  device->CopyDataFromTo(out_ptr, num_rows * sizeof(new_len), &new_len, 0,
      sizeof(new_len), ctx, DGLContext{kDLCPU, 0}, mat.indptr->dtype, stream);
  CUDA_CALL(cudaEventRecord(copyEvent, stream));

  // allocate workspace
  // 1) for w/ replacement, it's a global buffer to store cdf segments (one
  //    segment for each row).
  // 2) for w/o replacement, it's used to store a-res segments (one segment
  //    for each row with degree > num_picks)
  FloatType* temp = static_cast<FloatType*>(
      device->AllocWorkspace(ctx, temp_len * sizeof(FloatType)));

  const uint64_t rand_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);

  // select edges
  // the number of rows each thread block will cover
  constexpr int TILE_SIZE = 128 / BLOCK_SIZE;
  if (replace) {  // with replacement.
    const dim3 block(BLOCK_SIZE);
    const dim3 grid((num_rows + TILE_SIZE - 1) / TILE_SIZE);
    CUDA_KERNEL_CALL(
        (_CSRRowWiseSampleReplaceKernel<IdType, FloatType, TILE_SIZE>),
        grid, block, 0, stream,
        rand_seed, num_picks, num_rows, slice_rows, in_ptr, in_cols, data,
        prob_data, out_ptr, temp_ptr, temp, out_rows, out_cols, out_idxs);
    device->FreeWorkspace(ctx, temp);
  } else {  // without replacement
    IdType* temp_idxs = static_cast<IdType*>(
        device->AllocWorkspace(ctx, (temp_len) * sizeof(IdType)));

    // Compute A-Res value. A-Res value needs to be calculated only if deg
    // is greater than num_picks in weighted rowwise sampling without replacement.
    const dim3 block(BLOCK_SIZE);
    const dim3 grid((num_rows + TILE_SIZE - 1) / TILE_SIZE);
    CUDA_KERNEL_CALL(
        (_CSRAResValueKernel<IdType, FloatType, TILE_SIZE>),
        grid, block, 0, stream,
        rand_seed, num_picks, num_rows, slice_rows, in_ptr, data, prob_data,
        temp_ptr, temp_idxs, temp);

    // sort A-Res value array.
    FloatType* sort_temp = static_cast<FloatType*>(
        device->AllocWorkspace(ctx, temp_len * sizeof(FloatType)));
    IdType* sort_temp_idxs = static_cast<IdType*>(
        device->AllocWorkspace(ctx, temp_len * sizeof(IdType)));

    cub::DoubleBuffer<FloatType> sort_keys(temp, sort_temp);
    cub::DoubleBuffer<IdType> sort_values(temp_idxs, sort_temp_idxs);

    void* d_temp_storage = nullptr;
    size_t temp_storage_bytes = 0;
    CUDA_CALL(cub::DeviceSegmentedSort::SortPairsDescending(
        d_temp_storage, temp_storage_bytes, sort_keys, sort_values,
        temp_len, num_rows, temp_ptr, temp_ptr + 1));
    d_temp_storage = device->AllocWorkspace(ctx, temp_storage_bytes);
    CUDA_CALL(cub::DeviceSegmentedSort::SortPairsDescending(
        d_temp_storage, temp_storage_bytes, sort_keys, sort_values,
        temp_len, num_rows, temp_ptr, temp_ptr + 1));
    device->FreeWorkspace(ctx, d_temp_storage);
    device->FreeWorkspace(ctx, temp);
    device->FreeWorkspace(ctx, temp_idxs);
    device->FreeWorkspace(ctx, sort_temp);
    device->FreeWorkspace(ctx, sort_temp_idxs);

    // select top-num_picks as results
    CUDA_KERNEL_CALL(
        (_CSRRowWiseSampleKernel<IdType, FloatType, TILE_SIZE>),
        grid, block, 0, stream,
        num_picks, num_rows, slice_rows, in_ptr, in_cols, data, out_ptr,
        temp_ptr, sort_values.Current(), out_rows, out_cols, out_idxs);
  }
  device->FreeWorkspace(ctx, temp_ptr);
  device->FreeWorkspace(ctx, out_ptr);

  // wait for copying `new_len` to finish
  CUDA_CALL(cudaEventSynchronize(copyEvent));
  CUDA_CALL(cudaEventDestroy(copyEvent));

  picked_row = picked_row.CreateView({new_len}, picked_row->dtype);
  picked_col = picked_col.CreateView({new_len}, picked_col->dtype);
  picked_idx = picked_idx.CreateView({new_len}, picked_idx->dtype);

  return COOMatrix(mat.num_rows, mat.num_cols, picked_row, picked_col, picked_idx);
}
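Both temp_ptr and out_ptr above are built the same way: an exclusive prefix sum over per-row counts, with one trailing zero slot written by the degree kernels so that the scanned array's last element becomes the grand total. A plain C++ sketch of that layout trick (assuming C++17 for std::exclusive_scan; the name is illustrative):

#include <numeric>
#include <vector>

// Host-side analogue of the cub::DeviceScan::ExclusiveSum calls: per-row
// degrees plus one trailing zero become CSR-style offsets, and the scanned
// array's last element is the grand total (temp_len / new_len above).
std::vector<int64_t> DegreesToOffsets(std::vector<int64_t> deg) {
  deg.push_back(0);  // the degree kernels write this trailing slot on device
  std::vector<int64_t> ptr(deg.size());
  std::exclusive_scan(deg.begin(), deg.end(), ptr.begin(), int64_t{0});
  return ptr;  // ptr.back() == total number of sampled entries
}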
template COOMatrix CSRRowWiseSampling<kDLGPU, int32_t, float>(
    CSRMatrix, IdArray, int64_t, FloatArray, bool);
template COOMatrix CSRRowWiseSampling<kDLGPU, int64_t, float>(
    CSRMatrix, IdArray, int64_t, FloatArray, bool);
template COOMatrix CSRRowWiseSampling<kDLGPU, int32_t, double>(
    CSRMatrix, IdArray, int64_t, FloatArray, bool);
template COOMatrix CSRRowWiseSampling<kDLGPU, int64_t, double>(
    CSRMatrix, IdArray, int64_t, FloatArray, bool);

}  // namespace impl
}  // namespace aten
}  // namespace dgl
src/array/cuda/sddmm_hetero.cu → src/array/cuda/sddmm_hetero_coo.cu
...
@@ -9,39 +9,6 @@
 namespace dgl {
 namespace aten {

-/*!
- * \brief CUDA implementation of g-SDDMM on heterograph using Csr format.
- */
-template <int XPU, typename IdType, int bits>
-void SDDMMCsrHetero(const std::string& op,
-                    const BcastOff& bcast,
-                    const std::vector<CSRMatrix>& vec_csr,
-                    const std::vector<NDArray>& vec_lhs,
-                    const std::vector<NDArray>& vec_rhs,
-                    std::vector<NDArray> vec_out,
-                    int lhs_target, int rhs_target,
-                    const std::vector<dgl_type_t>& lhs_eid,
-                    const std::vector<dgl_type_t>& rhs_eid) {
-  SWITCH_BITS(bits, DType, {
-    SWITCH_OP(op, Op, {
-      SWITCH_TARGET(lhs_target, rhs_target, LhsTarget, RhsTarget, {
-        /* Call SDDMM CUDA kernel for each relation type sequentially */
-        for (dgl_type_t etype = 0; etype < lhs_eid.size(); ++etype) {
-          CSRMatrix csr = vec_csr[etype];
-          NDArray lhs = vec_lhs[lhs_eid[etype]];
-          NDArray rhs = vec_rhs[rhs_eid[etype]];
-          NDArray out = vec_out[etype];
-          cuda::SDDMMCsr<IdType, DType, Op, LhsTarget, RhsTarget>(
-              bcast, csr, lhs, rhs, out);
-        }
-      });
-    });
-  });
-}
-
 /*!
  * \brief CUDA implementation of g-SDDMM on heterograph using Csr format.
  */
...
@@ -76,49 +43,6 @@ void SDDMMCooHetero(const std::string& op,
 }

-template void SDDMMCsrHetero<kDLGPU, int32_t, 16>(
-    const std::string& op, const BcastOff& bcast,
-    const std::vector<CSRMatrix>& vec_csr, const std::vector<NDArray>& lhs,
-    const std::vector<NDArray>& rhs, std::vector<NDArray> out,
-    int lhs_target, int rhs_target, const std::vector<dgl_type_t>& in_eid,
-    const std::vector<dgl_type_t>& out_eid);
-template void SDDMMCsrHetero<kDLGPU, int64_t, 16>(
-    const std::string& op, const BcastOff& bcast,
-    const std::vector<CSRMatrix>& vec_csr, const std::vector<NDArray>& lhs,
-    const std::vector<NDArray>& rhs, std::vector<NDArray> out,
-    int lhs_target, int rhs_target, const std::vector<dgl_type_t>& in_eid,
-    const std::vector<dgl_type_t>& out_eid);
-template void SDDMMCsrHetero<kDLGPU, int32_t, 32>(
-    const std::string& op, const BcastOff& bcast,
-    const std::vector<CSRMatrix>& vec_csr, const std::vector<NDArray>& lhs,
-    const std::vector<NDArray>& rhs, std::vector<NDArray> out,
-    int lhs_target, int rhs_target, const std::vector<dgl_type_t>& in_eid,
-    const std::vector<dgl_type_t>& out_eid);
-template void SDDMMCsrHetero<kDLGPU, int64_t, 32>(
-    const std::string& op, const BcastOff& bcast,
-    const std::vector<CSRMatrix>& vec_csr, const std::vector<NDArray>& lhs,
-    const std::vector<NDArray>& rhs, std::vector<NDArray> out,
-    int lhs_target, int rhs_target, const std::vector<dgl_type_t>& in_eid,
-    const std::vector<dgl_type_t>& out_eid);
-template void SDDMMCsrHetero<kDLGPU, int32_t, 64>(
-    const std::string& op, const BcastOff& bcast,
-    const std::vector<CSRMatrix>& vec_csr, const std::vector<NDArray>& lhs,
-    const std::vector<NDArray>& rhs, std::vector<NDArray> out,
-    int lhs_target, int rhs_target, const std::vector<dgl_type_t>& in_eid,
-    const std::vector<dgl_type_t>& out_eid);
-template void SDDMMCsrHetero<kDLGPU, int64_t, 64>(
-    const std::string& op, const BcastOff& bcast,
-    const std::vector<CSRMatrix>& vec_csr, const std::vector<NDArray>& lhs,
-    const std::vector<NDArray>& rhs, std::vector<NDArray> out,
-    int lhs_target, int rhs_target, const std::vector<dgl_type_t>& in_eid,
-    const std::vector<dgl_type_t>& out_eid);
 template void SDDMMCooHetero<kDLGPU, int32_t, 16>(
     const std::string& op, const BcastOff& bcast,
     const std::vector<COOMatrix>& vec_coo,
     ...
...
src/array/cuda/sddmm_hetero_csr.cu  0 → 100644
/*!
 * Copyright (c) 2020 by Contributors
 * \file array/cuda/sddmm.cu
 * \brief SDDMM C APIs and definitions.
 */
#include <dgl/array.h>
#include "./sddmm.cuh"

namespace dgl {
namespace aten {

/*!
 * \brief CUDA implementation of g-SDDMM on heterograph using Csr format.
 */
template <int XPU, typename IdType, int bits>
void SDDMMCsrHetero(const std::string& op,
                    const BcastOff& bcast,
                    const std::vector<CSRMatrix>& vec_csr,
                    const std::vector<NDArray>& vec_lhs,
                    const std::vector<NDArray>& vec_rhs,
                    std::vector<NDArray> vec_out,
                    int lhs_target, int rhs_target,
                    const std::vector<dgl_type_t>& lhs_eid,
                    const std::vector<dgl_type_t>& rhs_eid) {
  SWITCH_BITS(bits, DType, {
    SWITCH_OP(op, Op, {
      SWITCH_TARGET(lhs_target, rhs_target, LhsTarget, RhsTarget, {
        /* Call SDDMM CUDA kernel for each relation type sequentially */
        for (dgl_type_t etype = 0; etype < lhs_eid.size(); ++etype) {
          CSRMatrix csr = vec_csr[etype];
          NDArray lhs = vec_lhs[lhs_eid[etype]];
          NDArray rhs = vec_rhs[rhs_eid[etype]];
          NDArray out = vec_out[etype];
          cuda::SDDMMCsr<IdType, DType, Op, LhsTarget, RhsTarget>(
              bcast, csr, lhs, rhs, out);
        }
      });
    });
  });
}
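The SWITCH_BITS / SWITCH_OP / SWITCH_TARGET macros turn runtime tags into compile-time template parameters by enumerating the cases. A stripped-down, hypothetical sketch of the pattern for the bit-width switch (the real DGL macros cover more dtypes and live in the CUDA headers):

// Hypothetical sketch of the runtime-tag -> template-parameter dispatch that
// SWITCH_BITS performs; the real DGL macro supports more widths and dtypes.
// The variadic body lets commas inside the code block pass through intact.
#define MY_SWITCH_BITS(bits, DType, ...)  \
  do {                                    \
    if ((bits) == 32) {                   \
      typedef float DType;                \
      { __VA_ARGS__ }                     \
    } else if ((bits) == 64) {            \
      typedef double DType;               \
      { __VA_ARGS__ }                     \
    }                                     \
  } while (0)

Each branch re-instantiates the body with a different concrete DType, which is why the nested SWITCH_OP / SWITCH_TARGET calls expand into a full cross-product of template instantiations like the ones listed below.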
template void SDDMMCsrHetero<kDLGPU, int32_t, 16>(
    const std::string& op, const BcastOff& bcast,
    const std::vector<CSRMatrix>& vec_csr, const std::vector<NDArray>& lhs,
    const std::vector<NDArray>& rhs, std::vector<NDArray> out,
    int lhs_target, int rhs_target, const std::vector<dgl_type_t>& in_eid,
    const std::vector<dgl_type_t>& out_eid);
template void SDDMMCsrHetero<kDLGPU, int64_t, 16>(
    const std::string& op, const BcastOff& bcast,
    const std::vector<CSRMatrix>& vec_csr, const std::vector<NDArray>& lhs,
    const std::vector<NDArray>& rhs, std::vector<NDArray> out,
    int lhs_target, int rhs_target, const std::vector<dgl_type_t>& in_eid,
    const std::vector<dgl_type_t>& out_eid);
template void SDDMMCsrHetero<kDLGPU, int32_t, 32>(
    const std::string& op, const BcastOff& bcast,
    const std::vector<CSRMatrix>& vec_csr, const std::vector<NDArray>& lhs,
    const std::vector<NDArray>& rhs, std::vector<NDArray> out,
    int lhs_target, int rhs_target, const std::vector<dgl_type_t>& in_eid,
    const std::vector<dgl_type_t>& out_eid);
template void SDDMMCsrHetero<kDLGPU, int64_t, 32>(
    const std::string& op, const BcastOff& bcast,
    const std::vector<CSRMatrix>& vec_csr, const std::vector<NDArray>& lhs,
    const std::vector<NDArray>& rhs, std::vector<NDArray> out,
    int lhs_target, int rhs_target, const std::vector<dgl_type_t>& in_eid,
    const std::vector<dgl_type_t>& out_eid);
template void SDDMMCsrHetero<kDLGPU, int32_t, 64>(
    const std::string& op, const BcastOff& bcast,
    const std::vector<CSRMatrix>& vec_csr, const std::vector<NDArray>& lhs,
    const std::vector<NDArray>& rhs, std::vector<NDArray> out,
    int lhs_target, int rhs_target, const std::vector<dgl_type_t>& in_eid,
    const std::vector<dgl_type_t>& out_eid);
template void SDDMMCsrHetero<kDLGPU, int64_t, 64>(
    const std::string& op, const BcastOff& bcast,
    const std::vector<CSRMatrix>& vec_csr, const std::vector<NDArray>& lhs,
    const std::vector<NDArray>& rhs, std::vector<NDArray> out,
    int lhs_target, int rhs_target, const std::vector<dgl_type_t>& in_eid,
    const std::vector<dgl_type_t>& out_eid);

}  // namespace aten
}  // namespace dgl
src/graph/sampling/randomwalks/randomwalk_gpu.cu
 /*!
- * Copyright (c) 2021 by Contributors
+ * Copyright (c) 2021-2022 by Contributors
  * \file graph/sampling/randomwalk_gpu.cu
- * \brief DGL sampler
+ * \brief CUDA random walk sampling
  */
 #include <dgl/array.h>
...
@@ -13,6 +13,7 @@
 #include <utility>
 #include <tuple>
+#include "../../../array/cuda/dgl_cub.cuh"
 #include "../../../runtime/cuda/cuda_common.h"
 #include "frequency_hashmap.cuh"
...
@@ -89,6 +90,81 @@ __global__ void _RandomWalkKernel(
   }
 }
template <typename IdType, typename FloatType, int BLOCK_SIZE, int TILE_SIZE>
__global__ void _RandomWalkBiasedKernel(
    const uint64_t rand_seed,
    const IdType* seed_data,
    const int64_t num_seeds,
    const IdType* metapath_data,
    const uint64_t max_num_steps,
    const GraphKernelData<IdType>* graphs,
    const FloatType** probs,
    const FloatType** prob_sums,
    const FloatType* restart_prob_data,
    const int64_t restart_prob_size,
    const int64_t max_nodes,
    IdType* out_traces_data,
    IdType* out_eids_data) {
  assert(BLOCK_SIZE == blockDim.x);
  int64_t idx = blockIdx.x * TILE_SIZE + threadIdx.x;
  int64_t last_idx = min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_seeds);
  int64_t trace_length = (max_num_steps + 1);

  curandState rng;
  // reference:
  // https://docs.nvidia.com/cuda/curand/device-api-overview.html#performance-notes
  curand_init(rand_seed + idx, 0, 0, &rng);

  while (idx < last_idx) {
    IdType curr = seed_data[idx];
    assert(curr < max_nodes);
    IdType* traces_data_ptr = &out_traces_data[idx * trace_length];
    IdType* eids_data_ptr = &out_eids_data[idx * max_num_steps];
    *(traces_data_ptr++) = curr;

    int64_t step_idx;
    for (step_idx = 0; step_idx < max_num_steps; ++step_idx) {
      IdType metapath_id = metapath_data[step_idx];
      const GraphKernelData<IdType>& graph = graphs[metapath_id];
      const int64_t in_row_start = graph.in_ptr[curr];
      const int64_t deg = graph.in_ptr[curr + 1] - graph.in_ptr[curr];
      if (deg == 0) {  // the degree is zero
        break;
      }
      // randomly select by weight
      const FloatType* prob_sum = prob_sums[metapath_id];
      const FloatType* prob = probs[metapath_id];
      int64_t num;
      if (prob == nullptr) {
        num = curand(&rng) % deg;
      } else {
        auto rnd_sum_w = prob_sum[curr] * curand_uniform(&rng);
        FloatType sum_w{0.};
        for (num = 0; num < deg; ++num) {
          sum_w += prob[in_row_start + num];
          if (sum_w >= rnd_sum_w) break;
        }
      }
      IdType pick = graph.in_cols[in_row_start + num];
      IdType eid = (graph.data ? graph.data[in_row_start + num]
                               : in_row_start + num);
      *traces_data_ptr = pick;
      *eids_data_ptr = eid;
      if ((restart_prob_size > 1) &&
          (curand_uniform(&rng) < restart_prob_data[step_idx])) {
        break;
      } else if ((restart_prob_size == 1) &&
                 (curand_uniform(&rng) < restart_prob_data[0])) {
        break;
      }
      ++traces_data_ptr;
      ++eids_data_ptr;
      curr = pick;
    }
    for (; step_idx < max_num_steps; ++step_idx) {
      *(traces_data_ptr++) = -1;
      *(eids_data_ptr++) = -1;
    }
    idx += BLOCK_SIZE;
  }
}

}  // namespace
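The weighted step above is inverse transform sampling done with a linear scan: draw r ~ U(0, W) against the precomputed row weight sum W, then accumulate neighbor weights until the running sum reaches r. A host-side sketch of one such step (illustrative names; the clamp at the end is a defensive addition against floating-point drift, which the kernel handles through its loop bound):

#include <algorithm>
#include <cstdint>
#include <random>
#include <vector>

// Host-side analogue of the weighted step in _RandomWalkBiasedKernel: pick a
// neighbor slot in [0, w.size()) with probability w[i] / w_sum, where w_sum
// is the row total precomputed by cub::DeviceSegmentedReduce::Sum.
int64_t BiasedStep(const std::vector<double>& w, double w_sum,
                   std::mt19937* rng) {
  std::uniform_real_distribution<double> uni(0.0, 1.0);
  const double r = w_sum * uni(*rng);
  double acc = 0.0;
  int64_t num = 0;
  for (; num < static_cast<int64_t>(w.size()); ++num) {
    acc += w[num];
    if (acc >= r) break;  // first slot whose running sum reaches r
  }
  // defensive clamp against floating-point drift (illustrative addition)
  return std::min<int64_t>(num, static_cast<int64_t>(w.size()) - 1);
}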
// random walk for uniform choice
...
@@ -167,6 +243,143 @@ std::pair<IdArray, IdArray> RandomWalkUniform(
   return std::make_pair(traces, eids);
 }
/**
 * \brief Random walk for biased choice. We use inverse transform sampling to
 * choose the next step.
 */
template <DLDeviceType XPU, typename FloatType, typename IdType>
std::pair<IdArray, IdArray> RandomWalkBiased(
    const HeteroGraphPtr hg,
    const IdArray seeds,
    const TypeArray metapath,
    const std::vector<FloatArray>& prob,
    FloatArray restart_prob) {
  const int64_t max_num_steps = metapath->shape[0];
  const IdType* metapath_data = static_cast<IdType*>(metapath->data);
  const int64_t begin_ntype = hg->meta_graph()->FindEdge(metapath_data[0]).first;
  const int64_t max_nodes = hg->NumVertices(begin_ntype);
  int64_t num_etypes = hg->NumEdgeTypes();
  auto ctx = seeds->ctx;

  const IdType* seed_data = static_cast<const IdType*>(seeds->data);
  CHECK(seeds->ndim == 1) << "seeds shape is not one dimension.";
  const int64_t num_seeds = seeds->shape[0];
  int64_t trace_length = max_num_steps + 1;
  IdArray traces = IdArray::Empty({num_seeds, trace_length}, seeds->dtype, ctx);
  IdArray eids = IdArray::Empty({num_seeds, max_num_steps}, seeds->dtype, ctx);
  IdType* traces_data = traces.Ptr<IdType>();
  IdType* eids_data = eids.Ptr<IdType>();
  cudaStream_t stream = 0;
  auto device = DeviceAPI::Get(ctx);

  // new probs and prob sums pointers
  assert(num_etypes == static_cast<int64_t>(prob.size()));
  std::unique_ptr<FloatType*[]> probs(new FloatType*[prob.size()]);
  std::unique_ptr<FloatType*[]> prob_sums(new FloatType*[prob.size()]);
  std::vector<FloatArray> prob_sums_arr;
  prob_sums_arr.reserve(prob.size());
  // graphs
  std::vector<GraphKernelData<IdType>> h_graphs(num_etypes);
  for (int64_t etype = 0; etype < num_etypes; ++etype) {
    const CSRMatrix& csr = hg->GetCSRMatrix(etype);
    h_graphs[etype].in_ptr = static_cast<const IdType*>(csr.indptr->data);
    h_graphs[etype].in_cols = static_cast<const IdType*>(csr.indices->data);
    h_graphs[etype].data = (CSRHasData(csr) ?
        static_cast<const IdType*>(csr.data->data) : nullptr);
    int64_t num_segments = csr.indptr->shape[0] - 1;
    // will handle empty probs in the kernel
    if (IsNullArray(prob[etype])) {
      probs[etype] = nullptr;
      prob_sums[etype] = nullptr;
      continue;
    }
    probs[etype] = prob[etype].Ptr<FloatType>();
    prob_sums_arr.push_back(
        FloatArray::Empty({num_segments}, prob[etype]->dtype, ctx));
    prob_sums[etype] = prob_sums_arr[etype].Ptr<FloatType>();
    // calculate the sum of the neighbor weights
    const IdType* d_offsets = static_cast<const IdType*>(csr.indptr->data);
    size_t temp_storage_size = 0;
    CUDA_CALL(cub::DeviceSegmentedReduce::Sum(nullptr, temp_storage_size,
        probs[etype], prob_sums[etype], num_segments, d_offsets, d_offsets + 1));
    void* temp_storage = device->AllocWorkspace(ctx, temp_storage_size);
    CUDA_CALL(cub::DeviceSegmentedReduce::Sum(temp_storage, temp_storage_size,
        probs[etype], prob_sums[etype], num_segments, d_offsets, d_offsets + 1));
    device->FreeWorkspace(ctx, temp_storage);
  }

  // copy graph metadata pointers to GPU
  auto d_graphs = static_cast<GraphKernelData<IdType>*>(
      device->AllocWorkspace(ctx, (num_etypes) * sizeof(GraphKernelData<IdType>)));
  device->CopyDataFromTo(h_graphs.data(), 0, d_graphs, 0,
      (num_etypes) * sizeof(GraphKernelData<IdType>),
      DGLContext{kDLCPU, 0}, ctx, hg->GetCSRMatrix(0).indptr->dtype, stream);
  // copy probs pointers to GPU
  const FloatType** probs_dev = static_cast<const FloatType**>(
      device->AllocWorkspace(ctx, num_etypes * sizeof(FloatType*)));
  device->CopyDataFromTo(probs.get(), 0, probs_dev, 0,
      (num_etypes) * sizeof(FloatType*),
      DGLContext{kDLCPU, 0}, ctx, prob[0]->dtype, stream);
  // copy probs_sum pointers to GPU
  const FloatType** prob_sums_dev = static_cast<const FloatType**>(
      device->AllocWorkspace(ctx, num_etypes * sizeof(FloatType*)));
  device->CopyDataFromTo(prob_sums.get(), 0, prob_sums_dev, 0,
      (num_etypes) * sizeof(FloatType*),
      DGLContext{kDLCPU, 0}, ctx, prob[0]->dtype, stream);
  // copy metapath to GPU
  auto d_metapath = metapath.CopyTo(ctx);
  const IdType* d_metapath_data = static_cast<IdType*>(d_metapath->data);

  constexpr int BLOCK_SIZE = 256;
  constexpr int TILE_SIZE = BLOCK_SIZE * 4;
  dim3 block(256);
  dim3 grid((num_seeds + TILE_SIZE - 1) / TILE_SIZE);
  const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);

  CHECK(restart_prob->ctx.device_type == kDLGPU) << "restart prob should be in GPU.";
  CHECK(restart_prob->ndim == 1) << "restart prob dimension should be 1.";
  const FloatType* restart_prob_data = restart_prob.Ptr<FloatType>();
  const int64_t restart_prob_size = restart_prob->shape[0];

  CUDA_KERNEL_CALL(
      (_RandomWalkBiasedKernel<IdType, FloatType, BLOCK_SIZE, TILE_SIZE>),
      grid, block, 0, stream,
      random_seed, seed_data, num_seeds, d_metapath_data, max_num_steps,
      d_graphs, probs_dev, prob_sums_dev, restart_prob_data,
      restart_prob_size, max_nodes, traces_data, eids_data);
  device->FreeWorkspace(ctx, d_graphs);
  device->FreeWorkspace(ctx, probs_dev);
  device->FreeWorkspace(ctx, prob_sums_dev);
  return std::make_pair(traces, eids);
}
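RandomWalkBiased keeps one device buffer per edge type and uploads the whole table of device pointers, so the kernel can select a buffer by metapath id. A minimal sketch of that pointer-of-pointers upload with raw CUDA calls (the DGL code routes this through DeviceAPI::AllocWorkspace and CopyDataFromTo instead):

#include <cuda_runtime.h>
#include <vector>

// Minimal sketch of the pointer-of-pointers upload: ship an array of device
// pointers to the GPU so a kernel can pick a per-edge-type buffer by index,
// as _RandomWalkBiasedKernel does with probs and prob_sums.
float** UploadPointerTable(const std::vector<float*>& h_ptrs) {
  float** d_table = nullptr;
  cudaMalloc(&d_table, h_ptrs.size() * sizeof(float*));
  cudaMemcpy(d_table, h_ptrs.data(), h_ptrs.size() * sizeof(float*),
             cudaMemcpyHostToDevice);
  return d_table;  // caller releases with cudaFree
}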
 template <DLDeviceType XPU, typename IdType>
 std::pair<IdArray, IdArray> RandomWalk(
     const HeteroGraphPtr hg,
...
@@ -174,16 +387,25 @@ std::pair<IdArray, IdArray> RandomWalk(
     const TypeArray metapath,
     const std::vector<FloatArray>& prob) {
-  // not support no-uniform choice now
+  bool isUniform = true;
   for (const auto& etype_prob : prob) {
     if (!IsNullArray(etype_prob)) {
-      LOG(FATAL) << "Non-uniform choice is not supported in GPU.";
+      isUniform = false;
+      break;
     }
   }
   auto restart_prob = NDArray::Empty(
       {0}, DLDataType{kDLFloat, 32, 1}, DGLContext{XPU, 0});
-  return RandomWalkUniform<XPU, IdType>(hg, seeds, metapath, restart_prob);
+  if (!isUniform) {
+    std::pair<IdArray, IdArray> ret;
+    ATEN_FLOAT_TYPE_SWITCH(prob[0]->dtype, FloatType, "probability", {
+      ret = RandomWalkBiased<XPU, FloatType, IdType>(
+          hg, seeds, metapath, prob, restart_prob);
+    });
+    return ret;
+  } else {
+    return RandomWalkUniform<XPU, IdType>(hg, seeds, metapath, restart_prob);
+  }
 }
 template <DLDeviceType XPU, typename IdType>
...
@@ -194,12 +416,14 @@ std::pair<IdArray, IdArray> RandomWalkWithRestart(
     const std::vector<FloatArray>& prob,
     double restart_prob) {
-  // not support no-uniform choice now
+  bool isUniform = true;
   for (const auto& etype_prob : prob) {
     if (!IsNullArray(etype_prob)) {
-      LOG(FATAL) << "Non-uniform choice is not supported in GPU.";
+      isUniform = false;
+      break;
     }
   }
   auto device_ctx = seeds->ctx;
   auto restart_prob_array = NDArray::Empty(
       {1}, DLDataType{kDLFloat, 64, 1}, device_ctx);
...
@@ -214,7 +438,16 @@ std::pair<IdArray, IdArray> RandomWalkWithRestart(
       restart_prob_array->dtype, stream);
   device->StreamSync(device_ctx, stream);
-  return RandomWalkUniform<XPU, IdType>(hg, seeds, metapath, restart_prob_array);
+  if (!isUniform) {
+    std::pair<IdArray, IdArray> ret;
+    ATEN_FLOAT_TYPE_SWITCH(prob[0]->dtype, FloatType, "probability", {
+      ret = RandomWalkBiased<XPU, FloatType, IdType>(
+          hg, seeds, metapath, prob, restart_prob_array);
+    });
+    return ret;
+  } else {
+    return RandomWalkUniform<XPU, IdType>(hg, seeds, metapath, restart_prob_array);
+  }
 }
 template <DLDeviceType XPU, typename IdType>
...
@@ -225,14 +458,23 @@ std::pair<IdArray, IdArray> RandomWalkWithStepwiseRestart(
     const std::vector<FloatArray>& prob,
     FloatArray restart_prob) {
-  // not support no-uniform choice now
+  bool isUniform = true;
   for (const auto& etype_prob : prob) {
     if (!IsNullArray(etype_prob)) {
-      LOG(FATAL) << "Non-uniform choice is not supported in GPU.";
+      isUniform = false;
+      break;
     }
   }
-  return RandomWalkUniform<XPU, IdType>(hg, seeds, metapath, restart_prob);
+  if (!isUniform) {
+    std::pair<IdArray, IdArray> ret;
+    ATEN_FLOAT_TYPE_SWITCH(prob[0]->dtype, FloatType, "probability", {
+      ret = RandomWalkBiased<XPU, FloatType, IdType>(
+          hg, seeds, metapath, prob, restart_prob);
+    });
+    return ret;
+  } else {
+    return RandomWalkUniform<XPU, IdType>(hg, seeds, metapath, restart_prob);
+  }
 }

 template <DLDeviceType XPU, typename IdxType>
...
src/graph/sampling/randomwalks/randomwalks.cc
...
@@ -44,9 +44,14 @@ void CheckRandomWalkInputs(
   }
   for (uint64_t i = 0; i < prob.size(); ++i) {
     FloatArray p = prob[i];
+    CHECK_EQ(hg->Context(), p->ctx)
+        << "Expected prob (" << p->ctx << ") to have the same "
+        << "context as graph (" << hg->Context() << ").";
     CHECK_FLOAT(p, "probability");
-    if (p.GetSize() != 0)
+    if (p.GetSize() != 0) {
+      CHECK_EQ(hg->IsPinned(), p.IsPinned())
+          << "The prob array should have the same pinning status as the graph";
       CHECK_NDIM(p, 1, "probability");
+    }
   }
 }
...
src/random/random.cc
...
@@ -29,13 +29,15 @@ DGL_REGISTER_GLOBAL("rng._CAPI_SetSeed")
   }
 });
 #ifdef DGL_USE_CUDA
-    auto* thr_entry = CUDAThreadEntry::ThreadLocal();
-    if (!thr_entry->curand_gen) {
-      CURAND_CALL(curandCreateGenerator(&thr_entry->curand_gen,
-                                        CURAND_RNG_PSEUDO_DEFAULT));
+    if (DeviceAPI::Get(kDLGPU)->IsAvailable()) {
+      auto* thr_entry = CUDAThreadEntry::ThreadLocal();
+      if (!thr_entry->curand_gen) {
+        CURAND_CALL(curandCreateGenerator(&thr_entry->curand_gen,
+                                          CURAND_RNG_PSEUDO_DEFAULT));
+      }
+      CURAND_CALL(curandSetPseudoRandomGeneratorSeed(
+          thr_entry->curand_gen, static_cast<uint64_t>(seed)));
     }
-    CURAND_CALL(curandSetPseudoRandomGeneratorSeed(
-        thr_entry->curand_gen, static_cast<uint64_t>(seed)));
 #endif  // DGL_USE_CUDA
 });
...
src/rpc/network/tcp_socket.cc
...
@@ -64,14 +64,22 @@ bool TCPSocket::Bind(const char * ip, int port) {
   SAI sa_server;
   sa_server.sin_family = AF_INET;
   sa_server.sin_port = htons(port);
-  int retval = 0;
+  int ret = 0;
+  ret = inet_pton(AF_INET, ip, &sa_server.sin_addr);
+  if (ret == 0) {
+    LOG(ERROR) << "Invalid IP: " << ip;
+    return false;
+  } else if (ret < 0) {
+    LOG(ERROR) << "Failed to convert [" << ip
+               << "] to binary form, error: " << strerror(errno);
+    return false;
+  }
   do {  // retry if EINTR failure appears
-    if (0 < inet_pton(AF_INET, ip, &sa_server.sin_addr) &&
-        0 <= (retval = bind(socket_, reinterpret_cast<SA*>(&sa_server),
-                            sizeof(sa_server)))) {
+    if (0 <= (ret = bind(socket_, reinterpret_cast<SA*>(&sa_server),
+                         sizeof(sa_server)))) {
       return true;
     }
-  } while (retval == -1 && errno == EINTR);
+  } while (ret == -1 && errno == EINTR);
   LOG(ERROR) << "Failed bind on " << ip << ":" << port
              << " , error: " << strerror(errno);
   return false;
...
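The do/while above exists because bind() may be interrupted by a signal before completing; POSIX reports that as -1 with errno == EINTR, and the correct response is simply to reissue the call. A condensed sketch of the idiom (the helper name is hypothetical, not part of the DGL codebase):

#include <cerrno>

// The EINTR-retry idiom used by TCPSocket::Bind, condensed: reissue a
// syscall that a signal interrupted instead of treating it as failure.
template <typename Call>
int RetryOnEintr(Call call) {
  int rc;
  do {
    rc = call();
  } while (rc == -1 && errno == EINTR);
  return rc;
}
// usage: int rc = RetryOnEintr([&] { return bind(fd, addr, addr_len); });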
src/runtime/cuda/cuda_device_api.cc
...
@@ -4,7 +4,7 @@
  * \brief GPU specific API
  */
 #include <dgl/runtime/device_api.h>
+#include <dgl/runtime/tensordispatch.h>
 #include <dmlc/thread_local.h>
 #include <dgl/runtime/registry.h>
 #include <cuda_runtime.h>
...
@@ -15,6 +15,23 @@ namespace runtime {
 class CUDADeviceAPI final : public DeviceAPI {
  public:
+  CUDADeviceAPI() {
+    int count;
+    auto err = cudaGetDeviceCount(&count);
+    switch (err) {
+      case cudaSuccess:
+        break;
+      default:
+        count = 0;
+        cudaGetLastError();
+    }
+    is_available_ = count > 0;
+  }
+
+  bool IsAvailable() final {
+    return is_available_;
+  }
+
   void SetDevice(DGLContext ctx) final {
     CUDA_CALL(cudaSetDevice(ctx.device_id));
   }
...
@@ -224,11 +241,21 @@ class CUDADeviceAPI final : public DeviceAPI {
   }

   void* AllocWorkspace(DGLContext ctx, size_t size, DGLType type_hint) final {
-    return CUDAThreadEntry::ThreadLocal()->pool.AllocWorkspace(ctx, size);
+    // Redirect to PyTorch's allocator when available.
+    SetDevice(ctx);
+    TensorDispatcher* td = TensorDispatcher::Global();
+    if (td->IsAvailable())
+      return td->AllocWorkspace(size);
+    else
+      return CUDAThreadEntry::ThreadLocal()->pool.AllocWorkspace(ctx, size);
   }

   void FreeWorkspace(DGLContext ctx, void* data) final {
-    CUDAThreadEntry::ThreadLocal()->pool.FreeWorkspace(ctx, data);
+    TensorDispatcher* td = TensorDispatcher::Global();
+    if (td->IsAvailable())
+      td->FreeWorkspace(data);
+    else
+      CUDAThreadEntry::ThreadLocal()->pool.FreeWorkspace(ctx, data);
   }

   static const std::shared_ptr<CUDADeviceAPI>& Global() {
...
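Routing workspace allocation through TensorDispatcher lets DGL scratch buffers come from the same caching allocator PyTorch uses, so the two frameworks stop competing for device memory. Structurally it is a capability-probe fallback; a generic sketch of the shape (all names here are hypothetical, for illustration only):

#include <cstddef>

// Generic shape of the capability-probe fallback in AllocWorkspace: prefer
// an external allocator when one is plugged in, else use the built-in pool.
struct Allocator {
  virtual ~Allocator() = default;
  virtual bool Available() const = 0;
  virtual void* Alloc(std::size_t size) = 0;
};

void* AllocScratch(Allocator* external, Allocator* pool, std::size_t size) {
  if (external != nullptr && external->Available())
    return external->Alloc(size);  // e.g. a framework's caching allocator
  return pool->Alloc(size);        // fall back to the built-in pool
}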