functional.py 115 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
##
#   Copyright 2019-2021 Contributors
#
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.
#
16
"""Functional interface for transform"""
17

18
from collections.abc import Iterable, Mapping
19
from collections import defaultdict
20
import numpy as np
21
22
import scipy.sparse as sparse
import scipy.sparse.linalg
Da Zheng's avatar
Da Zheng committed
23

24
25
26
27
28
29
30
31
32
33
34
35
36
from .._ffi.function import _init_api
from ..base import dgl_warning, DGLError
from .. import convert
from ..heterograph import DGLHeteroGraph, DGLBlock
from ..heterograph_index import create_metagraph_index, create_heterograph_from_relations
from ..frame import Frame
from .. import ndarray as nd
from .. import backend as F
from .. import utils, batch
from ..partition import metis_partition_assignment
from ..partition import partition_graph_with_halo
from ..partition import metis_partition
from .. import subgraph
37

38
# TO BE DEPRECATED
39
from .._deprecate.graph import DGLGraph as DGLGraphStale
40

41
42
43
44
45
46
# Public symbols re-exported on ``from ... import *``.  The list mixes
# functions defined in this module with the partition helpers imported above
# (``metis_partition_assignment`` etc. come from ``..partition``).
__all__ = [
    'line_graph',
    'khop_adj',
    'khop_graph',
    'reverse',
    'to_bidirected',
    'to_bidirected_stale',
    'add_reverse_edges',
    'laplacian_lambda_max',
    'knn_graph',
    'segmented_knn_graph',
    'add_edges',
    'add_nodes',
    'remove_edges',
    'remove_nodes',
    'add_self_loop',
    'remove_self_loop',
    'metapath_reachable_graph',
    'compact_graphs',
    'to_block',
    'to_simple',
    'to_simple_graph',
    'as_immutable_graph',
    'sort_csr_by_tag',
    'sort_csc_by_tag',
    'metis_partition_assignment',
    'partition_graph_with_halo',
    'metis_partition',
    'as_heterograph',
    'adj_product_graph',
    'adj_sum_graph',
    'reorder_graph'
    ]
74
75


76
77
78
79
80
81
82
83
84
85
def pairwise_squared_distance(x):
    """Return all-pairs squared Euclidean distances within each sample.

    x : (n_samples, n_points, dims)
    return : (n_samples, n_points, n_points)
    """
    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b, evaluated with a single batched
    # matmul (``__matmul__`` is assumed available in every supported backend).
    sq_norm = F.sum(x * x, -1, True)
    gram = x @ F.swapaxes(x, -1, -2)
    return sq_norm + F.swapaxes(sq_norm, -1, -2) - 2 * gram

#pylint: disable=invalid-name
86
87
def knn_graph(x, k, algorithm='bruteforce-blas', dist='euclidean'):
    r"""Construct a graph connecting each point to its k-nearest neighbors (KNN).

    Every row of :attr:`x` becomes a node; the predecessors of a node are its
    :attr:`k` nearest neighbors under the chosen distance metric.  A 3D input
    is treated as a batch of point sets: one KNN graph is built per 2D slice
    and all of them are merged into a single graph of disjoint components.

    Parameters
    ----------
    x : Tensor
        Point coordinates, on CPU or GPU.

        * If 2D, ``x[i]`` corresponds to the i-th node of the KNN graph.

        * If 3D, ``x[i]`` corresponds to the i-th KNN graph and ``x[i][j]``
          to the j-th node of that graph.
    k : int
        The number of nearest neighbors per node.
    algorithm : str, optional
        One of 'bruteforce-blas' (default; dense distance matrix via backend
        BLAS, :math:`O(N^2)` memory), 'bruteforce' (:math:`O(Nk)` memory),
        'bruteforce-sharemem' (CUDA only), 'kd-tree' (CPU only), or
        'nn-descent' (approximate, from
        `Efficient k-nearest neighbor graph construction for generic similarity
        measures <https://www.cs.princeton.edu/cass/papers/www11.pdf>`_).
    dist : str, optional
        The distance metric: 'euclidean' (default) or 'cosine'.

    Returns
    -------
    DGLGraph
        The constructed graph.  Node IDs follow the row order of :attr:`x`.
    """
    # Reject a non-positive neighbor count up front.
    if k <= 0:
        raise DGLError("Invalid k value. expect k > 0, got k = {}".format(k))
    # Reject an empty point set.
    if F.shape(x)[0] == 0:
        raise DGLError("Find empty point set")

    if algorithm == 'bruteforce-blas':
        return _knn_graph_blas(x, k, dist=dist)

    # Generic path: flatten a batched (3D) input into one 2D point matrix
    # plus the per-set segment sizes expected by ``knn``.
    if F.ndim(x) == 3:
        n_sets, pts_per_set, n_dims = F.shape(x)
        x = F.reshape(x, (n_sets * pts_per_set, n_dims))
        segments = n_sets * [pts_per_set]
    else:
        segments = [F.shape(x)[0]]
    result = knn(k, x, segments, algorithm=algorithm, dist=dist)
    # ``knn`` returns (query ids, neighbor ids); edges point neighbor -> query.
    return convert.graph((result[1], result[0]))

215
216
217
218
219
220
def _knn_graph_blas(x, k, dist='euclidean'):
    r"""Construct a KNN graph via a dense pairwise distance matrix.

    Computes the full distance matrix with one BLAS matrix multiplication
    provided by the backend, then takes the top-k smallest entries per point.
    Fast for small point sets but needs :math:`O(N^2)` memory.

    Parameters
    ----------
    x : Tensor
        Point coordinates, on CPU or GPU.  2D ``(n_points, dims)`` for one
        point set, or 3D ``(n_samples, n_points, dims)`` for a batch.
    k : int
        The number of nearest neighbors per node.  Clamped (with a warning)
        to the number of points per sample when larger.
    dist : str, optional
        The distance metric: 'euclidean' (default) or 'cosine'.

    Returns
    -------
    DGLGraph
        The constructed graph; batched samples become disjoint components.
    """
    if F.ndim(x) == 2:
        x = F.unsqueeze(x, 0)
    n_samples, n_points, _ = F.shape(x)

    if k > n_points:
        # BUGFIX: the two adjacent string literals were concatenated without a
        # separator, producing "...in 'x'expect k <= ..." in the warning text.
        dgl_warning("'k' should be less than or equal to the number of points in 'x', "
                    "expect k <= {0}, got k = {1}, use k = {0}".format(n_points, k))
        k = n_points

    # For cosine distance, L2-normalize the points first: euclidean top-k on
    # the normalized points yields the same neighbor ranking.
    if dist == 'cosine':
        l2_norm = lambda v: F.sqrt(F.sum(v * v, dim=2, keepdims=True))
        x = x / (l2_norm(x) + 1e-5)

    ctx = F.context(x)
    # Renamed from ``dist`` to avoid shadowing the metric argument.
    dist_mat = pairwise_squared_distance(x)
    k_indices = F.astype(F.argtopk(dist_mat, k, 2, descending=False), F.int64)
    # Node IDs of sample i are shifted by i * n_points in the merged graph.
    offset = F.arange(0, n_samples, ctx=ctx) * n_points
    offset = F.unsqueeze(offset, 1)
    src = F.reshape(k_indices, (n_samples, n_points * k))
    src = F.unsqueeze(src, 0) + offset
    dst = F.repeat(F.arange(0, n_points, ctx=ctx), k, dim=0)
    dst = F.unsqueeze(dst, 0) + offset
    return convert.graph((F.reshape(src, (-1,)), F.reshape(dst, (-1,))))
266
267

#pylint: disable=invalid-name
268
269
def segmented_knn_graph(x, k, segs, algorithm='bruteforce-blas', dist='euclidean'):
    r"""Construct KNN graphs for several point sets packed into one tensor.

    Unlike :func:`dgl.knn_graph`, the point sets may have different sizes.
    They are stored contiguously along the first axis of :attr:`x`, and
    :attr:`segs` gives the number of points of each set.  One KNN graph is
    built per set and the results are merged into a single graph whose
    components are disjoint.

    Parameters
    ----------
    x : Tensor
        Coordinates/features of points.  Must be 2D; CPU or GPU.
    k : int
        The number of nearest neighbors per node.
    segs : list[int]
        Number of points in each point set; must sum up to the number of
        rows in :attr:`x`.
    algorithm : str, optional
        One of 'bruteforce-blas' (default; dense distance matrix via backend
        BLAS, :math:`O(N^2)` memory), 'bruteforce' (:math:`O(Nk)` memory),
        'bruteforce-sharemem' (CUDA only), 'kd-tree' (CPU only), or
        'nn-descent' (approximate, from
        `Efficient k-nearest neighbor graph construction for generic similarity
        measures <https://www.cs.princeton.edu/cass/papers/www11.pdf>`_).
    dist : str, optional
        The distance metric: 'euclidean' (default) or 'cosine'.

    Returns
    -------
    DGLGraph
        The merged graph.  Node IDs follow the row order of :attr:`x`.
    """
    # Reject a non-positive neighbor count up front.
    if k <= 0:
        raise DGLError("Invalid k value. expect k > 0, got k = {}".format(k))
    # Reject an empty point set.
    if F.shape(x)[0] == 0:
        raise DGLError("Find empty point set")

    if algorithm == 'bruteforce-blas':
        return _segmented_knn_graph_blas(x, k, segs, dist=dist)
    result = knn(k, x, segs, algorithm=algorithm, dist=dist)
    # ``knn`` returns (query ids, neighbor ids); edges point neighbor -> query.
    return convert.graph((result[1], result[0]))

376
377
378
379
380
381
382
def _segmented_knn_graph_blas(x, k, segs, dist='euclidean'):
    r"""Segmented KNN via one dense distance matrix per segment.

    For each segment, computes the full pairwise distance matrix with a BLAS
    matrix multiplication provided by the backend, then takes the top-k
    smallest entries per point.

    Parameters
    ----------
    x : Tensor
        Coordinates/features of points.  Must be 2D; CPU or GPU.
    k : int
        The number of nearest neighbors per node.  Clamped (with a warning)
        to the smallest segment size when larger.
    segs : list[int]
        Number of points in each point set; must sum up to the number of
        rows in :attr:`x`.
    dist : str, optional
        The distance metric: 'euclidean' (default) or 'cosine'.

    Returns
    -------
    DGLGraph
        The merged graph; each segment becomes a disjoint component.
    """
    # For cosine distance, L2-normalize the points first: euclidean top-k on
    # the normalized points yields the same neighbor ranking.
    if dist == 'cosine':
        l2_norm = lambda v: F.sqrt(F.sum(v * v, dim=1, keepdims=True))
        x = x / (l2_norm(x) + 1e-5)

    n_total_points, _ = F.shape(x)
    # offset[i] is the global node ID of the first point of segment i.
    offset = np.insert(np.cumsum(segs), 0, 0)

    min_seg_size = np.min(segs)
    if k > min_seg_size:
        # BUGFIX: the two adjacent string literals were concatenated without a
        # separator, producing "...in 'x'expect k <= ..." in the warning text.
        dgl_warning("'k' should be less than or equal to the number of points in 'x', "
                    "expect k <= {0}, got k = {1}, use k = {0}".format(min_seg_size, k))
        k = min_seg_size

    h_list = F.split(x, segs, 0)
    # Per-segment top-k, shifted into the global node ID space.
    src = [
        F.argtopk(pairwise_squared_distance(h_g), k, 1, descending=False) +
        int(offset[i])
        for i, h_g in enumerate(h_list)]
    src = F.cat(src, 0)
    ctx = F.context(x)
    dst = F.repeat(F.arange(0, n_total_points, ctx=ctx), k, dim=0)
    return convert.graph((F.reshape(src, (-1,)), F.reshape(dst, (-1,))))
423

424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
def _nndescent_knn_graph(x, k, segs, num_iters=None, max_candidates=None,
                         delta=0.001, sample_rate=0.5, dist='euclidean'):
    r"""Compute **approximate** k-nearest neighbors with the NN-descent
    algorithm from paper `Efficient k-nearest neighbor graph construction for
    generic similarity measures
    <https://www.cs.princeton.edu/cass/papers/www11.pdf>`_.

    Parameters
    ----------
    x : Tensor
        Coordinates/features of points. Must be 2D. It can be either on CPU or GPU.
    k : int
        The number of nearest neighbors per node.
    segs : list[int]
        Number of points in each point set. The numbers in :attr:`segs`
        must sum up to the number of rows in :attr:`x`.
    num_iters : int, optional
        The maximum number of NN-descent iterations to perform. A value will be
        chosen based on the size of input by default.
        (Default: None)
    max_candidates : int, optional
        The maximum number of candidates to be considered during one iteration.
        Larger values will provide more accurate search results later, but
        potentially at non-negligible computation cost. A value will be chosen
        based on the number of neighbors by default.
        (Default: None)
    delta : float, optional
        A value controls the early abort. This function will abort if
        :math:`k * N * delta > c`, where :math:`N` is the number of points,
        :math:`c` is the number of updates during last iteration.
        (Default: 0.001)
    sample_rate : float, optional
        A value controls how many candidates sampled. It should be a float value
        between 0 and 1. Larger values will provide higher accuracy and converge
        speed but with higher time cost.
        (Default: 0.5)
    dist : str, optional
        The distance metric used to compute distance between points. It can be
        'euclidean' (default) or 'cosine'.

    Returns
    -------
    Tensor
        Index tensor of shape ``(2, k * num_points)``: row 0 holds query point
        indices, row 1 their neighbor indices.  (The previous docstring wrongly
        documented the return value as a DGLGraph.)
    """
    num_points, _ = F.shape(x)
    if isinstance(segs, (tuple, list)):
        segs = F.tensor(segs)
    segs = F.copy_to(segs, F.context(x))

    # Defaults derived from the input size, then scaled by the sample rate.
    if max_candidates is None:
        max_candidates = min(60, k)
    if num_iters is None:
        num_iters = max(10, int(round(np.log2(num_points))))
    max_candidates = int(sample_rate * max_candidates)

    # if use cosine distance, normalize input points first
    # thus we can use euclidean distance to find knn equivalently.
    if dist == 'cosine':
        l2_norm = lambda v: F.sqrt(F.sum(v * v, dim=1, keepdims=True))
        x = x / (l2_norm(x) + 1e-5)

    # k must be less than or equal to min(segs)
    if k > F.min(segs, dim=0):
        # BUGFIX: the two adjacent string literals were concatenated without a
        # separator, producing "...in 'x'expect 'k' <= ..." in the message.
        raise DGLError("'k' must be less than or equal to the number of points in 'x', "
                       "expect 'k' <= {}, got 'k' = {}".format(F.min(segs, dim=0), k))
    if delta < 0 or delta > 1:
        raise DGLError("'delta' must in [0, 1], got 'delta' = {}".format(delta))

    # CSR-style offsets: offset[i] is the first row of segment i.
    offset = F.zeros((F.shape(segs)[0] + 1,), F.dtype(segs), F.context(segs))
    offset[1:] = F.cumsum(segs, dim=0)
    out = F.zeros((2, num_points * k), F.dtype(segs), F.context(segs))

    # points, offsets, out, k, num_iters, max_candidates, delta
    _CAPI_DGLNNDescent(F.to_dgl_nd(x), F.to_dgl_nd(offset),
                       F.zerocopy_to_dgl_ndarray_for_write(out),
                       k, num_iters, max_candidates, delta)
    return out

def knn(k, x, x_segs, y=None, y_segs=None, algorithm='bruteforce', dist='euclidean'):
    r"""For each element in each segment in :attr:`y`, find :attr:`k` nearest
    points in the same segment in :attr:`x`. If :attr:`y` is None, perform a
    self-query over :attr:`x`.

    This function allows multiple point sets with different capacity. The points
    from different sets are stored contiguously in the :attr:`x` and :attr:`y`
    tensors; :attr:`x_segs` and :attr:`y_segs` specify the number of points in
    each point set.

    Parameters
    ----------
    k : int
        The number of nearest neighbors per node.
    x : Tensor
        The point coordinates in x. Must be 2D; same device as :attr:`y`.
    x_segs : Union[List[int], Tensor]
        Number of points in each point set in :attr:`x`. The numbers must sum
        up to the number of rows in :attr:`x`.
    y : Tensor, optional
        The point coordinates in y. Must be 2D; same device as :attr:`x`.
        (default: None)
    y_segs : Union[List[int], Tensor], optional
        Number of points in each point set in :attr:`y`. The numbers must sum
        up to the number of rows in :attr:`y`. (default: None)
    algorithm : str, optional
        'bruteforce' (default), 'bruteforce-sharemem' (CUDA only), 'kd-tree'
        (CPU only), or 'nn-descent' (approximate; self-query only, i.e.
        :attr:`y` must be None).
    dist : str, optional
        The distance metric: 'euclidean' (default) or 'cosine'.

    Returns
    -------
    Tensor
        Tensor with size ``(2, k * num_points(y))``.  The first subtensor
        contains point indices in :attr:`y`; the second contains point
        indices in :attr:`x`.
    """
    # TODO(lygztq) add support for querying different point sets using nn-descent.
    if algorithm == "nn-descent":
        if y is not None or y_segs is not None:
            raise DGLError("Currently 'nn-descent' only supports self-query cases.")
        return _nndescent_knn_graph(x, k, x_segs, dist=dist)

    # self query
    if y is None:
        y = x
        y_segs = x_segs

    assert F.context(x) == F.context(y)
    # Normalize the segment descriptors to backend tensors on the data device.
    if isinstance(x_segs, (tuple, list)):
        x_segs = F.tensor(x_segs)
    if isinstance(y_segs, (tuple, list)):
        y_segs = F.tensor(y_segs)
    x_segs = F.copy_to(x_segs, F.context(x))
    y_segs = F.copy_to(y_segs, F.context(y))

    # k should be less than or equal to min(x_segs)  (typo "shoule" fixed)
    min_num_points = F.min(x_segs, dim=0)
    if k > min_num_points:
        # BUGFIX: the two adjacent string literals were concatenated without a
        # separator, producing "...in 'x'expect k <= ..." in the warning text.
        dgl_warning("'k' should be less than or equal to the number of points in 'x', "
                    "expect k <= {0}, got k = {1}, use k = {0}".format(min_num_points, k))
        k = F.as_scalar(min_num_points)

    # invalid k
    if k <= 0:
        raise DGLError("Invalid k value. expect k > 0, got k = {}".format(k))

    # empty point set
    if F.shape(x)[0] == 0 or F.shape(y)[0] == 0:
        raise DGLError("Find empty point set")

    dist = dist.lower()
    dist_metric_list = ['euclidean', 'cosine']
    if dist not in dist_metric_list:
        # BUGFIX: the implicit concatenation was missing a space and rendered
        # as "...for distancecomputation, got ...".
        raise DGLError('Only {} are supported for distance '
                       'computation, got {}'.format(dist_metric_list, dist))

    # CSR-style offsets: *_offset[i] is the first row of segment i.
    x_offset = F.zeros((F.shape(x_segs)[0] + 1,), F.dtype(x_segs), F.context(x_segs))
    x_offset[1:] = F.cumsum(x_segs, dim=0)
    y_offset = F.zeros((F.shape(y_segs)[0] + 1,), F.dtype(y_segs), F.context(y_segs))
    y_offset[1:] = F.cumsum(y_segs, dim=0)

    out = F.zeros((2, F.shape(y)[0] * k), F.dtype(x_segs), F.context(x_segs))

    # if use cosine distance, normalize input points first
    # thus we can use euclidean distance to find knn equivalently.
    if dist == 'cosine':
        l2_norm = lambda v: F.sqrt(F.sum(v * v, dim=1, keepdims=True))
        x = x / (l2_norm(x) + 1e-5)
        y = y / (l2_norm(y) + 1e-5)

    _CAPI_DGLKNN(F.to_dgl_nd(x), F.to_dgl_nd(x_offset),
                 F.to_dgl_nd(y), F.to_dgl_nd(y_offset),
                 k, F.zerocopy_to_dgl_ndarray_for_write(out),
                 algorithm)
    return out

633
634
def to_bidirected(g, copy_ndata=False, readonly=None):
    r"""Convert the graph to a bi-directional simple graph and return.

    For an input graph :math:`G`, return a new graph :math:`G'` such that an
    edge :math:`(u, v)\in G'` exists if and only if there exists an edge
    :math:`(v, u)\in G`. The resulting graph :math:`G'` is a simple graph,
    meaning there is no parallel edge.

    The operation only works for edges whose two endpoints belong to the same
    node type. A :class:`DGLError` is raised if the input graph is
    heterogeneous and contains edges with different types of endpoints.

    Parameters
    ----------
    g : DGLGraph
        The input graph.
    copy_ndata : bool, optional
        If True, the node features of the bidirected graph are copied from the
        original graph. If False, the bidirected graph will not have any node
        features. (Default: False)
    readonly : bool
        **DEPRECATED**; ignored apart from a warning.

    Returns
    -------
    DGLGraph
        The bidirected graph.

    Notes
    -----
    If :attr:`copy_ndata` is True, the resulting graph shares the node feature
    tensors with the input graph; avoid in-place operations, which would be
    visible in both graphs.

    This function discards the batch information. Please use
    :func:`dgl.DGLGraph.set_batch_num_nodes` and
    :func:`dgl.DGLGraph.set_batch_num_edges` on the transformed graph to
    maintain the information.

    Examples
    --------
    The following examples use PyTorch backend.

    >>> import dgl
    >>> import torch as th
    >>> g = dgl.graph((th.tensor([0, 1, 2]), th.tensor([1, 2, 0])))
    >>> dgl.to_bidirected(g).edges()
    (tensor([0, 1, 2, 1, 2, 0]), tensor([1, 2, 0, 0, 1, 2]))

    Parallel edges (i->j present together with j->i) are coalesced:

    >>> g = dgl.graph((th.tensor([0, 1, 2, 0]), th.tensor([1, 2, 0, 2])))
    >>> dgl.to_bidirected(g).edges()
    (tensor([0, 1, 2, 1, 2, 0]), tensor([1, 2, 0, 0, 1, 2]))

    Heterogeneous graphs with multiple (homogeneous) edge types:

    >>> g = dgl.heterograph({
    ...     ('user', 'wins', 'user'): (th.tensor([0, 2, 0, 2]), th.tensor([1, 1, 2, 0])),
    ...     ('user', 'follows', 'user'): (th.tensor([1, 2, 1]), th.tensor([2, 1, 1]))
    ... })
    >>> bg1 = dgl.to_bidirected(g)
    >>> bg1.edges(etype='wins')
    (tensor([0, 0, 1, 1, 2, 2]), tensor([1, 2, 0, 2, 0, 1]))
    >>> bg1.edges(etype='follows')
    (tensor([1, 1, 2]), tensor([1, 2, 1]))
    """
    if readonly is not None:
        # BUGFIX: the two adjacent string literals were concatenated without a
        # separator ("deprecatedThere will be ...").
        dgl_warning("Parameter readonly is deprecated. "
                    "There will be no difference between readonly and non-readonly DGLGraph")

    for c_etype in g.canonical_etypes:
        if c_etype[0] != c_etype[2]:
            # BUGFIX: was `assert False, msg`, which is silently stripped under
            # `python -O`; raise an explicit error so validation always runs.
            raise DGLError("to_bidirected is not well defined for "
                           "unidirectional bipartite graphs"
                           ", but {} is unidirectional bipartite".format(c_etype))

    # Add one reverse edge per edge, then coalesce duplicates into a simple graph.
    g = add_reverse_edges(g, copy_ndata=copy_ndata, copy_edata=False)
    g = to_simple(g, return_counts=None, copy_ndata=copy_ndata, copy_edata=False)
    return g

def add_reverse_edges(g, readonly=None, copy_ndata=True,
                      copy_edata=False, ignore_bipartite=False):
    r"""Add a reversed edge for each edge in the input graph and return a new graph.

    For a graph with edges :math:`(i_1, j_1), \cdots, (i_n, j_n)`, this
    function creates a new graph with edges
    :math:`(i_1, j_1), \cdots, (i_n, j_n), (j_1, i_1), \cdots, (j_n, i_n)`.

    The operation only works for edges whose two endpoints belong to the same node type.
    DGL will raise error if the input graph is heterogeneous and contains edges
    with different types of endpoints. If :attr:`ignore_bipartite` is true, DGL will
    ignore those edges instead.

    Parameters
    ----------
    g : DGLGraph
        The input graph.
    readonly : bool, default to be True
        Deprecated. There will be no difference between readonly and non-readonly
    copy_ndata: bool, optional
        If True, the node features of the new graph are copied from
        the original graph. If False, the new graph will not have any
        node features.

        (Default: True)
    copy_edata: bool, optional
        If True, the features of the reversed edges will be identical to
        the original ones.

        If False, the new graph will not have any edge features.

        (Default: False)
    ignore_bipartite: bool, optional
        If True, unidirectional bipartite graphs are ignored and
        no error is raised. If False, an error will be raised if
        an edge type of the input heterogeneous graph is for a unidirectional
        bipartite graph.

    Returns
    -------
    DGLGraph
        The graph with reversed edges added.

    Notes
    -----
    If :attr:`copy_ndata` is True, the resulting graph will share the node feature
    tensors with the input graph. Hence, users should try to avoid in-place operations
    which will be visible to both graphs. On the contrary, the two graphs do not share
    the same edge feature storage.

    This function discards the batch information. Please use
    :func:`dgl.DGLGraph.set_batch_num_nodes`
    and :func:`dgl.DGLGraph.set_batch_num_edges` on the transformed graph
    to maintain the information.

    Examples
    --------
    **Homogeneous graphs**

    >>> g = dgl.graph((th.tensor([0, 0]), th.tensor([0, 1])))
    >>> bg1 = dgl.add_reverse_edges(g)
    >>> bg1.edges()
    (tensor([0, 0, 0, 1]), tensor([0, 1, 0, 0]))

    **Heterogeneous graphs**

    >>> g = dgl.heterograph({
    >>>     ('user', 'wins', 'user'): (th.tensor([0, 2, 0, 2, 2]), th.tensor([1, 1, 2, 1, 0])),
    >>>     ('user', 'plays', 'game'): (th.tensor([1, 2, 1]), th.tensor([2, 1, 1])),
    >>>     ('user', 'follows', 'user'): (th.tensor([1, 2, 1]), th.tensor([0, 0, 0]))
    >>> })
    >>> g.nodes['game'].data['hv'] = th.ones(3, 1)
    >>> g.edges['wins'].data['h'] = th.tensor([0, 1, 2, 3, 4])

    The :func:`add_reverse_edges` operation is applied to the edge type
    ``('user', 'wins', 'user')`` and the edge type ``('user', 'follows', 'user')``.
    The edge type ``('user', 'plays', 'game')`` is ignored.  Both the node features and
    edge features are shared.

    >>> bg = dgl.add_reverse_edges(g, copy_ndata=True,
                               copy_edata=True, ignore_bipartite=True)
    >>> bg.edges(('user', 'wins', 'user'))
    (tensor([0, 2, 0, 2, 2, 1, 1, 2, 1, 0]), tensor([1, 1, 2, 1, 0, 0, 2, 0, 2, 2]))
    >>> bg.edges(('user', 'follows', 'user'))
    (tensor([1, 2, 1, 0, 0, 0]), tensor([0, 0, 0, 1, 2, 1]))
    >>> bg.edges(('user', 'plays', 'game'))
    (th.tensor([1, 2, 1]), th.tensor([2, 1, 1]))
    >>> bg.nodes['game'].data['hv']
    tensor([0, 0, 0])
    >>> bg.edges[('user', 'wins', 'user')].data['h']
    th.tensor([0, 1, 2, 3, 4, 0, 1, 2, 3, 4])
    """
    if readonly is not None:
        # NOTE: message previously lacked a separator between the two parts.
        dgl_warning("Parameter readonly is deprecated. "
                    "There will be no difference between readonly and non-readonly DGLGraph")

    canonical_etypes = g.canonical_etypes
    # Node count per node type; the new graph must keep isolated nodes too.
    num_nodes_dict = {ntype: g.number_of_nodes(ntype) for ntype in g.ntypes}

    subgs = {}
    if ignore_bipartite is False:
        # fast path: every relation must connect a node type to itself
        for c_etype in canonical_etypes:
            if c_etype[0] != c_etype[2]:
                # Use a real exception instead of `assert False` so the check
                # survives `python -O`.
                raise DGLError("add_reverse_edges is not well defined for "
                               "unidirectional bipartite graphs"
                               ", but {} is unidirectional bipartite".format(c_etype))
            u, v = g.edges(form='uv', order='eid', etype=c_etype)
            # Original edges first, then their reverses, so edge IDs of the
            # originals are preserved.
            subgs[c_etype] = (F.cat([u, v], dim=0), F.cat([v, u], dim=0))
    else:
        for c_etype in canonical_etypes:
            u, v = g.edges(form='uv', order='eid', etype=c_etype)
            if c_etype[0] != c_etype[2]:
                # unidirectional bipartite relation: copy edges unchanged
                subgs[c_etype] = (u, v)
            else:
                subgs[c_etype] = (F.cat([u, v], dim=0), F.cat([v, u], dim=0))

    new_g = convert.heterograph(subgs, num_nodes_dict=num_nodes_dict)

    # handle features
    if copy_ndata:
        # None -> share all node frames with the input graph
        node_frames = utils.extract_node_subframes(g, None)
        utils.set_new_frames(new_g, node_frames=node_frames)

    if copy_edata:
        # Build, per relation, the list of source edge IDs whose features the
        # new edges take: duplicated for self-type relations (reverse edges
        # reuse the original edge's features), unchanged for bipartite ones.
        eids = []
        for c_etype in canonical_etypes:
            eid = F.copy_to(F.arange(0, g.number_of_edges(c_etype)), new_g.device)
            if c_etype[0] != c_etype[2]:
                eids.append(eid)
            else:
                eids.append(F.cat([eid, eid], 0))

        edge_frames = utils.extract_edge_subframes(g, eids)
        utils.set_new_frames(new_g, edge_frames=edge_frames)

    return new_g

864
865
866
def line_graph(g, backtracking=True, shared=False):
    """Return the line graph of this graph.

    In the line graph ``L(G)``, each node corresponds to an edge of ``G``, and
    two line-graph nodes are connected whenever the corresponding edges of
    ``G`` share an endpoint head-to-tail, i.e. edges ``(u, v)`` and ``(v, w)``.

    Parameters
    ----------
    g : DGLGraph
        Input graph.  Must be homogeneous.
    backtracking : bool, optional
        If False, no edge is created between the line-graph nodes of ``(u, v)``
        and ``(v, u)``.  Default: True.
    shared : bool, optional
        Whether to expose the edge features of the original graph as the node
        features of the line graph.

    Returns
    -------
    G : DGLGraph
        The line graph of this graph.

    Notes
    -----
    * With :attr:`shared` set, the result's node features share storage with
      the input's edge features; avoid in-place updates visible to both.
    * A GPU input graph is copied to CPU for the computation and the result is
      moved back to the input's device.
    * Batch information is discarded; use
      :func:`dgl.DGLGraph.set_batch_num_nodes` /
      :func:`dgl.DGLGraph.set_batch_num_edges` on the result to restore it.

    Examples
    --------
    >>> g = dgl.graph(([0, 1, 1, 2, 2],[2, 0, 2, 0, 1]), 'user', 'follows')
    >>> lg = g.line_graph()
    >>> lg.edges()
    (tensor([0, 0, 1, 2, 2, 3, 4, 4]), tensor([3, 4, 0, 3, 4, 0, 1, 2]))
    >>> lg = g.line_graph(backtracking=False)
    >>> lg.edges()
    (tensor([0, 1, 2, 4]), tensor([4, 0, 3, 1]))
    """
    assert g.is_homogeneous, \
        'only homogeneous graph is supported'

    original_device = g.device
    # The C API operates on CPU graph indexes only.
    cpu_index = g._graph.copy_to(nd.cpu())
    result = DGLHeteroGraph(_CAPI_DGLHeteroLineGraph(cpu_index, backtracking))
    result = result.to(original_device)

    if shared:
        # Edge frames of the input become node frames of the line graph.
        utils.set_new_frames(result,
                             node_frames=utils.extract_edge_subframes(g, None))

    return result
936

937
# Expose ``line_graph`` as a bound method so callers can write ``g.line_graph(...)``.
DGLHeteroGraph.line_graph = utils.alias_func(line_graph)
938

939
def khop_adj(g, k):
    """Return the matrix of :math:`A^k` where :math:`A` is the adjacency matrix of the graph
    :math:`g`.

    The returned matrix is a 32-bit float dense matrix on CPU. The graph must be
    homogeneous.

    Parameters
    ----------
    g : DGLGraph
        The input graph.
    k : int
        The :math:`k` in :math:`A^k`.

    Returns
    -------
    Tensor
        The returned tensor.

    Examples
    --------
    >>> import dgl
    >>> g = dgl.graph(([0,1,2,3,4,0,1,2,3,4], [0,1,2,3,4,1,2,3,4,0]))
    >>> dgl.khop_adj(g, 1)
    tensor([[1., 0., 0., 0., 1.],
            [1., 1., 0., 0., 0.],
            [0., 1., 1., 0., 0.],
            [0., 0., 1., 1., 0.],
            [0., 0., 0., 1., 1.]])
    """
    assert g.is_homogeneous, \
        'only homogeneous graph is supported'
    # Use whichever sparse format the graph has already materialized.
    sparse_fmt = g.formats()['created'][0]
    powered = g.adj(transpose=True, scipy_fmt=sparse_fmt) ** k
    return F.tensor(powered.todense().astype(np.float32))
978

979
980
981
982
983
984
985
def khop_graph(g, k, copy_ndata=True):
    """Return the graph whose edges connect the :attr:`k`-hop neighbors of the original graph.

    An edge ``(u, v)`` exists in the result exactly when the original graph has
    a length-:attr:`k` path from ``u`` to ``v``; the result's adjacency matrix
    is :math:`A^k` for the original adjacency matrix :math:`A`, with path
    multiplicities kept as parallel edges.

    Parameters
    ----------
    g : DGLGraph
        The input graph.  Must be homogeneous.
    k : int
        The :math:`k` in `k`-hop graph.
    copy_ndata: bool, optional
        If True, the node features of the new graph are copied from the
        original graph.  If False, the new graph will not have any node
        features.  (Default: True)

    Returns
    -------
    DGLGraph
        The returned graph.

    Notes
    -----
    * With :attr:`copy_ndata` set, node feature tensors are shared with the
      input graph; avoid in-place updates visible to both.
    * Batch information is discarded; use
      :func:`dgl.DGLGraph.set_batch_num_nodes` /
      :func:`dgl.DGLGraph.set_batch_num_edges` on the result to restore it.

    Examples
    --------
    >>> import dgl
    >>> g = dgl.graph(([0, 1], [1, 2]))
    >>> g_2 = dgl.transform.khop_graph(g, 2)
    >>> print(g_2.edges())
    (tensor([0]), tensor([2]))
    """
    assert g.is_homogeneous, \
        'only homogeneous graph is supported'
    num_nodes = g.number_of_nodes()
    sparse_fmt = g.formats()['created'][0]
    walk_adj = (g.adj(transpose=False, scipy_fmt=sparse_fmt) ** k).tocoo()
    # Each nonzero counts the number of distinct k-hop paths between its
    # endpoints; repeat the endpoints so each path yields one parallel edge.
    src = np.repeat(walk_adj.row, walk_adj.data)
    dst = np.repeat(walk_adj.col, walk_adj.data)
    # TODO(zihao): we should support creating multi-graph from scipy sparse matrix
    # in the future.
    new_g = convert.graph((src, dst), num_nodes=num_nodes,
                          idtype=g.idtype, device=g.device)

    if copy_ndata:
        # None -> share all node frames with the input graph
        utils.set_new_frames(new_g,
                             node_frames=utils.extract_node_subframes(g, None))

    return new_g
1061

1062
def reverse(g, copy_ndata=True, copy_edata=False, *, share_ndata=None, share_edata=None):
    r"""Return a new graph with every edges being the reverse ones in the input graph.

    The reverse (also called converse, transpose) of a graph with edges
    :math:`(i_1, j_1), (i_2, j_2), \cdots` of type ``(U, E, V)`` is a new graph with edges
    :math:`(j_1, i_1), (j_2, i_2), \cdots` of type ``(V, E, U)``.

    The returned graph shares the underlying data structure with the original
    graph; no extra storage is created for the reversed topology.

    Parameters
    ----------
    g : DGLGraph
        The input graph.
    copy_ndata: bool, optional
        If True, the node features of the reversed graph are copied from the
        original graph.  If False, the reversed graph will not have any node
        features.  (Default: True)
    copy_edata: bool, optional
        If True, the edge features of the reversed graph are copied from the
        original graph.  If False, the reversed graph will not have any edge
        features.  (Default: False)

    Return
    ------
    DGLGraph
        The reversed graph.

    Notes
    -----
    * The i-th edge of the result corresponds to the i-th edge of the input,
      and shares its features when :attr:`copy_edata` is set; copied feature
      tensors are shared storage, so avoid in-place updates.
    * Batch information is discarded; use
      :func:`dgl.DGLGraph.set_batch_num_nodes` /
      :func:`dgl.DGLGraph.set_batch_num_edges` on the result to restore it.

    Examples
    --------
    >>> import dgl
    >>> import torch as th
    >>> g = dgl.graph((th.tensor([0, 1, 2]), th.tensor([1, 2, 0])))
    >>> rg = dgl.reverse(g, copy_edata=True)
    >>> rg.edges()
    (tensor([1, 2, 0]), tensor([0, 1, 2]))
    """
    if share_ndata is not None:
        dgl_warning('share_ndata argument has been renamed to copy_ndata.')
        copy_ndata = share_ndata
    if share_edata is not None:
        dgl_warning('share_edata argument has been renamed to copy_edata.')
        copy_edata = share_edata
    if g.is_block:
        # TODO(0.5 release, xiangsx) need to handle BLOCK
        # currently reversing a block results in undefined behavior
        raise DGLError('Reversing a block graph is not supported.')

    reversed_index = g._graph.reverse()
    new_g = DGLHeteroGraph(reversed_index, g.ntypes, g.etypes)

    if copy_ndata:
        # node identities are unchanged, so frames carry over per node type
        for ntype in g.ntypes:
            new_g.nodes[ntype].data.update(g.nodes[ntype].data)

    if copy_edata:
        # edge i of (V, E, U) in the result is edge i of (U, E, V) in the input
        for src_t, rel_t, dst_t in g.canonical_etypes:
            new_g.edges[dst_t, rel_t, src_t].data.update(
                g.edges[src_t, rel_t, dst_t].data)

    return new_g

1188
# Expose ``reverse`` as a bound method so callers can write ``g.reverse(...)``.
DGLHeteroGraph.reverse = utils.alias_func(reverse)
1189

1190
1191
1192
def to_simple_graph(g):
    """Convert the graph to a simple graph with no multi-edge.

    DEPRECATED: renamed to dgl.to_simple

    Parameters
    ----------
    g : DGLGraph
        The input graph.

    Returns
    -------
    DGLGraph
        A simple graph.

    Notes
    -----
    This function discards the batch information. Please use
    :func:`dgl.DGLGraph.set_batch_num_nodes`
    and :func:`dgl.DGLGraph.set_batch_num_edges` on the transformed graph
    to maintain the information.
    """
    # Deprecated thin wrapper: warn, then delegate to the renamed function.
    dgl_warning('dgl.to_simple_graph is renamed to dgl.to_simple in v0.5.')
    return to_simple(g)
1215

1216
def to_bidirected_stale(g, readonly=True):
    """NOTE: this function only works on the deprecated
    :class:`dgl.DGLGraphStale` object.

    Convert the graph to a bidirected graph.

    The function generates a new graph with no node/edge feature.
    If g has an edge for ``(u, v)`` but no edge for ``(v, u)``, then the
    returned graph will have both ``(u, v)`` and ``(v, u)``.

    If the input graph is a multigraph (there are multiple edges from node u to node v),
    the returned graph isn't well defined.

    Parameters
    ----------
    g : DGLGraphStale
        The input graph.
    readonly : bool
        Whether the returned bidirected graph is readonly or not.

        (Default: True)

    Notes
    -----
    Please make sure g is a simple graph, otherwise the return value is undefined.

    This function discards the batch information. Please use
    :func:`dgl.DGLGraph.set_batch_num_nodes`
    and :func:`dgl.DGLGraph.set_batch_num_edges` on the transformed graph
    to maintain the information.

    Returns
    -------
    DGLGraph

    Examples
    --------
    >>> g = dgl._deprecate.graph.DGLGraph()
    >>> g.add_nodes(2)
    >>> g.add_edges([0, 0], [0, 1])
    >>> bg1 = dgl.to_bidirected_stale(g)
    >>> bg1.edges()
    (tensor([0, 1, 0]), tensor([0, 0, 1]))
    """
    # Pick the C API entry point matching the requested mutability, then wrap
    # the resulting graph index in the deprecated graph class.
    make_bidirected = (_CAPI_DGLToBidirectedImmutableGraph if readonly
                       else _CAPI_DGLToBidirectedMutableGraph)
    return DGLGraphStale(make_bidirected(g._graph))
1268

1269
def laplacian_lambda_max(g):
    """Return the largest eigenvalue of the normalized symmetric Laplacian of a graph.

    If the graph is batched from multiple graphs, return the list of the largest
    eigenvalue for each graph instead.

    Parameters
    ----------
    g : DGLGraph
        The input graph, it must be a bi-directed homogeneous graph, i.e., every edge
        should have an accompanied reverse edge in the graph.
        The graph can be batched from multiple graphs.

    Returns
    -------
    list[float]
        A list where the i-th item indicates the largest eigenvalue
        of i-th graph in :attr:`g`.

        In the case where the function takes a single graph, it will return a list
        consisting of a single element.

    Examples
    --------
    >>> import dgl
    >>> g = dgl.graph(([0, 1, 2, 3, 4, 0, 1, 2, 3, 4], [1, 2, 3, 4, 0, 4, 0, 1, 2, 3]))
    >>> dgl.laplacian_lambda_max(g)
    [1.809016994374948]
    """
    results = []
    # Process every component of a batched graph independently.
    for component in batch.unbatch(g):
        size = component.number_of_nodes()
        fmt = component.formats()['created'][0]
        adjacency = component.adj(transpose=True, scipy_fmt=fmt).astype(float)
        # D^{-1/2}; degrees are clipped at 1 to avoid division by zero.
        inv_sqrt_degree = F.asnumpy(component.in_degrees()).clip(1) ** -0.5
        d_norm = sparse.diags(inv_sqrt_degree, dtype=float)
        # Normalized symmetric Laplacian: I - D^{-1/2} A D^{-1/2}
        laplacian = sparse.eye(size) - d_norm * adjacency * d_norm
        largest = scipy.sparse.linalg.eigs(
            laplacian, 1, which='LM', return_eigenvectors=False)
        results.append(largest[0].real)
    return results

Mufei Li's avatar
Mufei Li committed
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
def metapath_reachable_graph(g, metapath):
    """Return a graph where the successors of any node ``u`` are nodes reachable from ``u`` by
    the given metapath.

    If the beginning node type ``s`` and ending node type ``t`` are the same, it will return
    a homogeneous graph with node type ``s = t``.  Otherwise, a unidirectional bipartite graph
    with source node type ``s`` and destination node type ``t`` is returned.

    In both cases, two nodes ``u`` and ``v`` will be connected with an edge ``(u, v)`` if
    there exists one path matching the metapath from ``u`` to ``v``.

    The result graph keeps the node set of type ``s`` and ``t`` in the original graph even if
    they might have no neighbor.

    The features of the source/destination node type in the original graph would be copied to
    the new graph.

    Parameters
    ----------
    g : DGLGraph
        The input graph
    metapath : list[str or tuple of str]
        Metapath in the form of a list of edge types

    Returns
    -------
    DGLGraph
        A homogeneous or unidirectional bipartite graph. It will be on CPU regardless of
        whether the input graph is on CPU or GPU.

    Notes
    -----
    This function discards the batch information. Please use
    :func:`dgl.DGLGraph.set_batch_num_nodes`
    and :func:`dgl.DGLGraph.set_batch_num_edges` on the transformed graph
    to maintain the information.

    Examples
    --------
    >>> g = dgl.heterograph({
    ...     ('A', 'AB', 'B'): ([0, 1, 2], [1, 2, 3]),
    ...     ('B', 'BA', 'A'): ([1, 2, 3], [0, 1, 2])})
    >>> new_g = dgl.metapath_reachable_graph(g, ['AB', 'BA'])
    >>> new_g.edges(order='eid')
    (tensor([0, 1, 2]), tensor([0, 1, 2]))
    """
    # Chain the per-relation adjacency matrices; starting from the scalar 1
    # makes the first multiplication a no-op.
    composed = 1
    for etype in metapath:
        composed = composed * g.adj(etype=etype, scipy_fmt='csr', transpose=False)

    # Reachability only: collapse path multiplicities to 0/1.
    reachable = (composed != 0).tocsr()
    start_type = g.to_canonical_etype(metapath[0])[0]
    end_type = g.to_canonical_etype(metapath[-1])[2]
    new_g = convert.heterograph(
        {(start_type, '_E', end_type): reachable.nonzero()},
        {start_type: reachable.shape[0], end_type: reachable.shape[1]},
        idtype=g.idtype, device=g.device)

    # copy srcnode features
    new_g.nodes[start_type].data.update(g.nodes[start_type].data)
    # copy dstnode features (only once when the endpoints share a type)
    if start_type != end_type:
        new_g.nodes[end_type].data.update(g.nodes[end_type].data)

    return new_g
1374

1375
def add_nodes(g, num, data=None, ntype=None):
    r"""Add the given number of nodes to the graph and return a new graph.

    The new nodes will have IDs starting from ``g.num_nodes(ntype)``.

    Parameters
    ----------
    num : int
        The number of nodes to add.
    data : dict[str, Tensor], optional
        Feature data of the added nodes. The keys are feature names
        while the values are feature data.
    ntype : str, optional
        The node type name. Can be omitted if there is
        only one type of nodes in the graph.

    Return
    ------
    DGLGraph
        The graph with newly added nodes.

    Notes
    -----
    * For features in :attr:`g` but not in :attr:`data`,
      DGL assigns zero features for the newly added nodes.
    * For feature in :attr:`data` but not in :attr:`g`, DGL assigns zero features
      for the existing nodes in the graph.
    * This function discards the batch information. Please use
      :func:`dgl.DGLGraph.set_batch_num_nodes`
      and :func:`dgl.DGLGraph.set_batch_num_edges` on the transformed graph
      to maintain the information.

    Examples
    --------
    >>> import dgl
    >>> import torch
    >>> g = dgl.graph((torch.tensor([0, 1]), torch.tensor([1, 2])))
    >>> g.num_nodes()
    3
    >>> g = dgl.add_nodes(g, 2)
    >>> g.num_nodes()
    5

    See Also
    --------
    remove_nodes
    add_edges
    remove_edges
    """
    # Functional wrapper: never mutate the caller's graph; clone first, then
    # delegate to the in-place method on the copy.
    new_g = g.clone()
    new_g.add_nodes(num, data=data, ntype=ntype)
    return new_g

def add_edges(g, u, v, data=None, etype=None):
    r"""Add the edges to the graph and return a new graph.

    The i-th new edge will be from ``u[i]`` to ``v[i]``.  The IDs of the new
    edges will start from ``g.num_edges(etype)``.

    Parameters
    ----------
    u : int, Tensor or iterable[int]
        Source node IDs, ``u[i]`` gives the source node for the i-th new edge.
    v : int, Tensor or iterable[int]
        Destination node IDs, ``v[i]`` gives the destination node for the i-th new edge.
    data : dict[str, Tensor], optional
        Feature data of the added edges. The keys are feature names
        while the values are feature data.
    etype : str or (str, str, str), optional
        The type names of the edges. The allowed type name formats are:

        * ``(str, str, str)`` for source node type, edge type and destination node type.
        * or one ``str`` edge type name if the name can uniquely identify a
          triplet format in the graph.

        Can be omitted if the graph has only one type of edges.

    Return
    ------
    DGLGraph
        The graph with newly added edges.

    Notes
    -----
    * If the end nodes of the given edges do not exist in :attr:`g`,
      :func:`dgl.add_nodes` is invoked to add those nodes.
      The node features of the new nodes will be filled with zeros.
    * For features in :attr:`g` but not in :attr:`data`,
      DGL assigns zero features for the newly added nodes.
    * For feature in :attr:`data` but not in :attr:`g`, DGL assigns zero features
      for the existing nodes in the graph.
    * This function discards the batch information. Please use
      :func:`dgl.DGLGraph.set_batch_num_nodes`
      and :func:`dgl.DGLGraph.set_batch_num_edges` on the transformed graph
      to maintain the information.

    Examples
    --------
    >>> import dgl
    >>> import torch
    >>> g = dgl.graph((torch.tensor([0, 1]), torch.tensor([1, 2])))
    >>> g.num_edges()
    2
    >>> g = dgl.add_edges(g, torch.tensor([1, 3]), torch.tensor([0, 1]))
    >>> g.num_edges()
    4

    See Also
    --------
    add_nodes
    remove_nodes
    remove_edges
    """
    # Functional wrapper: never mutate the caller's graph; clone first, then
    # delegate to the in-place method on the copy.
    new_g = g.clone()
    new_g.add_edges(u, v, data=data, etype=etype)
    return new_g

1578
def remove_edges(g, eids, etype=None, store_ids=False):
    r"""Remove the specified edges and return a new graph.

    The features of the removed edges are deleted as well. All edges in
    :attr:`eids` must exist in the graph. The resulting graph keeps the same
    number of nodes as the input one, even if some nodes become isolated
    after the edge removal.

    Parameters
    ----------
    g : DGLGraph
        The graph.
    eids : int, Tensor, iterable[int]
        The IDs of the edges to remove.
    etype : str or (str, str, str), optional
        The type names of the edges. The allowed type name formats are:

        * ``(str, str, str)`` for source node type, edge type and destination node type.
        * or one ``str`` edge type name if the name can uniquely identify a
          triplet format in the graph.

        Can be omitted if the graph has only one type of edges.
    store_ids : bool, optional
        If True, it will store the raw IDs of the extracted nodes and edges in the ``ndata``
        and ``edata`` of the resulting graph under name ``dgl.NID`` and ``dgl.EID``,
        respectively.

    Return
    ------
    DGLGraph
        The graph with edges deleted.

    Notes
    -----
    This function discards the batch information. Please use
    :func:`dgl.DGLGraph.set_batch_num_nodes`
    and :func:`dgl.DGLGraph.set_batch_num_edges` on the transformed graph
    to maintain the information.

    Examples
    --------
    >>> import dgl
    >>> import torch

    **Homogeneous Graphs**

    >>> g = dgl.graph((torch.tensor([0, 0, 2]), torch.tensor([0, 1, 2])))
    >>> g.edata['he'] = torch.arange(3).float().reshape(-1, 1)
    >>> g = dgl.remove_edges(g, torch.tensor([0, 1]))
    >>> g
    Graph(num_nodes=3, num_edges=1,
        ndata_schemes={}
        edata_schemes={'he': Scheme(shape=(1,), dtype=torch.float32)})
    >>> g.edges('all')
    (tensor([2]), tensor([2]), tensor([0]))
    >>> g.edata['he']
    tensor([[2.]])

    **Heterogeneous Graphs**

    >>> g = dgl.heterograph({
    ...     ('user', 'plays', 'game'): (torch.tensor([0, 1, 1, 2]),
    ...                                 torch.tensor([0, 0, 1, 1])),
    ...     ('developer', 'develops', 'game'): (torch.tensor([0, 1]),
    ...                                         torch.tensor([0, 1]))
    ...     })
    >>> g = dgl.remove_edges(g, torch.tensor([0, 1]), 'plays')
    >>> g.edges('all', etype='plays')
    (tensor([1, 2]), tensor([1, 1]), tensor([0, 1]))

    See Also
    --------
    add_nodes
    add_edges
    remove_nodes
    """
    # Mutate a copy so that the caller's graph is left untouched.
    new_g = g.clone()
    new_g.remove_edges(eids, etype=etype, store_ids=store_ids)
    return new_g


1657
def remove_nodes(g, nids, ntype=None, store_ids=False):
    r"""Remove the specified nodes and return a new graph.

    The features of the removed nodes are deleted as well. Edges that connect
    from/to the nodes are removed too. After the removal, DGL re-labels the
    remaining nodes and edges with IDs starting from 0.

    Parameters
    ----------
    g : DGLGraph
        The graph.
    nids : int, Tensor, iterable[int]
        The nodes to be removed.
    ntype : str, optional
        The type of the nodes to remove. Can be omitted if there is
        only one node type in the graph.
    store_ids : bool, optional
        If True, it will store the raw IDs of the extracted nodes and edges in the ``ndata``
        and ``edata`` of the resulting graph under name ``dgl.NID`` and ``dgl.EID``,
        respectively.

    Return
    ------
    DGLGraph
        The graph with nodes deleted.

    Notes
    -----
    This function discards the batch information. Please use
    :func:`dgl.DGLGraph.set_batch_num_nodes`
    and :func:`dgl.DGLGraph.set_batch_num_edges` on the transformed graph
    to maintain the information.

    Examples
    --------

    >>> import dgl
    >>> import torch

    **Homogeneous Graphs**

    >>> g = dgl.graph((torch.tensor([0, 0, 2]), torch.tensor([0, 1, 2])))
    >>> g.ndata['hv'] = torch.arange(3).float().reshape(-1, 1)
    >>> g.edata['he'] = torch.arange(3).float().reshape(-1, 1)
    >>> g = dgl.remove_nodes(g, torch.tensor([0, 1]))
    >>> g
    Graph(num_nodes=1, num_edges=1,
        ndata_schemes={'hv': Scheme(shape=(1,), dtype=torch.float32)}
        edata_schemes={'he': Scheme(shape=(1,), dtype=torch.float32)})
    >>> g.ndata['hv']
    tensor([[2.]])
    >>> g.edata['he']
    tensor([[2.]])

    **Heterogeneous Graphs**

    >>> g = dgl.heterograph({
    ...     ('user', 'plays', 'game'): (torch.tensor([0, 1, 1, 2]),
    ...                                 torch.tensor([0, 0, 1, 1])),
    ...     ('developer', 'develops', 'game'): (torch.tensor([0, 1]),
    ...                                         torch.tensor([0, 1]))
    ...     })
    >>> g = dgl.remove_nodes(g, torch.tensor([0, 1]), ntype='game')
    >>> g.num_nodes('user')
    3
    >>> g.num_nodes('game')
    0
    >>> g.num_edges('plays')
    0

    See Also
    --------
    add_nodes
    add_edges
    remove_edges
    """
    # Mutate a copy so that the caller's graph is left untouched.
    new_g = g.clone()
    new_g.remove_nodes(nids, ntype=ntype, store_ids=store_ids)
    return new_g

def add_self_loop(g, etype=None):
    r"""Add self-loops for each node in the graph and return a new graph.

    Parameters
    ----------
    g : DGLGraph
        The graph.
    etype : str or (str, str, str), optional
        The type names of the edges. The allowed type name formats are:

        * ``(str, str, str)`` for source node type, edge type and destination node type.
        * or one ``str`` edge type name if the name can uniquely identify a
          triplet format in the graph.

        Can be omitted if the graph has only one type of edges.

    Return
    ------
    DGLGraph
        The graph with self-loops.

    Notes
    -----
    * The function only supports homogeneous graphs or heterogeneous graphs but
      the relation graph specified by the :attr:`etype` argument is homogeneous.
    * The function adds self-loops regardless of whether they already exist or not.
      If one wishes to have exactly one self-loop for every node,
      call :func:`remove_self_loop` before invoking :func:`add_self_loop`.
    * Features of the new edges (self-loop edges) will be filled with zeros.
    * This function discards the batch information. Please use
      :func:`dgl.DGLGraph.set_batch_num_nodes`
      and :func:`dgl.DGLGraph.set_batch_num_edges` on the transformed graph
      to maintain the information.

    Examples
    --------
    >>> import dgl
    >>> import torch

    **Homogeneous Graphs**

    >>> g = dgl.graph((torch.tensor([0, 0, 2]), torch.tensor([2, 1, 0])))
    >>> g.ndata['hv'] = torch.arange(3).float().reshape(-1, 1)
    >>> g.edata['he'] = torch.arange(3).float().reshape(-1, 1)
    >>> g = dgl.add_self_loop(g)
    >>> g
    Graph(num_nodes=3, num_edges=6,
        ndata_schemes={'hv': Scheme(shape=(1,), dtype=torch.float32)}
        edata_schemes={'he': Scheme(shape=(1,), dtype=torch.float32)})
    >>> g.edata['he']
    tensor([[0.],
            [1.],
            [2.],
            [0.],
            [0.],
            [0.]])

    **Heterogeneous Graphs**

    >>> g = dgl.heterograph({
    ...     ('user', 'follows', 'user'): (torch.tensor([1, 2]),
    ...                                   torch.tensor([0, 1])),
    ...     ('user', 'plays', 'game'): (torch.tensor([0, 1]),
    ...                                 torch.tensor([0, 1]))})
    >>> g = dgl.add_self_loop(g, etype='follows')
    >>> g
    Graph(num_nodes={'user': 3, 'game': 2},
        num_edges={('user', 'plays', 'game'): 2, ('user', 'follows', 'user'): 5},
        metagraph=[('user', 'user'), ('user', 'game')])

    See Also
    --------
    remove_self_loop
    """
    etype = g.to_canonical_etype(etype)
    # Self-loops are only meaningful when source and destination node types
    # coincide; a unidirectional bipartite relation cannot host them.
    if etype[0] != etype[2]:
        raise DGLError(
            'add_self_loop does not support unidirectional bipartite graphs: {}. '
            'Please make sure the types of head node and tail node are identical.'
            ''.format(etype))
    nodes = g.nodes(etype[0])
    # add_edges zero-fills the features of the newly added (self-loop) edges.
    new_g = add_edges(g, nodes, nodes, etype=etype)
    return new_g

1816
# Expose the free function as a method so callers can write g.add_self_loop(...).
DGLHeteroGraph.add_self_loop = utils.alias_func(add_self_loop)
1817
1818

def remove_self_loop(g, etype=None):
    r""" Remove self-loops for each node in the graph and return a new graph.

    Parameters
    ----------
    g : DGLGraph
        The graph.
    etype : str or (str, str, str), optional
        The type names of the edges. The allowed type name formats are:

        * ``(str, str, str)`` for source node type, edge type and destination node type.
        * or one ``str`` edge type name if the name can uniquely identify a
          triplet format in the graph.

        Can be omitted if the graph has only one type of edges.

    Notes
    -----
    If a node has multiple self-loops, remove them all. Do nothing for nodes without
    self-loops.

    This function discards the batch information. Please use
    :func:`dgl.DGLGraph.set_batch_num_nodes`
    and :func:`dgl.DGLGraph.set_batch_num_edges` on the transformed graph
    to maintain the information.

    Examples
    ---------

    >>> import dgl
    >>> import torch

    **Homogeneous Graphs**

    >>> g = dgl.graph((torch.tensor([0, 0, 0, 1]), torch.tensor([1, 0, 0, 2])))
    >>> g.edata['he'] = torch.arange(4).float().reshape(-1, 1)
    >>> g = dgl.remove_self_loop(g)
    >>> g
    Graph(num_nodes=3, num_edges=2,
        edata_schemes={'he': Scheme(shape=(1,), dtype=torch.float32)})
    >>> g.edata['he']
    tensor([[0.],[3.]])

    **Heterogeneous Graphs**

    >>> g = dgl.heterograph({
    ...     ('user', 'follows', 'user'): (torch.tensor([0, 1, 1, 1, 2]),
    ...                                   torch.tensor([0, 0, 1, 1, 1])),
    ...     ('user', 'plays', 'game'): (torch.tensor([0, 1]),
    ...                                 torch.tensor([0, 1]))
    ...     })
    >>> g = dgl.remove_self_loop(g, etype='follows')
    >>> g.num_nodes('user')
    3
    >>> g.num_nodes('game')
    2
    >>> g.num_edges('follows')
    2
    >>> g.num_edges('plays')
    2

    See Also
    --------
    add_self_loop
    """
    etype = g.to_canonical_etype(etype)
    # Self-loops can only exist when source and destination node types coincide.
    if etype[0] != etype[2]:
        raise DGLError(
            'remove_self_loop does not support unidirectional bipartite graphs: {}. '
            'Please make sure the types of head node and tail node are identical.'
            ''.format(etype))
    u, v = g.edges(form='uv', order='eid', etype=etype)
    # Edge IDs where the endpoints are equal are exactly the self-loops.
    self_loop_eids = F.tensor(F.nonzero_1d(u == v), dtype=F.dtype(u))
    new_g = remove_edges(g, self_loop_eids, etype=etype)
    return new_g

1894
# Expose the free function as a method so callers can write g.remove_self_loop(...).
DGLHeteroGraph.remove_self_loop = utils.alias_func(remove_self_loop)
1895

1896
def compact_graphs(graphs, always_preserve=None, copy_ndata=True, copy_edata=True):
    """Given a list of graphs with the same set of nodes, find and eliminate the common
    isolated nodes across all graphs.

    This function requires the graphs to have the same set of nodes (i.e. the node types
    must be the same, and the number of nodes of each node type must be the same).  The
    metagraph does not have to be the same.

    It finds all the nodes that have zero in-degree and zero out-degree in all the given
    graphs, and eliminates them from all the graphs.

    Useful for graph sampling where you have a giant graph but you only wish to perform
    message passing on a smaller graph with a (tiny) subset of nodes.

    Parameters
    ----------
    graphs : DGLGraph or list[DGLGraph]
        The graph, or list of graphs.

        All graphs must be on the same devices.

        All graphs must have the same set of nodes.
    always_preserve : Tensor or dict[str, Tensor], optional
        If a dict of node types and node ID tensors is given, the nodes of given
        node types would not be removed, regardless of whether they are isolated.

        If a Tensor is given, DGL assumes that all the graphs have one (same) node type.
    copy_ndata: bool, optional
        If True, the node features of the returned graphs are copied from the
        original graphs.

        If False, the returned graphs will not have any node features.

        (Default: True)
    copy_edata: bool, optional
        If True, the edge features of the reversed graph are copied from the
        original graph.

        If False, the reversed graph will not have any edge features.

        (Default: True)

    Returns
    -------
    DGLGraph or list[DGLGraph]
        The compacted graph or list of compacted graphs.

        Each returned graph would have a feature ``dgl.NID`` containing the mapping
        of node IDs for each type from the compacted graph(s) to the original graph(s).
        Note that the mapping is the same for all the compacted graphs.

        All the returned graphs are on CPU.

    Notes
    -----
    This function currently requires that the same node type of all graphs should have
    the same node type ID, i.e. the node types are *ordered* the same.

    If :attr:`copy_edata` is True, the resulting graph will share the edge feature
    tensors with the input graph. Hence, users should try to avoid in-place operations
    which will be visible to both graphs.

    This function discards the batch information. Please use
    :func:`dgl.DGLGraph.set_batch_num_nodes`
    and :func:`dgl.DGLGraph.set_batch_num_edges` on the transformed graph
    to maintain the information.

    Examples
    --------
    The following code constructs a bipartite graph with 20 users and 10 games, but
    only user #1 and #3, as well as game #3 and #5, have connections:

    >>> g = dgl.heterograph({('user', 'plays', 'game'): ([1, 3], [3, 5])},
    >>>                      {'user': 20, 'game': 10})

    The following would compact the graph above to another bipartite graph with only
    two users and two games.

    >>> new_g, induced_nodes = dgl.compact_graphs(g)
    >>> induced_nodes
    {'user': tensor([1, 3]), 'game': tensor([3, 5])}

    The mapping tells us that only user #1 and #3 as well as game #3 and #5 are kept.
    Furthermore, the first user and second user in the compacted graph maps to
    user #1 and #3 in the original graph.  Games are similar.

    One can verify that the edge connections are kept the same in the compacted graph.

    >>> new_g.edges(form='all', order='eid', etype='plays')
    (tensor([0, 1]), tensor([0, 1]), tensor([0, 1]))

    When compacting multiple graphs, nodes that do not have any connections in any
    of the given graphs are removed.  So if you compact ``g`` and the following ``g2``
    graphs together:

    >>> g2 = dgl.heterograph({('user', 'plays', 'game'): ([1, 6], [6, 8])},
    >>>                      {'user': 20, 'game': 10})
    >>> (new_g, new_g2), induced_nodes = dgl.compact_graphs([g, g2])
    >>> induced_nodes
    {'user': tensor([1, 3, 6]), 'game': tensor([3, 5, 6, 8])}

    Then one can see that user #1 from both graphs, users #3 from the first graph, as
    well as user #6 from the second graph, are kept.  Games are similar.

    Similarly, one can also verify the connections:

    >>> new_g.edges(form='all', order='eid', etype='plays')
    (tensor([0, 1]), tensor([0, 1]), tensor([0, 1]))
    >>> new_g2.edges(form='all', order='eid', etype='plays')
    (tensor([0, 2]), tensor([2, 3]), tensor([0, 1]))
    """
    return_single = False
    if not isinstance(graphs, Iterable):
        graphs = [graphs]
        return_single = True
    if len(graphs) == 0:
        return []
    if graphs[0].is_block:
        raise DGLError('Compacting a block graph is not allowed.')

    # Ensure the node types are ordered the same.
    # TODO(BarclayII): we ideally need to remove this constraint.
    ntypes = graphs[0].ntypes
    idtype = graphs[0].idtype
    device = graphs[0].device
    for g in graphs:
        # NOTE: the format arguments must be a tuple; previously the `%` bound
        # only to `ntypes`, which raised TypeError instead of the intended message.
        assert ntypes == g.ntypes, \
            ("All graphs should have the same node types in the same order, got %s and %s" %
             (ntypes, g.ntypes))
        assert idtype == g.idtype, "Expect graph data type to be {}, but got {}".format(
            idtype, g.idtype)
        assert device == g.device, "All graphs must be on the same devices. " \
            "Expect graph device to be {}, but got {}".format(device, g.device)

    # Process the dictionary or tensor of "always preserve" nodes
    if always_preserve is None:
        always_preserve = {}
    elif not isinstance(always_preserve, Mapping):
        if len(ntypes) > 1:
            raise ValueError("Node type must be given if multiple node types exist.")
        always_preserve = {ntypes[0]: always_preserve}

    always_preserve = utils.prepare_tensor_dict(graphs[0], always_preserve, 'always_preserve')
    # One ND array per node type, in type order; empty tensor when nothing to preserve.
    always_preserve_nd = []
    for ntype in ntypes:
        nodes = always_preserve.get(ntype, None)
        if nodes is None:
            nodes = F.copy_to(F.tensor([], idtype), device)
        always_preserve_nd.append(F.to_dgl_nd(nodes))

    # Compact and construct heterographs
    new_graph_indexes, induced_nodes = _CAPI_DGLCompactGraphs(
        [g._graph for g in graphs], always_preserve_nd)
    induced_nodes = [F.from_dgl_nd(nodes) for nodes in induced_nodes]

    new_graphs = [
        DGLHeteroGraph(new_graph_index, graph.ntypes, graph.etypes)
        for new_graph_index, graph in zip(new_graph_indexes, graphs)]

    if copy_ndata:
        for g, new_g in zip(graphs, new_graphs):
            node_frames = utils.extract_node_subframes(g, induced_nodes)
            utils.set_new_frames(new_g, node_frames=node_frames)
    if copy_edata:
        for g, new_g in zip(graphs, new_graphs):
            # Edge IDs are unchanged by compaction, so pass None to share frames.
            edge_frames = utils.extract_edge_subframes(g, None)
            utils.set_new_frames(new_g, edge_frames=edge_frames)

    if return_single:
        new_graphs = new_graphs[0]

    return new_graphs

2069
def to_block(g, dst_nodes=None, include_dst_in_src=True, src_nodes=None):
    """Convert a graph into a bipartite-structured *block* for message passing.

    A block is a graph consisting of two sets of nodes: the
    *source* nodes and *destination* nodes.  The source and destination nodes can have multiple
    node types.  All the edges connect from source nodes to destination nodes.

    Specifically, the source nodes and destination nodes will have the same node types as the
    ones in the original graph.  DGL maps each edge ``(u, v)`` with edge type
    ``(utype, etype, vtype)`` in the original graph to the edge with type
    ``etype`` connecting from node ID ``u`` of type ``utype`` in the source side to node
    ID ``v`` of type ``vtype`` in the destination side.

    For blocks returned by :func:`to_block`, the destination nodes of the block will only
    contain the nodes that have at least one inbound edge of any type.  The source nodes
    of the block will only contain the nodes that appear in the destination nodes, as well
    as the nodes that have at least one outbound edge connecting to one of the destination nodes.

    The destination nodes are specified by the :attr:`dst_nodes` argument if it is not None.

    Parameters
    ----------
    g : DGLGraph
        The graph.  Can be either on CPU or GPU.
    dst_nodes : Tensor or dict[str, Tensor], optional
        The list of destination nodes.

        If a tensor is given, the graph must have only one node type.

        If given, it must be a superset of all the nodes that have at least one inbound
        edge.  An error will be raised otherwise.
    include_dst_in_src : bool
        If False, do not include destination nodes in source nodes.

        (Default: True)
    src_nodes : Tensor or dict[str, Tensor], optional
        The list of source nodes (and prefixed by destination nodes if
        `include_dst_in_src` is True).

        If a tensor is given, the graph must have only one node type.

    Returns
    -------
    DGLBlock
        The new graph describing the block.

        The node IDs induced for each type in both sides would be stored in feature
        ``dgl.NID``.

        The edge IDs induced for each type would be stored in feature ``dgl.EID``.

    Raises
    ------
    DGLError
        If :attr:`dst_nodes` is specified but it is not a superset of all the nodes that
        have at least one inbound edge.

        If :attr:`dst_nodes` is not None, and :attr:`g` and :attr:`dst_nodes`
        are not in the same context.

    Notes
    -----
    :func:`to_block` is most commonly used in customizing neighborhood sampling
    for stochastic training on a large graph.  Please refer to the user guide
    :ref:`guide-minibatch` for a more thorough discussion about the methodology
    of stochastic training.

    See also :func:`create_block` for more flexible construction of blocks.

    Examples
    --------
    Converting a homogeneous graph to a block as described above:

    >>> g = dgl.graph(([1, 2], [2, 3]))
    >>> block = dgl.to_block(g, torch.LongTensor([3, 2]))

    The destination nodes would be exactly the same as the ones given: [3, 2].

    >>> induced_dst = block.dstdata[dgl.NID]
    >>> induced_dst
    tensor([3, 2])

    The first few source nodes would also be exactly the same as
    the ones given.  The rest of the nodes are the ones necessary for message passing
    into nodes 3, 2.  This means that the node 1 would be included.

    >>> induced_src = block.srcdata[dgl.NID]
    >>> induced_src
    tensor([3, 2, 1])

    You can notice that the first two nodes are identical to the given nodes as well as
    the destination nodes.

    The induced edges can also be obtained by the following:

    >>> block.edata[dgl.EID]
    tensor([2, 1])

    This indicates that edge (2, 3) and (1, 2) are included in the result graph.  You can
    verify that the first edge in the block indeed maps to the edge (2, 3), and the
    second edge in the block indeed maps to the edge (1, 2):

    >>> src, dst = block.edges(order='eid')
    >>> induced_src[src], induced_dst[dst]
    (tensor([2, 1]), tensor([3, 2]))

    The destination nodes specified must be a superset of the nodes that have edges connecting
    to them.  For example, the following will raise an error since the destination nodes
    does not contain node 3, which has an edge connecting to it.

    >>> g = dgl.graph(([1, 2], [2, 3]))
    >>> dgl.to_block(g, torch.LongTensor([2]))     # error

    Converting a heterogeneous graph to a block is similar, except that when specifying
    the destination nodes, you have to give a dict:

    >>> g = dgl.heterograph({('A', '_E', 'B'): ([1, 2], [2, 3])})

    If you don't specify any node of type A on the destination side, the node type ``A``
    in the block would have zero nodes on the destination side.

    >>> block = dgl.to_block(g, {'B': torch.LongTensor([3, 2])})
    >>> block.number_of_dst_nodes('A')
    0
    >>> block.number_of_dst_nodes('B')
    2
    >>> block.dstnodes['B'].data[dgl.NID]
    tensor([3, 2])

    The source side would contain all the nodes on the destination side:

    >>> block.srcnodes['B'].data[dgl.NID]
    tensor([3, 2])

    As well as all the nodes that have connections to the nodes on the destination side:

    >>> block.srcnodes['A'].data[dgl.NID]
    tensor([2, 1])

    See also
    --------
    create_block
    """
    if dst_nodes is None:
        # Find all nodes that appeared as destinations
        dst_nodes = defaultdict(list)
        for etype in g.canonical_etypes:
            _, dst = g.edges(etype=etype)
            dst_nodes[etype[2]].append(dst)
        dst_nodes = {ntype: F.unique(F.cat(values, 0)) for ntype, values in dst_nodes.items()}
    elif not isinstance(dst_nodes, Mapping):
        # dst_nodes is a Tensor, check if the g has only one type.
        if len(g.ntypes) > 1:
            raise DGLError(
                'Graph has more than one node type; please specify a dict for dst_nodes.')
        dst_nodes = {g.ntypes[0]: dst_nodes}

    dst_node_ids = [
        utils.toindex(dst_nodes.get(ntype, []), g._idtype_str).tousertensor(
            ctx=F.to_backend_ctx(g._graph.ctx))
        for ntype in g.ntypes]
    dst_node_ids_nd = [F.to_dgl_nd(nodes) for nodes in dst_node_ids]

    for d in dst_node_ids_nd:
        if g._graph.ctx != d.ctx:
            raise ValueError('g and dst_nodes need to have the same context.')

    src_node_ids = None
    src_node_ids_nd = None
    if src_nodes is not None:
        # BUGFIX: previously a Mapping src_nodes skipped this whole branch and
        # was silently ignored; now both tensor and dict inputs are honored.
        if not isinstance(src_nodes, Mapping):
            # src_nodes is a Tensor, check if the g has only one type.
            if len(g.ntypes) > 1:
                raise DGLError(
                    'Graph has more than one node type; please specify a dict for src_nodes.')
            src_nodes = {g.ntypes[0]: src_nodes}
        src_node_ids = [
            F.copy_to(F.tensor(src_nodes.get(ntype, []), dtype=g._idtype_str), \
                F.to_backend_ctx(g._graph.ctx)) \
            for ntype in g.ntypes]
        src_node_ids_nd = [F.to_dgl_nd(nodes) for nodes in src_node_ids]

        for d in src_node_ids_nd:
            if g._graph.ctx != d.ctx:
                raise ValueError('g and src_nodes need to have the same context.')
    else:
        # use an empty list to signal we need to generate it
        src_node_ids_nd = []

    new_graph_index, src_nodes_ids_nd, induced_edges_nd = _CAPI_DGLToBlock(
        g._graph, dst_node_ids_nd, include_dst_in_src, src_node_ids_nd)

    # The new graph duplicates the original node types to SRC and DST sets.
    new_ntypes = (g.ntypes, g.ntypes)
    new_graph = DGLBlock(new_graph_index, new_ntypes, g.etypes)
    assert new_graph.is_unibipartite  # sanity check

    src_node_ids = [F.from_dgl_nd(src) for src in src_nodes_ids_nd]
    edge_ids = [F.from_dgl_nd(eid) for eid in induced_edges_nd]

    node_frames = utils.extract_node_subframes_for_block(g, src_node_ids, dst_node_ids)
    edge_frames = utils.extract_edge_subframes(g, edge_ids)
    utils.set_new_frames(new_graph, node_frames=node_frames, edge_frames=edge_frames)

    return new_graph

2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
def _coalesce_edge_frame(g, edge_maps, counts, aggregator):
    r"""Coalesce edge features of duplicate edges via given aggregator in g.

    Parameters
    ----------
    g : DGLGraph
        The input graph.
    edge_maps : List[Tensor]
        The edge mapping corresponding to each edge type in g.
    counts : List[Tensor]
        The number of duplicated edges from the original graph for each edge type.
    aggregator : str
        Indicates how to coalesce edge features, could be ``arbitrary``, ``sum``
        or ``mean``.

    Returns
    -------
    List[Frame]
        The frames corresponding to each edge type.
    """
    num_etypes = len(g.canonical_etypes)
    if aggregator == 'arbitrary':
        # Keep the feature row of the first occurrence of each output edge.
        eids = []
        for etid in range(num_etypes):
            mapping = F.asnumpy(edge_maps[etid])
            _, first_occurrence = np.unique(mapping, return_index=True)
            eids.append(F.zerocopy_from_numpy(first_occurrence))
        return utils.extract_edge_subframes(g, eids)

    if aggregator in ('sum', 'mean'):
        # Reduce duplicate edge rows into one row per output edge.
        edge_frames = []
        for etid in range(num_etypes):
            mapping = edge_maps[etid]
            num_rows = len(np.unique(F.asnumpy(mapping)))
            new_columns = {}
            for key, col in g._edge_frames[etid]._columns.items():
                aggregated = F.scatter_add(col.data, mapping, num_rows)
                if aggregator == 'mean':
                    # Broadcast the per-edge duplicate counts over feature dims.
                    denom = F.astype(counts[etid], F.dtype(col.data))
                    denom = F.reshape(
                        denom, (F.shape(denom)[0],) + (1,) * (F.ndim(col.data) - 1))
                    aggregated /= denom
                new_columns[key] = aggregated
            edge_frames.append(Frame(data=new_columns, num_rows=num_rows))
        return edge_frames

    raise DGLError("Aggregator {} not regonized, cannot coalesce edge feature in the "
                   "specified way".format(aggregator))

2326
2327
2328
2329
def to_simple(g,
              return_counts='count',
              writeback_mapping=False,
              copy_ndata=True,
              copy_edata=False,
              aggregator='arbitrary'):
    r"""Convert a graph to a simple graph without parallel edges and return.

    For a heterogeneous graph with multiple edge types, DGL treats edges with the same
    edge type and endpoints as parallel edges and removes them.
    Optionally, one can get the number of parallel edges by specifying the
    :attr:`return_counts` argument. To get a mapping from the edge IDs in the
    input graph to the edge IDs in the resulting graph, set :attr:`writeback_mapping`
    to true.

    Parameters
    ----------
    g : DGLGraph
        The input graph.  Must be on CPU.
    return_counts : str, optional
        If given, the count of each edge in the original graph
        will be stored as edge features under the name
        ``return_counts``.  The old features with the same name will be replaced.

        (Default: "count")
    writeback_mapping: bool, optional
        If True, return an extra write-back mapping for each edge
        type. The write-back mapping is a tensor recording
        the mapping from the edge IDs in the input graph to
        the edge IDs in the result graph. If the graph is
        heterogeneous, DGL returns a dictionary of edge types and such
        tensors.

        If False, only the simple graph is returned.

        (Default: False)
    copy_ndata: bool, optional
        If True, the node features of the simple graph are copied
        from the original graph.

        If False, the simple graph will not have any node features.

        (Default: True)
    copy_edata: bool, optional
        If True, the edge features of the simple graph are copied
        from the original graph. If there exists duplicate edges between
        two nodes (u, v), the feature of the edge is the aggregation
        of edge feature of duplicate edges.

        If False, the simple graph will not have any edge features.

        (Default: False)
    aggregator: str, optional
        Indicate how to coalesce edge feature of duplicate edges.
        If ``arbitrary``, select one of the duplicate edges' feature.
        If ``sum``, compute the summation of duplicate edges' feature.
        If ``mean``, compute the average of duplicate edges' feature.

        (Default: ``arbitrary``)

    Returns
    -------
    DGLGraph
        The graph.
    tensor or dict of tensor
        The writeback mapping. Only when ``writeback_mapping`` is True.

    Notes
    -----
    If :attr:`copy_ndata` is True, the resulting graph will share the node feature
    tensors with the input graph. Hence, users should try to avoid in-place operations
    which will be visible to both graphs.

    This function discards the batch information. Please use
    :func:`dgl.DGLGraph.set_batch_num_nodes`
    and :func:`dgl.DGLGraph.set_batch_num_edges` on the transformed graph
    to maintain the information.

    Examples
    --------
    **Homogeneous Graphs**

    Create a graph for demonstrating to_simple API.
    In the original graph, there are multiple edges between 1 and 2.

    >>> import dgl
    >>> import torch as th
    >>> g = dgl.graph((th.tensor([0, 1, 2, 1]), th.tensor([1, 2, 0, 2])))
    >>> g.ndata['h'] = th.tensor([[0.], [1.], [2.]])
    >>> g.edata['h'] = th.tensor([[3.], [4.], [5.], [6.]])

    Convert the graph to a simple graph. The return counts is
    stored in the edge feature 'cnt' and the writeback mapping
    is returned in a tensor.

    >>> sg, wm = dgl.to_simple(g, return_counts='cnt', writeback_mapping=True)
    >>> sg.ndata['h']
    tensor([[0.],
            [1.],
            [2.]])
    >>> u, v, eid = sg.edges(form='all')
    >>> u
    tensor([0, 1, 2])
    >>> v
    tensor([1, 2, 0])
    >>> eid
    tensor([0, 1, 2])
    >>> sg.edata['cnt']
    tensor([1, 2, 1])
    >>> wm
    tensor([0, 1, 2, 1])
    >>> 'h' in g.edata
    False

    **Heterogeneous Graphs**

    >>> g = dgl.heterograph({
    ...     ('user', 'wins', 'user'): (th.tensor([0, 2, 0, 2, 2]), th.tensor([1, 1, 2, 1, 0])),
    ...     ('user', 'plays', 'game'): (th.tensor([1, 2, 1]), th.tensor([2, 1, 1]))
    ... })
    >>> g.nodes['game'].data['hv'] = th.ones(3, 1)
    >>> g.edges['plays'].data['he'] = th.zeros(3, 1)

    The return counts is stored in the default edge feature 'count' for each edge type.

    >>> sg, wm = dgl.to_simple(g, copy_ndata=False, writeback_mapping=True)
    >>> sg
    Graph(num_nodes={'game': 3, 'user': 3},
          num_edges={('user', 'wins', 'user'): 4, ('game', 'plays', 'user'): 3},
          metagraph=[('user', 'user'), ('game', 'user')])
    >>> sg.edges(etype='wins')
    (tensor([0, 2, 0, 2]), tensor([1, 1, 2, 0]))
    >>> wm[('user', 'wins', 'user')]
    tensor([0, 1, 2, 1, 3])
    >>> sg.edges(etype='plays')
    (tensor([2, 1, 1]), tensor([1, 2, 1]))
    >>> wm[('user', 'plays', 'game')]
    tensor([0, 1, 2])
    >>> 'hv' in sg.nodes['game'].data
    False
    >>> 'he' in sg.edges['plays'].data
    False
    >>> sg.edata['count']
    {('user', 'wins', 'user'): tensor([1, 2, 1, 1])
     ('user', 'plays', 'game'): tensor([1, 1, 1])}
    """
    assert g.device == F.cpu(), 'the graph must be on CPU'
    if g.is_block:
        raise DGLError('Cannot convert a block graph to a simple graph.')
    # The C API returns, per edge type, the simple graph plus the duplicate
    # counts and the original->simple edge ID mapping.
    simple_graph_index, counts, edge_maps = _CAPI_DGLToSimpleHetero(g._graph)
    simple_graph = DGLHeteroGraph(simple_graph_index, g.ntypes, g.etypes)
    counts = [F.from_dgl_nd(count) for count in counts]
    edge_maps = [F.from_dgl_nd(edge_map) for edge_map in edge_maps]

    if copy_ndata:
        # Shallow copy: node frames are shared with the input graph.
        node_frames = utils.extract_node_subframes(g, None)
        utils.set_new_frames(simple_graph, node_frames=node_frames)
    if copy_edata:
        # Duplicate edges collapse to one; coalesce their features.
        new_edge_frames = _coalesce_edge_frame(g, edge_maps, counts, aggregator)
        utils.set_new_frames(simple_graph, edge_frames=new_edge_frames)

    if return_counts is not None:
        for count, canonical_etype in zip(counts, g.canonical_etypes):
            simple_graph.edges[canonical_etype].data[return_counts] = count

    if writeback_mapping:
        # single edge type
        if len(edge_maps) == 1:
            return simple_graph, edge_maps[0]
        # multiple edge type
        else:
            wb_map = {}
            for edge_map, canonical_etype in zip(edge_maps, g.canonical_etypes):
                wb_map[canonical_etype] = edge_map
            return simple_graph, wb_map

    return simple_graph

2504
# Expose ``to_simple`` as a method on DGLHeteroGraph: ``g.to_simple(...)``.
DGLHeteroGraph.to_simple = utils.alias_func(to_simple)
2505

2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
def _unitgraph_less_than_int32(g):
    """Check if a graph with only one edge type has more than 2 ** 31 - 1
    nodes or edges.
    """
    num_edges = g.num_edges()
    num_nodes = max(g.num_nodes(g.ntypes[0]), g.num_nodes(g.ntypes[-1]))
    return max(num_nodes, num_edges) <= (1 << 31) - 1

def adj_product_graph(A, B, weight_name, etype='_E'):
    r"""Create a weighted graph whose adjacency matrix is the product of
    the adjacency matrices of the given two graphs.

    Namely, given two weighted graphs :attr:`A` and :attr:`B`, whose rows
    represent source nodes and columns represent destination nodes, this function
    returns a new graph whose weighted adjacency matrix is
    :math:`\mathrm{adj}(A) \times \mathrm{adj}(B)`.

    The two graphs must be simple graphs, and must have only one edge type.
    Moreover, the number of nodes of the destination node type of :attr:`A` must
    be the same as the number of nodes of the source node type of :attr:`B`.

    The source node type of the returned graph will be the same as the source
    node type of graph :attr:`A`.  The destination node type of the returned
    graph will be the same as the destination node type of graph :attr:`B`.
    If the two node types are the same, the returned graph will be homogeneous.
    Otherwise, it will be a bipartite graph.

    Unlike ``scipy``, if an edge in the result graph has zero weight, it will
    not be removed from the graph.

    Notes
    -----
    This function works on both CPU and GPU.  For GPU, the number of nodes and
    edges must be less than the maximum of ``int32`` (i.e. ``2 ** 31 - 1``) due
    to restriction of cuSPARSE.

    The edge weights returned by this function is differentiable w.r.t. the
    input edge weights.

    If the graph format is restricted, both graphs must have CSR available.

    Parameters
    ----------
    A : DGLGraph
        The graph as left operand.
    B : DGLGraph
        The graph as right operand.
    weight_name : str
        The feature name of edge weight of both graphs.

        The corresponding edge feature must be scalar.
    etype : str, optional
        The edge type of the returned graph.

    Returns
    -------
    DGLGraph
        The new graph.  The edge weight of the returned graph will have the
        same feature name as :attr:`weight_name`.

    Examples
    --------
    The following shows weighted adjacency matrix multiplication between two
    bipartite graphs.  You can also perform this between two homogeneous
    graphs, or one homogeneous graph and one bipartite graph, as long as the
    numbers of nodes of the same type match.

    >>> A = dgl.heterograph({
    ...     ('A', 'AB', 'B'): ([2, 2, 0, 2, 0, 1], [2, 1, 0, 0, 2, 2])},
    ...     num_nodes_dict={'A': 3, 'B': 4})
    >>> B = dgl.heterograph({
    ...     ('B', 'BA', 'A'): ([0, 3, 2, 1, 3, 3], [1, 2, 0, 2, 1, 0])},
    ...     num_nodes_dict={'A': 3, 'B': 4})

    If your graph is a multigraph, you will need to call :func:`dgl.to_simple`
    to convert it into a simple graph first.

    >>> A = dgl.to_simple(A)
    >>> B = dgl.to_simple(B)

    Initialize learnable edge weights.

    >>> A.edata['w'] = torch.randn(6).requires_grad_()
    >>> B.edata['w'] = torch.randn(6).requires_grad_()

    Take the product.

    >>> C = dgl.adj_product_graph(A, B, 'w')
    >>> C.edges()
    (tensor([0, 0, 1, 2, 2, 2]), tensor([0, 1, 0, 0, 2, 1]))

    >>> C.edata['w']
    tensor([0.6906, 0.2002, 0.0591, 0.3672, 0.1066, 0.1328],
           grad_fn=<CSRMMBackward>)

    Note that this function is differentiable:

    >>> C.edata['w'].sum().backward()
    >>> A.edata['w'].grad
    tensor([0.7153, 0.2775, 0.7141, 0.7141, 0.7153, 0.7153])

    >>> B.edata['w'].grad
    tensor([0.4664, 0.0000, 1.5614, 0.3840, 0.0000, 0.0000])

    If the source node type of the left operand is the same as the destination
    node type of the right operand, this function returns a homogeneous graph:

    >>> C.ntypes
    ['A']

    Otherwise, it returns a bipartite graph instead:

    >>> A = dgl.heterograph({
    ...     ('A', 'AB', 'B'): ([2, 2, 0, 2, 0, 1], [2, 1, 0, 0, 2, 2])},
    ...     num_nodes_dict={'A': 3, 'B': 4})
    >>> B = dgl.heterograph({
    ...     ('B', 'BC', 'C'): ([0, 3, 2, 1, 3, 3], [1, 2, 0, 2, 1, 0])},
    ...     num_nodes_dict={'C': 3, 'B': 4})
    >>> A.edata['w'] = torch.randn(6).requires_grad_()
    >>> B.edata['w'] = torch.randn(6).requires_grad_()
    >>> C = dgl.adj_product_graph(A, B, 'w')
    >>> C.ntypes
    ['A', 'C']
    """
    srctype, _, _ = A.canonical_etypes[0]
    _, _, dsttype = B.canonical_etypes[0]
    # The result collapses to a homogeneous graph when src and dst types match.
    num_vtypes = 1 if srctype == dsttype else 2
    ntypes = [srctype] if num_vtypes == 1 else [srctype, dsttype]

    if A.device != F.cpu():
        # cuSPARSE requires node/edge counts to fit in int32.
        if not (_unitgraph_less_than_int32(A) and _unitgraph_less_than_int32(B)):
            raise ValueError(
                'For GPU graphs the number of nodes and edges must be less than 2 ** 31 - 1.')

    # Sparse matrix multiplication on the backend; returns the result graph
    # index and the per-edge weights (differentiable).
    C_gidx, C_weights = F.csrmm(
        A._graph, A.edata[weight_name], B._graph, B.edata[weight_name], num_vtypes)
    num_nodes_dict = {srctype: A.num_nodes(srctype), dsttype: B.num_nodes(dsttype)}
    C_metagraph, ntypes, etypes, _ = \
        create_metagraph_index(ntypes, [(srctype, etype, dsttype)])
    num_nodes_per_type = [num_nodes_dict[ntype] for ntype in ntypes]
    C_gidx = create_heterograph_from_relations(
        C_metagraph, [C_gidx], utils.toindex(num_nodes_per_type))

    C = DGLHeteroGraph(C_gidx, ntypes, etypes)
    C.edata[weight_name] = C_weights
    return C

def adj_sum_graph(graphs, weight_name):
    r"""Create a weighted graph whose adjacency matrix is the sum of the
    adjacency matrices of the given graphs, whose rows represent source nodes
    and columns represent destination nodes.

    All the graphs must be simple graphs, and must have only one edge type.
    They also must have the same metagraph, i.e. have the same source node type
    and the same destination node type.  Moreover, the number of nodes for every
    graph must also be the same.

    The metagraph of the returned graph will be the same as the input graphs.

    Unlike ``scipy``, if an edge in the result graph has zero weight, it will
    not be removed from the graph.

    Notes
    -----
    This function works on both CPU and GPU.  For GPU, the number of nodes and
    edges must be less than the maximum of ``int32`` (i.e. ``2 ** 31 - 1``) due
    to restriction of cuSPARSE.

    The edge weights returned by this function is differentiable w.r.t. the
    input edge weights.

    If the graph format is restricted, both graphs must have CSR available.

    Parameters
    ----------
    graphs : list[DGLGraph]
        The list of graphs.  Must have at least one element.
    weight_name : str
        The feature name of edge weight of both graphs.

        The corresponding edge feature must be scalar.

    Returns
    -------
    DGLGraph
        The new graph.  The edge weight of the returned graph will have the
        same feature name as :attr:`weight_name`.

    Raises
    ------
    ValueError
        If :attr:`graphs` is empty, or a GPU graph exceeds int32 size limits.

    Examples
    --------
    The following shows weighted adjacency matrix summation between two
    bipartite graphs.  You can also perform this between homogeneous graphs.

    >>> A = dgl.heterograph(
    ...     {('A', 'AB', 'B'): ([2, 2, 0, 2, 0, 1], [2, 1, 0, 0, 2, 2])},
    ...     num_nodes_dict={'A': 3, 'B': 4})
    >>> B = dgl.heterograph(
    ...     {('A', 'AB', 'B'): ([1, 2, 0, 2, 1, 0], [0, 3, 2, 1, 3, 3])},
    ...     num_nodes_dict={'A': 3, 'B': 4})

    If your graph is a multigraph, call :func:`dgl.to_simple`
    to convert it into a simple graph first.

    >>> A = dgl.to_simple(A)
    >>> B = dgl.to_simple(B)

    Initialize learnable edge weights.

    >>> A.edata['w'] = torch.randn(6).requires_grad_()
    >>> B.edata['w'] = torch.randn(6).requires_grad_()

    Take the sum.

    >>> C = dgl.adj_sum_graph([A, B], 'w')
    >>> C.edges()
    (tensor([0, 0, 0, 1, 1, 1, 2, 2, 2, 2]),
     tensor([0, 2, 3, 2, 0, 3, 0, 1, 2, 3]))

    Note that this function is differentiable:

    >>> C.edata['w'].sum().backward()
    >>> A.edata['w'].grad
    tensor([1., 1., 1., 1., 1., 1.])

    >>> B.edata['w'].grad
    tensor([1., 1., 1., 1., 1., 1.])
    """
    if len(graphs) == 0:
        raise ValueError('The list of graphs must not be empty.')

    if graphs[0].device != F.cpu():
        # cuSPARSE requires node/edge counts to fit in int32.
        if not all(_unitgraph_less_than_int32(A) for A in graphs):
            raise ValueError(
                'For GPU graphs the number of nodes and edges must be less than 2 ** 31 - 1.')
    # All graphs share the same metagraph and node counts; take them from the first.
    metagraph = graphs[0]._graph.metagraph
    num_nodes = utils.toindex(
        [graphs[0]._graph.number_of_nodes(i) for i in range(graphs[0]._graph.number_of_ntypes())])
    weights = [A.edata[weight_name] for A in graphs]
    gidxs = [A._graph for A in graphs]
    # Sparse matrix summation on the backend (differentiable).
    C_gidx, C_weights = F.csrsum(gidxs, weights)
    C_gidx = create_heterograph_from_relations(metagraph, [C_gidx], num_nodes)

    C = DGLHeteroGraph(C_gidx, graphs[0].ntypes, graphs[0].etypes)
    C.edata[weight_name] = C_weights
    return C

2754
def as_heterograph(g, ntype='_U', etype='_E'):  # pylint: disable=unused-argument
    """Convert a DGLGraph to a DGLHeteroGraph with one node and edge type.

    DEPRECATED: DGLGraph and DGLHeteroGraph have been merged. This function will
                do nothing and can be removed safely in all cases.
    """
    # Kept only for backward compatibility; emits a deprecation warning and
    # returns the input unchanged.
    dgl_warning('DEPRECATED: DGLGraph and DGLHeteroGraph have been merged in v0.5.\n'
                '\tdgl.as_heterograph will do nothing and can be removed safely in all cases.')
    return g
2763
2764
2765
2766

def as_immutable_graph(hg):
    """Convert a DGLHeteroGraph with one node and edge type into a DGLGraph.

    DEPRECATED: DGLGraph and DGLHeteroGraph have been merged. This function will
                do nothing and can be removed safely in all cases.
    """
    # Kept only for backward compatibility; emits a deprecation warning and
    # returns the input unchanged.
    dgl_warning('DEPRECATED: DGLGraph and DGLHeteroGraph have been merged in v0.5.\n'
                '\tdgl.as_immutable_graph will do nothing and can be removed safely in all cases.')
    return hg
2773

2774
2775
def sort_csr_by_tag(g, tag, tag_offset_name='_TAG_OFFSET'):
    r"""Return a new graph whose CSR matrix is sorted by the given tag.

    Sort the internal CSR matrix of the graph so that the adjacency list of each node,
    which contains the out-edges, is sorted by the tag of the out-neighbors.
    After sorting, edges sharing the same tag will be arranged in a consecutive range in
    a node's adjacency list. Following is an example:

        Consider a graph as follows::

            0 -> 0, 1, 2, 3, 4
            1 -> 0, 1, 2

        Given node tags ``[1, 1, 0, 2, 0]``, each node's adjacency list
        will be sorted as follows::

            0 -> 2, 4, 0, 1, 3
            1 -> 2, 0, 1

    The function will also return the starting offsets of the tag
    segments in a tensor of shape :math:`(N, max\_tag+2)`. For node ``i``,
    its out-edges connecting to node tag ``j`` is stored between
    ``tag_offsets[i][j]`` ~ ``tag_offsets[i][j+1]``. Since the offsets
    can be viewed as node data, we store it in the
    ``ndata`` of the returned graph. Users can specify the
    ndata name by the :attr:`tag_offset_name` argument.

    Note that the function will not change the edge ID neither
    how the edge features are stored. The input graph must
    allow CSR format. The graph must be on CPU.

    If the input graph is heterogenous, it must have only one edge
    type and two node types (i.e., source and destination node types).
    In this case, the provided node tags are for the destination nodes,
    and the tag offsets are stored in the source node data.

    The sorted graph and the calculated tag offsets are needed by
    certain operators that consider node tags. See
    :func:`~dgl.sampling.sample_neighbors_biased` for an example.

    Parameters
    ------------
    g : DGLGraph
        The input graph.
    tag : Tensor
        Integer tensor of shape :math:`(N,)`, :math:`N` being the number of (destination) nodes.
    tag_offset_name : str
        The name of the node feature to store tag offsets.

    Returns
    -------
    g_sorted : DGLGraph
        A new graph whose CSR is sorted. The node/edge features of the
        input graph is shallow-copied over.

        - ``g_sorted.ndata[tag_offset_name]`` : Tensor of shape :math:`(N, max\_tag + 2)`.
        - If ``g`` is heterogeneous, get from ``g_sorted.srcdata``.

    Examples
    -----------

    >>> g = dgl.graph(([0,0,0,0,0,1,1,1],[0,1,2,3,4,0,1,2]))
    >>> g.adjacency_matrix(scipy_fmt='csr').nonzero()
    (array([0, 0, 0, 0, 0, 1, 1, 1], dtype=int32),
     array([0, 1, 2, 3, 4, 0, 1, 2], dtype=int32))
    >>> tag = torch.IntTensor([1,1,0,2,0])
    >>> g_sorted = dgl.sort_csr_by_tag(g, tag)
    >>> g_sorted.adjacency_matrix(scipy_fmt='csr').nonzero()
    (array([0, 0, 0, 0, 0, 1, 1, 1], dtype=int32),
     array([2, 4, 0, 1, 3, 2, 0, 1], dtype=int32))
    >>> g_sorted.ndata['_TAG_OFFSET']
    tensor([[0, 2, 4, 5],
            [0, 1, 3, 3],
            [0, 0, 0, 0],
            [0, 0, 0, 0],
            [0, 0, 0, 0]])

    See Also
    --------
    dgl.sampling.sample_neighbors_biased
    """
    if len(g.etypes) > 1:
        raise DGLError("Only support homograph and bipartite graph")
    # Tags are assumed to be non-negative integers; number of tags = max + 1.
    num_tags = int(F.asnumpy(F.max(tag, 0))) + 1
    tag_arr = F.zerocopy_to_dgl_ndarray(tag)
    # Clone so that the input graph is left untouched; the C API returns the
    # sorted graph index and the per-node tag offset array.
    new_g = g.clone()
    new_g._graph, tag_pos_arr = _CAPI_DGLHeteroSortOutEdges(g._graph, tag_arr, num_tags)
    new_g.srcdata[tag_offset_name] = F.from_dgl_nd(tag_pos_arr)
    return new_g


2865
2866
def sort_csc_by_tag(g, tag, tag_offset_name='_TAG_OFFSET'):
    r"""Return a new graph whose CSC matrix is sorted by the given tag.

    Sort the internal CSC matrix of the graph so that the adjacency list of each node,
    which contains the in-edges, is sorted by the tag of the in-neighbors.
    After sorting, edges sharing the same tag will be arranged in a consecutive range in
    a node's adjacency list. Following is an example:

        Consider a graph as follows::

            0 <- 0, 1, 2, 3, 4
            1 <- 0, 1, 2

        Given node tags ``[1, 1, 0, 2, 0]``, each node's adjacency list
        will be sorted as follows::

            0 <- 2, 4, 0, 1, 3
            1 <- 2, 0, 1

    The function will also return the starting offsets of the tag
    segments in a tensor of shape :math:`(N, max\_tag+2)`. For a node ``i``,
    its in-edges connecting to node tag ``j`` is stored between
    ``tag_offsets[i][j]`` ~ ``tag_offsets[i][j+1]``. Since the offsets
    can be viewed as node data, we store it in the
    ``ndata`` of the returned graph. Users can specify the
    ndata name by the :attr:`tag_offset_name` argument.

    Note that the function will not change the edge ID neither
    how the edge features are stored. The input graph must
    allow CSC format. The graph must be on CPU.

    If the input graph is heterogenous, it must have only one edge
    type and two node types (i.e., source and destination node types).
    In this case, the provided node tags are for the source nodes,
    and the tag offsets are stored in the destination node data.

    The sorted graph and the calculated tag offsets are needed by
    certain operators that consider node tags. See :func:`~dgl.sampling.sample_neighbors_biased`
    for an example.

    Parameters
    ------------
    g : DGLGraph
        The input graph.
    tag : Tensor
        Integer tensor of shape :math:`(N,)`, :math:`N` being the number of (source) nodes.
    tag_offset_name : str
        The name of the node feature to store tag offsets.

    Returns
    -------
    g_sorted : DGLGraph
        A new graph whose CSC matrix is sorted. The node/edge features of the
        input graph is shallow-copied over.

        - ``g_sorted.ndata[tag_offset_name]`` : Tensor of shape :math:`(N, max\_tag + 2)`.
        - If ``g`` is heterogeneous, get from ``g_sorted.dstdata``.

    Examples
    -----------

    >>> g = dgl.graph(([0,1,2,3,4,0,1,2],[0,0,0,0,0,1,1,1]))
    >>> g.adjacency_matrix(scipy_fmt='csr', transpose=True).nonzero()
    (array([0, 0, 0, 0, 0, 1, 1, 1], dtype=int32),
     array([0, 1, 2, 3, 4, 0, 1, 2], dtype=int32)))
    >>> tag = torch.IntTensor([1,1,0,2,0])
    >>> g_sorted = dgl.sort_csc_by_tag(g, tag)
    >>> g_sorted.adjacency_matrix(scipy_fmt='csr', transpose=True).nonzero()
    (array([0, 0, 0, 0, 0, 1, 1, 1], dtype=int32),
     array([2, 4, 0, 1, 3, 2, 0, 1], dtype=int32))
    >>> g_sorted.ndata['_TAG_OFFSET']
    tensor([[0, 2, 4, 5],
            [0, 1, 3, 3],
            [0, 0, 0, 0],
            [0, 0, 0, 0],
            [0, 0, 0, 0]])

    See Also
    --------
    dgl.sampling.sample_neighbors_biased
    """
    if len(g.etypes) > 1:
        raise DGLError("Only support homograph and bipartite graph")
    # Tags are assumed to be non-negative integers; number of tags = max + 1.
    num_tags = int(F.asnumpy(F.max(tag, 0))) + 1
    tag_arr = F.zerocopy_to_dgl_ndarray(tag)
    # Clone so that the input graph is left untouched; the C API returns the
    # sorted graph index and the per-node tag offset array.
    new_g = g.clone()
    new_g._graph, tag_pos_arr = _CAPI_DGLHeteroSortInEdges(g._graph, tag_arr, num_tags)
    new_g.dstdata[tag_offset_name] = F.from_dgl_nd(tag_pos_arr)
    return new_g

2956

2957
2958
2959
def reorder_graph(g, node_permute_algo='rcmk', edge_permute_algo='src',
                  store_ids=True, permute_config=None):
    r"""Return a new graph with nodes and edges re-ordered/re-labeled
    according to the specified permute algorithm.

    Support homogeneous graph only for the moment.

    The re-ordering has two 2 steps: first re-order nodes and then re-order edges.

    For node permutation, users can re-order by the :attr:`node_permute_algo`
    argument. For edge permutation, user can re-arrange edges according to their
    source nodes or destination nodes by the :attr:`edge_permute_algo` argument.
    Some of the permutation algorithms are only implemented in CPU, so if the
    input graph is on GPU, it will be copied to CPU first. The storage order of
    the node and edge features in the graph are permuted accordingly.

    Parameters
    ----------
    g : DGLGraph
        The homogeneous graph.
    node_permute_algo: str, optional
        The permutation algorithm to re-order nodes. Options are ``rcmk`` or
        ``metis`` or ``custom``. ``rcmk`` is the default value.

        * ``rcmk``: Use the `Reverse Cuthill–McKee <https://docs.scipy.org/doc/scipy/reference/
          generated/scipy.sparse.csgraph.reverse_cuthill_mckee.html#
          scipy-sparse-csgraph-reverse-cuthill-mckee>`__ from ``scipy`` to generate nodes
          permutation.
        * ``metis``: Use the :func:`~dgl.metis_partition_assignment` function
          to partition the input graph, which gives a cluster assignment of each node.
          DGL then sorts the assignment array so the new node order will put nodes of
          the same cluster together. Please note that the generated nodes permutation
          of ``metis`` is non-deterministic due to algorithm's nature.
        * ``custom``: Reorder the graph according to the user-provided node permutation
          array (provided in :attr:`permute_config`).
    edge_permute_algo: str, optional
        The permutation algorithm to reorder edges. Options are ``src`` or ``dst``.
        ``src`` is the default value.

        * ``src``: Edges are arranged according to their source nodes.
        * ``dst``: Edges are arranged according to their destination nodes.
    store_ids: bool, optional
        If True, DGL will store the original node and edge IDs in the ndata and edata
        of the resulting graph under name ``dgl.NID`` and ``dgl.EID``, respectively.
    permute_config: dict, optional
        Additional key-value config data for the specified permutation algorithm.

        * For ``rcmk``, this argument is not required.
        * For ``metis``, users should specify the number of partitions ``k`` (e.g.,
          ``permute_config={'k':10}`` to partition the graph to 10 clusters).
        * For ``custom``, users should provide a node permutation array ``nodes_perm``.
          The array must be an integer list or a tensor with the same device of the
          input graph.

    Returns
    -------
    DGLGraph
        The re-ordered graph.

    Examples
    --------
    >>> import dgl
    >>> import torch
    >>> g = dgl.graph((torch.tensor([0, 1, 2, 3, 4]), torch.tensor([2, 2, 3, 2, 3])))
    >>> g.ndata['h'] = torch.arange(g.num_nodes() * 2).view(g.num_nodes(), 2)
    >>> g.edata['w'] = torch.arange(g.num_edges() * 1).view(g.num_edges(), 1)
    >>> g.ndata
    {'h': tensor([[0, 1],
            [2, 3],
            [4, 5],
            [6, 7],
            [8, 9]])}
    >>> g.edata
    {'w': tensor([[0],
            [1],
            [2],
            [3],
            [4]])}

    Reorder according to ``'rcmk'`` permute algorithm.

    >>> rg = dgl.reorder_graph(g)
    >>> rg.ndata
    {'h': tensor([[8, 9],
            [6, 7],
            [2, 3],
            [4, 5],
            [0, 1]]), '_ID': tensor([4, 3, 1, 2, 0])}
    >>> rg.edata
    {'w': tensor([[4],
            [3],
            [1],
            [2],
            [0]]), '_ID': tensor([4, 3, 1, 2, 0])}

    Reorder according to ``'metis'`` permute algorithm.

    >>> rg = dgl.reorder_graph(g, 'metis', permute_config={'k':2})
    >>> rg.ndata
    {'h': tensor([[4, 5],
            [2, 3],
            [0, 1],
            [8, 9],
            [6, 7]]), '_ID': tensor([2, 1, 0, 4, 3])}
    >>> rg.edata
    {'w': tensor([[2],
            [1],
            [0],
            [4],
            [3]]), '_ID': tensor([2, 1, 0, 4, 3])}

    Reorder according to ``'custom'`` permute algorithm with user-provided nodes_perm.

    >>> nodes_perm = torch.randperm(g.num_nodes())
    >>> nodes_perm
    tensor([3, 2, 0, 4, 1])
    >>> rg = dgl.reorder_graph(g, 'custom', permute_config={'nodes_perm':nodes_perm})
    >>> rg.ndata
    {'h': tensor([[6, 7],
            [4, 5],
            [0, 1],
            [8, 9],
            [2, 3]]), '_ID': tensor([3, 2, 0, 4, 1])}
    >>> rg.edata
    {'w': tensor([[3],
            [2],
            [0],
            [4],
            [1]]), '_ID': tensor([3, 2, 0, 4, 1])}

    Reorder according to ``dst`` edge permute algorithm and refine further
    according to self-generated edges permutation. Please assure to specify
    ``relabel_nodes`` as ``False`` to keep the nodes order.

    >>> rg = dgl.reorder_graph(g, edge_permute_algo='dst')
    >>> rg.edges()
    (tensor([0, 3, 1, 2, 4]), tensor([1, 1, 3, 3, 3]))
    >>> eg = dgl.edge_subgraph(rg, [0, 2, 4, 1, 3], relabel_nodes=False)
    >>> eg.edata
    {'w': tensor([[4],
            [3],
            [0],
            [2],
            [1]]), '_ID': tensor([0, 2, 4, 1, 3])}

    """
    # sanity checks
    if not g.is_homogeneous:
        raise DGLError("Homograph is supported only.")
    expected_node_algo = ['rcmk', 'metis', 'custom']
    if node_permute_algo not in expected_node_algo:
        raise DGLError("Unexpected node_permute_algo is specified: {}. Expected algos: {}".format(
            node_permute_algo, expected_node_algo))
    expected_edge_algo = ['src', 'dst']
    if edge_permute_algo not in expected_edge_algo:
        raise DGLError("Unexpected edge_permute_algo is specified: {}. Expected algos: {}".format(
            edge_permute_algo, expected_edge_algo))

    # generate nodes permutation
    if node_permute_algo == 'rcmk':
        nodes_perm = rcmk_perm(g)
    elif node_permute_algo == 'metis':
        if permute_config is None or 'k' not in permute_config:
            raise DGLError(
                "Partition parts 'k' is required for metis. Please specify in permute_config.")
        nodes_perm = metis_perm(g, permute_config['k'])
    else:
        if permute_config is None or 'nodes_perm' not in permute_config:
            raise DGLError(
                "permute_algo is specified as custom, but no 'nodes_perm' is specified in \
                    permute_config.")
        nodes_perm = permute_config['nodes_perm']
        if len(nodes_perm) != g.num_nodes():
            raise DGLError("Length of passed in nodes_perm[{}] does not \
                    match graph num_nodes[{}].".format(len(nodes_perm), g.num_nodes()))

    # reorder nodes via node_subgraph, which also relabels node/edge features
    rg = subgraph.node_subgraph(g, nodes_perm, store_ids=store_ids)

    # reorder edges
    if edge_permute_algo == 'src':
        # the output graph of dgl.node_subgraph() is ordered/labeled
        # according to src already. Nothing needs to do.
        pass
    elif edge_permute_algo == 'dst':
        edges_perm = np.argsort(F.asnumpy(rg.edges()[1]))
        rg = subgraph.edge_subgraph(
            rg, edges_perm, relabel_nodes=False, store_ids=store_ids)

    return rg
3147
3148


3149
# Expose ``reorder_graph`` as a method on DGLHeteroGraph: ``g.reorder_graph(...)``.
DGLHeteroGraph.reorder_graph = utils.alias_func(reorder_graph)
3150
3151


3152
3153
3154
def metis_perm(g, k):
    r"""Return nodes permutation according to ``'metis'`` algorithm.

    For internal use.

    Parameters
    ----------
    g : DGLGraph
        The homogeneous graph.
    k: int
        The partition parts number.

    Returns
    -------
    iterable[int]
        The nodes permutation.
    """
    # METIS partition assignment runs on CPU; move the graph over if needed.
    pids = metis_partition_assignment(
        g if g.device == F.cpu() else g.to(F.cpu()), k)
    pids = F.asnumpy(pids)
    # Sorting the cluster assignment groups nodes of the same partition together.
    return np.argsort(pids).copy()
3173
3174


3175
3176
3177
def rcmk_perm(g):
    r"""Return nodes permutation according to ``'rcmk'`` algorithm.

    For internal use.

    Parameters
    ----------
    g : DGLGraph
        The homogeneous graph.

    Returns
    -------
    iterable[int]
        The nodes permutation (a numpy array of node IDs).
    """
    fmat = 'csr'
    # The graph's allowed sparse formats may be restricted; enable CSR if it
    # is not currently available so that g.adj(scipy_fmt='csr') works.
    allowed_fmats = sum(g.formats().values(), [])
    if fmat not in allowed_fmats:
        g = g.formats(allowed_fmats + [fmat])
    csr_adj = g.adj(scipy_fmt=fmat)
    # Reverse Cuthill-McKee ordering from scipy's csgraph module.
    perm = sparse.csgraph.reverse_cuthill_mckee(csr_adj)
    # Copy so callers get an independent array, detached from scipy's result.
    return perm.copy()

3198
# Load the registered C API functions (e.g. _CAPI_DGLToSimpleHetero,
# _CAPI_DGLHeteroSortOutEdges) into this module's namespace.
_init_api("dgl.transform", __name__)