"src/diffusers/models/controlnet.py" did not exist on "1a6fa69ab610586dad912c2b8d72bef9e3f209ee"
Unverified Commit c3a33407 authored by Quan (Andy) Gan, committed by GitHub

[Feature] PinSAGE sampler (#1249)



* [WIP] PinSAGE operators

* moved the edge remapping mess into C

* some docstrings

* lint

* lint x2

* lint x3

* skip gpu test on topk

* extend pinsage to any metapath

* lint x4

* addresses #1265

* add always_preserve (fixes #1266) and fix a silly bug

* disable gpu test on compaction

* lint

* fix a horrible bug and add more tests

* lint

* addresses comments

* lint

* bugfix

* addresses comments
Co-authored-by: Minjie Wang <minjie.wang@nyu.edu>
parent 87bca129
@@ -115,6 +115,26 @@ template<typename ValueType>
ValueType IndexSelect(NDArray array, uint64_t index);
NDArray IndexSelect(NDArray array, IdArray index);
/*!
* \brief Permute the elements of an array according to given indices.
*
* Equivalent to:
*
* <code>
* result = np.zeros_like(array)
* result[indices] = array
* </code>
*/
NDArray Scatter(NDArray array, IdArray indices);
/*!
* \brief Repeat each element a number of times. Equivalent to np.repeat(array, repeats)
* \param array A 1D vector
* \param repeats A 1D integer vector for number of times to repeat for each element in
* \c array. Must have the same shape as \c array.
*/
NDArray Repeat(NDArray array, IdArray repeats);
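For intuition, here is a minimal NumPy sketch of the semantics of the two kernels (the helper names are illustrative, not part of the DGL API):

import numpy as np

def scatter(array, indices):
    # result[indices[i]] = array[i]; assumes `indices` is a permutation.
    result = np.zeros_like(array)
    result[indices] = array
    return result

def repeat(array, repeats):
    # Repeat element i of `array` repeats[i] times, like np.repeat.
    return np.repeat(array, repeats)

assert np.array_equal(scatter(np.array([10, 20, 30]), np.array([2, 0, 1])),
                      np.array([20, 30, 10]))
assert np.array_equal(repeat(np.array([1, 2]), np.array([3, 1])),
                      np.array([1, 1, 1, 2]))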
/*!
* \brief Relabel the given ids to consecutive ids.
*
@@ -234,9 +254,9 @@ struct CSRMatrix {
* Note that we do allow duplicate non-zero entries -- multiple non-zero entries
* that have the same row, col indices. It corresponds to multigraph in
* graph terminology.
*
* We call a COO matrix *coalesced* if its row indices are sorted.
*/
// TODO(BarclayII): Graph queries on COO formats should support the case where
// data ordered by rows/columns instead of EID.
struct COOMatrix {
/*! \brief the dense shape of the matrix */
int64_t num_rows = 0, num_cols = 0;
@@ -523,6 +543,12 @@ COOMatrix COOSliceMatrix(COOMatrix coo, runtime::NDArray rows, runtime::NDArray
/*! \return True if the matrix has duplicate entries */
bool COOHasDuplicate(COOMatrix coo);
/*!
* \brief Deduplicate the entries of a sorted COO matrix, replacing the data with the
* number of occurrences of the row-col coordinates.
*/
std::pair<COOMatrix, IdArray> COOCoalesce(COOMatrix coo);
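As a rough NumPy illustration of the coalescing semantics (not the DGL implementation; assumes the (row, col) pairs are already sorted):

import numpy as np

def coo_coalesce(row, col):
    # Collapse duplicate (row, col) pairs and count their occurrences.
    pairs = np.stack([row, col], axis=1)
    uniq, count = np.unique(pairs, axis=0, return_counts=True)
    return uniq[:, 0], uniq[:, 1], count

row = np.array([0, 1, 1, 1, 1, 2])
col = np.array([1, 3, 3, 4, 4, 2])
new_row, new_col, count = coo_coalesce(row, col)
# new_row = [0 1 1 2], new_col = [1 3 4 2], count = [1 2 2 1]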
/*!
* \brief Sort the indices of a COO matrix.
*
@@ -619,7 +645,7 @@ COOMatrix COORowWiseTopk(
COOMatrix mat,
IdArray rows,
int64_t k,
-    FloatArray weight,
+    NDArray weight,
bool ascending = false);
// inline implementations
...
@@ -594,19 +594,6 @@ HeteroGraphPtr CreateFromCSR(
IdArray indptr, IdArray indices, IdArray edge_ids,
SparseFormat restrict_format = SparseFormat::ANY);
/*!
* \brief Given a list of graphs, remove the common nodes that do not have inbound and
* outbound edges.
*
* The graphs should have identical node ID space (i.e. should have the same set of nodes,
* including types and IDs) and metagraph.
*
* \return A pair. The first element is the list of compacted graphs, and the second
* element is the mapping from the compacted graphs and the original graph.
*/
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
CompactGraphs(const std::vector<HeteroGraphPtr> &graphs);
/*!
* \brief Extract the subgraph of the in edges of the given nodes.
* \param graph Graph
...
/*!
* Copyright (c) 2019 by Contributors
* \file dgl/transform.h
* \brief DGL graph transformations
*/
#ifndef DGL_TRANSFORM_H_
#define DGL_TRANSFORM_H_
#include <vector>
#include <tuple>
#include <utility>
#include "base_heterograph.h"
#include "array.h"
namespace dgl {
namespace transform {
/*!
* \brief Given a list of graphs, remove the common nodes that do not have inbound and
* outbound edges.
*
* The graphs should have identical node ID space (i.e. should have the same set of nodes,
* including types and IDs) and metagraph.
*
* \param graphs The list of graphs.
* \param always_preserve The list of nodes to preserve regardless of whether the inbound
* or outbound edges exist.
*
* \return A pair. The first element is the list of compacted graphs, and the second
* element is the mapping from node IDs in the compacted graphs to node IDs in the
* original graphs.
*/
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
CompactGraphs(
const std::vector<HeteroGraphPtr> &graphs,
const std::vector<IdArray> &always_preserve);
/*!
* \brief Convert a multigraph to a simple graph.
*
* \return A triplet of
* * \c hg : The resulting simple graph.
* * \c count : The array of edge occurrences per edge type.
* * \c edge_map : The mapping from original edge IDs to new edge IDs per edge type.
*
* \note Example: consider the following graph:
*
* g = dgl.graph([(0, 1), (1, 3), (2, 2), (1, 3), (1, 4), (1, 4)])
*
* Then ToSimpleGraph(g) would yield the following elements:
*
* * The first element would be the simple graph itself:
*
* simple_g = dgl.graph([(0, 1), (1, 3), (1, 4), (2, 2)])
*
* * The second element is an array \c count. \c count[i] stands for the number of edges
* connecting simple_g.src[i] and simple_g.dst[i] in the original graph.
*
* count[0] = [1, 2, 2, 1]
*
* * The third element \c edge_map maps each edge in the original graph to the
* corresponding edge in the new simple graph.
*
* edge_map[0] = [0, 1, 3, 1, 2, 2]
*/
std::tuple<HeteroGraphPtr, std::vector<IdArray>, std::vector<IdArray>>
ToSimpleGraph(const HeteroGraphPtr graph);
}; // namespace transform
}; // namespace dgl
#endif // DGL_TRANSFORM_H_
"""Module for converting graph from/to other object.""" """Module for converting graph from/to other object."""
from collections import defaultdict from collections import defaultdict
from collections.abc import Iterable
import numpy as np import numpy as np
import scipy as sp import scipy as sp
import networkx as nx import networkx as nx
...@@ -20,7 +19,6 @@ __all__ = [ ...@@ -20,7 +19,6 @@ __all__ = [
'to_hetero', 'to_hetero',
'to_homo', 'to_homo',
'to_networkx', 'to_networkx',
'compact_graphs',
] ]
def graph(data, ntype='_N', etype='_E', card=None, validate=True, restrict_format='any', def graph(data, ntype='_N', etype='_E', card=None, validate=True, restrict_format='any',
...@@ -724,98 +722,6 @@ def to_homo(G): ...@@ -724,98 +722,6 @@ def to_homo(G):
return retg return retg
def compact_graphs(graphs):
"""Given a list of graphs with the same set of nodes, find and eliminate the common
isolated nodes across all graphs.
This function requires the graphs to have the same set of nodes (i.e. the node types
must be the same, and the number of nodes of each node type must be the same). The
metagraph does not have to be the same.
It finds all the nodes that have zero in-degree and zero out-degree in all the given
graphs, and eliminates them from all the graphs.
Useful for graph sampling where we have a giant graph but we only wish to perform
message passing on a smaller graph with a (tiny) subset of nodes.
The node and edge features are not preserved.
Parameters
----------
graphs : DGLHeteroGraph or list[DGLHeteroGraph]
The graph, or list of graphs
Returns
-------
DGLHeteroGraph or list[DGLHeteroGraph]
The compacted graph or list of compacted graphs.
Each returned graph would have a feature ``dgl.NID`` containing the mapping
of node IDs for each type from the compacted graph(s) to the original graph(s).
Note that the mapping is the same for all the compacted graphs.
Examples
--------
The following code constructs a bipartite graph with 20 users and 10 games, but
only user #1 and #3, as well as game #3 and #5, have connections:
>>> g = dgl.bipartite([(1, 3), (3, 5)], 'user', 'plays', 'game', card=(20, 10))
The following would compact the graph above to another bipartite graph with only
two users and two games.
>>> new_g, induced_nodes = dgl.compact_graphs(g)
>>> induced_nodes
{'user': tensor([1, 3]), 'game': tensor([3, 5])}
The mapping tells us that only user #1 and #3 as well as game #3 and #5 are kept.
Furthermore, the first user and second user in the compacted graph maps to
user #1 and #3 in the original graph. Games are similar.
One can verify that the edge connections are kept the same in the compacted graph.
>>> new_g.edges(form='all', order='eid', etype='plays')
(tensor([0, 1]), tensor([0, 1]), tensor([0, 1]))
When compacting multiple graphs, nodes that do not have any connections in any
of the given graphs are removed. So if we compact ``g`` and the following ``g2``
graphs together:
>>> g2 = dgl.bipartite([(1, 6), (6, 8)], 'user', 'plays', 'game', card=(20, 10))
>>> (new_g, new_g2), induced_nodes = dgl.compact_graphs([g, g2])
>>> induced_nodes
{'user': tensor([1, 3, 6]), 'game': tensor([3, 5, 6, 8])}
Then one can see that user #1 from both graphs, users #3 from the first graph, as
well as user #6 from the second graph, are kept. Games are similar.
Similarly, one can also verify the connections:
>>> new_g.edges(form='all', order='eid', etype='plays')
(tensor([0, 1]), tensor([0, 1]), tensor([0, 1]))
>>> new_g2.edges(form='all', order='eid', etype='plays')
(tensor([0, 2]), tensor([2, 3]), tensor([0, 1]))
"""
return_single = False
if not isinstance(graphs, Iterable):
graphs = [graphs]
return_single = True
new_graph_indexes, induced_nodes = heterograph_index.compact_graph_indexes(
[g._graph for g in graphs])
new_graphs = [
DGLHeteroGraph(new_graph_index, graph.ntypes, graph.etypes)
for new_graph_index, graph in zip(new_graph_indexes, graphs)]
for g in new_graphs:
for i, ntype in enumerate(graphs[0].ntypes):
g.nodes[ntype].data[NID] = induced_nodes[i]
if return_single:
new_graphs = new_graphs[0]
return new_graphs
############################################################
# Internal APIs
############################################################
...
@@ -145,6 +145,16 @@ class HeteroGraphIndex(ObjectBase):
_CAPI_DGLHeteroClear(self)
self._cache.clear()
def dtype(self):
"""Return the data type of this graph index.
Returns
-------
DGLDataType
The data type of the graph.
"""
return _CAPI_DGLHeteroDataType(self)
def ctx(self):
"""Return the context of this graph index.
@@ -1069,31 +1079,6 @@ def disjoint_partition(graph, bnn_all_types, bne_all_types):
return _CAPI_DGLHeteroDisjointPartitionBySizes(
graph, bnn_all_types.todgltensor(), bne_all_types.todgltensor())
def compact_graph_indexes(graphs):
"""Given a list of graphs, remove the common nodes that do not have inbound and
outbound edges.
The graphs should have identical node space (i.e. should have the same set of
nodes, including types and IDs) and metagraph.
Parameters
----------
graph : list[HeteroGraphIndex]
List of heterographs.
Returns
-------
list[HeteroGraphIndex]
A list of compacted heterographs.
The returned heterographs also have the same metagraph, which is identical
to the original heterographs.
The returned heterographs also have identical node space.
list[Tensor]
The induced node IDs of each node type.
"""
new_graphs, induced_nodes = _CAPI_DGLCompactGraphs(graphs)
return new_graphs, [F.zerocopy_from_dgl_ndarray(nodes.data) for nodes in induced_nodes]
@register_object("graph.FlattenedHeteroGraph")
class FlattenedHeteroGraph(ObjectBase):
"""FlattenedHeteroGraph object class in C++ backend."""
...
"""Sampler modules."""
from .randomwalks import *
from .pinsage import *
from .neighbor import *
@@ -7,7 +7,9 @@ from ..heterograph import DGLHeteroGraph
from .. import ndarray as nd
from .. import utils
-__all__ = ['sample_neighbors', 'sample_neighbors_topk']
+__all__ = [
+    'sample_neighbors',
+    'select_topk']
def sample_neighbors(g, nodes, fanout, edge_dir='in', prob=None, replace=True):
"""Sample from the neighbors of the given nodes and return the induced subgraph.
@@ -82,7 +84,7 @@ def sample_neighbors(g, nodes, fanout, edge_dir='in', prob=None, replace=True):
ret.edges[etype].data[EID] = induced_edges[i].tousertensor()
return ret
-def sample_neighbors_topk(g, nodes, k, weight, edge_dir='in', ascending=False):
+def select_topk(g, k, weight, nodes=None, edge_dir='in', ascending=False):
"""Select the neighbors with k-largest weights on the connecting edges for each given node. """Select the neighbors with k-largest weights on the connecting edges for each given node.
If k > the number of neighbors, all the neighbors are sampled. If k > the number of neighbors, all the neighbors are sampled.
...@@ -94,15 +96,15 @@ def sample_neighbors_topk(g, nodes, k, weight, edge_dir='in', ascending=False): ...@@ -94,15 +96,15 @@ def sample_neighbors_topk(g, nodes, k, weight, edge_dir='in', ascending=False):
---------- ----------
g : DGLHeteroGraph g : DGLHeteroGraph
Full graph structure. Full graph structure.
nodes : tensor or dict
Node ids to sample neighbors from. The allowed types
are dictionary of node types to node id tensors, or simply node id
tensor if the given graph g has only one type of nodes.
k : int
The K value.
weight : str
Feature name of the weights associated with each edge. Its shape should be
compatible with a scalar edge feature tensor.
nodes : tensor or dict, optional
Node ids to sample neighbors from. The allowed types
are dictionary of node types to node id tensors, or simply node id
tensor if the given graph g has only one type of nodes.
edge_dir : str, optional
Edge direction ('in' or 'out'). If 'in', sample from in edges.
Otherwise, sample from out edges.
@@ -112,14 +114,19 @@ def sample_neighbors_topk(g, nodes, k, weight, edge_dir='in', ascending=False):
Returns
-------
-    DGLGraph
+    DGLHeteroGraph
A sampled subgraph by top k criterion. The sampled subgraph has the same
metagraph as the original one.
"""
-    if not isinstance(nodes, dict):
+    # Rectify nodes to a dictionary
if nodes is None:
nodes = {ntype: F.arange(0, g.number_of_nodes(ntype)) for ntype in g.ntypes}
elif not isinstance(nodes, dict):
if len(g.ntypes) > 1:
raise DGLError("Must specify node type when the graph is not homogeneous.")
nodes = {g.ntypes[0] : nodes}
# Parse nodes into a list of NDArrays.
nodes_all_types = []
for ntype in g.ntypes:
if ntype in nodes:
...
"""PinSAGE sampler & related functions and classes"""
import numpy as np
from .. import backend as F
from .. import convert
from .. import transform
from .randomwalks import random_walk
from .neighbor import select_topk
from ..base import EID
class RandomWalkNeighborSampler(object):
"""PinSAGE-like sampler extended to any heterographs, given a metapath.
Given a heterogeneous graph, this neighbor sampler would generate a homogeneous
graph where the neighbors of each node are the most commonly visited nodes of the
same type by random walk with restarts. The random walks with restart are based
on a given metapath, which should have the same beginning and ending node type.
The homogeneous graph also has a feature that stores the number of visits to
the corresponding neighbors from the seed nodes.
This is a generalization of the PinSAGE sampler, which works only on bidirectional
bipartite graphs.
Parameters
----------
G : DGLHeteroGraph
The heterogeneous graph.
random_walk_length : int
The maximum number of steps of random walk with restarts.
Note that here we consider a full traversal of the given metapath as a single
random walk "step" (i.e. a single step may consist of multiple hops).
Usually considered a hyperparameter.
random_walk_restart_prob : float
Restart probability of random walk with restarts.
Note that a random walk would only halt after a full traversal of the metapath;
it will never halt in the middle of a metapath.
Usually considered a hyperparameter.
num_random_walks : int
Number of random walks to try for each seed node.
Usually considered a hyperparameter.
num_neighbors : int
Number of neighbors to select for each seed.
metapath : list[str] or list[tuple[str, str, str]], optional
The metapath.
If not given, assumes that the graph is homogeneous.
weight_column : str, default "weights"
The weight of each neighbor, stored as an edge feature.
Inputs
------
seed_nodes : Tensor
A tensor of seed node IDs of node type ``ntype``.
Outputs
-------
g : DGLHeteroGraph
A homogeneous graph constructed by selecting neighbors for each seed node according
to PinSAGE algorithm.
Examples
--------
See examples in :any:`PinSAGESampler`.
"""
def __init__(self, G, random_walk_length, random_walk_restart_prob,
num_random_walks, num_neighbors, metapath=None, weight_column='weights'):
self.G = G
self.weight_column = weight_column
self.num_random_walks = num_random_walks
self.num_neighbors = num_neighbors
self.random_walk_length = random_walk_length
if metapath is None:
if len(G.ntypes) > 1 or len(G.etypes) > 1:
raise ValueError('Metapath must be specified if the graph has more than one node or edge type.')
metapath = [G.canonical_etypes[0]]
start_ntype = G.to_canonical_etype(metapath[0])[0]
end_ntype = G.to_canonical_etype(metapath[-1])[-1]
if start_ntype != end_ntype:
raise ValueError('The metapath must start and end at the same node type.')
self.ntype = start_ntype
self.metapath_hops = len(metapath)
self.metapath = metapath
self.full_metapath = metapath * random_walk_length
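# Restart can only happen at metapath boundaries: the per-hop restart
# probability is zero within a traversal and random_walk_restart_prob right
# after each full traversal of the metapath.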
restart_prob = np.zeros(self.metapath_hops * random_walk_length)
restart_prob[self.metapath_hops::self.metapath_hops] = random_walk_restart_prob
self.restart_prob = F.zerocopy_from_numpy(restart_prob)
# pylint: disable=no-member
def __call__(self, seed_nodes):
seed_nodes = F.repeat(seed_nodes, self.num_random_walks, 0)
paths, _ = random_walk(
self.G, seed_nodes, metapath=self.full_metapath, restart_prob=self.restart_prob)
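# Positions at multiples of metapath_hops are the nodes of type self.ntype
# reached after each full metapath traversal; these are the neighbor
# candidates. Column 0 of `paths` holds the seed node itself.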
src = F.reshape(paths[:, self.metapath_hops::self.metapath_hops], (-1,))
dst = F.repeat(paths[:, 0], self.random_walk_length, 0)
src_mask = (src != -1)
src = F.boolean_mask(src, src_mask)
dst = F.boolean_mask(dst, src_mask)
# count the number of visits and pick the K-most frequent neighbors for each node
neighbor_graph = convert.graph(
(src, dst), card=self.G.number_of_nodes(self.ntype), ntype=self.ntype)
neighbor_graph = transform.to_simple(neighbor_graph, return_counts=self.weight_column)
counts = neighbor_graph.edata[self.weight_column]
neighbor_graph = select_topk(neighbor_graph, self.num_neighbors, self.weight_column)
selected_counts = F.gather_row(counts, neighbor_graph.edata[EID])
neighbor_graph.edata[self.weight_column] = selected_counts
return neighbor_graph
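As a usage sketch (assuming ``RandomWalkNeighborSampler`` is exported under ``dgl.sampling`` like ``PinSAGESampler`` below; the graph is the same random bipartite graph used in the ``PinSAGESampler`` example):

import torch
import dgl
import scipy.sparse as ssp

spmat = ssp.random(3000, 5000, 0.003)
G = dgl.heterograph({
    ('A', 'AB', 'B'): spmat,
    ('B', 'BA', 'A'): spmat.T})

# Equivalent to PinSAGESampler(G, 'A', 'B', 3, 0.5, 200, 10), but with the
# metapath spelled out explicitly.
sampler = dgl.sampling.RandomWalkNeighborSampler(
    G, random_walk_length=3, random_walk_restart_prob=0.5,
    num_random_walks=200, num_neighbors=10, metapath=['AB', 'BA'])
frontier = sampler(torch.LongTensor([0, 1, 2]))
# `frontier` is a homogeneous graph over the 3000 'A' nodes; its edge feature
# 'weights' counts how often each selected neighbor was visited.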
class PinSAGESampler(RandomWalkNeighborSampler):
"""PinSAGE neighbor sampler.
Given a bidirectional bipartite graph, PinSAGE neighbor sampler would generate
a homogeneous graph where the neighbors of each node are the most commonly visited
nodes of the same type by random walk with restarts.
Parameters
----------
G : DGLHeteroGraph
The bidirectional bipartite graph.
The graph should only have two node types: ``ntype`` and ``other_type``.
The graph should only have two edge types, one connecting from ``ntype`` to
``other_type``, and another connecting from ``other_type`` to ``ntype``.
PinSAGE works on a bidirectional bipartite graph where for each edge
going from node u to node v, there exists an edge going from node v to node u.
ntype : str
The node type for which the graph would be constructed on.
other_type : str
The other node type.
random_walk_length : int
The maximum number of steps of random walk with restarts.
Note that here we consider traversing from ``ntype`` to ``other_type`` then back
to ``ntype`` as a single step (i.e. a single step consists of two hops).
Usually considered a hyperparameter.
random_walk_restart_prob : float
Restart probability of random walk with restarts.
Note that the random walks would only halt on node type ``ntype``, and would
never halt on ``other_type``.
Usually considered a hyperparameter.
num_random_walks : int
Number of random walks to try for each seed node.
Usually considered a hyperparameter.
num_neighbors : int
Number of neighbors to select for each seed.
weight_column : str, default "weights"
The weight of each neighbor, stored as an edge feature.
Inputs
------
seed_nodes : Tensor
A tensor of seed node IDs of node type ``ntype``.
Outputs
-------
g : DGLHeteroGraph
A homogeneous graph constructed by selecting neighbors for each seed node according
to PinSAGE algorithm.
Examples
--------
Generate a random bidirectional bipartite graph with 3000 "A" nodes and 5000 "B" nodes.
>>> g = scipy.sparse.random(3000, 5000, 0.003)
>>> G = dgl.heterograph({
... ('A', 'AB', 'B'): g,
... ('B', 'BA', 'A'): g.T})
Then we create a PinSAGE neighbor sampler that samples a graph of node type "A". Each
node would have (a maximum of) 10 neighbors.
>>> sampler = dgl.sampling.PinSAGESampler(G, 'A', 'B', 3, 0.5, 200, 10)
This is how we select the neighbors for node #0, #1 and #2 of type "A" according to
PinSAGE algorithm:
>>> seeds = torch.LongTensor([0, 1, 2])
>>> frontier = sampler(seeds)
>>> frontier.all_edges(form='uv')
(tensor([ 230, 0, 802, 47, 50, 1639, 1533, 406, 2110, 2687, 2408, 2823,
0, 972, 1230, 1658, 2373, 1289, 1745, 2918, 1818, 1951, 1191, 1089,
1282, 566, 2541, 1505, 1022, 812]),
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2]))
For an end-to-end example of PinSAGE model, including sampling on multiple layers
and computing with the sampled graphs, please refer to [TODO]
References
----------
Graph Convolutional Neural Networks for Web-Scale Recommender Systems
Ying et al., 2018, https://arxiv.org/abs/1806.01973
"""
def __init__(self, G, ntype, other_type, random_walk_length, random_walk_restart_prob,
num_random_walks, num_neighbors, weight_column='weights'):
metagraph = G.metagraph
fw_etype = list(metagraph[ntype][other_type])[0]
bw_etype = list(metagraph[other_type][ntype])[0]
super().__init__(G, random_walk_length,
random_walk_restart_prob, num_random_walks, num_neighbors,
metapath=[fw_etype, bw_etype], weight_column=weight_column)
@@ -46,7 +46,7 @@ def random_walk(g, nodes, *, metapath=None, length=None, prob=None, restart_prob
The feature tensor must be non-negative.
If omitted, we assume the neighbors are picked uniformly.
restart_prob : float or Tensor, optional
-        Probability to stop after each step.
+        Probability to stop at each step.
If a tensor is given, ``restart_prob`` should have the same length as ``metapath``.
Returns
...
"""Module for graph transformation utilities.""" """Module for graph transformation utilities."""
from collections.abc import Iterable, Mapping
import numpy as np import numpy as np
from scipy import sparse from scipy import sparse
from ._ffi.function import _init_api from ._ffi.function import _init_api
from .base import EID
from .graph import DGLGraph from .graph import DGLGraph
from .heterograph import DGLHeteroGraph from .heterograph import DGLHeteroGraph
from . import ndarray as nd from . import ndarray as nd
...@@ -15,11 +15,27 @@ from .graph_index import _get_halo_subgraph_inner_edge ...@@ -15,11 +15,27 @@ from .graph_index import _get_halo_subgraph_inner_edge
from .batched_graph import BatchedDGLGraph, unbatch from .batched_graph import BatchedDGLGraph, unbatch
from .convert import graph, bipartite from .convert import graph, bipartite
from . import utils from . import utils
from .base import EID, NID
from . import ndarray as nd
-__all__ = ['line_graph', 'khop_adj', 'khop_graph', 'reverse', 'to_simple_graph', 'to_bidirected',
-           'laplacian_lambda_max', 'knn_graph', 'segmented_knn_graph', 'add_self_loop',
-           'remove_self_loop', 'metapath_reachable_graph', 'in_subgraph', 'out_subgraph']
+__all__ = [
+    'line_graph',
+    'khop_adj',
+    'khop_graph',
+    'reverse',
+    'to_simple_graph',
+    'to_bidirected',
+    'laplacian_lambda_max',
+    'knn_graph',
+    'segmented_knn_graph',
+    'add_self_loop',
+    'remove_self_loop',
+    'metapath_reachable_graph',
+    'compact_graphs',
+    'to_simple',
+    'in_subgraph',
+    'out_subgraph']
def pairwise_squared_distance(x):
@@ -565,6 +581,139 @@ def partition_graph_with_halo(g, node_part, num_hops):
subg_dict[i] = subg
return subg_dict
def compact_graphs(graphs, always_preserve=None):
"""Given a list of graphs with the same set of nodes, find and eliminate the common
isolated nodes across all graphs.
This function requires the graphs to have the same set of nodes (i.e. the node types
must be the same, and the number of nodes of each node type must be the same). The
metagraph does not have to be the same.
It finds all the nodes that have zero in-degree and zero out-degree in all the given
graphs, and eliminates them from all the graphs.
Useful for graph sampling where we have a giant graph but we only wish to perform
message passing on a smaller graph with a (tiny) subset of nodes.
The node and edge features are not preserved.
Parameters
----------
graphs : DGLHeteroGraph or list[DGLHeteroGraph]
The graph, or list of graphs
always_preserve : Tensor or dict[str, Tensor], optional
If a dict of node types and node ID tensors is given, the nodes of given
node types would not be removed, regardless of whether they are isolated.
If a Tensor is given, assume that all the graphs have one (same) node type.
Returns
-------
DGLHeteroGraph or list[DGLHeteroGraph]
The compacted graph or list of compacted graphs.
Each returned graph would have a feature ``dgl.NID`` containing the mapping
of node IDs for each type from the compacted graph(s) to the original graph(s).
Note that the mapping is the same for all the compacted graphs.
Bugs
----
This function currently requires that all the graphs assign the same node type IDs
to the same node types, i.e. the node types are *ordered* the same.
Examples
--------
The following code constructs a bipartite graph with 20 users and 10 games, but
only user #1 and #3, as well as game #3 and #5, have connections:
>>> g = dgl.bipartite([(1, 3), (3, 5)], 'user', 'plays', 'game', card=(20, 10))
The following would compact the graph above to another bipartite graph with only
two users and two games.
>>> new_g, induced_nodes = dgl.compact_graphs(g)
>>> induced_nodes
{'user': tensor([1, 3]), 'game': tensor([3, 5])}
The mapping tells us that only user #1 and #3 as well as game #3 and #5 are kept.
Furthermore, the first user and second user in the compacted graph maps to
user #1 and #3 in the original graph. Games are similar.
One can verify that the edge connections are kept the same in the compacted graph.
>>> new_g.edges(form='all', order='eid', etype='plays')
(tensor([0, 1]), tensor([0, 1]), tensor([0, 1]))
When compacting multiple graphs, nodes that do not have any connections in any
of the given graphs are removed. So if we compact ``g`` and the following ``g2``
graphs together:
>>> g2 = dgl.bipartite([(1, 6), (6, 8)], 'user', 'plays', 'game', card=(20, 10))
>>> (new_g, new_g2), induced_nodes = dgl.compact_graphs([g, g2])
>>> induced_nodes
{'user': tensor([1, 3, 6]), 'game': tensor([3, 5, 6, 8])}
Then one can see that user #1 from both graphs, users #3 from the first graph, as
well as user #6 from the second graph, are kept. Games are similar.
Similarly, one can also verify the connections:
>>> new_g.edges(form='all', order='eid', etype='plays')
(tensor([0, 1]), tensor([0, 1]), tensor([0, 1]))
>>> new_g2.edges(form='all', order='eid', etype='plays')
(tensor([0, 2]), tensor([2, 3]), tensor([0, 1]))
"""
return_single = False
if not isinstance(graphs, Iterable):
graphs = [graphs]
return_single = True
if len(graphs) == 0:
return []
# Ensure the node types are ordered the same.
# TODO(BarclayII): we ideally need to remove this constraint.
ntypes = graphs[0].ntypes
graph_dtype = graphs[0]._graph.dtype()
graph_ctx = graphs[0]._graph.ctx()
for g in graphs:
assert ntypes == g.ntypes, \
("All graphs should have the same node types in the same order, got %s and %s" %
(ntypes, g.ntypes))
assert graph_dtype == g._graph.dtype(), "Graph data type mismatch"
assert graph_ctx == g._graph.ctx(), "Graph device mismatch"
# Process the dictionary or tensor of "always preserve" nodes
if always_preserve is None:
always_preserve = {}
elif not isinstance(always_preserve, Mapping):
if len(ntypes) > 1:
raise ValueError("Node type must be given if multiple node types exist.")
always_preserve = {ntypes[0]: always_preserve}
always_preserve_nd = []
for ntype in ntypes:
nodes = always_preserve.get(ntype, None)
if nodes is None:
nodes = nd.empty([0], graph_dtype, graph_ctx)
else:
nodes = F.zerocopy_to_dgl_ndarray(nodes)
always_preserve_nd.append(nodes)
# Compact and construct heterographs
new_graph_indexes, induced_nodes = _CAPI_DGLCompactGraphs(
[g._graph for g in graphs], always_preserve_nd)
induced_nodes = [F.zerocopy_from_dgl_ndarray(nodes.data) for nodes in induced_nodes]
new_graphs = [
DGLHeteroGraph(new_graph_index, graph.ntypes, graph.etypes)
for new_graph_index, graph in zip(new_graph_indexes, graphs)]
for g in new_graphs:
for i, ntype in enumerate(graphs[0].ntypes):
g.nodes[ntype].data[NID] = induced_nodes[i]
if return_single:
new_graphs = new_graphs[0]
return new_graphs
def in_subgraph(g, nodes):
"""Extract the subgraph containing only the in edges of the given nodes.
@@ -572,7 +721,6 @@ def in_subgraph(g, nodes):
Node/edge features are not preserved. The original IDs of
the extracted edges are stored as the `dgl.EID` feature in the returned graph.
Parameters
----------
g : DGLHeteroGraph
@@ -612,7 +760,6 @@ def out_subgraph(g, nodes):
Node/edge features are not preserved. The original IDs of
the extracted edges are stored as the `dgl.EID` feature in the returned graph.
Parameters
----------
g : DGLHeteroGraph
@@ -645,4 +792,68 @@ def out_subgraph(g, nodes):
ret.edges[etype].data[EID] = induced_edges[i].tousertensor()
return ret
def to_simple(g, return_counts='count', writeback_mapping=None):
"""Convert a heterogeneous multigraph to a heterogeneous simple graph, coalescing
duplicate edges into one.
This function does not preserve node and edge features.
Parameters
----------
g : DGLHeteroGraph
The heterogeneous graph
return_counts : str, optional
If given, the returned graph would have an edge feature with this name that stores
the number of duplicate edges from the original graph.
writeback_mapping : str, optional
If given, the mapping from the edge IDs of the original graph to those of the returned
graph would be written into an edge feature with this name in the original graph for
each edge type.
Returns
-------
DGLHeteroGraph
The new heterogeneous simple graph.
Examples
--------
Consider the following graph
>>> g = dgl.graph([(0, 1), (1, 3), (2, 2), (1, 3), (1, 4), (1, 4)])
>>> sg = dgl.to_simple(g, return_counts='weights', writeback_mapping='new_eid')
The returned graph would have duplicate edges connecting (1, 3) and (1, 4) removed:
>>> sg.all_edges(form='uv', order='eid')
(tensor([0, 1, 1, 2]), tensor([1, 3, 4, 2]))
If ``return_counts`` is set, the returned graph will also return how many edges
in the original graph are connecting the endpoints of the edges in the new graph:
>>> sg.edata['weights']
tensor([1, 2, 2, 1])
This essentially reads that one edge is connecting (0, 1) in ``g``, whereas 2 edges
are connecting (1, 3) in ``g``, etc.
One can also retrieve the mapping from the edges in the original graph to edges in
the new graph by setting ``writeback_mapping`` and running
>>> g.edata['new_eid']
tensor([0, 1, 3, 1, 2, 2])
This tells us that the first edge in ``g`` is mapped to the first edge in ``sg``, and
the second and the fourth edge are mapped to the second edge in ``sg``, etc.
"""
simple_graph_index, counts, edge_maps = _CAPI_DGLToSimpleHetero(g._graph)
simple_graph = DGLHeteroGraph(simple_graph_index, g.ntypes, g.etypes)
counts = [F.zerocopy_from_dgl_ndarray(count.data) for count in counts]
edge_maps = [F.zerocopy_from_dgl_ndarray(edge_map.data) for edge_map in edge_maps]
if return_counts is not None:
for count, canonical_etype in zip(counts, g.canonical_etypes):
simple_graph.edges[canonical_etype].data[return_counts] = count
if writeback_mapping is not None:
for edge_map, canonical_etype in zip(edge_maps, g.canonical_etypes):
g.edges[canonical_etype].data[writeback_mapping] = edge_map
return simple_graph
_init_api("dgl.transform") _init_api("dgl.transform")
...@@ -230,6 +230,30 @@ template uint64_t IndexSelect<uint64_t>(NDArray array, uint64_t index); ...@@ -230,6 +230,30 @@ template uint64_t IndexSelect<uint64_t>(NDArray array, uint64_t index);
template float IndexSelect<float>(NDArray array, uint64_t index); template float IndexSelect<float>(NDArray array, uint64_t index);
template double IndexSelect<double>(NDArray array, uint64_t index); template double IndexSelect<double>(NDArray array, uint64_t index);
NDArray Scatter(NDArray array, IdArray indices) {
NDArray ret;
ATEN_XPU_SWITCH(array->ctx.device_type, XPU, {
ATEN_DTYPE_SWITCH(array->dtype, DType, "values", {
ATEN_ID_TYPE_SWITCH(indices->dtype, IdType, {
ret = impl::Scatter<XPU, DType, IdType>(array, indices);
});
});
});
return ret;
}
NDArray Repeat(NDArray array, IdArray repeats) {
NDArray ret;
ATEN_XPU_SWITCH(array->ctx.device_type, XPU, {
ATEN_DTYPE_SWITCH(array->dtype, DType, "values", {
ATEN_ID_TYPE_SWITCH(repeats->dtype, IdType, {
ret = impl::Repeat<XPU, DType, IdType>(array, repeats);
});
});
});
return ret;
}
IdArray Relabel_(const std::vector<IdArray>& arrays) {
IdArray ret;
ATEN_XPU_SWITCH(arrays[0]->ctx.device_type, XPU, {
@@ -426,11 +450,11 @@ COOMatrix CSRRowWiseSampling(
}
COOMatrix CSRRowWiseTopk(
-    CSRMatrix mat, IdArray rows, int64_t k, FloatArray weight, bool ascending) {
+    CSRMatrix mat, IdArray rows, int64_t k, NDArray weight, bool ascending) {
COOMatrix ret;
ATEN_CSR_SWITCH(mat, XPU, IdType, {
-    ATEN_FLOAT_TYPE_SWITCH(weight->dtype, FloatType, "weight", {
+    ATEN_DTYPE_SWITCH(weight->dtype, DType, "weight", {
-      ret = impl::CSRRowWiseTopk<XPU, IdType, FloatType>(
+      ret = impl::CSRRowWiseTopk<XPU, IdType, DType>(
mat, rows, k, weight, ascending);
});
});
@@ -580,5 +604,13 @@ COOMatrix COORowWiseTopk(
return ret;
}
std::pair<COOMatrix, IdArray> COOCoalesce(COOMatrix coo) {
std::pair<COOMatrix, IdArray> ret;
ATEN_COO_SWITCH(coo, XPU, IdType, {
ret = impl::COOCoalesce<XPU, IdType>(coo);
});
return ret;
}
} // namespace aten
} // namespace dgl
@@ -42,6 +42,12 @@ NDArray IndexSelect(NDArray array, IdArray index);
template <DLDeviceType XPU, typename DType>
DType IndexSelect(NDArray array, uint64_t index);
template <DLDeviceType XPU, typename DType, typename IdType>
NDArray Scatter(NDArray array, IdArray indices);
template <DLDeviceType XPU, typename DType, typename IdType>
NDArray Repeat(NDArray array, IdArray repeats);
template <DLDeviceType XPU, typename IdType>
IdArray Relabel_(const std::vector<IdArray>& arrays);
@@ -117,9 +123,9 @@ COOMatrix CSRRowWiseSamplingUniform(
CSRMatrix mat, IdArray rows, int64_t num_samples, bool replace);
// DType is the type of the weight data.
-template <DLDeviceType XPU, typename IdType, typename FloatType>
+template <DLDeviceType XPU, typename IdType, typename DType>
COOMatrix CSRRowWiseTopk(
-    CSRMatrix mat, IdArray rows, int64_t k, FloatArray weight, bool ascending);
+    CSRMatrix mat, IdArray rows, int64_t k, NDArray weight, bool ascending);
///////////////////////////////////////////////////////////////////////////////////////////
@@ -164,6 +170,9 @@ COOMatrix COOSliceRows(COOMatrix coo, runtime::NDArray rows);
template <DLDeviceType XPU, typename IdType>
COOMatrix COOSliceMatrix(COOMatrix coo, runtime::NDArray rows, runtime::NDArray cols);
template <DLDeviceType XPU, typename IdType>
std::pair<COOMatrix, IdArray> COOCoalesce(COOMatrix coo);
template <DLDeviceType XPU, typename IdType>
COOMatrix COOSort(COOMatrix mat, bool sort_column);
...
/*!
* Copyright (c) 2020 by Contributors
* \file array/cpu/array_repeat.cc
* \brief Array repeat CPU implementation
*/
#include <dgl/array.h>
#include <algorithm>
namespace dgl {
using runtime::NDArray;
namespace aten {
namespace impl {
template <DLDeviceType XPU, typename DType, typename IdType>
NDArray Repeat(NDArray array, IdArray repeats) {
CHECK(array->shape[0] == repeats->shape[0]) << "shape of array and repeats mismatch";
const int64_t len = array->shape[0];
const DType *array_data = static_cast<DType *>(array->data);
const IdType *repeats_data = static_cast<IdType *>(repeats->data);
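// First pass: the output length is the sum of all repeat counts.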
IdType num_elements = 0;
for (int64_t i = 0; i < len; ++i)
num_elements += repeats_data[i];
NDArray result = NDArray::Empty({num_elements}, array->dtype, array->ctx);
DType *result_data = static_cast<DType *>(result->data);
IdType curr = 0;
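// Second pass: fill each output segment with the corresponding input element.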
for (int64_t i = 0; i < len; ++i) {
std::fill(result_data + curr, result_data + curr + repeats_data[i], array_data[i]);
curr += repeats_data[i];
}
return result;
}
template NDArray Repeat<kDLCPU, int32_t, int32_t>(NDArray, IdArray);
template NDArray Repeat<kDLCPU, int64_t, int32_t>(NDArray, IdArray);
template NDArray Repeat<kDLCPU, float, int32_t>(NDArray, IdArray);
template NDArray Repeat<kDLCPU, double, int32_t>(NDArray, IdArray);
template NDArray Repeat<kDLCPU, int32_t, int64_t>(NDArray, IdArray);
template NDArray Repeat<kDLCPU, int64_t, int64_t>(NDArray, IdArray);
template NDArray Repeat<kDLCPU, float, int64_t>(NDArray, IdArray);
template NDArray Repeat<kDLCPU, double, int64_t>(NDArray, IdArray);
}; // namespace impl
}; // namespace aten
}; // namespace dgl
/*!
* Copyright (c) 2019 by Contributors
* \file array/cpu/array_scatter.cc
* \brief Array scatter CPU implementation
*/
#include <dgl/array.h>
namespace dgl {
using runtime::NDArray;
namespace aten {
namespace impl {
template <DLDeviceType XPU, typename DType, typename IdType>
NDArray Scatter(NDArray array, IdArray indices) {
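// Writes array[i] to position indices[i]; `indices` is expected to be a
// permutation of [0, len), per the doc comment in dgl/array.h.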
NDArray result = NDArray::Empty({indices->shape[0]}, array->dtype, array->ctx);
const DType *array_data = static_cast<DType *>(array->data);
const IdType *indices_data = static_cast<IdType *>(indices->data);
DType *result_data = static_cast<DType *>(result->data);
for (int64_t i = 0; i < indices->shape[0]; ++i)
result_data[indices_data[i]] = array_data[i];
return result;
}
template NDArray Scatter<kDLCPU, int32_t, int32_t>(NDArray, IdArray);
template NDArray Scatter<kDLCPU, int64_t, int32_t>(NDArray, IdArray);
template NDArray Scatter<kDLCPU, float, int32_t>(NDArray, IdArray);
template NDArray Scatter<kDLCPU, double, int32_t>(NDArray, IdArray);
template NDArray Scatter<kDLCPU, int32_t, int64_t>(NDArray, IdArray);
template NDArray Scatter<kDLCPU, int64_t, int64_t>(NDArray, IdArray);
template NDArray Scatter<kDLCPU, float, int64_t>(NDArray, IdArray);
template NDArray Scatter<kDLCPU, double, int64_t>(NDArray, IdArray);
}; // namespace impl
}; // namespace aten
}; // namespace dgl
/*!
* Copyright (c) 2019 by Contributors
* \file array/cpu/coo_coalesce.cc
* \brief COO coalescing
*/
#include <dgl/array.h>
#include <vector>
namespace dgl {
namespace aten {
namespace impl {
template <DLDeviceType XPU, typename IdType>
std::pair<COOMatrix, IdArray> COOCoalesce(COOMatrix coo) {
  // Sort first so that duplicate entries become adjacent; the data pointers
  // must be taken from the sorted matrix.
  if (!coo.row_sorted || !coo.col_sorted)
    coo = COOSort(coo, true);
  const int64_t nnz = coo.row->shape[0];
  const IdType* coo_row_data = static_cast<IdType*>(coo.row->data);
  const IdType* coo_col_data = static_cast<IdType*>(coo.col->data);
std::vector<IdType> new_row, new_col, count;
IdType prev_row = -1, prev_col = -1;
for (int64_t i = 0; i < nnz; ++i) {
const IdType curr_row = coo_row_data[i];
const IdType curr_col = coo_col_data[i];
if (curr_row == prev_row && curr_col == prev_col) {
++count[count.size() - 1];
} else {
new_row.push_back(curr_row);
new_col.push_back(curr_col);
count.push_back(1);
prev_row = curr_row;
prev_col = curr_col;
}
}
COOMatrix coo_result = COOMatrix{
coo.num_rows, coo.num_cols, NDArray::FromVector(new_row), NDArray::FromVector(new_col),
NDArray(), true};
return std::make_pair(coo_result, NDArray::FromVector(count));
}
template std::pair<COOMatrix, IdArray> COOCoalesce<kDLCPU, int32_t>(COOMatrix);
template std::pair<COOMatrix, IdArray> COOCoalesce<kDLCPU, int64_t>(COOMatrix);
}; // namespace impl
}; // namespace aten
}; // namespace dgl
@@ -12,9 +12,9 @@ namespace aten {
namespace impl {
namespace {
-template <typename IdxType, typename FloatType>
+template <typename IdxType, typename DType>
-inline PickFn<IdxType> GetTopkPickFn(int64_t k, FloatArray weight, bool ascending) {
+inline PickFn<IdxType> GetTopkPickFn(int64_t k, NDArray weight, bool ascending) {
-  const FloatType* wdata = static_cast<FloatType*>(weight->data);
+  const DType* wdata = static_cast<DType*>(weight->data);
PickFn<IdxType> pick_fn = [k, ascending, wdata]
(IdxType rowid, IdxType off, IdxType len,
const IdxType* col, const IdxType* data,
@@ -55,37 +55,53 @@ inline PickFn<IdxType> GetTopkPickFn(int64_t k, FloatArray weight, bool ascending) {
}  // namespace
-template <DLDeviceType XPU, typename IdxType, typename FloatType>
+template <DLDeviceType XPU, typename IdxType, typename DType>
COOMatrix CSRRowWiseTopk(
-    CSRMatrix mat, IdArray rows, int64_t k, FloatArray weight, bool ascending) {
+    CSRMatrix mat, IdArray rows, int64_t k, NDArray weight, bool ascending) {
-  auto pick_fn = GetTopkPickFn<IdxType, FloatType>(k, weight, ascending);
+  auto pick_fn = GetTopkPickFn<IdxType, DType>(k, weight, ascending);
return CSRRowWisePick(mat, rows, k, false, pick_fn);
}
template COOMatrix CSRRowWiseTopk<kDLCPU, int32_t, int32_t>(
CSRMatrix, IdArray, int64_t, NDArray, bool);
template COOMatrix CSRRowWiseTopk<kDLCPU, int64_t, int32_t>(
CSRMatrix, IdArray, int64_t, NDArray, bool);
template COOMatrix CSRRowWiseTopk<kDLCPU, int32_t, int64_t>(
CSRMatrix, IdArray, int64_t, NDArray, bool);
template COOMatrix CSRRowWiseTopk<kDLCPU, int64_t, int64_t>(
CSRMatrix, IdArray, int64_t, NDArray, bool);
template COOMatrix CSRRowWiseTopk<kDLCPU, int32_t, float>(
-    CSRMatrix, IdArray, int64_t, FloatArray, bool);
+    CSRMatrix, IdArray, int64_t, NDArray, bool);
template COOMatrix CSRRowWiseTopk<kDLCPU, int64_t, float>(
-    CSRMatrix, IdArray, int64_t, FloatArray, bool);
+    CSRMatrix, IdArray, int64_t, NDArray, bool);
template COOMatrix CSRRowWiseTopk<kDLCPU, int32_t, double>(
-    CSRMatrix, IdArray, int64_t, FloatArray, bool);
+    CSRMatrix, IdArray, int64_t, NDArray, bool);
template COOMatrix CSRRowWiseTopk<kDLCPU, int64_t, double>(
-    CSRMatrix, IdArray, int64_t, FloatArray, bool);
+    CSRMatrix, IdArray, int64_t, NDArray, bool);
-template <DLDeviceType XPU, typename IdxType, typename FloatType>
+template <DLDeviceType XPU, typename IdxType, typename DType>
COOMatrix COORowWiseTopk(
-    COOMatrix mat, IdArray rows, int64_t k, FloatArray weight, bool ascending) {
+    COOMatrix mat, IdArray rows, int64_t k, NDArray weight, bool ascending) {
-  auto pick_fn = GetTopkPickFn<IdxType, FloatType>(k, weight, ascending);
+  auto pick_fn = GetTopkPickFn<IdxType, DType>(k, weight, ascending);
return COORowWisePick(mat, rows, k, false, pick_fn);
}
template COOMatrix COORowWiseTopk<kDLCPU, int32_t, int32_t>(
COOMatrix, IdArray, int64_t, NDArray, bool);
template COOMatrix COORowWiseTopk<kDLCPU, int64_t, int32_t>(
COOMatrix, IdArray, int64_t, NDArray, bool);
template COOMatrix COORowWiseTopk<kDLCPU, int32_t, int64_t>(
COOMatrix, IdArray, int64_t, NDArray, bool);
template COOMatrix COORowWiseTopk<kDLCPU, int64_t, int64_t>(
COOMatrix, IdArray, int64_t, NDArray, bool);
template COOMatrix COORowWiseTopk<kDLCPU, int32_t, float>(
-    COOMatrix, IdArray, int64_t, FloatArray, bool);
+    COOMatrix, IdArray, int64_t, NDArray, bool);
template COOMatrix COORowWiseTopk<kDLCPU, int64_t, float>(
-    COOMatrix, IdArray, int64_t, FloatArray, bool);
+    COOMatrix, IdArray, int64_t, NDArray, bool);
template COOMatrix COORowWiseTopk<kDLCPU, int32_t, double>(
-    COOMatrix, IdArray, int64_t, FloatArray, bool);
+    COOMatrix, IdArray, int64_t, NDArray, bool);
template COOMatrix COORowWiseTopk<kDLCPU, int64_t, double>(
-    COOMatrix, IdArray, int64_t, FloatArray, bool);
+    COOMatrix, IdArray, int64_t, NDArray, bool);
} // namespace impl
} // namespace aten
...
@@ -21,7 +21,10 @@ inline bool operator == (const DLDataType& ty1, const DLDataType& ty2) {
/*! \brief Output the string representation of the data type.*/
inline std::ostream& operator << (std::ostream& os, const DLDataType& ty) {
-  return os << "code=" << ty.code << ",bits=" << ty.bits << "lanes=" << ty.lanes;
+  return os <<
+    "code=" << static_cast<int>(ty.code) <<
+    ",bits=" << static_cast<int>(ty.bits) <<
+    ",lanes=" << static_cast<int>(ty.lanes);
}
/*! \brief Check whether two device contexts are the same.*/
...
@@ -16,9 +16,6 @@
#include "../c_api_common.h"
#include "./unit_graph.h"
#include "graph_serializer.h"
// TODO(BarclayII): currently CompactGraphs depend on IdHashMap implementation which
// only works on CPU. Should fix later to make it device agnostic.
#include "../array/cpu/array_utils.h"
using namespace dgl::runtime;
@@ -115,66 +112,6 @@ HeteroSubgraph EdgeSubgraphNoPreserveNodes(
return ret;
}
template<typename IdType>
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
CompactGraphs(const std::vector<HeteroGraphPtr> &graphs) {
// TODO(BarclayII): check whether the node space and metagraph of each graph is the same.
// Step 1: Collect the nodes that have connections for each type.
std::vector<aten::IdHashMap<IdType>> hashmaps(graphs[0]->NumVertexTypes());
std::vector<std::vector<EdgeArray>> all_edges(graphs.size()); // all_edges[i][etype]
for (size_t i = 0; i < graphs.size(); ++i) {
const HeteroGraphPtr curr_graph = graphs[i];
const int64_t num_etypes = curr_graph->NumEdgeTypes();
for (IdType etype = 0; etype < num_etypes; ++etype) {
IdType srctype, dsttype;
std::tie(srctype, dsttype) = curr_graph->GetEndpointTypes(etype);
const EdgeArray edges = curr_graph->Edges(etype, "eid");
hashmaps[srctype].Update(edges.src);
hashmaps[dsttype].Update(edges.dst);
all_edges[i].push_back(edges);
}
}
// Step 2: Relabel the nodes for each type to a smaller ID space and save the mapping.
std::vector<IdArray> induced_nodes;
for (auto &hashmap : hashmaps)
induced_nodes.push_back(hashmap.Values());
// Step 3: Remap the edges of each graph.
std::vector<HeteroGraphPtr> new_graphs;
for (size_t i = 0; i < graphs.size(); ++i) {
std::vector<HeteroGraphPtr> rel_graphs;
const HeteroGraphPtr curr_graph = graphs[i];
const auto meta_graph = curr_graph->meta_graph();
const int64_t num_etypes = curr_graph->NumEdgeTypes();
for (IdType etype = 0; etype < num_etypes; ++etype) {
IdType srctype, dsttype;
std::tie(srctype, dsttype) = curr_graph->GetEndpointTypes(etype);
const EdgeArray &edges = all_edges[i][etype];
const IdArray mapped_rows = hashmaps[srctype].Map(edges.src, -1);
const IdArray mapped_cols = hashmaps[dsttype].Map(edges.dst, -1);
rel_graphs.push_back(UnitGraph::CreateFromCOO(
srctype == dsttype ? 1 : 2,
induced_nodes[srctype]->shape[0],
induced_nodes[dsttype]->shape[0],
mapped_rows,
mapped_cols));
}
new_graphs.push_back(CreateHeteroGraph(meta_graph, rel_graphs));
}
return std::make_pair(new_graphs, induced_nodes);
}
} // namespace
HeteroGraph::HeteroGraph(GraphPtr meta_graph, const std::vector<HeteroGraphPtr>& rel_graphs)
@@ -589,15 +526,6 @@ HeteroGraphPtr CreateFromCSR(
return HeteroGraphPtr(new HeteroGraph(unit_g->meta_graph(), {unit_g}));
}
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
CompactGraphs(const std::vector<HeteroGraphPtr> &graphs) {
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>> result;
ATEN_ID_TYPE_SWITCH(graphs[0]->DataType(), IdType, {
result = CompactGraphs<IdType>(graphs);
});
return result;
}
constexpr uint64_t kDGLSerialize_HeteroGraph = 0xDD589FBE35224ABF;
bool HeteroGraph::Load(dmlc::Stream* fs) {
@@ -764,6 +692,12 @@ DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroClear")
hg->Clear();
});
DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroDataType")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
HeteroGraphRef hg = args[0];
*rv = hg->DataType();
});
DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroContext") DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroContext")
.set_body([] (DGLArgs args, DGLRetValue* rv) { .set_body([] (DGLArgs args, DGLRetValue* rv) {
HeteroGraphRef hg = args[0]; HeteroGraphRef hg = args[0];
...@@ -996,31 +930,6 @@ DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroEdgeSubgraph") ...@@ -996,31 +930,6 @@ DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroEdgeSubgraph")
*rv = HeteroSubgraphRef(subg); *rv = HeteroSubgraphRef(subg);
}); });
DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLCompactGraphs")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
List<HeteroGraphRef> graph_refs = args[0];
std::vector<HeteroGraphPtr> graphs;
for (HeteroGraphRef gref : graph_refs)
graphs.push_back(gref.sptr());
const auto &result_pair = CompactGraphs(graphs);
List<HeteroGraphRef> compacted_graph_refs;
List<Value> induced_nodes;
for (const HeteroGraphPtr g : result_pair.first)
compacted_graph_refs.push_back(HeteroGraphRef(g));
for (const IdArray &ids : result_pair.second)
induced_nodes.push_back(Value(MakeValue(ids)));
List<ObjectRef> result;
result.push_back(compacted_graph_refs);
result.push_back(induced_nodes);
*rv = result;
});
DGL_REGISTER_GLOBAL("transform._CAPI_DGLInSubgraph") DGL_REGISTER_GLOBAL("transform._CAPI_DGLInSubgraph")
.set_body([] (DGLArgs args, DGLRetValue *rv) { .set_body([] (DGLArgs args, DGLRetValue *rv) {
HeteroGraphRef hg = args[0]; HeteroGraphRef hg = args[0];
......
...@@ -35,7 +35,7 @@ void CheckRandomWalkInputs( ...@@ -35,7 +35,7 @@ void CheckRandomWalkInputs(
for (uint64_t i = 0; i < prob.size(); ++i) {
FloatArray p = prob[i];
CHECK_FLOAT(p, "probability");
-    if (p.GetSize() == 0)
+    if (p.GetSize() != 0)
CHECK_NDIM(p, 1, "probability");
}
}
...
/*!
* Copyright (c) 2019 by Contributors
* \file graph/transform/compact.cc
* \brief Compact graph implementation
*/
#include <dgl/base_heterograph.h>
#include <dgl/transform.h>
#include <dgl/array.h>
#include <dgl/packed_func_ext.h>
#include <vector>
#include <utility>
#include "../../c_api_common.h"
#include "../unit_graph.h"
// TODO(BarclayII): currently CompactGraphs depend on IdHashMap implementation which
// only works on CPU. Should fix later to make it device agnostic.
#include "../../array/cpu/array_utils.h"
namespace dgl {
using namespace dgl::runtime;
using namespace dgl::aten;
namespace transform {
namespace {
template<typename IdType>
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
CompactGraphs(
const std::vector<HeteroGraphPtr> &graphs,
const std::vector<IdArray> &always_preserve) {
// TODO(BarclayII): check whether the node space and metagraph of each graph is the same.
// Step 1: Collect the nodes that have connections for each type.
std::vector<aten::IdHashMap<IdType>> hashmaps(graphs[0]->NumVertexTypes());
std::vector<std::vector<EdgeArray>> all_edges(graphs.size()); // all_edges[i][etype]
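// Seed the hashmaps with the always-preserve nodes first, so that they are
// assigned new IDs even if they turn out to be isolated.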
for (size_t i = 0; i < always_preserve.size(); ++i)
hashmaps[i].Update(always_preserve[i]);
for (size_t i = 0; i < graphs.size(); ++i) {
const HeteroGraphPtr curr_graph = graphs[i];
const int64_t num_etypes = curr_graph->NumEdgeTypes();
for (IdType etype = 0; etype < num_etypes; ++etype) {
IdType srctype, dsttype;
std::tie(srctype, dsttype) = curr_graph->GetEndpointTypes(etype);
const EdgeArray edges = curr_graph->Edges(etype, "eid");
hashmaps[srctype].Update(edges.src);
hashmaps[dsttype].Update(edges.dst);
all_edges[i].push_back(edges);
}
}
// Step 2: Relabel the nodes for each type to a smaller ID space and save the mapping.
std::vector<IdArray> induced_nodes;
for (auto &hashmap : hashmaps)
induced_nodes.push_back(hashmap.Values());
// Step 3: Remap the edges of each graph.
std::vector<HeteroGraphPtr> new_graphs;
for (size_t i = 0; i < graphs.size(); ++i) {
std::vector<HeteroGraphPtr> rel_graphs;
const HeteroGraphPtr curr_graph = graphs[i];
const auto meta_graph = curr_graph->meta_graph();
const int64_t num_etypes = curr_graph->NumEdgeTypes();
for (IdType etype = 0; etype < num_etypes; ++etype) {
IdType srctype, dsttype;
std::tie(srctype, dsttype) = curr_graph->GetEndpointTypes(etype);
const EdgeArray &edges = all_edges[i][etype];
const IdArray mapped_rows = hashmaps[srctype].Map(edges.src, -1);
const IdArray mapped_cols = hashmaps[dsttype].Map(edges.dst, -1);
rel_graphs.push_back(UnitGraph::CreateFromCOO(
srctype == dsttype ? 1 : 2,
induced_nodes[srctype]->shape[0],
induced_nodes[dsttype]->shape[0],
mapped_rows,
mapped_cols));
}
new_graphs.push_back(CreateHeteroGraph(meta_graph, rel_graphs));
}
return std::make_pair(new_graphs, induced_nodes);
}
}; // namespace
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
CompactGraphs(
const std::vector<HeteroGraphPtr> &graphs,
const std::vector<IdArray> &always_preserve) {
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>> result;
// TODO(BarclayII): check for all IdArrays
CHECK(graphs[0]->DataType() == always_preserve[0]->dtype) << "data type mismatch.";
ATEN_ID_TYPE_SWITCH(graphs[0]->DataType(), IdType, {
result = CompactGraphs<IdType>(graphs, always_preserve);
});
return result;
}
DGL_REGISTER_GLOBAL("transform._CAPI_DGLCompactGraphs")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
List<HeteroGraphRef> graph_refs = args[0];
List<Value> always_preserve_refs = args[1];
std::vector<HeteroGraphPtr> graphs;
std::vector<IdArray> always_preserve;
for (HeteroGraphRef gref : graph_refs)
graphs.push_back(gref.sptr());
for (Value array : always_preserve_refs)
always_preserve.push_back(array->data);
const auto &result_pair = CompactGraphs(graphs, always_preserve);
List<HeteroGraphRef> compacted_graph_refs;
List<Value> induced_nodes;
for (const HeteroGraphPtr g : result_pair.first)
compacted_graph_refs.push_back(HeteroGraphRef(g));
for (const IdArray &ids : result_pair.second)
induced_nodes.push_back(Value(MakeValue(ids)));
List<ObjectRef> result;
result.push_back(compacted_graph_refs);
result.push_back(induced_nodes);
*rv = result;
});
}; // namespace transform
}; // namespace dgl