Unverified Commit eeeb52f4 authored by Quan (Andy) Gan's avatar Quan (Andy) Gan Committed by GitHub
Browse files

[Feature] Preference to COO for "hypersparse" unit graphs & graph compaction (#1238)

* unit graph that prefers coo queries

* auto detect coo preference

* forgot some functions

* disable lint on detect_prefer_coo

* reorg

* change comment

* lint

* fix

* move array_utils.h to src

* compact graph impl

* fix redundant copying in idhashmap

* docstring

* moving preference detection to C

* lint

* fix unit test & address comments

* hypersparse autorestrict

* docstring & fix

* revert copyto and asnumbits

* fix stupid bug

* lint

* leave a TODO for sorted COO

* fixing same node type mapping to different id in different graphs

* addresses comments

* made induced nodes a feature column

* lint?
parent 828a5e5b
...@@ -211,7 +211,7 @@ struct CSRMatrix { ...@@ -211,7 +211,7 @@ struct CSRMatrix {
runtime::NDArray indptr, indices; runtime::NDArray indptr, indices;
/*! \brief data array, could be empty. */ /*! \brief data array, could be empty. */
runtime::NDArray data; runtime::NDArray data;
/*! \brief indicate that the edges are stored in the sorted order. */ /*! \brief whether the column indices per row are sorted */
bool sorted; bool sorted;
}; };
...@@ -229,7 +229,9 @@ struct COOMatrix { ...@@ -229,7 +229,9 @@ struct COOMatrix {
int64_t num_rows, num_cols; int64_t num_rows, num_cols;
/*! \brief COO index arrays */ /*! \brief COO index arrays */
runtime::NDArray row, col; runtime::NDArray row, col;
/*! \brief data array, could be empty. */ /*!
* \brief data array, could be empty. When empty, assume it is from 0 to NNZ - 1.
*/
runtime::NDArray data; runtime::NDArray data;
}; };
...@@ -253,6 +255,11 @@ runtime::NDArray CSRGetRowColumnIndices(CSRMatrix , int64_t row); ...@@ -253,6 +255,11 @@ runtime::NDArray CSRGetRowColumnIndices(CSRMatrix , int64_t row);
/*! \brief Return the data array of the given row */ /*! \brief Return the data array of the given row */
runtime::NDArray CSRGetRowData(CSRMatrix , int64_t row); runtime::NDArray CSRGetRowData(CSRMatrix , int64_t row);
/*!
 * \brief Check whether the CSR matrix carries a data array.
 * \param csr The CSR matrix to inspect.
 * \return True if csr.data is defined, false otherwise.
 */
inline bool CSRHasData(CSRMatrix csr) {
  const bool has_data = csr.data.defined();
  return has_data;
}
/* \brief Get data. The return type is an ndarray due to possible duplicate entries. */ /* \brief Get data. The return type is an ndarray due to possible duplicate entries. */
runtime::NDArray CSRGetData(CSRMatrix , int64_t row, int64_t col); runtime::NDArray CSRGetData(CSRMatrix , int64_t row, int64_t col);
/*! /*!
...@@ -326,8 +333,39 @@ void CSRSort(CSRMatrix csr); ...@@ -326,8 +333,39 @@ void CSRSort(CSRMatrix csr);
///////////////////////// COO routines ////////////////////////// ///////////////////////// COO routines //////////////////////////
/*! \return True if the matrix has duplicate entries */ /*! \brief Return true if the value (row, col) is non-zero */
bool COOHasDuplicate(COOMatrix coo); bool COOIsNonZero(COOMatrix , int64_t row, int64_t col);
/*!
 * \brief Batched implementation of COOIsNonZero.
 * \param row Row ids to query.
 * \param col Column ids to query.
 * \note This operator allows broadcasting (i.e., either row or col can be of length 1).
 */
runtime::NDArray COOIsNonZero(COOMatrix, runtime::NDArray row, runtime::NDArray col);
/*! \brief Return the nnz of the given row */
int64_t COOGetRowNNZ(COOMatrix , int64_t row);
/*!
 * \brief Overload of COOGetRowNNZ that takes an array of row ids.
 * \note Presumably returns one nnz count per queried row; confirm against the implementation.
 */
runtime::NDArray COOGetRowNNZ(COOMatrix , runtime::NDArray row);
/*!
 * \brief Return the data and the indices of the given row as a pair of arrays.
 * \note The order of the two arrays in the pair is not visible from this declaration;
 *       check the implementation before relying on it.
 */
std::pair<runtime::NDArray, runtime::NDArray>
COOGetRowDataAndIndices(COOMatrix , int64_t row);
/*!
 * \brief Whether the COO matrix contains data
 * \param coo The COO matrix to inspect.
 * \return True if coo.data is defined, false otherwise.
 */
inline bool COOHasData(COOMatrix coo) {
  return coo.data.defined();
}
/*! \brief Get data. The return type is an ndarray due to possible duplicate entries. */
runtime::NDArray COOGetData(COOMatrix , int64_t row, int64_t col);
/*!
 * \brief Get the data and the row,col indices for each returned entry.
 * \param rows Row ids to query.
 * \param cols Column ids to query.
 * \note This operator allows broadcasting (i.e., either row or col can be of length 1).
 */
std::vector<runtime::NDArray> COOGetDataAndIndices(
COOMatrix , runtime::NDArray rows, runtime::NDArray cols);
/*! \brief Return a transposed COO matrix */
COOMatrix COOTranspose(COOMatrix coo);
/*! /*!
* \brief Convert COO matrix to CSR matrix. * \brief Convert COO matrix to CSR matrix.
...@@ -339,6 +377,32 @@ bool COOHasDuplicate(COOMatrix coo); ...@@ -339,6 +377,32 @@ bool COOHasDuplicate(COOMatrix coo);
*/ */
CSRMatrix COOToCSR(COOMatrix coo); CSRMatrix COOToCSR(COOMatrix coo);
/*!
 * \brief Slice rows of the given matrix and return.
 * \param coo COO matrix
 * \param start Start row id (inclusive)
 * \param end End row id (exclusive)
 */
COOMatrix COOSliceRows(COOMatrix coo, int64_t start, int64_t end);
/*!
 * \brief Overload of COOSliceRows that takes an array of row ids instead of a
 *        [start, end) range.
 * \param coo COO matrix
 * \param rows The row ids to select
 */
COOMatrix COOSliceRows(COOMatrix coo, runtime::NDArray rows);
/*!
* \brief Get the submatrix specified by the row and col ids.
*
* In numpy notation, given matrix M, row index array I, col index array J
* This function returns the submatrix M[I, J].
*
* \param coo The input coo matrix
* \param rows The row index to select
* \param cols The col index to select
* \return submatrix
*/
COOMatrix COOSliceMatrix(COOMatrix coo, runtime::NDArray rows, runtime::NDArray cols);
/*! \return True if the matrix has duplicate entries */
bool COOHasDuplicate(COOMatrix coo);
// inline implementations // inline implementations
template <typename T> template <typename T>
IdArray VecToIdArray(const std::vector<T>& vec, IdArray VecToIdArray(const std::vector<T>& vec,
...@@ -399,7 +463,7 @@ IdArray VecToIdArray(const std::vector<T>& vec, ...@@ -399,7 +463,7 @@ IdArray VecToIdArray(const std::vector<T>& vec,
/* /*
* Dispatch according to float type (either float32 or float64): * Dispatch according to float type (either float32 or float64):
* *
* ATEN_ID_TYPE_SWITCH(array->dtype, FloatType, { * ATEN_FLOAT_TYPE_SWITCH(array->dtype, FloatType, {
* // Now FloatType is the type corresponding to data type in array. * // Now FloatType is the type corresponding to data type in array.
* // For instance, one can do this for a CPU array: * // For instance, one can do this for a CPU array:
* FloatType *data = static_cast<FloatType *>(array->data); * FloatType *data = static_cast<FloatType *>(array->data);
...@@ -422,7 +486,7 @@ IdArray VecToIdArray(const std::vector<T>& vec, ...@@ -422,7 +486,7 @@ IdArray VecToIdArray(const std::vector<T>& vec,
/* /*
* Dispatch according to data type (int32, int64, float32 or float64): * Dispatch according to data type (int32, int64, float32 or float64):
* *
* ATEN_ID_TYPE_SWITCH(array->dtype, DType, { * ATEN_DTYPE_SWITCH(array->dtype, DType, {
* // Now DType is the type corresponding to data type in array. * // Now DType is the type corresponding to data type in array.
* // For instance, one can do this for a CPU array: * // For instance, one can do this for a CPU array:
* DType *data = static_cast<DType *>(array->data); * DType *data = static_cast<DType *>(array->data);
......
...@@ -90,6 +90,11 @@ class BaseHeteroGraph : public runtime::Object { ...@@ -90,6 +90,11 @@ class BaseHeteroGraph : public runtime::Object {
*/ */
virtual void Clear() = 0; virtual void Clear() = 0;
/*!
* \brief Get the data type of node and edge IDs of this graph.
*/
virtual DLDataType DataType() const = 0;
/*! /*!
* \brief Get the device context of this graph. * \brief Get the device context of this graph.
*/ */
...@@ -98,6 +103,7 @@ class BaseHeteroGraph : public runtime::Object { ...@@ -98,6 +103,7 @@ class BaseHeteroGraph : public runtime::Object {
/*! /*!
* \brief Get the number of integer bits used to store node/edge ids (32 or 64). * \brief Get the number of integer bits used to store node/edge ids (32 or 64).
*/ */
// TODO(BarclayII) replace NumBits() calls to DataType() calls
virtual uint8_t NumBits() const = 0; virtual uint8_t NumBits() const = 0;
/*! /*!
...@@ -464,19 +470,44 @@ DGL_DEFINE_OBJECT_REF(HeteroSubgraphRef, HeteroSubgraph); ...@@ -464,19 +470,44 @@ DGL_DEFINE_OBJECT_REF(HeteroSubgraphRef, HeteroSubgraph);
// creators // creators
/*! \brief Create a bipartite graph from COO arrays */ /*!
HeteroGraphPtr CreateBipartiteFromCOO( * \brief Sparse graph format.
int64_t num_src, int64_t num_dst, IdArray row, IdArray col); */
enum class SparseFormat {
// No restriction on the storage format; also the fallback that ParseSparseFormat
// returns for any name other than "coo", "csr" or "csc".
ANY = 0,
// Coordinate (COO) format; parsed from the name "coo".
COO = 1,
// Compressed sparse row (CSR) format; parsed from the name "csr".
CSR = 2,
// Compressed sparse column (CSC) format; parsed from the name "csc".
CSC = 3
};
/*! \brief Create a bipartite graph from (out) CSR arrays */ inline SparseFormat ParseSparseFormat(const std::string& name) {
HeteroGraphPtr CreateBipartiteFromCSR( if (name == "coo")
int64_t num_src, int64_t num_dst, return SparseFormat::COO;
IdArray indptr, IdArray indices, IdArray edge_ids); else if (name == "csr")
return SparseFormat::CSR;
else if (name == "csc")
return SparseFormat::CSC;
else
return SparseFormat::ANY;
}
/*! \brief Create a heterograph from meta graph and a list of bipartite graph */ /*! \brief Create a heterograph from meta graph and a list of bipartite graph */
HeteroGraphPtr CreateHeteroGraph( HeteroGraphPtr CreateHeteroGraph(
GraphPtr meta_graph, const std::vector<HeteroGraphPtr>& rel_graphs); GraphPtr meta_graph, const std::vector<HeteroGraphPtr>& rel_graphs);
/*!
 * \brief Given a list of graphs, remove the nodes that have neither inbound nor
 * outbound edges in any of the graphs.
 *
 * The graphs should have identical node ID space (i.e. should have the same set of nodes,
 * including types and IDs) and metagraph.
 *
 * \param graphs The list of graphs to compact together.
 * \return A pair. The first element is the list of compacted graphs, and the second
 * element is the mapping from node IDs in the compacted graphs to node IDs in the
 * original graphs (the Python wrapper indexes it by node type ID).
 */
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
CompactGraphs(const std::vector<HeteroGraphPtr> &graphs);
}; // namespace dgl }; // namespace dgl
#endif // DGL_BASE_HETEROGRAPH_H_ #endif // DGL_BASE_HETEROGRAPH_H_
"""Module for converting graph from/to other object.""" """Module for converting graph from/to other object."""
from collections import defaultdict from collections import defaultdict
from collections.abc import Iterable
import numpy as np import numpy as np
import scipy as sp import scipy as sp
import networkx as nx import networkx as nx
...@@ -19,9 +20,11 @@ __all__ = [ ...@@ -19,9 +20,11 @@ __all__ = [
'to_hetero', 'to_hetero',
'to_homo', 'to_homo',
'to_networkx', 'to_networkx',
'compact_graphs',
] ]
def graph(data, ntype='_N', etype='_E', card=None, validate=True, **kwargs): def graph(data, ntype='_N', etype='_E', card=None, validate=True, restrict_format='any',
**kwargs):
"""Create a graph with one type of nodes and edges. """Create a graph with one type of nodes and edges.
In the sparse matrix perspective, :func:`dgl.graph` creates a graph In the sparse matrix perspective, :func:`dgl.graph` creates a graph
...@@ -49,6 +52,8 @@ def graph(data, ntype='_N', etype='_E', card=None, validate=True, **kwargs): ...@@ -49,6 +52,8 @@ def graph(data, ntype='_N', etype='_E', card=None, validate=True, **kwargs):
If True, check if node ids are within cardinality, the check process may take If True, check if node ids are within cardinality, the check process may take
some time. (Default: True) some time. (Default: True)
If False and card is not None, user would receive a warning. If False and card is not None, user would receive a warning.
restrict_format : 'any', 'coo', 'csr', 'csc', optional
Force the storage format. Default: 'any' (i.e. let DGL decide what to use).
kwargs : key-word arguments, optional kwargs : key-word arguments, optional
Other key word arguments. Only comes into effect when we are using a NetworkX Other key word arguments. Only comes into effect when we are using a NetworkX
graph. It can consist of: graph. It can consist of:
...@@ -122,17 +127,24 @@ def graph(data, ntype='_N', etype='_E', card=None, validate=True, **kwargs): ...@@ -122,17 +127,24 @@ def graph(data, ntype='_N', etype='_E', card=None, validate=True, **kwargs):
urange, vrange = None, None urange, vrange = None, None
if isinstance(data, tuple): if isinstance(data, tuple):
u, v = data u, v = data
return create_from_edges(u, v, ntype, etype, ntype, urange, vrange, validate) return create_from_edges(
u, v, ntype, etype, ntype, urange, vrange, validate,
restrict_format=restrict_format)
elif isinstance(data, list): elif isinstance(data, list):
return create_from_edge_list(data, ntype, etype, ntype, urange, vrange, validate) return create_from_edge_list(
data, ntype, etype, ntype, urange, vrange, validate,
restrict_format=restrict_format)
elif isinstance(data, sp.sparse.spmatrix): elif isinstance(data, sp.sparse.spmatrix):
return create_from_scipy(data, ntype, etype, ntype) return create_from_scipy(
data, ntype, etype, ntype, restrict_format=restrict_format)
elif isinstance(data, nx.Graph): elif isinstance(data, nx.Graph):
return create_from_networkx(data, ntype, etype, **kwargs) return create_from_networkx(
data, ntype, etype, restrict_format=restrict_format, **kwargs)
else: else:
raise DGLError('Unsupported graph data type:', type(data)) raise DGLError('Unsupported graph data type:', type(data))
def bipartite(data, utype='_U', etype='_E', vtype='_V', card=None, validate=True, **kwargs): def bipartite(data, utype='_U', etype='_E', vtype='_V', card=None, validate=True,
restrict_format='any', **kwargs):
"""Create a bipartite graph. """Create a bipartite graph.
The result graph is directed and edges must be from ``utype`` nodes The result graph is directed and edges must be from ``utype`` nodes
...@@ -165,6 +177,8 @@ def bipartite(data, utype='_U', etype='_E', vtype='_V', card=None, validate=True ...@@ -165,6 +177,8 @@ def bipartite(data, utype='_U', etype='_E', vtype='_V', card=None, validate=True
If True, check if node ids are within cardinality, the check process may take If True, check if node ids are within cardinality, the check process may take
some time. (Default: True) some time. (Default: True)
If False and card is not None, user would receive a warning. If False and card is not None, user would receive a warning.
restrict_format : 'any', 'coo', 'csr', 'csc', optional
Force the storage format. Default: 'any' (i.e. let DGL decide what to use).
kwargs : key-word arguments, optional kwargs : key-word arguments, optional
Other key word arguments. Only comes into effect when we are using a NetworkX Other key word arguments. Only comes into effect when we are using a NetworkX
graph. It can consist of: graph. It can consist of:
...@@ -253,13 +267,19 @@ def bipartite(data, utype='_U', etype='_E', vtype='_V', card=None, validate=True ...@@ -253,13 +267,19 @@ def bipartite(data, utype='_U', etype='_E', vtype='_V', card=None, validate=True
urange, vrange = None, None urange, vrange = None, None
if isinstance(data, tuple): if isinstance(data, tuple):
u, v = data u, v = data
return create_from_edges(u, v, utype, etype, vtype, urange, vrange, validate) return create_from_edges(
u, v, utype, etype, vtype, urange, vrange, validate,
restrict_format=restrict_format)
elif isinstance(data, list): elif isinstance(data, list):
return create_from_edge_list(data, utype, etype, vtype, urange, vrange, validate) return create_from_edge_list(
data, utype, etype, vtype, urange, vrange, validate,
restrict_format=restrict_format)
elif isinstance(data, sp.sparse.spmatrix): elif isinstance(data, sp.sparse.spmatrix):
return create_from_scipy(data, utype, etype, vtype) return create_from_scipy(
data, utype, etype, vtype, restrict_format=restrict_format)
elif isinstance(data, nx.Graph): elif isinstance(data, nx.Graph):
return create_from_networkx_bipartite(data, utype, etype, vtype, **kwargs) return create_from_networkx_bipartite(
data, utype, etype, vtype, restrict_format=restrict_format, **kwargs)
else: else:
raise DGLError('Unsupported graph data type:', type(data)) raise DGLError('Unsupported graph data type:', type(data))
...@@ -331,24 +351,29 @@ def hetero_from_relations(rel_graphs): ...@@ -331,24 +351,29 @@ def hetero_from_relations(rel_graphs):
# TODO(minjie): this API can be generalized as a union operation of the input graphs # TODO(minjie): this API can be generalized as a union operation of the input graphs
# TODO(minjie): handle node/edge data # TODO(minjie): handle node/edge data
# infer meta graph # infer meta graph
ntype_dict = {} # ntype -> ntid ntype_set = set()
meta_edges = [] meta_edges = []
ntypes = [] ntypes = []
etypes = [] etypes = []
# TODO(BarclayII): I'm keeping the node type names sorted because even if
# the metagraph is the same, the same node type name in different graphs may
# map to different node type IDs.
# In the future, we need to lower the type names into C++.
for rgrh in rel_graphs: for rgrh in rel_graphs:
assert len(rgrh.etypes) == 1 assert len(rgrh.etypes) == 1
stype, etype, dtype = rgrh.canonical_etypes[0] stype, etype, dtype = rgrh.canonical_etypes[0]
if stype not in ntype_dict: ntype_set.add(stype)
ntype_dict[stype] = len(ntypes) ntype_set.add(dtype)
ntypes.append(stype) ntypes = list(sorted(ntype_set))
ntype_dict = {ntype: i for i, ntype in enumerate(ntypes)}
for rgrh in rel_graphs:
stype, etype, dtype = rgrh.canonical_etypes[0]
stid = ntype_dict[stype] stid = ntype_dict[stype]
if dtype not in ntype_dict:
ntype_dict[dtype] = len(ntypes)
ntypes.append(dtype)
dtid = ntype_dict[dtype] dtid = ntype_dict[dtype]
meta_edges.append((stid, dtid)) meta_edges.append((stid, dtid))
etypes.append(etype) etypes.append(etype)
metagraph = graph_index.from_edge_list(meta_edges, True, True) metagraph = graph_index.from_edge_list(meta_edges, True, True)
# create graph index # create graph index
hgidx = heterograph_index.create_heterograph_from_relations( hgidx = heterograph_index.create_heterograph_from_relations(
metagraph, [rgrh._graph for rgrh in rel_graphs]) metagraph, [rgrh._graph for rgrh in rel_graphs])
...@@ -699,11 +724,104 @@ def to_homo(G): ...@@ -699,11 +724,104 @@ def to_homo(G):
return retg return retg
def compact_graphs(graphs):
    """Given a list of graphs with the same set of nodes, find and eliminate the common
    isolated nodes across all graphs.

    This function requires the graphs to have the same set of nodes (i.e. the node types
    must be the same, and the number of nodes of each node type must be the same).  The
    metagraph does not have to be the same.

    It finds all the nodes that have zero in-degree and zero out-degree in all the given
    graphs, and eliminates them from all the graphs.

    Useful for graph sampling where we have a giant graph but we only wish to perform
    message passing on a smaller graph with a (tiny) subset of nodes.

    The node and edge features are not preserved.

    Parameters
    ----------
    graphs : DGLHeteroGraph or iterable of DGLHeteroGraph
        The graph, or list of graphs.  Any iterable (including a generator) is
        accepted; it is materialized into a list internally.

    Returns
    -------
    DGLHeteroGraph or list[DGLHeteroGraph]
        The compacted graph (if a single graph was given) or the list of compacted
        graphs.  An empty input yields an empty list.

        Each returned graph would have a feature ``dgl.NID`` containing the mapping
        of node IDs for each type from the compacted graph(s) to the original graph(s).
        Note that the mapping is the same for all the compacted graphs.

    Examples
    --------
    The following code constructs a bipartite graph with 20 users and 10 games, but
    only user #1 and #3, as well as game #3 and #5, have connections:

    >>> g = dgl.bipartite([(1, 3), (3, 5)], 'user', 'plays', 'game', card=(20, 10))

    The following would compact the graph above to another bipartite graph with only
    two users and two games.

    >>> new_g = dgl.compact_graphs(g)
    >>> new_g.nodes['user'].data[dgl.NID]
    tensor([1, 3])
    >>> new_g.nodes['game'].data[dgl.NID]
    tensor([3, 5])

    The mapping tells us that only user #1 and #3 as well as game #3 and #5 are kept.
    Furthermore, the first user and second user in the compacted graph maps to
    user #1 and #3 in the original graph.  Games are similar.

    One can verify that the edge connections are kept the same in the compacted graph.

    >>> new_g.edges(form='all', order='eid', etype='plays')
    (tensor([0, 1]), tensor([0, 1]), tensor([0, 1]))

    When compacting multiple graphs, nodes that do not have any connections in any
    of the given graphs are removed.  So if we compact ``g`` and the following ``g2``
    graphs together:

    >>> g2 = dgl.bipartite([(1, 6), (6, 8)], 'user', 'plays', 'game', card=(20, 10))
    >>> new_g, new_g2 = dgl.compact_graphs([g, g2])
    >>> new_g.nodes['user'].data[dgl.NID]
    tensor([1, 3, 6])
    >>> new_g.nodes['game'].data[dgl.NID]
    tensor([3, 5, 6, 8])

    Then one can see that user #1 from both graphs, users #3 from the first graph, as
    well as user #6 from the second graph, are kept.  Games are similar.

    Similarly, one can also verify the connections:

    >>> new_g.edges(form='all', order='eid', etype='plays')
    (tensor([0, 1]), tensor([0, 1]), tensor([0, 1]))
    >>> new_g2.edges(form='all', order='eid', etype='plays')
    (tensor([0, 2]), tensor([2, 3]), tensor([0, 1]))
    """
    return_single = False
    if not isinstance(graphs, Iterable):
        # A single graph was passed; wrap it and remember to unwrap the result.
        graphs = [graphs]
        return_single = True
    else:
        # The input is traversed several times below (and indexed), so a one-shot
        # iterable such as a generator must be materialized first.
        graphs = list(graphs)

    if not graphs:
        # Nothing to compact.
        return []

    new_graph_indexes, induced_nodes = heterograph_index.compact_graph_indexes(
        [g._graph for g in graphs])

    new_graphs = [
        DGLHeteroGraph(new_graph_index, graph.ntypes, graph.etypes)
        for new_graph_index, graph in zip(new_graph_indexes, graphs)]

    # The induced node mapping is shared by all compacted graphs; it is indexed by
    # the node type order of the first input graph.
    for g in new_graphs:
        for i, ntype in enumerate(graphs[0].ntypes):
            g.nodes[ntype].data[NID] = induced_nodes[i]

    if return_single:
        new_graphs = new_graphs[0]
    return new_graphs
############################################################ ############################################################
# Internal APIs # Internal APIs
############################################################ ############################################################
def create_from_edges(u, v, utype, etype, vtype, urange=None, vrange=None, validate=True): def create_from_edges(u, v, utype, etype, vtype, urange=None, vrange=None, validate=True,
restrict_format="any"):
"""Internal function to create a graph from incident nodes with types. """Internal function to create a graph from incident nodes with types.
utype could be equal to vtype utype could be equal to vtype
...@@ -728,6 +846,8 @@ def create_from_edges(u, v, utype, etype, vtype, urange=None, vrange=None, valid ...@@ -728,6 +846,8 @@ def create_from_edges(u, v, utype, etype, vtype, urange=None, vrange=None, valid
maximum of the destination node IDs in the edge list plus 1. (Default: None) maximum of the destination node IDs in the edge list plus 1. (Default: None)
validate : bool, optional validate : bool, optional
If True, checks if node IDs are within range. If True, checks if node IDs are within range.
restrict_format : 'any', 'coo', 'csr', 'csc', optional
Force the storage format. Default: 'any' (i.e. let DGL decide what to use).
Returns Returns
------- -------
...@@ -755,13 +875,16 @@ def create_from_edges(u, v, utype, etype, vtype, urange=None, vrange=None, valid ...@@ -755,13 +875,16 @@ def create_from_edges(u, v, utype, etype, vtype, urange=None, vrange=None, valid
num_ntypes = 1 num_ntypes = 1
else: else:
num_ntypes = 2 num_ntypes = 2
hgidx = heterograph_index.create_unitgraph_from_coo(num_ntypes, urange, vrange, u, v)
hgidx = heterograph_index.create_unitgraph_from_coo(
num_ntypes, urange, vrange, u, v, restrict_format)
if utype == vtype: if utype == vtype:
return DGLHeteroGraph(hgidx, [utype], [etype]) return DGLHeteroGraph(hgidx, [utype], [etype])
else: else:
return DGLHeteroGraph(hgidx, [utype, vtype], [etype]) return DGLHeteroGraph(hgidx, [utype, vtype], [etype])
def create_from_edge_list(elist, utype, etype, vtype, urange=None, vrange=None, validate=True): def create_from_edge_list(elist, utype, etype, vtype, urange=None, vrange=None,
validate=True, restrict_format='any'):
"""Internal function to create a heterograph from a list of edge tuples with types. """Internal function to create a heterograph from a list of edge tuples with types.
utype could be equal to vtype utype could be equal to vtype
...@@ -784,7 +907,8 @@ def create_from_edge_list(elist, utype, etype, vtype, urange=None, vrange=None, ...@@ -784,7 +907,8 @@ def create_from_edge_list(elist, utype, etype, vtype, urange=None, vrange=None,
maximum of the destination node IDs in the edge list plus 1. (Default: None) maximum of the destination node IDs in the edge list plus 1. (Default: None)
validate : bool, optional validate : bool, optional
If True, checks if node IDs are within range. If True, checks if node IDs are within range.
restrict_format : 'any', 'coo', 'csr', 'csc', optional
Force the storage format. Default: 'any' (i.e. let DGL decide what to use).
Returns Returns
------- -------
...@@ -796,9 +920,11 @@ def create_from_edge_list(elist, utype, etype, vtype, urange=None, vrange=None, ...@@ -796,9 +920,11 @@ def create_from_edge_list(elist, utype, etype, vtype, urange=None, vrange=None,
u, v = zip(*elist) u, v = zip(*elist)
u = list(u) u = list(u)
v = list(v) v = list(v)
return create_from_edges(u, v, utype, etype, vtype, urange, vrange, validate) return create_from_edges(
u, v, utype, etype, vtype, urange, vrange, validate, restrict_format)
def create_from_scipy(spmat, utype, etype, vtype, with_edge_id=False): def create_from_scipy(spmat, utype, etype, vtype, with_edge_id=False,
restrict_format='any'):
"""Internal function to create a heterograph from a scipy sparse matrix with types. """Internal function to create a heterograph from a scipy sparse matrix with types.
Parameters Parameters
...@@ -818,7 +944,8 @@ def create_from_scipy(spmat, utype, etype, vtype, with_edge_id=False): ...@@ -818,7 +944,8 @@ def create_from_scipy(spmat, utype, etype, vtype, with_edge_id=False):
(source, destination) order. (source, destination) order.
validate : bool, optional validate : bool, optional
If True, checks if node IDs are within range. If True, checks if node IDs are within range.
restrict_format : 'any', 'coo', 'csr', 'csc', optional
Force the storage format. Default: 'any' (i.e. let DGL decide what to use).
Returns Returns
------- -------
...@@ -830,7 +957,7 @@ def create_from_scipy(spmat, utype, etype, vtype, with_edge_id=False): ...@@ -830,7 +957,7 @@ def create_from_scipy(spmat, utype, etype, vtype, with_edge_id=False):
row = utils.toindex(spmat.row) row = utils.toindex(spmat.row)
col = utils.toindex(spmat.col) col = utils.toindex(spmat.col)
hgidx = heterograph_index.create_unitgraph_from_coo( hgidx = heterograph_index.create_unitgraph_from_coo(
num_ntypes, num_src, num_dst, row, col) num_ntypes, num_src, num_dst, row, col, restrict_format)
else: else:
spmat = spmat.tocsr() spmat = spmat.tocsr()
indptr = utils.toindex(spmat.indptr) indptr = utils.toindex(spmat.indptr)
...@@ -838,7 +965,7 @@ def create_from_scipy(spmat, utype, etype, vtype, with_edge_id=False): ...@@ -838,7 +965,7 @@ def create_from_scipy(spmat, utype, etype, vtype, with_edge_id=False):
# TODO(minjie): with_edge_id is only reasonable for csr matrix. How to fix? # TODO(minjie): with_edge_id is only reasonable for csr matrix. How to fix?
data = utils.toindex(spmat.data if with_edge_id else list(range(len(indices)))) data = utils.toindex(spmat.data if with_edge_id else list(range(len(indices))))
hgidx = heterograph_index.create_unitgraph_from_csr( hgidx = heterograph_index.create_unitgraph_from_csr(
num_ntypes, num_src, num_dst, indptr, indices, data) num_ntypes, num_src, num_dst, indptr, indices, data, restrict_format)
if num_ntypes == 1: if num_ntypes == 1:
return DGLHeteroGraph(hgidx, [utype], [etype]) return DGLHeteroGraph(hgidx, [utype], [etype])
else: else:
...@@ -848,7 +975,8 @@ def create_from_networkx(nx_graph, ...@@ -848,7 +975,8 @@ def create_from_networkx(nx_graph,
ntype, etype, ntype, etype,
edge_id_attr_name='id', edge_id_attr_name='id',
node_attrs=None, node_attrs=None,
edge_attrs=None): edge_attrs=None,
restrict_format='any'):
"""Create a heterograph that has only one set of nodes and edges. """Create a heterograph that has only one set of nodes and edges.
Parameters Parameters
...@@ -865,6 +993,8 @@ def create_from_networkx(nx_graph, ...@@ -865,6 +993,8 @@ def create_from_networkx(nx_graph,
Names for node features to retrieve from the NetworkX graph (Default: None) Names for node features to retrieve from the NetworkX graph (Default: None)
edge_attrs : list of str edge_attrs : list of str
Names for edge features to retrieve from the NetworkX graph (Default: None) Names for edge features to retrieve from the NetworkX graph (Default: None)
restrict_format : 'any', 'coo', 'csr', 'csc', optional
Force the storage format. Default: 'any' (i.e. let DGL decide what to use).
Returns Returns
------- -------
...@@ -899,7 +1029,8 @@ def create_from_networkx(nx_graph, ...@@ -899,7 +1029,8 @@ def create_from_networkx(nx_graph,
src = utils.toindex(src) src = utils.toindex(src)
dst = utils.toindex(dst) dst = utils.toindex(dst)
num_nodes = nx_graph.number_of_nodes() num_nodes = nx_graph.number_of_nodes()
g = create_from_edges(src, dst, ntype, etype, ntype, num_nodes, num_nodes, validate=False) g = create_from_edges(src, dst, ntype, etype, ntype, num_nodes, num_nodes,
validate=False, restrict_format=restrict_format)
# handle features # handle features
# copy attributes # copy attributes
...@@ -950,7 +1081,8 @@ def create_from_networkx_bipartite(nx_graph, ...@@ -950,7 +1081,8 @@ def create_from_networkx_bipartite(nx_graph,
utype, etype, vtype, utype, etype, vtype,
edge_id_attr_name='id', edge_id_attr_name='id',
node_attrs=None, node_attrs=None,
edge_attrs=None): edge_attrs=None,
restrict_format='any'):
"""Create a heterograph that has one set of source nodes, one set of """Create a heterograph that has one set of source nodes, one set of
destination nodes and one set of edges. destination nodes and one set of edges.
...@@ -974,6 +1106,8 @@ def create_from_networkx_bipartite(nx_graph, ...@@ -974,6 +1106,8 @@ def create_from_networkx_bipartite(nx_graph,
Names for node features to retrieve from the NetworkX graph (Default: None) Names for node features to retrieve from the NetworkX graph (Default: None)
edge_attrs : list of str edge_attrs : list of str
Names for edge features to retrieve from the NetworkX graph (Default: None) Names for edge features to retrieve from the NetworkX graph (Default: None)
restrict_format : 'any', 'coo', 'csr', 'csc', optional
Force the storage format. Default: 'any' (i.e. let DGL decide what to use).
Returns Returns
------- -------
...@@ -1013,7 +1147,7 @@ def create_from_networkx_bipartite(nx_graph, ...@@ -1013,7 +1147,7 @@ def create_from_networkx_bipartite(nx_graph,
dst = utils.toindex(dst) dst = utils.toindex(dst)
g = create_from_edges( g = create_from_edges(
src, dst, utype, etype, vtype, src, dst, utype, etype, vtype,
len(top_nodes), len(bottom_nodes), validate=False) len(top_nodes), len(bottom_nodes), validate=False, restrict_format=restrict_format)
# TODO attributes # TODO attributes
assert node_attrs is None, 'Retrieval of node attributes are not supported yet.' assert node_attrs is None, 'Retrieval of node attributes are not supported yet.'
......
...@@ -43,7 +43,7 @@ class HeteroGraphIndex(ObjectBase): ...@@ -43,7 +43,7 @@ class HeteroGraphIndex(ObjectBase):
num_dst = number_of_nodes[dst_ntype] num_dst = number_of_nodes[dst_ntype]
src_id, dst_id, _ = edges_per_type src_id, dst_id, _ = edges_per_type
rel_graphs.append(create_unitgraph_from_coo( rel_graphs.append(create_unitgraph_from_coo(
1 if src_ntype == dst_ntype else 2, num_src, num_dst, src_id, dst_id)) 1 if src_ntype == dst_ntype else 2, num_src, num_dst, src_id, dst_id, 'any'))
self.__init_handle_by_constructor__( self.__init_handle_by_constructor__(
_CAPI_DGLHeteroCreateHeteroGraph, metagraph, rel_graphs) _CAPI_DGLHeteroCreateHeteroGraph, metagraph, rel_graphs)
...@@ -957,7 +957,8 @@ class HeteroSubgraphIndex(ObjectBase): ...@@ -957,7 +957,8 @@ class HeteroSubgraphIndex(ObjectBase):
# Creators # Creators
################################################################# #################################################################
def create_unitgraph_from_coo(num_ntypes, num_src, num_dst, row, col): def create_unitgraph_from_coo(num_ntypes, num_src, num_dst, row, col,
restrict_format):
"""Create a unitgraph graph index from COO format """Create a unitgraph graph index from COO format
Parameters Parameters
...@@ -972,15 +973,19 @@ def create_unitgraph_from_coo(num_ntypes, num_src, num_dst, row, col): ...@@ -972,15 +973,19 @@ def create_unitgraph_from_coo(num_ntypes, num_src, num_dst, row, col):
Row index. Row index.
col : utils.Index col : utils.Index
Col index. Col index.
restrict_format : "any", "coo", "csr" or "csc"
Restrict the storage format of the unit graph.
Returns Returns
------- -------
HeteroGraphIndex HeteroGraphIndex
""" """
return _CAPI_DGLHeteroCreateUnitGraphFromCOO( return _CAPI_DGLHeteroCreateUnitGraphFromCOO(
int(num_ntypes), int(num_src), int(num_dst), row.todgltensor(), col.todgltensor()) int(num_ntypes), int(num_src), int(num_dst), row.todgltensor(), col.todgltensor(),
restrict_format)
def create_unitgraph_from_csr(num_ntypes, num_src, num_dst, indptr, indices, edge_ids): def create_unitgraph_from_csr(num_ntypes, num_src, num_dst, indptr, indices, edge_ids,
restrict_format):
"""Create a unitgraph graph index from CSR format """Create a unitgraph graph index from CSR format
Parameters Parameters
...@@ -997,6 +1002,8 @@ def create_unitgraph_from_csr(num_ntypes, num_src, num_dst, indptr, indices, edg ...@@ -997,6 +1002,8 @@ def create_unitgraph_from_csr(num_ntypes, num_src, num_dst, indptr, indices, edg
CSR indices. CSR indices.
edge_ids : utils.Index edge_ids : utils.Index
Edge shuffle id. Edge shuffle id.
restrict_format : "any", "coo", "csr" or "csc"
Restrict the storage format of the unit graph.
Returns Returns
------- -------
...@@ -1004,7 +1011,8 @@ def create_unitgraph_from_csr(num_ntypes, num_src, num_dst, indptr, indices, edg ...@@ -1004,7 +1011,8 @@ def create_unitgraph_from_csr(num_ntypes, num_src, num_dst, indptr, indices, edg
""" """
return _CAPI_DGLHeteroCreateUnitGraphFromCSR( return _CAPI_DGLHeteroCreateUnitGraphFromCSR(
int(num_ntypes), int(num_src), int(num_dst), int(num_ntypes), int(num_src), int(num_dst),
indptr.todgltensor(), indices.todgltensor(), edge_ids.todgltensor()) indptr.todgltensor(), indices.todgltensor(), edge_ids.todgltensor(),
restrict_format)
def create_heterograph_from_relations(metagraph, rel_graphs): def create_heterograph_from_relations(metagraph, rel_graphs):
"""Create a heterograph from metagraph and graphs of every relation. """Create a heterograph from metagraph and graphs of every relation.
...@@ -1061,6 +1069,31 @@ def disjoint_partition(graph, bnn_all_types, bne_all_types): ...@@ -1061,6 +1069,31 @@ def disjoint_partition(graph, bnn_all_types, bne_all_types):
return _CAPI_DGLHeteroDisjointPartitionBySizes( return _CAPI_DGLHeteroDisjointPartitionBySizes(
graph, bnn_all_types.todgltensor(), bne_all_types.todgltensor()) graph, bnn_all_types.todgltensor(), bne_all_types.todgltensor())
def compact_graph_indexes(graphs):
"""Given a list of graphs, remove the common nodes that do not have inbound and
outbound edges.
The graphs should have identical node space (i.e. should have the same set of
nodes, including types and IDs) and metagraph.
Parameters
----------
graphs : list[HeteroGraphIndex]
List of heterograph indexes to compact together.
Returns
-------
list[HeteroGraphIndex]
A list of compacted heterographs.
The returned heterographs also have the same metagraph, which is identical
to the original heterographs.
The returned heterographs also have identical node space.
list[Tensor]
The induced node IDs of each node type.
"""
# The C API returns the compacted graph indexes plus one entry per node type;
# each entry exposes its DGL NDArray through ``.data``.
new_graphs, induced_nodes = _CAPI_DGLCompactGraphs(graphs)
# Convert every induced-node NDArray to a backend tensor without copying.
return new_graphs, [F.zerocopy_from_dgl_ndarray(nodes.data) for nodes in induced_nodes]
@register_object("graph.FlattenedHeteroGraph") @register_object("graph.FlattenedHeteroGraph")
class FlattenedHeteroGraph(ObjectBase): class FlattenedHeteroGraph(ObjectBase):
"""FlattenedHeteroGraph object class in C++ backend.""" """FlattenedHeteroGraph object class in C++ backend."""
......
...@@ -168,7 +168,7 @@ def build_gidx_and_mapping_uv(edge_tuples, num_src, num_dst): ...@@ -168,7 +168,7 @@ def build_gidx_and_mapping_uv(edge_tuples, num_src, num_dst):
Number of ints needed to represent the graph Number of ints needed to represent the graph
""" """
u, v, eid = edge_tuples u, v, eid = edge_tuples
gidx = create_unitgraph_from_coo(2, num_src, num_dst, u, v) gidx = create_unitgraph_from_coo(2, num_src, num_dst, u, v, 'any')
forward, backward = gidx.get_csr_shuffle_order(0) forward, backward = gidx.get_csr_shuffle_order(0)
eid = eid.tousertensor() eid = eid.tousertensor()
nbits = gidx.bits_needed(0) nbits = gidx.bits_needed(0)
......
...@@ -411,6 +411,22 @@ void CSRSort(CSRMatrix csr) { ...@@ -411,6 +411,22 @@ void CSRSort(CSRMatrix csr) {
///////////////////////// COO routines ////////////////////////// ///////////////////////// COO routines //////////////////////////
// Returns whether the COO matrix stores an entry at (row, col).
// ATEN_COO_IDX_SWITCH provides XPU and IdType to the braced body (presumably
// picked from coo's device and index dtype -- the macro is defined elsewhere);
// the actual work is done by the impl:: kernel of the same name.
bool COOIsNonZero(COOMatrix coo, int64_t row, int64_t col) {
bool ret = false;
ATEN_COO_IDX_SWITCH(coo, XPU, IdType, {
ret = impl::COOIsNonZero<XPU, IdType>(coo, row, col);
});
return ret;
}
// Batched existence query: forwards the row/col id arrays to the impl:: kernel
// and returns its result array (one 0/1 flag per queried pair).
// XPU and IdType come from ATEN_COO_IDX_SWITCH (macro defined elsewhere).
NDArray COOIsNonZero(COOMatrix coo, NDArray row, NDArray col) {
NDArray ret;
ATEN_COO_IDX_SWITCH(coo, XPU, IdType, {
ret = impl::COOIsNonZero<XPU, IdType>(coo, row, col);
});
return ret;
}
bool COOHasDuplicate(COOMatrix coo) { bool COOHasDuplicate(COOMatrix coo) {
bool ret = false; bool ret = false;
ATEN_COO_IDX_SWITCH(coo, XPU, IdType, { ATEN_COO_IDX_SWITCH(coo, XPU, IdType, {
...@@ -419,6 +435,55 @@ bool COOHasDuplicate(COOMatrix coo) { ...@@ -419,6 +435,55 @@ bool COOHasDuplicate(COOMatrix coo) {
return ret; return ret;
} }
// Returns the number of stored entries whose row index equals `row`.
// Dispatches through ATEN_COO_IDX_SWITCH to the impl:: kernel; the result
// stays 0 if the macro body is never executed.
int64_t COOGetRowNNZ(COOMatrix coo, int64_t row) {
int64_t ret = 0;
ATEN_COO_IDX_SWITCH(coo, XPU, IdType, {
ret = impl::COOGetRowNNZ<XPU, IdType>(coo, row);
});
return ret;
}
// Batched row-count query: returns an array with the number of stored entries
// for each row id in `row`. Forwards to the impl:: kernel via ATEN_COO_IDX_SWITCH.
NDArray COOGetRowNNZ(COOMatrix coo, NDArray row) {
NDArray ret;
ATEN_COO_IDX_SWITCH(coo, XPU, IdType, {
ret = impl::COOGetRowNNZ<XPU, IdType>(coo, row);
});
return ret;
}
// Returns the (data, column indices) pair of all entries stored in the given row.
// ATEN_COO_SWITCH additionally provides DType to the body (macro defined elsewhere).
std::pair<NDArray, NDArray> COOGetRowDataAndIndices(COOMatrix coo, int64_t row) {
std::pair<NDArray, NDArray> ret;
ATEN_COO_SWITCH(coo, XPU, IdType, DType, {
ret = impl::COOGetRowDataAndIndices<XPU, IdType, DType>(coo, row);
});
return ret;
}
// Returns the data values of every stored entry at (row, col); there may be
// several when the matrix holds duplicate entries. Forwards to the impl:: kernel.
NDArray COOGetData(COOMatrix coo, int64_t row, int64_t col) {
NDArray ret;
ATEN_COO_SWITCH(coo, XPU, IdType, DType, {
ret = impl::COOGetData<XPU, IdType, DType>(coo, row, col);
});
return ret;
}
// Batched lookup of (row, col) pairs. The impl:: kernel returns three arrays,
// in order: matched row ids, matched column ids, and the matching data values.
std::vector<NDArray> COOGetDataAndIndices(
COOMatrix coo, NDArray rows, NDArray cols) {
std::vector<NDArray> ret;
ATEN_COO_SWITCH(coo, XPU, IdType, DType, {
ret = impl::COOGetDataAndIndices<XPU, IdType, DType>(coo, rows, cols);
});
return ret;
}
// Returns the transpose of the COO matrix (rows and columns exchanged).
// Forwards to the impl:: kernel selected through ATEN_COO_SWITCH.
COOMatrix COOTranspose(COOMatrix coo) {
COOMatrix ret;
ATEN_COO_SWITCH(coo, XPU, IdType, DType, {
ret = impl::COOTranspose<XPU, IdType, DType>(coo);
});
return ret;
}
CSRMatrix COOToCSR(COOMatrix coo) { CSRMatrix COOToCSR(COOMatrix coo) {
CSRMatrix ret; CSRMatrix ret;
ATEN_COO_SWITCH(coo, XPU, IdType, DType, { ATEN_COO_SWITCH(coo, XPU, IdType, DType, {
...@@ -427,5 +492,29 @@ CSRMatrix COOToCSR(COOMatrix coo) { ...@@ -427,5 +492,29 @@ CSRMatrix COOToCSR(COOMatrix coo) {
return ret; return ret;
} }
// Returns the sub-matrix made of rows in the half-open range [start, end).
// Forwards to the impl:: kernel selected through ATEN_COO_SWITCH.
COOMatrix COOSliceRows(COOMatrix coo, int64_t start, int64_t end) {
COOMatrix ret;
ATEN_COO_SWITCH(coo, XPU, IdType, DType, {
ret = impl::COOSliceRows<XPU, IdType, DType>(coo, start, end);
});
return ret;
}
// Returns the sub-matrix made of the rows listed in `rows`; row ids in the
// result are relabeled by the impl:: kernel. Dispatch is via ATEN_COO_SWITCH.
COOMatrix COOSliceRows(COOMatrix coo, NDArray rows) {
COOMatrix ret;
ATEN_COO_SWITCH(coo, XPU, IdType, DType, {
ret = impl::COOSliceRows<XPU, IdType, DType>(coo, rows);
});
return ret;
}
// Returns the sub-matrix restricted to the given row ids and column ids; both
// are relabeled by the impl:: kernel. Dispatch is via ATEN_COO_SWITCH.
COOMatrix COOSliceMatrix(COOMatrix coo, NDArray rows, NDArray cols) {
COOMatrix ret;
ATEN_COO_SWITCH(coo, XPU, IdType, DType, {
ret = impl::COOSliceMatrix<XPU, IdType, DType>(coo, rows, cols);
});
return ret;
}
} // namespace aten } // namespace aten
} // namespace dgl } // namespace dgl
...@@ -107,12 +107,48 @@ CSRMatrix CSRSliceMatrix(CSRMatrix csr, runtime::NDArray rows, runtime::NDArray ...@@ -107,12 +107,48 @@ CSRMatrix CSRSliceMatrix(CSRMatrix csr, runtime::NDArray rows, runtime::NDArray
template <DLDeviceType XPU, typename IdType, typename DType> template <DLDeviceType XPU, typename IdType, typename DType>
void CSRSort(CSRMatrix csr); void CSRSort(CSRMatrix csr);
/*! \brief Kernel: whether the COO matrix stores an entry at (row, col). */
template <DLDeviceType XPU, typename IdType>
bool COOIsNonZero(COOMatrix coo, int64_t row, int64_t col);
/*! \brief Kernel: batched version; yields one 0/1 flag per queried (row, col) pair. */
template <DLDeviceType XPU, typename IdType>
runtime::NDArray COOIsNonZero(COOMatrix coo, runtime::NDArray row, runtime::NDArray col);
template <DLDeviceType XPU, typename IdType> template <DLDeviceType XPU, typename IdType>
bool COOHasDuplicate(COOMatrix coo); bool COOHasDuplicate(COOMatrix coo);
/*! \brief Kernel: number of stored entries in the given row. */
template <DLDeviceType XPU, typename IdType>
int64_t COOGetRowNNZ(COOMatrix coo, int64_t row);
/*! \brief Kernel: number of stored entries for each row id in the array. */
template <DLDeviceType XPU, typename IdType>
runtime::NDArray COOGetRowNNZ(COOMatrix coo, runtime::NDArray row);
/*! \brief Kernel: (data, column indices) of the entries stored in the given row. */
template <DLDeviceType XPU, typename IdType, typename DType>
std::pair<runtime::NDArray, runtime::NDArray>
COOGetRowDataAndIndices(COOMatrix coo, int64_t row);
/*! \brief Kernel: data values of all entries stored at (row, col). */
template <DLDeviceType XPU, typename IdType, typename DType>
runtime::NDArray COOGetData(COOMatrix coo, int64_t row, int64_t col);
/*! \brief Kernel: batched lookup returning {rows, cols, data} of the matched entries. */
template <DLDeviceType XPU, typename IdType, typename DType>
std::vector<runtime::NDArray> COOGetDataAndIndices(
COOMatrix coo, runtime::NDArray rows, runtime::NDArray cols);
/*! \brief Kernel: transpose of the COO matrix. */
template <DLDeviceType XPU, typename IdType, typename DType>
COOMatrix COOTranspose(COOMatrix coo);
template <DLDeviceType XPU, typename IdType, typename DType> template <DLDeviceType XPU, typename IdType, typename DType>
CSRMatrix COOToCSR(COOMatrix coo); CSRMatrix COOToCSR(COOMatrix coo);
/*! \brief Kernel: sub-matrix of the rows in the half-open range [start, end). */
template <DLDeviceType XPU, typename IdType, typename DType>
COOMatrix COOSliceRows(COOMatrix coo, int64_t start, int64_t end);
/*! \brief Kernel: sub-matrix of the rows listed in the given id array. */
template <DLDeviceType XPU, typename IdType, typename DType>
COOMatrix COOSliceRows(COOMatrix coo, runtime::NDArray rows);
/*! \brief Kernel: sub-matrix restricted to the given row ids and column ids. */
template <DLDeviceType XPU, typename IdType, typename DType>
COOMatrix COOSliceMatrix(COOMatrix coo, runtime::NDArray rows, runtime::NDArray cols);
} // namespace impl } // namespace impl
} // namespace aten } // namespace aten
} // namespace dgl } // namespace dgl
......
/*!
* Copyright (c) 2019 by Contributors
 * \file array/cpu/array_utils.h
* \brief Utility classes and functions for DGL arrays.
*/
#ifndef DGL_ARRAY_CPU_ARRAY_UTILS_H_
#define DGL_ARRAY_CPU_ARRAY_UTILS_H_
#include <dgl/array.h>
#include <vector>
#include <unordered_map>
#include <utility>
namespace dgl {
namespace aten {
/*!
 * \brief Relabels arbitrary integer ids with consecutive new ids starting at zero.
 *
 * New ids are handed out in order of first appearance, so the map can be used
 * both for relabeling and for collecting the unique ids of an array.
 *
 * A bitmap pre-filter is consulted before the hash table so that most lookups
 * of absent ids never touch the (slow) hash table.
 */
template <typename IdType>
class IdHashMap {
 public:
  /*! \brief Create an empty map. */
  IdHashMap(): filter_(kFilterSize, false) {}

  /*! \brief Create a map from the given id array (duplicates allowed). */
  explicit IdHashMap(IdArray ids): filter_(kFilterSize, false) {
    Update(ids);
  }

  /*!
   * \brief Insert every id of the array that is not yet known.
   *
   * Ids already present keep their new id; duplicates in the array are ignored.
   */
  void Update(IdArray ids) {
    const IdType* cursor = static_cast<IdType*>(ids->data);
    const IdType* const stop = cursor + ids->shape[0];
    // Continue numbering after the ids inserted by earlier calls.
    IdType next_id = oldv2newv_.size();
    for (; cursor != stop; ++cursor) {
      const IdType old_id = *cursor;
      if (Contains(old_id))
        continue;
      oldv2newv_[old_id] = next_id++;
      filter_[old_id & kFilterMask] = true;
    }
  }

  /*! \brief Whether the given (old) id has been inserted. */
  bool Contains(IdType id) const {
    if (!filter_[id & kFilterMask])
      return false;
    return oldv2newv_.count(id) != 0;
  }

  /*! \brief New id of the given old id, or default_val if the id is unknown. */
  IdType Map(IdType id, IdType default_val) const {
    if (!filter_[id & kFilterMask])
      return default_val;
    const auto found = oldv2newv_.find(id);
    if (found == oldv2newv_.end())
      return default_val;
    return found->second;
  }

  /*! \brief Element-wise Map() over an id array; the result has the same length. */
  IdArray Map(IdArray ids, IdType default_val) const {
    const int64_t count = ids->shape[0];
    const IdType* in = static_cast<IdType*>(ids->data);
    IdArray mapped = NewIdArray(count, ids->ctx, ids->dtype.bits);
    IdType* out = static_cast<IdType*>(mapped->data);
    for (int64_t pos = 0; pos < count; ++pos)
      out[pos] = Map(in[pos], default_val);
    return mapped;
  }

  /*! \brief All old ids inserted so far; element k is the old id whose new id is k. */
  IdArray Values() const {
    IdArray old_ids = NewIdArray(oldv2newv_.size(), DLContext{kDLCPU, 0}, sizeof(IdType) * 8);
    IdType* out = static_cast<IdType*>(old_ids->data);
    for (const auto& entry : oldv2newv_)
      out[entry.second] = entry.first;
    return old_ids;
  }

 private:
  static constexpr int32_t kFilterMask = 0xFFFFFF;
  static constexpr int32_t kFilterSize = kFilterMask + 1;
  // Bitmap indexed by the low bits of an id; a cleared bit proves the id is absent,
  // which lets lookups skip the much slower hash table.
  std::vector<bool> filter_;
  // Mapping from old id to new id.
  std::unordered_map<IdType, IdType> oldv2newv_;
};
/*!
 * \brief Hash type for building maps/sets with pairs as keys.
 *
 * The two element hashes are mixed with an order-sensitive combine step.
 * A plain XOR would make (a, b) collide with (b, a) and would send every
 * (a, a) to the same bucket, which is the common case for graphs with
 * reciprocal edges or self-loops.
 */
struct PairHash {
  template <class T1, class T2>
  std::size_t operator() (const std::pair<T1, T2>& pair) const {
    std::size_t seed = std::hash<T1>()(pair.first);
    // Golden-ratio mixing, same scheme as the well-known hash_combine recipe.
    seed ^= std::hash<T2>()(pair.second) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
    return seed;
  }
};
}; // namespace aten
}; // namespace dgl
#endif // DGL_ARRAY_CPU_ARRAY_UTILS_H_
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
#include <dgl/array.h> #include <dgl/array.h>
#include <vector> #include <vector>
#include <unordered_set> #include <unordered_set>
#include "array_utils.h"
namespace dgl { namespace dgl {
...@@ -13,69 +14,6 @@ using runtime::NDArray; ...@@ -13,69 +14,6 @@ using runtime::NDArray;
namespace aten { namespace aten {
namespace impl { namespace impl {
namespace {
/*!
* \brief A hashmap that maps each ids in the given array to new ids starting from zero.
*/
template <typename IdType>
class IdHashMap {
public:
// Construct the hashmap using the given id arrays.
// The id array could contain duplicates.
explicit IdHashMap(IdArray ids): filter_(kFilterSize, false) {
const IdType* ids_data = static_cast<IdType*>(ids->data);
const int64_t len = ids->shape[0];
IdType newid = 0;
for (int64_t i = 0; i < len; ++i) {
const IdType id = ids_data[i];
if (!Contains(id)) {
oldv2newv_[id] = newid++;
filter_[id & kFilterMask] = true;
}
}
}
// Return true if the given id is contained in this hashmap.
bool Contains(IdType id) const {
return filter_[id & kFilterMask] && oldv2newv_.count(id);
}
// Return the new id of the given id. If the given id is not contained
// in the hash map, returns the default_val instead.
IdType Map(IdType id, IdType default_val) const {
if (filter_[id & kFilterMask]) {
auto it = oldv2newv_.find(id);
return (it == oldv2newv_.end()) ? default_val : it->second;
} else {
return default_val;
}
}
private:
static constexpr int32_t kFilterMask = 0xFFFFFF;
static constexpr int32_t kFilterSize = kFilterMask + 1;
// This bitmap is used as a bloom filter to remove some lookups.
// Hashtable is very slow. Using bloom filter can significantly speed up lookups.
std::vector<bool> filter_;
// The hashmap from old vid to new vid
std::unordered_map<IdType, IdType> oldv2newv_;
};
struct PairHash {
template <class T1, class T2>
std::size_t operator() (const std::pair<T1, T2>& pair) const {
return std::hash<T1>()(pair.first) ^ std::hash<T2>()(pair.second);
}
};
inline bool CSRHasData(CSRMatrix csr) {
return csr.data.defined();
}
inline bool COOHasData(COOMatrix csr) {
return csr.data.defined();
}
} // namespace
///////////////////////////// CSRIsNonZero ///////////////////////////// ///////////////////////////// CSRIsNonZero /////////////////////////////
...@@ -649,91 +587,6 @@ void CSRSort(CSRMatrix csr) { ...@@ -649,91 +587,6 @@ void CSRSort(CSRMatrix csr) {
template void CSRSort<kDLCPU, int64_t, int64_t>(CSRMatrix csr); template void CSRSort<kDLCPU, int64_t, int64_t>(CSRMatrix csr);
template void CSRSort<kDLCPU, int32_t, int32_t>(CSRMatrix csr); template void CSRSort<kDLCPU, int32_t, int32_t>(CSRMatrix csr);
///////////////////////////// COOHasDuplicate /////////////////////////////
template <DLDeviceType XPU, typename IdType>
bool COOHasDuplicate(COOMatrix coo) {
std::unordered_set<std::pair<IdType, IdType>, PairHash> hashmap;
const IdType* src_data = static_cast<IdType*>(coo.row->data);
const IdType* dst_data = static_cast<IdType*>(coo.col->data);
const auto nnz = coo.row->shape[0];
for (IdType eid = 0; eid < nnz; ++eid) {
const auto& p = std::make_pair(src_data[eid], dst_data[eid]);
if (hashmap.count(p)) {
return true;
} else {
hashmap.insert(p);
}
}
return false;
}
template bool COOHasDuplicate<kDLCPU, int32_t>(COOMatrix coo);
template bool COOHasDuplicate<kDLCPU, int64_t>(COOMatrix coo);
///////////////////////////// COOToCSR /////////////////////////////
// complexity: time O(NNZ), space O(1)
template <DLDeviceType XPU, typename IdType, typename DType>
CSRMatrix COOToCSR(COOMatrix coo) {
const int64_t N = coo.num_rows;
const int64_t NNZ = coo.row->shape[0];
const IdType* row_data = static_cast<IdType*>(coo.row->data);
const IdType* col_data = static_cast<IdType*>(coo.col->data);
NDArray ret_indptr = NDArray::Empty({N + 1}, coo.row->dtype, coo.row->ctx);
NDArray ret_indices = NDArray::Empty({NNZ}, coo.row->dtype, coo.row->ctx);
NDArray ret_data;
if (COOHasData(coo)) {
ret_data = NDArray::Empty({NNZ}, coo.data->dtype, coo.data->ctx);
} else {
// if no data array in the input coo, the return data array is a shuffle index.
ret_data = NDArray::Empty({NNZ}, coo.row->dtype, coo.row->ctx);
}
IdType* Bp = static_cast<IdType*>(ret_indptr->data);
IdType* Bi = static_cast<IdType*>(ret_indices->data);
std::fill(Bp, Bp + N, 0);
for (int64_t i = 0; i < NNZ; ++i) {
Bp[row_data[i]]++;
}
// cumsum
for (int64_t i = 0, cumsum = 0; i < N; ++i) {
const IdType temp = Bp[i];
Bp[i] = cumsum;
cumsum += temp;
}
Bp[N] = NNZ;
for (int64_t i = 0; i < NNZ; ++i) {
const IdType r = row_data[i];
Bi[Bp[r]] = col_data[i];
if (COOHasData(coo)) {
const DType* data = static_cast<DType*>(coo.data->data);
DType* Bx = static_cast<DType*>(ret_data->data);
Bx[Bp[r]] = data[i];
} else {
IdType* Bx = static_cast<IdType*>(ret_data->data);
Bx[Bp[r]] = i;
}
Bp[r]++;
}
// correct the indptr
for (int64_t i = 0, last = 0; i <= N; ++i) {
IdType temp = Bp[i];
Bp[i] = last;
last = temp;
}
return CSRMatrix{coo.num_rows, coo.num_cols, ret_indptr, ret_indices, ret_data};
}
template CSRMatrix COOToCSR<kDLCPU, int32_t, int32_t>(COOMatrix coo);
template CSRMatrix COOToCSR<kDLCPU, int64_t, int64_t>(COOMatrix coo);
} // namespace impl } // namespace impl
} // namespace aten } // namespace aten
} // namespace dgl } // namespace dgl
/*!
* Copyright (c) 2019 by Contributors
* \file array/cpu/spmat_op_impl.cc
* \brief CPU implementation of COO sparse matrix operators
*/
#include <dgl/array.h>
#include <vector>
#include <unordered_set>
#include <unordered_map>
#include "array_utils.h"
namespace dgl {
using runtime::NDArray;
namespace aten {
namespace impl {
/*
* TODO(BarclayII):
* For row-major sorted COOs, we have faster implementation with binary search,
* sorted search, etc. Later we should benchmark how much we can gain with
* sorted COOs on hypersparse graphs.
*/
///////////////////////////// COOIsNonZero /////////////////////////////
/*!
 * \brief Whether the COO matrix stores at least one entry at (row, col).
 *
 * Performs a linear scan over all stored entries.
 */
template <DLDeviceType XPU, typename IdType>
bool COOIsNonZero(COOMatrix coo, int64_t row, int64_t col) {
  CHECK(row >= 0 && row < coo.num_rows) << "Invalid row index: " << row;
  CHECK(col >= 0 && col < coo.num_cols) << "Invalid col index: " << col;
  const int64_t nnz = coo.row->shape[0];
  const IdType* entry_rows = static_cast<IdType*>(coo.row->data);
  const IdType* entry_cols = static_cast<IdType*>(coo.col->data);
  for (int64_t e = 0; e < nnz; ++e) {
    if (entry_rows[e] != row)
      continue;
    if (entry_cols[e] == col)
      return true;
  }
  return false;
}
template bool COOIsNonZero<kDLCPU, int32_t>(COOMatrix, int64_t, int64_t);
template bool COOIsNonZero<kDLCPU, int64_t>(COOMatrix, int64_t, int64_t);
/*!
 * \brief Batched existence query on a COO matrix.
 *
 * \param coo The matrix to query.
 * \param row Row ids; either the same length as \a col or of length 1.
 * \param col Column ids; either the same length as \a row or of length 1.
 * \return Array of length max(len(row), len(col)) holding 1 where the pair is
 *         stored in the matrix and 0 otherwise. A length-1 input is broadcast
 *         against the other one.
 */
template <DLDeviceType XPU, typename IdType>
NDArray COOIsNonZero(COOMatrix coo, NDArray row, NDArray col) {
  const auto rowlen = row->shape[0];
  const auto collen = col->shape[0];
  // Same shape contract as COOGetDataAndIndices; without it a length mismatch
  // would index past the end of the shorter array.
  CHECK((rowlen == collen) || (rowlen == 1) || (collen == 1))
    << "Invalid row and col id array.";
  const auto rstlen = std::max(rowlen, collen);
  NDArray rst = NDArray::Empty({rstlen}, row->dtype, row->ctx);
  IdType* rst_data = static_cast<IdType*>(rst->data);
  const IdType* row_data = static_cast<IdType*>(row->data);
  const IdType* col_data = static_cast<IdType*>(col->data);
  // A stride of 0 keeps re-reading the single element of the broadcast side.
  const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
  const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
#pragma omp parallel for
  for (int64_t k = 0; k < rstlen; ++k) {
    const int64_t i = row_stride * k;
    const int64_t j = col_stride * k;
    rst_data[k] = COOIsNonZero<XPU, IdType>(coo, row_data[i], col_data[j])? 1 : 0;
  }
  return rst;
}
template NDArray COOIsNonZero<kDLCPU, int32_t>(COOMatrix, NDArray, NDArray);
template NDArray COOIsNonZero<kDLCPU, int64_t>(COOMatrix, NDArray, NDArray);
///////////////////////////// COOHasDuplicate /////////////////////////////
/*!
 * \brief Whether the COO matrix stores the same (row, col) pair more than once.
 *
 * Scans the entries in order and stops at the first repeated pair.
 */
template <DLDeviceType XPU, typename IdType>
bool COOHasDuplicate(COOMatrix coo) {
  std::unordered_set<std::pair<IdType, IdType>, PairHash> seen;
  const IdType* src_data = static_cast<IdType*>(coo.row->data);
  const IdType* dst_data = static_cast<IdType*>(coo.col->data);
  // The entry count is int64 regardless of IdType, so index with int64_t.
  const int64_t nnz = coo.row->shape[0];
  for (int64_t eid = 0; eid < nnz; ++eid) {
    // emplace reports whether the pair was newly inserted, so a single hash
    // lookup both tests for and records the pair.
    if (!seen.emplace(src_data[eid], dst_data[eid]).second)
      return true;
  }
  return false;
}
template bool COOHasDuplicate<kDLCPU, int32_t>(COOMatrix coo);
template bool COOHasDuplicate<kDLCPU, int64_t>(COOMatrix coo);
///////////////////////////// COOGetRowNNZ /////////////////////////////
/*!
 * \brief Count the stored entries whose row index equals \a row.
 *
 * Performs a linear scan over all stored entries.
 */
template <DLDeviceType XPU, typename IdType>
int64_t COOGetRowNNZ(COOMatrix coo, int64_t row) {
  CHECK(row >= 0 && row < coo.num_rows) << "Invalid row index: " << row;
  const int64_t nnz = coo.row->shape[0];
  const IdType* entry_rows = static_cast<IdType*>(coo.row->data);
  int64_t matches = 0;
  for (int64_t e = 0; e < nnz; ++e) {
    if (entry_rows[e] != row)
      continue;
    ++matches;
  }
  return matches;
}
template int64_t COOGetRowNNZ<kDLCPU, int32_t>(COOMatrix, int64_t);
template int64_t COOGetRowNNZ<kDLCPU, int64_t>(COOMatrix, int64_t);
/*!
 * \brief Count the stored entries of every row listed in \a rows.
 *
 * \return Array of the same length, dtype and context as \a rows; element i is
 *         the number of entries in row rows[i].
 */
template <DLDeviceType XPU, typename IdType>
NDArray COOGetRowNNZ(COOMatrix coo, NDArray rows) {
  const auto num_queries = rows->shape[0];
  const IdType* query_rows = static_cast<IdType*>(rows->data);
  NDArray counts = NDArray::Empty({num_queries}, rows->dtype, rows->ctx);
  IdType* counts_data = static_cast<IdType*>(counts->data);
  // Each query is an independent scan, so the queries are spread over threads.
#pragma omp parallel for
  for (int64_t q = 0; q < num_queries; ++q) {
    counts_data[q] = COOGetRowNNZ<XPU, IdType>(coo, query_rows[q]);
  }
  return counts;
}
template NDArray COOGetRowNNZ<kDLCPU, int32_t>(COOMatrix, NDArray);
template NDArray COOGetRowNNZ<kDLCPU, int64_t>(COOMatrix, NDArray);
///////////////////////////// COOGetRowDataAndIndices /////////////////////////////
/*!
 * \brief Collect the entries stored in the given row.
 *
 * \return Pair (data, column indices) of the matching entries, in storage order.
 *         When the matrix has no data array, the position of the entry is used
 *         as its data value.
 */
template <DLDeviceType XPU, typename IdType, typename DType>
std::pair<NDArray, NDArray> COOGetRowDataAndIndices(
    COOMatrix coo, int64_t row) {
  CHECK(row >= 0 && row < coo.num_rows) << "Invalid row index: " << row;

  const int64_t nnz = coo.row->shape[0];
  const IdType* entry_rows = static_cast<IdType*>(coo.row->data);
  const IdType* entry_cols = static_cast<IdType*>(coo.col->data);
  const DType* entry_vals = nullptr;
  if (COOHasData(coo))
    entry_vals = static_cast<DType*>(coo.data->data);

  std::vector<DType> row_vals;
  std::vector<IdType> row_cols;
  for (int64_t e = 0; e < nnz; ++e) {
    if (entry_rows[e] != row)
      continue;
    row_cols.push_back(entry_cols[e]);
    row_vals.push_back(entry_vals ? entry_vals[e] : e);
  }

  return std::make_pair(NDArray::FromVector(row_vals), NDArray::FromVector(row_cols));
}
template std::pair<NDArray, NDArray>
COOGetRowDataAndIndices<kDLCPU, int32_t, int32_t>(COOMatrix, int64_t);
template std::pair<NDArray, NDArray>
COOGetRowDataAndIndices<kDLCPU, int64_t, int64_t>(COOMatrix, int64_t);
///////////////////////////// COOGetData /////////////////////////////
/*!
 * \brief Collect the data values of all entries stored at (row, col).
 *
 * \return Array with one value per matching entry, in storage order. When the
 *         matrix has no data array, the position of the entry is used instead.
 */
template <DLDeviceType XPU, typename IdType, typename DType>
NDArray COOGetData(COOMatrix coo, int64_t row, int64_t col) {
  CHECK(row >= 0 && row < coo.num_rows) << "Invalid row index: " << row;
  CHECK(col >= 0 && col < coo.num_cols) << "Invalid col index: " << col;
  std::vector<DType> ret_vec;
  const IdType* coo_row_data = static_cast<IdType*>(coo.row->data);
  const IdType* coo_col_data = static_cast<IdType*>(coo.col->data);
  const DType* data = COOHasData(coo) ? static_cast<DType*>(coo.data->data) : nullptr;
  // Index with int64_t like the sibling kernels: the entry count is an int64
  // and may not fit in a 32-bit IdType.
  const int64_t nnz = coo.row->shape[0];
  for (int64_t i = 0; i < nnz; ++i) {
    if (coo_row_data[i] == row && coo_col_data[i] == col)
      ret_vec.push_back(data ? data[i] : i);
  }
  return NDArray::FromVector(ret_vec);
}
template NDArray COOGetData<kDLCPU, int32_t, int32_t>(COOMatrix, int64_t, int64_t);
template NDArray COOGetData<kDLCPU, int64_t, int64_t>(COOMatrix, int64_t, int64_t);
///////////////////////////// COOGetDataAndIndices /////////////////////////////
/*!
 * \brief Batched lookup of (row, col) pairs in a COO matrix.
 *
 * \a rows and \a cols must have equal length, or one of them must have length 1,
 * in which case its single id is paired with every id of the other array.
 *
 * \return {rows, cols, data} of every stored entry that matches a queried pair.
 *         When the matrix has no data array, the position of the entry is used
 *         as its data value.
 */
template <DLDeviceType XPU, typename IdType, typename DType>
std::vector<NDArray> COOGetDataAndIndices(
    COOMatrix coo, NDArray rows, NDArray cols) {
  const int64_t rowlen = rows->shape[0];
  const int64_t collen = cols->shape[0];
  CHECK((rowlen == collen) || (rowlen == 1) || (collen == 1))
    << "Invalid row and col id array.";

  // A step of 0 pins the cursor on the single element of the broadcast side.
  int64_t row_step = 1;
  int64_t col_step = 1;
  if (rowlen == 1 && collen != 1)
    row_step = 0;
  if (collen == 1 && rowlen != 1)
    col_step = 0;

  const IdType* query_rows = static_cast<IdType*>(rows->data);
  const IdType* query_cols = static_cast<IdType*>(cols->data);
  const IdType* entry_rows = static_cast<IdType*>(coo.row->data);
  const IdType* entry_cols = static_cast<IdType*>(coo.col->data);
  const DType* entry_vals = nullptr;
  if (COOHasData(coo))
    entry_vals = static_cast<DType*>(coo.data->data);
  const int64_t nnz = coo.row->shape[0];

  std::vector<IdType> hit_rows, hit_cols;
  std::vector<DType> hit_vals;

  int64_t ri = 0;
  int64_t ci = 0;
  while (ri < rowlen && ci < collen) {
    const IdType row_id = query_rows[ri];
    const IdType col_id = query_cols[ci];
    CHECK(row_id >= 0 && row_id < coo.num_rows) << "Invalid row index: " << row_id;
    CHECK(col_id >= 0 && col_id < coo.num_cols) << "Invalid col index: " << col_id;
    // Every query is answered with a full scan of the stored entries.
    for (int64_t e = 0; e < nnz; ++e) {
      if (entry_rows[e] != row_id || entry_cols[e] != col_id)
        continue;
      hit_rows.push_back(row_id);
      hit_cols.push_back(col_id);
      hit_vals.push_back(entry_vals ? entry_vals[e] : e);
    }
    ri += row_step;
    ci += col_step;
  }

  return {NDArray::FromVector(hit_rows),
          NDArray::FromVector(hit_cols),
          NDArray::FromVector(hit_vals)};
}
template std::vector<NDArray> COOGetDataAndIndices<kDLCPU, int32_t, int32_t>(
COOMatrix coo, NDArray rows, NDArray cols);
template std::vector<NDArray> COOGetDataAndIndices<kDLCPU, int64_t, int64_t>(
COOMatrix coo, NDArray rows, NDArray cols);
///////////////////////////// COOTranspose /////////////////////////////
/*!
 * \brief Transpose a COO matrix.
 *
 * Only the roles of the row and column arrays (and the two dimensions) are
 * exchanged; the index and data arrays themselves are reused as they are.
 */
template <DLDeviceType XPU, typename IdType, typename DType>
COOMatrix COOTranspose(COOMatrix coo) {
  COOMatrix transposed{coo.num_cols, coo.num_rows, coo.col, coo.row, coo.data};
  return transposed;
}
template COOMatrix COOTranspose<kDLCPU, int32_t, int32_t>(COOMatrix coo);
template COOMatrix COOTranspose<kDLCPU, int64_t, int64_t>(COOMatrix coo);
///////////////////////////// COOToCSR /////////////////////////////
// Convert a COO matrix to CSR with a counting sort on the row index.
// Entries of the same row keep their relative storage order.
// complexity: time O(N + NNZ), no scratch space besides the returned arrays
// (indptr itself is used as the per-row write cursor and repaired at the end).
// NOTE(review): row indices are not range-checked here; they are assumed to be
// in [0, num_rows) -- confirm that callers guarantee this.
template <DLDeviceType XPU, typename IdType, typename DType>
CSRMatrix COOToCSR(COOMatrix coo) {
const int64_t N = coo.num_rows;
const int64_t NNZ = coo.row->shape[0];
const IdType* row_data = static_cast<IdType*>(coo.row->data);
const IdType* col_data = static_cast<IdType*>(coo.col->data);
NDArray ret_indptr = NDArray::Empty({N + 1}, coo.row->dtype, coo.row->ctx);
NDArray ret_indices = NDArray::Empty({NNZ}, coo.row->dtype, coo.row->ctx);
NDArray ret_data;
if (COOHasData(coo)) {
ret_data = NDArray::Empty({NNZ}, coo.data->dtype, coo.data->ctx);
} else {
// if no data array in the input coo, the return data array is a shuffle index.
ret_data = NDArray::Empty({NNZ}, coo.row->dtype, coo.row->ctx);
}
IdType* Bp = static_cast<IdType*>(ret_indptr->data);
IdType* Bi = static_cast<IdType*>(ret_indices->data);
// Pass 1: histogram -- Bp[r] becomes the number of entries in row r.
std::fill(Bp, Bp + N, 0);
for (int64_t i = 0; i < NNZ; ++i) {
Bp[row_data[i]]++;
}
// cumsum: exclusive prefix sum turns the counts into row start offsets.
for (int64_t i = 0, cumsum = 0; i < N; ++i) {
const IdType temp = Bp[i];
Bp[i] = cumsum;
cumsum += temp;
}
Bp[N] = NNZ;
// Pass 2: scatter each entry to the next free slot of its row. Bp[r] is
// advanced after every write, so afterwards it holds the END of row r.
for (int64_t i = 0; i < NNZ; ++i) {
const IdType r = row_data[i];
Bi[Bp[r]] = col_data[i];
if (COOHasData(coo)) {
const DType* data = static_cast<DType*>(coo.data->data);
DType* Bx = static_cast<DType*>(ret_data->data);
Bx[Bp[r]] = data[i];
} else {
// No input data: record the original position of the entry instead.
IdType* Bx = static_cast<IdType*>(ret_data->data);
Bx[Bp[r]] = i;
}
Bp[r]++;
}
// correct the indptr: shift the row-end offsets right by one slot so that
// Bp[i] is again the START of row i (Bp[0] = 0, Bp[N] = NNZ).
for (int64_t i = 0, last = 0; i <= N; ++i) {
IdType temp = Bp[i];
Bp[i] = last;
last = temp;
}
return CSRMatrix{coo.num_rows, coo.num_cols, ret_indptr, ret_indices, ret_data};
}
template CSRMatrix COOToCSR<kDLCPU, int32_t, int32_t>(COOMatrix coo);
template CSRMatrix COOToCSR<kDLCPU, int64_t, int64_t>(COOMatrix coo);
///////////////////////////// COOSliceRows /////////////////////////////
/*!
 * \brief Extract the rows in the half-open range [start, end) of a COO matrix.
 *
 * \param coo The matrix to slice.
 * \param start First row to keep (inclusive).
 * \param end One past the last row to keep.
 * \return Matrix with (end - start) rows and the original number of columns.
 *         Row ids are shifted so that \a start becomes row 0; column ids are
 *         unchanged. The data array holds the original data values, or the
 *         original entry positions when the input has no data array.
 */
template <DLDeviceType XPU, typename IdType, typename DType>
COOMatrix COOSliceRows(COOMatrix coo, int64_t start, int64_t end) {
  CHECK(start >= 0 && start < coo.num_rows) << "Invalid start row " << start;
  CHECK(end > 0 && end <= coo.num_rows) << "Invalid end row " << end;
  // A reversed range would otherwise produce a matrix with a negative row count.
  CHECK(end >= start) << "Invalid row range [" << start << ", " << end << ")";

  const IdType* coo_row_data = static_cast<IdType*>(coo.row->data);
  const IdType* coo_col_data = static_cast<IdType*>(coo.col->data);
  const DType* coo_data = COOHasData(coo) ? static_cast<DType*>(coo.data->data) : nullptr;
  const int64_t nnz = coo.row->shape[0];

  std::vector<IdType> ret_row, ret_col;
  std::vector<DType> ret_data;

  for (int64_t i = 0; i < nnz; ++i) {
    const IdType row_id = coo_row_data[i];
    const IdType col_id = coo_col_data[i];
    if (row_id < end && row_id >= start) {
      ret_row.push_back(row_id - start);
      ret_col.push_back(col_id);
      ret_data.push_back(coo_data ? coo_data[i] : i);
    }
  }
  return COOMatrix{
    end - start,
    coo.num_cols,
    NDArray::FromVector(ret_row),
    NDArray::FromVector(ret_col),
    NDArray::FromVector(ret_data)};
}
template COOMatrix COOSliceRows<kDLCPU, int32_t, int32_t>(COOMatrix, int64_t, int64_t);
template COOMatrix COOSliceRows<kDLCPU, int64_t, int64_t>(COOMatrix, int64_t, int64_t);
/*!
 * \brief Extract the rows listed in \a rows from a COO matrix.
 *
 * Row ids of the result are the new ids assigned by an IdHashMap built from
 * \a rows; column ids are unchanged. The data array holds the original data
 * values, or the original entry positions when the input has no data array.
 *
 * \return Matrix with rows->shape[0] rows and the original number of columns.
 */
template <DLDeviceType XPU, typename IdType, typename DType>
COOMatrix COOSliceRows(COOMatrix coo, NDArray rows) {
  const int64_t nnz = coo.row->shape[0];
  const IdType* entry_rows = static_cast<IdType*>(coo.row->data);
  const IdType* entry_cols = static_cast<IdType*>(coo.col->data);
  const DType* entry_vals = nullptr;
  if (COOHasData(coo))
    entry_vals = static_cast<DType*>(coo.data->data);

  std::vector<IdType> kept_rows, kept_cols;
  std::vector<DType> kept_vals;

  IdHashMap<IdType> row_relabel(rows);

  for (int64_t e = 0; e < nnz; ++e) {
    // -1 marks a row that was not requested.
    const IdType new_row = row_relabel.Map(entry_rows[e], -1);
    if (new_row == -1)
      continue;
    kept_rows.push_back(new_row);
    kept_cols.push_back(entry_cols[e]);
    kept_vals.push_back(entry_vals ? entry_vals[e] : e);
  }

  return COOMatrix{
    rows->shape[0],
    coo.num_cols,
    NDArray::FromVector(kept_rows),
    NDArray::FromVector(kept_cols),
    NDArray::FromVector(kept_vals)};
}
template COOMatrix COOSliceRows<kDLCPU, int32_t, int32_t>(COOMatrix , NDArray);
template COOMatrix COOSliceRows<kDLCPU, int64_t, int64_t>(COOMatrix , NDArray);
///////////////////////////// COOSliceMatrix /////////////////////////////
/*!
 * \brief Extract the sub-matrix given by a set of row ids and a set of column ids.
 *
 * An entry is kept only when both its row and its column were requested. Row and
 * column ids of the result are the new ids assigned by IdHashMaps built from
 * \a rows and \a cols. The data array holds the original data values, or the
 * original entry positions when the input has no data array.
 *
 * \return Matrix of shape (rows->shape[0], cols->shape[0]).
 */
template <DLDeviceType XPU, typename IdType, typename DType>
COOMatrix COOSliceMatrix(COOMatrix coo, runtime::NDArray rows, runtime::NDArray cols) {
  const int64_t nnz = coo.row->shape[0];
  const IdType* entry_rows = static_cast<IdType*>(coo.row->data);
  const IdType* entry_cols = static_cast<IdType*>(coo.col->data);
  const DType* entry_vals = nullptr;
  if (COOHasData(coo))
    entry_vals = static_cast<DType*>(coo.data->data);

  IdHashMap<IdType> row_map(rows), col_map(cols);

  std::vector<IdType> kept_rows, kept_cols;
  std::vector<DType> kept_vals;

  for (int64_t e = 0; e < nnz; ++e) {
    // -1 marks an id that was not requested; the column is only looked up
    // once the row is known to be part of the slice.
    const IdType new_row = row_map.Map(entry_rows[e], -1);
    if (new_row == -1)
      continue;
    const IdType new_col = col_map.Map(entry_cols[e], -1);
    if (new_col == -1)
      continue;
    kept_rows.push_back(new_row);
    kept_cols.push_back(new_col);
    kept_vals.push_back(entry_vals ? entry_vals[e] : e);
  }

  return COOMatrix{
    rows->shape[0],
    cols->shape[0],
    NDArray::FromVector(kept_rows),
    NDArray::FromVector(kept_cols),
    NDArray::FromVector(kept_vals)};
}
// Explicit instantiations of COOSliceMatrix for the CPU device type, with
// 32-bit and 64-bit integer id/data types.
template COOMatrix COOSliceMatrix<kDLCPU, int32_t, int32_t>(
COOMatrix coo, runtime::NDArray rows, runtime::NDArray cols);
template COOMatrix COOSliceMatrix<kDLCPU, int64_t, int64_t>(
COOMatrix coo, runtime::NDArray rows, runtime::NDArray cols);
} // namespace impl
} // namespace aten
} // namespace dgl
...@@ -7,8 +7,14 @@ ...@@ -7,8 +7,14 @@
#include <dgl/array.h> #include <dgl/array.h>
#include <dgl/packed_func_ext.h> #include <dgl/packed_func_ext.h>
#include <dgl/runtime/container.h> #include <dgl/runtime/container.h>
#include <vector>
#include <tuple>
#include <utility>
#include "../c_api_common.h" #include "../c_api_common.h"
#include "./unit_graph.h" #include "./unit_graph.h"
// TODO(BarclayII): currently CompactGraphs depends on the IdHashMap implementation, which
// only works on CPU. Should fix later to make it device-agnostic.
#include "../array/cpu/array_utils.h"
using namespace dgl::runtime; using namespace dgl::runtime;
...@@ -103,6 +109,66 @@ HeteroSubgraph EdgeSubgraphNoPreserveNodes( ...@@ -103,6 +109,66 @@ HeteroSubgraph EdgeSubgraphNoPreserveNodes(
return ret; return ret;
} }
template<typename IdType>
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
CompactGraphs(const std::vector<HeteroGraphPtr> &graphs) {
// TODO(BarclayII): check whether the node space and metagraph of each graph is the same.
// Step 1: Collect the nodes that has connections for each type.
std::vector<aten::IdHashMap<IdType>> hashmaps(graphs[0]->NumVertexTypes());
std::vector<std::vector<EdgeArray>> all_edges(graphs.size()); // all_edges[i][etype]
for (size_t i = 0; i < graphs.size(); ++i) {
const HeteroGraphPtr curr_graph = graphs[i];
const int64_t num_etypes = curr_graph->NumEdgeTypes();
for (IdType etype = 0; etype < num_etypes; ++etype) {
IdType srctype, dsttype;
std::tie(srctype, dsttype) = curr_graph->GetEndpointTypes(etype);
const EdgeArray edges = curr_graph->Edges(etype, "eid");
hashmaps[srctype].Update(edges.src);
hashmaps[dsttype].Update(edges.dst);
all_edges[i].push_back(edges);
}
}
// Step 2: Relabel the nodes for each type to a smaller ID space and save the mapping.
std::vector<IdArray> induced_nodes;
for (auto &hashmap : hashmaps)
induced_nodes.push_back(hashmap.Values());
// Step 3: Remap the edges of each graph.
std::vector<HeteroGraphPtr> new_graphs;
for (size_t i = 0; i < graphs.size(); ++i) {
std::vector<HeteroGraphPtr> rel_graphs;
const HeteroGraphPtr curr_graph = graphs[i];
const auto meta_graph = curr_graph->meta_graph();
const int64_t num_etypes = curr_graph->NumEdgeTypes();
for (IdType etype = 0; etype < num_etypes; ++etype) {
IdType srctype, dsttype;
std::tie(srctype, dsttype) = curr_graph->GetEndpointTypes(etype);
const EdgeArray &edges = all_edges[i][etype];
const IdArray mapped_rows = hashmaps[srctype].Map(edges.src, -1);
const IdArray mapped_cols = hashmaps[dsttype].Map(edges.dst, -1);
rel_graphs.push_back(UnitGraph::CreateFromCOO(
srctype == dsttype ? 1 : 2,
induced_nodes[srctype]->shape[0],
induced_nodes[dsttype]->shape[0],
mapped_rows,
mapped_cols));
}
new_graphs.push_back(CreateHeteroGraph(meta_graph, rel_graphs));
}
return std::make_pair(new_graphs, induced_nodes);
}
} // namespace } // namespace
HeteroGraph::HeteroGraph(GraphPtr meta_graph, const std::vector<HeteroGraphPtr>& rel_graphs) HeteroGraph::HeteroGraph(GraphPtr meta_graph, const std::vector<HeteroGraphPtr>& rel_graphs)
...@@ -419,6 +485,15 @@ HeteroGraphPtr CreateHeteroGraph( ...@@ -419,6 +485,15 @@ HeteroGraphPtr CreateHeteroGraph(
return HeteroGraphPtr(new HeteroGraph(meta_graph, rel_graphs)); return HeteroGraphPtr(new HeteroGraph(meta_graph, rel_graphs));
} }
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>>
CompactGraphs(const std::vector<HeteroGraphPtr> &graphs) {
std::pair<std::vector<HeteroGraphPtr>, std::vector<IdArray>> result;
ATEN_ID_TYPE_SWITCH(graphs[0]->DataType(), IdType, {
result = CompactGraphs<IdType>(graphs);
});
return result;
}
///////////////////////// C APIs ///////////////////////// ///////////////////////// C APIs /////////////////////////
DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroCreateUnitGraphFromCOO") DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroCreateUnitGraphFromCOO")
...@@ -428,7 +503,9 @@ DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroCreateUnitGraphFromCOO") ...@@ -428,7 +503,9 @@ DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroCreateUnitGraphFromCOO")
int64_t num_dst = args[2]; int64_t num_dst = args[2];
IdArray row = args[3]; IdArray row = args[3];
IdArray col = args[4]; IdArray col = args[4];
auto hgptr = UnitGraph::CreateFromCOO(nvtypes, num_src, num_dst, row, col); SparseFormat restrict_format = ParseSparseFormat(args[5]);
auto hgptr = UnitGraph::CreateFromCOO(
nvtypes, num_src, num_dst, row, col, restrict_format);
*rv = HeteroGraphRef(hgptr); *rv = HeteroGraphRef(hgptr);
}); });
...@@ -440,8 +517,9 @@ DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroCreateUnitGraphFromCSR") ...@@ -440,8 +517,9 @@ DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroCreateUnitGraphFromCSR")
IdArray indptr = args[3]; IdArray indptr = args[3];
IdArray indices = args[4]; IdArray indices = args[4];
IdArray edge_ids = args[5]; IdArray edge_ids = args[5];
SparseFormat restrict_format = ParseSparseFormat(args[6]);
auto hgptr = UnitGraph::CreateFromCSR( auto hgptr = UnitGraph::CreateFromCSR(
nvtypes, num_src, num_dst, indptr, indices, edge_ids); nvtypes, num_src, num_dst, indptr, indices, edge_ids, restrict_format);
*rv = HeteroGraphRef(hgptr); *rv = HeteroGraphRef(hgptr);
}); });
...@@ -782,6 +860,31 @@ DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroEdgeSubgraph") ...@@ -782,6 +860,31 @@ DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroEdgeSubgraph")
*rv = HeteroSubgraphRef(subg); *rv = HeteroSubgraphRef(subg);
}); });
// C API for graph compaction.
// args[0]: a list of heterograph references.
// Returns a two-element list: the list of compacted graphs, followed by the
// list of induced node arrays.
DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLCompactGraphs")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
    List<HeteroGraphRef> graph_refs = args[0];

    // Unwrap the references into graph pointers.
    std::vector<HeteroGraphPtr> graphs;
    for (HeteroGraphRef gref : graph_refs)
      graphs.push_back(gref.sptr());

    const auto result_pair = CompactGraphs(graphs);

    // Wrap the results back into runtime containers. Elements are bound by
    // const reference to avoid copying a shared pointer on every iteration.
    List<HeteroGraphRef> compacted_graph_refs;
    for (const HeteroGraphPtr &g : result_pair.first)
      compacted_graph_refs.push_back(HeteroGraphRef(g));

    List<Value> induced_nodes;
    for (const IdArray &ids : result_pair.second)
      induced_nodes.push_back(Value(MakeValue(ids)));

    List<ObjectRef> result;
    result.push_back(compacted_graph_refs);
    result.push_back(induced_nodes);

    *rv = result;
  });
// HeteroSubgraph C APIs // HeteroSubgraph C APIs
DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroSubgraphGetGraph") DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroSubgraphGetGraph")
......
...@@ -41,6 +41,10 @@ class HeteroGraph : public BaseHeteroGraph { ...@@ -41,6 +41,10 @@ class HeteroGraph : public BaseHeteroGraph {
LOG(FATAL) << "Bipartite graph is not mutable."; LOG(FATAL) << "Bipartite graph is not mutable.";
} }
/*!
 * \brief Return the data type reported by the first relation graph.
 * NOTE(review): assumes at least one relation graph exists and that all
 * relation graphs share the same data type -- confirm.
 */
DLDataType DataType() const override {
return relation_graphs_[0]->DataType();
}
DLContext Context() const override { DLContext Context() const override {
return relation_graphs_[0]->Context(); return relation_graphs_[0]->Context();
} }
......
...@@ -4,8 +4,8 @@ ...@@ -4,8 +4,8 @@
* \brief DGL sampler - templated implementation definition of random walks on CPU * \brief DGL sampler - templated implementation definition of random walks on CPU
*/ */
#ifndef DGL_GRAPH_SAMPLING_METAPATH_RANDOMWALK_H_ #ifndef DGL_GRAPH_SAMPLING_RANDOMWALKS_METAPATH_RANDOMWALK_H_
#define DGL_GRAPH_SAMPLING_METAPATH_RANDOMWALK_H_ #define DGL_GRAPH_SAMPLING_RANDOMWALKS_METAPATH_RANDOMWALK_H_
#include <dgl/array.h> #include <dgl/array.h>
#include <dgl/base_heterograph.h> #include <dgl/base_heterograph.h>
...@@ -145,4 +145,4 @@ IdArray MetapathBasedRandomWalk( ...@@ -145,4 +145,4 @@ IdArray MetapathBasedRandomWalk(
}; // namespace dgl }; // namespace dgl
#endif // DGL_GRAPH_SAMPLING_METAPATH_RANDOMWALK_H_ #endif // DGL_GRAPH_SAMPLING_RANDOMWALKS_METAPATH_RANDOMWALK_H_
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
#include <utility> #include <utility>
#include <tuple> #include <tuple>
#include <vector> #include <vector>
#include "../../c_api_common.h" #include "../../../c_api_common.h"
#include "randomwalks_impl.h" #include "randomwalks_impl.h"
using namespace dgl::runtime; using namespace dgl::runtime;
......
...@@ -4,8 +4,8 @@ ...@@ -4,8 +4,8 @@
* \brief DGL sampler - templated implementation definition of random walks on CPU * \brief DGL sampler - templated implementation definition of random walks on CPU
*/ */
#ifndef DGL_GRAPH_SAMPLING_RANDOMWALKS_CPU_H_ #ifndef DGL_GRAPH_SAMPLING_RANDOMWALKS_RANDOMWALKS_CPU_H_
#define DGL_GRAPH_SAMPLING_RANDOMWALKS_CPU_H_ #define DGL_GRAPH_SAMPLING_RANDOMWALKS_RANDOMWALKS_CPU_H_
#include <dgl/base_heterograph.h> #include <dgl/base_heterograph.h>
#include <dgl/array.h> #include <dgl/array.h>
...@@ -71,4 +71,4 @@ IdArray GenericRandomWalk( ...@@ -71,4 +71,4 @@ IdArray GenericRandomWalk(
}; // namespace dgl }; // namespace dgl
#endif // DGL_GRAPH_SAMPLING_RANDOMWALKS_CPU_H_ #endif // DGL_GRAPH_SAMPLING_RANDOMWALKS_RANDOMWALKS_CPU_H_
...@@ -4,8 +4,8 @@ ...@@ -4,8 +4,8 @@
* \brief DGL sampler - templated implementation definition of random walks * \brief DGL sampler - templated implementation definition of random walks
*/ */
#ifndef DGL_GRAPH_SAMPLING_RANDOMWALKS_IMPL_H_ #ifndef DGL_GRAPH_SAMPLING_RANDOMWALKS_RANDOMWALKS_IMPL_H_
#define DGL_GRAPH_SAMPLING_RANDOMWALKS_IMPL_H_ #define DGL_GRAPH_SAMPLING_RANDOMWALKS_RANDOMWALKS_IMPL_H_
#include <dgl/base_heterograph.h> #include <dgl/base_heterograph.h>
#include <dgl/array.h> #include <dgl/array.h>
...@@ -114,4 +114,4 @@ IdArray RandomWalkWithStepwiseRestart( ...@@ -114,4 +114,4 @@ IdArray RandomWalkWithStepwiseRestart(
}; // namespace dgl }; // namespace dgl
#endif // DGL_GRAPH_SAMPLING_RANDOMWALKS_IMPL_H_ #endif // DGL_GRAPH_SAMPLING_RANDOMWALKS_RANDOMWALKS_IMPL_H_
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment