"vscode:/vscode.git/clone" did not exist on "e333d8222bc5af8b174dad3083cf6d0fca766051"
Unverified Commit eeeb52f4 authored by Quan (Andy) Gan's avatar Quan (Andy) Gan Committed by GitHub
Browse files

[Feature] Preference to COO for "hypersparse" unit graphs & graph compaction (#1238)

* unit graph that prefers coo queries

* auto detect coo preference

* forgot some functions

* disable lint on detect_prefer_coo

* reorg

* change comment

* lint

* fix

* move array_utils.h to src

* compact graph impl

* fix redundant copying in idhashmap

* docstring

* moving preference detection to C

* lint

* fix unit test & address comments

* hypersparse autorestrict

* docstring & fix

* revert copyto and asnumbits

* fix stupid bug

* lint

* leave a TODO for sorted COO

* fixing same node type mapping to different id in different graphs

* addresses comments

* made induced nodes a feature column

* lint?
parent 828a5e5b
......@@ -71,6 +71,8 @@ class UnitGraph : public BaseHeteroGraph {
LOG(FATAL) << "UnitGraph graph is not mutable.";
}
DLDataType DataType() const override;
DLContext Context() const override;
uint8_t NumBits() const override;
......@@ -143,12 +145,13 @@ class UnitGraph : public BaseHeteroGraph {
/*! \brief Create a graph from COO arrays */
static HeteroGraphPtr CreateFromCOO(
int64_t num_vtypes, int64_t num_src, int64_t num_dst,
IdArray row, IdArray col);
IdArray row, IdArray col, SparseFormat restrict_format = SparseFormat::ANY);
/*! \brief Create a graph from (out) CSR arrays */
static HeteroGraphPtr CreateFromCSR(
int64_t num_vtypes, int64_t num_src, int64_t num_dst,
IdArray indptr, IdArray indices, IdArray edge_ids);
IdArray indptr, IdArray indices, IdArray edge_ids,
SparseFormat restrict_format = SparseFormat::ANY);
/*! \brief Convert the graph to use the given number of bits for storage */
static HeteroGraphPtr AsNumBits(HeteroGraphPtr g, uint8_t bits);
......@@ -182,11 +185,32 @@ class UnitGraph : public BaseHeteroGraph {
* \param out_csr out edge csr
* \param coo coo
*/
UnitGraph(GraphPtr metagraph, CSRPtr in_csr, CSRPtr out_csr, COOPtr coo);
UnitGraph(GraphPtr metagraph, CSRPtr in_csr, CSRPtr out_csr, COOPtr coo,
SparseFormat restrict_format = SparseFormat::ANY);
/*! \return Return any existing format. */
HeteroGraphPtr GetAny() const;
/*!
* \return Return the given format. Perform format conversion if the requested
* format does not exist.
*/
HeteroGraphPtr GetFormat(SparseFormat format) const;
/*!
* \brief Determine which format to use with a preference.
*
* If the storage of unit graph is "locked", i.e. no conversion is allowed, then
* it will return the locked format.
*
* Otherwise, it will return whatever DGL thinks is the most appropriate given
* the arguments.
*/
SparseFormat SelectFormat(SparseFormat preferred_format) const;
/*! \return Whether the graph is hypersparse */
bool IsHypersparse() const;
// Graph stored in different formats. We use an on-demand strategy: a format is
// only materialized if an operation that is suitable for it is invoked.
/*! \brief CSR graph that stores reverse edges */
......@@ -195,6 +219,14 @@ class UnitGraph : public BaseHeteroGraph {
CSRPtr out_csr_;
/*! \brief COO representation */
COOPtr coo_;
/*!
* \brief Storage format restriction.
* If it is not ANY, then conversion is not allowed for graph queries.
*
* Note that GetInCSR/GetOutCSR/GetCOO() can still be called and the conversion will
* still be done if requested explicitly (e.g. in message passing).
*/
SparseFormat restrict_format_;
};
}; // namespace dgl
......
......@@ -62,6 +62,23 @@ def create_test_heterograph2():
})
return g
def create_test_heterograph3():
    """Build a four-relation test heterograph whose relations are COO-restricted.

    Each relation graph is created with ``_restrict_format='coo'`` and the
    relation graphs are then merged with ``dgl.hetero_from_relations``.

    Relations (source type, edge type, destination type) and their edges:

    * ('user', 'follows', 'user'):          (0, 1), (1, 2)
    * ('user', 'plays', 'game'):            (0, 0), (1, 0), (2, 1), (1, 1)
    * ('user', 'wishes', 'game'):           (0, 1), (2, 0)
    * ('developer', 'develops', 'game'):    (0, 0), (1, 1)

    Returns
    -------
    The heterograph returned by ``dgl.hetero_from_relations``.
    """
    follows_g = dgl.graph(
        [(0, 1), (1, 2)], 'user', 'follows', _restrict_format='coo')
    plays_g = dgl.bipartite(
        [(0, 0), (1, 0), (2, 1), (1, 1)], 'user', 'plays', 'game', _restrict_format='coo')
    wishes_g = dgl.bipartite(
        [(0, 1), (2, 0)], 'user', 'wishes', 'game', _restrict_format='coo')
    develops_g = dgl.bipartite(
        [(0, 0), (1, 1)], 'developer', 'develops', 'game', _restrict_format='coo')
    return dgl.hetero_from_relations([follows_g, plays_g, wishes_g, develops_g])
def get_redfn(name):
return getattr(F, name)
......@@ -153,15 +170,13 @@ def test_query():
for i in range(len(etypes)):
assert g.to_canonical_etype(etypes[i]) == canonical_etypes[i]
def _test(g):
# number of nodes
assert [g.number_of_nodes(ntype) for ntype in ntypes] == [3, 2, 2]
# number of edges
assert [g.number_of_edges(etype) for etype in etypes] == [2, 4, 2, 2]
assert not g.is_multigraph
assert g.is_readonly
# has_node & has_nodes
for ntype in ntypes:
n = g.number_of_nodes(ntype)
......@@ -171,7 +186,9 @@ def test_query():
assert np.array_equal(
F.asnumpy(g.has_nodes([0, n], ntype)).astype('int32'), [1, 0])
def _test(g):
assert not g.is_multigraph
assert g.is_readonly
for etype in etypes:
srcs, dsts = edges[etype]
for src, dst in zip(srcs, dsts):
......@@ -252,6 +269,8 @@ def test_query():
_test(g)
g = create_test_heterograph1()
_test(g)
g = create_test_heterograph3()
_test(g)
etypes = canonical_etypes
edges = {
......@@ -271,10 +290,78 @@ def test_query():
_test(g)
g = create_test_heterograph1()
_test(g)
g = create_test_heterograph3()
_test(g)
# test repr
print(g)
def test_hypersparse():
    """Run structural queries on a heterograph declared with 2**50 nodes per type.

    The graph holds a single 'follows' edge (0 -> 1) and a single 'plays' edge
    (0 -> 2**48).  Node/edge counts, edge existence, neighbor lookups, edge-ID
    lookups, edge enumeration and degree queries are all checked against the
    values expected for those two edges.
    """
    def _as_list(tensor):
        # Convert a backend tensor into a plain Python list for comparison.
        return F.asnumpy(tensor).tolist()

    num_nodes = 1 << 50     # should crash if allocated a CSR
    far_node = 1 << 48
    g = dgl.heterograph(
        {
            ('user', 'follows', 'user'): [(0, 1)],
            ('user', 'plays', 'game'): [(0, far_node)],
        },
        {'user': num_nodes, 'game': num_nodes})

    # Sizes.
    assert g.number_of_nodes('user') == num_nodes
    assert g.number_of_nodes('game') == num_nodes
    assert g.number_of_edges('follows') == 1
    assert g.number_of_edges('plays') == 1

    # Edge existence.
    assert g.has_edge_between(0, 1, 'follows')
    assert not g.has_edge_between(0, 0, 'follows')
    assert _as_list(g.has_edges_between([0, 0], [0, 1], 'follows')) == [0, 1]

    assert g.has_edge_between(0, far_node, 'plays')
    assert not g.has_edge_between(0, 0, 'plays')
    assert _as_list(g.has_edges_between([0, 0], [0, far_node], 'plays')) == [0, 1]

    # Neighborhoods.
    assert _as_list(g.predecessors(0, 'follows')) == []
    assert _as_list(g.successors(0, 'follows')) == [1]
    assert _as_list(g.predecessors(1, 'follows')) == [0]
    assert _as_list(g.successors(1, 'follows')) == []

    assert _as_list(g.predecessors(0, 'plays')) == []
    assert _as_list(g.successors(0, 'plays')) == [far_node]
    assert _as_list(g.predecessors(far_node, 'plays')) == [0]
    assert _as_list(g.successors(far_node, 'plays')) == []

    # Edge-ID lookup from endpoints.
    assert g.edge_id(0, 1, etype='follows') == 0
    assert g.edge_id(0, far_node, etype='plays') == 0
    assert _as_list(g.edge_ids(0, 1, etype='follows')) == [0]
    assert _as_list(g.edge_ids(0, far_node, etype='plays')) == [0]

    # Endpoint lookup from edge IDs.
    src, dst = g.find_edges([0], 'follows')
    assert _as_list(src) == [0]
    assert _as_list(dst) == [1]
    src, dst = g.find_edges([0], 'plays')
    assert _as_list(src) == [0]
    assert _as_list(dst) == [far_node]

    # Full edge enumeration.
    src, dst, eid = g.all_edges('all', 'eid', 'follows')
    assert _as_list(src) == [0]
    assert _as_list(dst) == [1]
    assert _as_list(eid) == [0]
    src, dst, eid = g.all_edges('all', 'eid', 'plays')
    assert _as_list(src) == [0]
    assert _as_list(dst) == [far_node]
    assert _as_list(eid) == [0]

    # In-degrees.
    assert g.in_degree(0, 'follows') == 0
    assert g.in_degree(1, 'follows') == 1
    assert _as_list(g.in_degrees([0, 1], 'follows')) == [0, 1]
    assert g.in_degree(0, 'plays') == 0
    assert g.in_degree(far_node, 'plays') == 1
    assert _as_list(g.in_degrees([0, far_node], 'plays')) == [0, 1]

    # Out-degrees.
    assert g.out_degree(0, 'follows') == 1
    assert g.out_degree(1, 'follows') == 0
    assert _as_list(g.out_degrees([0, 1], 'follows')) == [1, 0]
    assert g.out_degree(0, 'plays') == 1
    assert g.out_degree(far_node, 'plays') == 0
    assert _as_list(g.out_degrees([0, far_node], 'plays')) == [1, 0]
def test_adj():
g = create_test_heterograph()
adj = F.sparse_to_numpy(g.adj(etype='follows'))
......@@ -693,7 +780,7 @@ def test_convert():
for _mg in [None, mg]:
hg2 = dgl.to_hetero(
g, ['user', 'game', 'developer'], ['follows', 'plays', 'wishes', 'develops'],
g, hg.ntypes, hg.etypes,
ntype_field=dgl.NTYPE, etype_field=dgl.ETYPE, metagraph=_mg)
assert set(hg.ntypes) == set(hg2.ntypes)
assert set(hg.canonical_etypes) == set(hg2.canonical_etypes)
......@@ -769,8 +856,9 @@ def test_subgraph():
g.edges['follows'].data['h'] = y
def _check_subgraph(g, sg):
assert sg.ntypes == ['user', 'game', 'developer']
assert sg.etypes == ['follows', 'plays', 'wishes', 'develops']
assert sg.ntypes == g.ntypes
assert sg.etypes == g.etypes
assert sg.canonical_etypes == g.canonical_etypes
assert F.array_equal(F.tensor(sg.nodes['user'].data[dgl.NID]),
F.tensor([1, 2], F.int64))
assert F.array_equal(F.tensor(sg.nodes['game'].data[dgl.NID]),
......@@ -1278,9 +1366,56 @@ def test_empty_heterograph():
assert g.number_of_edges('develops') == 2
assert g.number_of_nodes('developer') == 2
def test_compact():
    """Check ``dgl.compact_graphs`` on a single graph and on a list of two graphs.

    For each case the induced node IDs stored under ``dgl.NID`` are compared
    against the expected set of nodes, and every edge of the compacted graph is
    mapped back through those IDs and compared with the original edge.
    """
    g1 = dgl.heterograph(
        {
            ('user', 'follow', 'user'): [(1, 3), (3, 5)],
            ('user', 'plays', 'game'): [(2, 4), (3, 4), (2, 5)],
            ('game', 'wished-by', 'user'): [(6, 7), (5, 7)],
        },
        {'user': 20, 'game': 10})

    g2 = dgl.heterograph(
        {
            ('game', 'clicked-by', 'user'): [(3, 1)],
            ('user', 'likes', 'user'): [(1, 8), (8, 9)],
        },
        {'user': 20, 'game': 10})

    def _induced_nodes_of(graph):
        # Per-node-type numpy array of the IDs stored in the dgl.NID column.
        nid_columns = {
            ntype: graph.nodes[ntype].data[dgl.NID] for ntype in graph.ntypes}
        return {ntype: F.asnumpy(col) for ntype, col in nid_columns.items()}

    def _check(g, new_g, induced_nodes):
        # The compacted graph must keep the same type schema.
        assert g.ntypes == new_g.ntypes
        assert g.canonical_etypes == new_g.canonical_etypes

        # No induced-node array may contain -1.
        assert all(-1 not in induced_nodes[ntype] for ntype in g.ntypes)

        # Mapping compacted endpoints through the induced IDs must reproduce
        # the original endpoints, edge by edge in edge-ID order.
        for etype in g.canonical_etypes:
            src_ids = induced_nodes[etype[0]]
            dst_ids = induced_nodes[etype[2]]

            orig_src, orig_dst = g.all_edges(order='eid', etype=etype)
            orig_src = F.asnumpy(orig_src)
            orig_dst = F.asnumpy(orig_dst)

            compact_src, compact_dst = new_g.all_edges(order='eid', etype=etype)
            restored_src = src_ids[F.asnumpy(compact_src)]
            restored_dst = dst_ids[F.asnumpy(compact_dst)]

            assert (orig_src == restored_src).all()
            assert (orig_dst == restored_dst).all()

    # Compacting a single graph.
    new_g1 = dgl.compact_graphs(g1)
    induced_nodes = _induced_nodes_of(new_g1)
    assert set(induced_nodes['user']) == {1, 2, 3, 5, 7}
    assert set(induced_nodes['game']) == {4, 5, 6}
    _check(g1, new_g1, induced_nodes)

    # Compacting two graphs together; both are checked against the IDs read
    # from the first compacted graph.
    new_g1, new_g2 = dgl.compact_graphs([g1, g2])
    induced_nodes = _induced_nodes_of(new_g1)
    assert set(induced_nodes['user']) == {1, 2, 3, 5, 7, 8, 9}
    assert set(induced_nodes['game']) == {3, 4, 5, 6}
    _check(g1, new_g1, induced_nodes)
    _check(g2, new_g2, induced_nodes)
if __name__ == '__main__':
test_create()
test_query()
test_hypersparse()
test_adj()
test_inc()
test_view()
......@@ -1297,3 +1432,4 @@ if __name__ == '__main__':
test_updates()
test_backward()
test_empty_heterograph()
test_compact()
......@@ -9,7 +9,7 @@ MD build
PUSHD build
CALL "C:\Program Files (x86)\Microsoft Visual Studio\2017\BuildTools\VC\Auxiliary\Build\vcvars64.bat"
cmake -DCMAKE_CXX_FLAGS="/DDGL_EXPORTS" -DUSE_OPENMP=ON -Dgtest_force_shared_crt=ON -DDMLC_FORCE_SHARED_CRT=ON -DBUILD_CPP_TEST=1 -DCMAKE_CONFIGURATION_TYPES="Release" .. -G "Visual Studio 15 2017 Win64" || EXIT /B 1
msbuild dgl.sln || EXIT /B 1
msbuild dgl.sln /m || EXIT /B 1
COPY Release\dgl.dll .
COPY Release\runUnitTests.exe .
POPD
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment