Commit 2883eda6 authored by Minjie Wang's avatar Minjie Wang

[DOC] initial docs

parents c468e068 1209978d
from collections.abc import MutableMapping
import dgl.backend as F
class DGLArray(MutableMapping):
def __init__(self):
pass
    def __delitem__(self, key):
        raise NotImplementedError()
def __getitem__(self, key):
"""
        If the key is a DGLArray of identical length, this function performs a
        logical filter: i.e., it subselects all the elements in this array
        where the corresponding value in the other array evaluates to true.
        If the key is an integer, this returns a single row of
        the DGLArray. If the key is a slice, this returns a DGLArray with the
        sliced rows.
"""
raise NotImplementedError()
def __iter__(self):
raise NotImplementedError()
def __len__(self):
raise NotImplementedError()
def __setitem__(self, key, value):
raise NotImplementedError()
class DGLDenseArray(DGLArray):
def __init__(self, data, applicable=None):
"""
Parameters
----------
data : list or tensor
"""
if type(data) is list:
raise NotImplementedError()
elif isinstance(data, F.Tensor):
self._data = data
if applicable is None:
self._applicable = F.ones(F.shape(data)[0], dtype=F.bool) # TODO: device
else:
assert isinstance(applicable, F.Tensor)
assert F.device(applicable) == F.device(data)
assert F.isboolean(applicable)
a_shape = F.shape(applicable)
assert len(a_shape) == 1
assert a_shape[0] == F.shape(data)[0]
self._applicable = applicable
def __getitem__(self, key):
"""
        If the key is a DGLDenseArray of identical length, this function performs a
        logical filter: i.e., it subselects all the elements in this array
        where the corresponding value in the other array evaluates to true.
        If the key is an integer, this returns a single row of
        the DGLArray. If the key is a slice, this returns a DGLArray with the
        sliced rows.
"""
if type(key) is DGLDenseArray:
if type(key._data) is list:
raise NotImplementedError()
elif type(key._data) is F.Tensor:
if type(self._data) is F.Tensor:
shape = F.shape(key._data)
assert len(shape) == 1
assert shape[0] == F.shape(self._data)[0]
assert F.dtype(key._data) is F.bool
data = self._data[key._data]
return DGLDenseArray(data)
else:
raise NotImplementedError()
else:
raise RuntimeError()
elif type(key) is int:
return self._data[key]
elif type(key) is slice:
raise NotImplementedError()
else:
raise RuntimeError()
def __iter__(self):
return iter(range(len(self)))
def __len__(self):
if type(self._data) is F.Tensor:
return F.shape(self._data)[0]
elif type(self._data) is list:
return len(self._data)
else:
raise RuntimeError()
def __setitem__(self, key, value):
if type(key) is int:
if type(self._data) is list:
raise NotImplementedError()
elif type(self._data) is F.Tensor:
assert isinstance(value, F.Tensor)
assert F.device(value) == F.device(self._data)
assert F.dtype(value) == F.dtype(self._data)
# TODO(gaiyu): shape
x = []
if key > 0:
x.append(self._data[:key])
x.append(F.expand_dims(value, 0))
if key < F.shape(self._data)[0] - 1:
x.append(self._data[key + 1:])
self._data = F.concatenate(x)
else:
raise RuntimeError()
        elif type(key) is DGLDenseArray:
            shape = F.shape(key._data)
            assert len(shape) == 1
            assert shape[0] == F.shape(self._data)[0]
            assert F.isboolean(key._data)
            # TODO: masked assignment is not implemented yet; fail loudly
            # instead of silently dropping the update.
            raise NotImplementedError()
elif type(key) is DGLSparseArray:
raise NotImplementedError()
else:
raise RuntimeError()
def _listize(self):
raise NotImplementedError()
def _tensorize(self):
raise NotImplementedError()
def append(self, other):
        assert isinstance(other, DGLDenseArray)
if self.shape is None:
return other
elif other.shape is None:
return self
else:
assert self.shape[1:] == other.shape[1:]
data = F.concatenate([self.data, other.data])
return DGLDenseArray(data)
    @property
    def applicable(self):
        return self._applicable
    @property
    def data(self):
        return self._data
    @property
    def shape(self):  # assumed helper; referenced by append() and DGLFrame
        return F.shape(self._data) if isinstance(self._data, F.Tensor) else None
    def device(self):  # assumed helper; referenced by DGLFrame.device()
        return F.device(self._data) if isinstance(self._data, F.Tensor) else None
def dropna(self):
if type(self._data) is list:
raise NotImplementedError()
elif isinstance(self._data, F.Tensor):
data = F.index_by_bool(self._data, self._applicable)
return DGLDenseArray(data)
else:
raise RuntimeError()
class DGLSparseArray(DGLArray):
def __init__(self):
raise NotImplementedError()
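# --- Usage sketch (illustrative only, not part of the API) ---
# The boolean-filter semantics documented in DGLDenseArray.__getitem__ and
# dropna(), assuming the backend exposes a tensor constructor (the name
# F.tensor below is hypothetical):
#
#   ages = DGLDenseArray(F.tensor([10, 25, 31]))
#   mask = DGLDenseArray(F.tensor([True, False, True]))
#   adults = ages[mask]      # keeps only the rows where the mask is True
#   clean = ages.dropna()    # keeps only the rows whose `applicable` flag is True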
from dgl.array import DGLArray, DGLDenseArray, DGLSparseArray
import dgl.backend as F
def _gridize(frame, key_column_names, src_column_name):
if type(key_column_names) is str:
key_column = frame[key_column_names]
assert F.prod(key_column.applicable)
if type(key_column) is DGLDenseArray:
row = key_column.data
if type(row) is F.Tensor:
assert F.isinteger(row) and len(F.shape(row)) == 1
col = F.unique(row)
xy = (F.expand_dims(row, 1) == F.expand_dims(col, 0))
if src_column_name:
src_column = frame[src_column_name]
if type(src_column) is DGLDenseArray:
z = src_column.data
if type(z) is F.Tensor:
z = F.expand_dims(z, 1)
for i in range(2, len(F.shape(z))):
xy = F.expand_dims(xy, i)
xy = F.astype(xy, F.dtype(z))
return col, xy * z
elif type(z) is list:
raise NotImplementedError()
else:
raise RuntimeError()
else:
return col, xy
elif type(row) is list:
raise NotImplementedError()
else:
raise RuntimeError()
else:
raise NotImplementedError()
elif type(key_column_names) is list:
raise NotImplementedError()
else:
raise RuntimeError()
def aggregator(src_column_name=''):
def decorator(a):
def decorated(frame, key_column_names):
col, xy = _gridize(frame, key_column_names, src_column_name)
trg_column_name = src_column_name + a.__name__
key = DGLDenseArray(col)
trg = DGLDenseArray(a(xy))
return {key_column_names : key, trg_column_name : trg}
return decorated
return decorator
def COUNT():
@aggregator()
def count(x):
return F.sum(x, 0)
return count
def SUM(src_column_name):
@aggregator(src_column_name)
def sum(x):
return F.sum(x, 0)
return sum
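# --- Usage sketch (illustrative only) ---
# How these aggregators are meant to compose with DGLFrame.groupby, assuming
# a frame with an integer 'src' column and a float 'weight' column (column
# names here are hypothetical):
#
#   frame.groupby('src', {'deg': COUNT()})
#       # -> DGLFrame with columns 'src' (unique ids) and 'count' (rows per id)
#   frame.groupby('src', {'w': SUM('weight')})
#       # -> DGLFrame with columns 'src' and 'weightsum' (per-id sum of 'weight')
#
# _gridize broadcasts the key column against its unique values to build a
# boolean (num_rows x num_keys) grid; summing the grid over axis 0 gives
# per-key counts, and multiplying it by the source column first gives sums.
# Note: in this initial version the output column name comes from the
# aggregator (src_column_name + function name), not from the dict key.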
import dgl.backend as F
class DGLArray:
def __init__(self):
pass
def __getitem__(self, x):
raise NotImplementedError()
class DGLDenseArray(DGLArray):
def __init__(self):
pass
class DGLSparseArray(DGLArray):
    def __init__(self, data):
raise NotImplementedError()
from dgl.array import DGLArray, DGLDenseArray, DGLSparseArray
import dgl.backend as F
from collections.abc import MutableMapping
from functools import reduce
from itertools import dropwhile
import operator
class DGLFrame(MutableMapping):
def __init__(self, data=None):
self._columns = {}
if data is None:
pass
elif isinstance(data, dict):
for key, value in data.items():
device = self.device()
if device:
assert value.device() == device
if type(value) is DGLDenseArray:
num_rows = self.num_rows()
if num_rows:
assert value.shape[0] == num_rows
self._columns[key] = value
else:
raise NotImplementedError()
    def __copy__(self):
        return DGLFrame(self._columns.copy())
def __delitem__(self, key):
"""
"""
del self._columns[key]
def __getitem__(self, key):
"""
This method does things based on the type of `key`.
If `key` is:
* str
selects column with name 'key'
* type
selects all columns with types matching the type
* list of str or type
selects all columns with names or type in the list
* DGLArray
            Performs a logical filter. Expects the given DGLArray to be the
            same length as all columns in the current DGLFrame. Every row
            corresponding to an entry in the given DGLArray that evaluates
            to False is filtered from the result.
          * int
            Returns a single row of the DGLFrame (the `key`th one) as a dictionary.
          * slice
            Returns a DGLFrame including only the sliced rows.
"""
if type(key) is str:
return self._columns[key]
elif type(key) is type:
raise NotImplementedError()
elif type(key) is list:
raise NotImplementedError()
elif type(key) is DGLDenseArray:
return DGLFrame({k : v[key] for k, v in self._columns.items()})
elif type(key) is int:
return {k : v[key] for k, v in self._columns.items()}
elif type(key) is slice:
raise NotImplementedError()
else:
raise RuntimeError()
def __iter__(self):
return iter(self._columns.keys())
def __len__(self):
return len(self._columns)
def __setitem__(self, key, value):
"""
A wrapper around add_column(s). Key can be either a list or a str. If
        value is a DGLArray, it is added to the DGLFrame as a column. If it is a
constant value (int, str, or float), then a column is created where
every entry is equal to the constant value. Existing columns can also
be replaced using this wrapper.
"""
if type(key) is str:
if type(value) is DGLDenseArray:
                assert self.num_rows() is None or value.shape[0] == self.num_rows()
self._columns[key] = value
elif type(value) is DGLSparseArray:
raise NotImplementedError()
else:
raise RuntimeError()
elif type(key) is list:
raise NotImplementedError()
else:
raise RuntimeError()
def _next_dense_column(self):
if self._columns:
            predicate = lambda x: type(x) is not DGLDenseArray
try:
return next(dropwhile(predicate, self._columns.values()))
except StopIteration:
return None
else:
return None
def append(self, other):
"""
        Add the rows of a DGLFrame to the end of this DGLFrame.
Both DGLFrames must have the same set of columns with the same column
names and column types.
Parameters
----------
other : DGLFrame
Another DGLFrame whose rows are appended to the current DGLFrame.
Returns
-------
out : DGLFrame
The result DGLFrame from the append operation.
"""
        assert isinstance(other, DGLFrame)
        assert set(self._columns) == set(other._columns)
        if not self.num_rows():
            return other.__copy__()
        elif not other.num_rows():
            return self.__copy__()
        else:
            return DGLFrame({k : v.append(other[k]) for k, v in self._columns.items()})
def device(self):
dense_column = self._next_dense_column()
return None if dense_column is None else dense_column.device()
def dropna(self, columns=None, how='any'):
columns = list(self._columns) if columns is None else columns
assert type(columns) is list
assert len(columns) > 0
column_list = [self._columns[x] for x in columns]
if all(type(x) is DGLDenseArray for x in column_list):
a_list = [x.applicable for x in column_list]
if how == 'any':
a = reduce(operator.mul, a_list)
elif how == 'all':
a = (reduce(operator.add, a_list) > 0)
else:
raise RuntimeError()
a_array = DGLDenseArray(a)
return DGLFrame({k : v[a_array] for k, v in self._columns.items()})
else:
raise NotImplementedError()
def filter_by(self, values, column_name, exclude=False):
"""
        Filter a DGLFrame by values inside an iterable object. The result is a
        DGLFrame that only includes (or excludes) the rows that have a column
        with the given ``column_name`` which holds one of the values in the
        given ``values`` DGLArray. If ``values`` is not a
        DGLArray, we attempt to convert it to one before filtering.
Parameters
----------
values : DGLArray | list | numpy.ndarray | pandas.Series | str
The values to use to filter the DGLFrame. The resulting DGLFrame will
only include rows that have one of these values in the given
column.
column_name : str
The column of the DGLFrame to match with the given `values`.
exclude : bool
If True, the result DGLFrame will contain all rows EXCEPT those that
have one of ``values`` in ``column_name``.
Returns
-------
out : DGLFrame
The filtered DGLFrame.
"""
        if type(values) is DGLDenseArray:
            mask = F.isin(self._columns[column_name].data, values.data)
            if exclude:
                mask = 1 - mask
            return self[DGLDenseArray(mask)]
else:
raise NotImplementedError()
def groupby(self, key_column_names, operations, *args):
"""
Perform a group on the key_column_names followed by aggregations on the
columns listed in operations.
The operations parameter is a dictionary that indicates which
aggregation operators to use and which columns to use them on. The
        available operators are SUM, MAX, MIN, COUNT, AVG, VAR, STDV, CONCAT,
        SELECT_ONE, ARGMIN, ARGMAX, and QUANTILE (only COUNT and SUM are
        implemented in this initial version). For convenience, aggregators
        MEAN, STD, and VARIANCE are available as synonyms for AVG, STDV, and
        VAR. See the aggregate helpers for more detail on the aggregators.
Parameters
----------
key_column_names : string | list[string]
Column(s) to group by. Key columns can be of any type other than
dictionary.
operations : dict, list
Dictionary of columns and aggregation operations. Each key is a
output column name and each value is an aggregator. This can also
be a list of aggregators, in which case column names will be
automatically assigned.
*args
All other remaining arguments will be interpreted in the same
way as the operations argument.
Returns
-------
out_sf : DGLFrame
A new DGLFrame, with a column for each groupby column and each
aggregation operation.
See Also
--------
aggregate
Notes
-----
        * Numeric aggregators (such as sum, mean, stdev etc.) follow the skip
        None policy, i.e., they will omit all missing values from the aggregation.
        As an example, `sum([None, 5, 10]) = 15` because the `None` value is
        skipped.
        * Aggregators have a default value when no values (after skipping all
        `None` values) are present. Default values are `None` for ['ARGMAX',
        'ARGMIN', 'AVG', 'STD', 'MEAN', 'MIN', 'MAX'], `0` for ['COUNT',
        'COUNT_DISTINCT', 'DISTINCT'], `[]` for ['CONCAT', 'QUANTILE',
        'DISTINCT'], and `{}` for 'FREQ_COUNT'.
"""
if type(key_column_names) is str:
if type(operations) is list:
raise NotImplementedError()
elif type(operations) is dict:
if len(operations) == 1:
                dst_column_name, = operations.keys()
aggregator, = operations.values()
return DGLFrame(aggregator(self, key_column_names))
else:
raise NotImplementedError()
else:
raise RuntimeError()
else:
raise NotImplementedError()
def join(self, right, on=None, how='inner'):
"""
Merge two DGLFrames. Merges the current (left) DGLFrame with the given
(right) DGLFrame using a SQL-style equi-join operation by columns.
Parameters
----------
right : DGLFrame
The DGLFrame to join.
on : None | str | list | dict, optional
The column name(s) representing the set of join keys. Each row that
has the same value in this set of columns will be merged together.
* If 'None' is given, join will use all columns that have the same
name as the set of join keys.
* If a str is given, this is interpreted as a join using one column,
where both DGLFrames have the same column name.
* If a list is given, this is interpreted as a join using one or
more column names, where each column name given exists in both
DGLFrames.
* If a dict is given, each dict key is taken as a column name in the
left DGLFrame, and each dict value is taken as the column name in
right DGLFrame that will be joined together. e.g.
{'left_col_name':'right_col_name'}.
how : {'left', 'right', 'outer', 'inner'}, optional
The type of join to perform. 'inner' is default.
* inner: Equivalent to a SQL inner join. Result consists of the
rows from the two frames whose join key values match exactly,
merged together into one DGLFrame.
* left: Equivalent to a SQL left outer join. Result is the union
between the result of an inner join and the rest of the rows from
the left DGLFrame, merged with missing values.
* right: Equivalent to a SQL right outer join. Result is the union
between the result of an inner join and the rest of the rows from
the right DGLFrame, merged with missing values.
* outer: Equivalent to a SQL full outer join. Result is
the union between the result of a left outer join and a right
outer join.
Returns
-------
out : DGLFrame
"""
assert type(right) == DGLFrame
if on is None:
raise NotImplementedError()
elif type(on) is str:
assert set(self._columns).intersection(set(right._columns)) == {on}
elif type(on) is list:
raise NotImplementedError()
elif type(on) is dict:
raise NotImplementedError()
else:
raise RuntimeError()
if how == 'left':
raise NotImplementedError()
elif how == 'right':
raise NotImplementedError()
elif how == 'outer':
raise NotImplementedError()
elif how == 'inner':
lhs = self._columns[on]
rhs = right._columns[on]
if type(lhs) is DGLDenseArray and type(rhs) is DGLDenseArray:
if isinstance(lhs.data, F.Tensor) and isinstance(rhs.data, F.Tensor) and \
len(F.shape(lhs.data)) == 1 and len(F.shape(rhs.data)) == 1:
assert F.prod(lhs.applicable) and F.prod(rhs.applicable)
                    isin = F.isin(lhs.data, rhs.data)
                    columns = {k : v[DGLDenseArray(isin)] for k, v in self._columns.items()}
                    # TODO: rows of `right` still need to be re-ordered to match
                    # the surviving left rows before they can be merged in.
                    columns.update({k : v for k, v in right._columns.items() if k != on})
                    return DGLFrame(columns)
else:
raise NotImplementedError()
else:
raise NotImplementedError()
else:
raise RuntimeError()
def num_rows(self):
dense_column = self._next_dense_column()
return None if dense_column is None else dense_column.shape[0]
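# --- Usage sketch (illustrative only, not part of the API) ---
# Putting the frame pieces together, assuming a backend tensor constructor
# (the names F.tensor, mask, and COUNT usage below are hypothetical):
#
#   f = DGLFrame({'id': DGLDenseArray(F.tensor([0, 1, 2])),
#                 'feat': DGLDenseArray(F.tensor([[1.], [2.], [3.]]))})
#   f['feat']                        # column access by name
#   f[1]                             # a single row, returned as a dict
#   f[DGLDenseArray(mask)]           # boolean row filter -> new DGLFrame
#   f.dropna(how='any')              # drop rows with any inapplicable value
#   f.groupby('id', {'n': COUNT()})  # grouped aggregation (see the aggregators above)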
class NodeDictOverlay(MutableMapping):
def __init__(self, frame):
self._frame = frame
@property
def num_nodes(self):
return self._frame.num_rows()
def add_nodes(self, nodes, attrs):
# NOTE: currently `nodes` are not used. Users need to make sure
# the node ids are continuous ids from 0.
# NOTE: this is a good place to hook any graph mutation logic.
        self._frame = self._frame.append(attrs)
def delete_nodes(self, nodes):
# NOTE: this is a good place to hook any graph mutation logic.
        raise NotImplementedError('Deleting nodes from the graph is currently not supported.')
def get_node_attrs(self, nodes, key):
if nodes == ALL:
# get the whole column
return self._frame[key]
else:
# TODO(minjie): should not rely on tensor's __getitem__ syntax.
return utils.id_type_dispatch(
nodes,
lambda nid : self._frame[key][nid],
lambda id_array : self._frame[key][id_array])
def set_node_attrs(self, nodes, key, val):
if nodes == ALL:
# replace the whole column
self._frame[key] = val
else:
# TODO(minjie): should not rely on tensor's __setitem__ syntax.
            utils.id_type_dispatch(
                nodes,
                lambda nid : self._frame[key].__setitem__(nid, val),
                lambda id_array : self._frame[key].__setitem__(id_array, val))
def __getitem__(self, nodes):
def _check_one(nid):
if nid >= self.num_nodes:
raise KeyError
def _check_many(id_array):
if F.max(id_array) >= self.num_nodes:
raise KeyError
utils.id_type_dispatch(nodes, _check_one, _check_many)
        return utils.MutableLazyDict(
            lambda key: self.get_node_attrs(nodes, key),
            lambda key, val: self.set_node_attrs(nodes, key, val),
            self._frame.schemes)
def __setitem__(self, nodes, attrs):
# Happens when adding new nodes in the graph.
self.add_nodes(nodes, attrs)
def __delitem__(self, nodes):
# Happens when deleting nodes in the graph.
self.delete_nodes(nodes)
def __len__(self):
return self.num_nodes
def __iter__(self):
raise NotImplementedError()
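# --- Usage sketch (illustrative only) ---
# With a frame holding one column per node attribute, the overlay exposes a
# dict-like view per node (the attribute name 'h' below is hypothetical):
#
#   nodes = NodeDictOverlay(frame)
#   h = nodes[3]['h']       # reads frame['h'][3] lazily
#   nodes[3]['h'] = new_h   # writes back through set_node_attrs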
class AdjOuterOverlay(MutableMapping):
"""
TODO: Replace this with a more efficient dict structure.
TODO: Batch graph mutation is not supported.
"""
def __init__(self):
self._adj = {}
def __setitem__(self, u, inner_dict):
self._adj[u] = inner_dict
def __getitem__(self, u):
def _check_one(nid):
if nid not in self._adj:
raise KeyError
def _check_many(id_array):
pass
utils.id_type_dispatch(u, _check_one, _check_many)
        return utils.id_type_dispatch(
            u,
            lambda nid : self._adj[nid],
            lambda id_array : [self._adj[nid] for nid in id_array])
    def __delitem__(self, u):
        # Edge deletion is not supported yet.
        raise NotImplementedError('Deleting edges from the graph is currently not supported.')
    def __iter__(self):
        return iter(self._adj)
    def __len__(self):
        return len(self._adj)
class AdjInnerOverlay(dict):
"""TODO: replace this with a more efficient dict structure."""
def __setitem__(self, v, attrs):
pass
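# --- Note on assumed helpers (not defined in this commit) ---
# The overlays above rely on `ALL` (a sentinel meaning "all nodes") and on
# `utils.id_type_dispatch(ids, on_one, on_many)`, which is assumed to invoke
# `on_one(ids)` for a single node id and `on_many(ids)` for an id array.
# A minimal sketch of such a dispatcher:
#
#   def id_type_dispatch(ids, on_one, on_many):
#       return on_one(ids) if isinstance(ids, int) else on_many(ids)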
gt_bench
igraph_bench
nx_bench
sp_bench
dgl_bench
pgp.xml
# Profiling the performance of graph-tool, igraph and NetworkX
## Settings
We profiled graph-tool 2.26, igraph 0.7.1 and NetworkX 2.1 (Python 3.6) on an Intel(R) Xeon(R) CPU E5-2620 v3 @ 2.40GHz with 6 cores.
Benchmark scripts and sources are obtained from:
https://graph-tool.skewed.de/performance
The graph used in profiling is the strongly connected component of the PGP web of trust network circa November 2009. It is a directed graph, with 39,796 vertices and 301,498 edges.
We did not use the betweenness benchmark due to the time it takes (10+ hours for NetworkX).
For the PageRank benchmark of DGL (PageRank with SpMV), we assumed tensorized storage of the adjacency matrix and the degree vector.
DGL was profiled with both CPU-only and GPU-enabled TensorFlow (version 1.8.0).
DGL (CPU) was profiled on the same CPU as graph-tool, igraph and NetworkX, and DGL (GPU) was profiled on a GeForce GTX TITAN X.
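The update used is the standard SpMV power iteration; for reference, a minimal NumPy sketch of that formulation is shown below (the function and variable names are ours and not part of any benchmarked code; the actual TensorFlow benchmark is in `dgl_bench.py`).
```python
import numpy as np

def pagerank_spmv(adj, alpha=0.85, tol=1e-3, max_iter=1000):
    """Power-iteration PageRank; `adj` is a (scipy.sparse) adjacency matrix."""
    n = adj.shape[0]
    deg = np.asarray(adj.sum(axis=1)).ravel()  # out-degrees
    deg[deg == 0] = 1.0                        # avoid division by zero for sink nodes
    p = np.full(n, 1.0 / n)
    for _ in range(max_iter):
        # one SpMV per iteration: push rank along out-edges, then teleport
        new_p = alpha * adj.T.dot(p / deg) + (1.0 - alpha) / n
        err = np.abs(new_p - p).sum()
        p = new_p
        if err < tol * n:
            break
    return p
```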
## Profiling
```
bash pgp.sh
bash profile.sh
```
## Results
Average time per call:
| Algorithm | graph-tool | igraph | NetworkX | DGL (CPU) | DGL (GPU) |
| --------------------------- | ---------- | ------- | -------- | --------- | --------- |
| Single-source shortest path | 0.004 s | 0.017 s | 0.484 s | N/A | N/A |
| PageRank | 0.009 s | 0.239 s | 4.207 s | 0.040 s | 0.081 s |
| K-core | 0.010 s | 0.029 s | 1.178 s | N/A | N/A |
| Minimum spanning tree | 0.022 s | 0.030 s | 1.961 s | N/A | N/A |
Breakdown of a single call to NetworkX's `pagerank_scipy`:
| Operation | Time |
| ------------------------- | ------- |
| nx.to_scipy_sparse_matrix | 1.34 s |
| np.ndarray to dict | 0.01 s |
| Power iteration | 0.001 s |
| PageRank (total) | 1.35 s |

Almost all of the time goes into converting the NetworkX graph into a SciPy sparse matrix; the power iteration itself is negligible once the adjacency matrix is tensorized.
#!/usr/bin/env python
from graph_tool.all import *
import cProfile
g = load_graph("pgp.xml")
print("Profiling shortest path")
print("=======================")
print()
cProfile.run("for i in range(1000): shortest_distance(g, g.vertex(0))", sort="cumulative")
print("Profiling PageRank")
print("==================")
print()
cProfile.run("for i in range(100): pagerank(g, damping=0.85, epsilon=1e-3)", sort="cumulative")
print("Profiling k-core")
print("================")
print()
cProfile.run("for i in range(1000): kcore_decomposition(g)", sort="cumulative")
# cProfile.run("for i in range(1000): kcore_decomposition(g, deg='total')", sort="cumulative")
print("Profiling minimum spanning tree")
print("===============================")
print()
cProfile.run("for i in range(1000): min_spanning_tree(g)", sort="cumulative")
'''
print("Profiling betweenness")
print("=====================")
print()
cProfile.run("for i in range(3): betweenness(g)", sort="cumulative")
'''
echo " ncalls tottime percall cumtime percall filename:lineno(function)"
cat gt_bench | grep "__init__.py:1501(shortest_distance)"
cat gt_bench | grep "__init__.py:62(pagerank)"
cat gt_bench | grep "__init__.py:1427(kcore_decomposition)"
cat gt_bench | grep "__init__.py:627(min_spanning_tree)"
#!/usr/bin/env python
from igraph import *
import cProfile
g = Graph.Read_GraphML("pgp.xml")
print("Profiling shortest path")
print("=======================")
print()
cProfile.run("for i in range(1000): g.shortest_paths([g.vs[0]])", sort="cumulative")
print("Profiling PageRank")
print("==================")
print()
cProfile.run("for i in range(100): g.pagerank(damping=0.85)", sort="cumulative")
print("Profiling k-core")
print("================")
print()
cProfile.run("for i in range(1000): g.coreness(mode='all')", sort="cumulative")
print("Profiling minimum spanning tree")
print("===============================")
print()
cProfile.run("for i in range(1000): g.spanning_tree()", sort="cumulative")
'''
print("Profiling betweenness")
print("=====================")
print()
cProfile.run("for i in range(3): g.betweenness(); g.edge_betweenness()", sort="cumulative")
'''
echo " ncalls tottime percall cumtime percall filename:lineno(function)"
cat igraph_bench | grep "{method 'shortest_paths' of 'igraph.Graph' objects}"
cat igraph_bench | grep "{method 'personalized_pagerank' of 'igraph.Graph' objects}"
cat igraph_bench | grep "{method 'coreness' of 'igraph.Graph' objects}"
cat igraph_bench | grep "{method 'subgraph_edges' of 'igraph.Graph' objects}"
#!/usr/bin/env python
from __future__ import print_function
from networkx import *
import cProfile
g = read_graphml("pgp.xml")
print("Profiling shortest path")
print("=======================")
print()
cProfile.run("for i in range(100): shortest_path_length(g, 'n0')", sort="cumulative")
print("Profiling PageRank")
print("==================")
print()
cProfile.run("for i in range(10): pagerank(g, alpha=0.85, tol=1e-3, max_iter=10000000)", sort="cumulative")
print("Profiling k-core")
print("================")
print()
cProfile.run("for i in range(10): core.core_number(g)", sort="cumulative")
print("Profiling minimum spanning tree")
print("===============================")
print()
u = g.to_undirected()
cProfile.run("for i in range(10): minimum_spanning_tree(u)", sort="cumulative")
'''
print("Profiling betweenness")
print("====================")
print()
cProfile.run("for i in range(1): betweenness_centrality(g); edge_betweenness_centrality(g)", sort="cumulative")
'''
echo " ncalls tottime percall cumtime percall filename:lineno(function)"
cat nx_bench | grep "generic.py:149(shortest_path_length)"
cat nx_bench | grep "pagerank_alg.py:16(pagerank)"
cat nx_bench | grep "core.py:42(core_number)"
cat nx_bench | grep "mst.py:493(minimum_spanning_tree)"
"""PageRank analysis of graph structure. """
# Copyright (C) 2004-2018 by
# Aric Hagberg <hagberg@lanl.gov>
# Dan Schult <dschult@colgate.edu>
# Pieter Swart <swart@lanl.gov>
# All rights reserved.
# BSD license.
# NetworkX:http://networkx.github.io/
import time
import networkx as nx
from networkx.utils import not_implemented_for
__author__ = """\n""".join(["Aric Hagberg <aric.hagberg@gmail.com>",
"Brandon Liu <brandon.k.liu@gmail.com"])
__all__ = ['pagerank', 'pagerank_numpy', 'pagerank_scipy', 'google_matrix']
@not_implemented_for('multigraph')
def pagerank(G, alpha=0.85, personalization=None,
max_iter=100, tol=1.0e-6, nstart=None, weight='weight',
dangling=None):
"""Return the PageRank of the nodes in the graph.
PageRank computes a ranking of the nodes in the graph G based on
the structure of the incoming links. It was originally designed as
an algorithm to rank web pages.
Parameters
----------
G : graph
A NetworkX graph. Undirected graphs will be converted to a directed
graph with two directed edges for each undirected edge.
alpha : float, optional
Damping parameter for PageRank, default=0.85.
personalization: dict, optional
The "personalization vector" consisting of a dictionary with a
key some subset of graph nodes and personalization value each of those.
At least one personalization value must be non-zero.
      If not specified, a node's personalization value will be zero.
By default, a uniform distribution is used.
max_iter : integer, optional
Maximum number of iterations in power method eigenvalue solver.
tol : float, optional
Error tolerance used to check convergence in power method solver.
nstart : dictionary, optional
Starting value of PageRank iteration for each node.
weight : key, optional
Edge data key to use as weight. If None weights are set to 1.
dangling: dict, optional
The outedges to be assigned to any "dangling" nodes, i.e., nodes without
any outedges. The dict key is the node the outedge points to and the dict
value is the weight of that outedge. By default, dangling nodes are given
outedges according to the personalization vector (uniform if not
specified). This must be selected to result in an irreducible transition
matrix (see notes under google_matrix). It may be common to have the
dangling dict to be the same as the personalization dict.
Returns
-------
pagerank : dictionary
Dictionary of nodes with PageRank as value
Examples
--------
>>> G = nx.DiGraph(nx.path_graph(4))
>>> pr = nx.pagerank(G, alpha=0.9)
Notes
-----
The eigenvector calculation is done by the power iteration method
and has no guarantee of convergence. The iteration will stop after
an error tolerance of ``len(G) * tol`` has been reached. If the
number of iterations exceed `max_iter`, a
:exc:`networkx.exception.PowerIterationFailedConvergence` exception
is raised.
The PageRank algorithm was designed for directed graphs but this
algorithm does not check if the input graph is directed and will
execute on undirected graphs by converting each edge in the
directed graph to two edges.
See Also
--------
pagerank_numpy, pagerank_scipy, google_matrix
Raises
------
PowerIterationFailedConvergence
If the algorithm fails to converge to the specified tolerance
within the specified number of iterations of the power iteration
method.
References
----------
.. [1] A. Langville and C. Meyer,
"A survey of eigenvector methods of web information retrieval."
http://citeseer.ist.psu.edu/713792.html
.. [2] Page, Lawrence; Brin, Sergey; Motwani, Rajeev and Winograd, Terry,
The PageRank citation ranking: Bringing order to the Web. 1999
http://dbpubs.stanford.edu:8090/pub/showDoc.Fulltext?lang=en&doc=1999-66&format=pdf
"""
if len(G) == 0:
return {}
if not G.is_directed():
D = G.to_directed()
else:
D = G
# Create a copy in (right) stochastic form
W = nx.stochastic_graph(D, weight=weight)
N = W.number_of_nodes()
# Choose fixed starting vector if not given
if nstart is None:
x = dict.fromkeys(W, 1.0 / N)
else:
# Normalized nstart vector
s = float(sum(nstart.values()))
x = dict((k, v / s) for k, v in nstart.items())
if personalization is None:
# Assign uniform personalization vector if not given
p = dict.fromkeys(W, 1.0 / N)
else:
s = float(sum(personalization.values()))
p = dict((k, v / s) for k, v in personalization.items())
if dangling is None:
# Use personalization vector if dangling vector not specified
dangling_weights = p
else:
s = float(sum(dangling.values()))
dangling_weights = dict((k, v / s) for k, v in dangling.items())
dangling_nodes = [n for n in W if W.out_degree(n, weight=weight) == 0.0]
# power iteration: make up to max_iter iterations
for _ in range(max_iter):
xlast = x
x = dict.fromkeys(xlast.keys(), 0)
danglesum = alpha * sum(xlast[n] for n in dangling_nodes)
for n in x:
# this matrix multiply looks odd because it is
# doing a left multiply x^T=xlast^T*W
for nbr in W[n]:
x[nbr] += alpha * xlast[n] * W[n][nbr][weight]
x[n] += danglesum * dangling_weights.get(n, 0) + (1.0 - alpha) * p.get(n, 0)
# check convergence, l1 norm
err = sum([abs(x[n] - xlast[n]) for n in x])
if err < N * tol:
return x
raise nx.PowerIterationFailedConvergence(max_iter)
def google_matrix(G, alpha=0.85, personalization=None,
nodelist=None, weight='weight', dangling=None):
"""Return the Google matrix of the graph.
Parameters
----------
G : graph
A NetworkX graph. Undirected graphs will be converted to a directed
graph with two directed edges for each undirected edge.
alpha : float
The damping factor.
personalization: dict, optional
The "personalization vector" consisting of a dictionary with a
key some subset of graph nodes and personalization value each of those.
At least one personalization value must be non-zero.
      If not specified, a node's personalization value will be zero.
By default, a uniform distribution is used.
nodelist : list, optional
The rows and columns are ordered according to the nodes in nodelist.
If nodelist is None, then the ordering is produced by G.nodes().
weight : key, optional
Edge data key to use as weight. If None weights are set to 1.
dangling: dict, optional
The outedges to be assigned to any "dangling" nodes, i.e., nodes without
any outedges. The dict key is the node the outedge points to and the dict
value is the weight of that outedge. By default, dangling nodes are given
outedges according to the personalization vector (uniform if not
specified) This must be selected to result in an irreducible transition
matrix (see notes below). It may be common to have the dangling dict to
be the same as the personalization dict.
Returns
-------
A : NumPy matrix
Google matrix of the graph
Notes
-----
The matrix returned represents the transition matrix that describes the
Markov chain used in PageRank. For PageRank to converge to a unique
solution (i.e., a unique stationary distribution in a Markov chain), the
transition matrix must be irreducible. In other words, it must be that
there exists a path between every pair of nodes in the graph, or else there
is the potential of "rank sinks."
This implementation works with Multi(Di)Graphs. For multigraphs the
weight between two nodes is set to be the sum of all edge weights
between those nodes.
See Also
--------
pagerank, pagerank_numpy, pagerank_scipy
"""
import numpy as np
if nodelist is None:
nodelist = list(G)
M = nx.to_numpy_matrix(G, nodelist=nodelist, weight=weight)
N = len(G)
if N == 0:
return M
# Personalization vector
if personalization is None:
p = np.repeat(1.0 / N, N)
else:
p = np.array([personalization.get(n, 0) for n in nodelist], dtype=float)
p /= p.sum()
# Dangling nodes
if dangling is None:
dangling_weights = p
else:
# Convert the dangling dictionary into an array in nodelist order
dangling_weights = np.array([dangling.get(n, 0) for n in nodelist],
dtype=float)
dangling_weights /= dangling_weights.sum()
dangling_nodes = np.where(M.sum(axis=1) == 0)[0]
# Assign dangling_weights to any dangling nodes (nodes with no out links)
for node in dangling_nodes:
M[node] = dangling_weights
M /= M.sum(axis=1) # Normalize rows to sum to 1
return alpha * M + (1 - alpha) * p
def pagerank_numpy(G, alpha=0.85, personalization=None, weight='weight',
dangling=None):
"""Return the PageRank of the nodes in the graph.
PageRank computes a ranking of the nodes in the graph G based on
the structure of the incoming links. It was originally designed as
an algorithm to rank web pages.
Parameters
----------
G : graph
A NetworkX graph. Undirected graphs will be converted to a directed
graph with two directed edges for each undirected edge.
alpha : float, optional
Damping parameter for PageRank, default=0.85.
personalization: dict, optional
The "personalization vector" consisting of a dictionary with a
key some subset of graph nodes and personalization value each of those.
At least one personalization value must be non-zero.
      If not specified, a node's personalization value will be zero.
By default, a uniform distribution is used.
weight : key, optional
Edge data key to use as weight. If None weights are set to 1.
dangling: dict, optional
The outedges to be assigned to any "dangling" nodes, i.e., nodes without
any outedges. The dict key is the node the outedge points to and the dict
value is the weight of that outedge. By default, dangling nodes are given
outedges according to the personalization vector (uniform if not
specified) This must be selected to result in an irreducible transition
matrix (see notes under google_matrix). It may be common to have the
dangling dict to be the same as the personalization dict.
Returns
-------
pagerank : dictionary
Dictionary of nodes with PageRank as value.
Examples
--------
>>> G = nx.DiGraph(nx.path_graph(4))
>>> pr = nx.pagerank_numpy(G, alpha=0.9)
Notes
-----
The eigenvector calculation uses NumPy's interface to the LAPACK
eigenvalue solvers. This will be the fastest and most accurate
for small graphs.
This implementation works with Multi(Di)Graphs. For multigraphs the
weight between two nodes is set to be the sum of all edge weights
between those nodes.
See Also
--------
pagerank, pagerank_scipy, google_matrix
References
----------
.. [1] A. Langville and C. Meyer,
"A survey of eigenvector methods of web information retrieval."
http://citeseer.ist.psu.edu/713792.html
.. [2] Page, Lawrence; Brin, Sergey; Motwani, Rajeev and Winograd, Terry,
The PageRank citation ranking: Bringing order to the Web. 1999
http://dbpubs.stanford.edu:8090/pub/showDoc.Fulltext?lang=en&doc=1999-66&format=pdf
"""
import numpy as np
if len(G) == 0:
return {}
M = google_matrix(G, alpha, personalization=personalization,
weight=weight, dangling=dangling)
# use numpy LAPACK solver
eigenvalues, eigenvectors = np.linalg.eig(M.T)
ind = np.argmax(eigenvalues)
# eigenvector of largest eigenvalue is at ind, normalized
largest = np.array(eigenvectors[:, ind]).flatten().real
norm = float(largest.sum())
return dict(zip(G, map(float, largest / norm)))
def pagerank_scipy(G, alpha=0.85, personalization=None,
max_iter=100, tol=1.0e-6, weight='weight',
dangling=None):
"""Return the PageRank of the nodes in the graph.
PageRank computes a ranking of the nodes in the graph G based on
the structure of the incoming links. It was originally designed as
an algorithm to rank web pages.
Parameters
----------
G : graph
A NetworkX graph. Undirected graphs will be converted to a directed
graph with two directed edges for each undirected edge.
alpha : float, optional
Damping parameter for PageRank, default=0.85.
personalization: dict, optional
The "personalization vector" consisting of a dictionary with a
key some subset of graph nodes and personalization value each of those.
At least one personalization value must be non-zero.
      If not specified, a node's personalization value will be zero.
By default, a uniform distribution is used.
max_iter : integer, optional
Maximum number of iterations in power method eigenvalue solver.
tol : float, optional
Error tolerance used to check convergence in power method solver.
weight : key, optional
Edge data key to use as weight. If None weights are set to 1.
dangling: dict, optional
The outedges to be assigned to any "dangling" nodes, i.e., nodes without
any outedges. The dict key is the node the outedge points to and the dict
value is the weight of that outedge. By default, dangling nodes are given
outedges according to the personalization vector (uniform if not
specified) This must be selected to result in an irreducible transition
matrix (see notes under google_matrix). It may be common to have the
dangling dict to be the same as the personalization dict.
Returns
-------
pagerank : dictionary
Dictionary of nodes with PageRank as value
Examples
--------
>>> G = nx.DiGraph(nx.path_graph(4))
>>> pr = nx.pagerank_scipy(G, alpha=0.9)
Notes
-----
The eigenvector calculation uses power iteration with a SciPy
sparse matrix representation.
This implementation works with Multi(Di)Graphs. For multigraphs the
weight between two nodes is set to be the sum of all edge weights
between those nodes.
See Also
--------
pagerank, pagerank_numpy, google_matrix
Raises
------
PowerIterationFailedConvergence
If the algorithm fails to converge to the specified tolerance
within the specified number of iterations of the power iteration
method.
References
----------
.. [1] A. Langville and C. Meyer,
"A survey of eigenvector methods of web information retrieval."
http://citeseer.ist.psu.edu/713792.html
.. [2] Page, Lawrence; Brin, Sergey; Motwani, Rajeev and Winograd, Terry,
The PageRank citation ranking: Bringing order to the Web. 1999
http://dbpubs.stanford.edu:8090/pub/showDoc.Fulltext?lang=en&doc=1999-66&format=pdf
"""
import scipy.sparse
t0 = time.time()
N = len(G)
if N == 0:
return {}
t1 = time.time()
print('t1 - t0', t1 - t0)
nodelist = list(G)
M = nx.to_scipy_sparse_matrix(G, nodelist=nodelist, weight=weight,
dtype=float)
S = scipy.array(M.sum(axis=1)).flatten()
S[S != 0] = 1.0 / S[S != 0]
Q = scipy.sparse.spdiags(S.T, 0, *M.shape, format='csr')
M = Q * M
t2 = time.time()
print('t2 - t1', t2 - t1)
# initial vector
x = scipy.repeat(1.0 / N, N)
t3 = time.time()
print('t3 - t2', t3 - t2)
# Personalization vector
if personalization is None:
p = scipy.repeat(1.0 / N, N)
else:
p = scipy.array([personalization.get(n, 0) for n in nodelist], dtype=float)
p = p / p.sum()
t4 = time.time()
print('t4 - t3', t4 - t3)
# Dangling nodes
if dangling is None:
dangling_weights = p
else:
# Convert the dangling dictionary into an array in nodelist order
dangling_weights = scipy.array([dangling.get(n, 0) for n in nodelist],
dtype=float)
dangling_weights /= dangling_weights.sum()
is_dangling = scipy.where(S == 0)[0]
t5 = time.time()
print('t5 - t4', t5 - t4)
# power iteration: make up to max_iter iterations
for i in range(max_iter):
xlast = x
x = alpha * (x * M + sum(x[is_dangling]) * dangling_weights) + \
(1 - alpha) * p
# check convergence, l1 norm
err = scipy.absolute(x - xlast).sum()
if err < N * tol:
t6 = time.time()
print('t6 - t5', t6 - t5)
result = dict(zip(nodelist, map(float, x)))
t7 = time.time()
print('t7 - t6', t7 - t6)
print('t7 - t0', t7 - t0)
print(i)
return result
raise nx.PowerIterationFailedConvergence(max_iter)
# fixture for nose tests
def setup_module(module):
from nose import SkipTest
try:
import numpy
except:
raise SkipTest("NumPy not available")
try:
import scipy
except:
raise SkipTest("SciPy not available")
if __name__ == '__main__':
g = nx.read_graphml("pgp.xml")
pagerank_scipy(g, alpha=0.85, tol=1e-3, max_iter=10000000)
wget https://graph-tool.skewed.de/static/performance/pgp.xml.gz
gzip -d pgp.xml.gz
for lib in gt igraph nx dgl; do
echo "Profiling ${lib}"
python ${lib}_bench.py > ${lib}_bench
bash ${lib}_bench.sh
done
#!/usr/bin/env python
from __future__ import print_function
from networkx import *
import cProfile
g = read_graphml("pgp.xml")
print("Profiling PageRank")
print("==================")
print()
cProfile.run("for i in range(10): pagerank_scipy(g, alpha=0.85, tol=1e-3, max_iter=10000000)", sort="cumulative")
echo " ncalls tottime percall cumtime percall filename:lineno(function)"
cat sp_bench | grep "pagerank_alg.py:338(pagerank_scipy)"
"""
Adapted from code by Lingfan
"""
import argparse
import cProfile
import networkx as nx
import tensorflow as tf
parser = argparse.ArgumentParser()
parser.add_argument('--gpu', type=int, default=-1)
args = parser.parse_args()
g = nx.read_graphml('pgp.xml')
n = g.number_of_nodes()
adj = nx.adj_matrix(g).tocoo()
indices = list(map(list, zip(adj.row, adj.col)))
values = adj.data.astype('float32').tolist()
device = '/cpu:0' if args.gpu < 0 else '/device:GPU:%d' %args.gpu
with tf.device(device):
m = tf.SparseTensor(indices=indices, values=values, dense_shape=[n, n])
session = tf.Session()
def pagerank(alpha, tol, max_iter):
with tf.device(device):
p = tf.Variable(tf.constant(1.0 / n, shape=[n, 1]), trainable=False)
new_p = alpha * tf.sparse_tensor_dense_matmul(m, p) + (1 - alpha) / n
delta = tf.reduce_sum(tf.abs(new_p - p))
with tf.control_dependencies([delta]):
assignment = p.assign(new_p)
session.run(p.initializer)
for i in range(max_iter):
        # fetch the convergence value under a new name so the `delta` tensor
        # can be passed to session.run again on later iterations
        d, _ = session.run([delta, assignment])
        if d < tol * n:
            break
print("Profiling PageRank")
print("==================")
print()
# cProfile.run("for i in range(10): pagerank(alpha=0.85, tol=1e-3, max_iter=10000000)", sort="cumulative")
import time
t0 = time.time()
for i in range(10):
pagerank(alpha=0.85, tol=1e-3, max_iter=10000000)
print((time.time() - t0) / 10)
echo " ncalls tottime percall cumtime percall filename:lineno(function)"
cat dgl_bench | grep "dgl_bench.py:25(pagerank)"