"vscode:/vscode.git/clone" did not exist on "5d952e3f83cd5b8a6eeab09090f102fbfb5375ff"
Commit 87ed21ec authored by Gan Quan, committed by Minjie Wang

[Doc] Basic tutorials & two model tutorials (#118)

parent 7241a9c0
# Deep Graph Library
[![Build Status](http://216.165.71.225:8080/buildStatus/icon?job=DGL/master)](http://216.165.71.225:8080/job/DGL/job/master/)
[![Build Status](http://34.239.175.180:80/buildStatus/icon?job=DGL/master)](http://34.239.175.180:80/job/DGL/job/master/)
[![GitHub license](https://dmlc.github.io/img/apache2.svg)](./LICENSE)
For installation instructions and a guide to using DGL, please read our
[Documentation](http://216.165.71.225:23232/index.html)
......
"""
.. _tutorial-first:

DGL at a glance
=========================
TODO: either a pagerank or SSSP example
**Author**: Minjie Wang, Quan Gan, Zheng Zhang
The goal of DGL is to build, train, and deploy *machine learning models*
on *graph-structured data*. To achieve this, DGL provides a ``DGLGraph``
class that defines the graph structure and the information on its nodes
and edges. It also provides a set of feature transformation methods
and message passing methods to propagate information between nodes and edges.
Goal of this tutorial: get a feel for what working with DGL looks like!
"""
###############################################################################
# Building a graph
# ----------------
#
# To start with, let's first import ``dgl`` and build a toy graph with two
# nodes, then throw some representations on the nodes and edges:
import torch as th
import networkx as nx
import dgl
def a_boring_graph():
    g = dgl.DGLGraph()
    g.add_nodes(2)
    g.add_edge(1, 0)
    # node and edge features
    x = th.tensor([[0.0, 0.0], [1.0, 2.0]])
    w = th.tensor([2]).float()
    g.ndata['x'] = x
    g.edata['w'] = w
    return g
###############################################################################
# We can also convert from networkx:
def an_interesting_graph():
    N = 100
    g = nx.erdos_renyi_graph(N, 0.1)
    g = dgl.DGLGraph(g)
    x = th.randn(N, 6)
    w = th.randn(g.number_of_edges(), 1)
    g.ndata['x'] = x
    g.edata['w'] = w
    return g
###############################################################################
# One thing to be aware of is that DGL graphs are always directed:
g_boring = a_boring_graph()
g_better = an_interesting_graph()
import matplotlib.pyplot as plt
nx.draw(g_better.to_networkx(), node_size=50, node_color=[[.5, .5, .5,]])
plt.show()
###############################################################################
# Define Computation
# ------------------
# The focus of DGL is to provide a way to integrate representation learning
# (using neural networks) with graph data. The way we do it is with a
# message-passing interface following the scatter-gather paradigm (i.e., a mailbox metaphor).
#
# .. note::
#
# For people familiar with graph convolutional networks, the pattern here
# should be easy to see.
def super_useful_comp(g):
    def send_source(edges):
        return {'msg': edges.src['x'] * edges.data['w']}

    def simple_reduce(nodes):
        msgs = nodes.mailbox['msg']
        return {'x': msgs.sum(1) + nodes.data['x']}

    def readout(g):
        return th.sum(g.ndata['x'], dim=0)

    g.register_message_func(send_source)
    g.register_reduce_func(simple_reduce)
    g.send(g.edges())
    g.recv(g.nodes())
    return readout(g)
###############################################################################
# The point is, regardless of the kind of graph and the form of its representations,
# DGL handles them uniformly and efficiently.
g_boring = a_boring_graph()
graph_sum = super_useful_comp(g_boring)
print("graph sum is: ", graph_sum)
g_better = an_interesting_graph()
graph_sum = super_useful_comp(g_better)
print("graph sum is: ", graph_sum)
###############################################################################
# Next steps
# ----------
# In the :doc:`next tutorial <2_basics>`, we will go through defining
# a graph structure, as well as reading and writing node/edge representations.
"""
DGL Basics
==========
**Author**: Minjie Wang, Quan Gan, Yu Gai, Zheng Zhang
The goal of this tutorial:
* To create a graph.
* To read and write node and edge representations.
"""
###############################################################################
# Graph Creation
# --------------
# The design of ``DGLGraph`` was influenced by other graph libraries. Indeed,
# you can create a graph from networkx, and convert it into a ``DGLGraph`` and
# vice versa:
import networkx as nx
import dgl
g_nx = nx.petersen_graph()
g_dgl = dgl.DGLGraph(g_nx)
import matplotlib.pyplot as plt
plt.subplot(121)
nx.draw(g_nx, with_labels=True)
plt.subplot(122)
nx.draw(g_dgl.to_networkx(), with_labels=True)
plt.show()
###############################################################################
# They are the same graph, except that a ``DGLGraph`` is *always* directed.
#
# One can also create a graph by calling DGL's own interface.
#
# Now let's build a star graph. DGLGraph nodes form a consecutive range of
# integers from 0 to ``g.number_of_nodes() - 1``, and the graph can grow by calling
# ``g.add_nodes``. DGLGraph edges are numbered in the order of their addition. Note that
# edges are accessed in much the same way as nodes, with one extra feature:
# *edge broadcasting*:
import dgl
import torch as th
g = dgl.DGLGraph()
g.add_nodes(10)
# a few edges, added one by one
for i in range(1, 4):
    g.add_edge(i, 0)
# a few more with a paired list
src = list(range(5, 8)); dst = [0]*3
g.add_edges(src, dst)
# finish with a pair of tensors
src = th.tensor([8, 9]); dst = th.tensor([0, 0])
g.add_edges(src, dst)
# edge broadcasting will do star graph in one go!
g.clear(); g.add_nodes(10)
src = th.tensor(list(range(1, 10)));
g.add_edges(src, 0)
import networkx as nx
import matplotlib.pyplot as plt
nx.draw(g.to_networkx(), with_labels=True)
plt.show()
###############################################################################
# Feature Assignment
# ------------------
# One can also assign features to nodes and edges of a ``DGLGraph``. The
# features are represented as a dictionary mapping names (strings) to tensors,
# called **fields**.
#
# The following code snippet assigns each node a 3-D vector.
#
# .. note::
#
# DGL aims to be framework-agnostic, and currently it supports PyTorch and
# MXNet tensors. From now on, we use PyTorch as an example.
import dgl
import torch as th
x = th.randn(10, 3)
g.ndata['x'] = x
###############################################################################
# ``ndata`` is syntactic sugar for accessing the states of all nodes; the states
# are stored in a container ``data`` that hosts a user-defined dictionary.
print(g.ndata['x'] == g.nodes[:].data['x'])
# access node set with integer, list, or integer tensor
g.nodes[0].data['x'] = th.zeros(1, 3)
g.nodes[[0, 1, 2]].data['x'] = th.zeros(3, 3)
g.nodes[th.tensor([0, 1, 2])].data['x'] = th.zeros(3, 3)
###############################################################################
# Assigning edge features is similar to assigning node features,
# except that one can also do it by specifying the endpoints of the edges.
g.edata['w'] = th.randn(9, 2)
# access edge set with IDs in integer, list, or integer tensor
g.edges[1].data['w'] = th.randn(1, 2)
g.edges[[0, 1, 2]].data['w'] = th.zeros(3, 2)
g.edges[th.tensor([0, 1, 2])].data['w'] = th.zeros(3, 2)
# one can also access the edges by giving endpoints
g.edges[1, 0].data['w'] = th.ones(1, 2) # edge 1 -> 0
g.edges[[1, 2, 3], [0, 0, 0]].data['w'] = th.ones(3, 2) # edges [1, 2, 3] -> 0
###############################################################################
# After assignments, each node/edge field will be associated with a scheme
# containing the shape and data type (dtype) of its field value.
print(g.node_attr_schemes())
g.ndata['x'] = th.zeros((10, 4))
print(g.node_attr_schemes())
###############################################################################
# One can also remove node/edge states from the graph. This is particularly
# useful to save memory during inference.
g.ndata.pop('x')
g.edata.pop('w')
###############################################################################
# Multigraphs
# ~~~~~~~~~~~
# Many graph applications need multi-edges. To enable this, construct DGLGraph
# with ``multigraph=True``.
g_multi = dgl.DGLGraph(multigraph=True)
g_multi.add_nodes(10)
g_multi.ndata['x'] = th.randn(10, 2)
g_multi.add_edges(list(range(1, 10)), 0)
g_multi.add_edge(1, 0) # two edges on 1->0
g_multi.edata['w'] = th.randn(10, 2)
g_multi.edges[1].data['w'] = th.zeros(1, 2)
print(g_multi.edges())
###############################################################################
# An edge in a multigraph cannot be uniquely identified by its incident nodes
# :math:`u` and :math:`v`; use the ``edge_id`` interface to query their edge ids instead.
eid_10 = g_multi.edge_id(1, 0)
g_multi.edges[eid_10].data['w'] = th.ones(len(eid_10), 2)
print(g_multi.edata['w'])
###############################################################################
# .. note::
#
# * Nodes and edges can be added but not removed; we will support removal in
#   the future.
# * Updating a feature with a different scheme raises an error on an individual
#   node (or node subset).
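###############################################################################
# As a small illustration (not part of the original text): updating only a
# subset of nodes with a tensor whose shape differs from the field's registered
# scheme would raise an error, while updating all nodes at once simply replaces
# the scheme.
g.ndata['x'] = th.zeros((10, 4))                # register field 'x' with shape (4,)
# g.nodes[[0, 1]].data['x'] = th.zeros((2, 5))  # would raise an error: scheme mismatch
g.ndata['x'] = th.zeros((10, 5))                # fine: all nodes updated, scheme replaced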
###############################################################################
# Next steps
# ----------
# In the :doc:`next tutorial <3_pagerank>`, we will go through the
# DGL message passing interface by implementing PageRank.
"""
.. _tutorial-graph:
Use DGLGraph
============
**Author**: `Minjie Wang <https://jermainewang.github.io/>`_
In this tutorial, we introduce how to use our graph class -- ``DGLGraph``.
The ``DGLGraph`` is the very core data structure in our library. It provides the basic
interfaces to manipulate graph structure, set/get node/edge features and convert
from/to many other graph formats. You can also perform computation on the graph
using our message passing APIs (see :ref:`tutorial-mp`).
TODO: 1) explain `tensor`; 2) enable g.nodes/edges[:][key]; 3) networkx conversion in one place
"""
###############################################################################
# Construct a graph
# -----------------
#
# The design of ``DGLGraph`` was influenced by other graph libraries. Indeed, you can
# create a graph from `networkx <https://networkx.github.io/>`__, and convert it into a ``DGLGraph``
# and vice versa:
import networkx as nx
import dgl
g_nx = nx.petersen_graph()
g_dgl = dgl.DGLGraph(g_nx)
import matplotlib.pyplot as plt
plt.subplot(121)
nx.draw(g_nx, with_labels=True)
plt.subplot(122)
nx.draw(g_dgl.to_networkx(), with_labels=True)
plt.show()
###############################################################################
# They are the same graph, except that a ``DGLGraph`` is always *directed*.
#
# Creating a graph is a matter of specifying the total number of nodes and the edges among them.
# In a ``DGLGraph``, all nodes are represented by consecutive integers starting from
# zero, and you can add more nodes repeatedly.
#
# .. note::
#
# ``nx.add_node(100)`` adds a single node with id 100, whereas ``dgl.add_nodes(100)`` adds another 100 nodes to the graph.
g_dgl.clear()
g_nx.clear()
g_dgl.add_nodes(20)
print("We have %d nodes now" % g_dgl.number_of_nodes())
g_dgl.add_nodes(100)
print("Now we have %d nodes!" % g_dgl.number_of_nodes())
g_nx.add_node(100)
print("My nx buddy only has %d :( " % g_nx.number_of_nodes())
###############################################################################
# The most naive way to add edges is to add them one by one, each with a (*src, dst*) pair.
# Let's generate a star graph where all the edges point to the center (node#0).
star = dgl.DGLGraph()
star.add_nodes(10) # add 10 nodes
for i in range(1, 10):
    star.add_edge(i, 0)
nx.draw(star.to_networkx(), with_labels=True)
###############################################################################
# It's more efficient to add many edges with a pair of lists, or better still, with a pair of tensors.
# TODO: needs to explain ``tensor``, since it's not a Python primitive data type.
# using lists
star.clear()
star.add_nodes(10)
src = [i for i in range(1, 10)]; dst = [0]*9
star.add_edges(src, dst)
# using tensor
star.clear()
star.add_nodes(10)
import torch as th
src = th.tensor(src); dst = th.tensor(dst)
star.add_edges(src, dst)
###############################################################################
# In addition to this, we also support
# "edge broadcasting":
#
# .. _note-edge-broadcast:
#
# .. note::
#
# Given a source node list/tensor ``u`` and a destination node list/tensor ``v``:
#
# - If ``len(u) == len(v)``, then this is a many-many edge set and
# each edge is represented by ``(u[i], v[i])``.
# - If ``len(u) == 1``, then this is a one-many edge set.
# - If ``len(v) == 1``, then this is a many-one edge set.
#
# Edge broadcasting is supported in many APIs whenever a bunch of edges need
# to be specified. The example below creates the same star graph as the previous one.
star.clear() # clear the previous graph
star.add_nodes(10)
u = list(range(1, 10)) # can also use tensor type here (e.g. torch.Tensor)
star.add_edges(u, 0) # many-one edge set
###############################################################################
# In ``DGLGraph``, each edge is assigned an internal edge id (also a consecutive
# integer starting from zero). The ids follow the addition order of the edges
# and you can query the id using the ``edge_ids`` interface, which returns a tensor.
print(star.edge_ids(1, 0)) # query edge id of 1->0; it happens to be the first edge!
print(star.edge_ids([8, 9], 0)) # ask for ids of multiple edges
###############################################################################
# Assigning consecutive integer ids for nodes and edges makes it easier to batch
# their features together (see next section). As a result, removing nodes or edges
# of a ``DGLGraph`` is currently not supported because this will break the assumption
# that the ids form a consecutive range from zero.
###############################################################################
# Node and edge features
# ----------------------
# Nodes and edges can have feature data of tensor type. They can be accessed/updated
# through a key-value storage interface. The key must be hashable. The value should
# be the features of each node and edge, batched on the *first* dimension. For example,
# the following code creates features for all nodes (``hv``) and features for all
# edges (``he``). Each feature is a vector of length 3.
#
# .. note::
#
# The first dimension is usually reserved as the batch dimension in DGL. Thus, even when
# setting the feature of only one node/edge, the tensor still needs an extra dimension (of length one).
import torch as th
D = 3 # the feature dimension
N = star.number_of_nodes()
M = star.number_of_edges()
nfeat = th.randn((N, D)) # some random node features
efeat = th.randn((M, D)) # some random edge features
# TODO(minjie): enable following syntax
# star.nodes[:]['hv'] = nfeat
# star.edges[:]['he'] = efeat
star.set_n_repr({'hv' : nfeat})
star.set_e_repr({'he' : efeat})
###############################################################################
# .. note::
# The first dimension of a node feature has length equal to the number of nodes,
# whereas that of an edge feature equals the number of edges.
#
# We can then set some nodes' features to be zero.
# TODO(minjie): enable following syntax
# print(star.nodes[:]['hv'])
print("node features:")
print(star.get_n_repr()['hv'])
print("\nedge features:")
print(star.get_e_repr()['he'])
# set node 0, 2, 4 feature to zero
print("\nresetting features at node 0, 2 and 4...")
star.set_n_repr({'hv' : th.zeros((3, D))}, [0, 2, 4])
print(star.get_n_repr()['hv'])
###############################################################################
# Once created, each node/edge feature will be associated with a *scheme* containing
# the shape and dtype information of the feature tensor. Updating features using data
# of a different scheme will raise an error unless all the features are updated,
# in which case the scheme will be replaced with the new one.
print(star.node_attr_schemes())
# updating features with different scheme will raise error
# star.set_n_repr({'hv' : th.zeros((3, 2*D))}, [0, 2, 4])
# updating all the nodes is fine, the old scheme will be replaced
star.set_n_repr({'hv' : th.zeros((N, 2*D))})
print(star.node_attr_schemes())
###############################################################################
# If a new feature is added for some but not all of the nodes/edges, we will
# automatically create empty features for the others to make sure that features are
# always aligned. By default, we zero-fill the empty features. The behavior
# can be changed using ``set_n_initializer`` and ``set_e_initializer``.
star.set_n_repr({'hv_1' : th.randn((3, D+1))}, [0, 2, 4])
print(star.node_attr_schemes())
print(star.get_n_repr()['hv'])
print(star.get_n_repr()['hv_1'])
###############################################################################
# Convert from/to other formats
# -----------------------------
# A DGLGraph can be easily converted from/to a ``networkx`` graph.
import networkx as nx
# note that networkx creates undirected graphs by default, so when converting
# to DGLGraph, directed edges in both directions will be added.
nx_star = nx.star_graph(9)
star = dgl.DGLGraph(nx_star)
print('#Nodes:', star.number_of_nodes())
print('#Edges:', star.number_of_edges())
###############################################################################
# Node and edge attributes can be automatically batched when converting from a
# ``networkx`` graph. Since a ``networkx`` graph by default does not record which
# edge was added first, we use the ``"id"`` edge attribute as a hint
# if available.
for i in range(10):
    nx_star.nodes[i]['feat'] = th.randn((D,))
star = dgl.DGLGraph()
star.from_networkx(nx_star, node_attrs=['feat']) # auto-batch specified node features
print(star.get_n_repr()['feat'])
###############################################################################
# Multi-edge graph
# ----------------
# There are many applications that work on graphs containing multi-edges. To enable
# this, construct ``DGLGraph`` with ``multigraph=True``.
g = dgl.DGLGraph(multigraph=True)
g.add_nodes(5)
g.add_edge(0, 1)
g.add_edge(1, 2)
g.add_edge(0, 1)
print('#Nodes:', g.number_of_nodes())
print('#Edges:', g.number_of_edges())
# init random edge features
M = g.number_of_edges()
g.set_e_repr({'he' : th.randn((M, D))})
###############################################################################
# Because an edge in a multigraph cannot be uniquely identified by its incident
# nodes ``u`` and ``v``, you need to use edge ids to access edge features. The
# edge ids can be queried via the ``edge_id`` interface.
eid_01 = g.edge_id(0, 1)
print(eid_01)
###############################################################################
# We can then use the edge id to set/get the features of the corresponding edge.
g.set_e_repr_by_id({'he' : th.ones(len(eid_01), D)}, eid=eid_01)
print(g.get_e_repr()['he'])
"""
.. _tutorial-mp:
Message passing on graph
========================
**Author**: `Minjie Wang <https://jermainewang.github.io/>`_
Many graph-based deep neural networks are based on *"message passing"* --
nodes compute messages that are sent to other nodes, and node features are
updated using the received messages. In this tutorial, we introduce the basic
mechanism of message passing in DGL.
"""
###############################################################################
# Let us start by importing DGL and creating an example graph used throughout this
# tutorial. The graph has 10 nodes, with node#0 being the source and node#9 being the
# sink. The source node (node#0) connects to all other nodes except the sink
# node. Similarly, the sink node receives edges from all other nodes except the
# source node. We also initialize the feature vector of the source node to be
# all ones, while the others have features of all zeros.
# The code to create such a graph is as follows (using PyTorch syntax):
import dgl
import torch as th
g = dgl.DGLGraph()
g.add_nodes(10)
g.add_edges(0, list(range(1, 9)))
g.add_edges(list(range(1, 9)), 9)
# TODO(minjie): plot the graph here.
N = g.number_of_nodes()
M = g.number_of_edges()
print('#Nodes:', N)
print('#Edges:', M)
# initialize the node features
D = 1 # feature size
g.set_n_repr({'feat' : th.zeros((N, D))})
g.set_n_repr({'feat' : th.ones((1, D))}, 0)
print(g.get_n_repr()['feat'])
###############################################################################
# User-defined functions and high-level APIs
# ------------------------------------------
#
# There are two core components in DGL's message passing programming model:
#
# * **User-defined functions (UDFs)** on how the messages are computed and used.
# * **High-level APIs** on which nodes send messages to whom and which nodes get updated.
#
# For example, one simple user-defined message function can be as follows:
def send_source(src, edge):
    return {'msg' : src['feat']}
###############################################################################
# The above function computes the messages over **a batch of edges**.
# It has two arguments: `src` for source node features and
# `edge` for the edge features, and it returns the messages computed. The argument
# and return types are dictionaries from the feature/message names to tensor values.
# We can trigger this function using our ``send`` API:
g.send(0, 1, message_func=send_source)
###############################################################################
# Here, the message is computed using the feature of node#0. The result message
# (on 0->1) is not returned but directly saved in ``DGLGraph`` for the later
# receive phase.
#
# You can send multiple messages at once using the
# :ref:`multi-edge semantics <note-edge-broadcast>`.
# In that case, the source node and edge features are batched on the first dimension.
# You can simply print out the shape of the feature tensor in your message
# function.
def send_source_print(src, edge):
    print('src feat shape:', src['feat'].shape)
    return {'msg' : src['feat']}
g.send(0, [4, 5, 6], message_func=send_source_print)
###############################################################################
# To receive and aggregate incoming messages, the user can define a reduce function
# that operates on **a batch of nodes**.
def simple_reduce(node, msgs):
    return {'feat' : th.sum(msgs['msg'], dim=1)}
###############################################################################
# The reduce function has two arguments: ``node`` for the node features and
# ``msgs`` for the in-coming messages. It returns the updated node features.
# The function can be triggered using the ``recv`` API. Again, DGL supports
# receiving messages for multiple nodes at the same time. In that case, the
# node features are batched on the first dimension. Because each node can
# receive a different number of incoming messages, we divide the receiving
# nodes into buckets based on the number of messages they receive. As a result,
# the message tensor has at least three dimensions (B, n, D), where the second
# dimension concatenates all the messages for each node. This also means
# the reduce UDF will be called once for each bucket. You can simply print out
# the shape of the message tensor as follows:
def simple_reduce_print(node, msgs):
    print('msg shape:', msgs['msg'].shape)
    return {'feat' : th.sum(msgs['msg'], dim=1)}
g.recv([1, 4, 5, 6], reduce_func=simple_reduce_print)
print(g.get_n_repr()['feat'])
###############################################################################
# You can see that, after send and recv, the value of node#0 has been propagated
# to nodes 1, 4, 5 and 6.
###############################################################################
# DGL message passing APIs
# ------------------------
#
# TODO(minjie): enable backreference for all the mentioned APIs below.
#
# In DGL, we categorize the message passing APIs into three levels. All of them
# can be configured using UDFs such as the message and reduce functions.
#
# **Level-1 routines:** APIs that trigger computation on either a batch of nodes
# or a batch of edges. This includes:
#
# * ``send(u, v)`` and ``recv(v)``
# * ``update_edge(u, v)``: This updates the edge features using the current edge
#   features and the source and destination node features.
# * ``apply_nodes(v)``: This transforms the node features using the current node
#   features (see the short sketch after this list).
# * ``apply_edges(u, v)``: This transforms the edge features using the current edge
#   features.
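#
# As a quick illustration (not part of the original text), here is a minimal
# sketch of ``apply_nodes``; it assumes the API accepts an ``apply_node_func``
# keyword just like ``recv`` does:
def add_one(node):
    return {'feat': node['feat'] + 1}
g.apply_nodes([9], apply_node_func=add_one)
print(g.get_n_repr()['feat'])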
###############################################################################
# **Level-2 routines:** APIs that combine several level-1 routines.
#
# * ``send_and_recv(u, v)``: This first computes messages over u->v, then reduces
#   them on v. An optional node apply function can be provided.
# * ``pull(v)``: This computes the messages over all the in-edges of v, then reduces
#   them on v. An optional node apply function can be provided.
# * ``push(v)``: This computes the messages over all the out-edges of v, then
#   reduces them on the successors. An optional node apply function can be provided.
# * ``update_all()``: Send out and reduce messages on every node. An optional node
#   apply function can be provided.
#
# The following example uses ``send_and_recv`` to continue propagating the signal to
# the sink node#9:
g.send_and_recv([1, 4, 5, 6], 9, message_func=send_source, reduce_func=simple_reduce)
print(g.get_n_repr()['feat'])
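###############################################################################
# As a further illustration (not in the original text), ``pull`` can achieve
# the same result by pulling messages over all of node#9's in-edges; this
# sketch assumes ``pull`` accepts the same ``message_func``/``reduce_func``
# keywords as ``send_and_recv``:
g.pull(9, message_func=send_source, reduce_func=simple_reduce)
print(g.get_n_repr()['feat'])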
###############################################################################
# **Level-3 routines:** APIs that call multiple level-2 routines.
#
# * ``propagate()``: TBD after Yu's traversal PR.
###############################################################################
# Builtin functions
# -----------------
#
# Since many message and reduce UDFs are very common (such as sending source
# node features as the message and aggregating messages using summation), DGL
# actually provides builtin functions that can be directly used:
import dgl.function as fn
g.send_and_recv(0, [2, 3], fn.copy_src(src='feat', out='msg'), fn.sum(msg='msg', out='feat'))
print(g.get_n_repr()['feat'])
###############################################################################
# TODO(minjie): document on multiple builtin function syntax after Lingfan
# finished his change.
###############################################################################
# Using builtin functions not only saves you time writing code, but also
# allows DGL to use a more efficient implementation automatically. To see this,
# you can continue to our tutorial on the Graph Convolutional Network.
# TODO(minjie): need a hyperref to the GCN tutorial here.
"""
PageRank with DGL Message Passing
=================================
**Author**: Minjie Wang, Quan Gan, Yu Gai, Zheng Zhang
In this section we illustrate the usage of different levels of message
passing API with PageRank on a small graph. In DGL, the message passing and
feature transformations are all **User-Defined Functions** (UDFs).
The goal of this tutorial: to implement PageRank using DGL message passing
interface.
"""
###############################################################################
# The PageRank Algorithm
# ----------------------
# In each iteration of PageRank, every node (web page) first scatters its
# PageRank value uniformly to its downstream nodes. The new PageRank value of
# each node is computed by aggregating the received PageRank values from its
# neighbors, which is then adjusted by the damping factor:
#
# .. math::
#
# PV(u) = \frac{1-d}{N} + d \times \sum_{v \in \mathcal{N}(u)}
# \frac{PV(v)}{D(v)}
#
# where :math:`N` is the number of nodes in the graph; :math:`D(v)` is the
# out-degree of a node :math:`v`; and :math:`\mathcal{N}(u)` is the set of
# neighbors of :math:`u`.
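#
# As a tiny worked check (not part of the original text): for two nodes that
# point at each other, each with out-degree 1 and initial value 1/2, one
# iteration with :math:`d = 0.85` gives
# :math:`PV(u) = \frac{1 - 0.85}{2} + 0.85 \cdot \frac{0.5}{1} = 0.5`,
# so the uniform distribution is already a fixed point, as expected.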
###############################################################################
# A naive implementation
# ----------------------
# Let us first create a graph with 100 nodes with NetworkX and convert it to a
# ``DGLGraph``:
import networkx as nx
import matplotlib.pyplot as plt
import torch
import dgl
N = 100 # number of nodes
DAMP = 0.85 # damping factor
K = 10 # number of iterations
g = nx.erdos_renyi_graph(N, 0.1)
g = dgl.DGLGraph(g)
nx.draw(g.to_networkx(), node_size=50, node_color=[[.5, .5, .5,]])
plt.show()
###############################################################################
# According to the algorithm, PageRank consists of two phases in a typical
# scatter-gather pattern. We first initialize the PageRank value of each node
# to :math:`\frac{1}{N}` and store each node's out-degree as a node feature:
g.ndata['pv'] = torch.ones(N) / N
g.ndata['deg'] = g.out_degrees(g.nodes()).float()
###############################################################################
# We then define the message function, which divides every node's PageRank
# value by its out-degree and passes the result as message to its neighbors:
def pagerank_message_func(edges):
    return {'pv' : edges.src['pv'] / edges.src['deg']}
###############################################################################
# In DGL, the message functions are expressed as **Edge UDFs**. Edge UDFs
# take in a single argument ``edges``. It has three members ``src``, ``dst``,
# and ``data`` for accessing source node features, destination node features,
# and edge features respectively. Here, the function computes messages only
# from source node features.
#
# Next, we define the reduce function, which removes and aggregates the
# messages from its ``mailbox``, and computes its new PageRank value:
def pagerank_reduce_func(nodes):
    msgs = torch.sum(nodes.mailbox['pv'], dim=1)
    pv = (1 - DAMP) / N + DAMP * msgs
    return {'pv' : pv}
###############################################################################
# The reduce functions are **Node UDFs**. Node UDFs have a single argument
# ``nodes``, which has two members ``data`` and ``mailbox``. ``data``
# contains the node features while ``mailbox`` contains all incoming message
# features, stacked along the second dimension (hence the ``dim=1`` argument).
#
# The message UDF works on a batch of edges, whereas the reduce UDF works on
# a batch of nodes, aggregating the messages on their incoming edges. Their
# relationship is as follows:
#
# .. image:: https://i.imgur.com/kIMiuFb.png
#
# We register the message function and reduce function, which will be called
# later by DGL.
g.register_message_func(pagerank_message_func)
g.register_reduce_func(pagerank_reduce_func)
###############################################################################
# The algorithm is then very straightforward. Here is the code for one
# PageRank iteration:
def pagerank_naive(g):
    # Phase #1: send out messages along all edges.
    for u, v in zip(*g.edges()):
        g.send((u, v))
    # Phase #2: receive messages to compute new PageRank values.
    for v in g.nodes():
        g.recv(v)
###############################################################################
# Improvement with batching semantics
# -----------------------------------
# The above code does not scale to large graphs because it iterates over all
# the nodes. DGL solves this by letting the user compute on a *batch* of nodes or
# edges. For example, the following code triggers the message and reduce functions
# on multiple nodes and edges at once.
def pagerank_batch(g):
    g.send(g.edges())
    g.recv(g.nodes())
###############################################################################
# Note that we are still using the same reduce function ``pagerank_reduce_func``,
# where ``nodes.mailbox['pv']`` is a *single* tensor, stacking the incoming
# messages along the second dimension.
#
# Naturally, one may wonder whether it is even possible to perform reduce on all
# nodes in parallel, since each node may have a different number of incoming
# messages and one cannot really "stack" tensors of different lengths together.
# In general, DGL solves the problem by grouping the nodes by their number of
# incoming messages, and calling the reduce function once for each group.
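#
# As a quick check (not part of the original tutorial), we can print the
# mailbox shape inside a wrapper around the reduce function; this runs one
# extra message-passing pass purely for illustration and assumes ``recv``
# accepts a ``reduce_func`` keyword:
def pagerank_reduce_print(nodes):
    # each bucket has shape (num_nodes_in_bucket, in_degree)
    print('bucket mailbox shape:', nodes.mailbox['pv'].shape)
    return pagerank_reduce_func(nodes)
g.send(g.edges())
g.recv(g.nodes(), reduce_func=pagerank_reduce_print)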
###############################################################################
# More improvement with higher level APIs
# ---------------------------------------
# DGL provides many routines that combine basic ``send`` and ``recv`` in
# various ways. They are called **level-2 APIs**. For example, the PageRank
# example can be further simplified as follows:
def pagerank_level2(g):
    g.update_all()
###############################################################################
# Besides ``update_all``, we also have ``pull``, ``push``, and ``send_and_recv``
# in this level-2 category. Please refer to their own API reference documents
# for more details. (TODO: a link to the document).
###############################################################################
# Even more improvement with DGL builtin functions
# ------------------------------------------------
# As some of the message and reduce functions are very commonly used, DGL also
# provides **builtin functions**. For example, two builtin functions can be
# used in the PageRank example.
#
# * ``dgl.function.copy_src(src, out)`` is an edge UDF that computes the
# output using the source node feature data. User needs to specify the name of
# the source feature data (``src``) and the output name (``out``).
#
# * ``dgl.function.sum(msg, out)`` is a node UDF that sums the messages in
# the node's mailbox. User needs to specify the message name (``msg``) and the
# output name (``out``).
#
# For example, the PageRank example can be rewritten as follows:
import dgl.function as fn
def pagerank_builtin(g):
    g.ndata['pv'] = g.ndata['pv'] / g.ndata['deg']
    g.update_all(message_func=fn.copy_src(src='pv', out='m'),
                 reduce_func=fn.sum(msg='m', out='m_sum'))
    g.ndata['pv'] = (1 - DAMP) / N + DAMP * g.ndata['m_sum']
###############################################################################
# Here, we directly provide the UDFs to the `update_all` as its arguments.
# This will override the previously registered UDFs.
#
# In addition to cleaner code, using builtin functions also gives DGL the
# opportunity to fuse operations together, resulting in faster execution. For
# example, DGL will fuse the ``copy_src`` message function and ``sum`` reduce
# function into one sparse matrix-vector (spMV) multiplication.
#
# `This section <spmv_>`_ describes why spMV can speed up the scatter-gather
# phase in PageRank. For more details about the builtin functions in DGL,
# please read their API reference documents. (TODO: a link here).
#
# You can also download and run the codes to feel the difference.
for k in range(K):
    # Uncomment the corresponding line to select a different version.
    # pagerank_naive(g)
    # pagerank_batch(g)
    # pagerank_level2(g)
    pagerank_builtin(g)
print(g.ndata['pv'])
###############################################################################
# .. _spmv:
#
# Using spMV for PageRank
# -----------------------
# Using builtin functions allows DGL to understand the semantics of UDFs and
# thus provide a more efficient implementation for you. For example, in the case
# of PageRank, one common trick to accelerate it is to use its linear algebra
# form.
#
# .. math::
#
# \mathbf{R}^{k} = \frac{1-d}{N} \mathbf{1} + d \mathbf{A} \mathbf{R}^{k-1}
#
# Here, :math:`\mathbf{R}^k` is the vector of the PageRank values of all nodes
# at iteration :math:`k`; :math:`\mathbf{A}` is the sparse adjacency matrix
# of the graph.
# Computing this equation is quite efficient because there are efficient
# GPU kernels for *sparse-matrix-vector multiplication* (spMV). DGL
# detects whether such an optimization is available through the builtin
# functions. If a certain combination of builtins can be mapped to an spMV
# kernel (e.g. the PageRank example), DGL will use it automatically. As a
# result, *we recommend using builtin functions whenever possible*.
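#
# To make the connection concrete, here is a minimal sketch (not part of the
# original tutorial) of one PageRank iteration in matrix form, using a dense
# adjacency matrix built from the same graph; a real spMV implementation would
# use a sparse tensor instead. It assumes ``g.edges()`` returns (source,
# destination) id lists, as used in ``pagerank_naive`` above:
src, dst = g.edges()
A = torch.zeros(N, N)
A[dst, src] = 1.0   # A[v, u] = 1 for each edge u -> v
pv_matrix_form = (1 - DAMP) / N + DAMP * torch.matmul(A, g.ndata['pv'] / g.ndata['deg'])
print(pv_matrix_form)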
###############################################################################
# Next steps
# ----------
# Check out :doc:`GCN <models/1_gcn>` and :doc:`Capsule <models/2_capsule>`
# for more model implementations in DGL.
Model Tutorials
===============
Graph-based DNN models in DGL
=============================
Graph-based DNN models in DGL.
"""
Capsule Network
================
**Author**: `Jinjing Zhou`
This tutorial explains how to use the DGL library to implement the
`capsule network <http://arxiv.org/abs/1710.09829>`__ proposed by Geoffrey Hinton and his team.
The algorithm aims to provide a better alternative to current neural network structures.
By using DGL, users can implement the algorithm in a more intuitive way.
"""
##############################################################################
# Model Overview
# ---------------
# Introduction
# ```````````````````
# Capsule networks were first introduced in 2011 by Geoffrey Hinton et al.
# in the paper `Transforming Autoencoders <https://www.cs.toronto.edu/~fritz/absps/transauto6.pdf>`__,
# but it was not until November 2017 that Sara Sabour, Nicholas Frosst,
# and Geoffrey Hinton published the paper *Dynamic Routing Between Capsules*, where they
# introduced a CapsNet architecture that reached state-of-the-art performance on MNIST.
#
# What's a capsule?
# ```````````````````
# In the paper, the authors state that "A capsule is a group of neurons whose activity vector
# represents the instantiation parameters of a specific type of entity such as an object
# or an object part."
#
# Generally speaking, the idea of a capsule is to encode all the information about a
# feature in vector form, by substituting the scalars in a traditional neural network with vectors
# and using the norm of each vector to represent the meaning of the original scalar.
#
# .. image:: https://raw.githubusercontent.com/dmlc/web-data/master/dgl/tutorials/capsule/capsule_f1.png
#
# Dynamic Routing Algorithm
# `````````````````````````````
# Due to its different network structure, a capsule network uses different operations to
# compute its results. This figure shows the comparison, drawn by
# `Max Pechyonkin <https://medium.com/ai%C2%B3-theory-practice-business/understanding-hintons-capsule-networks-part-ii-how-capsules-work-153b6ade9f66O>`__:
#
# .. image:: https://raw.githubusercontent.com/dmlc/web-data/master/dgl/tutorials/capsule/capsule_f2.png
# :height: 250px
#
# The key idea is that the output of each capsule is a weighted sum of its input vectors.
# We will go into the details in a later section with the code implementation.
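#
# Concretely, using the notation of the Dynamic Routing paper (added here for
# reference), each output capsule :math:`j` computes
#
# .. math::
#
#    s_j = \sum_i c_{ij} \hat{u}_{j|i}, \qquad v_j = \mathrm{squash}(s_j),
#
# where the coupling coefficients :math:`c_{ij}` come from a softmax of the
# routing logits :math:`b_{ij}` over :math:`j`, and the logits are then updated
# as :math:`b_{ij} \leftarrow b_{ij} + \hat{u}_{j|i} \cdot v_j`.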
#
# Model Implementations
# -------------------------
##############################################################################
# Algorithm Overview
# ```````````````````````````
#
# .. image:: https://raw.githubusercontent.com/VoVAllen/DGL_Capsule/master/algorithm.png
#
# The main steps of the routing algorithm are lines 4 - 7. In the ``DGLGraph`` structure, we treat these steps as a message passing
# procedure.
##############################################################################
# Consider capsule routing as a graph structure
# ````````````````````````````````````````````````````````````````````````````
# We can consider each capsule as a node in a graph, and connect all the nodes between layers.
#
# .. image:: https://raw.githubusercontent.com/dmlc/web-data/master/dgl/tutorials/capsule/capsule_f3.png
# :height: 150px
#
def construct_graph(self):
    g = dgl.DGLGraph()
    g.add_nodes(self.input_capsule_num + self.output_capsule_num)
    input_nodes = list(range(self.input_capsule_num))
    output_nodes = list(range(self.input_capsule_num, self.input_capsule_num + self.output_capsule_num))
    u, v = [], []
    for i in input_nodes:
        for j in output_nodes:
            u.append(i)
            v.append(j)
    g.add_edges(u, v)
    return g, input_nodes, output_nodes
##############################################################################
# Write Message Passing Functions
# ``````````````````````````````````
# Reduce Functions (line 4 - 5)
# .............................................
#
# .. image:: https://raw.githubusercontent.com/dmlc/web-data/master/dgl/tutorials/capsule/capsule_f5.png
#
# At this stage, we need to define a reduce function to aggregate the node features
# from layer :math:`l` and compute their weighted sum to form layer :math:`(l+1)`'s node features.
#
# .. note::
# The softmax operation is over dimension :math:`j` instead of :math:`i`.
def capsule_reduce(node, msg):
    b_ij_c, u_hat = msg['b_ij'], msg['u_hat']
    # line 4
    c_i = F.softmax(b_ij_c, dim=0)
    # line 5
    s_j = (c_i.unsqueeze(2).unsqueeze(3) * u_hat).sum(dim=1)
    return {'h': s_j}
##############################################################################
# Node Update Functions (line 6)
# ......................................................
# Squash the intermediate representations into node features :math:`v_j`
#
# .. image:: https://raw.githubusercontent.com/dmlc/web-data/master/dgl/tutorials/capsule/step6.png
#
def capsule_update(msg):
    v_j = squash(msg['h'])
    return {'h': v_j}
##############################################################################
# Edge Update Functions (line 7)
# ...........................................................................
# Update the routing parameters by updating edges in graph
#
# .. image:: https://raw.githubusercontent.com/dmlc/web-data/master/dgl/tutorials/capsule/step7.png
#
def update_edge(u, v, edge):
    return {'b_ij': edge['b_ij'] + (v['h'] * edge['u_hat']).mean(dim=1).sum(dim=1)}
##############################################################################
# Call DGL function to execute algorithm
# ````````````````````````````````````````````````````````````````````````````
# Call the ``update_all`` and ``update_edge`` functions to execute the whole algorithm.
# The message function defines which attributes are needed in further computations.
#
def routing(self):
    def capsule_msg(src, edge):
        return {'b_ij': edge['b_ij'], 'h': src['h'], 'u_hat': edge['u_hat']}

    self.g.update_all(capsule_msg, capsule_reduce, capsule_update)
    self.g.update_edge(edge_func=update_edge)
##############################################################################
# Forward Function
# ````````````````````````````````````````````````````````````````````````````
# This section shows the whole forward process of the capsule routing algorithm.
def forward(self, x):
    self.batch_size = x.size(0)
    u_hat = self.compute_uhat(x)
    self.initialize_nodes_and_edges_features(u_hat)
    for i in range(self.num_routing):
        self.routing()
    this_layer_nodes_feature = self.g.get_n_repr()['h'][
        self.input_capsule_num:self.input_capsule_num + self.output_capsule_num]
    return this_layer_nodes_feature.transpose(0, 1).unsqueeze(1).unsqueeze(4).squeeze(1)
##############################################################################
# Other Workaround
# ````````````````````````````````````````````````````````````````
# Initialization & Affine Transformation
# ..................................................
# This section implements the transformation operation in capsule networks,
# which transforms capsules into different dimensions.
# - Pre-compute :math:`\hat{u}_{j|i}`, initialize :math:`b_{ij}` and store them as edge attribute
# - Initialize node features as zero
#
# .. image:: https://raw.githubusercontent.com/dmlc/web-data/master/dgl/tutorials/capsule/capsule_f4.png
#
def compute_uhat(self, x):
    # x is the input vector with shape [batch_size, input_capsule_dim, input_num]
    # Transpose x to [batch_size, input_num, input_capsule_dim]
    x = x.transpose(1, 2)
    # Expand x to [batch_size, input_num, output_num, input_capsule_dim, 1]
    x = torch.stack([x] * self.output_capsule_num, dim=2).unsqueeze(4)
    # Expand W from [input_num, output_num, output_capsule_dim, input_capsule_dim]
    # to [batch_size, input_num, output_num, output_capsule_dim, input_capsule_dim]
    W = self.weight.expand(self.batch_size, *self.weight.size())
    # u_hat's shape is [input_num, output_num, batch_size, output_capsule_dim]
    u_hat = torch.matmul(W, x).permute(1, 2, 0, 3, 4).squeeze().contiguous()
    return u_hat
def initialize_nodes_and_edges_features(self, u_hat):
    b_ij = torch.zeros(self.input_capsule_num, self.output_capsule_num).to(self.device)
    self.g.set_e_repr({'b_ij': b_ij.view(-1)})
    self.g.set_e_repr({'u_hat': u_hat.view(-1, self.batch_size, self.output_capsule_dim)})
    # Initialize all node features as zero
    node_features = torch.zeros(self.input_capsule_num + self.output_capsule_num, self.batch_size,
                                self.output_capsule_dim).to(self.device)
    self.g.set_n_repr({'h': node_features})
##############################################################################
# Squash function
# ..................
# The squashing function ensures that short vectors get shrunk to almost zero length and
# long vectors get shrunk to a length slightly below 1, so that the norm of a vector can
# be interpreted as a probability.
#
# .. image:: https://raw.githubusercontent.com/dmlc/web-data/master/dgl/tutorials/capsule/squash.png
# :height: 100px
#
def squash(s, dim=2):
    sq = torch.sum(s ** 2, dim=dim, keepdim=True)
    s_std = torch.sqrt(sq)
    s = (sq / (1.0 + sq)) * (s / s_std)
    return s
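# A quick sanity check (not part of the original text): the norm of a squashed
# vector stays strictly below one and approaches one for long inputs.
import torch
_v = torch.randn(2, 3, 4) * 10
print(squash(_v).norm(dim=2))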
##############################################################################
# General Setup
# .................
import dgl
import torch
import torch.nn.functional as F
from torch import nn
class DGLDigitCapsuleLayer(nn.Module):
    def __init__(self, input_capsule_dim=8, input_capsule_num=1152, output_capsule_num=10, output_capsule_dim=16,
                 num_routing=3, device='cpu'):
        super(DGLDigitCapsuleLayer, self).__init__()
        self.device = device
        self.input_capsule_dim = input_capsule_dim
        self.input_capsule_num = input_capsule_num
        self.output_capsule_dim = output_capsule_dim
        self.output_capsule_num = output_capsule_num
        self.num_routing = num_routing
        self.weight = nn.Parameter(
            torch.randn(input_capsule_num, output_capsule_num, output_capsule_dim, input_capsule_dim))
        self.g, self.input_nodes, self.output_nodes = self.construct_graph()
# This section allows the class to be defined across multiple cells.
DGLDigitCapsuleLayer.construct_graph = construct_graph
DGLDigitCapsuleLayer.forward = forward
DGLDigitCapsuleLayer.routing = routing
DGLDigitCapsuleLayer.compute_uhat = compute_uhat
DGLDigitCapsuleLayer.initialize_nodes_and_edges_features = initialize_nodes_and_edges_features
"""
Graph Convolutional Network New
====================================
**Author**: `Qi Huang`
This is a brief introduction to DGL and its message passing API through the GCN (graph convolutional network).
"""
##############################################################################
# Message Passing: Warming up
# ---------------------------
#
# Let's begin with the simplest possible graph, one with two nodes, and set the node representations:
import torch as th
import dgl
g = dgl.DGLGraph()
g.add_nodes(2)
g.add_edge(1, 0)
x = th.tensor([[0.0, 0.0], [1.0, 2.0]])
g.set_n_repr({'x': x})
##############################################################################
# What we want to do is simply copy the representation from node#1 to node#0, but through
# a message passing interface. We do this much like we would over a pair of sockets,
# with a ``send`` and a ``recv`` interface.
# The two `user defined functions (UDFs)` specify the actions: deposit the value into an internal
# key-value store under the key `msg`, and retrieve it. Note that there may be multiple incoming edges
# to a node, and the receiving end aggregates them.
#
# .. note::
# * ``send(src, dst)`` defines an edge explicitly, so ``message_func`` taking ``edge`` as an
# argument is confusing.
# * following graph construction semantics, it'll be nice to allow ``src`` and ``dst`` as a pair
# of lists, or a pair of tensor, though this example doesn't demonstrate it.
# * likewise, since we allow edge broadcasting, we should allow it in ``send`` as well.
# * what's the side-effect of doing a send action? we are left with the impression that the second argument
# in the ``reduce_func`` (i.e. ``msgs``) magically gets the stuff with the same key.
# * my preference is to say that expected side-effect is simply that the result of a ``send`` action is available
# at ``dst['key']``, where ``key`` is whatever the user specified in ``message_func``. this allows
# for cases where we use ``apply_node_func``.
# * in other words,
# ``message_func`` returns ``{'hey': [1.0]}``, we expect to see ``dst['hey']``. if that happens
# to be the representation key, then a replacement is done. User can define a new key, e.g. ``accum``,
# then the ``reduce_func`` and ``apply_node_func`` can do whatever they want. typically,
# they should return with the representation key to perform update.
#
def send_source(src, edge):
    return {'msg': src['x']}

def simple_reduce(node, msgs):
    return {'x' : th.sum(msgs['msg'], dim=1)}
g.send(1, 0, message_func=send_source)
g.recv([0], reduce_func=simple_reduce)
print(g.get_n_repr())
##############################################################################
# Sometimes the computation may involve representations on the edges. Let's say we want to "amplify"
# the message:
w = th.tensor([2.0])
g.set_e_repr({'w': w})
def send_source_with_edge_weight(src, edge):
    return {'msg': src['x'] * edge['w']}
g.send(1, 0, message_func=send_source_with_edge_weight)
g.recv([0], reduce_func=simple_reduce)
print(g.get_n_repr())
##############################################################################
# Or we may need to involve the destination's representation, and here is one version:
def simple_reduce_addup(node, msgs):
    return {'x' : node['x'] + th.sum(msgs['msg'], dim=1)}
g.send(1, 0, message_func=send_source_with_edge_weight)
g.recv([0], reduce_func=simple_reduce_addup)
print(g.get_n_repr())
##############################################################################
# A slightly more complex but more flexible one is to store the reduced sum at the node under
# a different key, and then call the ``apply_node_func``:
#
# .. note::
# that the result magically appears as part of the node's key-value store is non-intuitive.
def simple_reduce_to_accum(node, msgs):
    return {'accum' : th.sum(msgs['msg'], dim=1)}

def simple_apply(node):
    return {'x': node['x'] + node['accum']}
g.send(1, 0, message_func=send_source_with_edge_weight)
g.recv([0], reduce_func=simple_reduce_to_accum, apply_node_func=simple_apply)
print(g.get_n_repr())
##############################################################################
# ``send`` and ``recv`` are **level-1** calls in DGL; they give the finest control over routing
# messages.
#
# TODO: build a star graph (reuse the one in 2_graph.py), and use pull (or push)
#
# TODO: build a much bigger graph, explain with spMV and the use of ``update_all``
#
##############################################################################
# Model Overview
# ---------------
# Introduction
# ```````````````````
# This is a simple implementation of Kipf & Welling's Semi-Supervised Classification with Graph Convolutional Networks (ICLR 2017), which proposes a simple yet efficient model that extends convolutional neural networks from the grid-structured data we are all familiar with to graphs such as social networks and knowledge graphs. It starts from the framework of spectral graph convolutions and makes reasonable simplifications to achieve both faster training and higher prediction accuracy. It also achieves state-of-the-art classification results on a number of graph datasets such as CORA. /TODO: elaborate.
# Note that this is not intended to be an end-to-end lecture on Kipf & Welling's GCN paper. In this tutorial, we aim to provide a friendly entry point that showcases how to code up a contemporary NN model operating on graph-structured data, and to increase the user's understanding of DGL's message passing API in action. For a more thorough understanding of the derivation and all details of GCN, please read the original paper. /TODO(hq): add link.
#
# GCN in one formula
# `````````````````````
# Essentially, GCN's model boils down to the following one formula
# :math:`H^{(l+1)} = \sigma(\tilde{D}^{-\frac{1}{2}}\tilde{A}\tilde{D}^{-\frac{1}{2}}H^{(l)}W^{(l)})`
#
# The equation above describes a "graph convolution layer" in GCN.
# Essentially, :math:`H^{(l)}` denotes the :math:`l`-th layer in the network, :math:`\sigma` is the non-linearity, and :math:`W` is the weight matrix for this layer. :math:`D` and :math:`A`, as commonly seen, represent the degree matrix and the adjacency matrix, respectively. The tilde denotes a renormalization trick in which we add a self-connection to each node of the graph and build the corresponding degree and adjacency matrices.
#
# The shape of the input :math:`H^{(0)}` is :math:`N \times D`, where :math:`N` is the number of nodes and :math:`D` is the number of input features. We can chain up multiple layers as such to produce a node-level representation output with shape :math:`N \times F`, where :math:`F` is the dimension of the output node feature vector.
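#
# To make the formula concrete, here is a minimal sketch (not part of the
# original text) of one graph convolution layer computed with dense matrices;
# ``A`` (adjacency), ``H`` (node features), and ``W`` (layer weights) are
# assumed to be plain torch tensors of compatible shapes:
import torch as th
def dense_gcn_layer(A, H, W):
    A_tilde = A + th.eye(A.shape[0])            # add self-connections
    d_inv_sqrt = A_tilde.sum(dim=1).pow(-0.5)   # \tilde{D}^{-1/2} as a vector
    D_inv_sqrt = th.diag(d_inv_sqrt)
    return th.relu(D_inv_sqrt @ A_tilde @ D_inv_sqrt @ H @ W)
##############################################################################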
#
# Derivation of GCN
# ``````````````````
# \TODO(hq): do we need a short description of how we depart from spectral-based methods and arrive at GCN?
# According to others, this amounts to Laplacian smoothing.
#
# Understanding GCN from Message Passing
# ````````````````````````````````````````
# Think of :math:`W^{(l)}` simply as a matrix of
# filter parameters that projects :math:`H^{(l)}`, and of
# :math:`\tilde{D}^{-\frac{1}{2}}\tilde{A}\tilde{D}^{-\frac{1}{2}}` as a symmetric normalization of the
# adjacency matrix.
#
# Combining these two, we arrive at a most succinct form of GCN:
# :math:`\sigma(\hat{A}\hat{H}^{(l)})`,
# where :math:`\hat{A}` denotes a normalized version of the
# adjacency matrix, and :math:`\hat{H}` denotes the
# projection of the last layer's node-level representation :math:`H`.
#
# We can further view multiplication by the adjacency matrix as performing message passing between nodes along the paths encoded in the adjacency matrix.
# To make it simple, let's denote the input signal on a graph :math:`G = (V,E)` as :math:`x \in \mathbb{R}^{|\mathcal{V}| \times 1}`, i.e. assume each node's feature is a single scalar.
# Then, computing :math:`x_{t+1} = Ax_{t}` amounts to performing one round of message passing along the existing edges. The :math:`i`-th node's new feature :math:`x_{t+1}^{i}` adds up the entries of the old feature vector :math:`x_{t}` for exactly those nodes whose index has a non-zero entry in the :math:`i`-th row of the adjacency matrix :math:`A`, i.e. those that have an edge to node :math:`i`. If we multiply the resulting vector :math:`x_{t+1}` again by :math:`A`, the result, :math:`A^{2}x_{t}`, is the feature vector after two rounds of message passing. In this sense, :math:`A^2` encodes the 2-hop neighborhood of each node. By the k-hop neighborhood, we mean any node reachable in exactly k steps from the current node (if self-connections are not included in the original adjacency matrix), or any node reachable within k steps if self-connections are included. In another view, we can also understand :math:`A^2` entrywise as :math:`A^2_{i,j} = \bigvee_k \left( A_{i,k} \wedge A_{k,j} \right)`.
#
# Nonetheless, in GCN we only use :math:`\sigma(\hat{A}\hat{H}^{(l)})` in each layer, meaning we only propagate information among each node's 1-hop neighborhood for each layer.
#
#
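#
# As a tiny illustration (not part of the original text) of the 2-hop claim:
# for a 3-node path graph 0 - 1 - 2, the entry :math:`(A^2)_{0,2}` is non-zero,
# i.e. node 2 is reachable from node 0 in exactly two hops:
A_path = th.tensor([[0., 1., 0.],
                    [1., 0., 1.],
                    [0., 1., 0.]])
print(A_path @ A_path)
##############################################################################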
# Model Implementation
# ------------------------
# Warming up of message passing API
# ````````````````````````````````````
# DGL provides 3 levels of message passing APIs, giving the user different levels of control. Below we demonstrate the three levels of APIs on a simple star graph of size 10, where nodes 1-9 all send information to node 0.
#
# Level 1 -- send, recv, and apply_node
# ..........................................
# The most basic level is ``send(src, dst, message_function)``, ``recv(node, reduce_function)``, and ``apply_nodes(nodes)``.
# ``send()`` and ``recv()`` allow users to designate specific pairs of (source, destination) to pass information. ``apply_nodes()`` allows users to perform per-node computation.
#
# Three functions need to be pre-specified when using the message passing API: 1) the message function, 2) the reduce function, and 3) the apply function. The message function determines what message is passed along edges; the reduce function determines how messages are aggregated at the destination node; the apply function determines how each node transforms its own features afterwards. Note that all three functions can either be defined by users or taken from the built-in functions exposed by ``dgl.function``. For a more detailed description of the built-in function syntax, please see \TODO(hq) add hyperref.
#
# Users don't have to pass message_function and reduce_function as parameters every time if they have registered them on the graph a priori, as shown in the following code.
import argparse
import time
import torch as th
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import dgl
import networkx as nx
from dgl import DGLGraph
from dgl.data import register_data_args, load_data
star = dgl.DGLGraph()
star.add_nodes(10)
u = list(range(1,10))
star.add_edges(u,0) # create the graph
D = 1 # the feature dimension
N = star.number_of_nodes()
M = star.number_of_edges()
nfeat = th.ones((N, D)) # each node's feature is just 1
efeat = th.ones((M, D))*2 # each edge's feature is 2.
star.set_n_repr({'hv' : nfeat})
star.set_e_repr({'he' : efeat})
u = th.tensor([0])
v = th.tensor([1, 2, 3, 4, 5]) # send the features of nodes 1-5 to node 0
def _message_test(src, edge):
    return {'hv': src['hv']}
def _reduce(node, msgs):
    # aggregate along the second dimension, as the first
    # dimension is reserved for batching in DGL
    return {'hv': node['hv'] + msgs['hv'].sum(1)}
star.register_message_func(_message_test)
star.register_reduce_func(_reduce)
star.send(v,u)
# DGL supports batching send/recv and broadcasting.
star.recv(u)
#We expect to get 6 on node 0.
print(star.get_n_repr()['hv'])
##########################################################################
# Level 2 -- pull, push, and send_and_recv
# ............................................
# It can be both tedious and inefficient for users to call ``send()`` and ``recv()`` separately.
# DGL helps by providing a series of higher-level APIs, which may also improve performance
# through operator fusion in the backend ``/TODO(gaiyu) verify this statement please``.
# ``send_and_recv(src, dst, message_func, reduce_func, apply_func)`` is essentially a wrapper
# around send and recv.
# ``pull(nodes, message_func, reduce_func, apply_func)`` takes the input nodes as destination
# nodes and all of their predecessor nodes as source nodes, then performs ``send_and_recv()``.
# ``push(nodes, message_func, reduce_func, apply_func)`` takes the input nodes as source nodes
# and all of their successor nodes as destination nodes, then performs ``send_and_recv()``.
#
# Note that the apply function is usually optional in the message passing APIs.
star.set_n_repr({'hv' : nfeat}) #reset node repr
star.set_e_repr({'he' : efeat}) #reset edge repr
star.send_and_recv(v, u) # note that the apply function is omitted here
print(star.get_n_repr()['hv']) # we expect to get 6 on node 0
#####################################################################
#
# Then we register the apply function.
#
def _apply_test(node):
    return {'hv': 500 * node['hv']}
star.register_apply_node_func(_apply_test)
star.apply_nodes(u)
print(star.get_n_repr()['hv']) #we expect to get 3000 on node 0
#########################################################################
star.set_n_repr({'hv' : nfeat}) #reset node repr
star.set_e_repr({'he' : efeat}) #reset edge repr
star.pull(u)
print(star.get_n_repr()['hv']) # we expect to get 5000 on node 0, since it pulls from all 9 predecessors
###################################################################
star.set_n_repr({'hv' : nfeat}) #reset node repr
star.set_e_repr({'he' : efeat}) #reset edge repr
star.push(v)
print(star.get_n_repr()['hv']) # we expect to get 3000 on node 0
#######################################################################
# Level 3 -- update_all
# ..........................
# In many cases, users would like to perform message passing over all edges simultaneously,
# as in the adjacency matrix multiplication in GCN. DGL provides the ``update_all()`` method
# for this, optimizing performance under the hood.
star.set_n_repr({'hv' : nfeat}) #reset node repr
star.set_e_repr({'he' : efeat}) #reset edge repr
star.update_all(apply_node_func = None)
print(star.get_n_repr()['hv']) # we expect to get 10 on node 0, as we choose not to perform any apply_node functions
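##########################################################
# The same all-edge update can also be written with the built-in functions mentioned
# earlier. The following is a minimal sketch that assumes the built-ins ``copy_src``
# and ``sum`` are available under ``dgl.function``; it should reproduce the result above.
import dgl.function as fn
star.set_n_repr({'hv' : nfeat}) #reset node repr
star.set_e_repr({'he' : efeat}) #reset edge repr
star.update_all(fn.copy_src(src='hv', out='m'),
                fn.sum(msg='m', out='hv'),
                apply_node_func=None)
print(star.get_n_repr()['hv']) # we again expect to get 10 on node 0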
#
##########################################################
# GCN implementation
# ``````````````````````````````
# Model definition
# ....................
# Similar to above, we first define the message function, reduce function and apply function for GCN.
def gcn_msg(src, edge):
    return {'m' : src['h']} # pass the source node feature as the message
def gcn_reduce(node, msgs):
    return {'h' : th.sum(msgs['m'], 1)} # sum up the incoming messages
class NodeApplyModule(nn.Module):
    def __init__(self, in_feats, out_feats, activation=None):
        super(NodeApplyModule, self).__init__()
        self.linear = nn.Linear(in_feats, out_feats)
        self.activation = activation # apply a linear projection and a non-linearity
    def forward(self, node):
        h = self.linear(node['h'])
        if self.activation:
            h = self.activation(h)
        return {'h' : h}
class GCN(nn.Module):
    def __init__(self,
                 g,
                 in_feats,
                 n_hidden,
                 n_classes,
                 n_layers,
                 activation,
                 dropout,
                 mode=1):
        super(GCN, self).__init__()
        self.g = g # the graph is passed as a parameter to the model
        self.dropout = dropout
        # input layer
        self.layers = nn.ModuleList([NodeApplyModule(in_feats, n_hidden, activation)])
        # hidden layers
        for i in range(n_layers - 1):
            self.layers.append(NodeApplyModule(n_hidden, n_hidden, activation))
        # output layer
        self.layers.append(NodeApplyModule(n_hidden, n_classes))
        self.mode = mode # indicates which DGL message passing level to use
    # Message passing in 3 levels --- level 1
    def lv1_mp(self, layer):
        # send messages along all existing edges, then receive on every node
        src, dst = self.g.edges()
        self.g.send(src, dst, gcn_msg)
        node_ids = list(range(self.g.number_of_nodes()))
        self.g.recv(node_ids, gcn_reduce, layer)
    # Message passing in 3 levels --- level 2
    def lv2_mp(self, layer):
        dst = list(range(self.g.number_of_nodes()))
        self.g.pull(dst, gcn_msg, gcn_reduce, layer)
    # Message passing in 3 levels --- level 3
    def lv3_mp(self, layer):
        self.g.update_all(gcn_msg, gcn_reduce, layer)
    # Below is the forward function
    def forward(self, features):
        self.g.set_n_repr({'h' : features})
        for layer in self.layers:
            # apply dropout
            if self.dropout:
                self.g.apply_nodes(apply_node_func=
                    lambda node: {'h': F.dropout(node['h'], p=self.dropout)})
            assert self.mode in [1, 2, 3]
            if self.mode == 1:
                self.lv1_mp(layer)
            elif self.mode == 2:
                self.lv2_mp(layer)
            else:
                self.lv3_mp(layer)
        return self.g.pop_n_repr('h')
######################################################################
# Training & Inference
# ``````````````````````````````````
# Below we train the model and perform inference.
from dgl.data import citation_graph as citegrh
data = citegrh.load_cora()
features = th.FloatTensor(data.features)
labels = th.LongTensor(data.labels)
mask = th.ByteTensor(data.train_mask)
in_feats = features.shape[1]
n_classes = data.num_labels
n_edges = data.graph.number_of_edges()
# Some training hyperparameters for illustration
cuda = th.cuda.is_available() # only move data to the GPU when one is available
if cuda:
    features = features.cuda()
    labels = labels.cuda()
    mask = mask.cuda()
n_hidden = 16
n_layers = 1
dropout = 0
n_epochs = 200
lr = 1e-3
g = DGLGraph(data.graph)
model = GCN(g,
            in_feats,
            n_hidden,
            n_classes,
            n_layers,
            F.relu,
            dropout,
            mode=3) # level 3 message passing
model2 = GCN(g,
             in_feats,
             n_hidden,
             n_classes,
             n_layers,
             F.relu,
             dropout,
             mode=2) # level 2 message passing
if cuda:
    model.cuda()
    model2.cuda()
# use optimizer
optimizer = th.optim.Adam(model2.parameters(), lr=lr)
# training loop
dur = [] # per-epoch durations for throughput reporting
for epoch in range(n_epochs):
    if epoch >= 3:
        t0 = time.time()
    # forward
    logits = model2(features)
    logp = F.log_softmax(logits, 1)
    loss = F.nll_loss(logp[mask], labels[mask])
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if epoch >= 3:
        dur.append(time.time() - t0)
    print("Epoch {:05d} | Loss {:.4f} | Time(s) {:.4f} | ETputs(KTEPS) {:.2f}".format(
        epoch, loss.item(), np.mean(dur), n_edges / np.mean(dur) / 1000))
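######################################################################
# After training, we can run inference on the held-out nodes and report accuracy.
# This is a minimal sketch: it assumes the Cora loader also exposes a ``test_mask``
# alongside ``train_mask``.
model2.eval()
test_mask = th.ByteTensor(data.test_mask)
if cuda:
    test_mask = test_mask.cuda()
with th.no_grad():
    logits = model2(features)
    _, pred = logits.max(dim=1)
    acc = (pred[test_mask] == labels[test_mask]).float().mean().item()
print("Test accuracy {:.4f}".format(acc))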
"""
Graph Convolutional Network New
====================================
**Author**: `Quan Gan`
In this tutorial, we will go through the basics of DGL, in the following order:
1. Creating a graph
2. Setting/getting node/edge states
3. Updating node/edge states using user-defined functions
4. Passing information to edges from endpoint nodes
5. Passing information to nodes from adjacent nodes and edges
6. Implementing a Graph Convolutional Network (GCN) and a Graph Attention
Network (GAT)
7. Using built-in functions to simplify your implementation
"""
##############################################################################
# Section 1. Creating a Graph
# ---------------------------
#
# Let's say we want to create the following graph:
#
# .. digraph:: foo
#
# digraph foo {
# layout=circo;
# "A" -> "B" -> "C" -> "A";
# }
#
# First, we need to create a ``DGLGraph`` object.
from dgl import DGLGraph
g = DGLGraph()
##############################################################################
# And then we add 3 vertices (or *nodes*) into ``g``:
g.add_nodes(3)
##############################################################################
# In DGL, all vertices are uniquely identified by integers, starting from 0.
# Assuming that we map the node ``A``, ``B``, and ``C`` to ID 0, 1, and 2, we
# can add the edges of the desired graph above as follows:
g.add_edge(0, 1)
g.add_edge(1, 2)
g.add_edge(2, 0)
# Or, equivalently
# g.add_edges([0, 1, 2], [1, 2, 0])
##############################################################################
# All the edges are also uniquely identified by integers, again starting from
# 0. The edges are labeled in the order of addition. In the example above,
# the edge ``0 -> 1`` is labeled as edge #0, ``1 -> 2`` as edge #1, and
# ``2 -> 0`` as edge #2.
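#
# We can check this ordering directly. This is a small sketch that assumes
# ``g.edges()`` returns the edge endpoints in edge ID order.
print(g.edges())  # expected: sources [0, 1, 2] paired with destinations [1, 2, 0]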
##############################################################################
# Section 2. Setting/getting node/edge states
# ----------------------------------------------
# Now, we wish to assign the nodes some states, or features.
#
# In DGL, the node/edge states are represented as dictionaries, with strings
# as keys (or *fields*), and tensors as values. DGL aims to be
# framework-agnostic, and currently it supports PyTorch and MXNet. From now
# on, we use PyTorch as an example.
#
# You can set up states for some or all nodes at the same time in DGL.
# All you need is to stack the tensors along the first dimension for each
# key, and feed the dictionary of the stacked tensors into ``set_n_repr``
# as a whole.
import torch
# We are going to assign each node two states X and Y. For each node,
# X is a 2-D vector and Y is a 2x4 matrix. You only need to make sure that
# tensors sharing the same key have the same shape and data type across
# all the nodes being set.
X = torch.randn(3, 2)
Y = torch.randn(3, 2, 4)
# You can set the states for all of them...
g.set_n_repr({'X': X, 'Y': Y})
# ... or set the states on a subset of nodes, but only after you have set
# all nodes on at least one key.
# TODO: do we want to fix this behavior to allow initial partial setting?
g.set_n_repr({'X': X[0:2], 'Y': Y[0:2]}, [0, 1])
# You can also overwrite part of the fields. The following overwrites field
# X while keeping Y intact.
X = torch.randn(3, 2)
g.set_n_repr({'X': X})
##############################################################################
# You can also efficiently get the node states as a dictionary of tensors.
# The dictionary will also have strings as keys and stacked tensors as values.
# Getting all node states. The tensors will be stacked along the first
# dimension, in the same order as node ID.
n_repr = g.get_n_repr()
X_ = n_repr['X']
Y_ = n_repr['Y']
assert torch.allclose(X_, X)
assert torch.allclose(Y_, Y)
# You can also get the states from a subset of nodes. The tensors will be
# stacked along the first dimension, in the same order as what you feed in.
n_repr_subset = g.get_n_repr([0, 2])
X_ = n_repr_subset['X']
Y_ = n_repr_subset['Y']
assert torch.allclose(X_, X[[0, 2]])
assert torch.allclose(Y_, Y[[0, 2]])
##############################################################################
# Setting/getting edge states is very similar. We provide two ways of reading
# and writing edge states: by source-destination pairs, and by edge ID.
# We are going to assign each edge a state A and a state B, both of which are
# 3-D vectors for each edge.
A = torch.randn(3, 3)
B = torch.randn(3, 3)
# You can either set the states of all edges...
g.set_e_repr({'A': A, 'B': B})
# ... or by source-destination pair (in this case, assigning A[0] to (0 -> 1)
# and A[2] to (2 -> 0) ...
g.set_e_repr({'A': A[[0, 2]], 'B': B[[0, 2]]}, [0, 2], [1, 0])
# ... or by edge ID (#0 and #2)
g.set_e_repr_by_id({'A': A[[0, 2]], 'B': B[[0, 2]]}, [0, 2])
# Note that the latter two options are available only if you have set at least
# one field on all edges.
# TODO: do we want to fix this behavior to allow initial partial setting?
# Getting edge states is also easy...
e_repr = g.get_e_repr()
A_ = e_repr['A']
assert torch.allclose(A_, A)
# ... and you can also do it either by specifying source-destination pair...
e_repr_subset = g.get_e_repr([0], [1])
assert torch.allclose(e_repr_subset['A'], A[[0]])
# ... or by edge ID
e_repr_subset = g.get_e_repr_by_id([0])
assert torch.allclose(e_repr_subset['A'], A[[0]])
##############################################################################
# One can also remove node/edge states from the graph. This is particularly
# useful to save memory during inference.
B_ = g.pop_e_repr('B')
assert torch.allclose(B_, B)
##############################################################################
# Section 3. Updating node/edge states
# ------------------------------------
# The most direct way to update node/edge states is by getting/setting the
# states directly. Of course, you can update the states on a subset of
# nodes and/or edges this way.
X_new = g.get_n_repr()['X'] + 2
g.set_n_repr({'X': X_new})
##############################################################################
# A better structured implementation would wrap the update procedure as a
# function/module, to decouple the update logic from the rest of the system.
def updateX(node_state_dict):
    return {'X': node_state_dict['X'] + 2}
g.set_n_repr(updateX(g.get_n_repr()))
##############################################################################
# If your node state update function is a **node-wise map** operation (i.e.
# the update on a single node only depends on the current state of that
# particular node), you can also call the ``apply_nodes`` method.
#
# .. note::
#
#    In distributed computation, node-wise map operations can be executed
#    independently on each node, since each update only depends on that
#    node's own state, which makes them easy to parallelize.
g.apply_nodes(apply_node_func=updateX)
# You can also update node states partially
g.apply_nodes(v=[0, 1], apply_node_func=updateX)
##############################################################################
# For edges, DGL also has an ``apply_edges`` method for **edge-wise map**
# operations.
def updateA(edge_state_dict):
    return {'A': edge_state_dict['A'] + 2}
g.apply_edges(apply_edge_func=updateA)
# You can also update edge states by specifying endpoints or edge IDs
g.apply_edges(u=[0, 2], v=[1, 0], apply_edge_func=updateA)
g.apply_edges(eid=[0, 2], apply_edge_func=updateA)