Unverified Commit 68ec6247 authored by Minjie Wang, committed by GitHub

[API][Doc] API change & basic tutorials (#113)

* Add SH tutorials

* setup sphinx-gallery; work on graph tutorial

* draft dglgraph tutorial

* update readme to include document url

* rm obsolete file

* Draft the message passing tutorial

* Capsule code (#102)

* add capsule example

* clean code

* better naming

* better naming

* [GCN]tutorial scaffold

* fix capsule example code

* remove previous capsule example code

* graph struc edit

* modified:   2_graph.py

* update doc of capsule

* update capsule docs

* update capsule docs

* add msg passing prime

* GCN-GAT tutorial Section 1 and 2

* comment for API improvement

* section 3

* Tutorial API change (#115)

* change the API as discusses; toy example

* enable the new set/get syntax

* fixed pytorch utest

* fixed gcn example

* fixed gat example

* fixed mx utests

* fix mx utest

* delete apply edges; add utest for update_edges

* small change on toy example

* fix utest

* fix out in degrees bug

* update pagerank example and add it to CI

* add delitem for dataview

* make edges() return form that is compatible with send/update_edges etc

* fix index bug when the given data is one-int-tensor

* fix doc
parent 2ecd2b23
...@@ -8,24 +8,24 @@ D = 5 ...@@ -8,24 +8,24 @@ D = 5
def test_line_graph(): def test_line_graph():
N = 5 N = 5
G = dgl.DGLGraph(nx.star_graph(N)) G = dgl.DGLGraph(nx.star_graph(N))
G.set_e_repr({'h' : th.randn((2 * N, D))}) G.edata['h'] = th.randn((2 * N, D))
n_edges = G.number_of_edges() n_edges = G.number_of_edges()
L = G.line_graph(shared=True) L = G.line_graph(shared=True)
assert L.number_of_nodes() == 2 * N assert L.number_of_nodes() == 2 * N
L.set_n_repr({'h' : th.randn((2 * N, D))}) L.ndata['h'] = th.randn((2 * N, D))
# update node features on line graph should reflect to edge features on # update node features on line graph should reflect to edge features on
# original graph. # original graph.
u = [0, 0, 2, 3] u = [0, 0, 2, 3]
v = [1, 2, 0, 0] v = [1, 2, 0, 0]
eid = G.edge_ids(u, v) eid = G.edge_ids(u, v)
L.set_n_repr({'h' : th.zeros((4, D))}, eid) L.nodes[eid].data['h'] = th.zeros((4, D))
assert th.allclose(G.get_e_repr(u, v)['h'], th.zeros((4, D))) assert th.allclose(G.edges[u, v].data['h'], th.zeros((4, D)))
# adding a new node feature on line graph should also reflect to a new # adding a new node feature on line graph should also reflect to a new
# edge feature on original graph # edge feature on original graph
data = th.randn(n_edges, D) data = th.randn(n_edges, D)
L.set_n_repr({'w': data}) L.ndata['w'] = data
assert th.allclose(G.get_e_repr()['w'], data) assert th.allclose(G.edata['w'], data)
def test_no_backtracking(): def test_no_backtracking():
N = 5 N = 5
......
...@@ -21,41 +21,41 @@ def generate_graph(): ...@@ -21,41 +21,41 @@ def generate_graph():
def test_update_all(): def test_update_all():
def _test(fld): def _test(fld):
def message_func(hu, edge): def message_func(edges):
return {'m' : hu[fld]} return {'m' : edges.src[fld]}
def message_func_edge(hu, edge): def message_func_edge(edges):
if len(hu[fld].shape) == 1: if len(edges.src[fld].shape) == 1:
return {'m' : hu[fld] * edge['e1']} return {'m' : edges.src[fld] * edges.data['e1']}
else: else:
return {'m' : hu[fld] * edge['e2']} return {'m' : edges.src[fld] * edges.data['e2']}
def reduce_func(hv, msgs): def reduce_func(nodes):
return {fld : th.sum(msgs['m'], 1)} return {fld : th.sum(nodes.mailbox['m'], 1)}
def apply_func(hu): def apply_func(nodes):
return {fld : 2 * hu[fld]} return {fld : 2 * nodes.data[fld]}
g = generate_graph() g = generate_graph()
# update all # update all
v1 = g.get_n_repr()[fld] v1 = g.ndata[fld]
g.update_all(fn.copy_src(src=fld, out='m'), fn.sum(msg='m', out=fld), apply_func) g.update_all(fn.copy_src(src=fld, out='m'), fn.sum(msg='m', out=fld), apply_func)
v2 = g.get_n_repr()[fld] v2 = g.ndata[fld]
g.set_n_repr({fld : v1}) g.set_n_repr({fld : v1})
g.update_all(message_func, reduce_func, apply_func) g.update_all(message_func, reduce_func, apply_func)
v3 = g.get_n_repr()[fld] v3 = g.ndata[fld]
assert th.allclose(v2, v3) assert th.allclose(v2, v3)
# update all with edge weights # update all with edge weights
v1 = g.get_n_repr()[fld] v1 = g.ndata[fld]
g.update_all(fn.src_mul_edge(src=fld, edge='e1', out='m'), g.update_all(fn.src_mul_edge(src=fld, edge='e1', out='m'),
fn.sum(msg='m', out=fld), apply_func) fn.sum(msg='m', out=fld), apply_func)
v2 = g.get_n_repr()[fld] v2 = g.ndata[fld]
g.set_n_repr({fld : v1}) g.set_n_repr({fld : v1})
g.update_all(fn.src_mul_edge(src=fld, edge='e2', out='m'), g.update_all(fn.src_mul_edge(src=fld, edge='e2', out='m'),
fn.sum(msg='m', out=fld), apply_func) fn.sum(msg='m', out=fld), apply_func)
v3 = g.get_n_repr()[fld] v3 = g.ndata[fld]
g.set_n_repr({fld : v1}) g.set_n_repr({fld : v1})
g.update_all(message_func_edge, reduce_func, apply_func) g.update_all(message_func_edge, reduce_func, apply_func)
v4 = g.get_n_repr()[fld] v4 = g.ndata[fld]
assert th.allclose(v2, v3) assert th.allclose(v2, v3)
assert th.allclose(v3, v4) assert th.allclose(v3, v4)
# test 1d node features # test 1d node features
...@@ -67,42 +67,42 @@ def test_send_and_recv(): ...@@ -67,42 +67,42 @@ def test_send_and_recv():
u = th.tensor([0, 0, 0, 3, 4, 9]) u = th.tensor([0, 0, 0, 3, 4, 9])
v = th.tensor([1, 2, 3, 9, 9, 0]) v = th.tensor([1, 2, 3, 9, 9, 0])
def _test(fld): def _test(fld):
def message_func(hu, edge): def message_func(edges):
return {'m' : hu[fld]} return {'m' : edges.src[fld]}
def message_func_edge(hu, edge): def message_func_edge(edges):
if len(hu[fld].shape) == 1: if len(edges.src[fld].shape) == 1:
return {'m' : hu[fld] * edge['e1']} return {'m' : edges.src[fld] * edges.data['e1']}
else: else:
return {'m' : hu[fld] * edge['e2']} return {'m' : edges.src[fld] * edges.data['e2']}
def reduce_func(hv, msgs): def reduce_func(nodes):
return {fld : th.sum(msgs['m'], 1)} return {fld : th.sum(nodes.mailbox['m'], 1)}
def apply_func(hu): def apply_func(nodes):
return {fld : 2 * hu[fld]} return {fld : 2 * nodes.data[fld]}
g = generate_graph() g = generate_graph()
# send and recv # send and recv
v1 = g.get_n_repr()[fld] v1 = g.ndata[fld]
g.send_and_recv(u, v, fn.copy_src(src=fld, out='m'), g.send_and_recv((u, v), fn.copy_src(src=fld, out='m'),
fn.sum(msg='m', out=fld), apply_func) fn.sum(msg='m', out=fld), apply_func)
v2 = g.get_n_repr()[fld] v2 = g.ndata[fld]
g.set_n_repr({fld : v1}) g.set_n_repr({fld : v1})
g.send_and_recv(u, v, message_func, reduce_func, apply_func) g.send_and_recv((u, v), message_func, reduce_func, apply_func)
v3 = g.get_n_repr()[fld] v3 = g.ndata[fld]
assert th.allclose(v2, v3) assert th.allclose(v2, v3)
# send and recv with edge weights # send and recv with edge weights
v1 = g.get_n_repr()[fld] v1 = g.ndata[fld]
g.send_and_recv(u, v, fn.src_mul_edge(src=fld, edge='e1', out='m'), g.send_and_recv((u, v), fn.src_mul_edge(src=fld, edge='e1', out='m'),
fn.sum(msg='m', out=fld), apply_func) fn.sum(msg='m', out=fld), apply_func)
v2 = g.get_n_repr()[fld] v2 = g.ndata[fld]
g.set_n_repr({fld : v1}) g.set_n_repr({fld : v1})
g.send_and_recv(u, v, fn.src_mul_edge(src=fld, edge='e2', out='m'), g.send_and_recv((u, v), fn.src_mul_edge(src=fld, edge='e2', out='m'),
fn.sum(msg='m', out=fld), apply_func) fn.sum(msg='m', out=fld), apply_func)
v3 = g.get_n_repr()[fld] v3 = g.ndata[fld]
g.set_n_repr({fld : v1}) g.set_n_repr({fld : v1})
g.send_and_recv(u, v, message_func_edge, reduce_func, apply_func) g.send_and_recv((u, v), message_func_edge, reduce_func, apply_func)
v4 = g.get_n_repr()[fld] v4 = g.ndata[fld]
assert th.allclose(v2, v3) assert th.allclose(v2, v3)
assert th.allclose(v3, v4) assert th.allclose(v3, v4)
# test 1d node features # test 1d node features
...@@ -111,14 +111,14 @@ def test_send_and_recv(): ...@@ -111,14 +111,14 @@ def test_send_and_recv():
_test('f2') _test('f2')
def test_update_all_multi_fn(): def test_update_all_multi_fn():
def message_func(hu, edge): def message_func(edges):
return {'m2': hu['f2']} return {'m2': edges.src['f2']}
def message_func_edge(hu, edge): def message_func_edge(edges):
return {'m2': hu['f2'] * edge['e2']} return {'m2': edges.src['f2'] * edges.data['e2']}
def reduce_func(hv, msgs): def reduce_func(nodes):
return {'v2': th.sum(msgs['m2'], 1)} return {'v2': th.sum(nodes.mailbox['m2'], 1)}
g = generate_graph() g = generate_graph()
g.set_n_repr({'v1' : th.zeros((10,)), 'v2' : th.zeros((10,))}) g.set_n_repr({'v1' : th.zeros((10,)), 'v2' : th.zeros((10,))})
...@@ -127,19 +127,19 @@ def test_update_all_multi_fn(): ...@@ -127,19 +127,19 @@ def test_update_all_multi_fn():
g.update_all([fn.copy_src(src=fld, out='m1'), message_func], g.update_all([fn.copy_src(src=fld, out='m1'), message_func],
[fn.sum(msg='m1', out='v1'), reduce_func], [fn.sum(msg='m1', out='v1'), reduce_func],
None) None)
v1 = g.get_n_repr()['v1'] v1 = g.ndata['v1']
v2 = g.get_n_repr()['v2'] v2 = g.ndata['v2']
assert th.allclose(v1, v2) assert th.allclose(v1, v2)
# run builtin with single message and reduce # run builtin with single message and reduce
g.update_all(fn.copy_src(src=fld, out='m'), fn.sum(msg='m', out='v1'), None) g.update_all(fn.copy_src(src=fld, out='m'), fn.sum(msg='m', out='v1'), None)
v1 = g.get_n_repr()['v1'] v1 = g.ndata['v1']
assert th.allclose(v1, v2) assert th.allclose(v1, v2)
# 1 message, 2 reduces # 1 message, 2 reduces
g.update_all(fn.copy_src(src=fld, out='m'), [fn.sum(msg='m', out='v2'), fn.sum(msg='m', out='v3')], None) g.update_all(fn.copy_src(src=fld, out='m'), [fn.sum(msg='m', out='v2'), fn.sum(msg='m', out='v3')], None)
v2 = g.get_n_repr()['v2'] v2 = g.ndata['v2']
v3 = g.get_n_repr()['v3'] v3 = g.ndata['v3']
assert th.allclose(v1, v2) assert th.allclose(v1, v2)
assert th.allclose(v1, v3) assert th.allclose(v1, v3)
...@@ -147,29 +147,29 @@ def test_update_all_multi_fn(): ...@@ -147,29 +147,29 @@ def test_update_all_multi_fn():
g.update_all([fn.src_mul_edge(src=fld, edge='e1', out='m1'), fn.src_mul_edge(src=fld, edge='e2', out='m2')], g.update_all([fn.src_mul_edge(src=fld, edge='e1', out='m1'), fn.src_mul_edge(src=fld, edge='e2', out='m2')],
[fn.sum(msg='m1', out='v1'), fn.sum(msg='m2', out='v2'), fn.sum(msg='m1', out='v3')], [fn.sum(msg='m1', out='v1'), fn.sum(msg='m2', out='v2'), fn.sum(msg='m1', out='v3')],
None) None)
v1 = g.get_n_repr()['v1'] v1 = g.ndata['v1']
v2 = g.get_n_repr()['v2'] v2 = g.ndata['v2']
v3 = g.get_n_repr()['v3'] v3 = g.ndata['v3']
assert th.allclose(v1, v2) assert th.allclose(v1, v2)
assert th.allclose(v1, v3) assert th.allclose(v1, v3)
# run UDF with single message and reduce # run UDF with single message and reduce
g.update_all(message_func_edge, reduce_func, None) g.update_all(message_func_edge, reduce_func, None)
v2 = g.get_n_repr()['v2'] v2 = g.ndata['v2']
assert th.allclose(v1, v2) assert th.allclose(v1, v2)
def test_send_and_recv_multi_fn(): def test_send_and_recv_multi_fn():
u = th.tensor([0, 0, 0, 3, 4, 9]) u = th.tensor([0, 0, 0, 3, 4, 9])
v = th.tensor([1, 2, 3, 9, 9, 0]) v = th.tensor([1, 2, 3, 9, 9, 0])
def message_func(hu, edge): def message_func(edges):
return {'m2': hu['f2']} return {'m2': edges.src['f2']}
def message_func_edge(hu, edge): def message_func_edge(edges):
return {'m2': hu['f2'] * edge['e2']} return {'m2': edges.src['f2'] * edges.data['e2']}
def reduce_func(hv, msgs): def reduce_func(nodes):
return {'v2' : th.sum(msgs['m2'], 1)} return {'v2' : th.sum(nodes.mailbox['m2'], 1)}
g = generate_graph() g = generate_graph()
g.set_n_repr({'v1' : th.zeros((10, D)), 'v2' : th.zeros((10, D)), g.set_n_repr({'v1' : th.zeros((10, D)), 'v2' : th.zeros((10, D)),
...@@ -177,45 +177,45 @@ def test_send_and_recv_multi_fn(): ...@@ -177,45 +177,45 @@ def test_send_and_recv_multi_fn():
fld = 'f2' fld = 'f2'
# send and recv, mix of builtin and UDF # send and recv, mix of builtin and UDF
g.send_and_recv(u, v, g.send_and_recv((u, v),
[fn.copy_src(src=fld, out='m1'), message_func], [fn.copy_src(src=fld, out='m1'), message_func],
[fn.sum(msg='m1', out='v1'), reduce_func], [fn.sum(msg='m1', out='v1'), reduce_func],
None) None)
v1 = g.get_n_repr()['v1'] v1 = g.ndata['v1']
v2 = g.get_n_repr()['v2'] v2 = g.ndata['v2']
assert th.allclose(v1, v2) assert th.allclose(v1, v2)
# run builtin with single message and reduce # run builtin with single message and reduce
g.send_and_recv(u, v, fn.copy_src(src=fld, out='m'), fn.sum(msg='m', out='v1'), g.send_and_recv((u, v), fn.copy_src(src=fld, out='m'), fn.sum(msg='m', out='v1'),
None) None)
v1 = g.get_n_repr()['v1'] v1 = g.ndata['v1']
assert th.allclose(v1, v2) assert th.allclose(v1, v2)
# 1 message, 2 reduces # 1 message, 2 reduces
g.send_and_recv(u, v, g.send_and_recv((u, v),
fn.copy_src(src=fld, out='m'), fn.copy_src(src=fld, out='m'),
[fn.sum(msg='m', out='v2'), fn.sum(msg='m', out='v3')], [fn.sum(msg='m', out='v2'), fn.sum(msg='m', out='v3')],
None) None)
v2 = g.get_n_repr()['v2'] v2 = g.ndata['v2']
v3 = g.get_n_repr()['v3'] v3 = g.ndata['v3']
assert th.allclose(v1, v2) assert th.allclose(v1, v2)
assert th.allclose(v1, v3) assert th.allclose(v1, v3)
# send and recv with edge weights, 2 message, 3 reduces # send and recv with edge weights, 2 message, 3 reduces
g.send_and_recv(u, v, g.send_and_recv((u, v),
[fn.src_mul_edge(src=fld, edge='e1', out='m1'), fn.src_mul_edge(src=fld, edge='e2', out='m2')], [fn.src_mul_edge(src=fld, edge='e1', out='m1'), fn.src_mul_edge(src=fld, edge='e2', out='m2')],
[fn.sum(msg='m1', out='v1'), fn.sum(msg='m2', out='v2'), fn.sum(msg='m1', out='v3')], [fn.sum(msg='m1', out='v1'), fn.sum(msg='m2', out='v2'), fn.sum(msg='m1', out='v3')],
None) None)
v1 = g.get_n_repr()['v1'] v1 = g.ndata['v1']
v2 = g.get_n_repr()['v2'] v2 = g.ndata['v2']
v3 = g.get_n_repr()['v3'] v3 = g.ndata['v3']
assert th.allclose(v1, v2) assert th.allclose(v1, v2)
assert th.allclose(v1, v3) assert th.allclose(v1, v3)
# run UDF with single message and reduce # run UDF with single message and reduce
g.send_and_recv(u, v, message_func_edge, g.send_and_recv((u, v), message_func_edge,
reduce_func, None) reduce_func, None)
v2 = g.get_n_repr()['v2'] v2 = g.ndata['v2']
assert th.allclose(v1, v2) assert th.allclose(v1, v2)
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -16,27 +16,27 @@ def generate_graph(grad=False): ...@@ -16,27 +16,27 @@ def generate_graph(grad=False):
g.add_edge(9, 0) g.add_edge(9, 0)
ncol = Variable(th.randn(10, D), requires_grad=grad) ncol = Variable(th.randn(10, D), requires_grad=grad)
ecol = Variable(th.randn(17, D), requires_grad=grad) ecol = Variable(th.randn(17, D), requires_grad=grad)
g.set_n_repr({'h' : ncol}) g.ndata['h'] = ncol
g.set_e_repr({'l' : ecol}) g.edata['l'] = ecol
return g return g
def test_basics(): def test_basics():
g = generate_graph() g = generate_graph()
h = g.get_n_repr()['h'] h = g.ndata['h']
l = g.get_e_repr()['l'] l = g.edata['l']
nid = [0, 2, 3, 6, 7, 9] nid = [0, 2, 3, 6, 7, 9]
sg = g.subgraph(nid) sg = g.subgraph(nid)
eid = {2, 3, 4, 5, 10, 11, 12, 13, 16} eid = {2, 3, 4, 5, 10, 11, 12, 13, 16}
assert set(sg.parent_eid.numpy()) == eid assert set(sg.parent_eid.numpy()) == eid
eid = sg.parent_eid eid = sg.parent_eid
# the subgraph is empty initially # the subgraph is empty initially
assert len(sg.get_n_repr()) == 0 assert len(sg.ndata) == 0
assert len(sg.get_e_repr()) == 0 assert len(sg.edata) == 0
# the data is copied after explict copy from # the data is copied after explict copy from
sg.copy_from_parent() sg.copy_from_parent()
assert len(sg.get_n_repr()) == 1 assert len(sg.ndata) == 1
assert len(sg.get_e_repr()) == 1 assert len(sg.edata) == 1
sh = sg.get_n_repr()['h'] sh = sg.ndata['h']
assert th.allclose(h[nid], sh) assert th.allclose(h[nid], sh)
''' '''
s, d, eid s, d, eid
...@@ -58,11 +58,11 @@ def test_basics(): ...@@ -58,11 +58,11 @@ def test_basics():
8, 9, 15 3 8, 9, 15 3
9, 0, 16 1 9, 0, 16 1
''' '''
assert th.allclose(l[eid], sg.get_e_repr()['l']) assert th.allclose(l[eid], sg.edata['l'])
# update the node/edge features on the subgraph should NOT # update the node/edge features on the subgraph should NOT
# reflect to the parent graph. # reflect to the parent graph.
sg.set_n_repr({'h' : th.zeros((6, D))}) sg.ndata['h'] = th.zeros((6, D))
assert th.allclose(h, g.get_n_repr()['h']) assert th.allclose(h, g.ndata['h'])
def test_merge(): def test_merge():
# FIXME: current impl cannot handle this case!!! # FIXME: current impl cannot handle this case!!!
...@@ -85,8 +85,8 @@ def test_merge(): ...@@ -85,8 +85,8 @@ def test_merge():
g.merge([sg1, sg2, sg3]) g.merge([sg1, sg2, sg3])
h = g.get_n_repr()['h'][:,0] h = g.ndata['h'][:,0]
l = g.get_e_repr()['l'][:,0] l = g.edata['l'][:,0]
assert th.allclose(h, th.tensor([3., 0., 3., 3., 2., 0., 1., 1., 0., 1.])) assert th.allclose(h, th.tensor([3., 0., 3., 3., 2., 0., 1., 1., 0., 1.]))
assert th.allclose(l, assert th.allclose(l,
th.tensor([0., 0., 1., 1., 1., 1., 0., 0., 0., 3., 1., 4., 1., 4., 0., 3., 1.])) th.tensor([0., 0., 1., 1., 1., 1., 0., 0., 0., 3., 1., 4., 1., 4., 0., 3., 1.]))
......
#!/bin/bash #!/bin/bash
GCN_EXAMPLE_DIR="../../examples/pytorch/gcn" GCN_EXAMPLE_DIR="../../examples/pytorch/"
function fail { function fail {
echo FAIL: $@ echo FAIL: $@
...@@ -29,8 +29,9 @@ fi ...@@ -29,8 +29,9 @@ fi
pushd $GCN_EXAMPLE_DIR> /dev/null pushd $GCN_EXAMPLE_DIR> /dev/null
# test CPU # test
python3 gcn.py --dataset cora --gpu $dev || fail "run gcn.py on $1" python3 pagerank.py || fail "run pagerank.py on $1"
python3 gcn_spmv.py --dataset cora --gpu $dev || fail "run gcn_spmv.py on $1" python3 gcn/gcn.py --dataset cora --gpu $dev || fail "run gcn/gcn.py on $1"
python3 gcn/gcn_spmv.py --dataset cora --gpu $dev || fail "run gcn/gcn_spmv.py on $1"
popd > /dev/null popd > /dev/null
###############################################################################
# A toy example
# -------------
#
# Let’s begin with the simplest graph possible with two nodes, and set
# the node representations:
import torch as th
import dgl
g = dgl.DGLGraph()
g.add_nodes(2)
g.add_edge(1, 0)
x = th.tensor([[0.0, 0.0], [1.0, 2.0]])
g.nodes[:].data['x'] = x
###############################################################################
# ``ndata`` is syntactic sugar for accessing the feature data of all nodes
print(g.ndata['x'])
###############################################################################
# What we want to do is simply to copy the representation from node#1 to
# node#0, but through a message passing interface. We do this much like we
# would over a pair of sockets, with a send and a recv interface. The
# two user-defined functions (UDFs) specify the actions: deposit the
# value into an internal key-value store under the key msg, and retrieve
# it. Note that a node may have multiple incoming edges, and the
# receiving end aggregates them.
def send_source(edges): # type is dgl.EdgeBatch
return {'msg': edges.src['x']}
def simple_reduce(nodes): # type is dgl.NodeBatch
msgs = nodes.mailbox['msg']
return {'x' : th.sum(msgs, dim=1)}
g.send((1, 0), message_func=send_source)
g.recv(0, reduce_func=simple_reduce)
print(g.ndata)
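###############################################################################
# When a node has several incoming edges, all of their messages land in the
# node's mailbox and the reduce UDF aggregates them. Below is a minimal sketch
# on a separate toy graph (``g2`` is ours, for illustration only), so the
# two-node graph ``g`` above stays untouched:
g2 = dgl.DGLGraph()
g2.add_nodes(3)
g2.add_edges([1, 2], [0, 0])  # two edges pointing at node 0
g2.ndata['x'] = th.tensor([[0.0, 0.0], [1.0, 2.0], [3.0, 4.0]])
g2.send((1, 0), message_func=send_source)
g2.send((2, 0), message_func=send_source)
g2.recv(0, reduce_func=simple_reduce)  # node 0 should now hold [4.0, 6.0]
print(g2.ndata['x'])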
###############################################################################
# Sometimes the computation may involve representations on the edges.
# Let’s say we want to “amplify” the message:
w = th.tensor([2.0])
g.edata['w'] = w
def send_source_with_edge_weight(edges):
return {'msg': edges.src['x'] * edges.data['w']}
g.send((1, 0), message_func=send_source_with_edge_weight)
g.recv(0, reduce_func=simple_reduce)
print(g.ndata)
###############################################################################
# Or we may need to involve the destination’s representation, and here
# is one version:
def simple_reduce_addup(nodes):
msgs = nodes.mailbox['msg']
return {'x' : nodes.data['x'] + th.sum(msgs, dim=1)}
g.send((1, 0), message_func=send_source_with_edge_weight)
g.recv(0, reduce_func=simple_reduce_addup)
print(g.ndata)
del g.ndata['x']
del g.edata['w']
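###############################################################################
# The same copy-and-sum pattern can also be written with DGL's built-in
# message/reduce functions and a single ``update_all`` call, which triggers
# message passing on every edge at once. A minimal sketch on a fresh toy graph
# (``g3`` is ours, for illustration only):
import dgl.function as fn
g3 = dgl.DGLGraph()
g3.add_nodes(2)
g3.add_edge(1, 0)
g3.ndata['x'] = th.tensor([[0.0, 0.0], [1.0, 2.0]])
g3.update_all(fn.copy_src(src='x', out='msg'),
              fn.sum(msg='msg', out='x'),
              None)
print(g3.ndata['x'])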
...@@ -10,36 +10,87 @@ The ``DGLGraph`` is the very core data structure in our library. It provides the ...@@ -10,36 +10,87 @@ The ``DGLGraph`` is the very core data structure in our library. It provides the
interfaces to manipulate graph structure, set/get node/edge features and convert interfaces to manipulate graph structure, set/get node/edge features and convert
from/to many other graph formats. You can also perform computation on the graph from/to many other graph formats. You can also perform computation on the graph
using our message passing APIs (see :ref:`tutorial-mp`). using our message passing APIs (see :ref:`tutorial-mp`).
TODO: 1) explain `tensor`; 2) enable g.nodes/edges[:][key]; 3) networkx conversion in one place
""" """
############################################################################### ###############################################################################
# Construct a graph # Construct a graph
# ----------------- # -----------------
# #
# In ``DGLGraph``, all nodes are represented using consecutive integers starting from # The design of ``DGLGraph`` was influenced by other graph libraries. Indeed, you can
# zero. All edges are directed. Let us start by creating a star network of 10 nodes # create a graph from `networkx <https://networkx.github.io/>`__, and convert it into a ``DGLGraph``
# where all the edges point to the center node (node#0). # and vice versa:
# TODO(minjie): it's better to plot the graph here.
import networkx as nx
import dgl import dgl
g_nx = nx.petersen_graph()
g_dgl = dgl.DGLGraph(g_nx)
import matplotlib.pyplot as plt
plt.subplot(121)
nx.draw(g_nx, with_labels=True)
plt.subplot(122)
nx.draw(g_dgl.to_networkx(), with_labels=True)
plt.show()
###############################################################################
# They are the same graph, except that a ``DGLGraph`` is always `directed`.
#
# Creating a graph is a matter of specifying the total number of nodes and the edges among them.
# In ``DGLGraph``, all nodes are represented using consecutive integers starting from
# zero, and you can add more nodes repeatedly.
#
# .. note::
#
#    ``nx.add_node(100)`` adds a single node with id 100, whereas ``dgl.add_nodes(100)`` adds another 100 nodes to the graph.
g_dgl.clear()
g_nx.clear()
g_dgl.add_nodes(20)
print("We have %d nodes now" % g_dgl.number_of_nodes())
g_dgl.add_nodes(100)
print("Now we have %d nodes!" % g_dgl.number_of_nodes())
g_nx.add_node(100)
print("My nx buddy only has %d :( " % g_nx.number_of_nodes())
###############################################################################
# The most naive way to add edges is to add them one by one, as (*src, dst*) pairs.
# Let's generate a star graph where all the edges point to the center (node#0).
star = dgl.DGLGraph() star = dgl.DGLGraph()
star.add_nodes(10) # add 10 nodes star.add_nodes(10) # add 10 nodes
for i in range(1, 10): for i in range(1, 10):
star.add_edge(i, 0) star.add_edge(i, 0)
print('#Nodes:', star.number_of_nodes()) nx.draw(star.to_networkx(), with_labels=True)
print('#Edges:', star.number_of_edges())
###############################################################################
# It's more efficient to add many edges with a pair of lists, or better still, with a pair of tensors.
# TODO: needs to explain ``tensor``, since it's not a Python primitive data type.
# using lists
star.clear()
star.add_nodes(10)
src = [i for i in range(1, 10)]; dst = [0]*9
star.add_edges(src, dst)
# using tensor
star.clear()
star.add_nodes(10)
import torch as th
src = th.tensor(src); dst = th.tensor(dst)
star.add_edges(src, dst)
############################################################################### ###############################################################################
# ``DGLGraph`` also supports adding multiple edges at once by providing multiple # In addition to this, we also support
# source and destination nodes. Multiple nodes are represented using either a
# list or a 1D integer tensor(vector). In addition to this, we also support
# "edge broadcasting": # "edge broadcasting":
# #
# .. _note-edge-broadcast: # .. _note-edge-broadcast:
# #
# .. note:: # .. note::
# #
# Given two source and destination node list/tensor ``u`` and ``v``. # Given two source and destination node list/tensor ``u`` and ``v``.
# #
# - If ``len(u) == len(v)``, then this is a many-many edge set and # - If ``len(u) == len(v)``, then this is a many-many edge set and
...@@ -54,16 +105,13 @@ star.clear() # clear the previous graph ...@@ -54,16 +105,13 @@ star.clear() # clear the previous graph
star.add_nodes(10) star.add_nodes(10)
u = list(range(1, 10)) # can also use tensor type here (e.g. torch.Tensor) u = list(range(1, 10)) # can also use tensor type here (e.g. torch.Tensor)
star.add_edges(u, 0) # many-one edge set star.add_edges(u, 0) # many-one edge set
print('#Nodes:', star.number_of_nodes())
print('#Edges:', star.number_of_edges())
############################################################################### ###############################################################################
# In ``DGLGraph``, each edge is assigned an internal edge id (also a consecutive # In ``DGLGraph``, each edge is assigned an internal edge id (also a consecutive
# integer starting from zero). The ids follow the addition order of the edges # integer starting from zero). The ids follow the addition order of the edges
# and you can query the id using the ``edge_ids`` interface. # and you can query the id using the ``edge_ids`` interface, which returns a tensor.
print(star.edge_ids(1, 0)) # the first edge print(star.edge_ids(1, 0)) # query edge id of 1->0; it happens to be the first edge!
print(star.edge_ids([8, 9], 0)) # ask for ids of multiple edges print(star.edge_ids([8, 9], 0)) # ask for ids of multiple edges
...@@ -79,8 +127,8 @@ print(star.edge_ids([8, 9], 0)) # ask for ids of multiple edges ...@@ -79,8 +127,8 @@ print(star.edge_ids([8, 9], 0)) # ask for ids of multiple edges
# ---------------------- # ----------------------
# Nodes and edges can have feature data in tensor type. They can be accessed/updated # Nodes and edges can have feature data in tensor type. They can be accessed/updated
# through a key-value storage interface. The key must be hashable. The value should # through a key-value storage interface. The key must be hashable. The value should
# be features of each node and edge batched on the *first* dimension. For example, # be features of each node and edge, batched on the *first* dimension. For example,
# following codes create features for all nodes (``hv``) and features for all # the following codes create features for all nodes (``hv``) and features for all
# edges (``he``). Each feature is a vector of length 3. # edges (``he``). Each feature is a vector of length 3.
# #
# .. note:: # .. note::
...@@ -102,12 +150,20 @@ star.set_e_repr({'he' : efeat}) ...@@ -102,12 +150,20 @@ star.set_e_repr({'he' : efeat})
############################################################################### ###############################################################################
# .. note::
#    The first dimension of a node feature has length equal to the number of nodes,
#    whereas that of an edge feature equals the number of edges.
#
# We can then set some nodes' features to be zero. # We can then set some nodes' features to be zero.
# TODO(minjie): enable following syntax # TODO(minjie): enable following syntax
# print(star.nodes[:]['hv']) # print(star.nodes[:]['hv'])
print("node features:")
print(star.get_n_repr()['hv']) print(star.get_n_repr()['hv'])
print("\nedge features:")
print(star.get_e_repr()['he'])
# set node 0, 2, 4 feature to zero # set node 0, 2, 4 feature to zero
print("\nresetting features at node 0, 2 and 4...")
star.set_n_repr({'hv' : th.zeros((3, D))}, [0, 2, 4]) star.set_n_repr({'hv' : th.zeros((3, D))}, [0, 2, 4])
print(star.get_n_repr()['hv']) print(star.get_n_repr()['hv'])
...@@ -129,11 +185,12 @@ print(star.node_attr_schemes()) ...@@ -129,11 +185,12 @@ print(star.node_attr_schemes())
############################################################################### ###############################################################################
# If a new feature is added for some but not all of the nodes/edges, we will # If a new feature is added for some but not all of the nodes/edges, we will
# automatically create empty features for the others to make sure that features are # automatically create empty features for the others to make sure that features are
# always aligned. By default, we fill zero for the empty features. The behavior # always aligned. By default, we zero-fill the empty features. The behavior
# can be changed using ``set_n_initializer`` and ``set_e_initializer``. # can be changed using ``set_n_initializer`` and ``set_e_initializer``.
star.set_n_repr({'hv_1' : th.randn((3, D+1))}, [0, 2, 4]) star.set_n_repr({'hv_1' : th.randn((3, D+1))}, [0, 2, 4])
print(star.node_attr_schemes()) print(star.node_attr_schemes())
print(star.get_n_repr()['hv'])
print(star.get_n_repr()['hv_1']) print(star.get_n_repr()['hv_1'])
......
...@@ -10,6 +10,7 @@ The algorithm aims to provide a better alternative to current neural network str ...@@ -10,6 +10,7 @@ The algorithm aims to provide a better alternative to current neural network str
By using DGL library, users can implement the algorithm in a more intuitive way. By using DGL library, users can implement the algorithm in a more intuitive way.
""" """
############################################################################## ##############################################################################
# Model Overview # Model Overview
# --------------- # ---------------
...@@ -25,8 +26,9 @@ By using DGL library, users can implement the algorithm in a more intuitive way. ...@@ -25,8 +26,9 @@ By using DGL library, users can implement the algorithm in a more intuitive way.
# ``````````````````` # ```````````````````
# In papers, author states that "A capsule is a group of neurons whose activity vector # In papers, author states that "A capsule is a group of neurons whose activity vector
# represents the instantiation parameters of a specific type of entity such as an object # represents the instantiation parameters of a specific type of entity such as an object
# or an object part." # or an object part."
# Generally Speaking, the idea of capsule is to encode all the information about the #
# Generally speaking, the idea of capsule is to encode all the information about the
# features into a vector form, by substituting scalars in traditional neural network with vectors. # features into a vector form, by substituting scalars in traditional neural network with vectors.
# And use the norm of the vector to represents the meaning of original scalars. # And use the norm of the vector to represents the meaning of original scalars.
# #
...@@ -46,36 +48,24 @@ By using DGL library, users can implement the algorithm in a more intuitive way. ...@@ -46,36 +48,24 @@ By using DGL library, users can implement the algorithm in a more intuitive way.
# #
# Model Implementations # Model Implementations
# ------------------------- # -------------------------
# Setup
# ```````````````````````````
import dgl
import torch
import torch.nn.functional as F
from torch import nn
class DGLBatchCapsuleLayer(nn.Module): ##############################################################################
def __init__(self, input_capsule_dim, input_capsule_num, output_capsule_num, output_capsule_dim, num_routing, # Algorithm Overview
cuda_enabled): # ```````````````````````````
super(DGLBatchCapsuleLayer, self).__init__() #
self.device = "cuda" if cuda_enabled else "cpu" # .. image:: https://raw.githubusercontent.com/VoVAllen/DGL_Capsule/master/algorithm.png
self.input_capsule_dim = input_capsule_dim #
self.input_capsule_num = input_capsule_num # The main step of routing algorithm is line 4 - 7. In ``DGLGraph`` structure, we consider these steps as a message passing
self.output_capsule_dim = output_capsule_dim # procedure.
self.output_capsule_num = output_capsule_num
self.num_routing = num_routing
self.weight = nn.Parameter(
torch.randn(input_capsule_num, output_capsule_num, output_capsule_dim, input_capsule_dim))
self.g, self.input_nodes, self.output_nodes = self.construct_graph()
############################################################################## ##############################################################################
# Consider capsule routing as a graph structure # Consider capsule routing as a graph structure
# ```````````````````````````````````````````````````````````````````````````` # ````````````````````````````````````````````````````````````````````````````
# We can consider each capsule as a node in a graph, and connect all the nodes between layers. # We can consider each capsule as a node in a graph, and connect all the nodes between layers.
# #
# .. image:: https://raw.githubusercontent.com/dmlc/web-data/master/dgl/tutorials/capsule/capsule_f3.png # .. image:: https://raw.githubusercontent.com/dmlc/web-data/master/dgl/tutorials/capsule/capsule_f3.png
# :height: 200px # :height: 150px
# #
def construct_graph(self): def construct_graph(self):
g = dgl.DGLGraph() g = dgl.DGLGraph()
g.add_nodes(self.input_capsule_num + self.output_capsule_num) g.add_nodes(self.input_capsule_num + self.output_capsule_num)
...@@ -88,116 +78,165 @@ def construct_graph(self): ...@@ -88,116 +78,165 @@ def construct_graph(self):
v.append(j) v.append(j)
g.add_edges(u, v) g.add_edges(u, v)
return g, input_nodes, output_nodes return g, input_nodes, output_nodes
DGLBatchCapsuleLayer.construct_graph = construct_graph # This line is for defining class in multiple cells.
############################################################################## ##############################################################################
# Initialization & Affine Transformation # Write Message Passing Functions
# ``````````````````````````````````
# Reduce Functions (line 4 - 5)
# .............................................
#
# .. image:: https://raw.githubusercontent.com/dmlc/web-data/master/dgl/tutorials/capsule/capsule_f5.png
#
# At this stage, we need to define a reduce function that aggregates the node features
# from layer :math:`l` and computes their weighted sum to form layer :math:`(l+1)`'s node features.
#
# .. note::
# The softmax operation is over dimension :math:`j` instead of :math:`i`.
def capsule_reduce(node, msg):
b_ij_c, u_hat = msg['b_ij'], msg['u_hat']
# line 4
c_i = F.softmax(b_ij_c, dim=0)
# line 5
s_j = (c_i.unsqueeze(2).unsqueeze(3) * u_hat).sum(dim=1)
return {'h': s_j}
##############################################################################
# Node Update Functions (line 6)
# ......................................................
# Squash the intermediate representations into node features :math:`v_j`
#
# .. image:: https://raw.githubusercontent.com/dmlc/web-data/master/dgl/tutorials/capsule/step6.png
#
def capsule_update(msg):
v_j = squash(msg['h'])
return {'h': v_j}
##############################################################################
# Edge Update Functions (line 7)
# ...........................................................................
# Update the routing parameters by updating the edges of the graph
#
# .. image:: https://raw.githubusercontent.com/dmlc/web-data/master/dgl/tutorials/capsule/step7.png
#
def update_edge(u, v, edge):
return {'b_ij': edge['b_ij'] + (v['h'] * edge['u_hat']).mean(dim=1).sum(dim=1)}
##############################################################################
# Call DGL functions to execute the algorithm
# ````````````````````````````````````````````````````````````````````````````
# Call the ``update_all`` and ``update_edge`` functions to execute the whole algorithm.
# The message function defines which attributes are needed in the subsequent computation.
#
def routing(self):
def capsule_msg(src, edge):
return {'b_ij': edge['b_ij'], 'h': src['h'], 'u_hat': edge['u_hat']}
self.g.update_all(capsule_msg, capsule_reduce, capsule_update)
self.g.update_edge(edge_func=update_edge)
##############################################################################
# Forward Function
# ```````````````````````````````````````````````````````````````````````````` # ````````````````````````````````````````````````````````````````````````````
# This section shows the whole forward pass of the capsule routing algorithm.
def forward(self, x):
self.batch_size = x.size(0)
u_hat = self.compute_uhat(x)
self.initialize_nodes_and_edges_features(u_hat)
for i in range(self.num_routing):
self.routing()
this_layer_nodes_feature = self.g.get_n_repr()['h'][
self.input_capsule_num:self.input_capsule_num + self.output_capsule_num]
return this_layer_nodes_feature.transpose(0, 1).unsqueeze(1).unsqueeze(4).squeeze(1)
##############################################################################
# Other Workaround
# ````````````````````````````````````````````````````````````````
# Initialization & Affine Transformation
# ..................................................
# This section implements the transformation operation in capsule networks,
# which transforms capsules into different dimensions.
# - Pre-compute :math:`\hat{u}_{j|i}`, initialize :math:`b_{ij}` and store them as edge attribute # - Pre-compute :math:`\hat{u}_{j|i}`, initialize :math:`b_{ij}` and store them as edge attribute
# - Initialize node features as zero # - Initialize node features as zero
# #
# .. image:: https://raw.githubusercontent.com/dmlc/web-data/master/dgl/tutorials/capsule/capsule_f4.png # .. image:: https://raw.githubusercontent.com/dmlc/web-data/master/dgl/tutorials/capsule/capsule_f4.png
# #
def forward(self, x):
self.batch_size = x.size(0) def compute_uhat(self, x):
# x is the input vector with shape [batch_size, input_capsule_dim, input_num] # x is the input vector with shape [batch_size, input_capsule_dim, input_num]
# Transpose x to [batch_size, input_num, input_capsule_dim] # Transpose x to [batch_size, input_num, input_capsule_dim]
x = x.transpose(1, 2) x = x.transpose(1, 2)
# Expand x to [batch_size, input_num, output_num, input_capsule_dim, 1] # Expand x to [batch_size, input_num, output_num, input_capsule_dim, 1]
x = torch.stack([x] * self.output_capsule_num, dim=2).unsqueeze(4) x = torch.stack([x] * self.output_capsule_num, dim=2).unsqueeze(4)
# Expand W from [input_num, output_num, input_capsule_dim, output_capsule_dim] # Expand W from [input_num, output_num, input_capsule_dim, output_capsule_dim]
# to [batch_size, input_num, output_num, output_capsule_dim, input_capsule_dim] # to [batch_size, input_num, output_num, output_capsule_dim, input_capsule_dim]
W = self.weight.expand(self.batch_size, *self.weight.size()) W = self.weight.expand(self.batch_size, *self.weight.size())
# u_hat's shape is [input_num, output_num, batch_size, output_capsule_dim] # u_hat's shape is [input_num, output_num, batch_size, output_capsule_dim]
u_hat = torch.matmul(W, x).permute(1, 2, 0, 3, 4).squeeze().contiguous() u_hat = torch.matmul(W, x).permute(1, 2, 0, 3, 4).squeeze().contiguous()
return u_hat
b_ij = torch.zeros(self.input_capsule_num, self.output_capsule_num).to(self.device)
def initialize_nodes_and_edges_features(self, u_hat):
b_ij = torch.zeros(self.input_capsule_num, self.output_capsule_num).to(self.device)
self.g.set_e_repr({'b_ij': b_ij.view(-1)}) self.g.set_e_repr({'b_ij': b_ij.view(-1)})
self.g.set_e_repr({'u_hat': u_hat.view(-1, self.batch_size, self.output_capsule_dim)}) self.g.set_e_repr({'u_hat': u_hat.view(-1, self.batch_size, self.output_capsule_dim)})
self.routing()
# Initialize all node features as zero # Initialize all node features as zero
node_features = torch.zeros(self.input_capsule_num + self.output_capsule_num, self.batch_size, node_features = torch.zeros(self.input_capsule_num + self.output_capsule_num, self.batch_size,
self.output_capsule_dim).to(self.device) self.output_capsule_dim).to(self.device)
self.g.set_n_repr({'h': node_features}) self.g.set_n_repr({'h': node_features})
DGLBatchCapsuleLayer.forward = forward
############################################################################## ##############################################################################
# Write Message Passing functions and Squash function
# ````````````````````````````````````````````````````````````````````````````
# Squash function # Squash function
# .................. # ..................
# Squashing function is to ensure that short vectors get shrunk to almost zero length and # Squashing function is to ensure that short vectors get shrunk to almost zero length and
# long vectors get shrunk to a length slightly below 1. # long vectors get shrunk to a length slightly below 1. Its norm is expected to represent probabilities
# # at some levels.
#
# .. image:: https://raw.githubusercontent.com/dmlc/web-data/master/dgl/tutorials/capsule/squash.png # .. image:: https://raw.githubusercontent.com/dmlc/web-data/master/dgl/tutorials/capsule/squash.png
# :height: 100px # :height: 100px
# #
def squash(s): def squash(s, dim=2):
mag_sq = torch.sum(s ** 2, dim=2, keepdim=True) sq = torch.sum(s ** 2, dim=dim, keepdim=True)
mag = torch.sqrt(mag_sq) s_std = torch.sqrt(sq)
s = (mag_sq / (1.0 + mag_sq)) * (s / mag) s = (sq / (1.0 + sq)) * (s / s_std)
return s return s
############################################################################## ##############################################################################
# Message Functions # General Setup
# .................. # .................
# At first stage, we need to define a message function to get all the attributes we need
# in the further computations.
def capsule_msg(src, edge):
return {'b_ij': edge['b_ij'], 'h': src['h'], 'u_hat': edge['u_hat']}
############################################################################## import dgl
# Reduce Functions import torch
# .................. import torch.nn.functional as F
# At this stage, we need to define a reduce function to aggregate all the information we from torch import nn
# get from message function into node features.
# This step implements the line 4 and line 5 in routing algorithms, which softmax over
# :math:`b_{ij}` and calculate weighted sum of input features.
#
# .. note::
# The softmax operation is over dimension :math:`j` instead of :math:`i`.
#
# .. image:: https://raw.githubusercontent.com/dmlc/web-data/master/dgl/tutorials/capsule/capsule_f5.png
#
def capsule_reduce(node, msg):
b_ij_c, u_hat = msg['b_ij'], msg['u_hat']
# line 4
c_i = F.softmax(b_ij_c, dim=0)
# line 5
s_j = (c_i.unsqueeze(2).unsqueeze(3) * u_hat).sum(dim=1)
return {'h': s_j}
##############################################################################
# Node Update Functions
# ...........................
# Squash the intermidiate representations into node features :math:`v_j`
#
# .. image:: https://raw.githubusercontent.com/dmlc/web-data/master/dgl/tutorials/capsule/step6.png
#
def capsule_update(msg):
v_j = squash(msg['h'])
return {'h': v_j}
############################################################################## class DGLDigitCapsuleLayer(nn.Module):
# Edge Update Functions def __init__(self, input_capsule_dim=8, input_capsule_num=1152, output_capsule_num=10, output_capsule_dim=16,
# .......................... num_routing=3, device='cpu'):
# Update the routing parameters super(DGLDigitCapsuleLayer, self).__init__()
# self.device = device
# .. image:: https://raw.githubusercontent.com/dmlc/web-data/master/dgl/tutorials/capsule/step7.png self.input_capsule_dim = input_capsule_dim
# self.input_capsule_num = input_capsule_num
def update_edge(u, v, edge): self.output_capsule_dim = output_capsule_dim
return {'b_ij': edge['b_ij'] + (v['h'] * edge['u_hat']).mean(dim=1).sum(dim=1)} self.output_capsule_num = output_capsule_num
self.num_routing = num_routing
self.weight = nn.Parameter(
torch.randn(input_capsule_num, output_capsule_num, output_capsule_dim, input_capsule_dim))
self.g, self.input_nodes, self.output_nodes = self.construct_graph()
##############################################################################
# Executing algorithm # This section is for defining class in multiple cells.
# ..................... DGLDigitCapsuleLayer.construct_graph = construct_graph
# Call `update_all` and `update_edge` functions to execute the algorithms DGLDigitCapsuleLayer.forward = forward
def routing(self): DGLDigitCapsuleLayer.routing = routing
for i in range(self.num_routing): DGLDigitCapsuleLayer.compute_uhat = compute_uhat
self.g.update_all(capsule_msg, capsule_reduce, capsule_update) DGLDigitCapsuleLayer.initialize_nodes_and_edges_features = initialize_nodes_and_edges_features
self.g.update_edge(edge_func=update_edge)
DGLBatchCapsuleLayer.routing = routing
"""
Graph Convolutional Network New
====================================
**Author**: `Qi Huang`
This is a brief introduction to DGL and its message passing API through GCN (graph convolutional network).
"""
##############################################################################
# Message Passing: Warming up
# ---------------------------
#
# Let's begin with the simplest graph possible with two nodes, and set the node representations:
import torch as th
import dgl
g = dgl.DGLGraph()
g.add_nodes(2)
g.add_edge(1, 0)
x = th.tensor([[0.0, 0.0], [1.0, 2.0]])
g.set_n_repr({'x': x})
##############################################################################
# What we want to do is simply to copy the representation from node#1 to node#0, but through
# a message passing interface. We do this much like we would over a pair of sockets,
# with a ``send`` and a ``recv`` interface.
# The two `user-defined functions (UDFs)` specify the actions: deposit the value into an internal
# key-value store under the key `msg`, and retrieve it. Note that a node may have multiple incoming edges,
# and the receiving end aggregates them.
#
# .. note::
# * ``send(src, dst)`` defines an edge explicitly, so ``message_func`` taking ``edge`` as an
# argument is confusing.
# * following graph construction semantics, it'll be nice to allow ``src`` and ``dst`` as a pair
#   of lists, or a pair of tensors, though this example doesn't demonstrate it.
# * likewise, since we allow edge broadcasting, we should allow it in ``send`` as well.
# * what's the side-effect of doing a send action? we are left with the impression that the second argument
# in the ``reduce_func`` (i.e. ``msgs``) magically gets the stuff with the same key.
# * my preference is to say that expected side-effect is simply that the result of a ``send`` action is available
# at ``dst['key']``, where ``key`` is whatever the user specified in ``message_func``. this allows
# for cases where we use ``apply_node_func``.
# * in other words,
# ``message_func`` returns ``{'hey': [1.0]}``, we expect to see ``dst['hey']``. if that happens
# to be the representation key, then a replacement is done. user can define a new key, e.g. ``accum``,
# then the ``reduce_func`` and ``apply_node_func`` can do whatever they want. typically,
# they should return with the representation key to perform update.
#
def send_source(src, edge):
return {'msg': src['x']}
def simple_reduce(node, msgs):
return {'x' : th.sum(msgs['msg'], dim=1)}
g.send(1, 0, message_func=send_source)
g.recv([0], reduce_func=simple_reduce)
print(g.get_n_repr())
##############################################################################
# Sometimes the computation may involve representations on the edges. Let's say we want to "amplify"
# the message:
w = th.tensor([2.0])
g.set_e_repr({'w': w})
def send_source_with_edge_weight(src, edge):
return {'msg': src['x'] * edge['w']}
g.send(1, 0, message_func=send_source_with_edge_weight)
g.recv([0], reduce_func=simple_reduce)
print(g.get_n_repr())
##############################################################################
# Or we may need to involve the destination's representation, and here is one version:
def simple_reduce_addup(node, msgs):
return {'x' : node['x'] + th.sum(msgs['msg'], dim=1)}
g.send(1, 0, message_func=send_source_with_edge_weight)
g.recv([0], reduce_func=simple_reduce_addup)
print(g.get_n_repr())
##############################################################################
# A slightly more complex but more flexible approach is to store the reduced sum at the node under
# a different key, and then call the ``apply_node_func``:
#
# .. note::
#    That the result magically appears in the node's key-value store is non-intuitive.
def simple_reduce_to_accum(node, msgs):
return {'accum' : th.sum(msgs['msg'], dim=1)}
def simple_apply(node):
return {'x': node['x'] + node['accum']}
g.send(1, 0, message_func=send_source_with_edge_weight)
g.recv([0], reduce_func=simple_reduce_to_accum, apply_node_func=simple_apply)
print(g.get_n_repr())
##############################################################################
# ``send`` and ``recv`` are the **level-1** calls in DGL; they give the finest control over routing
# the messages.
#
# TODO: build a star graph (reuse the one in 2_graph.py), and use pull (or push)
#
# TODO: build a much bigger graph, explain with spMV and the use of ``update_all``
#
##############################################################################
# Model Overview
# ---------------
# Introduction
# ```````````````````
# This is a simple implementation of Kipf & Welling's Semi-Supervised Classification with Graph Convolutional Networks (ICLR 2017), which proposes a simple yet efficient model that extends convolutional neural networks from the grid-structured data we are all familiar with to graphs, such as social networks and knowledge graphs. It starts from the framework of spectral graph convolutions and makes reasonable simplifications to achieve both faster training and higher prediction accuracy. It also achieves state-of-the-art classification results on a number of graph datasets such as CORA. /TODO: elaborate.
# Note that this is not intended to be an end-to-end lecture on Kipf & Welling's GCN paper. In this tutorial, we aim to provide a friendly entry point that shows how to code up a contemporary NN model operating on graph-structured data, and to deepen the user's understanding of DGL's message passing API in action. For a more thorough understanding of the derivation and all the details of GCN, please read the original paper. /TODO(hq): add link.
#
# GCN in one formula
# `````````````````````
# Essentially, the GCN model boils down to the following formula:
# :math:`H^{(l+1)} = \sigma(\tilde{D}^{-\frac{1}{2}}\tilde{A}\tilde{D}^{-\frac{1}{2}}H^{(l)}W^{(l)})`
#
# The equation above describes a "graph convolution layer" in GCN.
# Essentially, :math:`H^{(l)}` denotes the node representations at the l-th layer of the network, :math:`\sigma` is the non-linearity, and :math:`W` is the weight matrix for this layer. :math:`D` and :math:`A`, as commonly seen, represent the degree matrix and the adjacency matrix, respectively. The tilde denotes a renormalization trick in which we add a self-connection to each node of the graph (:math:`\tilde{A} = A + I_N`) and build the corresponding degree and adjacency matrices.
#
# The shape of the input :math:`H^{(0)}` is :math:`N \times D`, where :math:`N` is the number of nodes and :math:`D` is the number of input features. We can chain up multiple layers as such to produce a node-level representation output with shape :math:`N \times F`, where :math:`F` is the dimension of the output node feature vector.
#
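# To make the formula concrete, here is a small dense-tensor sketch of one
# graph convolution layer (illustrative only, plain PyTorch, not the DGL API;
# the variable names below are ours):
import torch as th
A = th.tensor([[0., 1., 0.],
               [1., 0., 1.],
               [0., 1., 0.]])                        # adjacency matrix of a 3-node path graph
A_tilde = A + th.eye(3)                              # renormalization trick: add self-connections
D_inv_sqrt = th.diag(A_tilde.sum(dim=1).pow(-0.5))   # \tilde{D}^{-1/2}
H = th.randn(3, 4)                                   # H^{(0)}: N x D input features
W = th.randn(4, 2)                                   # W^{(0)}: D x F layer weights
H_next = th.relu(D_inv_sqrt @ A_tilde @ D_inv_sqrt @ H @ W)   # N x F output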
# Derivation of GCN
# ``````````````````
# \TODO(hq) do we need a short description of how we depart from spectral-based methods and end up with GCN?
# According to others, this amounts to Laplacian smoothing.
#
# Understanding GCN from Message Passing
# ````````````````````````````````````````
# Think of :math:`W^{(l)}` simply as a matrix of
# filter parameters that projects :math:`H^{(l)}`, and of
# :math:`\tilde{D}^{-\frac{1}{2}}\tilde{A}\tilde{D}^{-\frac{1}{2}}` as a symmetric normalization of the
# adjacency matrix.
#
# Combining these two, we arrive at a more succinct form of GCN:
# :math:`\sigma(\hat{A}\hat{H}^{(l)})`
# where :math:`\hat{A}` denotes the normalized version of the
# adjacency matrix, and :math:`\hat{H}` denotes the
# projection of the last layer's node-level representation :math:`H`.
#
# We can further view multiplication with the adjacency matrix as message passing between nodes along the paths encoded in the adjacency matrix.
# To keep it simple, let's denote the input signal on a graph :math:`G = (V, E)` as :math:`x \in \mathbb{R}^{|\mathcal{V}| \times 1}`, i.e. each node's feature is a single scalar.
# Then, calculating :math:`x_{t+1} = Ax_{t}` amounts to one round of message passing along the existing edges:
# the i-th node's new feature :math:`x_{t+1}^{i}` sums up the entries of the old feature vector :math:`x_{t}` whose
# node indices have a non-zero entry in the i-th row of the adjacency matrix :math:`A`, i.e. the nodes that have an
# edge to node i. If we multiply the resulting vector with :math:`A` again, the result, :math:`A^{2}x_{t}`, is the
# feature vector after two rounds of message passing. In this sense, :math:`A^2` encodes the 2-hop neighborhood of
# each node. By k-hop neighborhood, we mean any node reachable in exactly k steps from the current node (if
# self-connections are not included in the original adjacency matrix), or any node reachable within k steps if
# self-connections are included. Equivalently, for a binary adjacency matrix,
# :math:`A^2_{ij} = \bigvee_k \left(A_{ik} \wedge A_{kj}\right)`.
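# A tiny numerical illustration of the claim above (toy numbers, plain tensors,
# not the DGL API):
import torch as th
A = th.tensor([[0., 1., 1.],
               [1., 0., 0.],
               [1., 0., 0.]])        # node 0 is connected to nodes 1 and 2
x_t = th.tensor([[1.], [2.], [3.]])  # one scalar feature per node
x_t1 = A @ x_t                       # one round of message passing: node 0 receives 2 + 3
x_t2 = A @ x_t1                      # two rounds: 2-hop neighborhood information
print(x_t1.squeeze(), x_t2.squeeze())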
#
# Nonetheless, in GCN we only use :math:`\sigma(\hat{A}\hat{H}^{(l)})` in each layer, meaning we only propagate information among each node's 1-hop neighborhood for each layer.
#
#
# Model Implementation
# ------------------------
# Warming up of message passing API
# ````````````````````````````````````
# DGL provides 3 levels of message passing APIs, giving users different levels of control. Below we demonstrate the three levels on a simple star graph of size 10, where nodes 1-9 all send information to node 0.
#
# Level 1 -- send, recv, and apply_node
# ..........................................
# The most basic level is ``send(src,dst,message_function)``, ``recv(node,reduce_function)``, and ``apply_nodes(nodes)``.
# ``send()`` and ``recv()`` allow users to designate specific (source, destination) pairs for passing information. ``apply_nodes()`` allows users to perform per-node computation.
#
# Three functions need to be pre-specified when using the message passing API: 1) the message function, 2) the reduce function, and 3) the apply function. The message function determines what message is passed along the edges; the reduce function determines how messages are aggregated at the destination node; the apply function determines the per-node computation performed afterwards. Note that all three functions can either be defined by the user or taken from the built-in functions available in ``dgl.function``. For a more detailed description of the built-in function syntax, please see \TODO(hq) add hyperref.
#
# Users don't have to pass message_function and reduce_function as parameters every time if they have registered them on the graph beforehand, as shown in the following code.
import argparse
import time
import torch as th
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import dgl
import networkx as nx
from dgl import DGLGraph
from dgl.data import register_data_args, load_data
star = dgl.DGLGraph()
star.add_nodes(10)
u = list(range(1,10))
star.add_edges(u,0) # create the graph
D = 1 # the feature dimension
N = star.number_of_nodes()
M = star.number_of_edges()
nfeat = th.ones((N, D)) # each node's feature is just 1
efeat = th.ones((M, D))*2 # each edge's feature is 2.
star.set_n_repr({'hv' : nfeat})
star.set_e_repr({'he' : efeat})
u = th.tensor([0])
v = th.tensor([1,2,3,4,5]) # send nodes 1-5's features to node 0
def _message_test(src,edge):
return {'hv':src['hv']}
def _reduce(node,msgs):
return{'hv':node['hv']+msgs['hv'].sum(1)}
# aggregate along the second dimension, as
# the first dimension is reserved for batching in DGL.
star.register_message_func(_message_test)
star.register_reduce_func(_reduce)
star.send(v,u)
# DGL supports batching send/recv and broadcasting.
star.recv(u)
#We expect to get 6 on node 0.
print(star.get_n_repr()['hv'])
##########################################################################
# Level 2 -- pull, push, and send_and_recv
# ............................................
# It can be both tedious and inefficient for users to call ``send()`` and ``recv()`` separately. DGL comes to the rescue with a series of higher-level APIs, which also increase performance through operator fusion in the backend ``/TODO(gaiyu) verify this statement please``.
# ``send_and_recv(src,dst,message_func,reduce_func,apply_func)`` is essentially a wrapper around send and receive.
# ``pull(node,message_func,reduce_func,apply_func)`` takes the input nodes as destination nodes, all their predecessor nodes as source nodes, and performs ``send_and_recv()``.
# ``push(node,message_func,reduce_func,apply_func)`` takes the input nodes as source nodes, all their successor nodes as destination nodes, and performs ``send_and_recv()``.
#
# Notice that the apply function is usually optional in the message passing APIs.
star.set_n_repr({'hv' : nfeat}) #reset node repr
star.set_e_repr({'he' : efeat}) #reset edge repr
star.send_and_recv(v, u)  # note that the apply function is left out here
print(star.get_n_repr()['hv']) # we expect to get 6 on node 0
#####################################################################
#
# Then we register the apply function.
#
def _apply_test(node):
return {'hv':500*node['hv']}
star.register_apply_node_func(_apply_test)
star.apply_nodes(u)
print(star.get_n_repr()['hv']) #we expect to get 3000 on node 0
#########################################################################
star.set_n_repr({'hv' : nfeat}) #reset node repr
star.set_e_repr({'he' : efeat}) #reset edge repr
star.pull(u)
print(star.get_n_repr()['hv']) # we expect to get 5000 on node 0: pull gathers from all 9 predecessors (1 + 9 = 10), then the registered apply function multiplies by 500
###################################################################
star.set_n_repr({'hv' : nfeat}) #reset node repr
star.set_e_repr({'he' : efeat}) #reset edge repr
star.push(v)
print(star.get_n_repr()['hv']) # we expect to get 3000 on node 0
#######################################################################
# Level 3 -- update_all
# ..........................
# In many cases, users would like to perform message passing on all the edges simultaneously, as in the adjacency matrix multiplication view of GCN above. DGL provides the ``update_all()`` method for this, and also optimizes performance under the hood.
star.set_n_repr({'hv' : nfeat}) #reset node repr
star.set_e_repr({'he' : efeat}) #reset edge repr
star.update_all(apply_node_func = None)
print(star.get_n_repr()['hv']) # we expect to get 10 on node 0, as we choose not to perform any apply_node functions
#
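##########################################################################
# The same all-edges update can also be written with the built-in functions
# from ``dgl.function``. The snippet below is a minimal sketch that assumes
# ``copy_src`` (copy the source feature into the message) and ``sum``
# (sum up the mailbox) behave as their names suggest; refer to the built-in
# function documentation for the authoritative syntax. Note that, unlike our
# registered reduce function, the built-in ``sum`` does not add the node's
# own feature, so node 0 ends up with 9 instead of 10.
import dgl.function as fn
star.set_n_repr({'hv' : nfeat}) #reset node repr
star.set_e_repr({'he' : efeat}) #reset edge repr
star.update_all(fn.copy_src(src='hv', out='m'),
                fn.sum(msg='m', out='hv'),
                apply_node_func=None)
print(star.get_n_repr()['hv']) # we expect to get 9 on node 0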
##########################################################
# Model Implementation
# ``````````````````````````````
# Model definition
# ....................
# Similar to above, we first define the message function, reduce function and apply function for GCN.
def gcn_msg(src, edge):
return {'m' : src['h']} #return node feature
def gcn_reduce(node, msgs):
return {'h' : th.sum(msgs['m'], 1)} # aggregate incoming node features
class NodeApplyModule(nn.Module):
def __init__(self, in_feats, out_feats, activation=None):
super(NodeApplyModule, self).__init__()
self.linear = nn.Linear(in_feats, out_feats)
self.activation = activation #apply a filter and non-linearity.
def forward(self, node):
h = self.linear(node['h'])
if self.activation:
h = self.activation(h)
return {'h' : h}
class GCN(nn.Module):
def __init__(self,
g,
in_feats,
n_hidden,
n_classes,
n_layers,
activation,
dropout,
mode=1):
super(GCN, self).__init__()
self.g = g #graph is passed as a parameter to the model
self.dropout = dropout
# input layer
self.layers = nn.ModuleList([NodeApplyModule(in_feats, n_hidden, activation)])
# hidden layers
for i in range(n_layers - 1):
self.layers.append(NodeApplyModule(n_hidden, n_hidden, activation))
# output layer
self.layers.append(NodeApplyModule(n_hidden, n_classes))
self.mode = mode # indicate DGL message passing level for subsequent use
# Message passing in 3 levels --- level 1
    def lv1_mp(self, layer):
        # send along every existing edge, then let the nodes reduce their mailboxes
        u, v = self.g.edges()
        self.g.send(u, v, gcn_msg)
        node_ids = list(range(self.g.number_of_nodes()))
        self.g.recv(node_ids, gcn_reduce, layer)
# Message passing in 3 levels --- level 2
def lv2_mp(self, layer):
        dst = list(range(self.g.number_of_nodes()))
self.g.pull(dst, gcn_msg, gcn_reduce, layer)
# Message passing in 3 levels -- level 3
def lv3_mp(self, layer):
self.g.update_all(gcn_msg, gcn_reduce, layer)
# Below is the forward function
def forward(self, features):
self.g.set_n_repr({'h' : features})
for layer in self.layers:
# apply dropout
            if self.dropout:
                self.g.apply_nodes(apply_node_func=
                    lambda node: {'h': F.dropout(node['h'], p=self.dropout)})
assert self.mode in [1,2,3]
if self.mode == 1 :
self.lv1_mp(layer)
elif self.mode == 2 :
self.lv2_mp(layer)
else :
self.lv3_mp(layer)
return self.g.pop_n_repr('h')
######################################################################
# Training & Inference
# ``````````````````````````````````
# Below we train the model and perform inference.
from dgl.data import citation_graph as citegrh
data = citegrh.load_cora()
features = th.FloatTensor(data.features)
labels = th.LongTensor(data.labels)
mask = th.ByteTensor(data.train_mask)
in_feats = features.shape[1]
n_classes = data.num_labels
n_edges = data.graph.number_of_edges()
# Some training hyperparameters for illustration
cuda = th.cuda.is_available()
if cuda:
    features = features.cuda()
    labels = labels.cuda()
    mask = mask.cuda()
n_hidden = 16
n_layers = 1
dropout = 0
n_epochs = 200
lr = 1e-3
g = DGLGraph(data.graph)
model = GCN(g,
in_feats,
n_hidden,
n_classes,
n_layers,
F.relu,
dropout,
mode = 3) #level 3 message passing
model2 = GCN(g,
in_feats,
n_hidden,
n_classes,
n_layers,
F.relu,
dropout,
            mode = 3) # level 3 message passing; switch mode to 1 or 2 to try the other API levels
if cuda:
    model.cuda()
    model2.cuda()
# use optimizer
optimizer = th.optim.Adam(model2.parameters(), lr=lr)
# training loop
dur = []
for epoch in range(n_epochs):
if epoch >=3:
t0 = time.time()
#forward
logits = model2(features)
logp = F.log_softmax(logits, 1)
loss = F.nll_loss(logp[mask], labels[mask])
optimizer.zero_grad()
loss.backward()
optimizer.step()
if epoch >= 3:
dur.append(time.time() - t0)
print("Epoch {:05d} | Loss {:.4f} | Time(s) {:.4f} | ETputs(KTEPS) {:.2f}".format(
epoch, loss.item(), np.mean(dur), n_edges / np.mean(dur) /1000))
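######################################################################
# After training, we can run inference and check accuracy on the held-out
# nodes. This is a minimal sketch that assumes the Cora loader also exposes
# a ``test_mask`` attribute, as the other DGL examples do; adjust accordingly
# if your dataset object differs.
test_mask = th.ByteTensor(data.test_mask)
if cuda:
    test_mask = test_mask.cuda()
with th.no_grad():
    logits = model2(features)
    _, pred = th.max(logits, dim=1)
    correct = (pred[test_mask] == labels[test_mask]).long().sum().item()
    total = test_mask.long().sum().item()
print("Test accuracy {:.4f}".format(correct / total))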
"""
Graph Convolutional Network New
====================================
**Author**: `Quan Gan`
In this tutorial, we will go through the basics of DGL, in the following order:
1. Creating a graph
2. Setting/getting node/edge states
3. Updating node/edge states using user-defined functions
4. Passing information to edges from endpoint nodes
5. Passing information to nodes from adjacent nodes and edges
6. Implementing a Graph Convolutional Network (GCN) and a Graph Attention
Network (GAT)
7. Using built-in functions to simplify your implementation
"""
##############################################################################
# Section 1. Creating a Graph
# ---------------------------
#
# Let's say we want to create the following graph:
#
# .. digraph:: foo
#
# digraph foo {
# layout=circo;
# "A" -> "B" -> "C" -> "A";
# }
#
# First, we need to create a ``DGLGraph`` object.
from dgl import DGLGraph
g = DGLGraph()
##############################################################################
# And then we add 3 vertices (or *nodes*) into ``g``:
g.add_nodes(3)
##############################################################################
# In DGL, all vertices are uniquely identified by integers, starting from 0.
# Assuming that we map the node ``A``, ``B``, and ``C`` to ID 0, 1, and 2, we
# can add the edges of the desired graph above as follows:
g.add_edge(0, 1)
g.add_edge(1, 2)
g.add_edge(2, 0)
# Or, equivalently
# g.add_edges([0, 1, 2], [1, 2, 0])
##############################################################################
# All the edges are also uniquely identified by integers, again starting from
# 0. The edges are labeled in the order of addition. In the example above,
# the edge ``0 -> 1`` is labeled as edge #0, ``1 -> 2`` as edge #1, and
# ``2 -> 0`` as edge #2.
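# We can quickly verify this numbering; the small sketch below assumes the
# graph exposes an ``edge_ids`` query that maps (source, destination) pairs
# to edge IDs, as used in DGL's own test suite.
print(g.edge_ids([0, 1, 2], [1, 2, 0]))  # should correspond to edge IDs 0, 1, 2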
##############################################################################
# Section 2. Setting/getting node/edge states
# --------------------------------------------
# Now, we wish to assign the nodes some states, or features.
#
# In DGL, the node/edge states are represented as dictionaries, with strings
# as keys (or *fields*), and tensors as values. DGL aims to be
# framework-agnostic, and currently it supports PyTorch and MXNet. From now
# on, we use PyTorch as an example.
#
# You can set up states for some or all nodes at the same time in DGL.
# All you need to do is stack the tensors along the first dimension for each
# key, and feed the dictionary of stacked tensors into ``set_n_repr``
# as a whole.
import torch
# We are going to assign each node two states X and Y. For each node,
# X is a 2-D vector and Y is a 2x4 matrix. You only need to make sure that
# tensors with the same key have the same shape and data type across all
# the nodes being set.
X = torch.randn(3, 2)
Y = torch.randn(3, 2, 4)
# You can set the states for all of them...
g.set_n_repr({'X': X, 'Y': Y})
# ... or set partial states, but only after you have set all nodes on
# at least one key.
# TODO: do we want to fix this behavior to allow initial partial setting?
g.set_n_repr({'X': X[0:2], 'Y': Y[0:2]}, [0, 1])
# You can also overwrite part of the fields. The following overwrites field
# X while keeping Y intact.
X = torch.randn(3, 2)
g.set_n_repr({'X': X})
##############################################################################
# You can also efficiently get the node states as a dictionary of tensors.
# The dictionary will also have strings as keys and stacked tensors as values.
# Getting all node states. The tensors will be stacked along the first
# dimension, in the same order as node ID.
n_repr = g.get_n_repr()
X_ = n_repr['X']
Y_ = n_repr['Y']
assert torch.allclose(X_, X)
assert torch.allclose(Y_, Y)
# You can also get the states from a subset of nodes. The tensors will be
# stacked along the first dimension, in the same order as what you feed in.
n_repr_subset = g.get_n_repr([0, 2])
X_ = n_repr_subset['X']
Y_ = n_repr_subset['Y']
assert torch.allclose(X_, X[[0, 2]])
assert torch.allclose(Y_, Y[[0, 2]])
##############################################################################
# Setting/getting edge states is very similar. We provide two ways of reading
# and writing edge states: by source-destination pairs, and by edge ID.
# We are going to assign each edge a state A and a state B, both of which are
# 3-D vectors for each edge.
A = torch.randn(3, 3)
B = torch.randn(3, 3)
# You can either set the states of all edges...
g.set_e_repr({'A': A, 'B': B})
# ... or by source-destination pair (in this case, assigning A[0] to (0 -> 1)
# and A[2] to (2 -> 0) ...
g.set_e_repr({'A': A[[0, 2]], 'B': B[[0, 2]]}, [0, 2], [1, 0])
# ... or by edge ID (#0 and #2)
g.set_e_repr_by_id({'A': A[[0, 2]], 'B': B[[0, 2]]}, [0, 2])
# Note that the latter two options are available only if you have set at least
# one field on all edges.
# TODO: do we want to fix this behavior to allow initial partial setting?
# Getting edge states is also easy...
e_repr = g.get_e_repr()
A_ = e_repr['A']
assert torch.allclose(A_, A)
# ... and you can also do it either by specifying source-destination pair...
e_repr_subset = g.get_e_repr([0], [1])
assert torch.allclose(e_repr_subset['A'], A[[0]])
# ... or by edge ID
e_repr_subset = g.get_e_repr_by_id([0])
assert torch.allclose(e_repr_subset['A'], A[[0]])
##############################################################################
# One can also remove node/edge states from the graph. This is particularly
# useful to save memory during inference.
B_ = g.pop_e_repr('B')
assert torch.allclose(B_, B)
##############################################################################
# Section 3. Updating node/edge states
# ------------------------------------
# The most direct way to update node/edge states is by getting/setting the
# states directly. Of course, you can update the states on a subset of
# nodes and/or edges this way.
X_new = g.get_n_repr()['X'] + 2
g.set_n_repr({'X': X_new})
##############################################################################
# A better structured implementation would wrap the update procedure as a
# function/module, to decouple the update logic from the rest of the system.
def updateX(node_state_dict):
return {'X': node_state_dict['X'] + 2}
g.set_n_repr(updateX(g.get_n_repr()))
##############################################################################
# If your node state update function is a **node-wise map** operation (i.e.
# the update of a single node only depends on the current state of that
# particular node), you can also call the ``apply_nodes`` method.
#
# .. note::
# In distributed computation,
g.apply_nodes(apply_node_func=updateX)
# You can also update node states partially
g.apply_nodes(v=[0, 1], apply_node_func=updateX)
##############################################################################
# For edges, DGL also has an ``apply_edges`` method for **edge-wise map**
# operations.
def updateA(edge_state_dict):
return {'A': edge_state_dict['A'] + 2}
g.apply_edges(apply_edge_func=updateA)
# You can also update edge states by specifying endpoints or edge IDs
g.apply_edges(u=[0, 2], v=[1, 0], apply_edge_func=updateA)
g.apply_edges(eid=[0, 2], apply_edge_func=updateA)