"tests/python/git@developer.sourcefind.cn:OpenDAS/dgl.git" did not exist on "ae4a5b739412d817da36b86c858f00e9605022a9"
Commit 16cc5287 authored by VoVAllen, committed by Minjie Wang

[Doc] Improve Capsule with Jinyang & Fix wrong tutorial level layout (#236)

* improve capsule tutorial with jinyang

* fix wrong layout of second-level tutorial

* delete transformer
parent dafe4671
# -*- coding: utf-8 -*-
#
# Configuration file for the Sphinx documentation builder.
#
# This file does only contain a selection of the most common options. For a
# full list see the documentation:
# http://www.sphinx-doc.org/en/master/config

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
sys.path.insert(0, os.path.abspath('../../python'))

# -- Project information -----------------------------------------------------

project = 'DGL'
copyright = '2018, DGL Team'
author = 'DGL Team'

# The short X.Y version
version = '0.0.1'
# The full version, including alpha/beta/rc tags
release = '0.0.1'

# -- General configuration ---------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.autosummary',
    'sphinx.ext.coverage',
    'sphinx.ext.mathjax',
    'sphinx.ext.napoleon',
    'sphinx.ext.viewcode',
    'sphinx.ext.intersphinx',
    'sphinx.ext.graphviz',
    'sphinx_gallery.gen_gallery',
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix(es) of source filenames.
# You can specify multiple suffixes as a list of strings:
#
source_suffix = ['.rst', '.md']

# The master toctree document.
master_doc = 'index'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = None

# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'

# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']

# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
#
# The default sidebars (for documents that don't match any pattern) are
# defined by theme itself. Builtin themes are using these templates by
# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
# 'searchbox.html']``.
#
# html_sidebars = {}

# -- Options for HTMLHelp output ---------------------------------------------

# Output file base name for HTML help builder.
htmlhelp_basename = 'dgldoc'

# -- Options for LaTeX output ------------------------------------------------

latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #
    # 'papersize': 'letterpaper',

    # The font size ('10pt', '11pt' or '12pt').
    #
    # 'pointsize': '10pt',

    # Additional stuff for the LaTeX preamble.
    #
    # 'preamble': '',

    # Latex figure (float) alignment
    #
    # 'figure_align': 'htbp',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
#  author, documentclass [howto, manual, or own class]).
latex_documents = [
    (master_doc, 'dgl.tex', 'DGL Documentation',
     'DGL Team', 'manual'),
]

# -- Options for manual page output ------------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
    (master_doc, 'dgl', 'DGL Documentation',
     [author], 1)
]

# -- Options for Texinfo output ----------------------------------------------

# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
#  dir menu entry, description, category)
texinfo_documents = [
    (master_doc, 'dgl', 'DGL Documentation',
     author, 'dgl', 'Library for deep learning on graphs.',
     'Miscellaneous'),
]

# -- Options for Epub output -------------------------------------------------

# Bibliographic Dublin Core info.
epub_title = project

# The unique identifier of the text. This can be an ISBN number
# or the project homepage.
#
# epub_identifier = ''

# A unique identification for the text.
#
# epub_uid = ''

# A list of files that should not be packed into the epub file.
epub_exclude_files = ['search.html']

# -- Extension configuration -------------------------------------------------

autosummary_generate = True

intersphinx_mapping = {
    'python': ('https://docs.python.org/{.major}'.format(sys.version_info), None),
    'numpy': ('http://docs.scipy.org/doc/numpy/', None),
    'scipy': ('http://docs.scipy.org/doc/scipy/reference', None),
    'matplotlib': ('http://matplotlib.org/', None),
    'networkx': ('https://networkx.github.io/documentation/stable', None),
}

# sphinx gallery configurations
from sphinx_gallery.sorting import FileNameSortKey
examples_dirs = ['../../tutorials/basics', '../../tutorials/models']  # path to find sources
gallery_dirs = ['tutorials/basics', 'tutorials/models']  # path to generate docs
reference_url = {
    'dgl': None,
    'numpy': 'http://docs.scipy.org/doc/numpy/',
    'scipy': 'http://docs.scipy.org/doc/scipy/reference',
    'matplotlib': 'http://matplotlib.org/',
    'networkx': 'https://networkx.github.io/documentation/stable',
}
sphinx_gallery_conf = {
    'backreferences_dir': 'generated/backreferences',
    'doc_module': ('dgl', 'numpy'),
    'examples_dirs': examples_dirs,
    'gallery_dirs': gallery_dirs,
    'within_subsection_order': FileNameSortKey,
    'filename_pattern': '.py',
}
@@ -65,7 +65,8 @@ credit, see `here <https://www.dgl.ai/ack>`_.
   :caption: Tutorials
   :glob:

   tutorials/basics/index
   tutorials/models/index

.. toctree::
   :maxdepth: 2
...
"""
.. currentmodule:: dgl

DGL Basics
==========

**Author**: `Minjie Wang <https://jermainewang.github.io/>`_, Quan Gan, Yu Gai,
Zheng Zhang

The goal of this tutorial:

* To create a graph.
* To read and write node and edge representations.
"""
###############################################################################
# Graph Creation
# --------------
# The design of :class:`DGLGraph` was influenced by other graph libraries. Indeed,
# you can create a graph from networkx and convert it into a :class:`DGLGraph` and
# vice versa:

import networkx as nx
import dgl

g_nx = nx.petersen_graph()
g_dgl = dgl.DGLGraph(g_nx)

import matplotlib.pyplot as plt
plt.subplot(121)
nx.draw(g_nx, with_labels=True)
plt.subplot(122)
nx.draw(g_dgl.to_networkx(), with_labels=True)
plt.show()

###############################################################################
# They are the same graph, except that :class:`DGLGraph` is *always* directed.
#
# One can also create a graph by calling DGL's own interface.
#
# Now let's build a star graph. :class:`DGLGraph` nodes are a consecutive range
# of integers between 0 and :func:`number_of_nodes() <DGLGraph.number_of_nodes>`,
# and the graph can grow by calling :func:`add_nodes <DGLGraph.add_nodes>`.
# :class:`DGLGraph` edges are in the order of their addition. Note that
# edges are accessed in much the same way as nodes, with one extra feature:
# *edge broadcasting*.

import dgl
import torch as th

g = dgl.DGLGraph()
g.add_nodes(10)
# a couple of edges, one by one
for i in range(1, 4):
    g.add_edge(i, 0)
# a few more with a paired list
src = list(range(5, 8)); dst = [0] * 3
g.add_edges(src, dst)
# finish with a pair of tensors
src = th.tensor([8, 9]); dst = th.tensor([0, 0])
g.add_edges(src, dst)

# edge broadcasting will build the star graph in one go!
g.clear(); g.add_nodes(10)
src = th.tensor(list(range(1, 10)))
g.add_edges(src, 0)

import networkx as nx
import matplotlib.pyplot as plt
nx.draw(g.to_networkx(), with_labels=True)
plt.show()
###############################################################################
# Feature Assignment
# ------------------
# One can also assign features to the nodes and edges of a :class:`DGLGraph`. The
# features are represented as a dictionary that maps names (strings) to tensors,
# called **fields**.
#
# The following code snippet assigns each node a vector (len=3).
#
# .. note::
#
#    DGL aims to be framework-agnostic, and currently it supports PyTorch and
#    MXNet tensors. From now on, we use PyTorch as an example.

import dgl
import torch as th

x = th.randn(10, 3)
g.ndata['x'] = x

###############################################################################
# :func:`ndata <DGLGraph.ndata>` is syntactic sugar for accessing the states of
# all nodes; the states are stored in a container ``data`` that hosts a
# user-defined dictionary.

print(g.ndata['x'] == g.nodes[:].data['x'])

# access node set with integer, list, or integer tensor
g.nodes[0].data['x'] = th.zeros(1, 3)
g.nodes[[0, 1, 2]].data['x'] = th.zeros(3, 3)
g.nodes[th.tensor([0, 1, 2])].data['x'] = th.zeros(3, 3)

###############################################################################
# Assigning edge features is similar to assigning node features,
# except that one can also do it by specifying the endpoints of the edges.

g.edata['w'] = th.randn(9, 2)

# access edge set with IDs in integer, list, or integer tensor
g.edges[1].data['w'] = th.randn(1, 2)
g.edges[[0, 1, 2]].data['w'] = th.zeros(3, 2)
g.edges[th.tensor([0, 1, 2])].data['w'] = th.zeros(3, 2)

# one can also access the edges by giving their endpoints
g.edges[1, 0].data['w'] = th.ones(1, 2)                  # edge 1 -> 0
g.edges[[1, 2, 3], [0, 0, 0]].data['w'] = th.ones(3, 2)  # edges [1, 2, 3] -> 0

###############################################################################
# After assignment, each node/edge field is associated with a scheme
# containing the shape and data type (dtype) of its field value.

print(g.node_attr_schemes())
g.ndata['x'] = th.zeros((10, 4))
print(g.node_attr_schemes())

###############################################################################
# One can also remove node/edge states from the graph. This is particularly
# useful to save memory during inference.

g.ndata.pop('x')
g.edata.pop('w')
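# As a quick check (a hedged addition, not in the original tutorial), the
# node scheme dictionary should now be empty:
print(g.node_attr_schemes())  # expect an empty mapping after the pops above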
###############################################################################
# Multigraphs
# ~~~~~~~~~~~
# Many graph applications need multi-edges. To enable this, construct :class:`DGLGraph`
# with ``multigraph=True``.

g_multi = dgl.DGLGraph(multigraph=True)
g_multi.add_nodes(10)
g_multi.ndata['x'] = th.randn(10, 2)

g_multi.add_edges(list(range(1, 10)), 0)
g_multi.add_edge(1, 0)  # two edges on 1->0

g_multi.edata['w'] = th.randn(10, 2)
g_multi.edges[1].data['w'] = th.zeros(1, 2)
print(g_multi.edges())

###############################################################################
# An edge in a multigraph cannot be uniquely identified by its incident nodes
# :math:`u` and :math:`v`; to query their edge IDs, use the ``edge_id`` interface.

eid_10 = g_multi.edge_id(1, 0)
g_multi.edges[eid_10].data['w'] = th.ones(len(eid_10), 2)
print(g_multi.edata['w'])
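# As a quick check (a hedged addition, not in the original tutorial): the ID
# query above returns *all* edges between the pair, so both parallel 1->0
# edges were updated at once.
print(eid_10)  # expect two edge IDs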
###############################################################################
# .. note::
#
#    * Nodes and edges can be added but not removed; we will support removal in
#      the future.
#    * Updating a feature of a different scheme raises an error on an individual
#      node (or a node subset).

###############################################################################
# Next steps
# ----------
# In the :doc:`next tutorial <3_pagerank>`, we will go through the
# DGL message passing interface by implementing PageRank.
"""
.. currentmodule:: dgl

PageRank with DGL Message Passing
=================================

**Author**: `Minjie Wang <https://jermainewang.github.io/>`_, Quan Gan, Yu Gai,
Zheng Zhang

In this section we illustrate the usage of different levels of the DGL message
passing API with PageRank on a small graph. In DGL, the message passing and
feature transformations are all **User-Defined Functions** (UDFs).

The goal of this tutorial: to implement PageRank using the DGL message passing
interface.
"""
###############################################################################
# The PageRank Algorithm
# ----------------------
# In each iteration of PageRank, every node (web page) first scatters its
# PageRank value uniformly to its downstream nodes. The new PageRank value of
# each node is computed by aggregating the received PageRank values from its
# neighbors, which is then adjusted by the damping factor:
#
# .. math::
#
#    PV(u) = \frac{1-d}{N} + d \times \sum_{v \in \mathcal{N}(u)}
#    \frac{PV(v)}{D(v)}
#
# where :math:`N` is the number of nodes in the graph; :math:`D(v)` is the
# out-degree of a node :math:`v`; and :math:`\mathcal{N}(u)` is the set of
# neighbor nodes.
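#
# To make the update concrete, here is a minimal, framework-free sketch
# (a hypothetical illustration, not part of the original tutorial) that
# applies one PageRank update to a made-up three-node directed graph:

# Toy graph (hypothetical): edges 0->1, 0->2, 1->2, 2->0.
toy_edges = [(0, 1), (0, 2), (1, 2), (2, 0)]
n_toy, d_toy = 3, 0.85
toy_out_deg = [sum(1 for s, _ in toy_edges if s == u) for u in range(n_toy)]
pv_toy = [1.0 / n_toy] * n_toy
# PV(u) = (1 - d) / N + d * sum over in-neighbors v of PV(v) / D(v)
pv_toy = [
    (1 - d_toy) / n_toy
    + d_toy * sum(pv_toy[s] / toy_out_deg[s] for s, t in toy_edges if t == u)
    for u in range(n_toy)
]
print(pv_toy)  # the values still sum to 1.0 after the update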
###############################################################################
# A naive implementation
# ----------------------
# Let us first create a graph with 100 nodes using NetworkX and convert it to a
# :class:`DGLGraph`:

import networkx as nx
import matplotlib.pyplot as plt
import torch
import dgl

N = 100     # number of nodes
DAMP = 0.85 # damping factor
K = 10      # number of iterations
g = nx.erdos_renyi_graph(N, 0.1)
g = dgl.DGLGraph(g)
nx.draw(g.to_networkx(), node_size=50, node_color=[[.5, .5, .5,]])
plt.show()
###############################################################################
# According to the algorithm, PageRank consists of two phases in a typical
# scatter-gather pattern. We first initialize the PageRank value of each node
# to :math:`\frac{1}{N}` and store each node's out-degree as a node feature:

g.ndata['pv'] = torch.ones(N) / N
g.ndata['deg'] = g.out_degrees(g.nodes()).float()

###############################################################################
# We then define the message function, which divides every node's PageRank
# value by its out-degree and passes the result as a message to its neighbors:

def pagerank_message_func(edges):
    return {'pv' : edges.src['pv'] / edges.src['deg']}

###############################################################################
# In DGL, the message functions are expressed as **Edge UDFs**. Edge UDFs
# take in a single argument ``edges``. It has three members ``src``, ``dst``,
# and ``data`` for accessing source node features, destination node features,
# and edge features, respectively. Here, the function computes messages only
# from source node features.
#
# Next, we define the reduce function, which removes and aggregates the
# messages from its ``mailbox``, and computes the new PageRank value:

def pagerank_reduce_func(nodes):
    msgs = torch.sum(nodes.mailbox['pv'], dim=1)
    pv = (1 - DAMP) / N + DAMP * msgs
    return {'pv' : pv}

###############################################################################
# The reduce functions are **Node UDFs**. Node UDFs have a single argument
# ``nodes``, which has two members ``data`` and ``mailbox``. ``data``
# contains the node features while ``mailbox`` contains all incoming message
# features, stacked along the second dimension (hence the ``dim=1`` argument).
#
# The message UDF works on a batch of edges, whereas the reduce UDF consumes
# the messages of a batch of edges but outputs a batch of nodes. Their
# relationships are as follows:
#
# .. image:: https://i.imgur.com/kIMiuFb.png
#
# We register the message function and reduce function, which will be called
# later by DGL.

g.register_message_func(pagerank_message_func)
g.register_reduce_func(pagerank_reduce_func)

###############################################################################
# The algorithm is then very straightforward. Here is the code for one
# PageRank iteration:

def pagerank_naive(g):
    # Phase #1: send out messages along all edges.
    for u, v in zip(*g.edges()):
        g.send((u, v))
    # Phase #2: receive messages to compute new PageRank values.
    for v in g.nodes():
        g.recv(v)

###############################################################################
# Improvement with batching semantics
# -----------------------------------
# The above code does not scale to large graphs because it iterates over all
# the nodes. DGL solves this by letting users compute on a *batch* of nodes or
# edges. For example, the following code triggers the message and reduce
# functions on multiple nodes and edges at once.

def pagerank_batch(g):
    g.send(g.edges())
    g.recv(g.nodes())

###############################################################################
# Note that we are still using the same reduce function ``pagerank_reduce_func``,
# where ``nodes.mailbox['pv']`` is a *single* tensor, stacking the incoming
# messages along the second dimension.
#
# Naturally, one will wonder whether it is even possible to perform reduce on
# all nodes in parallel, since each node may have a different number of incoming
# messages and one cannot really "stack" tensors of different lengths together.
# In general, DGL solves the problem by grouping the nodes by the number of
# incoming messages, and calling the reduce function for each group.
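#
# The following is a hedged sketch of this "group by in-degree" idea (often
# called degree bucketing). It illustrates the concept on made-up mailboxes
# only; it is not DGL's internal implementation.

import torch

# hypothetical mailboxes: node id -> list of incoming message tensors
mailboxes = {0: [torch.ones(2)] * 3, 1: [torch.ones(2)] * 1, 2: [torch.ones(2)] * 3}

# group nodes by their number of incoming messages
buckets = {}
for node, msgs in mailboxes.items():
    buckets.setdefault(len(msgs), []).append(node)

# one batched reduce call per bucket: stack to (num_nodes, num_msgs, feat_dim)
for degree, bucket_nodes in buckets.items():
    stacked = torch.stack([torch.stack(mailboxes[n]) for n in bucket_nodes])
    summed = stacked.sum(dim=1)  # reduce over the message dimension
    print(degree, bucket_nodes, summed.shape)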
###############################################################################
# More improvement with higher level APIs
# ---------------------------------------
# DGL provides many routines that combine basic ``send`` and ``recv`` in
# various ways. They are called **level-2 APIs**. For example, the PageRank
# example can be further simplified as follows:

def pagerank_level2(g):
    g.update_all()

###############################################################################
# Besides ``update_all``, we also have ``pull``, ``push``, and ``send_and_recv``
# in this level-2 category. Please refer to the :doc:`API reference <../../api/python/graph>`
# for more details.

###############################################################################
# Even more improvement with DGL builtin functions
# ------------------------------------------------
# As some of the message and reduce functions are very commonly used, DGL also
# provides **builtin functions**. For example, two builtin functions can be
# used in the PageRank example.
#
# * :func:`dgl.function.copy_src(src, out) <function.copy_src>`
#   is an edge UDF that computes the
#   output using the source node feature data. One needs to specify the name of
#   the source feature data (``src``) and the output name (``out``).
#
# * :func:`dgl.function.sum(msg, out) <function.sum>` is a node UDF
#   that sums the messages in
#   the node's mailbox. One needs to specify the message name (``msg``) and the
#   output name (``out``).
#
# For example, the PageRank example can be rewritten as follows:

import dgl.function as fn

def pagerank_builtin(g):
    g.ndata['pv'] = g.ndata['pv'] / g.ndata['deg']
    g.update_all(message_func=fn.copy_src(src='pv', out='m'),
                 reduce_func=fn.sum(msg='m', out='m_sum'))
    g.ndata['pv'] = (1 - DAMP) / N + DAMP * g.ndata['m_sum']

###############################################################################
# Here, we directly provide the UDFs to :func:`update_all <DGLGraph.update_all>`
# as its arguments.
# This will override the previously registered UDFs.
#
# In addition to cleaner code, using builtin functions also gives DGL the
# opportunity to fuse operations together, resulting in faster execution. For
# example, DGL will fuse the ``copy_src`` message function and the ``sum`` reduce
# function into one sparse matrix-vector multiplication (spMV).
#
# `This section <spmv_>`_ describes why spMV can speed up the scatter-gather
# phase in PageRank. For more details about the builtin functions in DGL,
# please read the :doc:`API reference <../../api/python/function>`.
#
# You can also download and run the code to see the difference.

for k in range(K):
    # Uncomment the corresponding line to select a different version.
    # pagerank_naive(g)
    # pagerank_batch(g)
    # pagerank_level2(g)
    pagerank_builtin(g)
print(g.ndata['pv'])
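# A hedged sanity check (not part of the original tutorial): with no dangling
# nodes (true for this random graph with high probability), the PageRank
# vector remains a probability distribution, and we can inspect the top pages.
print(torch.sum(g.ndata['pv']))        # expected to stay close to 1.0
print(torch.topk(g.ndata['pv'], k=5))  # the five highest-ranked nodes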
###############################################################################
# .. _spmv:
#
# Using spMV for PageRank
# -----------------------
# Using builtin functions allows DGL to understand the semantics of UDFs and
# thus allows a more efficient implementation for you. For example, in the case
# of PageRank, one common trick to accelerate it is to use its linear algebra
# form:
#
# .. math::
#
#    \mathbf{R}^{k} = \frac{1-d}{N} \mathbf{1} + d \mathbf{A} \mathbf{R}^{k-1}
#
# Here, :math:`\mathbf{R}^k` is the vector of the PageRank values of all nodes
# at iteration :math:`k`, and :math:`\mathbf{A}` is the sparse adjacency matrix
# of the graph.
# Computing this equation is quite efficient because there exist efficient
# GPU kernels for *sparse matrix-vector multiplication* (spMV). DGL
# detects whether such an optimization is available through the builtin
# functions. If a certain combination of builtins can be mapped to an spMV
# kernel (e.g. the PageRank example), DGL will use it automatically. As a
# result, *we recommend using builtin functions whenever possible*.
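#
# To make the connection concrete, here is a hedged scipy sketch of the
# linear-algebra form above. It assumes the adjacency matrix is normalized
# by out-degree (a column-stochastic transition matrix), which mirrors what
# the fused ``copy_src``/``sum`` pair computes; it is not DGL's actual kernel.

import numpy as np
import scipy.sparse as sp

src, dst = g.edges()
src, dst = src.numpy(), dst.numpy()
deg = g.out_degrees(g.nodes()).numpy()
# entry (v, u) holds 1 / D(u) for every edge u -> v
A_hat = sp.csr_matrix((1.0 / deg[src], (dst, src)), shape=(N, N))

r = np.full(N, 1.0 / N)
for _ in range(K):
    r = (1 - DAMP) / N + DAMP * (A_hat @ r)
print(r[:5])  # should roughly match g.ndata['pv'][:5]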
###############################################################################
# Next steps
# ----------
# Check out :doc:`GCN <../models/1_gnn/1_gcn>` and :doc:`Capsule <../models/4_old_wines/2_capsule>`
# for more model implementations in DGL.
Basic Tutorials
===============

These tutorials cover the basics of DGL.
"""
.. _model-gcn:

Graph Convolutional Network
====================================

**Author:** `Qi Huang <https://github.com/HQ01>`_, `Minjie Wang <https://jermainewang.github.io/>`_,
Yu Gai, Quan Gan, Zheng Zhang

This is a gentle introduction to using DGL to implement Graph Convolutional
Networks (Kipf & Welling, `Semi-Supervised Classification with Graph
Convolutional Networks <https://arxiv.org/pdf/1609.02907.pdf>`_). We build upon
the :doc:`earlier tutorial <../../basics/3_pagerank>` on DGLGraph and demonstrate
how DGL combines graphs with deep neural networks to learn structural representations.
"""
###############################################################################
# Model Overview
# ------------------------------------------
# GCN from the perspective of message passing
# ```````````````````````````````````````````````
# We describe a layer of a graph convolutional neural network from a message
# passing perspective; the math can be found `here <math_>`_.
# It boils down to the following steps, for each node :math:`u`:
#
# 1) Aggregate the neighbors' representations :math:`h_{v}` to produce an
#    intermediate representation :math:`\hat{h}_u`.
#
# 2) Transform the aggregated representation :math:`\hat{h}_{u}` with a linear
#    projection followed by a non-linearity: :math:`h_{u} = f(W_{u} \hat{h}_u)`.
#
# We will implement step 1 with DGL message passing, and step 2 with the
# ``apply_nodes`` method, whose node UDF will be a PyTorch ``nn.Module``.
#
# GCN implementation with DGL
# ``````````````````````````````````````````
# We first define the message and reduce functions as usual. Since the
# aggregation on a node :math:`u` only involves summing over the neighbors'
# representations :math:`h_v`, we can simply use builtin functions:
import dgl
import dgl.function as fn
import torch as th
import torch.nn as nn
import torch.nn.functional as F
from dgl import DGLGraph

gcn_msg = fn.copy_src(src='h', out='m')
gcn_reduce = fn.sum(msg='m', out='h')

###############################################################################
# We then define the node UDF for ``apply_nodes``, which is a fully-connected layer:

class NodeApplyModule(nn.Module):
    def __init__(self, in_feats, out_feats, activation):
        super(NodeApplyModule, self).__init__()
        self.linear = nn.Linear(in_feats, out_feats)
        self.activation = activation

    def forward(self, node):
        h = self.linear(node.data['h'])
        h = self.activation(h)
        return {'h' : h}

###############################################################################
# We then proceed to define the GCN module. A GCN layer essentially performs
# message passing on all the nodes and then applies the ``NodeApplyModule``. Note
# that we omit the dropout in the paper for simplicity.

class GCN(nn.Module):
    def __init__(self, in_feats, out_feats, activation):
        super(GCN, self).__init__()
        self.apply_mod = NodeApplyModule(in_feats, out_feats, activation)

    def forward(self, g, feature):
        g.ndata['h'] = feature
        g.update_all(gcn_msg, gcn_reduce)
        g.apply_nodes(func=self.apply_mod)
        return g.ndata.pop('h')

###############################################################################
# The forward function is essentially the same as that of any other commonly
# seen NN model in PyTorch. We can initialize GCN like any ``nn.Module``. For
# example, let's define a simple neural network consisting of two GCN layers.
# Suppose we are training the classifier for the cora dataset (the input
# feature size is 1433 and the number of classes is 7).

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.gcn1 = GCN(1433, 16, F.relu)
        self.gcn2 = GCN(16, 7, F.relu)

    def forward(self, g, features):
        x = self.gcn1(g, features)
        x = self.gcn2(g, x)
        return x
net = Net()
print(net)

###############################################################################
# We load the cora dataset using DGL's built-in data module.

from dgl.data import citation_graph as citegrh

def load_cora_data():
    data = citegrh.load_cora()
    features = th.FloatTensor(data.features)
    labels = th.LongTensor(data.labels)
    mask = th.ByteTensor(data.train_mask)
    g = DGLGraph(data.graph)
    return g, features, labels, mask

###############################################################################
# We then train the network as follows:

import time
import numpy as np

g, features, labels, mask = load_cora_data()
optimizer = th.optim.Adam(net.parameters(), lr=1e-3)
dur = []
for epoch in range(30):
    if epoch >= 3:
        t0 = time.time()

    logits = net(g, features)
    logp = F.log_softmax(logits, 1)
    loss = F.nll_loss(logp[mask], labels[mask])

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch >= 3:
        dur.append(time.time() - t0)

    print("Epoch {:05d} | Loss {:.4f} | Time(s) {:.4f}".format(
            epoch, loss.item(), np.mean(dur)))
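# A hedged evaluation sketch (not part of the original tutorial), assuming the
# cora data object also exposes a ``test_mask`` field alongside ``train_mask``.
def evaluate(net, g, features, labels, mask):
    net.eval()
    with th.no_grad():
        logits = net(g, features)
        _, predicted = th.max(logits[mask], dim=1)
        correct = (predicted == labels[mask]).sum().item()
        return correct / mask.sum().item()

data = citegrh.load_cora()
test_mask = th.ByteTensor(data.test_mask)
print('Test accuracy:', evaluate(net, g, features, labels, test_mask))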
###############################################################################
# .. _math:
#
# GCN in one formula
# ------------------
# Mathematically, the GCN model follows this formula:
#
# :math:`H^{(l+1)} = \sigma(\tilde{D}^{-\frac{1}{2}}\tilde{A}\tilde{D}^{-\frac{1}{2}}H^{(l)}W^{(l)})`
#
# Here, :math:`H^{(l)}` denotes the :math:`l^{th}` layer in the network,
# :math:`\sigma` is the non-linearity, and :math:`W` is the weight matrix for
# this layer. :math:`D` and :math:`A`, as commonly seen, represent the degree
# matrix and adjacency matrix, respectively. The tilde denotes a renormalization
# trick in which we add a self-connection to each node of the graph, and build
# the corresponding degree and adjacency matrices. The shape of the input
# :math:`H^{(0)}` is :math:`N \times D`, where :math:`N` is the number of nodes
# and :math:`D` is the number of input features. We can chain up multiple
# layers as such to produce a node-level representation output with shape
# :math:`N \times F`, where :math:`F` is the dimension of the output node
# feature vector.
#
# The equation can be efficiently implemented using sparse matrix
# multiplication kernels (such as in Kipf's
# `pygcn <https://github.com/tkipf/pygcn>`_ code). The above DGL implementation
# in fact already uses this trick due to the use of builtin functions. To
# understand what is under the hood, please read our tutorial on :doc:`PageRank <../../basics/3_pagerank>`.
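#
# To make the renormalization trick concrete, here is a hedged numpy sketch on
# a made-up three-node toy graph (an illustration only, separate from the
# model code above):

import numpy as np

A = np.array([[0., 1., 0.],
              [1., 0., 1.],
              [0., 1., 0.]])             # toy adjacency matrix (a 3-node path)
A_tilde = A + np.eye(3)                  # add a self-connection to each node
d_tilde = A_tilde.sum(axis=1)            # degrees of the renormalized graph
D_inv_sqrt = np.diag(1.0 / np.sqrt(d_tilde))
P = D_inv_sqrt @ A_tilde @ D_inv_sqrt    # D^{-1/2} A D^{-1/2} (with tildes)

H0 = np.random.randn(3, 4)               # input features, shape N x D
W0 = np.random.randn(4, 2)               # layer weights, shape D x F
H1 = np.maximum(P @ H0 @ W0, 0.0)        # one GCN layer with a ReLU
print(H1.shape)                          # (3, 2), i.e. N x F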
.. _tutorials1-index:

Graph Neural Network and its variants
--------------------------------------
* **GCN** `[paper] <https://arxiv.org/abs/1609.02907>`__ `[tutorial] <models/1_gcn.html>`__
  `[code] <https://github.com/jermainewang/dgl/blob/master/examples/pytorch/gcn/gcn.py>`__:
  this is the vanilla GCN. The tutorial covers the basic uses of DGL APIs.
* **GAT** `[paper] <https://arxiv.org/abs/1710.10903>`__
  `[code] <https://github.com/jermainewang/dgl/blob/master/examples/pytorch/gat/gat.py>`__:
  the key extension of GAT over the vanilla GCN is deploying multi-head attention
  over the neighborhood of a node, which greatly enhances the capacity and
  expressiveness of the model.
* **R-GCN** `[paper] <https://arxiv.org/abs/1703.06103>`__ `[tutorial] <models/4_rgcn.html>`__
  [code (wip)]: the key
  difference of R-GCN is that it allows multi-edges between two entities of a
  graph, and edges with distinct relationships are encoded differently. This is
  an interesting extension of GCN that can have a lot of applications of its own.
* **LGNN** `[paper] <https://arxiv.org/abs/1705.08415>`__ `[tutorial (wip)]` `[code (wip)]`:
  this model focuses on community detection by inspecting graph structures. It
  uses representations of both the original graph and its line-graph companion. In
  addition to demonstrating how an algorithm can harness multiple graphs, our
  implementation shows how one can judiciously mix vanilla tensor operations,
  sparse-matrix tensor operations, and message-passing with DGL.
* **SSE** `[paper] <http://proceedings.mlr.press/v80/dai18a/dai18a.pdf>`__ `[tutorial (wip)]`
  `[code] <https://github.com/jermainewang/dgl/blob/master/examples/mxnet/sse/sse_batch.py>`__:
  the emphasis here is on *giant* graphs that cannot fit comfortably on one GPU
  card. SSE is an example that illustrates the co-design of both algorithm and
  system: sampling to guarantee asymptotic convergence while lowering the
  complexity, and batching across samples for maximum parallelism.
.. _tutorials2-index:

Dealing with many small graphs
------------------------------
* **Tree-LSTM** `[paper] <https://arxiv.org/abs/1503.00075>`__ `[tutorial] <models/3_tree-lstm.html>`__
  `[code] <https://github.com/jermainewang/dgl/blob/master/examples/pytorch/tree_lstm/tree_lstm.py>`__:
  sentences in natural languages have inherent structures, which are thrown away
  by treating them simply as sequences. Tree-LSTM is a powerful model that learns
  the representation by leveraging prior syntactic structures (e.g. parse trees).
  The challenge in training it well is that simply padding a sentence to the
  maximum length no longer works, since trees of different sentences have
  different sizes and topologies. DGL solves this problem by throwing the trees
  into a bigger "container" graph, and using message-passing to explore maximum
  parallelism. The key API we use is batching.

.. _tutorials3-index:

Generative models
------------------------------
* **DGMG** `[paper] <https://arxiv.org/abs/1803.03324>`__ `[tutorial] <models/5_dgmg.html>`__
  `[code] <https://github.com/jermainewang/dgl/tree/master/examples/pytorch/dgmg>`__:
  this model belongs to the important family that deals with structural
  generation. DGMG is interesting because its state-machine approach is the most
  general. It is also very challenging because, unlike Tree-LSTM, every sample
  has a dynamic, probability-driven structure that is not available before
  training. We are able to progressively leverage intra- and inter-graph
  parallelism to steadily improve the performance.
* **JTNN** `[paper] <https://arxiv.org/abs/1802.04364>`__ `[code (wip)]`: unlike DGMG, this
  paper generates molecular graphs using the framework of a variational
  auto-encoder. Perhaps more interesting is its approach to building structures
  hierarchically, in the case of molecules, with a junction tree as the middle
  scaffolding.

.. _tutorials4-index:

Old (new) wines in new bottles
------------------------------
* **Capsule** `[paper] <https://arxiv.org/abs/1710.09829>`__ `[tutorial] <models/2_capsule.html>`__
  `[code] <https://github.com/jermainewang/dgl/tree/master/examples/pytorch/capsule>`__: this new
  computer vision model has two key ideas -- enhancing the feature representation
  in a vector form (instead of a scalar) called a *capsule*, and replacing
  max-pooling with dynamic routing. The idea of dynamic routing is to integrate a
  lower-level capsule into one (or several) higher-level ones with
  non-parametric message-passing. We show how the latter can be nicely implemented
  with DGL APIs.
* **Transformer** `[paper] <https://arxiv.org/abs/1706.03762>`__ `[tutorial (wip)]` `[code (wip)]` and
  **Universal Transformer** `[paper] <https://arxiv.org/abs/1807.03819>`__ `[tutorial (wip)]`
  `[code (wip)]`: these
  two models replace RNNs with several layers of multi-head attention to encode
  and discover structures among the tokens of a sentence. These attention
  mechanisms can similarly be formulated as graph operations with message-passing.