Unverified commit dce89919 authored by Hongzhi (Steve), Chen and committed by GitHub

[Misc] Auto-reformat multiple python folders. (#5325)



* auto-reformat

* lintrunner

---------
Co-authored-by: Ubuntu <ubuntu@ip-172-31-28-63.ap-northeast-1.compute.internal>
parent ab812179
@@ -15,376 +15,408 @@ Tree-LSTM in DGL
efficiency. For recommended implementation, please refer to the `official
examples <https://github.com/dmlc/dgl/tree/master/examples>`_.
"""
##############################################################################
#
# In this tutorial, you learn to use Tree-LSTM networks for sentiment analysis.
# The Tree-LSTM is a generalization of long short-term memory (LSTM) networks to tree-structured network topologies.
#
# The Tree-LSTM structure was first introduced by Tai et al. in an ACL 2015
# paper: `Improved Semantic Representations From Tree-Structured Long
# Short-Term Memory Networks <https://arxiv.org/pdf/1503.00075.pdf>`__.
# The core idea is to introduce syntactic information for language tasks by
# extending the chain-structured LSTM to a tree-structured LSTM. The dependency
# tree and constituency tree techniques are leveraged to obtain a *latent tree*.
#
# The challenge in training Tree-LSTMs is batching, a standard
# technique in machine learning to accelerate optimization. However, since trees
# generally have different shapes by nature, parallelization is non-trivial.
# DGL offers an alternative: pool all the trees into one single graph, then
# induce message passing over them, guided by the structure of each tree.
#
# The task and the dataset
# ------------------------
#
# The steps here use the
# `Stanford Sentiment Treebank <https://nlp.stanford.edu/sentiment/>`__ in
# ``dgl.data``. The dataset provides a fine-grained, tree-level sentiment
# annotation. There are five classes: very negative, negative, neutral, positive, and
# very positive, which indicate the sentiment of the current subtree. Non-leaf
# nodes in a constituency tree do not contain words, so a special
# ``PAD_WORD`` token is used to denote them. During training and inference,
# their embeddings are masked to all-zero.
#
# .. figure:: https://i.loli.net/2018/11/08/5be3d4bfe031b.png
# :alt:
#
# The figure displays one sample of the SST dataset, which is a
# constituency parse tree with its nodes labeled with sentiment. To
# speed things up, build a tiny set with five sentences and take a look
# at the first one.
#
from collections import namedtuple
import os
os.environ['DGLBACKEND'] = 'pytorch'
import dgl
from dgl.data.tree import SSTDataset
SSTBatch = namedtuple('SSTBatch', ['graph', 'mask', 'wordid', 'label'])
# Each sample in the dataset is a constituency tree. The leaf nodes
# represent words. The word is an int value stored in the "x" field.
# The non-leaf nodes have a special word PAD_WORD. The sentiment
# label is stored in the "y" feature field.
trainset = SSTDataset(mode='tiny') # the "tiny" set has only five trees
tiny_sst = [tr for tr in trainset]
num_vocabs = trainset.vocab_size
num_classes = trainset.num_classes
vocab = trainset.vocab # vocabulary dict: key -> id
inv_vocab = {v: k for k, v in vocab.items()} # inverted vocabulary dict: id -> word
a_tree = tiny_sst[0]
for token in a_tree.ndata['x'].tolist():
if token != trainset.PAD_WORD:
print(inv_vocab[token], end=" ")
##############################################################################
# Step 1: Batching
# ----------------
#
# Add all the trees to one graph, using
# the :func:`~dgl.batched_graph.batch` API.
#
import networkx as nx
import matplotlib.pyplot as plt
graph = dgl.batch(tiny_sst)
def plot_tree(g):
# this plot requires pygraphviz package
pos = nx.nx_agraph.graphviz_layout(g, prog='dot')
nx.draw(g, pos, with_labels=False, node_size=10,
node_color=[[.5, .5, .5]], arrowsize=4)
plt.show()
plot_tree(graph.to_networkx())
#################################################################################
# You can read more about the definition of :func:`~dgl.batch`, or
# skip ahead to the next step:
# .. note::
#
# **Definition**: :func:`~dgl.batch` unions a list of :math:`B`
# :class:`~dgl.DGLGraph`\ s and returns a :class:`~dgl.DGLGraph` of batch
# size :math:`B`.
#
# - The union includes all the nodes,
# edges, and their features. The order of nodes, edges, and features is
# preserved.
#
# - Given that you have :math:`V_i` nodes for graph
# :math:`\mathcal{G}_i`, the node ID :math:`j` in graph
# :math:`\mathcal{G}_i` corresponds to node ID
# :math:`j + \sum_{k=1}^{i-1} V_k` in the batched graph.
#
# - Therefore, performing feature transformation and message passing on
# the batched graph is equivalent to doing those
# on all ``DGLGraph`` constituents in parallel.
#
# - Duplicate references to the same graph are
# treated as deep copies; the nodes, edges, and features are duplicated,
# and mutation on one reference does not affect the other.
# - The batched graph keeps track of the meta
# information of the constituents so it can be
# :func:`~dgl.batched_graph.unbatch`\ ed to a list of ``DGLGraph``\ s.
#
# Step 2: Tree-LSTM cell with message-passing APIs
# ------------------------------------------------
#
# Researchers have proposed two types of Tree-LSTMs: Child-Sum
# Tree-LSTMs, and :math:`N`-ary Tree-LSTMs. In this tutorial you focus
# on applying *Binary* Tree-LSTM to binarized constituency trees. This
# application is also known as *Constituency Tree-LSTM*. Use PyTorch
# as a backend framework to set up the network.
#
# In :math:`N`-ary Tree-LSTM, each unit at node :math:`j` maintains a hidden
# representation :math:`h_j` and a memory cell :math:`c_j`. The unit
# :math:`j` takes the input vector :math:`x_j` and the hidden
# representations of the child units: :math:`h_{jl}, 1\leq l\leq N` as
# input, then updates its hidden representation :math:`h_j` and memory
# cell :math:`c_j` by:
#
# .. math::
#
# i_j & = & \sigma\left(W^{(i)}x_j + \sum_{l=1}^{N}U^{(i)}_l h_{jl} + b^{(i)}\right), & (1)\\
# f_{jk} & = & \sigma\left(W^{(f)}x_j + \sum_{l=1}^{N}U_{kl}^{(f)} h_{jl} + b^{(f)} \right), & (2)\\
# o_j & = & \sigma\left(W^{(o)}x_j + \sum_{l=1}^{N}U_{l}^{(o)} h_{jl} + b^{(o)} \right), & (3) \\
# u_j & = & \textrm{tanh}\left(W^{(u)}x_j + \sum_{l=1}^{N} U_l^{(u)}h_{jl} + b^{(u)} \right), & (4)\\
# c_j & = & i_j \odot u_j + \sum_{l=1}^{N} f_{jl} \odot c_{jl}, &(5) \\
# h_j & = & o_j \cdot \textrm{tanh}(c_j), &(6) \\
#
# The computation can be decomposed into three phases: ``message_func``,
# ``reduce_func``, and ``apply_node_func``.
#
# .. note::
# ``apply_node_func`` is a new node UDF that has not been introduced before. In
# ``apply_node_func``, a user specifies what to do with node features,
# without considering edge features and messages. In the Tree-LSTM case,
# ``apply_node_func`` is a must, since there exist (leaf) nodes with
# :math:`0` incoming edges, which would not be updated by
# ``reduce_func``.
#
import torch as th
import torch.nn as nn
class TreeLSTMCell(nn.Module):
def __init__(self, x_size, h_size):
super(TreeLSTMCell, self).__init__()
self.W_iou = nn.Linear(x_size, 3 * h_size, bias=False)
self.U_iou = nn.Linear(2 * h_size, 3 * h_size, bias=False)
self.b_iou = nn.Parameter(th.zeros(1, 3 * h_size))
self.U_f = nn.Linear(2 * h_size, 2 * h_size)
def message_func(self, edges):
return {'h': edges.src['h'], 'c': edges.src['c']}
def reduce_func(self, nodes):
# concatenate h_jl for equation (1), (2), (3), (4)
h_cat = nodes.mailbox['h'].view(nodes.mailbox['h'].size(0), -1)
# equation (2)
f = th.sigmoid(self.U_f(h_cat)).view(*nodes.mailbox['h'].size())
# second term of equation (5)
c = th.sum(f * nodes.mailbox['c'], 1)
return {'iou': self.U_iou(h_cat), 'c': c}
def apply_node_func(self, nodes):
# equation (1), (3), (4)
iou = nodes.data['iou'] + self.b_iou
i, o, u = th.chunk(iou, 3, 1)
i, o, u = th.sigmoid(i), th.sigmoid(o), th.tanh(u)
# equation (5)
c = i * u + nodes.data['c']
# equation (6)
h = o * th.tanh(c)
return {'h' : h, 'c' : c}
##############################################################################
# Step 3: Define traversal
# ------------------------
#
# After you define the message-passing functions, induce the
# right order in which to trigger them. This is a significant departure from models
# such as GCN, where all nodes pull messages from their upstream neighbors
# *simultaneously*.
#
# In the case of Tree-LSTM, messages start from the leaves of the tree and
# are propagated and processed upwards until they reach the roots. A visualization
# is as follows:
#
# .. figure:: https://i.loli.net/2018/11/09/5be4b5d2df54d.gif
# :alt:
#
# DGL defines a generator to perform the topological sort; each item is a
# tensor recording the nodes from the bottom level to the roots. One can
# appreciate the degree of parallelism by comparing the following two
# outputs:
#
# to heterogeneous graph
trv_a_tree = dgl.graph(a_tree.edges())
print('Traversing one tree:')
print(dgl.topological_nodes_generator(trv_a_tree))
# to heterogeneous graph
trv_graph = dgl.graph(graph.edges())
print('Traversing many trees at the same time:')
print(dgl.topological_nodes_generator(trv_graph))
##############################################################################
# Call :meth:`~dgl.DGLGraph.prop_nodes` to trigger the message passing:
import dgl.function as fn
import torch as th
trv_graph.ndata['a'] = th.ones(graph.number_of_nodes(), 1)
traversal_order = dgl.topological_nodes_generator(trv_graph)
trv_graph.prop_nodes(traversal_order,
message_func=fn.copy_u('a', 'a'),
reduce_func=fn.sum('a', 'a'))
# the following is syntactic sugar that does the same
# dgl.prop_nodes_topo(graph)
##############################################################################
# .. note::
#
# Before you call :meth:`~dgl.DGLGraph.prop_nodes`, specify a
# ``message_func`` and a ``reduce_func``. This example uses the built-in
# copy-from-source function as the message function and the built-in
# sum function as the reduce function.
#
# Putting it together
# -------------------
#
# Here is the complete code that specifies the ``Tree-LSTM`` class.
#
class TreeLSTM(nn.Module):
def __init__(self,
num_vocabs,
x_size,
h_size,
num_classes,
dropout,
pretrained_emb=None):
super(TreeLSTM, self).__init__()
self.x_size = x_size
self.embedding = nn.Embedding(num_vocabs, x_size)
if pretrained_emb is not None:
print('Using glove')
self.embedding.weight.data.copy_(pretrained_emb)
self.embedding.weight.requires_grad = True
self.dropout = nn.Dropout(dropout)
self.linear = nn.Linear(h_size, num_classes)
self.cell = TreeLSTMCell(x_size, h_size)
def forward(self, batch, h, c):
"""Compute tree-lstm prediction given a batch.
Parameters
----------
batch : dgl.data.SSTBatch
The data batch.
h : Tensor
Initial hidden state.
c : Tensor
Initial cell state.
Returns
-------
logits : Tensor
The prediction of each node.
"""
g = batch.graph
# to heterogeneous graph
g = dgl.graph(g.edges())
# feed embedding
embeds = self.embedding(batch.wordid * batch.mask)
g.ndata['iou'] = self.cell.W_iou(self.dropout(embeds)) * batch.mask.float().unsqueeze(-1)
g.ndata['h'] = h
g.ndata['c'] = c
# propagate
dgl.prop_nodes_topo(g,
message_func=self.cell.message_func,
reduce_func=self.cell.reduce_func,
apply_node_func=self.cell.apply_node_func)
# compute logits
h = self.dropout(g.ndata.pop('h'))
logits = self.linear(h)
return logits
##############################################################################
# Main Loop
# ---------
#
# Finally, write the training loop in PyTorch.
#
from torch.utils.data import DataLoader
import torch.nn.functional as F
device = th.device('cpu')
# hyperparameters
x_size = 256
h_size = 256
dropout = 0.5
lr = 0.05
weight_decay = 1e-4
epochs = 10
# create the model
model = TreeLSTM(trainset.vocab_size,
x_size,
h_size,
trainset.num_classes,
dropout)
print(model)
# create the optimizer
optimizer = th.optim.Adagrad(model.parameters(),
lr=lr,
weight_decay=weight_decay)
def batcher(dev):
def batcher_dev(batch):
batch_trees = dgl.batch(batch)
return SSTBatch(graph=batch_trees,
mask=batch_trees.ndata['mask'].to(device),
wordid=batch_trees.ndata['x'].to(device),
label=batch_trees.ndata['y'].to(device))
return batcher_dev
train_loader = DataLoader(dataset=tiny_sst,
batch_size=5,
collate_fn=batcher(device),
shuffle=False,
num_workers=0)
# training loop
for epoch in range(epochs):
for step, batch in enumerate(train_loader):
g = batch.graph
n = g.number_of_nodes()
h = th.zeros((n, h_size))
c = th.zeros((n, h_size))
logits = model(batch, h, c)
logp = F.log_softmax(logits, 1)
loss = F.nll_loss(logp, batch.label, reduction='sum')
optimizer.zero_grad()
loss.backward()
optimizer.step()
pred = th.argmax(logits, 1)
acc = float(th.sum(th.eq(batch.label, pred))) / len(batch.label)
print("Epoch {:05d} | Step {:05d} | Loss {:.4f} | Acc {:.4f} |".format(
epoch, step, loss.item(), acc))
##############################################################################
# To train the model on a full dataset with different settings (such as CPU or GPU),
# refer to the `PyTorch example <https://github.com/dmlc/dgl/tree/master/examples/pytorch/tree_lstm>`__.
# There is also an implementation of the Child-Sum Tree-LSTM.
"""
import os
##############################################################################
#
# In this tutorial, you learn to use Tree-LSTM networks for sentiment analysis.
# The Tree-LSTM is a generalization of long short-term memory (LSTM) networks to tree-structured network topologies.
#
# The Tree-LSTM structure was first introduced by Tai et al. in an ACL 2015
# paper: `Improved Semantic Representations From Tree-Structured Long
# Short-Term Memory Networks <https://arxiv.org/pdf/1503.00075.pdf>`__.
# The core idea is to introduce syntactic information for language tasks by
# extending the chain-structured LSTM to a tree-structured LSTM. The dependency
# tree and constituency tree techniques are leveraged to obtain a *latent tree*.
#
# The challenge in training Tree-LSTMs is batching, a standard
# technique in machine learning to accelerate optimization. However, since trees
# generally have different shapes by nature, parallelization is non-trivial.
# DGL offers an alternative: pool all the trees into one single graph, then
# induce message passing over them, guided by the structure of each tree.
#
# The task and the dataset
# ------------------------
#
# The steps here use the
# `Stanford Sentiment Treebank <https://nlp.stanford.edu/sentiment/>`__ in
# ``dgl.data``. The dataset provides a fine-grained, tree-level sentiment
# annotation. There are five classes: very negative, negative, neutral, positive, and
# very positive, which indicate the sentiment of the current subtree. Non-leaf
# nodes in a constituency tree do not contain words, so a special
# ``PAD_WORD`` token is used to denote them. During training and inference,
# their embeddings are masked to all-zero.
#
# .. figure:: https://i.loli.net/2018/11/08/5be3d4bfe031b.png
# :alt:
#
# The figure displays one sample of the SST dataset, which is a
# constituency parse tree with its nodes labeled with sentiment. To
# speed things up, build a tiny set with five sentences and take a look
# at the first one.
#
from collections import namedtuple
os.environ["DGLBACKEND"] = "pytorch"
import dgl
from dgl.data.tree import SSTDataset
SSTBatch = namedtuple("SSTBatch", ["graph", "mask", "wordid", "label"])
# Each sample in the dataset is a constituency tree. The leaf nodes
# represent words. The word is an int value stored in the "x" field.
# The non-leaf nodes have a special word PAD_WORD. The sentiment
# label is stored in the "y" feature field.
trainset = SSTDataset(mode="tiny") # the "tiny" set has only five trees
tiny_sst = [tr for tr in trainset]
num_vocabs = trainset.vocab_size
num_classes = trainset.num_classes
vocab = trainset.vocab # vocabulary dict: key -> id
inv_vocab = {
v: k for k, v in vocab.items()
} # inverted vocabulary dict: id -> word
a_tree = tiny_sst[0]
for token in a_tree.ndata["x"].tolist():
if token != trainset.PAD_WORD:
print(inv_vocab[token], end=" ")
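# A quick look at the first tree's size and the dataset's dimensions
# (an illustrative check; it only uses values already loaded above).
print("\nNodes in the first tree:", a_tree.number_of_nodes())
print("Vocabulary size:", num_vocabs)
print("Number of classes:", num_classes)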
import matplotlib.pyplot as plt
##############################################################################
# Step 1: Batching
# ----------------
#
# Add all the trees to one graph, using
# the :func:`~dgl.batched_graph.batch` API.
#
import networkx as nx
graph = dgl.batch(tiny_sst)
def plot_tree(g):
# this plot requires pygraphviz package
pos = nx.nx_agraph.graphviz_layout(g, prog="dot")
nx.draw(
g,
pos,
with_labels=False,
node_size=10,
node_color=[[0.5, 0.5, 0.5]],
arrowsize=4,
)
plt.show()
plot_tree(graph.to_networkx())
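# A small sanity check on the batched graph (illustrative): it simply
# contains the nodes and edges of all five trees pooled together.
print("Number of trees batched:", len(tiny_sst))
print("Total nodes in the batched graph:", graph.number_of_nodes())
print("Total edges in the batched graph:", graph.number_of_edges())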
#################################################################################
# You can read more about the definition of :func:`~dgl.batch`, or
# skip ahead to the next step:
# .. note::
#
# **Definition**: :func:`~dgl.batch` unions a list of :math:`B`
# :class:`~dgl.DGLGraph`\ s and returns a :class:`~dgl.DGLGraph` of batch
# size :math:`B`.
#
# - The union includes all the nodes,
# edges, and their features. The order of nodes, edges, and features is
# preserved.
#
# - Given that you have :math:`V_i` nodes for graph
# :math:`\mathcal{G}_i`, the node ID :math:`j` in graph
# :math:`\mathcal{G}_i` corresponds to node ID
# :math:`j + \sum_{k=1}^{i-1} V_k` in the batched graph.
#
# - Therefore, performing feature transformation and message passing on
# the batched graph is equivalent to doing those
# on all ``DGLGraph`` constituents in parallel.
#
# - Duplicate references to the same graph are
# treated as deep copies; the nodes, edges, and features are duplicated,
# and mutation on one reference does not affect the other.
# - The batched graph keeps track of the meta
# information of the constituents so it can be
# :func:`~dgl.batched_graph.unbatch`\ ed to a list of ``DGLGraph``\ s.
#
# Step 2: Tree-LSTM cell with message-passing APIs
# ------------------------------------------------
#
# Researchers have proposed two types of Tree-LSTMs: Child-Sum
# Tree-LSTMs, and :math:`N`-ary Tree-LSTMs. In this tutorial you focus
# on applying *Binary* Tree-LSTM to binarized constituency trees. This
# application is also known as *Constituency Tree-LSTM*. Use PyTorch
# as a backend framework to set up the network.
#
# In :math:`N`-ary Tree-LSTM, each unit at node :math:`j` maintains a hidden
# representation :math:`h_j` and a memory cell :math:`c_j`. The unit
# :math:`j` takes the input vector :math:`x_j` and the hidden
# representations of the child units: :math:`h_{jl}, 1\leq l\leq N` as
# input, then updates its hidden representation :math:`h_j` and memory
# cell :math:`c_j` by:
#
# .. math::
#
# i_j & = & \sigma\left(W^{(i)}x_j + \sum_{l=1}^{N}U^{(i)}_l h_{jl} + b^{(i)}\right), & (1)\\
# f_{jk} & = & \sigma\left(W^{(f)}x_j + \sum_{l=1}^{N}U_{kl}^{(f)} h_{jl} + b^{(f)} \right), & (2)\\
# o_j & = & \sigma\left(W^{(o)}x_j + \sum_{l=1}^{N}U_{l}^{(o)} h_{jl} + b^{(o)} \right), & (3) \\
# u_j & = & \textrm{tanh}\left(W^{(u)}x_j + \sum_{l=1}^{N} U_l^{(u)}h_{jl} + b^{(u)} \right), & (4)\\
# c_j & = & i_j \odot u_j + \sum_{l=1}^{N} f_{jl} \odot c_{jl}, &(5) \\
# h_j & = & o_j \cdot \textrm{tanh}(c_j), &(6) \\
#
# The computation can be decomposed into three phases: ``message_func``,
# ``reduce_func``, and ``apply_node_func``.
#
# .. note::
# ``apply_node_func`` is a new node UDF that has not been introduced before. In
# ``apply_node_func``, a user specifies what to do with node features,
# without considering edge features and messages. In the Tree-LSTM case,
# ``apply_node_func`` is a must, since there exist (leaf) nodes with
# :math:`0` incoming edges, which would not be updated by
# ``reduce_func``.
#
import torch as th
import torch.nn as nn
class TreeLSTMCell(nn.Module):
def __init__(self, x_size, h_size):
super(TreeLSTMCell, self).__init__()
self.W_iou = nn.Linear(x_size, 3 * h_size, bias=False)
self.U_iou = nn.Linear(2 * h_size, 3 * h_size, bias=False)
self.b_iou = nn.Parameter(th.zeros(1, 3 * h_size))
self.U_f = nn.Linear(2 * h_size, 2 * h_size)
def message_func(self, edges):
return {"h": edges.src["h"], "c": edges.src["c"]}
def reduce_func(self, nodes):
# concatenate h_jl for equation (1), (2), (3), (4)
h_cat = nodes.mailbox["h"].view(nodes.mailbox["h"].size(0), -1)
# equation (2)
f = th.sigmoid(self.U_f(h_cat)).view(*nodes.mailbox["h"].size())
# second term of equation (5)
c = th.sum(f * nodes.mailbox["c"], 1)
return {"iou": self.U_iou(h_cat), "c": c}
def apply_node_func(self, nodes):
# equation (1), (3), (4)
iou = nodes.data["iou"] + self.b_iou
i, o, u = th.chunk(iou, 3, 1)
i, o, u = th.sigmoid(i), th.sigmoid(o), th.tanh(u)
# equation (5)
c = i * u + nodes.data["c"]
# equation (6)
h = o * th.tanh(c)
return {"h": h, "c": c}
##############################################################################
# Step 3: Define traversal
# ------------------------
#
# After you define the message-passing functions, induce the
# right order in which to trigger them. This is a significant departure from models
# such as GCN, where all nodes pull messages from their upstream neighbors
# *simultaneously*.
#
# In the case of Tree-LSTM, messages start from the leaves of the tree and
# are propagated and processed upwards until they reach the roots. A visualization
# is as follows:
#
# .. figure:: https://i.loli.net/2018/11/09/5be4b5d2df54d.gif
# :alt:
#
# DGL defines a generator to perform the topological sort; each item is a
# tensor recording the nodes from the bottom level to the roots. One can
# appreciate the degree of parallelism by comparing the following two
# outputs:
#
# to heterogeneous graph
trv_a_tree = dgl.graph(a_tree.edges())
print("Traversing one tree:")
print(dgl.topological_nodes_generator(trv_a_tree))
# to heterogeneous graph
trv_graph = dgl.graph(graph.edges())
print("Traversing many trees at the same time:")
print(dgl.topological_nodes_generator(trv_graph))
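# To quantify the parallelism (an illustrative check): the number of
# message-passing steps equals the number of levels, while the nodes
# within each level are processed in parallel.
levels = list(dgl.topological_nodes_generator(trv_graph))
print("Steps needed:", len(levels))
print("Nodes processed in total:", sum(len(level) for level in levels))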
##############################################################################
# Call :meth:`~dgl.DGLGraph.prop_nodes` to trigger the message passing:
import dgl.function as fn
import torch as th
trv_graph.ndata["a"] = th.ones(graph.number_of_nodes(), 1)
traversal_order = dgl.topological_nodes_generator(trv_graph)
trv_graph.prop_nodes(
traversal_order,
message_func=fn.copy_u("a", "a"),
reduce_func=fn.sum("a", "a"),
)
# the following is syntactic sugar that does the same
# dgl.prop_nodes_topo(graph)
##############################################################################
# .. note::
#
# Before you call :meth:`~dgl.DGLGraph.prop_nodes`, specify a
# ``message_func`` and a ``reduce_func``. This example uses the built-in
# copy-from-source function as the message function and the built-in
# sum function as the reduce function.
#
# Putting it together
# -------------------
#
# Here is the complete code that specifies the ``Tree-LSTM`` class.
#
class TreeLSTM(nn.Module):
def __init__(
self,
num_vocabs,
x_size,
h_size,
num_classes,
dropout,
pretrained_emb=None,
):
super(TreeLSTM, self).__init__()
self.x_size = x_size
self.embedding = nn.Embedding(num_vocabs, x_size)
if pretrained_emb is not None:
print("Using glove")
self.embedding.weight.data.copy_(pretrained_emb)
self.embedding.weight.requires_grad = True
self.dropout = nn.Dropout(dropout)
self.linear = nn.Linear(h_size, num_classes)
self.cell = TreeLSTMCell(x_size, h_size)
def forward(self, batch, h, c):
"""Compute tree-lstm prediction given a batch.
Parameters
----------
batch : dgl.data.SSTBatch
The data batch.
h : Tensor
Initial hidden state.
c : Tensor
Initial cell state.
Returns
-------
logits : Tensor
The prediction of each node.
"""
g = batch.graph
# to heterogeneous graph
g = dgl.graph(g.edges())
# feed embedding
embeds = self.embedding(batch.wordid * batch.mask)
g.ndata["iou"] = self.cell.W_iou(
self.dropout(embeds)
) * batch.mask.float().unsqueeze(-1)
g.ndata["h"] = h
g.ndata["c"] = c
# propagate
dgl.prop_nodes_topo(
g,
message_func=self.cell.message_func,
reduce_func=self.cell.reduce_func,
apply_node_func=self.cell.apply_node_func,
)
# compute logits
h = self.dropout(g.ndata.pop("h"))
logits = self.linear(h)
return logits
import torch.nn.functional as F
##############################################################################
# Main Loop
# ---------
#
# Finally, write the training loop in PyTorch.
#
from torch.utils.data import DataLoader
device = th.device("cpu")
# hyperparameters
x_size = 256
h_size = 256
dropout = 0.5
lr = 0.05
weight_decay = 1e-4
epochs = 10
# create the model
model = TreeLSTM(
trainset.vocab_size, x_size, h_size, trainset.num_classes, dropout
)
print(model)
# create the optimizer
optimizer = th.optim.Adagrad(
model.parameters(), lr=lr, weight_decay=weight_decay
)
def batcher(dev):
def batcher_dev(batch):
batch_trees = dgl.batch(batch)
return SSTBatch(
graph=batch_trees,
mask=batch_trees.ndata["mask"].to(device),
wordid=batch_trees.ndata["x"].to(device),
label=batch_trees.ndata["y"].to(device),
)
return batcher_dev
train_loader = DataLoader(
dataset=tiny_sst,
batch_size=5,
collate_fn=batcher(device),
shuffle=False,
num_workers=0,
)
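# Peek at one collated batch (illustrative): ``mask`` marks the leaf (word)
# nodes whose embeddings are used; internal nodes are masked out.
example_batch = next(iter(train_loader))
print("Nodes in the batch:", example_batch.graph.number_of_nodes())
print("Leaf (word) nodes:", int(example_batch.mask.sum()))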
# training loop
for epoch in range(epochs):
for step, batch in enumerate(train_loader):
g = batch.graph
n = g.number_of_nodes()
h = th.zeros((n, h_size))
c = th.zeros((n, h_size))
logits = model(batch, h, c)
logp = F.log_softmax(logits, 1)
loss = F.nll_loss(logp, batch.label, reduction="sum")
optimizer.zero_grad()
loss.backward()
optimizer.step()
pred = th.argmax(logits, 1)
acc = float(th.sum(th.eq(batch.label, pred))) / len(batch.label)
print(
"Epoch {:05d} | Step {:05d} | Loss {:.4f} | Acc {:.4f} |".format(
epoch, step, loss.item(), acc
)
)
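# After training on the tiny set, you can run the model in evaluation mode
# (a minimal inference sketch that reuses the same loader).
model.eval()
with th.no_grad():
    for batch in train_loader:
        n = batch.graph.number_of_nodes()
        h = th.zeros((n, h_size))
        c = th.zeros((n, h_size))
        pred = th.argmax(model(batch, h, c), 1)
        print("Predicted sentiment classes:", pred.tolist())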
##############################################################################
# To train the model on a full dataset with different settings (such as CPU or GPU),
# refer to the `PyTorch example <https://github.com/dmlc/dgl/tree/master/examples/pytorch/tree_lstm>`__.
# There is also an implementation of the Child-Sum Tree-LSTM.
@@ -14,764 +14,780 @@ Generative Models of Graphs
efficiency. For recommended implementation, please refer to the `official
examples <https://github.com/dmlc/dgl/tree/master/examples>`_.
"""
##############################################################################
#
# In this tutorial, you learn how to train and generate one graph at
# a time. You also explore parallelism within the graph embedding operation, which is an
# essential building block. The tutorial ends with a simple optimization that
# delivers double the speed by batching across graphs.
#
# Earlier tutorials showed how embedding a graph or
# a node enables you to work on tasks such as `semi-supervised classification for nodes
# <http://docs.dgl.ai/tutorials/models/1_gcn.html#sphx-glr-tutorials-models-1-gcn-py>`__
# or `sentiment analysis
# <http://docs.dgl.ai/tutorials/models/3_tree-lstm.html#sphx-glr-tutorials-models-3-tree-lstm-py>`__.
# Wouldn't it be interesting to predict the future evolution of the graph and
# perform the analysis iteratively?
#
# To address the evolution of the graphs, you generate a variety of graph samples. In other words, you need
# **generative models** of graphs. In addition to learning
# node and edge features, you need to model the distribution of arbitrary graphs.
# While general generative models can model the density function explicitly or
# implicitly and generate samples all at once or sequentially, here you focus
# only on explicit generative models with sequential generation. Typical applications
# include drug or materials discovery, chemical processes, or proteomics.
#
# Introduction
# --------------------
# The primitive actions of mutating a graph in Deep Graph Library (DGL) are nothing more than ``add_nodes``
# and ``add_edges``. That is, if you were to draw a circle of three nodes,
#
# .. figure:: https://user-images.githubusercontent.com/19576924/48313438-78baf000-e5f7-11e8-931e-cd00ab34fa50.gif
# :alt:
#
# you can write the code as follows.
#
import os
os.environ['DGLBACKEND'] = 'pytorch'
import dgl
g = dgl.DGLGraph()
g.add_nodes(1) # Add node 0
g.add_nodes(1) # Add node 1
# Edges in DGLGraph are directed by default.
# For undirected edges, add edges for both directions.
g.add_edges([1, 0], [0, 1]) # Add edges (1, 0), (0, 1)
g.add_nodes(1) # Add node 2
g.add_edges([2, 1], [1, 2]) # Add edges (2, 1), (1, 2)
g.add_edges([2, 0], [0, 2]) # Add edges (2, 0), (0, 2)
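# A quick inspection of the resulting cycle (illustrative): three nodes and
# six directed edges, since each undirected edge is stored in both directions.
print('Nodes:', g.number_of_nodes())
print('Edges:', g.number_of_edges())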
#######################################################################################
# Real-world graphs are much more complex. There are many families of graphs,
# with different sizes, topologies, node types, edge types, and the possibility
# of multigraphs. Besides, the same graph can be generated in many different
# orders. Regardless, the generative process entails a few steps.
#
# - Encode a changing graph.
# - Perform actions stochastically.
# - If you are training, collect error signals and optimize the model parameters.
#
# When it comes to implementation, another important aspect is speed. How do you
# parallelize the computation, given that generating a graph is fundamentally a
# sequential process?
#
# .. note::
#
# To be sure, this is not necessarily a hard constraint. Subgraphs can be
# built in parallel and then assembled. But this tutorial
# restricts itself to sequential processes.
#
#
# DGMG: The main flow
# --------------------
# For this tutorial, you use
# `Deep Generative Models of Graphs <https://arxiv.org/abs/1803.03324>`__
# (DGMG) to implement a graph generative model using DGL. Its algorithmic
# framework is general but also challenging to parallelize.
#
# .. note::
#
# While it's possible for DGMG to handle complex graphs with typed nodes,
# typed edges, and multigraphs, here you use a simplified version of it
# for generating graph topologies.
#
# DGMG generates a graph by following a state machine, which is basically a
# two-level loop. Generate one node at a time and connect it to a subset of
# the existing nodes, one at a time. This is similar to language modeling. The
# generative process is an iterative one that emits one word or character or sentence
# at a time, conditioned on the sequence generated so far.
#
# At each time step, you either:
#
# - Add a new node to the graph
# - Select two existing nodes and add an edge between them
#
# .. figure:: https://user-images.githubusercontent.com/19576924/48605003-7f11e900-e9b6-11e8-8880-87362348e154.png
# :alt:
#
# The Python code will look as follows. In fact, this is *exactly* how inference
# with DGMG is implemented in DGL.
#
def forward_inference(self):
stop = self.add_node_and_update()
while (not stop) and (self.g.number_of_nodes() < self.v_max + 1):
num_trials = 0
to_add_edge = self.add_edge_or_not()
while to_add_edge and (num_trials < self.g.number_of_nodes() - 1):
self.choose_dest_and_update()
num_trials += 1
to_add_edge = self.add_edge_or_not()
stop = self.add_node_and_update()
return self.g
#######################################################################################
# Assume you have a pre-trained model for generating cycles of 10-20 nodes.
# How does it generate a cycle on-the-fly during inference? Use the code below
# to create an animation with your own model.
#
# ::
#
# import torch
# import matplotlib.animation as animation
# import matplotlib.pyplot as plt
# import networkx as nx
# from copy import deepcopy
#
# if __name__ == '__main__':
# # pre-trained model saved with path ./model.pth
# model = torch.load('./model.pth')
# model.eval()
# g = model()
#
# src_list = g.edges()[1]
# dest_list = g.edges()[0]
#
# evolution = []
#
# nx_g = nx.Graph()
# evolution.append(deepcopy(nx_g))
#
# for i in range(0, len(src_list), 2):
# src = src_list[i].item()
# dest = dest_list[i].item()
# if src not in nx_g.nodes():
# nx_g.add_node(src)
# evolution.append(deepcopy(nx_g))
# if dest not in nx_g.nodes():
# nx_g.add_node(dest)
# evolution.append(deepcopy(nx_g))
# nx_g.add_edges_from([(src, dest), (dest, src)])
# evolution.append(deepcopy(nx_g))
#
# def animate(i):
# ax.cla()
# g_t = evolution[i]
# nx.draw_circular(g_t, with_labels=True, ax=ax,
# node_color=['#FEBD69'] * g_t.number_of_nodes())
#
# fig, ax = plt.subplots()
# ani = animation.FuncAnimation(fig, animate,
# frames=len(evolution),
# interval=600)
#
# .. figure:: https://user-images.githubusercontent.com/19576924/48928548-2644d200-ef1b-11e8-8591-da93345382ad.gif
# :alt:
#
# DGMG: Optimization objective
# ------------------------------
# Similar to language modeling, DGMG trains the model with *behavior cloning*,
# or *teacher forcing*. Assume for each graph there exists a sequence of
# *oracle actions* :math:`a_{1},\cdots,a_{T}` that generates it. What the model
# does is follow these actions, compute the joint probability of such an
# action sequence, and maximize it.
#
# By chain rule, the probability of taking :math:`a_{1},\cdots,a_{T}` is:
#
# .. math::
#
# p(a_{1},\cdots, a_{T}) = p(a_{1})p(a_{2}|a_{1})\cdots p(a_{T}|a_{1},\cdots,a_{T-1}).\\
#
# The optimization objective is then simply the typical MLE loss:
#
# .. math::
#
# -\log p(a_{1},\cdots,a_{T})=-\sum_{t=1}^{T}\log p(a_{t}|a_{1},\cdots, a_{t-1}).\\
#
def forward_train(self, actions):
"""
- actions: list
- Contains a_1, ..., a_T described above
- self.prepare_for_train()
- Initializes self.action_step to be 0, which will get
incremented by 1 every time it is called.
- Initializes objects recording log p(a_t|a_1,...a_{t-1})
Returns
-------
- self.get_log_prob(): log p(a_1, ..., a_T)
"""
self.prepare_for_train()
stop = self.add_node_and_update(a=actions[self.action_step])
while not stop:
to_add_edge = self.add_edge_or_not(a=actions[self.action_step])
while to_add_edge:
self.choose_dest_and_update(a=actions[self.action_step])
to_add_edge = self.add_edge_or_not(a=actions[self.action_step])
stop = self.add_node_and_update(a=actions[self.action_step])
return self.get_log_prob()
#######################################################################################
# The key difference between ``forward_train`` and ``forward_inference`` is
# that the training process takes oracle actions as input and returns log
# probabilities for evaluating the loss.
#
# DGMG: The implementation
# --------------------------
# The ``DGMG`` class
# ``````````````````````````
# Below you can find the skeleton code for the model. You gradually
# fill in the details for each function.
#
import torch.nn as nn
class DGMGSkeleton(nn.Module):
def __init__(self, v_max):
"""
Parameters
----------
v_max: int
Max number of nodes considered
"""
super(DGMGSkeleton, self).__init__()
# Graph configuration
self.v_max = v_max
def add_node_and_update(self, a=None):
"""Decide if to add a new node.
If a new node should be added, update the graph."""
return NotImplementedError
def add_edge_or_not(self, a=None):
"""Decide if a new edge should be added."""
return NotImplementedError
def choose_dest_and_update(self, a=None):
"""Choose destination and connect it to the latest node.
Add edges for both directions and update the graph."""
return NotImplementedError
def forward_train(self, actions):
"""Forward at training time. It records the probability
of generating a ground truth graph following the actions."""
return NotImplementedError
def forward_inference(self):
"""Forward at inference time.
It generates graphs on the fly."""
return NotImplementedError
def forward(self, actions=None):
# The graph you will work on
self.g = dgl.DGLGraph()
# If there are some features for nodes and edges,
# zero tensors will be set for those of new nodes and edges.
self.g.set_n_initializer(dgl.frame.zero_initializer)
self.g.set_e_initializer(dgl.frame.zero_initializer)
if self.training:
return self.forward_train(actions=actions)
else:
return self.forward_inference()
#######################################################################################
# Encoding a dynamic graph
# ``````````````````````````
# All the actions generating a graph are sampled from probability
# distributions. In order to do that, you project the structured data,
# namely the graph, onto a Euclidean space. The challenge is that such a
# process, called *embedding*, needs to be repeated as the graph mutates.
#
# Graph embedding
# ''''''''''''''''''''''''''
# Let :math:`G=(V,E)` be an arbitrary graph. Each node :math:`v` has an
# embedding vector :math:`\textbf{h}_{v} \in \mathbb{R}^{n}`. Similarly,
# the graph has an embedding vector :math:`\textbf{h}_{G} \in \mathbb{R}^{k}`.
# Typically, :math:`k > n` since a graph contains more information than
# an individual node.
#
# The graph embedding is a weighted sum of node embeddings under a linear
# transformation:
#
# .. math::
#
# \textbf{h}_{G} =\sum_{v\in V}\text{Sigmoid}(g_m(\textbf{h}_{v}))f_{m}(\textbf{h}_{v}),\\
#
# The first term, :math:`\text{Sigmoid}(g_m(\textbf{h}_{v}))`, computes a
# gating function and can be thought of as how much the overall graph embedding
# attends to each node. The second term :math:`f_{m}:\mathbb{R}^{n}\rightarrow\mathbb{R}^{k}`
# maps the node embeddings to the space of graph embeddings.
#
# Implement graph embedding as a ``GraphEmbed`` class.
#
import torch
class GraphEmbed(nn.Module):
def __init__(self, node_hidden_size):
super(GraphEmbed, self).__init__()
# Setting from the paper
self.graph_hidden_size = 2 * node_hidden_size
# Embed graphs
self.node_gating = nn.Sequential(
nn.Linear(node_hidden_size, 1),
nn.Sigmoid()
)
self.node_to_graph = nn.Linear(node_hidden_size,
self.graph_hidden_size)
def forward(self, g):
if g.number_of_nodes() == 0:
return torch.zeros(1, self.graph_hidden_size)
else:
# Node features are stored as hv in ndata.
hvs = g.ndata['hv']
return (self.node_gating(hvs) *
self.node_to_graph(hvs)).sum(0, keepdim=True)
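# A small illustrative check (hypothetical sizes): on an empty graph the
# module returns the all-zero embedding of size 2 * node_hidden_size.
_graph_embed = GraphEmbed(node_hidden_size=16)
print(_graph_embed(dgl.DGLGraph()).shape)  # torch.Size([1, 32])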
#######################################################################################
# Update node embeddings via graph propagation
# '''''''''''''''''''''''''''''''''''''''''''''
#
# The mechanism of updating node embeddings in DGMG is similar to that for
# graph convolutional networks. For a node :math:`v` in the graph, its
# neighbor :math:`u` sends a message to it with
#
# .. math::
#
# \textbf{m}_{u\rightarrow v}=\textbf{W}_{m}\text{concat}([\textbf{h}_{v}, \textbf{h}_{u}, \textbf{x}_{u, v}]) + \textbf{b}_{m},\\
#
# where :math:`\textbf{x}_{u,v}` is the embedding of the edge between
# :math:`u` and :math:`v`.
#
# After receiving messages from all its neighbors, :math:`v` summarizes them
# with a node activation vector
#
# .. math::
#
# \textbf{a}_{v} = \sum_{u: (u, v)\in E}\textbf{m}_{u\rightarrow v}\\
#
# and uses this information to update its own feature:
#
# .. math::
#
# \textbf{h}'_{v} = \textbf{GRU}(\textbf{h}_{v}, \textbf{a}_{v}).\\
#
# Performing all the operations above once for all nodes synchronously is
# called one round of graph propagation. The more rounds of graph propagation
# you perform, the farther messages travel throughout the graph.
#
# With DGL, you implement graph propagation with ``g.update_all``.
# The message notation here can be a bit confusing. Researchers can refer
# to :math:`\textbf{m}_{u\rightarrow v}` as messages; however, the message function
# below only passes :math:`\text{concat}([\textbf{h}_{u}, \textbf{x}_{u, v}])`.
# The operation :math:`\textbf{W}_{m}\text{concat}([\textbf{h}_{v}, \textbf{h}_{u}, \textbf{x}_{u, v}]) + \textbf{b}_{m}`
# is then performed across all edges at once for efficiency.
#
from functools import partial
class GraphProp(nn.Module):
def __init__(self, num_prop_rounds, node_hidden_size):
super(GraphProp, self).__init__()
self.num_prop_rounds = num_prop_rounds
# Setting from the paper
self.node_activation_hidden_size = 2 * node_hidden_size
message_funcs = []
node_update_funcs = []
self.reduce_funcs = []
for t in range(num_prop_rounds):
# input being [hv, hu, xuv]
message_funcs.append(nn.Linear(2 * node_hidden_size + 1,
self.node_activation_hidden_size))
self.reduce_funcs.append(partial(self.dgmg_reduce, round=t))
node_update_funcs.append(
nn.GRUCell(self.node_activation_hidden_size,
node_hidden_size))
self.message_funcs = nn.ModuleList(message_funcs)
self.node_update_funcs = nn.ModuleList(node_update_funcs)
def dgmg_msg(self, edges):
"""For an edge u->v, return concat([h_u, x_uv])"""
return {'m': torch.cat([edges.src['hv'],
edges.data['he']],
dim=1)}
def dgmg_reduce(self, nodes, round):
hv_old = nodes.data['hv']
m = nodes.mailbox['m']
message = torch.cat([
hv_old.unsqueeze(1).expand(-1, m.size(1), -1), m], dim=2)
node_activation = (self.message_funcs[round](message)).sum(1)
return {'a': node_activation}
def forward(self, g):
if g.number_of_edges() > 0:
for t in range(self.num_prop_rounds):
g.update_all(message_func=self.dgmg_msg,
reduce_func=self.reduce_funcs[t])
g.ndata['hv'] = self.node_update_funcs[t](
g.ndata['a'], g.ndata['hv'])
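# An illustrative run of GraphProp (a sketch with hypothetical feature sizes):
# reuse the three-node cycle `g` built earlier, attach random node features
# 'hv' and constant edge features 'he', then propagate for two rounds.
_prop = GraphProp(num_prop_rounds=2, node_hidden_size=16)
g.ndata['hv'] = torch.randn(g.number_of_nodes(), 16)
g.edata['he'] = torch.ones(g.number_of_edges(), 1)
_prop(g)
print(g.ndata['hv'].shape)  # torch.Size([3, 16]), the updated node embeddings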
#######################################################################################
# Actions
# ``````````````````````````
# All actions are sampled from distributions parameterized by neural networks.
# Here they are, in turn.
#
# Action 1: Add nodes
# ''''''''''''''''''''''''''
#
# Given the graph embedding vector :math:`\textbf{h}_{G}`, evaluate
#
# .. math::
#
# \text{Sigmoid}(\textbf{W}_{\text{add node}}\textbf{h}_{G}+b_{\text{add node}}),\\
#
# which is then used to parametrize a Bernoulli distribution for deciding whether
# to add a new node.
#
# If a new node is to be added, initialize its feature with
#
# .. math::
#
# \textbf{W}_{\text{init}}\text{concat}([\textbf{h}_{\text{init}} , \textbf{h}_{G}])+\textbf{b}_{\text{init}},\\
#
# where :math:`\textbf{h}_{\text{init}}` is a learnable embedding module for
# untyped nodes.
#
import torch.nn.functional as F
from torch.distributions import Bernoulli
def bernoulli_action_log_prob(logit, action):
"""Calculate the log p of an action with respect to a Bernoulli
distribution. Use logit rather than prob for numerical stability."""
if action == 0:
return F.logsigmoid(-logit)
else:
return F.logsigmoid(logit)
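# Sanity check (illustrative): with a logit of 0 both actions have
# probability 0.5, so the log-probability is log(0.5), roughly -0.693.
print(bernoulli_action_log_prob(torch.tensor([0.0]), 1))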
class AddNode(nn.Module):
def __init__(self, graph_embed_func, node_hidden_size):
super(AddNode, self).__init__()
self.graph_op = {'embed': graph_embed_func}
self.stop = 1
self.add_node = nn.Linear(graph_embed_func.graph_hidden_size, 1)
# If a node is to be added, initialize its hv
self.node_type_embed = nn.Embedding(1, node_hidden_size)
self.initialize_hv = nn.Linear(node_hidden_size + \
graph_embed_func.graph_hidden_size,
node_hidden_size)
self.init_node_activation = torch.zeros(1, 2 * node_hidden_size)
def _initialize_node_repr(self, g, node_type, graph_embed):
"""Whenver a node is added, initialize its representation."""
num_nodes = g.number_of_nodes()
hv_init = self.initialize_hv(
torch.cat([
self.node_type_embed(torch.LongTensor([node_type])),
graph_embed], dim=1))
g.nodes[num_nodes - 1].data['hv'] = hv_init
g.nodes[num_nodes - 1].data['a'] = self.init_node_activation
def prepare_training(self):
self.log_prob = []
def forward(self, g, action=None):
graph_embed = self.graph_op['embed'](g)
logit = self.add_node(graph_embed)
prob = torch.sigmoid(logit)
if not self.training:
action = Bernoulli(prob).sample().item()
stop = bool(action == self.stop)
if not stop:
g.add_nodes(1)
self._initialize_node_repr(g, action, graph_embed)
if self.training:
sample_log_prob = bernoulli_action_log_prob(logit, action)
self.log_prob.append(sample_log_prob)
return stop
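# A quick look at AddNode's decision layer (illustrative, hypothetical sizes):
# it maps the 2 * node_hidden_size graph embedding to a single add-node logit.
print(AddNode(GraphEmbed(node_hidden_size=16), node_hidden_size=16).add_node)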
#######################################################################################
# Action 2: Add edges
# ''''''''''''''''''''''''''
#
# Given the graph embedding vector :math:`\textbf{h}_{G}` and the node
# embedding vector :math:`\textbf{h}_{v}` for the latest node :math:`v`,
# you evaluate
#
# .. math::
#
# \text{Sigmoid}(\textbf{W}_{\text{add edge}}\text{concat}([\textbf{h}_{G}, \textbf{h}_{v}])+b_{\text{add edge}}),\\
#
# which is then used to parametrize a Bernoulli distribution for deciding
# whether to add a new edge starting from :math:`v`.
#
class AddEdge(nn.Module):
def __init__(self, graph_embed_func, node_hidden_size):
super(AddEdge, self).__init__()
self.graph_op = {'embed': graph_embed_func}
self.add_edge = nn.Linear(graph_embed_func.graph_hidden_size + \
node_hidden_size, 1)
def prepare_training(self):
self.log_prob = []
def forward(self, g, action=None):
graph_embed = self.graph_op['embed'](g)
src_embed = g.nodes[g.number_of_nodes() - 1].data['hv']
logit = self.add_edge(torch.cat(
[graph_embed, src_embed], dim=1))
prob = torch.sigmoid(logit)
if self.training:
sample_log_prob = bernoulli_action_log_prob(logit, action)
self.log_prob.append(sample_log_prob)
else:
action = Bernoulli(prob).sample().item()
to_add_edge = bool(action == 0)
return to_add_edge
#######################################################################################
# Action 3: Choose a destination
# '''''''''''''''''''''''''''''''''
#
# When action 2 returns `True`, choose a destination for the
# latest node :math:`v`.
#
# For each possible destination :math:`u\in\{0, \cdots, v-1\}`, the
# probability of choosing it is given by
#
# .. math::
#
# \frac{\text{exp}(\textbf{W}_{\text{dest}}\text{concat}([\textbf{h}_{u}, \textbf{h}_{v}])+\textbf{b}_{\text{dest}})}{\sum_{i=0}^{v-1}\text{exp}(\textbf{W}_{\text{dest}}\text{concat}([\textbf{h}_{i}, \textbf{h}_{v}])+\textbf{b}_{\text{dest}})}\\
#
from torch.distributions import Categorical
class ChooseDestAndUpdate(nn.Module):
def __init__(self, graph_prop_func, node_hidden_size):
super(ChooseDestAndUpdate, self).__init__()
self.graph_op = {'prop': graph_prop_func}
self.choose_dest = nn.Linear(2 * node_hidden_size, 1)
def _initialize_edge_repr(self, g, src_list, dest_list):
# For untyped edges, only add 1 to indicate its existence.
# For multiple edge types, use a one-hot representation
# or an embedding module.
edge_repr = torch.ones(len(src_list), 1)
g.edges[src_list, dest_list].data['he'] = edge_repr
def prepare_training(self):
self.log_prob = []
def forward(self, g, dest):
src = g.number_of_nodes() - 1
possible_dests = range(src)
src_embed_expand = g.nodes[src].data['hv'].expand(src, -1)
possible_dests_embed = g.nodes[possible_dests].data['hv']
dests_scores = self.choose_dest(
torch.cat([possible_dests_embed,
src_embed_expand], dim=1)).view(1, -1)
dests_probs = F.softmax(dests_scores, dim=1)
if not self.training:
dest = Categorical(dests_probs).sample().item()
if not g.has_edges_between(src, dest):
# For undirected graphs, add edges for both directions
# so that you can perform graph propagation.
src_list = [src, dest]
dest_list = [dest, src]
g.add_edges(src_list, dest_list)
self._initialize_edge_repr(g, src_list, dest_list)
self.graph_op['prop'](g)
if self.training:
if dests_probs.nelement() > 1:
self.log_prob.append(
F.log_softmax(dests_scores, dim=1)[:, dest: dest + 1])
#######################################################################################
# Putting it together
# ``````````````````````````
#
# You are now ready to have a complete implementation of the model class.
#
class DGMG(DGMGSkeleton):
def __init__(self, v_max, node_hidden_size,
num_prop_rounds):
super(DGMG, self).__init__(v_max)
# Graph embedding module
self.graph_embed = GraphEmbed(node_hidden_size)
# Graph propagation module
self.graph_prop = GraphProp(num_prop_rounds,
node_hidden_size)
# Actions
self.add_node_agent = AddNode(
self.graph_embed, node_hidden_size)
self.add_edge_agent = AddEdge(
self.graph_embed, node_hidden_size)
self.choose_dest_agent = ChooseDestAndUpdate(
self.graph_prop, node_hidden_size)
# Forward functions
self.forward_train = partial(forward_train, self=self)
self.forward_inference = partial(forward_inference, self=self)
@property
def action_step(self):
old_step_count = self.step_count
self.step_count += 1
return old_step_count
def prepare_for_train(self):
self.step_count = 0
self.add_node_agent.prepare_training()
self.add_edge_agent.prepare_training()
self.choose_dest_agent.prepare_training()
def add_node_and_update(self, a=None):
"""Decide if to add a new node.
If a new node should be added, update the graph."""
return self.add_node_agent(self.g, a)
def add_edge_or_not(self, a=None):
"""Decide if a new edge should be added."""
return self.add_edge_agent(self.g, a)
def choose_dest_and_update(self, a=None):
"""Choose destination and connect it to the latest node.
Add edges for both directions and update the graph."""
self.choose_dest_agent(self.g, a)
def get_log_prob(self):
add_node_log_p = torch.cat(self.add_node_agent.log_prob).sum()
add_edge_log_p = torch.cat(self.add_edge_agent.log_prob).sum()
choose_dest_log_p = torch.cat(self.choose_dest_agent.log_prob).sum()
return add_node_log_p + add_edge_log_p + choose_dest_log_p
#######################################################################################
# Below is an animation where a graph is generated on the fly
# after every 10 batches of training for the first 400 batches. You
# can see how the model improves over time and begins generating cycles.
#
# .. figure:: https://user-images.githubusercontent.com/19576924/48929291-60fe3880-ef22-11e8-832a-fbe56656559a.gif
# :alt:
#
# For generative models, you can evaluate performance by checking the percentage
# of valid graphs among the graphs it generates on the fly.
import torch.utils.model_zoo as model_zoo
# Download a pre-trained model state dict for generating cycles with 10-20 nodes.
state_dict = model_zoo.load_url('https://data.dgl.ai/model/dgmg_cycles-5a0c40be.pth')
model = DGMG(v_max=20, node_hidden_size=16, num_prop_rounds=2)
model.load_state_dict(state_dict)
model.eval()
def is_valid(g):
# Check if g is a cycle having 10-20 nodes.
def _get_previous(i, v_max):
if i == 0:
return v_max
else:
return i - 1
def _get_next(i, v_max):
if i == v_max:
return 0
else:
return i + 1
size = g.number_of_nodes()
if size < 10 or size > 20:
return False
for node in range(size):
neighbors = g.successors(node)
if len(neighbors) != 2:
return False
if _get_previous(node, size - 1) not in neighbors:
return False
if _get_next(node, size - 1) not in neighbors:
return False
return True
num_valid = 0
for i in range(100):
g = model()
num_valid += is_valid(g)
del model
print('Among 100 graphs generated, {}% are valid.'.format(num_valid))
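# As a further illustrative check, is_valid accepts a hand-built cycle of a
# valid size (here 12 nodes, with both edge directions added).
cycle = dgl.DGLGraph()
cycle.add_nodes(12)
src = list(range(12))
dst = [(i + 1) % 12 for i in range(12)]
cycle.add_edges(src + dst, dst + src)
print('Hand-built 12-node cycle is valid:', is_valid(cycle))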
#######################################################################################
# For the complete implementation, see the `DGL DGMG example
# <https://github.com/dmlc/dgl/tree/master/examples/pytorch/dgmg>`__.
#
"""
##############################################################################
#
# In this tutorial, you learn how to train and generate one graph at
# a time. You also explore parallelism within the graph embedding operation, which is an
# essential building block. The tutorial ends with a simple optimization that
# delivers double the speed by batching across graphs.
#
# Earlier tutorials showed how embedding a graph or
# a node enables you to work on tasks such as `semi-supervised classification for nodes
# <http://docs.dgl.ai/tutorials/models/1_gcn.html#sphx-glr-tutorials-models-1-gcn-py>`__
# or `sentiment analysis
# <http://docs.dgl.ai/tutorials/models/3_tree-lstm.html#sphx-glr-tutorials-models-3-tree-lstm-py>`__.
# Wouldn't it be interesting to predict the future evolution of the graph and
# perform the analysis iteratively?
#
# To address the evolution of the graphs, you generate a variety of graph samples. In other words, you need
# **generative models** of graphs. In addition to learning
# node and edge features, you need to model the distribution of arbitrary graphs.
# While general generative models can model the density function explicitly or
# implicitly and generate samples all at once or sequentially, here you focus
# only on explicit generative models with sequential generation. Typical applications
# include drug or materials discovery, chemical processes, or proteomics.
#
# Introduction
# --------------------
# The primitive actions of mutating a graph in Deep Graph Library (DGL) are nothing more than ``add_nodes``
# and ``add_edges``. That is, if you were to draw a circle of three nodes,
#
# .. figure:: https://user-images.githubusercontent.com/19576924/48313438-78baf000-e5f7-11e8-931e-cd00ab34fa50.gif
# :alt:
#
# you can write the code as follows.
#
import os
os.environ["DGLBACKEND"] = "pytorch"
import dgl
g = dgl.DGLGraph()
g.add_nodes(1) # Add node 0
g.add_nodes(1) # Add node 1
# Edges in DGLGraph are directed by default.
# For undirected edges, add edges for both directions.
g.add_edges([1, 0], [0, 1]) # Add edges (1, 0), (0, 1)
g.add_nodes(1) # Add node 2
g.add_edges([2, 1], [1, 2]) # Add edges (2, 1), (1, 2)
g.add_edges([2, 0], [0, 2]) # Add edges (2, 0), (0, 2)
#######################################################################################
# Real-world graphs are much more complex. There are many families of graphs,
# with different sizes, topologies, node types, edge types, and the possibility
# of multigraphs. Besides, the same graph can be generated in many different
# orders. Regardless, the generative process entails a few steps.
#
# - Encode a changing graph.
# - Perform actions stochastically.
# - If you are training, collect error signals and optimize the model parameters.
#
# When it comes to implementation, another important aspect is speed. How do you
# parallelize the computation, given that generating a graph is fundamentally a
# sequential process?
#
# .. note::
#
# To be sure, this is not necessarily a hard constraint. Subgraphs can be
# built in parallel and then assembled. But this tutorial
# restricts itself to sequential processes.
#
#
# DGMG: The main flow
# --------------------
# For this tutorial, you use
# `Deep Generative Models of Graphs <https://arxiv.org/abs/1803.03324>`__
# (DGMG) to implement a graph generative model using DGL. Its algorithmic
# framework is general but also challenging to parallelize.
#
# .. note::
#
# While it's possible for DGMG to handle complex graphs with typed nodes,
# typed edges, and multigraphs, here you use a simplified version of it
# for generating graph topologies.
#
# DGMG generates a graph by following a state machine, which is basically a
# two-level loop. Generate one node at a time and connect it to a subset of
# the existing nodes, one at a time. This is similar to language modeling. The
# generative process is an iterative one that emits one word or character or sentence
# at a time, conditioned on the sequence generated so far.
#
# At each time step, you either:
#
# - Add a new node to the graph
# - Select two existing nodes and add an edge between them
#
# .. figure:: https://user-images.githubusercontent.com/19576924/48605003-7f11e900-e9b6-11e8-8880-87362348e154.png
# :alt:
#
# The Python code will look as follows. In fact, this is *exactly* how inference
# with DGMG is implemented in DGL.
#
def forward_inference(self):
stop = self.add_node_and_update()
while (not stop) and (self.g.number_of_nodes() < self.v_max + 1):
num_trials = 0
to_add_edge = self.add_edge_or_not()
while to_add_edge and (num_trials < self.g.number_of_nodes() - 1):
self.choose_dest_and_update()
num_trials += 1
to_add_edge = self.add_edge_or_not()
stop = self.add_node_and_update()
return self.g
#######################################################################################
# Assume you have a pre-trained model for generating cycles of 10-20 nodes.
# How does it generate a cycle on-the-fly during inference? Use the code below
# to create an animation with your own model.
#
# ::
#
# import torch
# import matplotlib.animation as animation
# import matplotlib.pyplot as plt
# import networkx as nx
# from copy import deepcopy
#
# if __name__ == '__main__':
# # pre-trained model saved with path ./model.pth
# model = torch.load('./model.pth')
# model.eval()
# g = model()
#
# src_list = g.edges()[1]
# dest_list = g.edges()[0]
#
# evolution = []
#
# nx_g = nx.Graph()
# evolution.append(deepcopy(nx_g))
#
# for i in range(0, len(src_list), 2):
# src = src_list[i].item()
# dest = dest_list[i].item()
# if src not in nx_g.nodes():
# nx_g.add_node(src)
# evolution.append(deepcopy(nx_g))
# if dest not in nx_g.nodes():
# nx_g.add_node(dest)
# evolution.append(deepcopy(nx_g))
# nx_g.add_edges_from([(src, dest), (dest, src)])
# evolution.append(deepcopy(nx_g))
#
# def animate(i):
# ax.cla()
# g_t = evolution[i]
# nx.draw_circular(g_t, with_labels=True, ax=ax,
# node_color=['#FEBD69'] * g_t.number_of_nodes())
#
# fig, ax = plt.subplots()
# ani = animation.FuncAnimation(fig, animate,
# frames=len(evolution),
# interval=600)
#
# .. figure:: https://user-images.githubusercontent.com/19576924/48928548-2644d200-ef1b-11e8-8591-da93345382ad.gif
# :alt:
#
# DGMG: Optimization objective
# ------------------------------
# Similar to language modeling, DGMG trains the model with *behavior cloning*,
# or *teacher forcing*. Assume for each graph there exists a sequence of
# *oracle actions* :math:`a_{1},\cdots,a_{T}` that generates it. What the model
# does is follow these actions, compute the joint probability of such an
# action sequence, and maximize it.
#
# By the chain rule, the probability of taking :math:`a_{1},\cdots,a_{T}` is:
#
# .. math::
#
# p(a_{1},\cdots, a_{T}) = p(a_{1})p(a_{2}|a_{1})\cdots p(a_{T}|a_{1},\cdots,a_{T-1}).\\
#
# The optimization objective is then simply the typical MLE loss:
#
# .. math::
#
# -\log p(a_{1},\cdots,a_{T})=-\sum_{t=1}^{T}\log p(a_{t}|a_{1},\cdots, a_{t-1}).\\
#
def forward_train(self, actions):
"""
- actions: list
- Contains a_1, ..., a_T described above
- self.prepare_for_train()
- Initializes self.action_step to be 0, which will get
incremented by 1 every time it is called.
- Initializes objects recording log p(a_t|a_1,...a_{t-1})
Returns
-------
- self.get_log_prob(): log p(a_1, ..., a_T)
"""
self.prepare_for_train()
stop = self.add_node_and_update(a=actions[self.action_step])
while not stop:
to_add_edge = self.add_edge_or_not(a=actions[self.action_step])
while to_add_edge:
self.choose_dest_and_update(a=actions[self.action_step])
to_add_edge = self.add_edge_or_not(a=actions[self.action_step])
stop = self.add_node_and_update(a=actions[self.action_step])
return self.get_log_prob()
#######################################################################################
# The key difference between ``forward_train`` and ``forward_inference`` is
# that the training process takes oracle actions as input and returns log
# probabilities for evaluating the loss.
#
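# For concreteness, here is what an oracle action sequence could look like under
# the encoding used by the action modules defined later in this tutorial: ``0``
# means "add a node" or "add an edge", ``1`` means "stop", and the action right
# after an "add an edge" decision is the index of the chosen destination node.
# This is an illustrative sketch rather than data taken from a real dataset.
#
# ::
#
#     # A possible oracle sequence for a triangle over nodes 0, 1, 2:
#     # add node 0; no edge; add node 1; add an edge (to 0); stop adding edges;
#     # add node 2; add an edge (to 1); add an edge (to 0); stop adding edges;
#     # stop adding nodes.
#     actions = [0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1]
#
#     # Assuming ``model`` is an instance of the ``DGMG`` class assembled at the
#     # end of this tutorial:
#     log_prob = model(actions=actions)
#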
# DGMG: The implementation
# --------------------------
# The ``DGMG`` class
# ``````````````````````````
# Below you can find the skeleton code for the model. You gradually
# fill in the details for each function.
#
import torch.nn as nn
class DGMGSkeleton(nn.Module):
def __init__(self, v_max):
"""
Parameters
----------
v_max: int
Max number of nodes considered
"""
super(DGMGSkeleton, self).__init__()
# Graph configuration
self.v_max = v_max
def add_node_and_update(self, a=None):
"""Decide if to add a new node.
If a new node should be added, update the graph."""
raise NotImplementedError
def add_edge_or_not(self, a=None):
"""Decide if a new edge should be added."""
raise NotImplementedError
def choose_dest_and_update(self, a=None):
"""Choose destination and connect it to the latest node.
Add edges for both directions and update the graph."""
raise NotImplementedError
def forward_train(self, actions):
"""Forward at training time. It records the probability
of generating a ground truth graph following the actions."""
raise NotImplementedError
def forward_inference(self):
"""Forward at inference time.
It generates graphs on the fly."""
raise NotImplementedError
def forward(self, actions=None):
# The graph you will work on
self.g = dgl.DGLGraph()
# If nodes and edges have features, the features of newly added
# nodes and edges will be initialized as zero tensors.
self.g.set_n_initializer(dgl.frame.zero_initializer)
self.g.set_e_initializer(dgl.frame.zero_initializer)
if self.training:
return self.forward_train(actions=actions)
else:
return self.forward_inference()
#######################################################################################
# Encoding a dynamic graph
# ``````````````````````````
# All the actions generating a graph are sampled from probability
# distributions. In order to do that, you project the structured data,
# namely the graph, onto a Euclidean space. The challenge is that this
# process, called *embedding*, needs to be repeated as the graph mutates.
#
# Graph embedding
# ''''''''''''''''''''''''''
# Let :math:`G=(V,E)` be an arbitrary graph. Each node :math:`v` has an
# embedding vector :math:`\textbf{h}_{v} \in \mathbb{R}^{n}`. Similarly,
# the graph has an embedding vector :math:`\textbf{h}_{G} \in \mathbb{R}^{k}`.
# Typically, :math:`k > n` since a graph contains more information than
# an individual node.
#
# The graph embedding is a weighted sum of node embeddings under a linear
# transformation:
#
# .. math::
#
# \textbf{h}_{G} =\sum_{v\in V}\text{Sigmoid}(g_m(\textbf{h}_{v}))f_{m}(\textbf{h}_{v}),\\
#
# The first term, :math:`\text{Sigmoid}(g_m(\textbf{h}_{v}))`, computes a
# gating function and can be thought of as how much the overall graph embedding
# attends to each node. The second term :math:`f_{m}:\mathbb{R}^{n}\rightarrow\mathbb{R}^{k}`
# maps the node embeddings to the space of graph embeddings.
#
# Implement graph embedding as a ``GraphEmbed`` class.
#
import torch
class GraphEmbed(nn.Module):
def __init__(self, node_hidden_size):
super(GraphEmbed, self).__init__()
# Setting from the paper
self.graph_hidden_size = 2 * node_hidden_size
# Embed graphs
self.node_gating = nn.Sequential(
nn.Linear(node_hidden_size, 1), nn.Sigmoid()
)
self.node_to_graph = nn.Linear(node_hidden_size, self.graph_hidden_size)
def forward(self, g):
if g.number_of_nodes() == 0:
return torch.zeros(1, self.graph_hidden_size)
else:
# Node features are stored as hv in ndata.
hvs = g.ndata["hv"]
return (self.node_gating(hvs) * self.node_to_graph(hvs)).sum(
0, keepdim=True
)
#######################################################################################
# Update node embeddings via graph propagation
# '''''''''''''''''''''''''''''''''''''''''''''
#
# The mechanism of updating node embeddings in DGMG is similar to that for
# graph convolutional networks. For a node :math:`v` in the graph, its
# neighbor :math:`u` sends a message to it with
#
# .. math::
#
# \textbf{m}_{u\rightarrow v}=\textbf{W}_{m}\text{concat}([\textbf{h}_{v}, \textbf{h}_{u}, \textbf{x}_{u, v}]) + \textbf{b}_{m},\\
#
# where :math:`\textbf{x}_{u,v}` is the embedding of the edge between
# :math:`u` and :math:`v`.
#
# After receiving messages from all its neighbors, :math:`v` summarizes them
# with a node activation vector
#
# .. math::
#
# \textbf{a}_{v} = \sum_{u: (u, v)\in E}\textbf{m}_{u\rightarrow v}\\
#
# and uses this information to update its own feature:
#
# .. math::
#
# \textbf{h}'_{v} = \textbf{GRU}(\textbf{h}_{v}, \textbf{a}_{v}).\\
#
# Performing all the operations above once for all nodes synchronously is
# called one round of graph propagation. The more rounds of graph propagation
# you perform, the longer distance messages travel throughout the graph.
#
# With DGL, you implement graph propagation with ``g.update_all``.
# The message notation here can be a bit confusing. Researchers may refer
# to :math:`\textbf{m}_{u\rightarrow v}` as messages; however, the message function
# below only passes :math:`\text{concat}([\textbf{h}_{u}, \textbf{x}_{u, v}])`.
# The operation :math:`\textbf{W}_{m}\text{concat}([\textbf{h}_{v}, \textbf{h}_{u}, \textbf{x}_{u, v}]) + \textbf{b}_{m}`
# is then performed across all edges at once for efficiency.
#
from functools import partial
class GraphProp(nn.Module):
def __init__(self, num_prop_rounds, node_hidden_size):
super(GraphProp, self).__init__()
self.num_prop_rounds = num_prop_rounds
# Setting from the paper
self.node_activation_hidden_size = 2 * node_hidden_size
message_funcs = []
node_update_funcs = []
self.reduce_funcs = []
for t in range(num_prop_rounds):
# input being [hv, hu, xuv]
message_funcs.append(
nn.Linear(
2 * node_hidden_size + 1, self.node_activation_hidden_size
)
)
self.reduce_funcs.append(partial(self.dgmg_reduce, round=t))
node_update_funcs.append(
nn.GRUCell(self.node_activation_hidden_size, node_hidden_size)
)
self.message_funcs = nn.ModuleList(message_funcs)
self.node_update_funcs = nn.ModuleList(node_update_funcs)
def dgmg_msg(self, edges):
"""For an edge u->v, return concat([h_u, x_uv])"""
return {"m": torch.cat([edges.src["hv"], edges.data["he"]], dim=1)}
def dgmg_reduce(self, nodes, round):
hv_old = nodes.data["hv"]
m = nodes.mailbox["m"]
message = torch.cat(
[hv_old.unsqueeze(1).expand(-1, m.size(1), -1), m], dim=2
)
node_activation = (self.message_funcs[round](message)).sum(1)
return {"a": node_activation}
def forward(self, g):
if g.number_of_edges() > 0:
for t in range(self.num_prop_rounds):
g.update_all(
message_func=self.dgmg_msg, reduce_func=self.reduce_funcs[t]
)
g.ndata["hv"] = self.node_update_funcs[t](
g.ndata["a"], g.ndata["hv"]
)
#######################################################################################
# Actions
# ``````````````````````````
# All actions are sampled from distributions parameterized by neural networks.
# They are described in turn below.
#
# Action 1: Add nodes
# ''''''''''''''''''''''''''
#
# Given the graph embedding vector :math:`\textbf{h}_{G}`, evaluate
#
# .. math::
#
# \text{Sigmoid}(\textbf{W}_{\text{add node}}\textbf{h}_{G}+b_{\text{add node}}),\\
#
# which is then used to parametrize a Bernoulli distribution for deciding whether
# to add a new node.
#
# If a new node is to be added, initialize its feature with
#
# .. math::
#
# \textbf{W}_{\text{init}}\text{concat}([\textbf{h}_{\text{init}} , \textbf{h}_{G}])+\textbf{b}_{\text{init}},\\
#
# where :math:`\textbf{h}_{\text{init}}` is a learnable embedding module for
# untyped nodes.
#
import torch.nn.functional as F
from torch.distributions import Bernoulli
def bernoulli_action_log_prob(logit, action):
"""Calculate the log p of an action with respect to a Bernoulli
distribution. Use logit rather than prob for numerical stability."""
if action == 0:
return F.logsigmoid(-logit)
else:
return F.logsigmoid(logit)
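# For example, with logit = torch.tensor(-200.0), torch.log(torch.sigmoid(logit))
# underflows to -inf in float32, while F.logsigmoid(logit) returns -200.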
class AddNode(nn.Module):
def __init__(self, graph_embed_func, node_hidden_size):
super(AddNode, self).__init__()
self.graph_op = {"embed": graph_embed_func}
self.stop = 1
self.add_node = nn.Linear(graph_embed_func.graph_hidden_size, 1)
# If a node is to be added, initialize its hv
self.node_type_embed = nn.Embedding(1, node_hidden_size)
self.initialize_hv = nn.Linear(
node_hidden_size + graph_embed_func.graph_hidden_size,
node_hidden_size,
)
self.init_node_activation = torch.zeros(1, 2 * node_hidden_size)
def _initialize_node_repr(self, g, node_type, graph_embed):
"""Whenver a node is added, initialize its representation."""
num_nodes = g.number_of_nodes()
hv_init = self.initialize_hv(
torch.cat(
[
self.node_type_embed(torch.LongTensor([node_type])),
graph_embed,
],
dim=1,
)
)
g.nodes[num_nodes - 1].data["hv"] = hv_init
g.nodes[num_nodes - 1].data["a"] = self.init_node_activation
def prepare_training(self):
self.log_prob = []
def forward(self, g, action=None):
graph_embed = self.graph_op["embed"](g)
logit = self.add_node(graph_embed)
prob = torch.sigmoid(logit)
if not self.training:
action = Bernoulli(prob).sample().item()
stop = bool(action == self.stop)
if not stop:
g.add_nodes(1)
self._initialize_node_repr(g, action, graph_embed)
if self.training:
sample_log_prob = bernoulli_action_log_prob(logit, action)
self.log_prob.append(sample_log_prob)
return stop
#######################################################################################
# Action 2: Add edges
# ''''''''''''''''''''''''''
#
# Given the graph embedding vector :math:`\textbf{h}_{G}` and the node
# embedding vector :math:`\textbf{h}_{v}` for the latest node :math:`v`,
# you evaluate
#
# .. math::
#
# \text{Sigmoid}(\textbf{W}_{\text{add edge}}\text{concat}([\textbf{h}_{G}, \textbf{h}_{v}])+b_{\text{add edge}}),\\
#
# which is then used to parametrize a Bernoulli distribution for deciding
# whether to add a new edge starting from :math:`v`.
#
class AddEdge(nn.Module):
def __init__(self, graph_embed_func, node_hidden_size):
super(AddEdge, self).__init__()
self.graph_op = {"embed": graph_embed_func}
self.add_edge = nn.Linear(
graph_embed_func.graph_hidden_size + node_hidden_size, 1
)
def prepare_training(self):
self.log_prob = []
def forward(self, g, action=None):
graph_embed = self.graph_op["embed"](g)
src_embed = g.nodes[g.number_of_nodes() - 1].data["hv"]
logit = self.add_edge(torch.cat([graph_embed, src_embed], dim=1))
prob = torch.sigmoid(logit)
if self.training:
sample_log_prob = bernoulli_action_log_prob(logit, action)
self.log_prob.append(sample_log_prob)
else:
action = Bernoulli(prob).sample().item()
to_add_edge = bool(action == 0)
return to_add_edge
#######################################################################################
# Action 3: Choose a destination
# '''''''''''''''''''''''''''''''''
#
# When action 2 returns `True`, choose a destination for the
# latest node :math:`v`.
#
# For each possible destination :math:`u\in\{0, \cdots, v-1\}`, the
# probability of choosing it is given by
#
# .. math::
#
# \frac{\text{exp}(\textbf{W}_{\text{dest}}\text{concat}([\textbf{h}_{u}, \textbf{h}_{v}])+\textbf{b}_{\text{dest}})}{\sum_{i=0}^{v-1}\text{exp}(\textbf{W}_{\text{dest}}\text{concat}([\textbf{h}_{i}, \textbf{h}_{v}])+\textbf{b}_{\text{dest}})}\\
#
from torch.distributions import Categorical
class ChooseDestAndUpdate(nn.Module):
def __init__(self, graph_prop_func, node_hidden_size):
super(ChooseDestAndUpdate, self).__init__()
self.graph_op = {"prop": graph_prop_func}
self.choose_dest = nn.Linear(2 * node_hidden_size, 1)
def _initialize_edge_repr(self, g, src_list, dest_list):
# For untyped edges, only add 1 to indicate its existence.
# For multiple edge types, use a one-hot representation
# or an embedding module.
edge_repr = torch.ones(len(src_list), 1)
g.edges[src_list, dest_list].data["he"] = edge_repr
def prepare_training(self):
self.log_prob = []
def forward(self, g, dest):
src = g.number_of_nodes() - 1
possible_dests = range(src)
src_embed_expand = g.nodes[src].data["hv"].expand(src, -1)
possible_dests_embed = g.nodes[possible_dests].data["hv"]
dests_scores = self.choose_dest(
torch.cat([possible_dests_embed, src_embed_expand], dim=1)
).view(1, -1)
dests_probs = F.softmax(dests_scores, dim=1)
if not self.training:
dest = Categorical(dests_probs).sample().item()
if not g.has_edges_between(src, dest):
# For undirected graphs, add edges for both directions
# so that you can perform graph propagation.
src_list = [src, dest]
dest_list = [dest, src]
g.add_edges(src_list, dest_list)
self._initialize_edge_repr(g, src_list, dest_list)
self.graph_op["prop"](g)
if self.training:
if dests_probs.nelement() > 1:
self.log_prob.append(
F.log_softmax(dests_scores, dim=1)[:, dest : dest + 1]
)
#######################################################################################
# Putting it together
# ``````````````````````````
#
# You are now ready to have a complete implementation of the model class.
#
class DGMG(DGMGSkeleton):
def __init__(self, v_max, node_hidden_size, num_prop_rounds):
super(DGMG, self).__init__(v_max)
# Graph embedding module
self.graph_embed = GraphEmbed(node_hidden_size)
# Graph propagation module
self.graph_prop = GraphProp(num_prop_rounds, node_hidden_size)
# Actions
self.add_node_agent = AddNode(self.graph_embed, node_hidden_size)
self.add_edge_agent = AddEdge(self.graph_embed, node_hidden_size)
self.choose_dest_agent = ChooseDestAndUpdate(
self.graph_prop, node_hidden_size
)
# Forward functions
self.forward_train = partial(forward_train, self=self)
self.forward_inference = partial(forward_inference, self=self)
@property
def action_step(self):
old_step_count = self.step_count
self.step_count += 1
return old_step_count
def prepare_for_train(self):
self.step_count = 0
self.add_node_agent.prepare_training()
self.add_edge_agent.prepare_training()
self.choose_dest_agent.prepare_training()
def add_node_and_update(self, a=None):
"""Decide if to add a new node.
If a new node should be added, update the graph."""
return self.add_node_agent(self.g, a)
def add_edge_or_not(self, a=None):
"""Decide if a new edge should be added."""
return self.add_edge_agent(self.g, a)
def choose_dest_and_update(self, a=None):
"""Choose destination and connect it to the latest node.
Add edges for both directions and update the graph."""
self.choose_dest_agent(self.g, a)
def get_log_prob(self):
add_node_log_p = torch.cat(self.add_node_agent.log_prob).sum()
add_edge_log_p = torch.cat(self.add_edge_agent.log_prob).sum()
choose_dest_log_p = torch.cat(self.choose_dest_agent.log_prob).sum()
return add_node_log_p + add_edge_log_p + choose_dest_log_p
#######################################################################################
# Below is an animation where a graph is generated on the fly
# after every 10 batches of training for the first 400 batches. You
# can see how the model improves over time and begins generating cycles.
#
# .. figure:: https://user-images.githubusercontent.com/19576924/48929291-60fe3880-ef22-11e8-832a-fbe56656559a.gif
# :alt:
#
# For generative models, you can evaluate performance by checking the percentage
# of valid graphs among the graphs it generates on the fly.
import torch.utils.model_zoo as model_zoo
# Download a pre-trained model state dict for generating cycles with 10-20 nodes.
state_dict = model_zoo.load_url(
"https://data.dgl.ai/model/dgmg_cycles-5a0c40be.pth"
)
model = DGMG(v_max=20, node_hidden_size=16, num_prop_rounds=2)
model.load_state_dict(state_dict)
model.eval()
def is_valid(g):
# Check if g is a cycle having 10-20 nodes.
def _get_previous(i, v_max):
if i == 0:
return v_max
else:
return i - 1
def _get_next(i, v_max):
if i == v_max:
return 0
else:
return i + 1
size = g.number_of_nodes()
if size < 10 or size > 20:
return False
for node in range(size):
neighbors = g.successors(node)
if len(neighbors) != 2:
return False
if _get_previous(node, size - 1) not in neighbors:
return False
if _get_next(node, size - 1) not in neighbors:
return False
return True
num_valid = 0
for i in range(100):
g = model()
num_valid += is_valid(g)
del model
print("Among 100 graphs generated, {}% are valid.".format(num_valid))
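#######################################################################################
# Training itself is outside the scope of this tutorial, but with the
# behavior-cloning objective above it amounts to minimizing ``-log_prob`` over a
# dataset of oracle action sequences. The following is only a minimal sketch under
# that assumption; ``action_sequences``, the learning rate, and the number of
# epochs are placeholders rather than settings taken from the official example.
#
# ::
#
#     import torch.optim as optim
#
#     model = DGMG(v_max=20, node_hidden_size=16, num_prop_rounds=2)
#     optimizer = optim.Adam(model.parameters(), lr=1e-4)
#     model.train()
#     for epoch in range(25):
#         for actions in action_sequences:  # one oracle action sequence per graph
#             optimizer.zero_grad()
#             log_prob = model(actions=actions)  # dispatches to forward_train
#             loss = -log_prob
#             loss.backward()
#             optimizer.step()
#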
#######################################################################################
# For the complete implementation, see the `DGL DGMG example
# <https://github.com/dmlc/dgl/tree/master/examples/pytorch/dgmg>`__.
#
......@@ -17,276 +17,275 @@ offers a different perspective. The tutorial describes how to implement a Capsul
efficiency. For recommended implementation, please refer to the `official
examples <https://github.com/dmlc/dgl/tree/master/examples>`_.
"""
#######################################################################################
# Key ideas of Capsule
# --------------------
#
# The Capsule model offers two key ideas: richer representation and dynamic routing.
#
# **Richer representation** -- In classic convolutional networks, a scalar
# value represents the activation of a given feature. By contrast, a
# capsule outputs a vector. The vector's length represents the probability
# of a feature being present. The vector's orientation represents the
# various properties of the feature (such as pose, deformation, texture
# etc.).
#
# |image0|
#
# **Dynamic routing** -- The output of a capsule is sent to
# certain parents in the layer above based on how well the capsule's
# prediction agrees with that of a parent. Such dynamic
# routing-by-agreement generalizes the static routing of max-pooling.
#
# During training, routing is accomplished iteratively. Each iteration adjusts
# routing weights between capsules based on their observed agreements.
# The process is similar to the k-means algorithm or `competitive
# learning <https://en.wikipedia.org/wiki/Competitive_learning>`__.
#
# In this tutorial, you see how a capsule's dynamic routing algorithm can be
# naturally expressed as a graph algorithm. The implementation is adapted
# from `Cedric
# Chee <https://github.com/cedrickchee/capsule-net-pytorch>`__, replacing
# only the routing layer. This version achieves similar speed and accuracy.
#
# Model implementation
# ----------------------
# Step 1: Setup and graph initialization
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# The connectivity between two layers of capsules forms a directed,
# bipartite graph, as shown in the figure below.
#
# |image1|
#
# Each node :math:`j` is associated with feature :math:`v_j`,
# representing its capsule’s output. Each edge is associated with
# features :math:`b_{ij}` and :math:`\hat{u}_{j|i}`. :math:`b_{ij}`
# determines routing weights, and :math:`\hat{u}_{j|i}` represents the
# prediction of capsule :math:`i` for :math:`j`.
#
# Here's how we set up the graph and initialize node and edge features.
import os
os.environ['DGLBACKEND'] = 'pytorch'
import matplotlib.pyplot as plt
import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as F
import dgl
def init_graph(in_nodes, out_nodes, f_size):
u = np.repeat(np.arange(in_nodes), out_nodes)
v = np.tile(np.arange(in_nodes, in_nodes + out_nodes), in_nodes)
g = dgl.DGLGraph((u, v))
# init states
g.ndata["v"] = th.zeros(in_nodes + out_nodes, f_size)
g.edata["b"] = th.zeros(in_nodes * out_nodes, 1)
return g
#########################################################################################
# Step 2: Define message passing functions
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# This is the pseudocode for Capsule's routing algorithm.
#
# |image2|
# Implement pseudocode lines 4-7 in the class `DGLRoutingLayer` as the following steps:
#
# 1. Calculate coupling coefficients.
#
# - Coefficients are computed with a softmax over all out-edges of the in-capsules.
# :math:`\textbf{c}_{i,j} = \text{softmax}(\textbf{b}_{i,j})`.
#
# 2. Calculate weighted sum over all in-capsules.
#
# - Output of a capsule is equal to the weighted sum of its in-capsules
# :math:`s_j=\sum_i c_{ij}\hat{u}_{j|i}`
#
# 3. Squash outputs.
#
# - Squash the length of a Capsule's output vector to range (0,1), so it can represent the probability (of some feature being present).
# - :math:`v_j=\text{squash}(s_j)=\frac{||s_j||^2}{1+||s_j||^2}\frac{s_j}{||s_j||}`
#
# 4. Update weights by the amount of agreement.
#
# - The scalar product :math:`\hat{u}_{j|i}\cdot v_j` can be considered as how well capsule :math:`i` agrees with :math:`j`. It is used to update
# :math:`b_{ij}=b_{ij}+\hat{u}_{j|i}\cdot v_j`
import dgl.function as fn
class DGLRoutingLayer(nn.Module):
def __init__(self, in_nodes, out_nodes, f_size):
super(DGLRoutingLayer, self).__init__()
self.g = init_graph(in_nodes, out_nodes, f_size)
self.in_nodes = in_nodes
self.out_nodes = out_nodes
self.in_indx = list(range(in_nodes))
self.out_indx = list(range(in_nodes, in_nodes + out_nodes))
def forward(self, u_hat, routing_num=1):
self.g.edata["u_hat"] = u_hat
for r in range(routing_num):
# step 1 (line 4): normalize over out edges
edges_b = self.g.edata["b"].view(self.in_nodes, self.out_nodes)
self.g.edata["c"] = F.softmax(edges_b, dim=1).view(-1, 1)
self.g.edata["c u_hat"] = self.g.edata["c"] * self.g.edata["u_hat"]
# Execute step 1 & 2
self.g.update_all(fn.copy_e("c u_hat", "m"), fn.sum("m", "s"))
# step 3 (line 6)
self.g.nodes[self.out_indx].data["v"] = self.squash(
self.g.nodes[self.out_indx].data["s"], dim=1
)
# step 4 (line 7)
v = th.cat(
[self.g.nodes[self.out_indx].data["v"]] * self.in_nodes, dim=0
)
self.g.edata["b"] = self.g.edata["b"] + (
self.g.edata["u_hat"] * v
).sum(dim=1, keepdim=True)
@staticmethod
def squash(s, dim=1):
sq = th.sum(s**2, dim=dim, keepdim=True)
s_norm = th.sqrt(sq)
s = (sq / (1.0 + sq)) * (s / s_norm)
return s
############################################################################################################
# Step 3: Testing
# ~~~~~~~~~~~~~~~
#
# Make a simple 20x10 capsule layer.
in_nodes = 20
out_nodes = 10
f_size = 4
u_hat = th.randn(in_nodes * out_nodes, f_size)
routing = DGLRoutingLayer(in_nodes, out_nodes, f_size)
############################################################################################################
# You can visualize a Capsule network's behavior by monitoring the entropy
# of coupling coefficients. They should start high and then drop, as the
# weights gradually concentrate on fewer edges.
entropy_list = []
dist_list = []
for i in range(10):
routing(u_hat)
dist_matrix = routing.g.edata["c"].view(in_nodes, out_nodes)
entropy = (-dist_matrix * th.log(dist_matrix)).sum(dim=1)
entropy_list.append(entropy.data.numpy())
dist_list.append(dist_matrix.data.numpy())
stds = np.std(entropy_list, axis=1)
means = np.mean(entropy_list, axis=1)
plt.errorbar(np.arange(len(entropy_list)), means, stds, marker="o")
plt.ylabel("Entropy of Weight Distribution")
plt.xlabel("Number of Routing")
plt.xticks(np.arange(len(entropy_list)))
plt.close()
############################################################################################################
# |image3|
#
# Alternatively, we can also watch the evolution of histograms.
import matplotlib.animation as animation
import seaborn as sns
fig = plt.figure(dpi=150)
fig.clf()
ax = fig.subplots()
def dist_animate(i):
ax.cla()
sns.distplot(dist_list[i].reshape(-1), kde=False, ax=ax)
ax.set_xlabel("Weight Distribution Histogram")
ax.set_title("Routing: %d" % (i))
ani = animation.FuncAnimation(
fig, dist_animate, frames=len(entropy_list), interval=500
)
plt.close()
############################################################################################################
# |image4|
#
# You can monitor how lower-level capsules gradually attach to one of the
# higher-level ones.
import networkx as nx
from networkx.algorithms import bipartite
g = routing.g.to_networkx()
X, Y = bipartite.sets(g)
height_in = 10
height_out = height_in * 0.8
height_in_y = np.linspace(0, height_in, in_nodes)
height_out_y = np.linspace((height_in - height_out) / 2, height_out, out_nodes)
pos = dict()
fig2 = plt.figure(figsize=(8, 3), dpi=150)
fig2.clf()
ax = fig2.subplots()
pos.update(
(n, (i, 1)) for i, n in zip(height_in_y, X)
) # put nodes from X at x=1
pos.update(
(n, (i, 2)) for i, n in zip(height_out_y, Y)
) # put nodes from Y at x=2
def weight_animate(i):
ax.cla()
ax.axis("off")
ax.set_title("Routing: %d " % i)
dm = dist_list[i]
nx.draw_networkx_nodes(
g, pos, nodelist=range(in_nodes), node_color="r", node_size=100, ax=ax
)
nx.draw_networkx_nodes(
g,
pos,
nodelist=range(in_nodes, in_nodes + out_nodes),
node_color="b",
node_size=100,
ax=ax,
)
for edge in g.edges():
nx.draw_networkx_edges(
g,
pos,
edgelist=[edge],
width=dm[edge[0], edge[1] - in_nodes] * 1.5,
ax=ax,
)
ani2 = animation.FuncAnimation(
fig2, weight_animate, frames=len(dist_list), interval=500
)
plt.close()
############################################################################################################
# |image5|
#
# The full code of this visualization is provided on
# `GitHub <https://github.com/dmlc/dgl/blob/master/examples/pytorch/capsule/simple_routing.py>`__. The complete
# code that trains on MNIST is also on `GitHub <https://github.com/dmlc/dgl/tree/tutorial/examples/pytorch/capsule>`__.
#
# .. |image0| image:: https://i.imgur.com/55Ovkdh.png
# .. |image1| image:: https://i.imgur.com/9tc6GLl.png
# .. |image2| image:: https://i.imgur.com/mv1W9Rv.png
# .. |image3| image:: https://i.imgur.com/dMvu7p3.png
# .. |image4| image:: https://github.com/VoVAllen/DGL_Capsule/raw/master/routing_dist.gif
# .. |image5| image:: https://github.com/VoVAllen/DGL_Capsule/raw/master/routing_vis.gif
"""
#######################################################################################
# Key ideas of Capsule
# --------------------
#
# The Capsule model offers two key ideas: Richer representation and dynamic routing.
#
# **Richer representation** -- In classic convolutional networks, a scalar
# value represents the activation of a given feature. By contrast, a
# capsule outputs a vector. The vector's length represents the probability
# of a feature being present. The vector's orientation represents the
# various properties of the feature (such as pose, deformation, texture
# etc.).
#
# |image0|
#
# **Dynamic routing** -- The output of a capsule is sent to
# certain parents in the layer above based on how well the capsule's
# prediction agrees with that of a parent. Such dynamic
# routing-by-agreement generalizes the static routing of max-pooling.
#
# During training, routing is accomplished iteratively. Each iteration adjusts
# routing weights between capsules based on their observed agreements.
# It's a manner similar to a k-means algorithm or `competitive
# learning <https://en.wikipedia.org/wiki/Competitive_learning>`__.
#
# In this tutorial, you see how a capsule's dynamic routing algorithm can be
# naturally expressed as a graph algorithm. The implementation is adapted
# from `Cedric
# Chee <https://github.com/cedrickchee/capsule-net-pytorch>`__, replacing
# only the routing layer. This version achieves similar speed and accuracy.
#
# Model implementation
# ----------------------
# Step 1: Setup and graph initialization
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# The connectivity between two layers of capsules form a directed,
# bipartite graph, as shown in the Figure below.
#
# |image1|
#
# Each node :math:`j` is associated with feature :math:`v_j`,
# representing its capsule’s output. Each edge is associated with
# features :math:`b_{ij}` and :math:`\hat{u}_{j|i}`. :math:`b_{ij}`
# determines routing weights, and :math:`\hat{u}_{j|i}` represents the
# prediction of capsule :math:`i` for :math:`j`.
#
# Here's how we set up the graph and initialize node and edge features.
import os
os.environ["DGLBACKEND"] = "pytorch"
import dgl
import matplotlib.pyplot as plt
import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as F
def init_graph(in_nodes, out_nodes, f_size):
u = np.repeat(np.arange(in_nodes), out_nodes)
v = np.tile(np.arange(in_nodes, in_nodes + out_nodes), in_nodes)
g = dgl.DGLGraph((u, v))
# init states
g.ndata["v"] = th.zeros(in_nodes + out_nodes, f_size)
g.edata["b"] = th.zeros(in_nodes * out_nodes, 1)
return g
#########################################################################################
# Step 2: Define message passing functions
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# This is the pseudocode for Capsule's routing algorithm.
#
# |image2|
# Implement pseudocode lines 4-7 in the class `DGLRoutingLayer` as the following steps:
#
# 1. Calculate coupling coefficients.
#
# - Coefficients are the softmax over all out-edge of in-capsules.
# :math:`\textbf{c}_{i,j} = \text{softmax}(\textbf{b}_{i,j})`.
#
# 2. Calculate weighted sum over all in-capsules.
#
# - Output of a capsule is equal to the weighted sum of its in-capsules
# :math:`s_j=\sum_i c_{ij}\hat{u}_{j|i}`
#
# 3. Squash outputs.
#
# - Squash the length of a Capsule's output vector to range (0,1), so it can represent the probability (of some feature being present).
# - :math:`v_j=\text{squash}(s_j)=\frac{||s_j||^2}{1+||s_j||^2}\frac{s_j}{||s_j||}`
#
# 4. Update weights by the amount of agreement.
#
# - The scalar product :math:`\hat{u}_{j|i}\cdot v_j` can be considered as how well capsule :math:`i` agrees with :math:`j`. It is used to update
# :math:`b_{ij}=b_{ij}+\hat{u}_{j|i}\cdot v_j`
import dgl.function as fn
class DGLRoutingLayer(nn.Module):
def __init__(self, in_nodes, out_nodes, f_size):
super(DGLRoutingLayer, self).__init__()
self.g = init_graph(in_nodes, out_nodes, f_size)
self.in_nodes = in_nodes
self.out_nodes = out_nodes
self.in_indx = list(range(in_nodes))
self.out_indx = list(range(in_nodes, in_nodes + out_nodes))
def forward(self, u_hat, routing_num=1):
self.g.edata["u_hat"] = u_hat
for r in range(routing_num):
# step 1 (line 4): normalize over out edges
edges_b = self.g.edata["b"].view(self.in_nodes, self.out_nodes)
self.g.edata["c"] = F.softmax(edges_b, dim=1).view(-1, 1)
self.g.edata["c u_hat"] = self.g.edata["c"] * self.g.edata["u_hat"]
# Execute step 1 & 2
self.g.update_all(fn.copy_e("c u_hat", "m"), fn.sum("m", "s"))
# step 3 (line 6)
self.g.nodes[self.out_indx].data["v"] = self.squash(
self.g.nodes[self.out_indx].data["s"], dim=1
)
# step 4 (line 7)
v = th.cat(
[self.g.nodes[self.out_indx].data["v"]] * self.in_nodes, dim=0
)
self.g.edata["b"] = self.g.edata["b"] + (
self.g.edata["u_hat"] * v
).sum(dim=1, keepdim=True)
@staticmethod
def squash(s, dim=1):
sq = th.sum(s**2, dim=dim, keepdim=True)
s_norm = th.sqrt(sq)
s = (sq / (1.0 + sq)) * (s / s_norm)
return s
############################################################################################################
# Step 3: Testing
# ~~~~~~~~~~~~~~~
#
# Make a simple 20x10 capsule layer.
in_nodes = 20
out_nodes = 10
f_size = 4
u_hat = th.randn(in_nodes * out_nodes, f_size)
routing = DGLRoutingLayer(in_nodes, out_nodes, f_size)
############################################################################################################
# You can visualize a Capsule network's behavior by monitoring the entropy
# of coupling coefficients. They should start high and then drop, as the
# weights gradually concentrate on fewer edges.
entropy_list = []
dist_list = []
for i in range(10):
routing(u_hat)
dist_matrix = routing.g.edata["c"].view(in_nodes, out_nodes)
entropy = (-dist_matrix * th.log(dist_matrix)).sum(dim=1)
entropy_list.append(entropy.data.numpy())
dist_list.append(dist_matrix.data.numpy())
stds = np.std(entropy_list, axis=1)
means = np.mean(entropy_list, axis=1)
plt.errorbar(np.arange(len(entropy_list)), means, stds, marker="o")
plt.ylabel("Entropy of Weight Distribution")
plt.xlabel("Number of Routing")
plt.xticks(np.arange(len(entropy_list)))
plt.close()
############################################################################################################
# |image3|
#
# Alternatively, we can also watch the evolution of histograms.
import matplotlib.animation as animation
import seaborn as sns
fig = plt.figure(dpi=150)
fig.clf()
ax = fig.subplots()
def dist_animate(i):
ax.cla()
sns.distplot(dist_list[i].reshape(-1), kde=False, ax=ax)
ax.set_xlabel("Weight Distribution Histogram")
ax.set_title("Routing: %d" % (i))
ani = animation.FuncAnimation(
fig, dist_animate, frames=len(entropy_list), interval=500
)
plt.close()
############################################################################################################
# |image4|
#
# You can monitor the how lower-level Capsules gradually attach to one of the
# higher level ones.
import networkx as nx
from networkx.algorithms import bipartite
g = routing.g.to_networkx()
X, Y = bipartite.sets(g)
height_in = 10
height_out = height_in * 0.8
height_in_y = np.linspace(0, height_in, in_nodes)
height_out_y = np.linspace((height_in - height_out) / 2, height_out, out_nodes)
pos = dict()
fig2 = plt.figure(figsize=(8, 3), dpi=150)
fig2.clf()
ax = fig2.subplots()
pos.update(
(n, (i, 1)) for i, n in zip(height_in_y, X)
) # put nodes from X at x=1
pos.update(
(n, (i, 2)) for i, n in zip(height_out_y, Y)
) # put nodes from Y at x=2
def weight_animate(i):
ax.cla()
ax.axis("off")
ax.set_title("Routing: %d " % i)
dm = dist_list[i]
nx.draw_networkx_nodes(
g, pos, nodelist=range(in_nodes), node_color="r", node_size=100, ax=ax
)
nx.draw_networkx_nodes(
g,
pos,
nodelist=range(in_nodes, in_nodes + out_nodes),
node_color="b",
node_size=100,
ax=ax,
)
for edge in g.edges():
nx.draw_networkx_edges(
g,
pos,
edgelist=[edge],
width=dm[edge[0], edge[1] - in_nodes] * 1.5,
ax=ax,
)
ani2 = animation.FuncAnimation(
fig2, weight_animate, frames=len(dist_list), interval=500
)
plt.close()
############################################################################################################
# |image5|
#
# The full code of this visualization is provided on
# `GitHub <https://github.com/dmlc/dgl/blob/master/examples/pytorch/capsule/simple_routing.py>`__. The complete
# code that trains on MNIST is also on `GitHub <https://github.com/dmlc/dgl/tree/tutorial/examples/pytorch/capsule>`__.
#
# .. |image0| image:: https://i.imgur.com/55Ovkdh.png
# .. |image1| image:: https://i.imgur.com/9tc6GLl.png
# .. |image2| image:: https://i.imgur.com/mv1W9Rv.png
# .. |image3| image:: https://i.imgur.com/dMvu7p3.png
# .. |image4| image:: https://github.com/VoVAllen/DGL_Capsule/raw/master/routing_dist.gif
# .. |image5| image:: https://github.com/VoVAllen/DGL_Capsule/raw/master/routing_vis.gif
......@@ -104,7 +104,7 @@ Transformer as a Graph Neural Network
# - ``get_o`` maps the updated value after attention to the output
# :math:`o` for post-processing.
#
# .. code::
#
# class MultiHeadAttention(nn.Module):
# "Multi-Head Attention"
......@@ -146,14 +146,14 @@ Transformer as a Graph Neural Network
#
# Construct the graph by mapping tokens of the source and target
# sentence to nodes. The complete Transformer graph is made up of three
# subgraphs:
#
# **Source language graph**. This is a complete graph; each
# token :math:`s_i` can attend to any other token :math:`s_j` (including
# self-loops). |image0|
# **Target language graph**. The graph is
# half-complete, in that :math:`t_i` attends only to :math:`t_j` if
# :math:`i > j` (an output token cannot depend on future words). |image1|
# **Cross-language graph**. This is a bipartite graph, where there is
# an edge from every source token :math:`s_i` to every target token
# :math:`t_j`, meaning every target token can attend to source tokens.
......@@ -191,7 +191,7 @@ Transformer as a Graph Neural Network
#
# Compute ``score`` and send source node’s ``v`` to destination’s mailbox
#
# .. code::
#
# def message_func(edges):
# return {'score': ((edges.src['k'] * edges.dst['q'])
......@@ -203,7 +203,7 @@ Transformer as a Graph Neural Network
#
# Normalize over all in-edges and take the weighted sum to get the output
#
# .. code::
#
# import torch as th
# import torch.nn.functional as F
......@@ -216,7 +216,7 @@ Transformer as a Graph Neural Network
# Execute on specific edges
# '''''''''''''''''''''''''
#
# .. code::
#
# from functools import partial
# def naive_propagate_attention(self, g, eids):
......@@ -269,7 +269,7 @@ Transformer as a Graph Neural Network
#
# The normalization of :math:`\textrm{wv}` is left to post-processing.
#
# .. code::
#
# def src_dot_dst(src_field, dst_field, out_field):
# def func(edges):
......@@ -338,7 +338,7 @@ Transformer as a Graph Neural Network
#
# where :math:`\textrm{FFN}` refers to the feed forward function.
#
# .. code::
#
# class Encoder(nn.Module):
# def __init__(self, layer, N):
......@@ -501,7 +501,7 @@ Transformer as a Graph Neural Network
# Task and the dataset
# ~~~~~~~~~~~~~~~~~~~~
#
# The Transformer is a general framework for a variety of NLP tasks. This tutorial focuses
# on sequence-to-sequence learning: a typical case that illustrates how it works.
#
# As for the dataset, there are two example tasks: copy and sort, together
......@@ -729,7 +729,7 @@ Transformer as a Graph Neural Network
# with these nodes. The following code shows the Universal Transformer
# class in DGL:
#
# .. code::
#
# class UTransformer(nn.Module):
# "Universal Transformer(https://arxiv.org/pdf/1807.03819.pdf) with ACT(https://arxiv.org/pdf/1603.08983.pdf)."
......@@ -849,10 +849,10 @@ Transformer as a Graph Neural Network
# that are still active:
#
# .. note::
#
# - :func:`~dgl.DGLGraph.filter_nodes` takes a predicate and a node
# ID list/tensor as input, then returns a tensor of node IDs that satisfy
# the given predicate.
# - :func:`~dgl.DGLGraph.filter_edges` takes a predicate
# and an edge ID list/tensor as input, then returns a tensor of edge IDs
# that satisfy the given predicate.
......@@ -883,6 +883,6 @@ Transformer as a Graph Neural Network
#
# .. note::
# The notebook itself is not executable due to many dependencies.
# Download `7_transformer.py <https://data.dgl.ai/tutorial/7_transformer.py>`__,
# and copy the python script to directory ``examples/pytorch/transformer``
# then run ``python 7_transformer.py`` to see how it works.
......@@ -71,7 +71,8 @@ process ID, which should be an integer from `0` to `world_size - 1`.
"""
import os
os.environ["DGLBACKEND"] = "pytorch"
import torch.distributed as dist
......
......@@ -26,63 +26,65 @@ models with multi-GPU with ``DistributedDataParallel``.
######################################################################
# Loading Dataset
# ---------------
#
# OGB already prepared the data as a ``DGLGraph`` object. The following code is
# copy-pasted from the :doc:`Training GNN with Neighbor Sampling for Node
# Classification <../large/L1_large_node_classification>`
# tutorial.
#
import os
os.environ["DGLBACKEND"] = "pytorch"
import dgl
import torch
import numpy as np
import sklearn.metrics
import torch.nn as nn
import torch.nn.functional as F
import tqdm
from dgl.nn import SAGEConv
from ogb.nodeproppred import DglNodePropPredDataset
dataset = DglNodePropPredDataset("ogbn-arxiv")
graph, node_labels = dataset[0]
# Add reverse edges since ogbn-arxiv is unidirectional.
graph = dgl.add_reverse_edges(graph)
graph.ndata["label"] = node_labels[:, 0]
node_features = graph.ndata["feat"]
num_features = node_features.shape[1]
num_classes = (node_labels.max() + 1).item()
idx_split = dataset.get_idx_split()
train_nids = idx_split["train"]
valid_nids = idx_split["valid"]
test_nids = idx_split["test"] # Test node IDs, not used in the tutorial though.
######################################################################
# Defining Model
# --------------
#
# The model will be again identical to the :doc:`Training GNN with Neighbor
# Sampling for Node Classification <../large/L1_large_node_classification>`
# tutorial.
#
class Model(nn.Module):
def __init__(self, in_feats, h_feats, num_classes):
super(Model, self).__init__()
self.conv1 = SAGEConv(in_feats, h_feats, aggregator_type="mean")
self.conv2 = SAGEConv(h_feats, num_classes, aggregator_type="mean")
self.h_feats = h_feats
def forward(self, mfgs, x):
h_dst = x[: mfgs[0].num_dst_nodes()]
h = self.conv1(mfgs[0], (x, h_dst))
h = F.relu(h)
h_dst = h[: mfgs[1].num_dst_nodes()]
h = self.conv2(mfgs[1], (h, h_dst))
return h
......@@ -90,7 +92,7 @@ class Model(nn.Module):
######################################################################
# Defining Training Procedure
# ---------------------------
#
# The training procedure will be slightly different from what you saw
# previously, in the sense that you will need to
#
......@@ -98,45 +100,58 @@ class Model(nn.Module):
# * Wrap your model with ``torch.nn.parallel.DistributedDataParallel``.
# * Add a ``use_ddp=True`` argument to the DGL dataloader you wish to run
# together with DDP.
#
# You will also need to wrap the training loop inside a function so that
# you can spawn subprocesses to run it.
#
def run(proc_id, devices):
# Initialize distributed training context.
dev_id = devices[proc_id]
dist_init_method = "tcp://{master_ip}:{master_port}".format(
master_ip="127.0.0.1", master_port="12345"
)
if torch.cuda.device_count() < 1:
device = torch.device("cpu")
torch.distributed.init_process_group(
backend="gloo",
init_method=dist_init_method,
world_size=len(devices),
rank=proc_id,
)
else:
torch.cuda.set_device(dev_id)
device = torch.device("cuda:" + str(dev_id))
torch.distributed.init_process_group(
backend="nccl",
init_method=dist_init_method,
world_size=len(devices),
rank=proc_id,
)
# Define training and validation dataloader, copied from the previous tutorial
# but with one line of difference: use_ddp to enable distributed data parallel
# data loading.
sampler = dgl.dataloading.NeighborSampler([4, 4])
train_dataloader = dgl.dataloading.DataLoader(
# The following arguments are specific to DataLoader.
graph, # The graph
train_nids, # The node IDs to iterate over in minibatches
sampler, # The neighbor sampler
device=device, # Put the sampled MFGs on CPU or GPU
use_ddp=True, # Make it work with distributed data parallel
# The following arguments are inherited from PyTorch DataLoader.
batch_size=1024, # Per-device batch size.
# The effective batch size is this number times the number of GPUs.
shuffle=True, # Whether to shuffle the nodes for every epoch
drop_last=False, # Whether to drop the last incomplete batch
num_workers=0, # Number of sampler processes
)
valid_dataloader = dgl.dataloading.DataLoader(
graph,
valid_nids,
sampler,
device=device,
use_ddp=False,
batch_size=1024,
......@@ -144,20 +159,24 @@ def run(proc_id, devices):
drop_last=False,
num_workers=0,
)
model = Model(num_features, 128, num_classes).to(device)
# Wrap the model with distributed data parallel module.
if device == torch.device("cpu"):
model = torch.nn.parallel.DistributedDataParallel(
model, device_ids=None, output_device=None
)
else:
model = torch.nn.parallel.DistributedDataParallel(
model, device_ids=[device], output_device=device
)
# Define optimizer
opt = torch.optim.Adam(model.parameters())
best_accuracy = 0
best_model_path = "./model.pt"
# Copied from previous tutorial with changes highlighted.
for epoch in range(10):
model.train()
......@@ -165,8 +184,8 @@ def run(proc_id, devices):
with tqdm.tqdm(train_dataloader) as tq:
for step, (input_nodes, output_nodes, mfgs) in enumerate(tq):
# feature copy from CPU to GPU takes place here
inputs = mfgs[0].srcdata["feat"]
labels = mfgs[-1].dstdata["label"]
predictions = model(mfgs, inputs)
......@@ -175,9 +194,15 @@ def run(proc_id, devices):
loss.backward()
opt.step()
accuracy = sklearn.metrics.accuracy_score(
labels.cpu().numpy(),
predictions.argmax(1).detach().cpu().numpy(),
)
tq.set_postfix(
{"loss": "%.03f" % loss.item(), "acc": "%.03f" % accuracy},
refresh=False,
)
model.eval()
......@@ -187,13 +212,15 @@ def run(proc_id, devices):
labels = []
with tqdm.tqdm(valid_dataloader) as tq, torch.no_grad():
for input_nodes, output_nodes, mfgs in tq:
inputs = mfgs[0].srcdata["feat"]
labels.append(mfgs[-1].dstdata["label"].cpu().numpy())
predictions.append(
model(mfgs, inputs).argmax(1).cpu().numpy()
)
predictions = np.concatenate(predictions)
labels = np.concatenate(labels)
accuracy = sklearn.metrics.accuracy_score(labels, predictions)
print("Epoch {} Validation Accuracy {}".format(epoch, accuracy))
if best_accuracy < accuracy:
best_accuracy = accuracy
torch.save(model.state_dict(), best_model_path)
......@@ -205,7 +232,7 @@ def run(proc_id, devices):
######################################################################
# Spawning Trainer Processes
# --------------------------
#
# A typical scenario for multi-GPU training with DDP is to replicate the
# model once per GPU, and spawn one trainer process per GPU.
#
......@@ -219,15 +246,15 @@ def run(proc_id, devices):
# or ``out_degrees`` is called. To avoid this, you need to create
# all sparse matrix representations beforehand using the ``create_formats_``
# method:
#
graph.create_formats_()
######################################################################
# Then you can spawn the subprocesses to train with multiple GPUs.
#
#
# .. code:: python
#
# # Say you have four GPUs.
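#    # One way the processes could then be launched (a sketch assuming the
#    # run function defined above; not code taken verbatim from the example):
#    import torch.multiprocessing as mp
#
#    num_gpus = 4
#    devices = list(range(num_gpus))
#    # mp.spawn starts nprocs processes and calls run(proc_id, devices) in each
#    # of them, with proc_id ranging over 0 .. nprocs - 1.
#    mp.spawn(run, args=(devices,), nprocs=num_gpus)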
......