Unverified commit dce89919 authored by Hongzhi (Steve), Chen and committed by GitHub

[Misc] Auto-reformat multiple python folders. (#5325)



* auto-reformat

* lintrunner

---------
Co-authored-by: Ubuntu <ubuntu@ip-172-31-28-63.ap-northeast-1.compute.internal>
parent ab812179
from ruamel.yaml.comments import CommentedMap
......@@ -14,12 +13,14 @@ def deep_convert_dict(layer):
return to_ret
import collections.abc
def merge_comment(d, comment_dict, column=30):
for k, v in comment_dict.items():
if isinstance(v, collections.abc.Mapping):
d[k] = merge_comment(d.get(k, CommentedMap()), v)
else:
d.yaml_add_eol_comment(v, key=k, column=column)
return d
\ No newline at end of file
return d
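A minimal usage sketch for merge_comment above (the config and comment data below are hypothetical, not part of the original file): comments from a plain nested dict are attached as end-of-line YAML comments on a ruamel.yaml CommentedMap. Note that nested levels fall back to the default column because the recursive call does not forward the column argument.

import sys

from ruamel.yaml import YAML
from ruamel.yaml.comments import CommentedMap

# hypothetical config and comment mapping; assumes merge_comment above is in scope
config = CommentedMap()
config["model"] = CommentedMap()
config["model"]["hidden_size"] = 16
config["lr"] = 0.01
comments = {"model": {"hidden_size": "embedding width"}, "lr": "learning rate"}

merge_comment(config, comments)
YAML().dump(config, sys.stdout)
# expected output, roughly:
# model:
#   hidden_size: 16             # embedding width
# lr: 0.01                      # learning rate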
#!/usr/bin/env python
from setuptools import find_packages
from distutils.core import setup
setup(name='dglgo',
version='0.0.2',
description='DGL',
author='DGL Team',
author_email='wmjlyjemaine@gmail.com',
packages=find_packages(),
install_requires=[
'typer>=0.4.0',
'isort>=5.10.1',
'autopep8>=1.6.0',
'numpydoc>=1.1.0',
"pydantic>=1.9.0",
"ruamel.yaml>=0.17.20",
"PyYAML>=5.1",
"ogb>=1.3.3",
"rdkit-pypi",
"scikit-learn>=0.20.0"
],
package_data={"": ["./*"]},
include_package_data=True,
license='APACHE',
entry_points={
'console_scripts': [
"dgl = dglgo.cli.cli:main"
]
},
url='https://github.com/dmlc/dgl',
)
from setuptools import find_packages
setup(
name="dglgo",
version="0.0.2",
description="DGL",
author="DGL Team",
author_email="wmjlyjemaine@gmail.com",
packages=find_packages(),
install_requires=[
"typer>=0.4.0",
"isort>=5.10.1",
"autopep8>=1.6.0",
"numpydoc>=1.1.0",
"pydantic>=1.9.0",
"ruamel.yaml>=0.17.20",
"PyYAML>=5.1",
"ogb>=1.3.3",
"rdkit-pypi",
"scikit-learn>=0.20.0",
],
package_data={"": ["./*"]},
include_package_data=True,
license="APACHE",
entry_points={"console_scripts": ["dgl = dglgo.cli.cli:main"]},
url="https://github.com/dmlc/dgl",
)
......@@ -14,16 +14,18 @@
#
import os
import sys
sys.path.insert(0, os.path.abspath('../../python'))
sys.path.insert(0, os.path.abspath("../../python"))
# -- Project information -----------------------------------------------------
project = 'DGL'
copyright = '2018, DGL Team'
author = 'DGL Team'
project = "DGL"
copyright = "2018, DGL Team"
author = "DGL Team"
import dgl
version = dgl.__version__
release = dgl.__version__
dglbackend = os.environ.get("DGLBACKEND", "pytorch")
......@@ -39,35 +41,35 @@ dglbackend = os.environ.get("DGLBACKEND", "pytorch")
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.autosummary',
'sphinx.ext.coverage',
'sphinx.ext.mathjax',
'sphinx.ext.napoleon',
'sphinx.ext.viewcode',
'sphinx.ext.intersphinx',
'sphinx.ext.graphviz',
'sphinxemoji.sphinxemoji',
'sphinx_gallery.gen_gallery',
'sphinx_copybutton',
'nbsphinx',
'nbsphinx_link',
"sphinx.ext.autodoc",
"sphinx.ext.autosummary",
"sphinx.ext.coverage",
"sphinx.ext.mathjax",
"sphinx.ext.napoleon",
"sphinx.ext.viewcode",
"sphinx.ext.intersphinx",
"sphinx.ext.graphviz",
"sphinxemoji.sphinxemoji",
"sphinx_gallery.gen_gallery",
"sphinx_copybutton",
"nbsphinx",
"nbsphinx_link",
]
# Do not run notebooks on non-pytorch backends
if dglbackend != "pytorch":
nbsphinx_execute = 'never'
nbsphinx_execute = "never"
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
templates_path = ["_templates"]
# The suffix(es) of source filenames.
# You can specify multiple suffixes as a list of strings:
#
source_suffix = ['.rst', '.md']
source_suffix = [".rst", ".md"]
# The master toctree document.
master_doc = 'index'
master_doc = "index"
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
......@@ -90,7 +92,7 @@ pygments_style = None
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'
html_theme = "sphinx_rtd_theme"
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
......@@ -101,8 +103,8 @@ html_theme = 'sphinx_rtd_theme'
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
html_css_files = ['css/custom.css']
html_static_path = ["_static"]
html_css_files = ["css/custom.css"]
# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
......@@ -118,7 +120,7 @@ html_css_files = ['css/custom.css']
# -- Options for HTMLHelp output ---------------------------------------------
# Output file base name for HTML help builder.
htmlhelp_basename = 'dgldoc'
htmlhelp_basename = "dgldoc"
# -- Options for LaTeX output ------------------------------------------------
......@@ -127,15 +129,12 @@ latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#
# 'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#
# 'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#
# 'preamble': '',
# Latex figure (float) alignment
#
# 'figure_align': 'htbp',
......@@ -145,8 +144,7 @@ latex_elements = {
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
(master_doc, 'dgl.tex', 'DGL Documentation',
'DGL Team', 'manual'),
(master_doc, "dgl.tex", "DGL Documentation", "DGL Team", "manual"),
]
......@@ -154,10 +152,7 @@ latex_documents = [
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
(master_doc, 'dgl', 'DGL Documentation',
[author], 1)
]
man_pages = [(master_doc, "dgl", "DGL Documentation", [author], 1)]
# -- Options for Texinfo output ----------------------------------------------
......@@ -166,9 +161,15 @@ man_pages = [
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
(master_doc, 'dgl', 'DGL Documentation',
author, 'dgl', 'Library for deep learning on graphs.',
'Miscellaneous'),
(
master_doc,
"dgl",
"DGL Documentation",
author,
"dgl",
"Library for deep learning on graphs.",
"Miscellaneous",
),
]
......@@ -187,64 +188,71 @@ epub_title = project
# epub_uid = ''
# A list of files that should not be packed into the epub file.
epub_exclude_files = ['search.html']
epub_exclude_files = ["search.html"]
# -- Extension configuration -------------------------------------------------
autosummary_generate = True
autodoc_member_order = 'alphabetical'
autodoc_member_order = "alphabetical"
intersphinx_mapping = {
'python': ('https://docs.python.org/{.major}'.format(sys.version_info), None),
'numpy': ('http://docs.scipy.org/doc/numpy/', None),
'scipy': ('http://docs.scipy.org/doc/scipy/reference', None),
'matplotlib': ('http://matplotlib.org/', None),
'networkx' : ('https://networkx.github.io/documentation/stable', None),
"python": (
"https://docs.python.org/{.major}".format(sys.version_info),
None,
),
"numpy": ("http://docs.scipy.org/doc/numpy/", None),
"scipy": ("http://docs.scipy.org/doc/scipy/reference", None),
"matplotlib": ("http://matplotlib.org/", None),
"networkx": ("https://networkx.github.io/documentation/stable", None),
}
# sphinx gallery configurations
from sphinx_gallery.sorting import FileNameSortKey
examples_dirs = ['../../tutorials/blitz',
'../../tutorials/large',
'../../tutorials/dist',
'../../tutorials/models',
'../../tutorials/multi',
'../../tutorials/cpu'] # path to find sources
gallery_dirs = ['tutorials/blitz/',
'tutorials/large/',
'tutorials/dist/',
'tutorials/models/',
'tutorials/multi/',
'tutorials/cpu'] # path to generate docs
examples_dirs = [
"../../tutorials/blitz",
"../../tutorials/large",
"../../tutorials/dist",
"../../tutorials/models",
"../../tutorials/multi",
"../../tutorials/cpu",
] # path to find sources
gallery_dirs = [
"tutorials/blitz/",
"tutorials/large/",
"tutorials/dist/",
"tutorials/models/",
"tutorials/multi/",
"tutorials/cpu",
] # path to generate docs
if dglbackend != "pytorch":
examples_dirs = []
gallery_dirs = []
reference_url = {
'dgl' : None,
'numpy': 'http://docs.scipy.org/doc/numpy/',
'scipy': 'http://docs.scipy.org/doc/scipy/reference',
'matplotlib': 'http://matplotlib.org/',
'networkx' : 'https://networkx.github.io/documentation/stable',
"dgl": None,
"numpy": "http://docs.scipy.org/doc/numpy/",
"scipy": "http://docs.scipy.org/doc/scipy/reference",
"matplotlib": "http://matplotlib.org/",
"networkx": "https://networkx.github.io/documentation/stable",
}
sphinx_gallery_conf = {
'backreferences_dir' : 'generated/backreferences',
'doc_module' : ('dgl', 'numpy'),
'examples_dirs' : examples_dirs,
'gallery_dirs' : gallery_dirs,
'within_subsection_order' : FileNameSortKey,
'filename_pattern' : '.py',
'download_all_examples' : False,
"backreferences_dir": "generated/backreferences",
"doc_module": ("dgl", "numpy"),
"examples_dirs": examples_dirs,
"gallery_dirs": gallery_dirs,
"within_subsection_order": FileNameSortKey,
"filename_pattern": ".py",
"download_all_examples": False,
}
# Compatibility for different backends when building tutorials
if dglbackend == 'mxnet':
sphinx_gallery_conf['filename_pattern'] = "/*(?<=mx)\.py"
if dglbackend == 'pytorch':
sphinx_gallery_conf['filename_pattern'] = "/*(?<!mx)\.py"
if dglbackend == "mxnet":
sphinx_gallery_conf["filename_pattern"] = "/*(?<=mx)\.py"
if dglbackend == "pytorch":
sphinx_gallery_conf["filename_pattern"] = "/*(?<!mx)\.py"
# sphinx-copybutton tool
copybutton_prompt_text = r'>>> |\.\.\. '
copybutton_prompt_text = r">>> |\.\.\. "
copybutton_prompt_is_regexp = True
from pytablewriter import RstGridTableWriter, MarkdownTableWriter
import numpy as np
import pandas as pd
from dgl import DGLGraph
from dgl.data.gnn_benchmark import AmazonCoBuy, CoraFull, Coauthor
from dgl.data.karate import KarateClub
from dgl.data.gindt import GINDataset
# from dgl.data.qm9 import QM9
from dgl.data import CitationGraphDataset, PPIDataset, RedditDataset, TUDataset
from dgl.data.bitcoinotc import BitcoinOTC
from dgl.data.gdelt import GDELT
from dgl.data.gindt import GINDataset
from dgl.data.gnn_benchmark import AmazonCoBuy, Coauthor, CoraFull
from dgl.data.icews18 import ICEWS18
from dgl.data.karate import KarateClub
from dgl.data.qm7b import QM7b
# from dgl.data.qm9 import QM9
from dgl.data import CitationGraphDataset, PPIDataset, RedditDataset, TUDataset
from pytablewriter import MarkdownTableWriter, RstGridTableWriter
ds_list = {
"BitcoinOTC": "BitcoinOTC()",
......@@ -40,9 +41,9 @@ writer = RstGridTableWriter()
# writer = MarkdownTableWriter()
extract_graph = lambda g: g if isinstance(g, DGLGraph) else g[0]
stat_list=[]
for k,v in ds_list.items():
print(k, ' ', v)
stat_list = []
for k, v in ds_list.items():
print(k, " ", v)
ds = eval(v.split("/")[0])
num_nodes = []
num_edges = []
......@@ -58,10 +59,10 @@ for k,v in ds_list.items():
"# of graphs": len(ds),
"Avg. # of nodes": np.mean(num_nodes),
"Avg. # of edges": np.mean(num_edges),
"Node field": ', '.join(list(gg.ndata.keys())),
"Edge field": ', '.join(list(gg.edata.keys())),
"Node field": ", ".join(list(gg.ndata.keys())),
"Edge field": ", ".join(list(gg.edata.keys())),
# "Graph field": ', '.join(ds[0][0].gdata.keys()) if hasattr(ds[0][0], "gdata") else "",
"Temporal": hasattr(ds, "is_temporal")
"Temporal": hasattr(ds, "is_temporal"),
}
stat_list.append(dd)
......
......@@ -26,15 +26,14 @@ def get_sddmm_kernels_gpu(idtypes, dtypes):
return ret
if __name__ == '__main__':
binary_path = 'libfeatgraph_kernels.so'
if __name__ == "__main__":
binary_path = "libfeatgraph_kernels.so"
kernels = []
idtypes = ['int32', 'int64']
dtypes = ['float16', 'float64', 'float32', 'int32', 'int64']
idtypes = ["int32", "int64"]
dtypes = ["float16", "float64", "float32", "int32", "int64"]
kernels += get_sddmm_kernels_gpu(idtypes, dtypes)
# build kernels and export the module to libfeatgraph_kernels.so
module = tvm.build(kernels, target='cuda', target_host='llvm')
module = tvm.build(kernels, target="cuda", target_host="llvm")
module.export_library(binary_path)
......@@ -4,8 +4,8 @@ from tvm import te
def sddmm_tree_reduction_gpu(idx_type, feat_type):
""" SDDMM kernels on GPU optimized with Tree Reduction.
"""SDDMM kernels on GPU optimized with Tree Reduction.
Parameters
----------
idx_type : str
......@@ -19,35 +19,40 @@ def sddmm_tree_reduction_gpu(idx_type, feat_type):
The result IRModule.
"""
# define vars and placeholders
nnz = te.var('nnz', idx_type)
num_rows = te.var('num_rows', idx_type)
num_cols = te.var('num_cols', idx_type)
H = te.var('num_heads', idx_type)
D = te.var('feat_len', idx_type)
row = te.placeholder((nnz,), idx_type, 'row')
col = te.placeholder((nnz,), idx_type, 'col')
ufeat = te.placeholder((num_rows, H, D), feat_type, 'ufeat')
vfeat = te.placeholder((num_cols, H, D), feat_type, 'vfeat')
nnz = te.var("nnz", idx_type)
num_rows = te.var("num_rows", idx_type)
num_cols = te.var("num_cols", idx_type)
H = te.var("num_heads", idx_type)
D = te.var("feat_len", idx_type)
row = te.placeholder((nnz,), idx_type, "row")
col = te.placeholder((nnz,), idx_type, "col")
ufeat = te.placeholder((num_rows, H, D), feat_type, "ufeat")
vfeat = te.placeholder((num_cols, H, D), feat_type, "vfeat")
# define edge computation function
def edge_func(eid, h, i):
k = te.reduce_axis((0, D), name='k')
k = te.reduce_axis((0, D), name="k")
return te.sum(ufeat[row[eid], h, k] * vfeat[col[eid], h, k], axis=k)
out = te.compute((nnz, H, tvm.tir.IntImm(idx_type, 1)), edge_func, name='out')
out = te.compute(
(nnz, H, tvm.tir.IntImm(idx_type, 1)), edge_func, name="out"
)
# define schedules
sched = te.create_schedule(out.op)
edge_axis, head_axis, _ = out.op.axis
reduce_axis = out.op.reduce_axis[0]
_, red_inner = sched[out].split(reduce_axis, factor=32)
edge_outer, edge_inner = sched[out].split(edge_axis, factor=32)
sched[out].bind(red_inner, te.thread_axis('threadIdx.x'))
sched[out].bind(edge_inner, te.thread_axis('threadIdx.y'))
sched[out].bind(edge_outer, te.thread_axis('blockIdx.x'))
sched[out].bind(head_axis, te.thread_axis('blockIdx.y'))
return tvm.lower(sched, [row, col, ufeat, vfeat, out],
name='SDDMMTreeReduction_{}_{}'.format(idx_type, feat_type))
sched[out].bind(red_inner, te.thread_axis("threadIdx.x"))
sched[out].bind(edge_inner, te.thread_axis("threadIdx.y"))
sched[out].bind(edge_outer, te.thread_axis("blockIdx.x"))
sched[out].bind(head_axis, te.thread_axis("blockIdx.y"))
return tvm.lower(
sched,
[row, col, ufeat, vfeat, out],
name="SDDMMTreeReduction_{}_{}".format(idx_type, feat_type),
)
if __name__ == '__main__':
kernel0 = sddmm_tree_reduction_gpu('int32', 'float32')
if __name__ == "__main__":
kernel0 = sddmm_tree_reduction_gpu("int32", "float32")
print(kernel0)
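For reference, a NumPy sketch (toy sizes, not part of the original script) of the dot-product SDDMM that the tree-reduction kernel above computes: for every nonzero (edge) e and head h, the gathered source and destination feature rows are reduced over the feature axis.

import numpy as np

nnz, num_rows, num_cols, H, D = 6, 4, 5, 2, 8
rng = np.random.default_rng(0)
row = rng.integers(0, num_rows, size=nnz)
col = rng.integers(0, num_cols, size=nnz)
ufeat = rng.standard_normal((num_rows, H, D)).astype("float32")
vfeat = rng.standard_normal((num_cols, H, D)).astype("float32")

# out[e, h, 0] = sum_k ufeat[row[e], h, k] * vfeat[col[e], h, k]
out = np.einsum("ehk,ehk->eh", ufeat[row], vfeat[col])[..., None]
print(out.shape)  # (6, 2, 1)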
import torch
import dgl
import dgl.backend as F
import torch
g = dgl.rand_graph(10, 15).int().to(torch.device(0))
gidx = g._graph
u = torch.rand((10,2,8), device=torch.device(0))
v = torch.rand((10,2,8), device=torch.device(0))
e = dgl.ops.gsddmm(g, 'dot', u, v)
u = torch.rand((10, 2, 8), device=torch.device(0))
v = torch.rand((10, 2, 8), device=torch.device(0))
e = dgl.ops.gsddmm(g, "dot", u, v)
print(e)
e = torch.zeros((15,2,1), device=torch.device(0))
e = torch.zeros((15, 2, 1), device=torch.device(0))
u = F.zerocopy_to_dgl_ndarray(u)
v = F.zerocopy_to_dgl_ndarray(v)
e = F.zerocopy_to_dgl_ndarray_for_write(e)
......
......@@ -22,13 +22,13 @@ networks with PyTorch.
"""
import os
os.environ['DGLBACKEND'] = 'pytorch'
import torch
import torch.nn as nn
import torch.nn.functional as F
os.environ["DGLBACKEND"] = "pytorch"
import dgl
import dgl.data
import torch
import torch.nn as nn
import torch.nn.functional as F
######################################################################
# Overview of Node Classification with GNN
......
......@@ -31,11 +31,11 @@ By the end of this tutorial you will be able to:
#
import os
os.environ['DGLBACKEND'] = 'pytorch'
import numpy as np
import torch
os.environ["DGLBACKEND"] = "pytorch"
import dgl
import numpy as np
import torch
g = dgl.graph(([0, 0, 0, 0, 0], [1, 2, 3, 4, 5]), num_nodes=6)
# Equivalently, PyTorch LongTensors also work.
......
......@@ -19,13 +19,13 @@ GNN for node classification <1_introduction>`.
"""
import os
os.environ['DGLBACKEND'] = 'pytorch'
import torch
import torch.nn as nn
import torch.nn.functional as F
os.environ["DGLBACKEND"] = "pytorch"
import dgl
import dgl.function as fn
import torch
import torch.nn as nn
import torch.nn.functional as F
######################################################################
# Message passing and GNNs
......
......@@ -19,17 +19,17 @@ By the end of this tutorial you will be able to
import itertools
import os
os.environ['DGLBACKEND'] = 'pytorch'
os.environ["DGLBACKEND"] = "pytorch"
import dgl
import dgl.data
import numpy as np
import scipy.sparse as sp
import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl
import dgl.data
######################################################################
# Overview of Link Prediction with GNN
# ------------------------------------
......
......@@ -14,13 +14,13 @@ By the end of this tutorial, you will be able to
"""
import os
os.environ['DGLBACKEND'] = 'pytorch'
import torch
import torch.nn as nn
import torch.nn.functional as F
os.environ["DGLBACKEND"] = "pytorch"
import dgl
import dgl.data
import torch
import torch.nn as nn
import torch.nn.functional as F
######################################################################
# Overview of Graph Classification with GNN
......@@ -54,6 +54,8 @@ print("Node feature dimensionality:", dataset.dim_nfeats)
print("Number of graph categories:", dataset.gclasses)
from dgl.dataloading import GraphDataLoader
######################################################################
# Defining Data Loader
# --------------------
......@@ -74,8 +76,6 @@ print("Number of graph categories:", dataset.gclasses)
from torch.utils.data.sampler import SubsetRandomSampler
from dgl.dataloading import GraphDataLoader
num_examples = len(dataset)
num_train = int(num_examples * 0.8)
......
......@@ -88,10 +88,10 @@ interactions.head()
#
import os
os.environ['DGLBACKEND'] = 'pytorch'
import torch
os.environ["DGLBACKEND"] = "pytorch"
import dgl
import torch
from dgl.data import DGLDataset
......
......@@ -26,10 +26,11 @@ Sampling for GNN Training <L0_neighbor_sampling_overview>`.
#
import os
os.environ['DGLBACKEND'] = 'pytorch'
os.environ["DGLBACKEND"] = "pytorch"
import dgl
import torch
import numpy as np
import torch
from ogb.nodeproppred import DglNodePropPredDataset
dataset = DglNodePropPredDataset("ogbn-arxiv")
......@@ -284,13 +285,14 @@ valid_dataloader = dgl.dataloading.DataLoader(
)
import sklearn.metrics
######################################################################
# The following is a training loop that performs validation every epoch.
# It also saves the model with the best validation accuracy into a file.
#
import tqdm
import sklearn.metrics
best_accuracy = 0
best_model_path = "model.pt"
......
......@@ -53,10 +53,11 @@ Sampling for Node Classification <L1_large_node_classification>`.
#
import os
os.environ['DGLBACKEND'] = 'pytorch'
os.environ["DGLBACKEND"] = "pytorch"
import dgl
import torch
import numpy as np
import torch
from ogb.nodeproppred import DglNodePropPredDataset
dataset = DglNodePropPredDataset("ogbn-arxiv")
......@@ -339,6 +340,8 @@ predictor = DotPredictor().to(device)
opt = torch.optim.Adam(list(model.parameters()) + list(predictor.parameters()))
import sklearn.metrics
######################################################################
# The following is the training loop for link prediction and
# evaluation, and also saves the model that performs the best on the
......@@ -346,7 +349,6 @@ opt = torch.optim.Adam(list(model.parameters()) + list(predictor.parameters()))
#
import tqdm
import sklearn.metrics
best_accuracy = 0
best_model_path = "model.pt"
......
......@@ -14,30 +14,33 @@ for stochastic GNN training. It assumes that
"""
import os
os.environ['DGLBACKEND'] = 'pytorch'
os.environ["DGLBACKEND"] = "pytorch"
import dgl
import torch
import numpy as np
import torch
from ogb.nodeproppred import DglNodePropPredDataset
dataset = DglNodePropPredDataset('ogbn-arxiv')
device = 'cpu' # change to 'cuda' for GPU
dataset = DglNodePropPredDataset("ogbn-arxiv")
device = "cpu" # change to 'cuda' for GPU
graph, node_labels = dataset[0]
# Add reverse edges since ogbn-arxiv is unidirectional.
graph = dgl.add_reverse_edges(graph)
graph.ndata['label'] = node_labels[:, 0]
graph.ndata["label"] = node_labels[:, 0]
idx_split = dataset.get_idx_split()
train_nids = idx_split['train']
node_features = graph.ndata['feat']
train_nids = idx_split["train"]
node_features = graph.ndata["feat"]
sampler = dgl.dataloading.MultiLayerNeighborSampler([4, 4])
train_dataloader = dgl.dataloading.DataLoader(
graph, train_nids, sampler,
graph,
train_nids,
sampler,
batch_size=1024,
shuffle=True,
drop_last=False,
num_workers=0
num_workers=0,
)
input_nodes, output_nodes, mfgs = next(iter(train_dataloader))
......@@ -75,8 +78,8 @@ print(mfg.num_src_nodes(), mfg.num_dst_nodes())
# will do with ``ndata`` on the graphs you have seen earlier:
#
mfg.srcdata['x'] = torch.zeros(mfg.num_src_nodes(), mfg.num_dst_nodes())
dst_feat = mfg.dstdata['feat']
mfg.srcdata["x"] = torch.zeros(mfg.num_src_nodes(), mfg.num_dst_nodes())
dst_feat = mfg.dstdata["feat"]
######################################################################
......@@ -105,7 +108,11 @@ mfg.srcdata[dgl.NID], mfg.dstdata[dgl.NID]
# .. |image1| image:: https://data.dgl.ai/tutorial/img/bipartite.gif
#
print(torch.equal(mfg.srcdata[dgl.NID][:mfg.num_dst_nodes()], mfg.dstdata[dgl.NID]))
print(
torch.equal(
mfg.srcdata[dgl.NID][: mfg.num_dst_nodes()], mfg.dstdata[dgl.NID]
)
)
######################################################################
......@@ -113,7 +120,7 @@ print(torch.equal(mfg.srcdata[dgl.NID][:mfg.num_dst_nodes()], mfg.dstdata[dgl.NI
# :math:`h_u^{(l-1)}`:
#
mfg.srcdata['h'] = torch.randn(mfg.num_src_nodes(), 10)
mfg.srcdata["h"] = torch.randn(mfg.num_src_nodes(), 10)
######################################################################
......@@ -132,8 +139,8 @@ mfg.srcdata['h'] = torch.randn(mfg.num_src_nodes(), 10)
import dgl.function as fn
mfg.update_all(message_func=fn.copy_u('h', 'm'), reduce_func=fn.mean('m', 'h'))
m_v = mfg.dstdata['h']
mfg.update_all(message_func=fn.copy_u("h", "m"), reduce_func=fn.mean("m", "h"))
m_v = mfg.dstdata["h"]
m_v
......@@ -147,6 +154,7 @@ import torch.nn as nn
import torch.nn.functional as F
import tqdm
class SAGEConv(nn.Module):
"""Graph convolution module used by the GraphSAGE model.
......@@ -157,6 +165,7 @@ class SAGEConv(nn.Module):
out_feat : int
Output feature size.
"""
def __init__(self, in_feat, out_feat):
super(SAGEConv, self).__init__()
# A linear submodule for projecting the input and neighbor feature to the output.
......@@ -174,14 +183,15 @@ class SAGEConv(nn.Module):
"""
with g.local_scope():
h_src, h_dst = h
g.srcdata['h'] = h_src # <---
g.dstdata['h'] = h_dst # <---
g.srcdata["h"] = h_src # <---
g.dstdata["h"] = h_dst # <---
# update_all is a message passing API.
g.update_all(fn.copy_u('h', 'm'), fn.mean('m', 'h_N'))
h_N = g.dstdata['h_N']
h_total = torch.cat([h_dst, h_N], dim=1) # <---
g.update_all(fn.copy_u("h", "m"), fn.mean("m", "h_N"))
h_N = g.dstdata["h_N"]
h_total = torch.cat([h_dst, h_N], dim=1) # <---
return self.linear(h_total)
class Model(nn.Module):
def __init__(self, in_feats, h_feats, num_classes):
super(Model, self).__init__()
......@@ -189,28 +199,31 @@ class Model(nn.Module):
self.conv2 = SAGEConv(h_feats, num_classes)
def forward(self, mfgs, x):
h_dst = x[:mfgs[0].num_dst_nodes()]
h_dst = x[: mfgs[0].num_dst_nodes()]
h = self.conv1(mfgs[0], (x, h_dst))
h = F.relu(h)
h_dst = h[:mfgs[1].num_dst_nodes()]
h_dst = h[: mfgs[1].num_dst_nodes()]
h = self.conv2(mfgs[1], (h, h_dst))
return h
sampler = dgl.dataloading.MultiLayerNeighborSampler([4, 4])
train_dataloader = dgl.dataloading.DataLoader(
graph, train_nids, sampler,
graph,
train_nids,
sampler,
device=device,
batch_size=1024,
shuffle=True,
drop_last=False,
num_workers=0
num_workers=0,
)
model = Model(graph.ndata['feat'].shape[1], 128, dataset.num_classes).to(device)
model = Model(graph.ndata["feat"].shape[1], 128, dataset.num_classes).to(device)
with tqdm.tqdm(train_dataloader) as tq:
for step, (input_nodes, output_nodes, mfgs) in enumerate(tq):
inputs = mfgs[0].srcdata['feat']
labels = mfgs[-1].dstdata['label']
inputs = mfgs[0].srcdata["feat"]
labels = mfgs[-1].dstdata["label"]
predictions = model(mfgs, inputs)
......@@ -232,6 +245,7 @@ with tqdm.tqdm(train_dataloader) as tq:
# Say you start with a GNN module that works for full-graph training only:
#
class SAGEConv(nn.Module):
"""Graph convolution module used by the GraphSAGE model.
......@@ -242,6 +256,7 @@ class SAGEConv(nn.Module):
out_feat : int
Output feature size.
"""
def __init__(self, in_feat, out_feat):
super().__init__()
# A linear submodule for projecting the input and neighbor feature to the output.
......@@ -258,10 +273,13 @@ class SAGEConv(nn.Module):
The input node feature.
"""
with g.local_scope():
g.ndata['h'] = h
g.ndata["h"] = h
# update_all is a message passing API.
g.update_all(message_func=fn.copy_u('h', 'm'), reduce_func=fn.mean('m', 'h_N'))
h_N = g.ndata['h_N']
g.update_all(
message_func=fn.copy_u("h", "m"),
reduce_func=fn.mean("m", "h_N"),
)
h_N = g.ndata["h_N"]
h_total = torch.cat([h, h_N], dim=1)
return self.linear(h_total)
......@@ -352,6 +370,7 @@ class SAGEConv(nn.Module):
# to something like the following:
#
class SAGEConvForBoth(nn.Module):
"""Graph convolution module used by the GraphSAGE model.
......@@ -362,6 +381,7 @@ class SAGEConvForBoth(nn.Module):
out_feat : int
Output feature size.
"""
def __init__(self, in_feat, out_feat):
super().__init__()
# A linear submodule for projecting the input and neighbor feature to the output.
......@@ -383,10 +403,13 @@ class SAGEConvForBoth(nn.Module):
else:
h_src = h_dst = h
g.srcdata['h'] = h_src
g.srcdata["h"] = h_src
# update_all is a message passing API.
g.update_all(message_func=fn.copy_u('h', 'm'), reduce_func=fn.mean('m', 'h_N'))
h_N = g.ndata['h_N']
g.update_all(
message_func=fn.copy_u("h", "m"),
reduce_func=fn.mean("m", "h_N"),
)
h_N = g.ndata["h_N"]
h_total = torch.cat([h_dst, h_N], dim=1)
return self.linear(h_total)
......
......@@ -20,189 +20,186 @@ Convolutional Networks <https://arxiv.org/pdf/1609.02907.pdf>`_). We explain
what is under the hood of the :class:`~dgl.nn.GraphConv` module.
The reader is expected to learn how to define a new GNN layer using DGL's
message passing APIs.
"""
###############################################################################
# Model Overview
# ------------------------------------------
# GCN from the perspective of message passing
# ```````````````````````````````````````````````
# We describe a layer of graph convolutional neural network from a message
# passing perspective; the math can be found `here <math_>`_.
# It boils down to the following steps, for each node :math:`u`:
#
# 1) Aggregate neighbors' representations :math:`h_{v}` to produce an
# intermediate representation :math:`\hat{h}_u`. 2) Transform the aggregated
# representation :math:`\hat{h}_{u}` with a linear projection followed by a
# non-linearity: :math:`h_{u} = f(W_{u} \hat{h}_u)`.
#
# We will implement step 1 with DGL message passing, and step 2 by
# PyTorch ``nn.Module``.
#
# GCN implementation with DGL
# ``````````````````````````````````````````
# We first define the message and reduce function as usual. Since the
# aggregation on a node :math:`u` only involves summing over the neighbors'
# representations :math:`h_v`, we can simply use builtin functions:
import os
os.environ['DGLBACKEND'] = 'pytorch'
import torch as th
import torch.nn as nn
import torch.nn.functional as F
import dgl
import dgl.function as fn
from dgl import DGLGraph
gcn_msg = fn.copy_u(u="h", out="m")
gcn_reduce = fn.sum(msg="m", out="h")
###############################################################################
# We then proceed to define the GCNLayer module. A GCNLayer essentially performs
# message passing on all the nodes then applies a fully-connected layer.
#
# .. note::
#
# This is showing how to implement a GCN from scratch. DGL provides a more
# efficient :class:`builtin GCN layer module <dgl.nn.pytorch.conv.GraphConv>`.
#
class GCNLayer(nn.Module):
def __init__(self, in_feats, out_feats):
super(GCNLayer, self).__init__()
self.linear = nn.Linear(in_feats, out_feats)
def forward(self, g, feature):
# Creating a local scope so that all the stored ndata and edata
# (such as the `'h'` ndata below) are automatically popped out
# when the scope exits.
with g.local_scope():
g.ndata["h"] = feature
g.update_all(gcn_msg, gcn_reduce)
h = g.ndata["h"]
return self.linear(h)
###############################################################################
# The forward function is essentially the same as in any other NN model
# commonly seen in PyTorch. We can initialize GCN like any ``nn.Module``. For example,
# let's define a simple neural network consisting of two GCN layers. Suppose we
# are training the classifier for the cora dataset (the input feature size is
# 1433 and the number of classes is 7). The last GCN layer computes node embeddings,
# so the last layer in general does not apply activation.
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.layer1 = GCNLayer(1433, 16)
self.layer2 = GCNLayer(16, 7)
def forward(self, g, features):
x = F.relu(self.layer1(g, features))
x = self.layer2(g, x)
return x
net = Net()
print(net)
###############################################################################
# We load the cora dataset using DGL's built-in data module.
from dgl.data import CoraGraphDataset
def load_cora_data():
dataset = CoraGraphDataset()
g = dataset[0]
features = g.ndata["feat"]
labels = g.ndata["label"]
train_mask = g.ndata["train_mask"]
test_mask = g.ndata["test_mask"]
return g, features, labels, train_mask, test_mask
###############################################################################
# When a model is trained, we can use the following method to evaluate
# the performance of the model on the test dataset:
def evaluate(model, g, features, labels, mask):
model.eval()
with th.no_grad():
logits = model(g, features)
logits = logits[mask]
labels = labels[mask]
_, indices = th.max(logits, dim=1)
correct = th.sum(indices == labels)
return correct.item() * 1.0 / len(labels)
###############################################################################
# We then train the network as follows:
import time
import numpy as np
g, features, labels, train_mask, test_mask = load_cora_data()
# Add edges between each node and itself to preserve old node representations
g.add_edges(g.nodes(), g.nodes())
optimizer = th.optim.Adam(net.parameters(), lr=1e-2)
dur = []
for epoch in range(50):
if epoch >= 3:
t0 = time.time()
net.train()
logits = net(g, features)
logp = F.log_softmax(logits, 1)
loss = F.nll_loss(logp[train_mask], labels[train_mask])
optimizer.zero_grad()
loss.backward()
optimizer.step()
if epoch >= 3:
dur.append(time.time() - t0)
acc = evaluate(net, g, features, labels, test_mask)
print(
"Epoch {:05d} | Loss {:.4f} | Test Acc {:.4f} | Time(s) {:.4f}".format(
epoch, loss.item(), acc, np.mean(dur)
)
)
###############################################################################
# .. _math:
#
# GCN in one formula
# ------------------
# Mathematically, the GCN model follows this formula:
#
# :math:`H^{(l+1)} = \sigma(\tilde{D}^{-\frac{1}{2}}\tilde{A}\tilde{D}^{-\frac{1}{2}}H^{(l)}W^{(l)})`
#
# Here, :math:`H^{(l)}` denotes the :math:`l^{th}` layer in the network,
# :math:`\sigma` is the non-linearity, and :math:`W` is the weight matrix for
# this layer. :math:`\tilde{D}` and :math:`\tilde{A}` are respectively the degree
# and adjacency matrices for the graph. With the superscript ~, we are referring
# to the variant where we add additional edges between each node and itself to
# preserve its old representation in graph convolutions. The shape of the input
# :math:`H^{(0)}` is :math:`N \times D`, where :math:`N` is the number of nodes
# and :math:`D` is the number of input features. We can chain up multiple
# layers as such to produce a node-level representation output with shape
# :math:`N \times F`, where :math:`F` is the dimension of the output node
# feature vector.
#
# The equation can be efficiently implemented using sparse matrix
# multiplication kernels (such as Kipf's
# `pygcn <https://github.com/tkipf/pygcn>`_ code). The above DGL implementation
# in fact has already used this trick due to the use of builtin functions.
#
# Note that the tutorial code implements a simplified version of GCN where we
# replace :math:`\tilde{D}^{-\frac{1}{2}}\tilde{A}\tilde{D}^{-\frac{1}{2}}` with
# :math:`\tilde{A}`. For a full implementation, see our example
# `here <https://github.com/dmlc/dgl/tree/master/examples/pytorch/gcn>`_.
"""
###############################################################################
# Model Overview
# ------------------------------------------
# GCN from the perspective of message passing
# ```````````````````````````````````````````````
# We describe a layer of graph convolutional neural network from a message
# passing perspective; the math can be found `here <math_>`_.
# It boils down to the following steps, for each node :math:`u`:
#
# 1) Aggregate neighbors' representations :math:`h_{v}` to produce an
# intermediate representation :math:`\hat{h}_u`. 2) Transform the aggregated
# representation :math:`\hat{h}_{u}` with a linear projection followed by a
# non-linearity: :math:`h_{u} = f(W_{u} \hat{h}_u)`.
#
# We will implement step 1 with DGL message passing, and step 2 by
# PyTorch ``nn.Module``.
#
# GCN implementation with DGL
# ``````````````````````````````````````````
# We first define the message and reduce function as usual. Since the
# aggregation on a node :math:`u` only involves summing over the neighbors'
# representations :math:`h_v`, we can simply use builtin functions:
import os
os.environ["DGLBACKEND"] = "pytorch"
import dgl
import dgl.function as fn
import torch as th
import torch.nn as nn
import torch.nn.functional as F
from dgl import DGLGraph
gcn_msg = fn.copy_u(u="h", out="m")
gcn_reduce = fn.sum(msg="m", out="h")
###############################################################################
# We then proceed to define the GCNLayer module. A GCNLayer essentially performs
# message passing on all the nodes then applies a fully-connected layer.
#
# .. note::
#
# This is showing how to implement a GCN from scratch. DGL provides a more
# efficient :class:`builtin GCN layer module <dgl.nn.pytorch.conv.GraphConv>`.
#
class GCNLayer(nn.Module):
def __init__(self, in_feats, out_feats):
super(GCNLayer, self).__init__()
self.linear = nn.Linear(in_feats, out_feats)
def forward(self, g, feature):
# Creating a local scope so that all the stored ndata and edata
# (such as the `'h'` ndata below) are automatically popped out
# when the scope exits.
with g.local_scope():
g.ndata["h"] = feature
g.update_all(gcn_msg, gcn_reduce)
h = g.ndata["h"]
return self.linear(h)
###############################################################################
# The forward function is essentially the same as in any other NN model
# commonly seen in PyTorch. We can initialize GCN like any ``nn.Module``. For example,
# let's define a simple neural network consisting of two GCN layers. Suppose we
# are training the classifier for the cora dataset (the input feature size is
# 1433 and the number of classes is 7). The last GCN layer computes node embeddings,
# so the last layer in general does not apply activation.
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.layer1 = GCNLayer(1433, 16)
self.layer2 = GCNLayer(16, 7)
def forward(self, g, features):
x = F.relu(self.layer1(g, features))
x = self.layer2(g, x)
return x
net = Net()
print(net)
###############################################################################
# We load the cora dataset using DGL's built-in data module.
from dgl.data import CoraGraphDataset
def load_cora_data():
dataset = CoraGraphDataset()
g = dataset[0]
features = g.ndata["feat"]
labels = g.ndata["label"]
train_mask = g.ndata["train_mask"]
test_mask = g.ndata["test_mask"]
return g, features, labels, train_mask, test_mask
###############################################################################
# When a model is trained, we can use the following method to evaluate
# the performance of the model on the test dataset:
def evaluate(model, g, features, labels, mask):
model.eval()
with th.no_grad():
logits = model(g, features)
logits = logits[mask]
labels = labels[mask]
_, indices = th.max(logits, dim=1)
correct = th.sum(indices == labels)
return correct.item() * 1.0 / len(labels)
###############################################################################
# We then train the network as follows:
import time
import numpy as np
g, features, labels, train_mask, test_mask = load_cora_data()
# Add edges between each node and itself to preserve old node representations
g.add_edges(g.nodes(), g.nodes())
optimizer = th.optim.Adam(net.parameters(), lr=1e-2)
dur = []
for epoch in range(50):
if epoch >= 3:
t0 = time.time()
net.train()
logits = net(g, features)
logp = F.log_softmax(logits, 1)
loss = F.nll_loss(logp[train_mask], labels[train_mask])
optimizer.zero_grad()
loss.backward()
optimizer.step()
if epoch >= 3:
dur.append(time.time() - t0)
acc = evaluate(net, g, features, labels, test_mask)
print(
"Epoch {:05d} | Loss {:.4f} | Test Acc {:.4f} | Time(s) {:.4f}".format(
epoch, loss.item(), acc, np.mean(dur)
)
)
###############################################################################
# .. _math:
#
# GCN in one formula
# ------------------
# Mathematically, the GCN model follows this formula:
#
# :math:`H^{(l+1)} = \sigma(\tilde{D}^{-\frac{1}{2}}\tilde{A}\tilde{D}^{-\frac{1}{2}}H^{(l)}W^{(l)})`
#
# Here, :math:`H^{(l)}` denotes the :math:`l^{th}` layer in the network,
# :math:`\sigma` is the non-linearity, and :math:`W` is the weight matrix for
# this layer. :math:`\tilde{D}` and :math:`\tilde{A}` are respectively the degree
# and adjacency matrices for the graph. With the superscript ~, we are referring
# to the variant where we add additional edges between each node and itself to
# preserve its old representation in graph convolutions. The shape of the input
# :math:`H^{(0)}` is :math:`N \times D`, where :math:`N` is the number of nodes
# and :math:`D` is the number of input features. We can chain up multiple
# layers as such to produce a node-level representation output with shape
# :math:`N \times F`, where :math:`F` is the dimension of the output node
# feature vector.
#
# The equation can be efficiently implemented using sparse matrix
# multiplication kernels (such as Kipf's
# `pygcn <https://github.com/tkipf/pygcn>`_ code). The above DGL implementation
# in fact has already used this trick due to the use of builtin functions.
#
# Note that the tutorial code implements a simplified version of GCN where we
# replace :math:`\tilde{D}^{-\frac{1}{2}}\tilde{A}\tilde{D}^{-\frac{1}{2}}` with
# :math:`\tilde{A}`. For a full implementation, see our example
# `here <https://github.com/dmlc/dgl/tree/master/examples/pytorch/gcn>`_.
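As a concrete check of the formula above (a dense toy example, not part of the tutorial): the symmetrically normalized propagation can be written directly with dense tensors, while the tutorial's builtin-function implementation corresponds to dropping the normalization and using the self-loop-augmented adjacency alone.

import torch

# toy 3-node graph; A_tilde = A + I adds the self-loops used in the tutorial
A = torch.tensor([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0], [0.0, 1.0, 0.0]])
A_tilde = A + torch.eye(3)
D_inv_sqrt = torch.diag(A_tilde.sum(dim=1).pow(-0.5))

H = torch.randn(3, 4)  # N x D input features
W = torch.randn(4, 2)  # layer weight
H_next = torch.relu(D_inv_sqrt @ A_tilde @ D_inv_sqrt @ H @ W)
print(H_next.shape)  # torch.Size([3, 2])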
......@@ -29,340 +29,389 @@ subject, relation, object. Edges thus encode important information and
have their own embeddings to be learned. Furthermore, there may exist
multiple edges among any given pair.
"""
###############################################################################
# A brief introduction to R-GCN
# ---------------------------
# In *statistical relational learning* (SRL), there are two fundamental
# tasks:
#
# - **Entity classification** - Where you assign types and categorical
# properties to entities.
# - **Link prediction** - Where you recover missing triples.
#
# In both cases, missing information is expected to be recovered from the
# neighborhood structure of the graph. For example, the R-GCN
# paper cited earlier provides the following example. Knowing that Mikhail Baryshnikov was educated at the Vaganova Academy
# implies both that Mikhail Baryshnikov should have the label person, and
# that the triple (Mikhail Baryshnikov, lived in, Russia) must belong to the
# knowledge graph.
#
# R-GCN solves these two problems using a common graph convolutional network. It's
# extended with multi-edge encoding to compute embeddings of the entities, but
# with different downstream processing.
#
# - Entity classification is done by attaching a softmax classifier at the
# final embedding of an entity (node). Training uses a standard
# cross-entropy loss.
# - Link prediction is done by reconstructing an edge with an autoencoder
# architecture, using a parameterized score function. Training uses negative
# sampling.
#
# This tutorial focuses on the first task, entity classification, to show how to generate entity
# representation. `Complete
# code <https://github.com/dmlc/dgl/tree/master/examples/pytorch/rgcn>`_
# for both tasks is found in the DGL Github repository.
#
# Key ideas of R-GCN
# -------------------
# Recall that in GCN, the hidden representation for each node :math:`i` at
# :math:`(l+1)^{th}` layer is computed by:
#
# .. math:: h_i^{l+1} = \sigma\left(\sum_{j\in N_i}\frac{1}{c_i} W^{(l)} h_j^{(l)}\right)~~~~~~~~~~(1)\\
#
# where :math:`c_i` is a normalization constant.
#
# The key difference between R-GCN and GCN is that in R-GCN, edges can
# represent different relations. In GCN, weight :math:`W^{(l)}` in equation
# :math:`(1)` is shared by all edges in layer :math:`l`. In contrast, in
# R-GCN, different edge types use different weights and only edges of the
# same relation type :math:`r` are associated with the same projection weight
# :math:`W_r^{(l)}`.
#
# So the hidden representation of entities in :math:`(l+1)^{th}` layer in
# R-GCN can be formulated as the following equation:
#
# .. math:: h_i^{l+1} = \sigma\left(W_0^{(l)}h_i^{(l)}+\sum_{r\in R}\sum_{j\in N_i^r}\frac{1}{c_{i,r}}W_r^{(l)}h_j^{(l)}\right)~~~~~~~~~~(2)\\
#
# where :math:`N_i^r` denotes the set of neighbor indices of node :math:`i`
# under relation :math:`r\in R` and :math:`c_{i,r}` is a normalization
# constant. In entity classification, the R-GCN paper uses
# :math:`c_{i,r}=|N_i^r|`.
#
# The problem of applying the above equation directly is the rapid growth of
# the number of parameters, especially with highly multi-relational data. In
# order to reduce model parameter size and prevent overfitting, the original
# paper proposes to use basis decomposition.
#
# .. math:: W_r^{(l)}=\sum\limits_{b=1}^B a_{rb}^{(l)}V_b^{(l)}~~~~~~~~~~(3)\\
#
# Therefore, the weight :math:`W_r^{(l)}` is a linear combination of basis
# transformation :math:`V_b^{(l)}` with coefficients :math:`a_{rb}^{(l)}`.
# The number of bases :math:`B` is much smaller than the number of relations
# in the knowledge base.
#
# .. note::
# Another weight regularization, block-decomposition, is implemented in
# the `link prediction <link-prediction_>`_.
#
# Implement R-GCN in DGL
# ----------------------
#
# An R-GCN model is composed of several R-GCN layers. The first R-GCN layer
# also serves as input layer and takes in features (for example, description texts)
# that are associated with the node entities and projects them to the hidden space. In this tutorial,
# we only use the entity ID as an entity feature.
#
# R-GCN layers
# ~~~~~~~~~~~~
#
# For each node, an R-GCN layer performs the following steps:
#
# - Compute outgoing message using node representation and weight matrix
# associated with the edge type (message function)
# - Aggregate incoming messages and generate new node representations (reduce
# and apply function)
#
# The following code is the definition of an R-GCN hidden layer.
#
# .. note::
# Each relation type is associated with a different weight. Therefore,
# the full weight matrix has three dimensions: relation, input_feature,
# output_feature.
#
# .. note::
#
# This is showing how to implement an R-GCN from scratch. DGL provides a more
# efficient :class:`builtin R-GCN layer module <dgl.nn.pytorch.conv.RelGraphConv>`.
#
import os
os.environ['DGLBACKEND'] = 'pytorch'
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl import DGLGraph
import dgl.function as fn
from functools import partial
class RGCNLayer(nn.Module):
def __init__(self, in_feat, out_feat, num_rels, num_bases=-1, bias=None,
activation=None, is_input_layer=False):
super(RGCNLayer, self).__init__()
self.in_feat = in_feat
self.out_feat = out_feat
self.num_rels = num_rels
self.num_bases = num_bases
self.bias = bias
self.activation = activation
self.is_input_layer = is_input_layer
# sanity check
if self.num_bases <= 0 or self.num_bases > self.num_rels:
self.num_bases = self.num_rels
# weight bases in equation (3)
self.weight = nn.Parameter(torch.Tensor(self.num_bases, self.in_feat,
self.out_feat))
if self.num_bases < self.num_rels:
# linear combination coefficients in equation (3)
self.w_comp = nn.Parameter(torch.Tensor(self.num_rels, self.num_bases))
# add bias
if self.bias:
self.bias = nn.Parameter(torch.Tensor(out_feat))
# init trainable parameters
nn.init.xavier_uniform_(self.weight,
gain=nn.init.calculate_gain('relu'))
if self.num_bases < self.num_rels:
nn.init.xavier_uniform_(self.w_comp,
gain=nn.init.calculate_gain('relu'))
if self.bias:
nn.init.xavier_uniform_(self.bias,
gain=nn.init.calculate_gain('relu'))
def forward(self, g):
if self.num_bases < self.num_rels:
# generate all weights from bases (equation (3))
weight = self.weight.view(self.in_feat, self.num_bases, self.out_feat)
weight = torch.matmul(self.w_comp, weight).view(self.num_rels,
self.in_feat, self.out_feat)
else:
weight = self.weight
if self.is_input_layer:
def message_func(edges):
# for input layer, matrix multiply can be converted to be
# an embedding lookup using source node id
embed = weight.view(-1, self.out_feat)
index = edges.data[dgl.ETYPE] * self.in_feat + edges.src['id']
return {'msg': embed[index] * edges.data['norm']}
else:
def message_func(edges):
w = weight[edges.data[dgl.ETYPE]]
msg = torch.bmm(edges.src['h'].unsqueeze(1), w).squeeze()
msg = msg * edges.data['norm']
return {'msg': msg}
def apply_func(nodes):
h = nodes.data['h']
if self.bias:
h = h + self.bias
if self.activation:
h = self.activation(h)
return {'h': h}
g.update_all(message_func, fn.sum(msg='msg', out='h'), apply_func)
###############################################################################
# Full R-GCN model defined
# ~~~~~~~~~~~~~~~~~~~~~~~~
class Model(nn.Module):
def __init__(self, num_nodes, h_dim, out_dim, num_rels,
num_bases=-1, num_hidden_layers=1):
super(Model, self).__init__()
self.num_nodes = num_nodes
self.h_dim = h_dim
self.out_dim = out_dim
self.num_rels = num_rels
self.num_bases = num_bases
self.num_hidden_layers = num_hidden_layers
# create rgcn layers
self.build_model()
# create initial features
self.features = self.create_features()
def build_model(self):
self.layers = nn.ModuleList()
# input to hidden
i2h = self.build_input_layer()
self.layers.append(i2h)
# hidden to hidden
for _ in range(self.num_hidden_layers):
h2h = self.build_hidden_layer()
self.layers.append(h2h)
# hidden to output
h2o = self.build_output_layer()
self.layers.append(h2o)
# initialize feature for each node
def create_features(self):
features = torch.arange(self.num_nodes)
return features
def build_input_layer(self):
return RGCNLayer(self.num_nodes, self.h_dim, self.num_rels, self.num_bases,
activation=F.relu, is_input_layer=True)
def build_hidden_layer(self):
return RGCNLayer(self.h_dim, self.h_dim, self.num_rels, self.num_bases,
activation=F.relu)
def build_output_layer(self):
return RGCNLayer(self.h_dim, self.out_dim, self.num_rels, self.num_bases,
activation=partial(F.softmax, dim=1))
def forward(self, g):
if self.features is not None:
g.ndata['id'] = self.features
for layer in self.layers:
layer(g)
return g.ndata.pop('h')
###############################################################################
# Handle dataset
# ~~~~~~~~~~~~~~~~
# This tutorial uses the Institute for Applied Informatics and Formal Description Methods (AIFB) dataset from the R-GCN paper.
# load graph data
dataset = dgl.data.rdf.AIFBDataset()
g = dataset[0]
category = dataset.predict_category
train_mask = g.nodes[category].data.pop('train_mask')
test_mask = g.nodes[category].data.pop('test_mask')
train_idx = torch.nonzero(train_mask, as_tuple=False).squeeze()
test_idx = torch.nonzero(test_mask, as_tuple=False).squeeze()
labels = g.nodes[category].data.pop('label')
num_rels = len(g.canonical_etypes)
num_classes = dataset.num_classes
# normalization factor
for cetype in g.canonical_etypes:
g.edges[cetype].data['norm'] = dgl.norm_by_dst(g, cetype).unsqueeze(1)
category_id = g.ntypes.index(category)
###############################################################################
# Create graph and model
# ~~~~~~~~~~~~~~~~~~~~~~~
# configurations
n_hidden = 16 # number of hidden units
n_bases = -1 # use number of relations as number of bases
n_hidden_layers = 0 # use 1 input layer, 1 output layer, no hidden layer
n_epochs = 25 # epochs to train
lr = 0.01 # learning rate
l2norm = 0 # L2 norm coefficient
# create graph
g = dgl.to_homogeneous(g, edata=['norm'])
node_ids = torch.arange(g.num_nodes())
target_idx = node_ids[g.ndata[dgl.NTYPE] == category_id]
# create model
model = Model(g.num_nodes(),
n_hidden,
num_classes,
num_rels,
num_bases=n_bases,
num_hidden_layers=n_hidden_layers)
###############################################################################
# Training loop
# ~~~~~~~~~~~~~~~~
# optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=l2norm)
print("start training...")
model.train()
for epoch in range(n_epochs):
optimizer.zero_grad()
logits = model.forward(g)
logits = logits[target_idx]
loss = F.cross_entropy(logits[train_idx], labels[train_idx])
loss.backward()
optimizer.step()
train_acc = torch.sum(logits[train_idx].argmax(dim=1) == labels[train_idx])
train_acc = train_acc.item() / len(train_idx)
val_loss = F.cross_entropy(logits[test_idx], labels[test_idx])
val_acc = torch.sum(logits[test_idx].argmax(dim=1) == labels[test_idx])
val_acc = val_acc.item() / len(test_idx)
print("Epoch {:05d} | ".format(epoch) +
"Train Accuracy: {:.4f} | Train Loss: {:.4f} | ".format(
train_acc, loss.item()) +
"Validation Accuracy: {:.4f} | Validation loss: {:.4f}".format(
val_acc, val_loss.item()))
###############################################################################
# .. _link-prediction:
#
# The second task, link prediction
# --------------------------------
# So far, you have seen how to use DGL to implement entity classification with an
# R-GCN model. In the knowledge base setting, representation generated by
# R-GCN can be used to uncover potential relationships between nodes. In the
# R-GCN paper, the authors feed the entity representations generated by R-GCN
# into the `DistMult <https://arxiv.org/pdf/1412.6575.pdf>`_ prediction model
# to predict possible relationships.
#
# The implementation is similar to that presented here, but with an extra DistMult layer
# stacked on top of the R-GCN layers. You can find the complete
# implementation of link prediction with R-GCN in our `Github Python code
# example <https://github.com/dmlc/dgl/blob/master/examples/pytorch/rgcn/link.py>`_.
"""
###############################################################################
# A brief introduction to R-GCN
# -----------------------------
# In *statistical relational learning* (SRL), there are two fundamental
# tasks:
#
# - **Entity classification** - Where you assign types and categorical
# properties to entities.
# - **Link prediction** - Where you recover missing triples.
#
# In both cases, missing information is expected to be recovered from the
# neighborhood structure of the graph. For example, the R-GCN
# paper cited earlier provides the following example. Knowing that Mikhail Baryshnikov was educated at the Vaganova Academy
# implies both that Mikhail Baryshnikov should have the label person, and
# that the triple (Mikhail Baryshnikov, lived in, Russia) must belong to the
# knowledge graph.
#
# R-GCN solves these two problems using a common graph convolutional network. It's
# extended with multi-edge encoding to compute embeddings of the entities, but
# with different downstream processing.
#
# - Entity classification is done by attaching a softmax classifier at the
# final embedding of an entity (node). Training uses a standard
# cross-entropy loss.
# - Link prediction is done by reconstructing an edge with an autoencoder
# architecture, using a parameterized score function. Training uses negative
# sampling.
#
# This tutorial focuses on the first task, entity classification, to show how to generate entity
# representation. `Complete
# code <https://github.com/dmlc/dgl/tree/master/examples/pytorch/rgcn>`_
# for both tasks is found in the DGL Github repository.
#
# Key ideas of R-GCN
# -------------------
# Recall that in GCN, the hidden representation for each node :math:`i` at
# :math:`(l+1)^{th}` layer is computed by:
#
# .. math:: h_i^{l+1} = \sigma\left(\sum_{j\in N_i}\frac{1}{c_i} W^{(l)} h_j^{(l)}\right)~~~~~~~~~~(1)\\
#
# where :math:`c_i` is a normalization constant.
#
# The key difference between R-GCN and GCN is that in R-GCN, edges can
# represent different relations. In GCN, weight :math:`W^{(l)}` in equation
# :math:`(1)` is shared by all edges in layer :math:`l`. In contrast, in
# R-GCN, different edge types use different weights and only edges of the
# same relation type :math:`r` are associated with the same projection weight
# :math:`W_r^{(l)}`.
#
# So the hidden representation of entities in :math:`(l+1)^{th}` layer in
# R-GCN can be formulated as the following equation:
#
# .. math:: h_i^{l+1} = \sigma\left(W_0^{(l)}h_i^{(l)}+\sum_{r\in R}\sum_{j\in N_i^r}\frac{1}{c_{i,r}}W_r^{(l)}h_j^{(l)}\right)~~~~~~~~~~(2)\\
#
# where :math:`N_i^r` denotes the set of neighbor indices of node :math:`i`
# under relation :math:`r\in R` and :math:`c_{i,r}` is a normalization
# constant. In entity classification, the R-GCN paper uses
# :math:`c_{i,r}=|N_i^r|`.
#
# The problem of applying the above equation directly is the rapid growth of
# the number of parameters, especially with highly multi-relational data. In
# order to reduce model parameter size and prevent overfitting, the original
# paper proposes to use basis decomposition.
#
# .. math:: W_r^{(l)}=\sum\limits_{b=1}^B a_{rb}^{(l)}V_b^{(l)}~~~~~~~~~~(3)\\
#
# Therefore, the weight :math:`W_r^{(l)}` is a linear combination of basis
# transformation :math:`V_b^{(l)}` with coefficients :math:`a_{rb}^{(l)}`.
# The number of bases :math:`B` is much smaller than the number of relations
# in the knowledge base.
#
# .. note::
# Another weight regularization, block-decomposition, is implemented in
# the `link prediction <link-prediction_>`_.
#
# Implement R-GCN in DGL
# ----------------------
#
# An R-GCN model is composed of several R-GCN layers. The first R-GCN layer
# also serves as input layer and takes in features (for example, description texts)
# that are associated with the node entities and projects them to the hidden space. In this tutorial,
# we only use the entity ID as an entity feature.
#
# R-GCN layers
# ~~~~~~~~~~~~
#
# For each node, an R-GCN layer performs the following steps:
#
# - Compute outgoing message using node representation and weight matrix
# associated with the edge type (message function)
# - Aggregate incoming messages and generate new node representations (reduce
# and apply function)
#
# The following code is the definition of an R-GCN hidden layer.
#
# .. note::
# Each relation type is associated with a different weight. Therefore,
# the full weight matrix has three dimensions: relation, input_feature,
# output_feature.
#
# .. note::
#
# This is showing how to implement an R-GCN from scratch. DGL provides a more
# efficient :class:`builtin R-GCN layer module <dgl.nn.pytorch.conv.RelGraphConv>`.
#
import os
os.environ["DGLBACKEND"] = "pytorch"
from functools import partial
import dgl
import dgl.function as fn
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl import DGLGraph
class RGCNLayer(nn.Module):
def __init__(
self,
in_feat,
out_feat,
num_rels,
num_bases=-1,
bias=None,
activation=None,
is_input_layer=False,
):
super(RGCNLayer, self).__init__()
self.in_feat = in_feat
self.out_feat = out_feat
self.num_rels = num_rels
self.num_bases = num_bases
self.bias = bias
self.activation = activation
self.is_input_layer = is_input_layer
# sanity check
if self.num_bases <= 0 or self.num_bases > self.num_rels:
self.num_bases = self.num_rels
# weight bases in equation (3)
self.weight = nn.Parameter(
torch.Tensor(self.num_bases, self.in_feat, self.out_feat)
)
if self.num_bases < self.num_rels:
# linear combination coefficients in equation (3)
self.w_comp = nn.Parameter(
torch.Tensor(self.num_rels, self.num_bases)
)
# add bias
if self.bias:
self.bias = nn.Parameter(torch.Tensor(out_feat))
# init trainable parameters
nn.init.xavier_uniform_(
self.weight, gain=nn.init.calculate_gain("relu")
)
if self.num_bases < self.num_rels:
nn.init.xavier_uniform_(
self.w_comp, gain=nn.init.calculate_gain("relu")
)
if self.bias:
nn.init.xavier_uniform_(
self.bias, gain=nn.init.calculate_gain("relu")
)
def forward(self, g):
if self.num_bases < self.num_rels:
# generate all weights from bases (equation (3))
weight = self.weight.view(
self.in_feat, self.num_bases, self.out_feat
)
weight = torch.matmul(self.w_comp, weight).view(
self.num_rels, self.in_feat, self.out_feat
)
else:
weight = self.weight
if self.is_input_layer:
def message_func(edges):
# for input layer, matrix multiply can be converted to be
# an embedding lookup using source node id
embed = weight.view(-1, self.out_feat)
index = edges.data[dgl.ETYPE] * self.in_feat + edges.src["id"]
return {"msg": embed[index] * edges.data["norm"]}
else:
def message_func(edges):
w = weight[edges.data[dgl.ETYPE]]
msg = torch.bmm(edges.src["h"].unsqueeze(1), w).squeeze()
msg = msg * edges.data["norm"]
return {"msg": msg}
def apply_func(nodes):
h = nodes.data["h"]
if self.bias:
h = h + self.bias
if self.activation:
h = self.activation(h)
return {"h": h}
g.update_all(message_func, fn.sum(msg="msg", out="h"), apply_func)
###############################################################################
# Full R-GCN model defined
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
class Model(nn.Module):
def __init__(
self,
num_nodes,
h_dim,
out_dim,
num_rels,
num_bases=-1,
num_hidden_layers=1,
):
super(Model, self).__init__()
self.num_nodes = num_nodes
self.h_dim = h_dim
self.out_dim = out_dim
self.num_rels = num_rels
self.num_bases = num_bases
self.num_hidden_layers = num_hidden_layers
# create rgcn layers
self.build_model()
# create initial features
self.features = self.create_features()
def build_model(self):
self.layers = nn.ModuleList()
# input to hidden
i2h = self.build_input_layer()
self.layers.append(i2h)
# hidden to hidden
for _ in range(self.num_hidden_layers):
h2h = self.build_hidden_layer()
self.layers.append(h2h)
# hidden to output
h2o = self.build_output_layer()
self.layers.append(h2o)
# initialize feature for each node
def create_features(self):
features = torch.arange(self.num_nodes)
return features
def build_input_layer(self):
return RGCNLayer(
self.num_nodes,
self.h_dim,
self.num_rels,
self.num_bases,
activation=F.relu,
is_input_layer=True,
)
def build_hidden_layer(self):
return RGCNLayer(
self.h_dim,
self.h_dim,
self.num_rels,
self.num_bases,
activation=F.relu,
)
def build_output_layer(self):
return RGCNLayer(
self.h_dim,
self.out_dim,
self.num_rels,
self.num_bases,
activation=partial(F.softmax, dim=1),
)
def forward(self, g):
if self.features is not None:
g.ndata["id"] = self.features
for layer in self.layers:
layer(g)
return g.ndata.pop("h")
###############################################################################
# Handle dataset
# ~~~~~~~~~~~~~~~~
# This tutorial uses the Institute for Applied Informatics and Formal Description Methods (AIFB) dataset from the R-GCN paper.
# load graph data
dataset = dgl.data.rdf.AIFBDataset()
g = dataset[0]
category = dataset.predict_category
train_mask = g.nodes[category].data.pop("train_mask")
test_mask = g.nodes[category].data.pop("test_mask")
train_idx = torch.nonzero(train_mask, as_tuple=False).squeeze()
test_idx = torch.nonzero(test_mask, as_tuple=False).squeeze()
labels = g.nodes[category].data.pop("label")
num_rels = len(g.canonical_etypes)
num_classes = dataset.num_classes
# normalization factor
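# dgl.norm_by_dst assigns each edge 1 / in-degree of its destination node under
# that relation, i.e. the 1 / c_{i,r} = 1 / |N_i^r| normalization in equation (2)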
for cetype in g.canonical_etypes:
g.edges[cetype].data["norm"] = dgl.norm_by_dst(g, cetype).unsqueeze(1)
category_id = g.ntypes.index(category)
###############################################################################
# Create graph and model
# ~~~~~~~~~~~~~~~~~~~~~~~
# configurations
n_hidden = 16 # number of hidden units
n_bases = -1 # use number of relations as number of bases
n_hidden_layers = 0 # use 1 input layer, 1 output layer, no hidden layer
n_epochs = 25 # epochs to train
lr = 0.01 # learning rate
l2norm = 0 # L2 norm coefficient
# create graph
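# to_homogeneous keeps the original type information in g.ndata[dgl.NTYPE] and
# g.edata[dgl.ETYPE], which are used below to locate the target nodes and to pick
# relation-specific weights inside RGCNLayer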
g = dgl.to_homogeneous(g, edata=["norm"])
node_ids = torch.arange(g.num_nodes())
target_idx = node_ids[g.ndata[dgl.NTYPE] == category_id]
# create model
model = Model(
g.num_nodes(),
n_hidden,
num_classes,
num_rels,
num_bases=n_bases,
num_hidden_layers=n_hidden_layers,
)
###############################################################################
# Training loop
# ~~~~~~~~~~~~~~~~
# optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=l2norm)
print("start training...")
model.train()
for epoch in range(n_epochs):
optimizer.zero_grad()
logits = model.forward(g)
logits = logits[target_idx]
loss = F.cross_entropy(logits[train_idx], labels[train_idx])
loss.backward()
optimizer.step()
train_acc = torch.sum(logits[train_idx].argmax(dim=1) == labels[train_idx])
train_acc = train_acc.item() / len(train_idx)
val_loss = F.cross_entropy(logits[test_idx], labels[test_idx])
val_acc = torch.sum(logits[test_idx].argmax(dim=1) == labels[test_idx])
val_acc = val_acc.item() / len(test_idx)
print(
"Epoch {:05d} | ".format(epoch)
+ "Train Accuracy: {:.4f} | Train Loss: {:.4f} | ".format(
train_acc, loss.item()
)
+ "Validation Accuracy: {:.4f} | Validation loss: {:.4f}".format(
val_acc, val_loss.item()
)
)
###############################################################################
# .. _link-prediction:
#
# The second task, link prediction
# --------------------------------
# So far, you have seen how to use DGL to implement entity classification with an
# R-GCN model. In the knowledge base setting, the representations generated by
# R-GCN can be used to uncover potential relationships between nodes. In the
# R-GCN paper, the authors feed the entity representations generated by R-GCN
# into the `DistMult <https://arxiv.org/pdf/1412.6575.pdf>`_ prediction model
# to predict possible relationships.
#
# The implementation is similar to that presented here, but with an extra DistMult layer
# stacked on top of the R-GCN layers. You can find the complete
# implementation of link prediction with R-GCN in our `Github Python code
# example <https://github.com/dmlc/dgl/blob/master/examples/pytorch/rgcn/link.py>`_.
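#
# As a rough sketch (not the code in that example), the DistMult score of a
# triplet :math:`(s, r, o)` is a three-way inner product between the subject
# embedding, a learned per-relation vector, and the object embedding:
#
# ::
#
#    import torch
#
#    def distmult_score(h_s, w_r, h_o):
#        # h_s, h_o: (batch, dim) entity embeddings produced by R-GCN
#        # w_r: (batch, dim) relation vectors (the diagonal of DistMult's R_r)
#        return (h_s * w_r * h_o).sum(dim=-1)
#
# Higher scores mean the triplet is more likely to hold; training contrasts
# observed triplets against negative samples.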
......@@ -14,612 +14,640 @@ Line Graph Neural Network
efficiency. For recommended implementation, please refer to the `official
examples <https://github.com/dmlc/dgl/tree/master/examples>`_.
"""
###########################################################################################
#
# In this tutorial, you learn how to solve community detection tasks by implementing a line
# graph neural network (LGNN). Community detection, or graph clustering, consists of partitioning
# the vertices in a graph into clusters in which nodes are more similar to
# one another.
#
# In the :doc:`Graph convolutional network tutorial <1_gcn>`, you learned how to classify the nodes of an input
# graph in a semi-supervised setting. You used a graph convolutional neural network (GCN)
# as an embedding mechanism for graph features.
#
# To generalize a graph neural network (GNN) into supervised community detection, a line-graph based
# variation of GNN is introduced in the research paper
# `Supervised Community Detection with Line Graph Neural Networks <https://arxiv.org/abs/1705.08415>`__.
# One of the highlights of the model is
# to augment the straightforward GNN architecture so that it operates on
# a line graph of edge adjacencies, defined with a non-backtracking operator.
#
# A line graph neural network (LGNN) shows how DGL can implement an advanced graph algorithm by
# mixing basic tensor operations, sparse-matrix multiplication, and message-
# passing APIs.
#
# In the following sections, you learn about community detection, line
# graphs, LGNN, and its implementation.
#
# Supervised community detection task with the Cora dataset
# ------------------------------------------------------------
# Community detection
# ~~~~~~~~~~~~~~~~~~~~
# In a community detection task, you cluster similar nodes instead of
# labeling them. Node similarity is typically described as a higher connection
# density within each cluster than across clusters.
#
# What's the difference between community detection and node classification?
# Compared to node classification, community detection focuses on retrieving
# cluster information from the graph, rather than assigning a specific label to
# a node. For example, as long as a node is clustered with its community
# members, it doesn't matter whether the node is assigned "community A"
# or "community B", whereas assigning all "great movies" the label "bad movies"
# would be a disaster in a movie network classification task.
#
# What's the difference, then, between a community detection algorithm and
# another clustering algorithm such as k-means? A community detection algorithm operates on
# graph-structured data. Compared to k-means, community detection leverages the
# graph structure, instead of simply clustering nodes based on their
# features.
#
# Cora dataset
# ~~~~~~~~~~~~
# To be consistent with the GCN tutorial,
# you use the `Cora dataset <https://linqs.soe.ucsc.edu/data>`__
# to illustrate a simple community detection task. Cora is a scientific publication dataset,
# with 2708 papers belonging to seven
# different machine learning fields. Here, you formulate Cora as a
# directed graph, with each node being a paper, and each edge being a
# citation link (A->B means A cites B). Here is a visualization of the whole
# Cora dataset.
#
# .. figure:: https://i.imgur.com/X404Byc.png
# :alt: cora
# :height: 400px
# :width: 500px
# :align: center
#
# Cora naturally contains seven classes, and the statistics below show that each
# class does satisfy our assumption of a community, i.e. nodes of the same
# class have a higher probability of being connected to each other than to nodes of a different class.
# The following code snippet verifies that there are more intra-class edges
# than inter-class.
import os
os.environ['DGLBACKEND'] = 'pytorch'
import torch
import torch as th
import torch.nn as nn
import torch.nn.functional as F
import dgl
from dgl.data import citation_graph as citegrh
data = citegrh.load_cora()
G = data[0]
labels = th.tensor(G.ndata['label'])
# find all the nodes labeled with class 0
label0_nodes = th.nonzero(labels == 0, as_tuple=False).squeeze()
# find all the edges pointing to class 0 nodes
src, _ = G.in_edges(label0_nodes)
src_labels = labels[src]
# find all the edges whose both endpoints are in class 0
intra_src = th.nonzero(src_labels == 0, as_tuple=False)
print('Intra-class edges percent: %.4f' % (len(intra_src) / len(src_labels)))
###########################################################################################
# Binary community subgraph from Cora with a test dataset
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Without loss of generality, in this tutorial you limit the scope of the
# task to binary community detection.
#
# .. note::
#
# To create a practice binary-community dataset from Cora, first extract
# all two-class pairs from the original Cora seven classes. For each pair, you
# treat each class as one community, and find the largest subgraph that
# at least contains one cross-community edge as the training example. As
# a result, there are a total of 21 training samples in this small dataset.
#
# With the following code, you can visualize one of the training samples and its community structure.
import networkx as nx
import matplotlib.pyplot as plt
train_set = dgl.data.CoraBinary()
G1, pmpd1, label1 = train_set[1]
nx_G1 = G1.to_networkx()
def visualize(labels, g):
pos = nx.spring_layout(g, seed=1)
plt.figure(figsize=(8, 8))
plt.axis('off')
nx.draw_networkx(g, pos=pos, node_size=50, cmap=plt.get_cmap('coolwarm'),
node_color=labels, edge_color='k',
arrows=False, width=0.5, style='dotted', with_labels=False)
visualize(label1, nx_G1)
###########################################################################################
# To learn more, go to the original research paper to see how to generalize
# to the multiple-community case.
#
# Community detection in a supervised setting
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# The community detection problem could be tackled with both supervised and
# unsupervised approaches. You can formulate
# community detection in a supervised setting as follows:
#
# - Each training example consists of :math:`(G, L)`, where :math:`G` is a
# directed graph :math:`(V, E)`. For each node :math:`v` in :math:`V`, we
# assign a ground truth community label :math:`z_v \in \{0,1\}`.
# - The parameterized model :math:`f(G, \theta)` predicts a label set
# :math:`\tilde{Z} = f(G)` for nodes :math:`V`.
# - For each example :math:`(G,L)`, the model learns to minimize a specially
#   designed loss function (the equivariant loss) :math:`L_{equivariant}(\tilde{Z}, Z)`.
#
# .. note::
#
# In this supervised setting, the model naturally predicts a label for
# each community. However, community assignment should be equivariant to
# label permutations. To achieve this, in each forward process, we take
# the minimum among losses calculated from all possible permutations of
# labels.
#
# Mathematically, this means
# :math:`L_{equivariant} = \underset{\pi \in S_c} {min}-\log(\hat{\pi}, \pi)`,
# where :math:`S_c` is the set of all permutations of labels, and
# :math:`\hat{\pi}` is the set of predicted labels,
# :math:`- \log(\hat{\pi},\pi)` denotes negative log likelihood.
#
#    For instance, for a sample graph with nodes :math:`\{1,2,3,4\}` and
#    community assignment :math:`\{A, A, A, B\}`, with each node's label
#    :math:`l \in \{0,1\}`, the set of all possible label permutations is
#    :math:`S_c = \{\{0,0,0,1\}, \{1,1,1,0\}\}`.
#
# Line graph neural network key ideas
# ------------------------------------
# A key innovation in this topic is the use of a line graph.
# Unlike models in previous tutorials, message passing happens not only on the
# original graph, e.g. the binary community subgraph from Cora, but also on the
# line graph associated with the original graph.
#
# What is a line-graph?
# ~~~~~~~~~~~~~~~~~~~~~
# In graph theory, a line graph is a graph representation that encodes the
# edge adjacency structure in the original graph.
#
# Specifically, a line-graph :math:`L(G)` turns an edge of the original graph `G`
# into a node. This is illustrated with the graph below (taken from the
# research paper).
#
# .. figure:: https://i.imgur.com/4WO5jEm.png
# :alt: lg
# :align: center
#
# Here, :math:`e_{A}:= (i\rightarrow j)` and :math:`e_{B}:= (j\rightarrow k)`
# are two edges in the original graph :math:`G`. In line graph :math:`G_L`,
# they correspond to nodes :math:`v^{l}_{A}, v^{l}_{B}`.
#
# The next natural question is, how to connect nodes in line-graph? How to
# connect two edges? Here, we use the following connection rule:
#
# Two nodes :math:`v^{l}_{A}`, :math:`v^{l}_{B}` in `lg` are connected if
# the corresponding two edges :math:`e_{A}, e_{B}` in `g` share one and only
# one node:
# :math:`e_{A}`'s destination node is :math:`e_{B}`'s source node
# (:math:`j`).
#
# .. note::
#
# Mathematically, this definition corresponds to a notion called non-backtracking
# operator:
# :math:`B_{(i \rightarrow j), (\hat{i} \rightarrow \hat{j})}`
# :math:`= \begin{cases}
# 1 \text{ if } j = \hat{i}, \hat{j} \neq i\\
# 0 \text{ otherwise} \end{cases}`
# where an edge is formed if :math:`B_{node1, node2} = 1`.
#
#
# One layer in LGNN, algorithm structure
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# LGNN chains together a series of line graph neural network layers. The graph
# representation :math:`x` and its line graph companion :math:`y` evolve with
# the dataflow as follows.
#
# .. figure:: https://i.imgur.com/bZGGIGp.png
# :alt: alg
# :align: center
#
# At the :math:`k`-th layer, the :math:`i`-th neuron of the :math:`l`-th
# channel updates its embedding :math:`x^{(k+1)}_{i,l}` with:
#
# .. math::
# \begin{split}
# x^{(k+1)}_{i,l} ={}&\rho[x^{(k)}_{i}\theta^{(k)}_{1,l}
# +(Dx^{(k)})_{i}\theta^{(k)}_{2,l} \\
# &+\sum^{J-1}_{j=0}(A^{2^{j}}x^{k})_{i}\theta^{(k)}_{3+j,l}\\
# &+[\{\text{Pm},\text{Pd}\}y^{(k)}]_{i}\theta^{(k)}_{3+J,l}] \\
# &+\text{skip-connection}
# \qquad i \in V, l = 1,2,3, ... b_{k+1}/2
# \end{split}
#
# Then, the line-graph representation :math:`y^{(k+1)}_{i,l}` with,
#
# .. math::
#
# \begin{split}
# y^{(k+1)}_{i',l^{'}} = {}&\rho[y^{(k)}_{i^{'}}\gamma^{(k)}_{1,l^{'}}+
# (D_{L(G)}y^{(k)})_{i^{'}}\gamma^{(k)}_{2,l^{'}}\\
# &+\sum^{J-1}_{j=0}(A_{L(G)}^{2^{j}}y^{k})_{i}\gamma^{(k)}_{3+j,l^{'}}\\
# &+[\{\text{Pm},\text{Pd}\}^{T}x^{(k+1)}]_{i^{'}}\gamma^{(k)}_{3+J,l^{'}}]\\
# &+\text{skip-connection}
# \qquad i^{'} \in V_{l}, l^{'} = 1,2,3, ... b^{'}_{k+1}/2
# \end{split}
#
# Where :math:`\text{skip-connection}` refers to performing the same operation without the non-linearity
# :math:`\rho`, and with linear projection :math:`\theta_{\frac{b_{k+1}}{2} + 1, ..., b_{k+1}-1, b_{k+1}}`
# and :math:`\gamma_{\frac{b_{k+1}}{2} + 1, ..., b_{k+1}-1, b_{k+1}}`.
#
# Implement LGNN in DGL
# ---------------------
# Even though the equations in the previous section might seem intimidating,
# it helps to understand the following information before you implement the LGNN.
#
# The two equations are symmetric and can be implemented as two instances
# of the same class with different parameters.
# The first equation operates on graph representation :math:`x`,
# whereas the second operates on line-graph
# representation :math:`y`. Let us denote this abstraction as :math:`f`. Then
# the first is :math:`f(x,y; \theta_x)`, and the second
# is :math:`f(y,x, \theta_y)`. That is, they are parameterized to compute
# representations of the original graph and its
# companion line graph, respectively.
#
# Each equation consists of four terms. Take the first one as an example, which follows.
#
# - :math:`x^{(k)}\theta^{(k)}_{1,l}`, a linear projection of previous
# layer's output :math:`x^{(k)}`, denote as :math:`\text{prev}(x)`.
# - :math:`(Dx^{(k)})\theta^{(k)}_{2,l}`, a linear projection of degree
# operator on :math:`x^{(k)}`, denote as :math:`\text{deg}(x)`.
# - :math:`\sum^{J-1}_{j=0}(A^{2^{j}}x^{(k)})\theta^{(k)}_{3+j,l}`,
# a summation of :math:`2^{j}` adjacency operator on :math:`x^{(k)}`,
# denote as :math:`\text{radius}(x)`
# - :math:`[\{Pm,Pd\}y^{(k)}]\theta^{(k)}_{3+J,l}`, fusing another
# graph's embedding information using incidence matrix
# :math:`\{Pm, Pd\}`, followed with a linear projection,
# denote as :math:`\text{fuse}(y)`.
#
# Each of the terms are performed again with different
# parameters, and without the nonlinearity after the sum.
# Therefore, :math:`f` could be written as:
#
# .. math::
#    \begin{split}
#    f(x^{(k)},y^{(k)}) = {}\rho[&\text{prev}(x^{(k)}) + \text{deg}(x^{(k)}) +\text{radius}(x^{(k)})
#    +\text{fuse}(y^{(k)})]\\
#    +&\text{prev}(x^{(k)}) + \text{deg}(x^{(k)}) +\text{radius}(x^{(k)}) +\text{fuse}(y^{(k)})
#    \end{split}
#
# Two equations are chained-up in the following order:
#
# .. math::
# \begin{split}
# x^{(k+1)} = {}& f(x^{(k)}, y^{(k)})\\
# y^{(k+1)} = {}& f(y^{(k)}, x^{(k+1)})
# \end{split}
#
# Keep in mind the listed observations in this overview and proceed to implementation.
# An important point is that you use different strategies for the noted terms.
#
# .. note::
#    You can understand :math:`\{Pm, Pd\}` more thoroughly with this explanation.
#    Roughly speaking, the way :math:`g` and :math:`lg` (the line graph)
#    work together is related to loopy belief propagation.
#    Here, you implement :math:`\{Pm, Pd\}` as a SciPy COO sparse matrix in the dataset,
#    and stack them as tensors when batching. Another batching solution is to
#    treat :math:`\{Pm, Pd\}` as the adjacency matrix of a bipartite graph, which maps
#    the line graph's features to the graph's, and vice versa.
#
# Implementing :math:`\text{prev}` and :math:`\text{deg}` as tensor operation
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Linear projection and degree operation are both simply matrix
# multiplications. Write them as PyTorch tensor operations.
#
# In ``__init__``, you define the projection variables.
#
# ::
#
# self.linear_prev = nn.Linear(in_feats, out_feats)
# self.linear_deg = nn.Linear(in_feats, out_feats)
#
#
# In ``forward()``, :math:`\text{prev}` and :math:`\text{deg}` are the same
# as any other PyTorch tensor operations.
#
# ::
#
# prev_proj = self.linear_prev(feat_a)
# deg_proj = self.linear_deg(deg * feat_a)
#
# Implementing :math:`\text{radius}` as message passing in DGL
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# As discussed in the GCN tutorial, you can formulate one adjacency operator as
# one step of message passing. As a generalization, :math:`2^j` adjacency
# operations can be formulated as performing :math:`2^j` steps of message
# passing. Therefore, the summation is equivalent to summing nodes'
# representations after :math:`2^j, j=0, 1, 2..` steps of message passing, i.e.
# gathering information from the :math:`2^{j}`-hop neighborhood of each node.
#
# In ``__init__``, define the projection variables used in each
# :math:`2^j` steps of message passing.
#
# ::
#
# self.linear_radius = nn.ModuleList(
# [nn.Linear(in_feats, out_feats) for i in range(radius)])
#
# In ``forward()``, use the following function ``aggregate_radius()`` to
# gather data from multiple hops, as shown in the following code.
# Note that ``update_all`` is called multiple times, and the function
# returns a list containing the features gathered from each radius.
import dgl.function as fn
def aggregate_radius(radius, g, z):
# initializing list to collect message passing result
z_list = []
g.ndata['z'] = z
# pulling message from 1-hop neighbourhood
g.update_all(fn.copy_u(u='z', out='m'), fn.sum(msg='m', out='z'))
z_list.append(g.ndata['z'])
for i in range(radius - 1):
for j in range(2 ** i):
#pulling message from 2^j neighborhood
g.update_all(fn.copy_u(u='z', out='m'), fn.sum(msg='m', out='z'))
z_list.append(g.ndata['z'])
return z_list
#########################################################################
# Implementing :math:`\text{fuse}` as sparse matrix multiplication
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# :math:`\{Pm, Pd\}` is a sparse matrix with only two non-zero entries on
# each column. Therefore, you construct it as a sparse matrix in the dataset,
# and implement :math:`\text{fuse}` as a sparse matrix multiplication.
#
# In ``forward()``:
#
# ::
#
# fuse = self.linear_fuse(th.mm(pm_pd, feat_b))
#
# Completing :math:`f(x, y)`
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# Finally, the following shows how to sum all the terms together, then apply the
# skip connection and batch norm.
#
# ::
#
# result = prev_proj + deg_proj + radius_proj + fuse
#
# Apply the skip connection to the result.
#
# ::
#
# result = th.cat([result[:, :n], F.relu(result[:, n:])], 1)
#
# Then pass the result to batch norm.
#
# ::
#
# result = self.bn(result) #Batch Normalization.
#
#
# Here is the complete code for one LGNN layer's abstraction :math:`f(x,y)`
class LGNNCore(nn.Module):
def __init__(self, in_feats, out_feats, radius):
super(LGNNCore, self).__init__()
self.out_feats = out_feats
self.radius = radius
self.linear_prev = nn.Linear(in_feats, out_feats)
self.linear_deg = nn.Linear(in_feats, out_feats)
self.linear_radius = nn.ModuleList(
[nn.Linear(in_feats, out_feats) for i in range(radius)])
self.linear_fuse = nn.Linear(in_feats, out_feats)
self.bn = nn.BatchNorm1d(out_feats)
def forward(self, g, feat_a, feat_b, deg, pm_pd):
# term "prev"
prev_proj = self.linear_prev(feat_a)
# term "deg"
deg_proj = self.linear_deg(deg * feat_a)
# term "radius"
# aggregate 2^j-hop features
hop2j_list = aggregate_radius(self.radius, g, feat_a)
# apply linear transformation
hop2j_list = [linear(x) for linear, x in zip(self.linear_radius, hop2j_list)]
radius_proj = sum(hop2j_list)
# term "fuse"
fuse = self.linear_fuse(th.mm(pm_pd, feat_b))
# sum them together
result = prev_proj + deg_proj + radius_proj + fuse
# skip connection and batch norm
n = self.out_feats // 2
result = th.cat([result[:, :n], F.relu(result[:, n:])], 1)
result = self.bn(result)
return result
##############################################################################################################
# Chain-up LGNN abstractions as an LGNN layer
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# To implement:
#
# .. math::
# \begin{split}
# x^{(k+1)} = {}& f(x^{(k)}, y^{(k)})\\
# y^{(k+1)} = {}& f(y^{(k)}, x^{(k+1)})
# \end{split}
#
# Chain-up two ``LGNNCore`` instances, as in the example code, with different parameters in the forward pass.
class LGNNLayer(nn.Module):
def __init__(self, in_feats, out_feats, radius):
super(LGNNLayer, self).__init__()
self.g_layer = LGNNCore(in_feats, out_feats, radius)
self.lg_layer = LGNNCore(in_feats, out_feats, radius)
def forward(self, g, lg, x, lg_x, deg_g, deg_lg, pm_pd):
next_x = self.g_layer(g, x, lg_x, deg_g, pm_pd)
pm_pd_y = th.transpose(pm_pd, 0, 1)
next_lg_x = self.lg_layer(lg, lg_x, x, deg_lg, pm_pd_y)
return next_x, next_lg_x
########################################################################################
# Chain-up LGNN layers
# ~~~~~~~~~~~~~~~~~~~~
# Define an LGNN with three hidden layers, as in the following example.
class LGNN(nn.Module):
def __init__(self, radius):
super(LGNN, self).__init__()
self.layer1 = LGNNLayer(1, 16, radius) # input is scalar feature
self.layer2 = LGNNLayer(16, 16, radius) # hidden size is 16
self.layer3 = LGNNLayer(16, 16, radius)
        self.linear = nn.Linear(16, 2)  # predict two classes
def forward(self, g, lg, pm_pd):
# compute the degrees
deg_g = g.in_degrees().float().unsqueeze(1)
deg_lg = lg.in_degrees().float().unsqueeze(1)
# use degree as the input feature
x, lg_x = deg_g, deg_lg
x, lg_x = self.layer1(g, lg, x, lg_x, deg_g, deg_lg, pm_pd)
x, lg_x = self.layer2(g, lg, x, lg_x, deg_g, deg_lg, pm_pd)
x, lg_x = self.layer3(g, lg, x, lg_x, deg_g, deg_lg, pm_pd)
return self.linear(x)
#########################################################################################
# Training and inference
# -----------------------
# First load the data.
from torch.utils.data import DataLoader
training_loader = DataLoader(train_set,
batch_size=1,
collate_fn=train_set.collate_fn,
drop_last=True)
#######################################################################################
# Next, define the main training loop. Note that each training sample contains
# three objects: A :class:`~dgl.DGLGraph`, a SciPy sparse matrix ``pmpd``, and a label
# array in ``numpy.ndarray``. Generate the line graph by using this command:
#
# ::
#
# lg = g.line_graph(backtracking=False)
#
# Note that ``backtracking=False`` is required to correctly simulate the non-backtracking
# operator. We also define a utility function to convert the SciPy sparse matrix to a
# torch sparse tensor.
# Create the model
model = LGNN(radius=3)
# define the optimizer
optimizer = th.optim.Adam(model.parameters(), lr=1e-2)
# A utility function to convert a scipy.sparse.coo_matrix to a torch sparse float tensor
def sparse2th(mat):
value = mat.data
indices = th.LongTensor([mat.row, mat.col])
tensor = th.sparse.FloatTensor(indices, th.from_numpy(value).float(), mat.shape)
return tensor
# Train for 20 epochs
for i in range(20):
all_loss = []
all_acc = []
for [g, pmpd, label] in training_loader:
# Generate the line graph.
lg = g.line_graph(backtracking=False)
# Create torch tensors
pmpd = sparse2th(pmpd)
label = th.from_numpy(label)
# Forward
z = model(g, lg, pmpd)
# Calculate loss:
# Since there are only two communities, there are only two permutations
# of the community labels.
loss_perm1 = F.cross_entropy(z, label)
loss_perm2 = F.cross_entropy(z, 1 - label)
loss = th.min(loss_perm1, loss_perm2)
# Calculate accuracy:
_, pred = th.max(z, 1)
acc_perm1 = (pred == label).float().mean()
acc_perm2 = (pred == 1 - label).float().mean()
acc = th.max(acc_perm1, acc_perm2)
all_loss.append(loss.item())
all_acc.append(acc.item())
optimizer.zero_grad()
loss.backward()
optimizer.step()
niters = len(all_loss)
print("Epoch %d | loss %.4f | accuracy %.4f" % (i,
sum(all_loss) / niters, sum(all_acc) / niters))
#######################################################################################
# Visualize training progress
# -----------------------------
# You can visualize the network's community prediction on one training example,
# together with the ground truth. Start this with the following code example.
pmpd1 = sparse2th(pmpd1)
LG1 = G1.line_graph(backtracking=False)
z = model(G1, LG1, pmpd1)
_, pred = th.max(z, 1)
visualize(pred, nx_G1)
#######################################################################################
# Compare this with the ground truth below. Note that the colors might be swapped
# between the two communities, because the model only needs to predict the
# partitioning correctly, not the specific community labels.
visualize(label1, nx_G1)
#########################################
# Here is an animation to better understand the process. (40 epochs)
#
# .. figure:: https://i.imgur.com/KDUyE1S.gif
# :alt: lgnn-anim
#
# Batching graphs for parallelism
# --------------------------------
#
# LGNN takes a collection of different graphs.
# You might consider whether batching can be used for parallelism.
#
# Batching has been built into the data loader itself.
# In the ``collate_fn`` for the PyTorch data loader, graphs are batched using DGL's
# ``dgl.batch`` API. DGL batches graphs by merging them
# into a large graph, with each smaller graph's adjacency matrix being a block
# along the diagonal of the large graph's adjacency matrix. Correspondingly, the
# :math:`\{Pm, Pd\}` matrices are concatenated as a block diagonal matrix to match the
# batched graph.
import numpy as np
import scipy.sparse as sp

def collate_fn(batch):
graphs, pmpds, labels = zip(*batch)
batched_graphs = dgl.batch(graphs)
batched_pmpds = sp.block_diag(pmpds)
batched_labels = np.concatenate(labels, axis=0)
return batched_graphs, batched_pmpds, batched_labels
######################################################################################
# You can find the complete code on Github at
# `Community Detection with Graph Neural Networks (CDGNN) <https://github.com/dmlc/dgl/tree/master/examples/pytorch/line_graph>`_.
"""
###########################################################################################
#
# In this tutorial, you learn how to solve community detection tasks by implementing a line
# graph neural network (LGNN). Community detection, or graph clustering, consists of partitioning
# the vertices in a graph into clusters in which nodes are more similar to
# one another.
#
# In the :doc:`Graph convolutional network tutorial <1_gcn>`, you learned how to classify the nodes of an input
# graph in a semi-supervised setting. You used a graph convolutional neural network (GCN)
# as an embedding mechanism for graph features.
#
# To generalize a graph neural network (GNN) into supervised community detection, a line-graph based
# variation of GNN is introduced in the research paper
# `Supervised Community Detection with Line Graph Neural Networks <https://arxiv.org/abs/1705.08415>`__.
# One of the highlights of the model is
# to augment the straightforward GNN architecture so that it operates on
# a line graph of edge adjacencies, defined with a non-backtracking operator.
#
# A line graph neural network (LGNN) shows how DGL can implement an advanced graph algorithm by
# mixing basic tensor operations, sparse-matrix multiplication, and message-
# passing APIs.
#
# In the following sections, you learn about community detection, line
# graphs, LGNN, and its implementation.
#
# Supervised community detection task with the Cora dataset
# ------------------------------------------------------------
# Community detection
# ~~~~~~~~~~~~~~~~~~~~
# In a community detection task, you cluster similar nodes instead of
# labeling them. Node similarity is typically described as a higher connection
# density within each cluster than across clusters.
#
# What's the difference between community detection and node classification?
# Compared to node classification, community detection focuses on retrieving
# cluster information from the graph, rather than assigning a specific label to
# a node. For example, as long as a node is clustered with its community
# members, it doesn't matter whether the node is assigned "community A"
# or "community B", whereas assigning all "great movies" the label "bad movies"
# would be a disaster in a movie network classification task.
#
# What's the difference, then, between a community detection algorithm and
# another clustering algorithm such as k-means? A community detection algorithm operates on
# graph-structured data. Compared to k-means, community detection leverages the
# graph structure, instead of simply clustering nodes based on their
# features.
#
# Cora dataset
# ~~~~~~~~~~~~
# To be consistent with the GCN tutorial,
# you use the `Cora dataset <https://linqs.soe.ucsc.edu/data>`__
# to illustrate a simple community detection task. Cora is a scientific publication dataset,
# with 2708 papers belonging to seven
# different machine learning fields. Here, you formulate Cora as a
# directed graph, with each node being a paper, and each edge being a
# citation link (A->B means A cites B). Here is a visualization of the whole
# Cora dataset.
#
# .. figure:: https://i.imgur.com/X404Byc.png
# :alt: cora
# :height: 400px
# :width: 500px
# :align: center
#
# Cora naturally contains seven classes, and the statistics below show that each
# class does satisfy our assumption of a community, i.e. nodes of the same
# class have a higher probability of being connected to each other than to nodes of a different class.
# The following code snippet verifies that there are more intra-class edges
# than inter-class.
import os
os.environ["DGLBACKEND"] = "pytorch"
import dgl
import torch
import torch as th
import torch.nn as nn
import torch.nn.functional as F
from dgl.data import citation_graph as citegrh
data = citegrh.load_cora()
G = data[0]
labels = th.tensor(G.ndata["label"])
# find all the nodes labeled with class 0
label0_nodes = th.nonzero(labels == 0, as_tuple=False).squeeze()
# find all the edges pointing to class 0 nodes
src, _ = G.in_edges(label0_nodes)
src_labels = labels[src]
# find all the edges whose both endpoints are in class 0
intra_src = th.nonzero(src_labels == 0, as_tuple=False)
print("Intra-class edges percent: %.4f" % (len(intra_src) / len(src_labels)))
import matplotlib.pyplot as plt
###########################################################################################
# Binary community subgraph from Cora with a test dataset
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Without loss of generality, in this tutorial you limit the scope of the
# task to binary community detection.
#
# .. note::
#
# To create a practice binary-community dataset from Cora, first extract
# all two-class pairs from the original Cora seven classes. For each pair, you
# treat each class as one community, and find the largest subgraph that
# at least contains one cross-community edge as the training example. As
# a result, there are a total of 21 training samples in this small dataset.
#
# With the following code, you can visualize one of the training samples and its community structure.
import networkx as nx
train_set = dgl.data.CoraBinary()
G1, pmpd1, label1 = train_set[1]
nx_G1 = G1.to_networkx()
def visualize(labels, g):
pos = nx.spring_layout(g, seed=1)
plt.figure(figsize=(8, 8))
plt.axis("off")
nx.draw_networkx(
g,
pos=pos,
node_size=50,
cmap=plt.get_cmap("coolwarm"),
node_color=labels,
edge_color="k",
arrows=False,
width=0.5,
style="dotted",
with_labels=False,
)
visualize(label1, nx_G1)
###########################################################################################
# To learn more, go to the original research paper to see how to generalize
# to the multiple-community case.
#
# Community detection in a supervised setting
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# The community detection problem could be tackled with both supervised and
# unsupervised approaches. You can formulate
# community detection in a supervised setting as follows:
#
# - Each training example consists of :math:`(G, L)`, where :math:`G` is a
# directed graph :math:`(V, E)`. For each node :math:`v` in :math:`V`, we
# assign a ground truth community label :math:`z_v \in \{0,1\}`.
# - The parameterized model :math:`f(G, \theta)` predicts a label set
# :math:`\tilde{Z} = f(G)` for nodes :math:`V`.
# - For each example :math:`(G,L)`, the model learns to minimize a specially
#   designed loss function (the equivariant loss) :math:`L_{equivariant}(\tilde{Z}, Z)`.
#
# .. note::
#
# In this supervised setting, the model naturally predicts a label for
# each community. However, community assignment should be equivariant to
# label permutations. To achieve this, in each forward process, we take
# the minimum among losses calculated from all possible permutations of
# labels.
#
# Mathematically, this means
# :math:`L_{equivariant} = \underset{\pi \in S_c} {min}-\log(\hat{\pi}, \pi)`,
# where :math:`S_c` is the set of all permutations of labels, and
# :math:`\hat{\pi}` is the set of predicted labels,
# :math:`- \log(\hat{\pi},\pi)` denotes negative log likelihood.
#
#    For instance, for a sample graph with nodes :math:`\{1,2,3,4\}` and
#    community assignment :math:`\{A, A, A, B\}`, with each node's label
#    :math:`l \in \{0,1\}`, the set of all possible label permutations is
#    :math:`S_c = \{\{0,0,0,1\}, \{1,1,1,0\}\}`.
#
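# As a minimal sketch (for the binary case used later in this tutorial, with a
# hypothetical ``itertools``-based generalization), the permutation-minimum
# loss can be computed by relabeling the ground truth under every permutation
# and keeping the smallest cross-entropy:
#
# ::
#
#    import itertools
#    import torch
#    import torch.nn.functional as F
#
#    def permutation_min_loss(logits, labels, num_communities=2):
#        losses = []
#        for perm in itertools.permutations(range(num_communities)):
#            relabeled = torch.tensor(perm, device=labels.device)[labels]
#            losses.append(F.cross_entropy(logits, relabeled))
#        return torch.stack(losses).min()
#
# For two communities this reduces to ``min(F.cross_entropy(z, label),
# F.cross_entropy(z, 1 - label))``, which is exactly what the training loop
# below does.
#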
# Line graph neural network key ideas
# ------------------------------------
# A key innovation in this topic is the use of a line graph.
# Unlike models in previous tutorials, message passing happens not only on the
# original graph, e.g. the binary community subgraph from Cora, but also on the
# line graph associated with the original graph.
#
# What is a line-graph?
# ~~~~~~~~~~~~~~~~~~~~~
# In graph theory, a line graph is a graph representation that encodes the
# edge adjacency structure in the original graph.
#
# Specifically, a line-graph :math:`L(G)` turns an edge of the original graph `G`
# into a node. This is illustrated with the graph below (taken from the
# research paper).
#
# .. figure:: https://i.imgur.com/4WO5jEm.png
# :alt: lg
# :align: center
#
# Here, :math:`e_{A}:= (i\rightarrow j)` and :math:`e_{B}:= (j\rightarrow k)`
# are two edges in the original graph :math:`G`. In line graph :math:`G_L`,
# they correspond to nodes :math:`v^{l}_{A}, v^{l}_{B}`.
#
# The next natural question is, how to connect nodes in line-graph? How to
# connect two edges? Here, we use the following connection rule:
#
# Two nodes :math:`v^{l}_{A}`, :math:`v^{l}_{B}` in `lg` are connected if
# the corresponding two edges :math:`e_{A}, e_{B}` in `g` share one and only
# one node:
# :math:`e_{A}`'s destination node is :math:`e_{B}`'s source node
# (:math:`j`).
#
# .. note::
#
# Mathematically, this definition corresponds to a notion called non-backtracking
# operator:
# :math:`B_{(i \rightarrow j), (\hat{i} \rightarrow \hat{j})}`
# :math:`= \begin{cases}
# 1 \text{ if } j = \hat{i}, \hat{j} \neq i\\
# 0 \text{ otherwise} \end{cases}`
# where an edge is formed if :math:`B_{node1, node2} = 1`.
#
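# As a quick sanity check (a small sketch that reuses the same ``line_graph``
# call as the training code below), you can build a directed triangle and
# inspect its non-backtracking line graph; every edge of the original graph
# becomes a node:
#
# ::
#
#    import dgl
#    import torch as th
#
#    # directed 3-cycle: 0 -> 1 -> 2 -> 0
#    tri = dgl.graph((th.tensor([0, 1, 2]), th.tensor([1, 2, 0])))
#    tri_lg = tri.line_graph(backtracking=False)
#    print(tri_lg.num_nodes(), tri_lg.num_edges())  # 3 nodes, one per original edge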
#
# One layer in LGNN, algorithm structure
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# LGNN chains together a series of line graph neural network layers. The graph
# representation :math:`x` and its line graph companion :math:`y` evolve with
# the dataflow as follows.
#
# .. figure:: https://i.imgur.com/bZGGIGp.png
# :alt: alg
# :align: center
#
# At the :math:`k`-th layer, the :math:`i`-th neuron of the :math:`l`-th
# channel updates its embedding :math:`x^{(k+1)}_{i,l}` with:
#
# .. math::
# \begin{split}
# x^{(k+1)}_{i,l} ={}&\rho[x^{(k)}_{i}\theta^{(k)}_{1,l}
# +(Dx^{(k)})_{i}\theta^{(k)}_{2,l} \\
# &+\sum^{J-1}_{j=0}(A^{2^{j}}x^{k})_{i}\theta^{(k)}_{3+j,l}\\
# &+[\{\text{Pm},\text{Pd}\}y^{(k)}]_{i}\theta^{(k)}_{3+J,l}] \\
# &+\text{skip-connection}
# \qquad i \in V, l = 1,2,3, ... b_{k+1}/2
# \end{split}
#
# Then, the line-graph representation :math:`y^{(k+1)}_{i,l}` with,
#
# .. math::
#
# \begin{split}
# y^{(k+1)}_{i',l^{'}} = {}&\rho[y^{(k)}_{i^{'}}\gamma^{(k)}_{1,l^{'}}+
# (D_{L(G)}y^{(k)})_{i^{'}}\gamma^{(k)}_{2,l^{'}}\\
# &+\sum^{J-1}_{j=0}(A_{L(G)}^{2^{j}}y^{k})_{i}\gamma^{(k)}_{3+j,l^{'}}\\
# &+[\{\text{Pm},\text{Pd}\}^{T}x^{(k+1)}]_{i^{'}}\gamma^{(k)}_{3+J,l^{'}}]\\
# &+\text{skip-connection}
# \qquad i^{'} \in V_{l}, l^{'} = 1,2,3, ... b^{'}_{k+1}/2
# \end{split}
#
# Where :math:`\text{skip-connection}` refers to performing the same operation without the non-linearity
# :math:`\rho`, and with linear projection :math:`\theta_{\frac{b_{k+1}}{2} + 1, ..., b_{k+1}-1, b_{k+1}}`
# and :math:`\gamma_{\frac{b_{k+1}}{2} + 1, ..., b_{k+1}-1, b_{k+1}}`.
#
# Implement LGNN in DGL
# ---------------------
# Even though the equations in the previous section might seem intimidating,
# it helps to understand the following information before you implement the LGNN.
#
# The two equations are symmetric and can be implemented as two instances
# of the same class with different parameters.
# The first equation operates on graph representation :math:`x`,
# whereas the second operates on line-graph
# representation :math:`y`. Let us denote this abstraction as :math:`f`. Then
# the first is :math:`f(x,y; \theta_x)`, and the second
# is :math:`f(y,x, \theta_y)`. That is, they are parameterized to compute
# representations of the original graph and its
# companion line graph, respectively.
#
# Each equation consists of four terms. Take the first one as an example, which follows.
#
# - :math:`x^{(k)}\theta^{(k)}_{1,l}`, a linear projection of previous
# layer's output :math:`x^{(k)}`, denote as :math:`\text{prev}(x)`.
# - :math:`(Dx^{(k)})\theta^{(k)}_{2,l}`, a linear projection of degree
# operator on :math:`x^{(k)}`, denote as :math:`\text{deg}(x)`.
# - :math:`\sum^{J-1}_{j=0}(A^{2^{j}}x^{(k)})\theta^{(k)}_{3+j,l}`,
# a summation of :math:`2^{j}` adjacency operator on :math:`x^{(k)}`,
# denote as :math:`\text{radius}(x)`
# - :math:`[\{Pm,Pd\}y^{(k)}]\theta^{(k)}_{3+J,l}`, fusing another
# graph's embedding information using incidence matrix
# :math:`\{Pm, Pd\}`, followed with a linear projection,
# denote as :math:`\text{fuse}(y)`.
#
# Each of the terms are performed again with different
# parameters, and without the nonlinearity after the sum.
# Therefore, :math:`f` could be written as:
#
# .. math::
#    \begin{split}
#    f(x^{(k)},y^{(k)}) = {}\rho[&\text{prev}(x^{(k)}) + \text{deg}(x^{(k)}) +\text{radius}(x^{(k)})
#    +\text{fuse}(y^{(k)})]\\
#    +&\text{prev}(x^{(k)}) + \text{deg}(x^{(k)}) +\text{radius}(x^{(k)}) +\text{fuse}(y^{(k)})
#    \end{split}
#
# Two equations are chained-up in the following order:
#
# .. math::
# \begin{split}
# x^{(k+1)} = {}& f(x^{(k)}, y^{(k)})\\
# y^{(k+1)} = {}& f(y^{(k)}, x^{(k+1)})
# \end{split}
#
# Keep in mind the listed observations in this overview and proceed to implementation.
# An important point is that you use different strategies for the noted terms.
#
# .. note::
#    You can understand :math:`\{Pm, Pd\}` more thoroughly with this explanation.
#    Roughly speaking, the way :math:`g` and :math:`lg` (the line graph)
#    work together is related to loopy belief propagation.
#    Here, you implement :math:`\{Pm, Pd\}` as a SciPy COO sparse matrix in the dataset,
#    and stack them as tensors when batching. Another batching solution is to
#    treat :math:`\{Pm, Pd\}` as the adjacency matrix of a bipartite graph, which maps
#    the line graph's features to the graph's, and vice versa.
#
# Implementing :math:`\text{prev}` and :math:`\text{deg}` as tensor operation
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Linear projection and degree operation are both simply matrix
# multiplications. Write them as PyTorch tensor operations.
#
# In ``__init__``, you define the projection variables.
#
# ::
#
# self.linear_prev = nn.Linear(in_feats, out_feats)
# self.linear_deg = nn.Linear(in_feats, out_feats)
#
#
# In ``forward()``, :math:`\text{prev}` and :math:`\text{deg}` are the same
# as any other PyTorch tensor operations.
#
# ::
#
# prev_proj = self.linear_prev(feat_a)
# deg_proj = self.linear_deg(deg * feat_a)
#
# Implementing :math:`\text{radius}` as message passing in DGL
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# As discussed in the GCN tutorial, you can formulate one adjacency operator as
# one step of message passing. As a generalization, :math:`2^j` adjacency
# operations can be formulated as performing :math:`2^j` steps of message
# passing. Therefore, the summation is equivalent to summing nodes'
# representations after :math:`2^j, j=0, 1, 2..` steps of message passing, i.e.
# gathering information from the :math:`2^{j}`-hop neighborhood of each node.
#
# In ``__init__``, define the projection variables used in each
# :math:`2^j` steps of message passing.
#
# ::
#
# self.linear_radius = nn.ModuleList(
# [nn.Linear(in_feats, out_feats) for i in range(radius)])
#
# In ``forward()``, use the following function ``aggregate_radius()`` to
# gather data from multiple hops, as shown in the following code.
# Note that ``update_all`` is called multiple times, and the function
# returns a list containing the features gathered from each radius.
import dgl.function as fn
def aggregate_radius(radius, g, z):
# initializing list to collect message passing result
z_list = []
g.ndata["z"] = z
# pulling message from 1-hop neighbourhood
g.update_all(fn.copy_u(u="z", out="m"), fn.sum(msg="m", out="z"))
z_list.append(g.ndata["z"])
for i in range(radius - 1):
for j in range(2**i):
# pulling message from 2^j neighborhood
g.update_all(fn.copy_u(u="z", out="m"), fn.sum(msg="m", out="z"))
z_list.append(g.ndata["z"])
return z_list
#########################################################################
# Implementing :math:`\text{fuse}` as sparse matrix multiplication
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# :math:`\{Pm, Pd\}` is a sparse matrix with only two non-zero entries on
# each column. Therefore, you construct it as a sparse matrix in the dataset,
# and implement :math:`\text{fuse}` as a sparse matrix multiplication.
#
# In ``forward()``:
#
# ::
#
# fuse = self.linear_fuse(th.mm(pm_pd, feat_b))
#
# Completing :math:`f(x, y)`
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# Finally, the following shows how to sum all the terms together, then apply the
# skip connection and batch norm.
#
# ::
#
# result = prev_proj + deg_proj + radius_proj + fuse
#
# Apply the skip connection to the result.
#
# ::
#
# result = th.cat([result[:, :n], F.relu(result[:, n:])], 1)
#
# Then pass the result to batch norm.
#
# ::
#
# result = self.bn(result) #Batch Normalization.
#
#
# Here is the complete code for one LGNN layer's abstraction :math:`f(x,y)`
class LGNNCore(nn.Module):
def __init__(self, in_feats, out_feats, radius):
super(LGNNCore, self).__init__()
self.out_feats = out_feats
self.radius = radius
self.linear_prev = nn.Linear(in_feats, out_feats)
self.linear_deg = nn.Linear(in_feats, out_feats)
self.linear_radius = nn.ModuleList(
[nn.Linear(in_feats, out_feats) for i in range(radius)]
)
self.linear_fuse = nn.Linear(in_feats, out_feats)
self.bn = nn.BatchNorm1d(out_feats)
def forward(self, g, feat_a, feat_b, deg, pm_pd):
# term "prev"
prev_proj = self.linear_prev(feat_a)
# term "deg"
deg_proj = self.linear_deg(deg * feat_a)
# term "radius"
# aggregate 2^j-hop features
hop2j_list = aggregate_radius(self.radius, g, feat_a)
# apply linear transformation
hop2j_list = [
linear(x) for linear, x in zip(self.linear_radius, hop2j_list)
]
radius_proj = sum(hop2j_list)
# term "fuse"
fuse = self.linear_fuse(th.mm(pm_pd, feat_b))
# sum them together
result = prev_proj + deg_proj + radius_proj + fuse
# skip connection and batch norm
n = self.out_feats // 2
result = th.cat([result[:, :n], F.relu(result[:, n:])], 1)
result = self.bn(result)
return result
##############################################################################################################
# Chain-up LGNN abstractions as an LGNN layer
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# To implement:
#
# .. math::
# \begin{split}
# x^{(k+1)} = {}& f(x^{(k)}, y^{(k)})\\
# y^{(k+1)} = {}& f(y^{(k)}, x^{(k+1)})
# \end{split}
#
# Chain-up two ``LGNNCore`` instances, as in the example code, with different parameters in the forward pass.
class LGNNLayer(nn.Module):
def __init__(self, in_feats, out_feats, radius):
super(LGNNLayer, self).__init__()
self.g_layer = LGNNCore(in_feats, out_feats, radius)
self.lg_layer = LGNNCore(in_feats, out_feats, radius)
def forward(self, g, lg, x, lg_x, deg_g, deg_lg, pm_pd):
next_x = self.g_layer(g, x, lg_x, deg_g, pm_pd)
pm_pd_y = th.transpose(pm_pd, 0, 1)
next_lg_x = self.lg_layer(lg, lg_x, x, deg_lg, pm_pd_y)
return next_x, next_lg_x
########################################################################################
# Chain-up LGNN layers
# ~~~~~~~~~~~~~~~~~~~~
# Define an LGNN with three hidden layers, as in the following example.
class LGNN(nn.Module):
def __init__(self, radius):
super(LGNN, self).__init__()
self.layer1 = LGNNLayer(1, 16, radius) # input is scalar feature
self.layer2 = LGNNLayer(16, 16, radius) # hidden size is 16
self.layer3 = LGNNLayer(16, 16, radius)
        self.linear = nn.Linear(16, 2)  # predict two classes
def forward(self, g, lg, pm_pd):
# compute the degrees
deg_g = g.in_degrees().float().unsqueeze(1)
deg_lg = lg.in_degrees().float().unsqueeze(1)
# use degree as the input feature
x, lg_x = deg_g, deg_lg
x, lg_x = self.layer1(g, lg, x, lg_x, deg_g, deg_lg, pm_pd)
x, lg_x = self.layer2(g, lg, x, lg_x, deg_g, deg_lg, pm_pd)
x, lg_x = self.layer3(g, lg, x, lg_x, deg_g, deg_lg, pm_pd)
return self.linear(x)
#########################################################################################
# Training and inference
# -----------------------
# First load the data.
from torch.utils.data import DataLoader
training_loader = DataLoader(
train_set, batch_size=1, collate_fn=train_set.collate_fn, drop_last=True
)
#######################################################################################
# Next, define the main training loop. Note that each training sample contains
# three objects: A :class:`~dgl.DGLGraph`, a SciPy sparse matrix ``pmpd``, and a label
# array in ``numpy.ndarray``. Generate the line graph by using this command:
#
# ::
#
# lg = g.line_graph(backtracking=False)
#
# Note that ``backtracking=False`` is required to correctly simulate the non-backtracking
# operator. We also define a utility function to convert the SciPy sparse matrix to a
# torch sparse tensor.
# Create the model
model = LGNN(radius=3)
# define the optimizer
optimizer = th.optim.Adam(model.parameters(), lr=1e-2)
# A utility function to convert a scipy.sparse.coo_matrix to a torch sparse float tensor
def sparse2th(mat):
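    # mat is a scipy.sparse.coo_matrix; its row/col/data arrays are reused directly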
value = mat.data
indices = th.LongTensor([mat.row, mat.col])
tensor = th.sparse.FloatTensor(
indices, th.from_numpy(value).float(), mat.shape
)
return tensor
# Train for 20 epochs
for i in range(20):
all_loss = []
all_acc = []
for [g, pmpd, label] in training_loader:
# Generate the line graph.
lg = g.line_graph(backtracking=False)
# Create torch tensors
pmpd = sparse2th(pmpd)
label = th.from_numpy(label)
# Forward
z = model(g, lg, pmpd)
# Calculate loss:
# Since there are only two communities, there are only two permutations
# of the community labels.
loss_perm1 = F.cross_entropy(z, label)
loss_perm2 = F.cross_entropy(z, 1 - label)
loss = th.min(loss_perm1, loss_perm2)
# Calculate accuracy:
_, pred = th.max(z, 1)
acc_perm1 = (pred == label).float().mean()
acc_perm2 = (pred == 1 - label).float().mean()
acc = th.max(acc_perm1, acc_perm2)
all_loss.append(loss.item())
all_acc.append(acc.item())
optimizer.zero_grad()
loss.backward()
optimizer.step()
niters = len(all_loss)
print(
"Epoch %d | loss %.4f | accuracy %.4f"
% (i, sum(all_loss) / niters, sum(all_acc) / niters)
)
#######################################################################################
# Visualize training progress
# -----------------------------
# You can visualize the network's community prediction on one training example,
# together with the ground truth. Start this with the following code example.
pmpd1 = sparse2th(pmpd1)
LG1 = G1.line_graph(backtracking=False)
z = model(G1, LG1, pmpd1)
_, pred = th.max(z, 1)
visualize(pred, nx_G1)
#######################################################################################
# Compare this with the ground truth below. Note that the colors might be swapped
# between the two communities, because the model only needs to predict the
# partitioning correctly, not the specific community labels.
visualize(label1, nx_G1)
#########################################
# Here is an animation to better understand the process. (40 epochs)
#
# .. figure:: https://i.imgur.com/KDUyE1S.gif
# :alt: lgnn-anim
#
# Batching graphs for parallelism
# --------------------------------
#
# LGNN takes a collection of different graphs.
# You might consider whether batching can be used for parallelism.
#
# Batching has been built into the data loader itself.
# In the ``collate_fn`` for the PyTorch data loader, graphs are batched using DGL's
# ``dgl.batch`` API. DGL batches graphs by merging them
# into a large graph, with each smaller graph's adjacency matrix being a block
# along the diagonal of the large graph's adjacency matrix. Correspondingly, the
# :math:`\{Pm, Pd\}` matrices are concatenated as a block diagonal matrix to match the
# batched graph.
import numpy as np
import scipy.sparse as sp


def collate_fn(batch):
graphs, pmpds, labels = zip(*batch)
batched_graphs = dgl.batch(graphs)
batched_pmpds = sp.block_diag(pmpds)
batched_labels = np.concatenate(labels, axis=0)
return batched_graphs, batched_pmpds, batched_labels
######################################################################################
# You can find the complete code on Github at
# `Community Detection with Graph Neural Networks (CDGNN) <https://github.com/dmlc/dgl/tree/master/examples/pytorch/line_graph>`_.
......@@ -105,9 +105,8 @@ structure-free normalization, in the style of attention.
# subpackage. Simply import ``GATConv`` as follows.
import os
os.environ['DGLBACKEND'] = 'pytorch'
from dgl.nn.pytorch import GATConv
os.environ["DGLBACKEND"] = "pytorch"
###############################################################
# Readers can skip the following step-by-step explanation of the implementation and
# jump ahead to `Put everything together`_ for the training and visualization results.
......@@ -125,6 +124,7 @@ from dgl.nn.pytorch import GATConv
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn.pytorch import GATConv
class GATLayer(nn.Module):
......@@ -139,37 +139,38 @@ class GATLayer(nn.Module):
def reset_parameters(self):
"""Reinitialize learnable parameters."""
gain = nn.init.calculate_gain('relu')
gain = nn.init.calculate_gain("relu")
nn.init.xavier_normal_(self.fc.weight, gain=gain)
nn.init.xavier_normal_(self.attn_fc.weight, gain=gain)
def edge_attention(self, edges):
# edge UDF for equation (2)
z2 = torch.cat([edges.src['z'], edges.dst['z']], dim=1)
z2 = torch.cat([edges.src["z"], edges.dst["z"]], dim=1)
a = self.attn_fc(z2)
return {'e': F.leaky_relu(a)}
return {"e": F.leaky_relu(a)}
def message_func(self, edges):
# message UDF for equation (3) & (4)
return {'z': edges.src['z'], 'e': edges.data['e']}
return {"z": edges.src["z"], "e": edges.data["e"]}
def reduce_func(self, nodes):
# reduce UDF for equation (3) & (4)
# equation (3)
alpha = F.softmax(nodes.mailbox['e'], dim=1)
alpha = F.softmax(nodes.mailbox["e"], dim=1)
# equation (4)
h = torch.sum(alpha * nodes.mailbox['z'], dim=1)
return {'h': h}
h = torch.sum(alpha * nodes.mailbox["z"], dim=1)
return {"h": h}
def forward(self, h):
# equation (1)
z = self.fc(h)
self.g.ndata['z'] = z
self.g.ndata["z"] = z
# equation (2)
self.g.apply_edges(self.edge_attention)
# equation (3) & (4)
self.g.update_all(self.message_func, self.reduce_func)
return self.g.ndata.pop('h')
return self.g.ndata.pop("h")
##################################################################
# Equation (1)
......@@ -195,11 +196,13 @@ class GATLayer(nn.Module):
# ``apply_edges`` API. The argument to the ``apply_edges`` is an **Edge UDF**,
# which is defined as below:
def edge_attention(self, edges):
# edge UDF for equation (2)
z2 = torch.cat([edges.src['z'], edges.dst['z']], dim=1)
z2 = torch.cat([edges.src["z"], edges.dst["z"]], dim=1)
a = self.attn_fc(z2)
return {'e' : F.leaky_relu(a)}
return {"e": F.leaky_relu(a)}
########################################################################3
# Here, the dot product with the learnable weight vector :math:`\vec{a^{(l)}}`
......@@ -229,13 +232,15 @@ def edge_attention(self, edges):
# Both tasks first fetch data from the mailbox and then manipulate it on the
# second dimension (``dim=1``), on which the messages are batched.
def reduce_func(self, nodes):
# reduce UDF for equation (3) & (4)
# equation (3)
alpha = F.softmax(nodes.mailbox['e'], dim=1)
alpha = F.softmax(nodes.mailbox["e"], dim=1)
# equation (4)
h = torch.sum(alpha * nodes.mailbox['z'], dim=1)
return {'h' : h}
h = torch.sum(alpha * nodes.mailbox["z"], dim=1)
return {"h": h}
#####################################################################
# Multi-head attention
......@@ -258,8 +263,9 @@ def reduce_func(self, nodes):
# Use the above defined single-head ``GATLayer`` as the building block
# for the ``MultiHeadGATLayer`` below:
class MultiHeadGATLayer(nn.Module):
def __init__(self, g, in_dim, out_dim, num_heads, merge='cat'):
def __init__(self, g, in_dim, out_dim, num_heads, merge="cat"):
super(MultiHeadGATLayer, self).__init__()
self.heads = nn.ModuleList()
for i in range(num_heads):
......@@ -268,19 +274,21 @@ class MultiHeadGATLayer(nn.Module):
def forward(self, h):
head_outs = [attn_head(h) for attn_head in self.heads]
if self.merge == 'cat':
if self.merge == "cat":
# concat on the output feature dimension (dim=1)
return torch.cat(head_outs, dim=1)
else:
# merge using average
            return torch.mean(torch.stack(head_outs), dim=0)
###########################################################################
# Put everything together
# ^^^^^^^^^^^^^^^^^^^^^^^
#
# Now, you can define a two-layer GAT model.
class GAT(nn.Module):
def __init__(self, g, in_dim, hidden_dim, out_dim, num_heads):
super(GAT, self).__init__()
......@@ -296,33 +304,34 @@ class GAT(nn.Module):
h = self.layer2(h)
return h
import networkx as nx
#############################################################################
# We then load the Cora dataset using DGL's built-in data module.
from dgl import DGLGraph
from dgl.data import citation_graph as citegrh
import networkx as nx
def load_cora_data():
data = citegrh.load_cora()
g = data[0]
mask = torch.BoolTensor(g.ndata['train_mask'])
return g, g.ndata['feat'], g.ndata['label'], mask
mask = torch.BoolTensor(g.ndata["train_mask"])
return g, g.ndata["feat"], g.ndata["label"], mask
##############################################################################
# The training loop is exactly the same as in the GCN tutorial.
import time
import numpy as np
g, features, labels, mask = load_cora_data()
# create the model, 2 heads, each head has hidden size 8
net = GAT(g,
in_dim=features.size()[1],
hidden_dim=8,
out_dim=7,
num_heads=2)
net = GAT(g, in_dim=features.size()[1], hidden_dim=8, out_dim=7, num_heads=2)
# create optimizer
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)
......@@ -344,8 +353,11 @@ for epoch in range(30):
if epoch >= 3:
dur.append(time.time() - t0)
print("Epoch {:05d} | Loss {:.4f} | Time(s) {:.4f}".format(
epoch, loss.item(), np.mean(dur)))
print(
"Epoch {:05d} | Loss {:.4f} | Time(s) {:.4f}".format(
epoch, loss.item(), np.mean(dur)
)
)
#########################################################################
# Visualizing and understanding attention learned
......