Unverified commit dce89919 authored by Hongzhi (Steve), Chen and committed by GitHub

[Misc] Auto-reformat multiple python folders. (#5325)



* auto-reformat

* lintrunner

---------
Co-authored-by: Ubuntu <ubuntu@ip-172-31-28-63.ap-northeast-1.compute.internal>
parent ab812179
from ruamel.yaml.comments import CommentedMap
......@@ -14,12 +13,14 @@ def deep_convert_dict(layer):
return to_ret
import collections.abc
def merge_comment(d, comment_dict, column=30):
for k, v in comment_dict.items():
if isinstance(v, collections.abc.Mapping):
d[k] = merge_comment(d.get(k, CommentedMap()), v)
else:
d.yaml_add_eol_comment(v, key=k, column=column)
return d
\ No newline at end of file
return d
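A minimal usage sketch for merge_comment above (the config and comment data below are hypothetical, not part of the original file): comments from a plain nested dict are attached as end-of-line YAML comments on a ruamel.yaml CommentedMap. Note that nested levels fall back to the default column because the recursive call does not forward the column argument.

import sys

from ruamel.yaml import YAML
from ruamel.yaml.comments import CommentedMap

# hypothetical config and comment mapping; assumes merge_comment above is in scope
config = CommentedMap()
config["model"] = CommentedMap()
config["model"]["hidden_size"] = 16
config["lr"] = 0.01
comments = {"model": {"hidden_size": "embedding width"}, "lr": "learning rate"}

merge_comment(config, comments)
YAML().dump(config, sys.stdout)
# expected output, roughly:
# model:
#   hidden_size: 16             # embedding width
# lr: 0.01                      # learning rate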
#!/usr/bin/env python
from setuptools import find_packages
from distutils.core import setup
setup(name='dglgo',
version='0.0.2',
description='DGL',
author='DGL Team',
author_email='wmjlyjemaine@gmail.com',
packages=find_packages(),
install_requires=[
'typer>=0.4.0',
'isort>=5.10.1',
'autopep8>=1.6.0',
'numpydoc>=1.1.0',
"pydantic>=1.9.0",
"ruamel.yaml>=0.17.20",
"PyYAML>=5.1",
"ogb>=1.3.3",
"rdkit-pypi",
"scikit-learn>=0.20.0"
],
package_data={"": ["./*"]},
include_package_data=True,
license='APACHE',
entry_points={
'console_scripts': [
"dgl = dglgo.cli.cli:main"
]
},
url='https://github.com/dmlc/dgl',
)
from setuptools import find_packages
setup(
name="dglgo",
version="0.0.2",
description="DGL",
author="DGL Team",
author_email="wmjlyjemaine@gmail.com",
packages=find_packages(),
install_requires=[
"typer>=0.4.0",
"isort>=5.10.1",
"autopep8>=1.6.0",
"numpydoc>=1.1.0",
"pydantic>=1.9.0",
"ruamel.yaml>=0.17.20",
"PyYAML>=5.1",
"ogb>=1.3.3",
"rdkit-pypi",
"scikit-learn>=0.20.0",
],
package_data={"": ["./*"]},
include_package_data=True,
license="APACHE",
entry_points={"console_scripts": ["dgl = dglgo.cli.cli:main"]},
url="https://github.com/dmlc/dgl",
)
......@@ -14,16 +14,18 @@
#
import os
import sys
sys.path.insert(0, os.path.abspath('../../python'))
sys.path.insert(0, os.path.abspath("../../python"))
# -- Project information -----------------------------------------------------
project = 'DGL'
copyright = '2018, DGL Team'
author = 'DGL Team'
project = "DGL"
copyright = "2018, DGL Team"
author = "DGL Team"
import dgl
version = dgl.__version__
release = dgl.__version__
dglbackend = os.environ.get("DGLBACKEND", "pytorch")
......@@ -39,35 +41,35 @@ dglbackend = os.environ.get("DGLBACKEND", "pytorch")
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.autosummary',
'sphinx.ext.coverage',
'sphinx.ext.mathjax',
'sphinx.ext.napoleon',
'sphinx.ext.viewcode',
'sphinx.ext.intersphinx',
'sphinx.ext.graphviz',
'sphinxemoji.sphinxemoji',
'sphinx_gallery.gen_gallery',
'sphinx_copybutton',
'nbsphinx',
'nbsphinx_link',
"sphinx.ext.autodoc",
"sphinx.ext.autosummary",
"sphinx.ext.coverage",
"sphinx.ext.mathjax",
"sphinx.ext.napoleon",
"sphinx.ext.viewcode",
"sphinx.ext.intersphinx",
"sphinx.ext.graphviz",
"sphinxemoji.sphinxemoji",
"sphinx_gallery.gen_gallery",
"sphinx_copybutton",
"nbsphinx",
"nbsphinx_link",
]
# Do not run notebooks on non-pytorch backends
if dglbackend != "pytorch":
nbsphinx_execute = 'never'
nbsphinx_execute = "never"
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
templates_path = ["_templates"]
# The suffix(es) of source filenames.
# You can specify multiple suffixes as a list of strings:
#
source_suffix = ['.rst', '.md']
source_suffix = [".rst", ".md"]
# The master toctree document.
master_doc = 'index'
master_doc = "index"
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
......@@ -90,7 +92,7 @@ pygments_style = None
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'
html_theme = "sphinx_rtd_theme"
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
......@@ -101,8 +103,8 @@ html_theme = 'sphinx_rtd_theme'
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
html_css_files = ['css/custom.css']
html_static_path = ["_static"]
html_css_files = ["css/custom.css"]
# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
......@@ -118,7 +120,7 @@ html_css_files = ['css/custom.css']
# -- Options for HTMLHelp output ---------------------------------------------
# Output file base name for HTML help builder.
htmlhelp_basename = 'dgldoc'
htmlhelp_basename = "dgldoc"
# -- Options for LaTeX output ------------------------------------------------
......@@ -127,15 +129,12 @@ latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#
# 'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#
# 'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#
# 'preamble': '',
# Latex figure (float) alignment
#
# 'figure_align': 'htbp',
......@@ -145,8 +144,7 @@ latex_elements = {
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
(master_doc, 'dgl.tex', 'DGL Documentation',
'DGL Team', 'manual'),
(master_doc, "dgl.tex", "DGL Documentation", "DGL Team", "manual"),
]
......@@ -154,10 +152,7 @@ latex_documents = [
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
(master_doc, 'dgl', 'DGL Documentation',
[author], 1)
]
man_pages = [(master_doc, "dgl", "DGL Documentation", [author], 1)]
# -- Options for Texinfo output ----------------------------------------------
......@@ -166,9 +161,15 @@ man_pages = [
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
(master_doc, 'dgl', 'DGL Documentation',
author, 'dgl', 'Library for deep learning on graphs.',
'Miscellaneous'),
(
master_doc,
"dgl",
"DGL Documentation",
author,
"dgl",
"Library for deep learning on graphs.",
"Miscellaneous",
),
]
......@@ -187,64 +188,71 @@ epub_title = project
# epub_uid = ''
# A list of files that should not be packed into the epub file.
epub_exclude_files = ['search.html']
epub_exclude_files = ["search.html"]
# -- Extension configuration -------------------------------------------------
autosummary_generate = True
autodoc_member_order = 'alphabetical'
autodoc_member_order = "alphabetical"
intersphinx_mapping = {
'python': ('https://docs.python.org/{.major}'.format(sys.version_info), None),
'numpy': ('http://docs.scipy.org/doc/numpy/', None),
'scipy': ('http://docs.scipy.org/doc/scipy/reference', None),
'matplotlib': ('http://matplotlib.org/', None),
'networkx' : ('https://networkx.github.io/documentation/stable', None),
"python": (
"https://docs.python.org/{.major}".format(sys.version_info),
None,
),
"numpy": ("http://docs.scipy.org/doc/numpy/", None),
"scipy": ("http://docs.scipy.org/doc/scipy/reference", None),
"matplotlib": ("http://matplotlib.org/", None),
"networkx": ("https://networkx.github.io/documentation/stable", None),
}
# sphinx gallery configurations
from sphinx_gallery.sorting import FileNameSortKey
examples_dirs = ['../../tutorials/blitz',
'../../tutorials/large',
'../../tutorials/dist',
'../../tutorials/models',
'../../tutorials/multi',
'../../tutorials/cpu'] # path to find sources
gallery_dirs = ['tutorials/blitz/',
'tutorials/large/',
'tutorials/dist/',
'tutorials/models/',
'tutorials/multi/',
'tutorials/cpu'] # path to generate docs
examples_dirs = [
"../../tutorials/blitz",
"../../tutorials/large",
"../../tutorials/dist",
"../../tutorials/models",
"../../tutorials/multi",
"../../tutorials/cpu",
] # path to find sources
gallery_dirs = [
"tutorials/blitz/",
"tutorials/large/",
"tutorials/dist/",
"tutorials/models/",
"tutorials/multi/",
"tutorials/cpu",
] # path to generate docs
if dglbackend != "pytorch":
examples_dirs = []
gallery_dirs = []
reference_url = {
'dgl' : None,
'numpy': 'http://docs.scipy.org/doc/numpy/',
'scipy': 'http://docs.scipy.org/doc/scipy/reference',
'matplotlib': 'http://matplotlib.org/',
'networkx' : 'https://networkx.github.io/documentation/stable',
"dgl": None,
"numpy": "http://docs.scipy.org/doc/numpy/",
"scipy": "http://docs.scipy.org/doc/scipy/reference",
"matplotlib": "http://matplotlib.org/",
"networkx": "https://networkx.github.io/documentation/stable",
}
sphinx_gallery_conf = {
'backreferences_dir' : 'generated/backreferences',
'doc_module' : ('dgl', 'numpy'),
'examples_dirs' : examples_dirs,
'gallery_dirs' : gallery_dirs,
'within_subsection_order' : FileNameSortKey,
'filename_pattern' : '.py',
'download_all_examples' : False,
"backreferences_dir": "generated/backreferences",
"doc_module": ("dgl", "numpy"),
"examples_dirs": examples_dirs,
"gallery_dirs": gallery_dirs,
"within_subsection_order": FileNameSortKey,
"filename_pattern": ".py",
"download_all_examples": False,
}
# Compatibility for different backends when building tutorials
if dglbackend == 'mxnet':
sphinx_gallery_conf['filename_pattern'] = "/*(?<=mx)\.py"
if dglbackend == 'pytorch':
sphinx_gallery_conf['filename_pattern'] = "/*(?<!mx)\.py"
if dglbackend == "mxnet":
sphinx_gallery_conf["filename_pattern"] = "/*(?<=mx)\.py"
if dglbackend == "pytorch":
sphinx_gallery_conf["filename_pattern"] = "/*(?<!mx)\.py"
# sphinx-copybutton tool
copybutton_prompt_text = r'>>> |\.\.\. '
copybutton_prompt_text = r">>> |\.\.\. "
copybutton_prompt_is_regexp = True
from pytablewriter import RstGridTableWriter, MarkdownTableWriter
import numpy as np
import pandas as pd
from dgl import DGLGraph
from dgl.data.gnn_benchmark import AmazonCoBuy, CoraFull, Coauthor
from dgl.data.karate import KarateClub
from dgl.data.gindt import GINDataset
# from dgl.data.qm9 import QM9
from dgl.data import CitationGraphDataset, PPIDataset, RedditDataset, TUDataset
from dgl.data.bitcoinotc import BitcoinOTC
from dgl.data.gdelt import GDELT
from dgl.data.gindt import GINDataset
from dgl.data.gnn_benchmark import AmazonCoBuy, Coauthor, CoraFull
from dgl.data.icews18 import ICEWS18
from dgl.data.karate import KarateClub
from dgl.data.qm7b import QM7b
# from dgl.data.qm9 import QM9
from dgl.data import CitationGraphDataset, PPIDataset, RedditDataset, TUDataset
from pytablewriter import MarkdownTableWriter, RstGridTableWriter
ds_list = {
"BitcoinOTC": "BitcoinOTC()",
......@@ -40,9 +41,9 @@ writer = RstGridTableWriter()
# writer = MarkdownTableWriter()
extract_graph = lambda g: g if isinstance(g, DGLGraph) else g[0]
stat_list=[]
for k,v in ds_list.items():
print(k, ' ', v)
stat_list = []
for k, v in ds_list.items():
print(k, " ", v)
ds = eval(v.split("/")[0])
num_nodes = []
num_edges = []
......@@ -58,10 +59,10 @@ for k,v in ds_list.items():
"# of graphs": len(ds),
"Avg. # of nodes": np.mean(num_nodes),
"Avg. # of edges": np.mean(num_edges),
"Node field": ', '.join(list(gg.ndata.keys())),
"Edge field": ', '.join(list(gg.edata.keys())),
"Node field": ", ".join(list(gg.ndata.keys())),
"Edge field": ", ".join(list(gg.edata.keys())),
# "Graph field": ', '.join(ds[0][0].gdata.keys()) if hasattr(ds[0][0], "gdata") else "",
"Temporal": hasattr(ds, "is_temporal")
"Temporal": hasattr(ds, "is_temporal"),
}
stat_list.append(dd)
......
......@@ -26,15 +26,14 @@ def get_sddmm_kernels_gpu(idtypes, dtypes):
return ret
if __name__ == '__main__':
binary_path = 'libfeatgraph_kernels.so'
if __name__ == "__main__":
binary_path = "libfeatgraph_kernels.so"
kernels = []
idtypes = ['int32', 'int64']
dtypes = ['float16', 'float64', 'float32', 'int32', 'int64']
idtypes = ["int32", "int64"]
dtypes = ["float16", "float64", "float32", "int32", "int64"]
kernels += get_sddmm_kernels_gpu(idtypes, dtypes)
# build kernels and export the module to libfeatgraph_kernels.so
module = tvm.build(kernels, target='cuda', target_host='llvm')
module = tvm.build(kernels, target="cuda", target_host="llvm")
module.export_library(binary_path)
......@@ -4,8 +4,8 @@ from tvm import te
def sddmm_tree_reduction_gpu(idx_type, feat_type):
""" SDDMM kernels on GPU optimized with Tree Reduction.
"""SDDMM kernels on GPU optimized with Tree Reduction.
Parameters
----------
idx_type : str
......@@ -19,35 +19,40 @@ def sddmm_tree_reduction_gpu(idx_type, feat_type):
The result IRModule.
"""
# define vars and placeholders
nnz = te.var('nnz', idx_type)
num_rows = te.var('num_rows', idx_type)
num_cols = te.var('num_cols', idx_type)
H = te.var('num_heads', idx_type)
D = te.var('feat_len', idx_type)
row = te.placeholder((nnz,), idx_type, 'row')
col = te.placeholder((nnz,), idx_type, 'col')
ufeat = te.placeholder((num_rows, H, D), feat_type, 'ufeat')
vfeat = te.placeholder((num_cols, H, D), feat_type, 'vfeat')
nnz = te.var("nnz", idx_type)
num_rows = te.var("num_rows", idx_type)
num_cols = te.var("num_cols", idx_type)
H = te.var("num_heads", idx_type)
D = te.var("feat_len", idx_type)
row = te.placeholder((nnz,), idx_type, "row")
col = te.placeholder((nnz,), idx_type, "col")
ufeat = te.placeholder((num_rows, H, D), feat_type, "ufeat")
vfeat = te.placeholder((num_cols, H, D), feat_type, "vfeat")
# define edge computation function
def edge_func(eid, h, i):
k = te.reduce_axis((0, D), name='k')
k = te.reduce_axis((0, D), name="k")
return te.sum(ufeat[row[eid], h, k] * vfeat[col[eid], h, k], axis=k)
out = te.compute((nnz, H, tvm.tir.IntImm(idx_type, 1)), edge_func, name='out')
out = te.compute(
(nnz, H, tvm.tir.IntImm(idx_type, 1)), edge_func, name="out"
)
# define schedules
sched = te.create_schedule(out.op)
edge_axis, head_axis, _ = out.op.axis
reduce_axis = out.op.reduce_axis[0]
_, red_inner = sched[out].split(reduce_axis, factor=32)
edge_outer, edge_inner = sched[out].split(edge_axis, factor=32)
sched[out].bind(red_inner, te.thread_axis('threadIdx.x'))
sched[out].bind(edge_inner, te.thread_axis('threadIdx.y'))
sched[out].bind(edge_outer, te.thread_axis('blockIdx.x'))
sched[out].bind(head_axis, te.thread_axis('blockIdx.y'))
return tvm.lower(sched, [row, col, ufeat, vfeat, out],
name='SDDMMTreeReduction_{}_{}'.format(idx_type, feat_type))
sched[out].bind(red_inner, te.thread_axis("threadIdx.x"))
sched[out].bind(edge_inner, te.thread_axis("threadIdx.y"))
sched[out].bind(edge_outer, te.thread_axis("blockIdx.x"))
sched[out].bind(head_axis, te.thread_axis("blockIdx.y"))
return tvm.lower(
sched,
[row, col, ufeat, vfeat, out],
name="SDDMMTreeReduction_{}_{}".format(idx_type, feat_type),
)
if __name__ == '__main__':
kernel0 = sddmm_tree_reduction_gpu('int32', 'float32')
if __name__ == "__main__":
kernel0 = sddmm_tree_reduction_gpu("int32", "float32")
print(kernel0)
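For reference, a NumPy sketch (toy sizes, not part of the original script) of the dot-product SDDMM that the tree-reduction kernel above computes: for every nonzero (edge) e and head h, the gathered source and destination feature rows are reduced over the feature axis.

import numpy as np

nnz, num_rows, num_cols, H, D = 6, 4, 5, 2, 8
rng = np.random.default_rng(0)
row = rng.integers(0, num_rows, size=nnz)
col = rng.integers(0, num_cols, size=nnz)
ufeat = rng.standard_normal((num_rows, H, D)).astype("float32")
vfeat = rng.standard_normal((num_cols, H, D)).astype("float32")

# out[e, h, 0] = sum_k ufeat[row[e], h, k] * vfeat[col[e], h, k]
out = np.einsum("ehk,ehk->eh", ufeat[row], vfeat[col])[..., None]
print(out.shape)  # (6, 2, 1)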
import torch
import dgl
import dgl.backend as F
import torch
g = dgl.rand_graph(10, 15).int().to(torch.device(0))
gidx = g._graph
u = torch.rand((10,2,8), device=torch.device(0))
v = torch.rand((10,2,8), device=torch.device(0))
e = dgl.ops.gsddmm(g, 'dot', u, v)
u = torch.rand((10, 2, 8), device=torch.device(0))
v = torch.rand((10, 2, 8), device=torch.device(0))
e = dgl.ops.gsddmm(g, "dot", u, v)
print(e)
e = torch.zeros((15,2,1), device=torch.device(0))
e = torch.zeros((15, 2, 1), device=torch.device(0))
u = F.zerocopy_to_dgl_ndarray(u)
v = F.zerocopy_to_dgl_ndarray(v)
e = F.zerocopy_to_dgl_ndarray_for_write(e)
......
......@@ -22,13 +22,13 @@ networks with PyTorch.
"""
import os
os.environ['DGLBACKEND'] = 'pytorch'
import torch
import torch.nn as nn
import torch.nn.functional as F
os.environ["DGLBACKEND"] = "pytorch"
import dgl
import dgl.data
import torch
import torch.nn as nn
import torch.nn.functional as F
######################################################################
# Overview of Node Classification with GNN
......
......@@ -31,11 +31,11 @@ By the end of this tutorial you will be able to:
#
import os
os.environ['DGLBACKEND'] = 'pytorch'
import numpy as np
import torch
os.environ["DGLBACKEND"] = "pytorch"
import dgl
import numpy as np
import torch
g = dgl.graph(([0, 0, 0, 0, 0], [1, 2, 3, 4, 5]), num_nodes=6)
# Equivalently, PyTorch LongTensors also work.
......
......@@ -19,13 +19,13 @@ GNN for node classification <1_introduction>`.
"""
import os
os.environ['DGLBACKEND'] = 'pytorch'
import torch
import torch.nn as nn
import torch.nn.functional as F
os.environ["DGLBACKEND"] = "pytorch"
import dgl
import dgl.function as fn
import torch
import torch.nn as nn
import torch.nn.functional as F
######################################################################
# Message passing and GNNs
......
......@@ -19,17 +19,17 @@ By the end of this tutorial you will be able to
import itertools
import os
os.environ['DGLBACKEND'] = 'pytorch'
os.environ["DGLBACKEND"] = "pytorch"
import dgl
import dgl.data
import numpy as np
import scipy.sparse as sp
import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl
import dgl.data
######################################################################
# Overview of Link Prediction with GNN
# ------------------------------------
......
......@@ -14,13 +14,13 @@ By the end of this tutorial, you will be able to
"""
import os
os.environ['DGLBACKEND'] = 'pytorch'
import torch
import torch.nn as nn
import torch.nn.functional as F
os.environ["DGLBACKEND"] = "pytorch"
import dgl
import dgl.data
import torch
import torch.nn as nn
import torch.nn.functional as F
######################################################################
# Overview of Graph Classification with GNN
......@@ -54,6 +54,8 @@ print("Node feature dimensionality:", dataset.dim_nfeats)
print("Number of graph categories:", dataset.gclasses)
from dgl.dataloading import GraphDataLoader
######################################################################
# Defining Data Loader
# --------------------
......@@ -74,8 +76,6 @@ print("Number of graph categories:", dataset.gclasses)
from torch.utils.data.sampler import SubsetRandomSampler
from dgl.dataloading import GraphDataLoader
num_examples = len(dataset)
num_train = int(num_examples * 0.8)
......
......@@ -88,10 +88,10 @@ interactions.head()
#
import os
os.environ['DGLBACKEND'] = 'pytorch'
import torch
os.environ["DGLBACKEND"] = "pytorch"
import dgl
import torch
from dgl.data import DGLDataset
......
......@@ -26,10 +26,11 @@ Sampling for GNN Training <L0_neighbor_sampling_overview>`.
#
import os
os.environ['DGLBACKEND'] = 'pytorch'
os.environ["DGLBACKEND"] = "pytorch"
import dgl
import torch
import numpy as np
import torch
from ogb.nodeproppred import DglNodePropPredDataset
dataset = DglNodePropPredDataset("ogbn-arxiv")
......@@ -284,13 +285,14 @@ valid_dataloader = dgl.dataloading.DataLoader(
)
import sklearn.metrics
######################################################################
# The following is a training loop that performs validation every epoch.
# It also saves the model with the best validation accuracy into a file.
#
import tqdm
import sklearn.metrics
best_accuracy = 0
best_model_path = "model.pt"
......
......@@ -53,10 +53,11 @@ Sampling for Node Classification <L1_large_node_classification>`.
#
import os
os.environ['DGLBACKEND'] = 'pytorch'
os.environ["DGLBACKEND"] = "pytorch"
import dgl
import torch
import numpy as np
import torch
from ogb.nodeproppred import DglNodePropPredDataset
dataset = DglNodePropPredDataset("ogbn-arxiv")
......@@ -339,6 +340,8 @@ predictor = DotPredictor().to(device)
opt = torch.optim.Adam(list(model.parameters()) + list(predictor.parameters()))
import sklearn.metrics
######################################################################
# The following is the training loop for link prediction and
# evaluation, and also saves the model that performs the best on the
......@@ -346,7 +349,6 @@ opt = torch.optim.Adam(list(model.parameters()) + list(predictor.parameters()))
#
import tqdm
import sklearn.metrics
best_accuracy = 0
best_model_path = "model.pt"
......
......@@ -14,30 +14,33 @@ for stochastic GNN training. It assumes that
"""
import os
os.environ['DGLBACKEND'] = 'pytorch'
os.environ["DGLBACKEND"] = "pytorch"
import dgl
import torch
import numpy as np
import torch
from ogb.nodeproppred import DglNodePropPredDataset
dataset = DglNodePropPredDataset('ogbn-arxiv')
device = 'cpu' # change to 'cuda' for GPU
dataset = DglNodePropPredDataset("ogbn-arxiv")
device = "cpu" # change to 'cuda' for GPU
graph, node_labels = dataset[0]
# Add reverse edges since ogbn-arxiv is unidirectional.
graph = dgl.add_reverse_edges(graph)
graph.ndata['label'] = node_labels[:, 0]
graph.ndata["label"] = node_labels[:, 0]
idx_split = dataset.get_idx_split()
train_nids = idx_split['train']
node_features = graph.ndata['feat']
train_nids = idx_split["train"]
node_features = graph.ndata["feat"]
sampler = dgl.dataloading.MultiLayerNeighborSampler([4, 4])
train_dataloader = dgl.dataloading.DataLoader(
graph, train_nids, sampler,
graph,
train_nids,
sampler,
batch_size=1024,
shuffle=True,
drop_last=False,
num_workers=0
num_workers=0,
)
input_nodes, output_nodes, mfgs = next(iter(train_dataloader))
......@@ -75,8 +78,8 @@ print(mfg.num_src_nodes(), mfg.num_dst_nodes())
# will do with ``ndata`` on the graphs you have seen earlier:
#
mfg.srcdata['x'] = torch.zeros(mfg.num_src_nodes(), mfg.num_dst_nodes())
dst_feat = mfg.dstdata['feat']
mfg.srcdata["x"] = torch.zeros(mfg.num_src_nodes(), mfg.num_dst_nodes())
dst_feat = mfg.dstdata["feat"]
######################################################################
......@@ -105,7 +108,11 @@ mfg.srcdata[dgl.NID], mfg.dstdata[dgl.NID]
# .. |image1| image:: https://data.dgl.ai/tutorial/img/bipartite.gif
#
print(torch.equal(mfg.srcdata[dgl.NID][:mfg.num_dst_nodes()], mfg.dstdata[dgl.NID]))
print(
torch.equal(
mfg.srcdata[dgl.NID][: mfg.num_dst_nodes()], mfg.dstdata[dgl.NID]
)
)
######################################################################
......@@ -113,7 +120,7 @@ print(torch.equal(mfg.srcdata[dgl.NID][:mfg.num_dst_nodes()], mfg.dstdata[dgl.NI
# :math:`h_u^{(l-1)}`:
#
mfg.srcdata['h'] = torch.randn(mfg.num_src_nodes(), 10)
mfg.srcdata["h"] = torch.randn(mfg.num_src_nodes(), 10)
######################################################################
......@@ -132,8 +139,8 @@ mfg.srcdata['h'] = torch.randn(mfg.num_src_nodes(), 10)
import dgl.function as fn
mfg.update_all(message_func=fn.copy_u('h', 'm'), reduce_func=fn.mean('m', 'h'))
m_v = mfg.dstdata['h']
mfg.update_all(message_func=fn.copy_u("h", "m"), reduce_func=fn.mean("m", "h"))
m_v = mfg.dstdata["h"]
m_v
......@@ -147,6 +154,7 @@ import torch.nn as nn
import torch.nn.functional as F
import tqdm
class SAGEConv(nn.Module):
"""Graph convolution module used by the GraphSAGE model.
......@@ -157,6 +165,7 @@ class SAGEConv(nn.Module):
out_feat : int
Output feature size.
"""
def __init__(self, in_feat, out_feat):
super(SAGEConv, self).__init__()
# A linear submodule for projecting the input and neighbor feature to the output.
......@@ -174,14 +183,15 @@ class SAGEConv(nn.Module):
"""
with g.local_scope():
h_src, h_dst = h
g.srcdata['h'] = h_src # <---
g.dstdata['h'] = h_dst # <---
g.srcdata["h"] = h_src # <---
g.dstdata["h"] = h_dst # <---
# update_all is a message passing API.
g.update_all(fn.copy_u('h', 'm'), fn.mean('m', 'h_N'))
h_N = g.dstdata['h_N']
h_total = torch.cat([h_dst, h_N], dim=1) # <---
g.update_all(fn.copy_u("h", "m"), fn.mean("m", "h_N"))
h_N = g.dstdata["h_N"]
h_total = torch.cat([h_dst, h_N], dim=1) # <---
return self.linear(h_total)
class Model(nn.Module):
def __init__(self, in_feats, h_feats, num_classes):
super(Model, self).__init__()
......@@ -189,28 +199,31 @@ class Model(nn.Module):
self.conv2 = SAGEConv(h_feats, num_classes)
def forward(self, mfgs, x):
h_dst = x[:mfgs[0].num_dst_nodes()]
h_dst = x[: mfgs[0].num_dst_nodes()]
h = self.conv1(mfgs[0], (x, h_dst))
h = F.relu(h)
h_dst = h[:mfgs[1].num_dst_nodes()]
h_dst = h[: mfgs[1].num_dst_nodes()]
h = self.conv2(mfgs[1], (h, h_dst))
return h
sampler = dgl.dataloading.MultiLayerNeighborSampler([4, 4])
train_dataloader = dgl.dataloading.DataLoader(
graph, train_nids, sampler,
graph,
train_nids,
sampler,
device=device,
batch_size=1024,
shuffle=True,
drop_last=False,
num_workers=0
num_workers=0,
)
model = Model(graph.ndata['feat'].shape[1], 128, dataset.num_classes).to(device)
model = Model(graph.ndata["feat"].shape[1], 128, dataset.num_classes).to(device)
with tqdm.tqdm(train_dataloader) as tq:
for step, (input_nodes, output_nodes, mfgs) in enumerate(tq):
inputs = mfgs[0].srcdata['feat']
labels = mfgs[-1].dstdata['label']
inputs = mfgs[0].srcdata["feat"]
labels = mfgs[-1].dstdata["label"]
predictions = model(mfgs, inputs)
......@@ -232,6 +245,7 @@ with tqdm.tqdm(train_dataloader) as tq:
# Say you start with a GNN module that works for full-graph training only:
#
class SAGEConv(nn.Module):
"""Graph convolution module used by the GraphSAGE model.
......@@ -242,6 +256,7 @@ class SAGEConv(nn.Module):
out_feat : int
Output feature size.
"""
def __init__(self, in_feat, out_feat):
super().__init__()
# A linear submodule for projecting the input and neighbor feature to the output.
......@@ -258,10 +273,13 @@ class SAGEConv(nn.Module):
The input node feature.
"""
with g.local_scope():
g.ndata['h'] = h
g.ndata["h"] = h
# update_all is a message passing API.
g.update_all(message_func=fn.copy_u('h', 'm'), reduce_func=fn.mean('m', 'h_N'))
h_N = g.ndata['h_N']
g.update_all(
message_func=fn.copy_u("h", "m"),
reduce_func=fn.mean("m", "h_N"),
)
h_N = g.ndata["h_N"]
h_total = torch.cat([h, h_N], dim=1)
return self.linear(h_total)
......@@ -352,6 +370,7 @@ class SAGEConv(nn.Module):
# to something like the following:
#
class SAGEConvForBoth(nn.Module):
"""Graph convolution module used by the GraphSAGE model.
......@@ -362,6 +381,7 @@ class SAGEConvForBoth(nn.Module):
out_feat : int
Output feature size.
"""
def __init__(self, in_feat, out_feat):
super().__init__()
# A linear submodule for projecting the input and neighbor feature to the output.
......@@ -383,10 +403,13 @@ class SAGEConvForBoth(nn.Module):
else:
h_src = h_dst = h
g.srcdata['h'] = h_src
g.srcdata["h"] = h_src
# update_all is a message passing API.
g.update_all(message_func=fn.copy_u('h', 'm'), reduce_func=fn.mean('m', 'h_N'))
h_N = g.ndata['h_N']
g.update_all(
message_func=fn.copy_u("h", "m"),
reduce_func=fn.mean("m", "h_N"),
)
h_N = g.ndata["h_N"]
h_total = torch.cat([h_dst, h_N], dim=1)
return self.linear(h_total)
......
......@@ -20,189 +20,186 @@ Convolutional Networks <https://arxiv.org/pdf/1609.02907.pdf>`_). We explain
what is under the hood of the :class:`~dgl.nn.GraphConv` module.
The reader is expected to learn how to define a new GNN layer using DGL's
message passing APIs.
"""
###############################################################################
# Model Overview
# ------------------------------------------
# GCN from the perspective of message passing
# ```````````````````````````````````````````````
# We describe a layer of graph convolutional neural network from a message
# passing perspective; the math can be found `here <math_>`_.
# It boils down to the following steps, for each node :math:`u`:
#
# 1) Aggregate neighbors' representations :math:`h_{v}` to produce an
# intermediate representation :math:`\hat{h}_u`. 2) Transform the aggregated
# representation :math:`\hat{h}_{u}` with a linear projection followed by a
# non-linearity: :math:`h_{u} = f(W_{u} \hat{h}_u)`.
#
# We will implement step 1 with DGL message passing, and step 2 by
# PyTorch ``nn.Module``.
#
# GCN implementation with DGL
# ``````````````````````````````````````````
# We first define the message and reduce function as usual. Since the
# aggregation on a node :math:`u` only involves summing over the neighbors'
# representations :math:`h_v`, we can simply use builtin functions:
import os
os.environ['DGLBACKEND'] = 'pytorch'
import torch as th
import torch.nn as nn
import torch.nn.functional as F
import dgl
import dgl.function as fn
from dgl import DGLGraph
gcn_msg = fn.copy_u(u="h", out="m")
gcn_reduce = fn.sum(msg="m", out="h")
###############################################################################
# We then proceed to define the GCNLayer module. A GCNLayer essentially performs
# message passing on all the nodes then applies a fully-connected layer.
#
# .. note::
#
# This is showing how to implement a GCN from scratch. DGL provides a more
# efficient :class:`builtin GCN layer module <dgl.nn.pytorch.conv.GraphConv>`.
#
class GCNLayer(nn.Module):
def __init__(self, in_feats, out_feats):
super(GCNLayer, self).__init__()
self.linear = nn.Linear(in_feats, out_feats)
def forward(self, g, feature):
# Creating a local scope so that all the stored ndata and edata
# (such as the `'h'` ndata below) are automatically popped out
# when the scope exits.
with g.local_scope():
g.ndata["h"] = feature
g.update_all(gcn_msg, gcn_reduce)
h = g.ndata["h"]
return self.linear(h)
###############################################################################
# The forward function is essentially the same as in any other NN model
# commonly seen in PyTorch. We can initialize GCN like any ``nn.Module``. For example,
# let's define a simple neural network consisting of two GCN layers. Suppose we
# are training the classifier for the cora dataset (the input feature size is
# 1433 and the number of classes is 7). The last GCN layer computes node embeddings,
# so the last layer in general does not apply activation.
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.layer1 = GCNLayer(1433, 16)
self.layer2 = GCNLayer(16, 7)
def forward(self, g, features):
x = F.relu(self.layer1(g, features))
x = self.layer2(g, x)
return x
net = Net()
print(net)
###############################################################################
# We load the cora dataset using DGL's built-in data module.
from dgl.data import CoraGraphDataset
def load_cora_data():
dataset = CoraGraphDataset()
g = dataset[0]
features = g.ndata["feat"]
labels = g.ndata["label"]
train_mask = g.ndata["train_mask"]
test_mask = g.ndata["test_mask"]
return g, features, labels, train_mask, test_mask
###############################################################################
# When a model is trained, we can use the following method to evaluate
# the performance of the model on the test dataset:
def evaluate(model, g, features, labels, mask):
model.eval()
with th.no_grad():
logits = model(g, features)
logits = logits[mask]
labels = labels[mask]
_, indices = th.max(logits, dim=1)
correct = th.sum(indices == labels)
return correct.item() * 1.0 / len(labels)
###############################################################################
# We then train the network as follows:
import time
import numpy as np
g, features, labels, train_mask, test_mask = load_cora_data()
# Add edges between each node and itself to preserve old node representations
g.add_edges(g.nodes(), g.nodes())
optimizer = th.optim.Adam(net.parameters(), lr=1e-2)
dur = []
for epoch in range(50):
if epoch >= 3:
t0 = time.time()
net.train()
logits = net(g, features)
logp = F.log_softmax(logits, 1)
loss = F.nll_loss(logp[train_mask], labels[train_mask])
optimizer.zero_grad()
loss.backward()
optimizer.step()
if epoch >= 3:
dur.append(time.time() - t0)
acc = evaluate(net, g, features, labels, test_mask)
print(
"Epoch {:05d} | Loss {:.4f} | Test Acc {:.4f} | Time(s) {:.4f}".format(
epoch, loss.item(), acc, np.mean(dur)
)
)
###############################################################################
# .. _math:
#
# GCN in one formula
# ------------------
# Mathematically, the GCN model follows this formula:
#
# :math:`H^{(l+1)} = \sigma(\tilde{D}^{-\frac{1}{2}}\tilde{A}\tilde{D}^{-\frac{1}{2}}H^{(l)}W^{(l)})`
#
# Here, :math:`H^{(l)}` denotes the :math:`l^{th}` layer in the network,
# :math:`\sigma` is the non-linearity, and :math:`W` is the weight matrix for
# this layer. :math:`\tilde{D}` and :math:`\tilde{A}` are respectively the degree
# and adjacency matrices for the graph. With the superscript ~, we are referring
# to the variant where we add additional edges between each node and itself to
# preserve its old representation in graph convolutions. The shape of the input
# :math:`H^{(0)}` is :math:`N \times D`, where :math:`N` is the number of nodes
# and :math:`D` is the number of input features. We can chain up multiple
# layers as such to produce a node-level representation output with shape
# :math:`N \times F`, where :math:`F` is the dimension of the output node
# feature vector.
#
# The equation can be efficiently implemented using sparse matrix
# multiplication kernels (such as Kipf's
# `pygcn <https://github.com/tkipf/pygcn>`_ code). The above DGL implementation
# in fact has already used this trick due to the use of builtin functions.
#
# Note that the tutorial code implements a simplified version of GCN where we
# replace :math:`\tilde{D}^{-\frac{1}{2}}\tilde{A}\tilde{D}^{-\frac{1}{2}}` with
# :math:`\tilde{A}`. For a full implementation, see our example
# `here <https://github.com/dmlc/dgl/tree/master/examples/pytorch/gcn>`_.
"""
###############################################################################
# Model Overview
# ------------------------------------------
# GCN from the perspective of message passing
# ```````````````````````````````````````````````
# We describe a layer of graph convolutional neural network from a message
# passing perspective; the math can be found `here <math_>`_.
# It boils down to the following steps, for each node :math:`u`:
#
# 1) Aggregate neighbors' representations :math:`h_{v}` to produce an
# intermediate representation :math:`\hat{h}_u`. 2) Transform the aggregated
# representation :math:`\hat{h}_{u}` with a linear projection followed by a
# non-linearity: :math:`h_{u} = f(W_{u} \hat{h}_u)`.
#
# We will implement step 1 with DGL message passing, and step 2 by
# PyTorch ``nn.Module``.
#
# GCN implementation with DGL
# ``````````````````````````````````````````
# We first define the message and reduce function as usual. Since the
# aggregation on a node :math:`u` only involves summing over the neighbors'
# representations :math:`h_v`, we can simply use builtin functions:
import os
os.environ["DGLBACKEND"] = "pytorch"
import dgl
import dgl.function as fn
import torch as th
import torch.nn as nn
import torch.nn.functional as F
from dgl import DGLGraph
gcn_msg = fn.copy_u(u="h", out="m")
gcn_reduce = fn.sum(msg="m", out="h")
###############################################################################
# We then proceed to define the GCNLayer module. A GCNLayer essentially performs
# message passing on all the nodes then applies a fully-connected layer.
#
# .. note::
#
# This is showing how to implement a GCN from scratch. DGL provides a more
# efficient :class:`builtin GCN layer module <dgl.nn.pytorch.conv.GraphConv>`.
#
class GCNLayer(nn.Module):
def __init__(self, in_feats, out_feats):
super(GCNLayer, self).__init__()
self.linear = nn.Linear(in_feats, out_feats)
def forward(self, g, feature):
# Creating a local scope so that all the stored ndata and edata
# (such as the `'h'` ndata below) are automatically popped out
# when the scope exits.
with g.local_scope():
g.ndata["h"] = feature
g.update_all(gcn_msg, gcn_reduce)
h = g.ndata["h"]
return self.linear(h)
###############################################################################
# The forward function is essentially the same as in any other NN model
# commonly seen in PyTorch. We can initialize GCN like any ``nn.Module``. For example,
# let's define a simple neural network consisting of two GCN layers. Suppose we
# are training the classifier for the cora dataset (the input feature size is
# 1433 and the number of classes is 7). The last GCN layer computes node embeddings,
# so the last layer in general does not apply activation.
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.layer1 = GCNLayer(1433, 16)
self.layer2 = GCNLayer(16, 7)
def forward(self, g, features):
x = F.relu(self.layer1(g, features))
x = self.layer2(g, x)
return x
net = Net()
print(net)
###############################################################################
# We load the cora dataset using DGL's built-in data module.
from dgl.data import CoraGraphDataset
def load_cora_data():
dataset = CoraGraphDataset()
g = dataset[0]
features = g.ndata["feat"]
labels = g.ndata["label"]
train_mask = g.ndata["train_mask"]
test_mask = g.ndata["test_mask"]
return g, features, labels, train_mask, test_mask
###############################################################################
# When a model is trained, we can use the following method to evaluate
# the performance of the model on the test dataset:
def evaluate(model, g, features, labels, mask):
model.eval()
with th.no_grad():
logits = model(g, features)
logits = logits[mask]
labels = labels[mask]
_, indices = th.max(logits, dim=1)
correct = th.sum(indices == labels)
return correct.item() * 1.0 / len(labels)
###############################################################################
# We then train the network as follows:
import time
import numpy as np
g, features, labels, train_mask, test_mask = load_cora_data()
# Add edges between each node and itself to preserve old node representations
g.add_edges(g.nodes(), g.nodes())
optimizer = th.optim.Adam(net.parameters(), lr=1e-2)
dur = []
for epoch in range(50):
if epoch >= 3:
t0 = time.time()
net.train()
logits = net(g, features)
logp = F.log_softmax(logits, 1)
loss = F.nll_loss(logp[train_mask], labels[train_mask])
optimizer.zero_grad()
loss.backward()
optimizer.step()
if epoch >= 3:
dur.append(time.time() - t0)
acc = evaluate(net, g, features, labels, test_mask)
print(
"Epoch {:05d} | Loss {:.4f} | Test Acc {:.4f} | Time(s) {:.4f}".format(
epoch, loss.item(), acc, np.mean(dur)
)
)
###############################################################################
# .. _math:
#
# GCN in one formula
# ------------------
# Mathematically, the GCN model follows this formula:
#
# :math:`H^{(l+1)} = \sigma(\tilde{D}^{-\frac{1}{2}}\tilde{A}\tilde{D}^{-\frac{1}{2}}H^{(l)}W^{(l)})`
#
# Here, :math:`H^{(l)}` denotes the :math:`l^{th}` layer in the network,
# :math:`\sigma` is the non-linearity, and :math:`W` is the weight matrix for
# this layer. :math:`\tilde{D}` and :math:`\tilde{A}` are respectively the degree
# and adjacency matrices for the graph. With the superscript ~, we are referring
# to the variant where we add additional edges between each node and itself to
# preserve its old representation in graph convolutions. The shape of the input
# :math:`H^{(0)}` is :math:`N \times D`, where :math:`N` is the number of nodes
# and :math:`D` is the number of input features. We can chain up multiple
# layers as such to produce a node-level representation output with shape
# :math:`N \times F`, where :math:`F` is the dimension of the output node
# feature vector.
#
# The equation can be efficiently implemented using sparse matrix
# multiplication kernels (such as Kipf's
# `pygcn <https://github.com/tkipf/pygcn>`_ code). The above DGL implementation
# in fact has already used this trick due to the use of builtin functions.
#
# Note that the tutorial code implements a simplified version of GCN where we
# replace :math:`\tilde{D}^{-\frac{1}{2}}\tilde{A}\tilde{D}^{-\frac{1}{2}}` with
# :math:`\tilde{A}`. For a full implementation, see our example
# `here <https://github.com/dmlc/dgl/tree/master/examples/pytorch/gcn>`_.
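As a concrete check of the formula above (a dense toy example, not part of the tutorial): the symmetrically normalized propagation can be written directly with dense tensors, while the tutorial's builtin-function implementation corresponds to dropping the normalization and using the self-loop-augmented adjacency alone.

import torch

# toy 3-node graph; A_tilde = A + I adds the self-loops used in the tutorial
A = torch.tensor([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0], [0.0, 1.0, 0.0]])
A_tilde = A + torch.eye(3)
D_inv_sqrt = torch.diag(A_tilde.sum(dim=1).pow(-0.5))

H = torch.randn(3, 4)  # N x D input features
W = torch.randn(4, 2)  # layer weight
H_next = torch.relu(D_inv_sqrt @ A_tilde @ D_inv_sqrt @ H @ W)
print(H_next.shape)  # torch.Size([3, 2])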
......@@ -29,340 +29,389 @@ subject, relation, object. Edges thus encode important information and
have their own embeddings to be learned. Furthermore, there may exist
multiple edges among any given pair.
"""
###############################################################################
# A brief introduction to R-GCN
# ---------------------------
# In *statistical relational learning* (SRL), there are two fundamental
# tasks:
#
# - **Entity classification** - Where you assign types and categorical
# properties to entities.
# - **Link prediction** - Where you recover missing triples.
#
# In both cases, missing information is expected to be recovered from the
# neighborhood structure of the graph. For example, the R-GCN
# paper cited earlier provides the following example. Knowing that Mikhail Baryshnikov was educated at the Vaganova Academy
# implies both that Mikhail Baryshnikov should have the label person, and
# that the triple (Mikhail Baryshnikov, lived in, Russia) must belong to the
# knowledge graph.
#
# R-GCN solves these two problems using a common graph convolutional network. It's
# extended with multi-edge encoding to compute embeddings of the entities, but
# with different downstream processing.
#
# - Entity classification is done by attaching a softmax classifier at the
# final embedding of an entity (node). Training uses a standard
# cross-entropy loss.
# - Link prediction is done by reconstructing an edge with an autoencoder
# architecture, using a parameterized score function. Training uses negative
# sampling.
#
# This tutorial focuses on the first task, entity classification, to show how to generate entity
# representation. `Complete
# code <https://github.com/dmlc/dgl/tree/master/examples/pytorch/rgcn>`_
# for both tasks is found in the DGL Github repository.
#
# Key ideas of R-GCN
# -------------------
# Recall that in GCN, the hidden representation for each node :math:`i` at
# :math:`(l+1)^{th}` layer is computed by:
#
# .. math:: h_i^{l+1} = \sigma\left(\sum_{j\in N_i}\frac{1}{c_i} W^{(l)} h_j^{(l)}\right)~~~~~~~~~~(1)\\
#
# where :math:`c_i` is a normalization constant.
#
# The key difference between R-GCN and GCN is that in R-GCN, edges can
# represent different relations. In GCN, weight :math:`W^{(l)}` in equation
# :math:`(1)` is shared by all edges in layer :math:`l`. In contrast, in
# R-GCN, different edge types use different weights and only edges of the
# same relation type :math:`r` are associated with the same projection weight
# :math:`W_r^{(l)}`.
#
# So the hidden representation of entities in :math:`(l+1)^{th}` layer in
# R-GCN can be formulated as the following equation:
#
# .. math:: h_i^{l+1} = \sigma\left(W_0^{(l)}h_i^{(l)}+\sum_{r\in R}\sum_{j\in N_i^r}\frac{1}{c_{i,r}}W_r^{(l)}h_j^{(l)}\right)~~~~~~~~~~(2)\\
#
# where :math:`N_i^r` denotes the set of neighbor indices of node :math:`i`
# under relation :math:`r\in R` and :math:`c_{i,r}` is a normalization
# constant. In entity classification, the R-GCN paper uses
# :math:`c_{i,r}=|N_i^r|`.
#
# The problem of applying the above equation directly is the rapid growth of
# the number of parameters, especially with highly multi-relational data. In
# order to reduce model parameter size and prevent overfitting, the original
# paper proposes to use basis decomposition.
#
# .. math:: W_r^{(l)}=\sum\limits_{b=1}^B a_{rb}^{(l)}V_b^{(l)}~~~~~~~~~~(3)\\
#
# Therefore, the weight :math:`W_r^{(l)}` is a linear combination of basis
# transformation :math:`V_b^{(l)}` with coefficients :math:`a_{rb}^{(l)}`.
# The number of bases :math:`B` is much smaller than the number of relations
# in the knowledge base.
#
# .. note::
# Another weight regularization, block-decomposition, is implemented in
# the `link prediction <link-prediction_>`_.
#
# Implement R-GCN in DGL
# ----------------------
#
# An R-GCN model is composed of several R-GCN layers. The first R-GCN layer
# also serves as input layer and takes in features (for example, description texts)
# that are associated with the node entities and projects them to the hidden space. In this tutorial,
# we only use the entity ID as an entity feature.
#
# R-GCN layers
# ~~~~~~~~~~~~
#
# For each node, an R-GCN layer performs the following steps:
#
# - Compute outgoing message using node representation and weight matrix
# associated with the edge type (message function)
# - Aggregate incoming messages and generate new node representations (reduce
# and apply function)
#
# The following code is the definition of an R-GCN hidden layer.
#
# .. note::
# Each relation type is associated with a different weight. Therefore,
# the full weight matrix has three dimensions: relation, input_feature,
# output_feature.
#
# .. note::
#
# This is showing how to implement an R-GCN from scratch. DGL provides a more
# efficient :class:`builtin R-GCN layer module <dgl.nn.pytorch.conv.RelGraphConv>`.
#
import os
os.environ['DGLBACKEND'] = 'pytorch'
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl import DGLGraph
import dgl.function as fn
from functools import partial
class RGCNLayer(nn.Module):
def __init__(self, in_feat, out_feat, num_rels, num_bases=-1, bias=None,
activation=None, is_input_layer=False):
super(RGCNLayer, self).__init__()
self.in_feat = in_feat
self.out_feat = out_feat
self.num_rels = num_rels
self.num_bases = num_bases
self.bias = bias
self.activation = activation
self.is_input_layer = is_input_layer
# sanity check
if self.num_bases <= 0 or self.num_bases > self.num_rels:
self.num_bases = self.num_rels
# weight bases in equation (3)
self.weight = nn.Parameter(torch.Tensor(self.num_bases, self.in_feat,
self.out_feat))
if self.num_bases < self.num_rels:
# linear combination coefficients in equation (3)
self.w_comp = nn.Parameter(torch.Tensor(self.num_rels, self.num_bases))
# add bias
if self.bias:
self.bias = nn.Parameter(torch.Tensor(out_feat))
# init trainable parameters
nn.init.xavier_uniform_(self.weight,
gain=nn.init.calculate_gain('relu'))
if self.num_bases < self.num_rels:
nn.init.xavier_uniform_(self.w_comp,
gain=nn.init.calculate_gain('relu'))
if self.bias:
nn.init.xavier_uniform_(self.bias,
gain=nn.init.calculate_gain('relu'))
def forward(self, g):
if self.num_bases < self.num_rels:
# generate all weights from bases (equation (3))
weight = self.weight.view(self.in_feat, self.num_bases, self.out_feat)
weight = torch.matmul(self.w_comp, weight).view(self.num_rels,
self.in_feat, self.out_feat)
else:
weight = self.weight
if self.is_input_layer:
def message_func(edges):
# for input layer, matrix multiply can be converted to be
# an embedding lookup using source node id
embed = weight.view(-1, self.out_feat)
index = edges.data[dgl.ETYPE] * self.in_feat + edges.src['id']
return {'msg': embed[index] * edges.data['norm']}
else:
def message_func(edges):
w = weight[edges.data[dgl.ETYPE]]
msg = torch.bmm(edges.src['h'].unsqueeze(1), w).squeeze()
msg = msg * edges.data['norm']
return {'msg': msg}
def apply_func(nodes):
h = nodes.data['h']
if self.bias:
h = h + self.bias
if self.activation:
h = self.activation(h)
return {'h': h}
g.update_all(message_func, fn.sum(msg='msg', out='h'), apply_func)
###############################################################################
# Full R-GCN model defined
# ~~~~~~~~~~~~~~~~~~~~~~~~
class Model(nn.Module):
def __init__(self, num_nodes, h_dim, out_dim, num_rels,
num_bases=-1, num_hidden_layers=1):
super(Model, self).__init__()
self.num_nodes = num_nodes
self.h_dim = h_dim
self.out_dim = out_dim
self.num_rels = num_rels
self.num_bases = num_bases
self.num_hidden_layers = num_hidden_layers
# create rgcn layers
self.build_model()
# create initial features
self.features = self.create_features()
def build_model(self):
self.layers = nn.ModuleList()
# input to hidden
i2h = self.build_input_layer()
self.layers.append(i2h)
# hidden to hidden
for _ in range(self.num_hidden_layers):
h2h = self.build_hidden_layer()
self.layers.append(h2h)
# hidden to output
h2o = self.build_output_layer()
self.layers.append(h2o)
# initialize feature for each node
def create_features(self):
features = torch.arange(self.num_nodes)
return features
def build_input_layer(self):
return RGCNLayer(self.num_nodes, self.h_dim, self.num_rels, self.num_bases,
activation=F.relu, is_input_layer=True)
def build_hidden_layer(self):
return RGCNLayer(self.h_dim, self.h_dim, self.num_rels, self.num_bases,
activation=F.relu)
def build_output_layer(self):
return RGCNLayer(self.h_dim, self.out_dim, self.num_rels, self.num_bases,
activation=partial(F.softmax, dim=1))
def forward(self, g):
if self.features is not None:
g.ndata['id'] = self.features
for layer in self.layers:
layer(g)
return g.ndata.pop('h')
###############################################################################
# Handle dataset
# ~~~~~~~~~~~~~~~~
# This tutorial uses the Institute for Applied Informatics and Formal Description Methods (AIFB) dataset from the R-GCN paper.
# load graph data
dataset = dgl.data.rdf.AIFBDataset()
g = dataset[0]
category = dataset.predict_category
train_mask = g.nodes[category].data.pop('train_mask')
test_mask = g.nodes[category].data.pop('test_mask')
train_idx = torch.nonzero(train_mask, as_tuple=False).squeeze()
test_idx = torch.nonzero(test_mask, as_tuple=False).squeeze()
labels = g.nodes[category].data.pop('label')
num_rels = len(g.canonical_etypes)
num_classes = dataset.num_classes
# normalization factor
for cetype in g.canonical_etypes:
g.edges[cetype].data['norm'] = dgl.norm_by_dst(g, cetype).unsqueeze(1)
category_id = g.ntypes.index(category)
###############################################################################
# Create graph and model
# ~~~~~~~~~~~~~~~~~~~~~~~
# configurations
n_hidden = 16 # number of hidden units
n_bases = -1 # use number of relations as number of bases
n_hidden_layers = 0 # use 1 input layer, 1 output layer, no hidden layer
n_epochs = 25 # epochs to train
lr = 0.01 # learning rate
l2norm = 0 # L2 norm coefficient
# create graph
g = dgl.to_homogeneous(g, edata=['norm'])
node_ids = torch.arange(g.num_nodes())
target_idx = node_ids[g.ndata[dgl.NTYPE] == category_id]
# create model
model = Model(g.num_nodes(),
n_hidden,
num_classes,
num_rels,
num_bases=n_bases,
num_hidden_layers=n_hidden_layers)
###############################################################################
# Training loop
# ~~~~~~~~~~~~~~~~
# optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=l2norm)
print("start training...")
model.train()
for epoch in range(n_epochs):
optimizer.zero_grad()
logits = model.forward(g)
logits = logits[target_idx]
loss = F.cross_entropy(logits[train_idx], labels[train_idx])
loss.backward()
optimizer.step()
train_acc = torch.sum(logits[train_idx].argmax(dim=1) == labels[train_idx])
train_acc = train_acc.item() / len(train_idx)
val_loss = F.cross_entropy(logits[test_idx], labels[test_idx])
val_acc = torch.sum(logits[test_idx].argmax(dim=1) == labels[test_idx])
val_acc = val_acc.item() / len(test_idx)
print("Epoch {:05d} | ".format(epoch) +
"Train Accuracy: {:.4f} | Train Loss: {:.4f} | ".format(
train_acc, loss.item()) +
"Validation Accuracy: {:.4f} | Validation loss: {:.4f}".format(
val_acc, val_loss.item()))
###############################################################################
# .. _link-prediction:
#
# The second task, link prediction
# --------------------------------
# So far, you have seen how to use DGL to implement entity classification with an
# R-GCN model. In the knowledge base setting, representation generated by
# R-GCN can be used to uncover potential relationships between nodes. In the
# R-GCN paper, the authors feed the entity representations generated by R-GCN
# into the `DistMult <https://arxiv.org/pdf/1412.6575.pdf>`_ prediction model
# to predict possible relationships.
#
# The implementation is similar to that presented here, but with an extra DistMult layer
# stacked on top of the R-GCN layers. You can find the complete
# implementation of link prediction with R-GCN in our `Github Python code
# example <https://github.com/dmlc/dgl/blob/master/examples/pytorch/rgcn/link.py>`_.
"""
###############################################################################
# A brief introduction to R-GCN
# -----------------------------
# In *statistical relational learning* (SRL), there are two fundamental
# tasks:
#
# - **Entity classification** - Where you assign types and categorical
# properties to entities.
# - **Link prediction** - Where you recover missing triples.
#
# In both cases, missing information is expected to be recovered from the
# neighborhood structure of the graph. For example, the R-GCN
# paper cited earlier provides the following example. Knowing that Mikhail Baryshnikov was educated at the Vaganova Academy
# implies both that Mikhail Baryshnikov should have the label person, and
# that the triple (Mikhail Baryshnikov, lived in, Russia) must belong to the
# knowledge graph.
#
# R-GCN solves these two problems using a common graph convolutional network. It's
# extended with multi-edge encoding to compute embeddings of the entities, but
# with different downstream processing.
#
# - Entity classification is done by attaching a softmax classifier at the
# final embedding of an entity (node). Training uses a standard
# cross-entropy loss.
# - Link prediction is done by reconstructing an edge with an autoencoder
# architecture, using a parameterized score function. Training uses negative
# sampling.
#
# This tutorial focuses on the first task, entity classification, to show how to generate entity
# representation. `Complete
# code <https://github.com/dmlc/dgl/tree/master/examples/pytorch/rgcn>`_
# for both tasks is found in the DGL Github repository.
#
# Key ideas of R-GCN
# -------------------
# Recall that in GCN, the hidden representation for each node :math:`i` at
# :math:`(l+1)^{th}` layer is computed by:
#
# .. math:: h_i^{l+1} = \sigma\left(\sum_{j\in N_i}\frac{1}{c_i} W^{(l)} h_j^{(l)}\right)~~~~~~~~~~(1)\\
#
# where :math:`c_i` is a normalization constant.
#
# The key difference between R-GCN and GCN is that in R-GCN, edges can
# represent different relations. In GCN, weight :math:`W^{(l)}` in equation
# :math:`(1)` is shared by all edges in layer :math:`l`. In contrast, in
# R-GCN, different edge types use different weights and only edges of the
# same relation type :math:`r` are associated with the same projection weight
# :math:`W_r^{(l)}`.
#
# So the hidden representation of entities in :math:`(l+1)^{th}` layer in
# R-GCN can be formulated as the following equation:
#
# .. math:: h_i^{l+1} = \sigma\left(W_0^{(l)}h_i^{(l)}+\sum_{r\in R}\sum_{j\in N_i^r}\frac{1}{c_{i,r}}W_r^{(l)}h_j^{(l)}\right)~~~~~~~~~~(2)\\
#
# where :math:`N_i^r` denotes the set of neighbor indices of node :math:`i`
# under relation :math:`r\in R` and :math:`c_{i,r}` is a normalization
# constant. In entity classification, the R-GCN paper uses
# :math:`c_{i,r}=|N_i^r|`.
#
# The problem of applying the above equation directly is the rapid growth of
# the number of parameters, especially with highly multi-relational data. In
# order to reduce model parameter size and prevent overfitting, the original
# paper proposes to use basis decomposition.
#
# .. math:: W_r^{(l)}=\sum\limits_{b=1}^B a_{rb}^{(l)}V_b^{(l)}~~~~~~~~~~(3)\\
#
# Therefore, the weight :math:`W_r^{(l)}` is a linear combination of basis
# transformation :math:`V_b^{(l)}` with coefficients :math:`a_{rb}^{(l)}`.
# The number of bases :math:`B` is much smaller than the number of relations
# in the knowledge base.
#
# .. note::
# Another weight regularization, block-decomposition, is implemented in
# the `link prediction <link-prediction_>`_.
#
# Implement R-GCN in DGL
# ----------------------
#
# An R-GCN model is composed of several R-GCN layers. The first R-GCN layer
# also serves as input layer and takes in features (for example, description texts)
# that are associated with the node entities and projects them to the hidden space. In this tutorial,
# we only use the entity ID as an entity feature.
#
# R-GCN layers
# ~~~~~~~~~~~~
#
# For each node, an R-GCN layer performs the following steps:
#
# - Compute outgoing message using node representation and weight matrix
# associated with the edge type (message function)
# - Aggregate incoming messages and generate new node representations (reduce
# and apply function)
#
# The following code is the definition of an R-GCN hidden layer.
#
# .. note::
# Each relation type is associated with a different weight. Therefore,
# the full weight matrix has three dimensions: relation, input_feature,
# output_feature.
#
# .. note::
#
# This is showing how to implement an R-GCN from scratch. DGL provides a more
# efficient :class:`builtin R-GCN layer module <dgl.nn.pytorch.conv.RelGraphConv>`.
#
import os
os.environ["DGLBACKEND"] = "pytorch"
from functools import partial
import dgl
import dgl.function as fn
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl import DGLGraph
class RGCNLayer(nn.Module):
def __init__(
self,
in_feat,
out_feat,
num_rels,
num_bases=-1,
bias=None,
activation=None,
is_input_layer=False,
):
super(RGCNLayer, self).__init__()
self.in_feat = in_feat
self.out_feat = out_feat
self.num_rels = num_rels
self.num_bases = num_bases
self.bias = bias
self.activation = activation
self.is_input_layer = is_input_layer
# sanity check
if self.num_bases <= 0 or self.num_bases > self.num_rels:
self.num_bases = self.num_rels
# weight bases in equation (3)
self.weight = nn.Parameter(
torch.Tensor(self.num_bases, self.in_feat, self.out_feat)
)
if self.num_bases < self.num_rels:
# linear combination coefficients in equation (3)
self.w_comp = nn.Parameter(
torch.Tensor(self.num_rels, self.num_bases)
)
# add bias
if self.bias:
self.bias = nn.Parameter(torch.Tensor(out_feat))
# init trainable parameters
nn.init.xavier_uniform_(
self.weight, gain=nn.init.calculate_gain("relu")
)
if self.num_bases < self.num_rels:
nn.init.xavier_uniform_(
self.w_comp, gain=nn.init.calculate_gain("relu")
)
if self.bias:
nn.init.xavier_uniform_(
self.bias, gain=nn.init.calculate_gain("relu")
)
def forward(self, g):
if self.num_bases < self.num_rels:
# generate all weights from bases (equation (3))
weight = self.weight.view(
self.in_feat, self.num_bases, self.out_feat
)
weight = torch.matmul(self.w_comp, weight).view(
self.num_rels, self.in_feat, self.out_feat
)
else:
weight = self.weight
if self.is_input_layer:
def message_func(edges):
# for input layer, matrix multiply can be converted to be
# an embedding lookup using source node id
embed = weight.view(-1, self.out_feat)
index = edges.data[dgl.ETYPE] * self.in_feat + edges.src["id"]
return {"msg": embed[index] * edges.data["norm"]}
else:
def message_func(edges):
w = weight[edges.data[dgl.ETYPE]]
msg = torch.bmm(edges.src["h"].unsqueeze(1), w).squeeze()
msg = msg * edges.data["norm"]
return {"msg": msg}
def apply_func(nodes):
h = nodes.data["h"]
if self.bias:
h = h + self.bias
if self.activation:
h = self.activation(h)
return {"h": h}
g.update_all(message_func, fn.sum(msg="msg", out="h"), apply_func)
###############################################################################
# Full R-GCN model defined
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
class Model(nn.Module):
def __init__(
self,
num_nodes,
h_dim,
out_dim,
num_rels,
num_bases=-1,
num_hidden_layers=1,
):
super(Model, self).__init__()
self.num_nodes = num_nodes
self.h_dim = h_dim
self.out_dim = out_dim
self.num_rels = num_rels
self.num_bases = num_bases
self.num_hidden_layers = num_hidden_layers
# create rgcn layers
self.build_model()
# create initial features
self.features = self.create_features()
def build_model(self):
self.layers = nn.ModuleList()
# input to hidden
i2h = self.build_input_layer()
self.layers.append(i2h)
# hidden to hidden
for _ in range(self.num_hidden_layers):
h2h = self.build_hidden_layer()
self.layers.append(h2h)
# hidden to output
h2o = self.build_output_layer()
self.layers.append(h2o)
# initialize feature for each node
def create_features(self):
features = torch.arange(self.num_nodes)
return features
def build_input_layer(self):
return RGCNLayer(
self.num_nodes,
self.h_dim,
self.num_rels,
self.num_bases,
activation=F.relu,
is_input_layer=True,
)
def build_hidden_layer(self):
return RGCNLayer(
self.h_dim,
self.h_dim,
self.num_rels,
self.num_bases,
activation=F.relu,
)
def build_output_layer(self):
return RGCNLayer(
self.h_dim,
self.out_dim,
self.num_rels,
self.num_bases,
activation=partial(F.softmax, dim=1),
)
def forward(self, g):
if self.features is not None:
g.ndata["id"] = self.features
for layer in self.layers:
layer(g)
return g.ndata.pop("h")
###############################################################################
# Handle dataset
# ~~~~~~~~~~~~~~~~
# This tutorial uses the Institute for Applied Informatics and Formal Description Methods (AIFB) dataset from the R-GCN paper.
# load graph data
dataset = dgl.data.rdf.AIFBDataset()
g = dataset[0]
category = dataset.predict_category
train_mask = g.nodes[category].data.pop("train_mask")
test_mask = g.nodes[category].data.pop("test_mask")
train_idx = torch.nonzero(train_mask, as_tuple=False).squeeze()
test_idx = torch.nonzero(test_mask, as_tuple=False).squeeze()
labels = g.nodes[category].data.pop("label")
num_rels = len(g.canonical_etypes)
num_classes = dataset.num_classes
# normalization factor
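# dgl.norm_by_dst assigns each edge 1 / in-degree of its destination node under
# that relation, i.e. the 1 / c_{i,r} = 1 / |N_i^r| normalization in equation (2)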
for cetype in g.canonical_etypes:
g.edges[cetype].data["norm"] = dgl.norm_by_dst(g, cetype).unsqueeze(1)
category_id = g.ntypes.index(category)
###############################################################################
# Create graph and model
# ~~~~~~~~~~~~~~~~~~~~~~~
# configurations
n_hidden = 16 # number of hidden units
n_bases = -1 # use number of relations as number of bases
n_hidden_layers = 0 # use 1 input layer, 1 output layer, no hidden layer
n_epochs = 25 # epochs to train
lr = 0.01 # learning rate
l2norm = 0 # L2 norm coefficient
# create graph
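# to_homogeneous keeps the original type information in g.ndata[dgl.NTYPE] and
# g.edata[dgl.ETYPE], which are used below to locate the target nodes and to pick
# relation-specific weights inside RGCNLayer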
g = dgl.to_homogeneous(g, edata=["norm"])
node_ids = torch.arange(g.num_nodes())
target_idx = node_ids[g.ndata[dgl.NTYPE] == category_id]
# create model
model = Model(
g.num_nodes(),
n_hidden,
num_classes,
num_rels,
num_bases=n_bases,
num_hidden_layers=n_hidden_layers,
)
###############################################################################
# Training loop
# ~~~~~~~~~~~~~~~~
# optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=l2norm)
print("start training...")
model.train()
for epoch in range(n_epochs):
optimizer.zero_grad()
logits = model.forward(g)
logits = logits[target_idx]
loss = F.cross_entropy(logits[train_idx], labels[train_idx])
loss.backward()
optimizer.step()
train_acc = torch.sum(logits[train_idx].argmax(dim=1) == labels[train_idx])
train_acc = train_acc.item() / len(train_idx)
val_loss = F.cross_entropy(logits[test_idx], labels[test_idx])
val_acc = torch.sum(logits[test_idx].argmax(dim=1) == labels[test_idx])
val_acc = val_acc.item() / len(test_idx)
print(
"Epoch {:05d} | ".format(epoch)
+ "Train Accuracy: {:.4f} | Train Loss: {:.4f} | ".format(
train_acc, loss.item()
)
+ "Validation Accuracy: {:.4f} | Validation loss: {:.4f}".format(
val_acc, val_loss.item()
)
)
###############################################################################
# .. _link-prediction:
#
# The second task, link prediction
# --------------------------------
# So far, you have seen how to use DGL to implement entity classification with an
# R-GCN model. In the knowledge base setting, the representations generated by
# R-GCN can be used to uncover potential relationships between nodes. In the
# R-GCN paper, the authors feed the entity representations generated by R-GCN
# into the `DistMult <https://arxiv.org/pdf/1412.6575.pdf>`_ prediction model
# to predict possible relationships.
#
# The implementation is similar to that presented here, but with an extra DistMult layer
# stacked on top of the R-GCN layers. You can find the complete
# implementation of link prediction with R-GCN in our `Github Python code
# example <https://github.com/dmlc/dgl/blob/master/examples/pytorch/rgcn/link.py>`_.
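#
# As a rough sketch (not the code in that example), the DistMult score of a
# triplet :math:`(s, r, o)` is a three-way inner product between the subject
# embedding, a learned per-relation vector, and the object embedding:
#
# ::
#
#    import torch
#
#    def distmult_score(h_s, w_r, h_o):
#        # h_s, h_o: (batch, dim) entity embeddings produced by R-GCN
#        # w_r: (batch, dim) relation vectors (the diagonal of DistMult's R_r)
#        return (h_s * w_r * h_o).sum(dim=-1)
#
# Higher scores mean the triplet is more likely to hold; training contrasts
# observed triplets against negative samples.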
......@@ -14,612 +14,640 @@ Line Graph Neural Network
efficiency. For recommended implementation, please refer to the `official
examples <https://github.com/dmlc/dgl/tree/master/examples>`_.
"""
###########################################################################################
#
# In this tutorial, you learn how to solve community detection tasks by implementing a line
# graph neural network (LGNN). Community detection, or graph clustering, consists of partitioning
# the vertices in a graph into clusters in which nodes are more similar to
# one another.
#
# In the :doc:`Graph convolutional network tutorial <1_gcn>`, you learned how to classify the nodes of an input
# graph in a semi-supervised setting. You used a graph convolutional neural network (GCN)
# as an embedding mechanism for graph features.
#
# To generalize a graph neural network (GNN) into supervised community detection, a line-graph based
# variation of GNN is introduced in the research paper
# `Supervised Community Detection with Line Graph Neural Networks <https://arxiv.org/abs/1705.08415>`__.
# One of the highlights of the model is
# to augment the straightforward GNN architecture so that it operates on
# a line graph of edge adjacencies, defined with a non-backtracking operator.
#
# A line graph neural network (LGNN) shows how DGL can implement an advanced graph algorithm by
# mixing basic tensor operations, sparse-matrix multiplication, and message-
# passing APIs.
#
# In the following sections, you learn about community detection, line
# graphs, LGNN, and its implementation.
#
# Supervised community detection task with the Cora dataset
# ------------------------------------------------------------
# Community detection
# ~~~~~~~~~~~~~~~~~~~~
# In a community detection task, you cluster similar nodes instead of
# labeling them. Node similarity is typically described as a higher connection
# density within each cluster than across clusters.
#
# What's the difference between community detection and node classification?
# Compared to node classification, community detection focuses on retrieving
# cluster information from the graph, rather than assigning a specific label to
# a node. For example, as long as a node is clustered with its community
# members, it doesn't matter whether the node is assigned "community A"
# or "community B", whereas assigning all "great movies" the label "bad movies"
# would be a disaster in a movie network classification task.
#
# What's the difference, then, between a community detection algorithm and
# another clustering algorithm such as k-means? A community detection algorithm operates on
# graph-structured data. Compared to k-means, community detection leverages the
# graph structure, instead of simply clustering nodes based on their
# features.
#
# Cora dataset
# ~~~~~~~~~~~~
# To be consistent with the GCN tutorial,
# you use the `Cora dataset <https://linqs.soe.ucsc.edu/data>`__
# to illustrate a simple community detection task. Cora is a scientific publication dataset,
# with 2708 papers belonging to seven
# different machine learning fields. Here, you formulate Cora as a
# directed graph, with each node being a paper, and each edge being a
# citation link (A->B means A cites B). Here is a visualization of the whole
# Cora dataset.
#
# .. figure:: https://i.imgur.com/X404Byc.png
# :alt: cora
# :height: 400px
# :width: 500px
# :align: center
#
# Cora naturally contains seven classes, and the statistics below show that each
# class does satisfy our assumption of a community, i.e. nodes of the same
# class have a higher probability of being connected to each other than to nodes of a different class.
# The following code snippet verifies that there are more intra-class edges
# than inter-class.
import os
os.environ['DGLBACKEND'] = 'pytorch'
import torch
import torch as th
import torch.nn as nn
import torch.nn.functional as F
import dgl
from dgl.data import citation_graph as citegrh
data = citegrh.load_cora()
G = data[0]
labels = th.tensor(G.ndata['label'])
# find all the nodes labeled with class 0
label0_nodes = th.nonzero(labels == 0, as_tuple=False).squeeze()
# find all the edges pointing to class 0 nodes
src, _ = G.in_edges(label0_nodes)
src_labels = labels[src]
# find all the edges whose both endpoints are in class 0
intra_src = th.nonzero(src_labels == 0, as_tuple=False)
print('Intra-class edges percent: %.4f' % (len(intra_src) / len(src_labels)))
###########################################################################################
# Binary community subgraph from Cora with a test dataset
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Without loss of generality, in this tutorial you limit the scope of the
# task to binary community detection.
#
# .. note::
#
# To create a practice binary-community dataset from Cora, first extract
# all two-class pairs from the original Cora seven classes. For each pair, you
# treat each class as one community, and find the largest subgraph that
# at least contains one cross-community edge as the training example. As
# a result, there are a total of 21 training samples in this small dataset.
#
# With the following code, you can visualize one of the training samples and its community structure.
import networkx as nx
import matplotlib.pyplot as plt
train_set = dgl.data.CoraBinary()
G1, pmpd1, label1 = train_set[1]
nx_G1 = G1.to_networkx()
def visualize(labels, g):
pos = nx.spring_layout(g, seed=1)
plt.figure(figsize=(8, 8))
plt.axis('off')
nx.draw_networkx(g, pos=pos, node_size=50, cmap=plt.get_cmap('coolwarm'),
node_color=labels, edge_color='k',
arrows=False, width=0.5, style='dotted', with_labels=False)
visualize(label1, nx_G1)
###########################################################################################
# To learn more, go to the original research paper to see how to generalize
# to the multiple-community case.
#
# Community detection in a supervised setting
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# The community detection problem could be tackled with both supervised and
# unsupervised approaches. You can formulate
# community detection in a supervised setting as follows:
#
# - Each training example consists of :math:`(G, L)`, where :math:`G` is a
# directed graph :math:`(V, E)`. For each node :math:`v` in :math:`V`, we
# assign a ground truth community label :math:`z_v \in \{0,1\}`.
# - The parameterized model :math:`f(G, \theta)` predicts a label set
# :math:`\tilde{Z} = f(G)` for nodes :math:`V`.
# - For each example :math:`(G,L)`, the model learns to minimize a specially
#   designed loss function (the equivariant loss) :math:`L_{equivariant}(\tilde{Z}, Z)`.
#
# .. note::
#
# In this supervised setting, the model naturally predicts a label for
# each community. However, community assignment should be equivariant to
# label permutations. To achieve this, in each forward process, we take
# the minimum among losses calculated from all possible permutations of
# labels.
#
# Mathematically, this means
# :math:`L_{equivariant} = \underset{\pi \in S_c} {min}-\log(\hat{\pi}, \pi)`,
# where :math:`S_c` is the set of all permutations of labels, and
# :math:`\hat{\pi}` is the set of predicted labels,
# :math:`- \log(\hat{\pi},\pi)` denotes negative log likelihood.
#
#    For instance, for a sample graph with nodes :math:`\{1,2,3,4\}` and
#    community assignment :math:`\{A, A, A, B\}`, with each node's label
#    :math:`l \in \{0,1\}`, the set of all possible label permutations is
#    :math:`S_c = \{\{0,0,0,1\}, \{1,1,1,0\}\}`.
#
# Line graph neural network key ideas
# ------------------------------------
# A key innovation in this topic is the use of a line graph.
# Unlike models in previous tutorials, message passing happens not only on the
# original graph, e.g. the binary community subgraph from Cora, but also on the
# line graph associated with the original graph.
#
# What is a line-graph?
# ~~~~~~~~~~~~~~~~~~~~~
# In graph theory, a line graph is a graph representation that encodes the
# edge adjacency structure in the original graph.
#
# Specifically, a line-graph :math:`L(G)` turns an edge of the original graph `G`
# into a node. This is illustrated with the graph below (taken from the
# research paper).
#
# .. figure:: https://i.imgur.com/4WO5jEm.png
# :alt: lg
# :align: center
#
# Here, :math:`e_{A}:= (i\rightarrow j)` and :math:`e_{B}:= (j\rightarrow k)`
# are two edges in the original graph :math:`G`. In line graph :math:`G_L`,
# they correspond to nodes :math:`v^{l}_{A}, v^{l}_{B}`.
#
# The next natural question is, how to connect nodes in line-graph? How to
# connect two edges? Here, we use the following connection rule:
#
# Two nodes :math:`v^{l}_{A}`, :math:`v^{l}_{B}` in `lg` are connected if
# the corresponding two edges :math:`e_{A}, e_{B}` in `g` share one and only
# one node:
# :math:`e_{A}`'s destination node is :math:`e_{B}`'s source node
# (:math:`j`).
#
# .. note::
#
# Mathematically, this definition corresponds to a notion called non-backtracking
# operator:
# :math:`B_{(i \rightarrow j), (\hat{i} \rightarrow \hat{j})}`
# :math:`= \begin{cases}
# 1 \text{ if } j = \hat{i}, \hat{j} \neq i\\
# 0 \text{ otherwise} \end{cases}`
# where an edge is formed if :math:`B_{node1, node2} = 1`.
#
#
# One layer in LGNN, algorithm structure
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# LGNN chains together a series of line graph neural network layers. The graph
# representation :math:`x` and its line graph companion :math:`y` evolve with
# the dataflow as follows.
#
# .. figure:: https://i.imgur.com/bZGGIGp.png
# :alt: alg
# :align: center
#
# At the :math:`k`-th layer, the :math:`i`-th neuron of the :math:`l`-th
# channel updates its embedding :math:`x^{(k+1)}_{i,l}` with:
#
# .. math::
# \begin{split}
# x^{(k+1)}_{i,l} ={}&\rho[x^{(k)}_{i}\theta^{(k)}_{1,l}
# +(Dx^{(k)})_{i}\theta^{(k)}_{2,l} \\
# &+\sum^{J-1}_{j=0}(A^{2^{j}}x^{k})_{i}\theta^{(k)}_{3+j,l}\\
# &+[\{\text{Pm},\text{Pd}\}y^{(k)}]_{i}\theta^{(k)}_{3+J,l}] \\
# &+\text{skip-connection}
# \qquad i \in V, l = 1,2,3, ... b_{k+1}/2
# \end{split}
#
# Then, the line-graph representation :math:`y^{(k+1)}_{i,l}` with,
#
# .. math::
#
# \begin{split}
# y^{(k+1)}_{i',l^{'}} = {}&\rho[y^{(k)}_{i^{'}}\gamma^{(k)}_{1,l^{'}}+
# (D_{L(G)}y^{(k)})_{i^{'}}\gamma^{(k)}_{2,l^{'}}\\
# &+\sum^{J-1}_{j=0}(A_{L(G)}^{2^{j}}y^{k})_{i}\gamma^{(k)}_{3+j,l^{'}}\\
# &+[\{\text{Pm},\text{Pd}\}^{T}x^{(k+1)}]_{i^{'}}\gamma^{(k)}_{3+J,l^{'}}]\\
# &+\text{skip-connection}
# \qquad i^{'} \in V_{l}, l^{'} = 1,2,3, ... b^{'}_{k+1}/2
# \end{split}
#
# Where :math:`\text{skip-connection}` refers to performing the same operation without the non-linearity
# :math:`\rho`, and with linear projection :math:`\theta_{\frac{b_{k+1}}{2} + 1, ..., b_{k+1}-1, b_{k+1}}`
# and :math:`\gamma_{\frac{b_{k+1}}{2} + 1, ..., b_{k+1}-1, b_{k+1}}`.
#
# Implement LGNN in DGL
# ---------------------
# Even though the equations in the previous section might seem intimidating,
# it helps to understand the following information before you implement the LGNN.
#
# The two equations are symmetric and can be implemented as two instances
# of the same class with different parameters.
# The first equation operates on graph representation :math:`x`,
# whereas the second operates on line-graph
# representation :math:`y`. Let us denote this abstraction as :math:`f`. Then
# the first is :math:`f(x,y; \theta_x)`, and the second
# is :math:`f(y,x, \theta_y)`. That is, they are parameterized to compute
# representations of the original graph and its
# companion line graph, respectively.
#
# Each equation consists of four terms. Take the first one as an example, which follows.
#
# - :math:`x^{(k)}\theta^{(k)}_{1,l}`, a linear projection of previous
# layer's output :math:`x^{(k)}`, denote as :math:`\text{prev}(x)`.
# - :math:`(Dx^{(k)})\theta^{(k)}_{2,l}`, a linear projection of degree
# operator on :math:`x^{(k)}`, denote as :math:`\text{deg}(x)`.
# - :math:`\sum^{J-1}_{j=0}(A^{2^{j}}x^{(k)})\theta^{(k)}_{3+j,l}`,
# a summation of :math:`2^{j}` adjacency operator on :math:`x^{(k)}`,
# denote as :math:`\text{radius}(x)`
# - :math:`[\{Pm,Pd\}y^{(k)}]\theta^{(k)}_{3+J,l}`, fusing another
# graph's embedding information using incidence matrix
# :math:`\{Pm, Pd\}`, followed with a linear projection,
# denote as :math:`\text{fuse}(y)`.
#
# Each of the terms are performed again with different
# parameters, and without the nonlinearity after the sum.
# Therefore, :math:`f` could be written as:
#
# .. math::
#    \begin{split}
#    f(x^{(k)},y^{(k)}) = {}\rho[&\text{prev}(x^{(k)}) + \text{deg}(x^{(k)}) +\text{radius}(x^{(k)})
#    +\text{fuse}(y^{(k)})]\\
#    +&\text{prev}(x^{(k)}) + \text{deg}(x^{(k)}) +\text{radius}(x^{(k)}) +\text{fuse}(y^{(k)})
#    \end{split}
#
# Two equations are chained-up in the following order:
#
# .. math::
# \begin{split}
# x^{(k+1)} = {}& f(x^{(k)}, y^{(k)})\\
# y^{(k+1)} = {}& f(y^{(k)}, x^{(k+1)})
# \end{split}
#
# Keep in mind the listed observations in this overview and proceed to implementation.
# An important point is that you use different strategies for the noted terms.
#
# .. note::
#    You can understand :math:`\{Pm, Pd\}` more thoroughly with this explanation.
#    Roughly speaking, the way :math:`g` and :math:`lg` (the line graph)
#    work together is related to loopy belief propagation.
#    Here, you implement :math:`\{Pm, Pd\}` as a SciPy COO sparse matrix in the dataset,
#    and stack them as tensors when batching. Another batching solution is to
#    treat :math:`\{Pm, Pd\}` as the adjacency matrix of a bipartite graph, which maps
#    the line graph's features to the graph's, and vice versa.
#
# Implementing :math:`\text{prev}` and :math:`\text{deg}` as tensor operation
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Linear projection and degree operation are both simply matrix
# multiplications. Write them as PyTorch tensor operations.
#
# In ``__init__``, you define the projection variables.
#
# ::
#
# self.linear_prev = nn.Linear(in_feats, out_feats)
# self.linear_deg = nn.Linear(in_feats, out_feats)
#
#
# In ``forward()``, :math:`\text{prev}` and :math:`\text{deg}` are the same
# as any other PyTorch tensor operations.
#
# ::
#
# prev_proj = self.linear_prev(feat_a)
# deg_proj = self.linear_deg(deg * feat_a)
#
# Implementing :math:`\text{radius}` as message passing in DGL
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# As discussed in the GCN tutorial, you can formulate one adjacency operator as
# one step of message passing. As a generalization, :math:`2^j` adjacency
# operations can be formulated as performing :math:`2^j` steps of message
# passing. Therefore, the summation is equivalent to summing nodes'
# representations after :math:`2^j, j=0, 1, 2..` steps of message passing, i.e.
# gathering information from the :math:`2^{j}`-hop neighborhood of each node.
#
# In ``__init__``, define the projection variables used in each
# :math:`2^j` steps of message passing.
#
# ::
#
# self.linear_radius = nn.ModuleList(
# [nn.Linear(in_feats, out_feats) for i in range(radius)])
#
# In ``forward()``, use the following function ``aggregate_radius()`` to
# gather data from multiple hops, as shown in the following code.
# Note that ``update_all`` is called multiple times, and the function
# returns a list containing the features gathered from each radius.
import dgl.function as fn
def aggregate_radius(radius, g, z):
# initializing list to collect message passing result
z_list = []
g.ndata['z'] = z
# pulling message from 1-hop neighbourhood
g.update_all(fn.copy_u(u='z', out='m'), fn.sum(msg='m', out='z'))
z_list.append(g.ndata['z'])
for i in range(radius - 1):
for j in range(2 ** i):
#pulling message from 2^j neighborhood
g.update_all(fn.copy_u(u='z', out='m'), fn.sum(msg='m', out='z'))
z_list.append(g.ndata['z'])
return z_list
#########################################################################
# Implementing :math:`\text{fuse}` as sparse matrix multiplication
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# :math:`\{Pm, Pd\}` is a sparse matrix with only two non-zero entries on
# each column. Therefore, you construct it as a sparse matrix in the dataset,
# and implement :math:`\text{fuse}` as a sparse matrix multiplication.
#
# In ``forward()``:
#
# ::
#
# fuse = self.linear_fuse(th.mm(pm_pd, feat_b))
#
# Completing :math:`f(x, y)`
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# Finally, the following shows how to sum all the terms together, then apply the
# skip connection and batch norm.
#
# ::
#
# result = prev_proj + deg_proj + radius_proj + fuse
#
# Apply the skip connection to the result.
#
# ::
#
# result = th.cat([result[:, :n], F.relu(result[:, n:])], 1)
#
# Then pass the result to batch norm.
#
# ::
#
# result = self.bn(result) #Batch Normalization.
#
#
# Here is the complete code for one LGNN layer's abstraction :math:`f(x,y)`
class LGNNCore(nn.Module):
def __init__(self, in_feats, out_feats, radius):
super(LGNNCore, self).__init__()
self.out_feats = out_feats
self.radius = radius
self.linear_prev = nn.Linear(in_feats, out_feats)
self.linear_deg = nn.Linear(in_feats, out_feats)
self.linear_radius = nn.ModuleList(
[nn.Linear(in_feats, out_feats) for i in range(radius)])
self.linear_fuse = nn.Linear(in_feats, out_feats)
self.bn = nn.BatchNorm1d(out_feats)
def forward(self, g, feat_a, feat_b, deg, pm_pd):
# term "prev"
prev_proj = self.linear_prev(feat_a)
# term "deg"
deg_proj = self.linear_deg(deg * feat_a)
# term "radius"
# aggregate 2^j-hop features
hop2j_list = aggregate_radius(self.radius, g, feat_a)
# apply linear transformation
hop2j_list = [linear(x) for linear, x in zip(self.linear_radius, hop2j_list)]
radius_proj = sum(hop2j_list)
# term "fuse"
fuse = self.linear_fuse(th.mm(pm_pd, feat_b))
# sum them together
result = prev_proj + deg_proj + radius_proj + fuse
# skip connection and batch norm
n = self.out_feats // 2
result = th.cat([result[:, :n], F.relu(result[:, n:])], 1)
result = self.bn(result)
return result
##############################################################################################################
# Chain-up LGNN abstractions as an LGNN layer
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# To implement:
#
# .. math::
# \begin{split}
# x^{(k+1)} = {}& f(x^{(k)}, y^{(k)})\\
# y^{(k+1)} = {}& f(y^{(k)}, x^{(k+1)})
# \end{split}
#
# Chain-up two ``LGNNCore`` instances, as in the example code, with different parameters in the forward pass.
class LGNNLayer(nn.Module):
def __init__(self, in_feats, out_feats, radius):
super(LGNNLayer, self).__init__()
self.g_layer = LGNNCore(in_feats, out_feats, radius)
self.lg_layer = LGNNCore(in_feats, out_feats, radius)
def forward(self, g, lg, x, lg_x, deg_g, deg_lg, pm_pd):
next_x = self.g_layer(g, x, lg_x, deg_g, pm_pd)
pm_pd_y = th.transpose(pm_pd, 0, 1)
next_lg_x = self.lg_layer(lg, lg_x, x, deg_lg, pm_pd_y)
return next_x, next_lg_x
########################################################################################
# Chain-up LGNN layers
# ~~~~~~~~~~~~~~~~~~~~
# Define an LGNN with three hidden layers, as in the following example.
class LGNN(nn.Module):
def __init__(self, radius):
super(LGNN, self).__init__()
self.layer1 = LGNNLayer(1, 16, radius) # input is scalar feature
self.layer2 = LGNNLayer(16, 16, radius) # hidden size is 16
self.layer3 = LGNNLayer(16, 16, radius)
        self.linear = nn.Linear(16, 2)  # predict two classes
def forward(self, g, lg, pm_pd):
# compute the degrees
deg_g = g.in_degrees().float().unsqueeze(1)
deg_lg = lg.in_degrees().float().unsqueeze(1)
# use degree as the input feature
x, lg_x = deg_g, deg_lg
x, lg_x = self.layer1(g, lg, x, lg_x, deg_g, deg_lg, pm_pd)
x, lg_x = self.layer2(g, lg, x, lg_x, deg_g, deg_lg, pm_pd)
x, lg_x = self.layer3(g, lg, x, lg_x, deg_g, deg_lg, pm_pd)
return self.linear(x)
#########################################################################################
# Training and inference
# -----------------------
# First load the data.
from torch.utils.data import DataLoader
training_loader = DataLoader(train_set,
batch_size=1,
collate_fn=train_set.collate_fn,
drop_last=True)
#######################################################################################
# Next, define the main training loop. Note that each training sample contains
# three objects: A :class:`~dgl.DGLGraph`, a SciPy sparse matrix ``pmpd``, and a label
# array in ``numpy.ndarray``. Generate the line graph by using this command:
#
# ::
#
# lg = g.line_graph(backtracking=False)
#
# Note that ``backtracking=False`` is required to correctly simulate the non-backtracking
# operator. We also define a utility function to convert the SciPy sparse matrix to a
# torch sparse tensor.
# Create the model
model = LGNN(radius=3)
# define the optimizer
optimizer = th.optim.Adam(model.parameters(), lr=1e-2)
# A utility function to convert a scipy.sparse.coo_matrix to a torch sparse float tensor
def sparse2th(mat):
value = mat.data
indices = th.LongTensor([mat.row, mat.col])
tensor = th.sparse.FloatTensor(indices, th.from_numpy(value).float(), mat.shape)
return tensor
# Train for 20 epochs
for i in range(20):
all_loss = []
all_acc = []
for [g, pmpd, label] in training_loader:
# Generate the line graph.
lg = g.line_graph(backtracking=False)
# Create torch tensors
pmpd = sparse2th(pmpd)
label = th.from_numpy(label)
# Forward
z = model(g, lg, pmpd)
# Calculate loss:
# Since there are only two communities, there are only two permutations
# of the community labels.
loss_perm1 = F.cross_entropy(z, label)
loss_perm2 = F.cross_entropy(z, 1 - label)
loss = th.min(loss_perm1, loss_perm2)
# Calculate accuracy:
_, pred = th.max(z, 1)
acc_perm1 = (pred == label).float().mean()
acc_perm2 = (pred == 1 - label).float().mean()
acc = th.max(acc_perm1, acc_perm2)
all_loss.append(loss.item())
all_acc.append(acc.item())
optimizer.zero_grad()
loss.backward()
optimizer.step()
niters = len(all_loss)
print("Epoch %d | loss %.4f | accuracy %.4f" % (i,
sum(all_loss) / niters, sum(all_acc) / niters))
#######################################################################################
# Visualize training progress
# -----------------------------
# You can visualize the network's community prediction on one training example,
# together with the ground truth. Start this with the following code example.
pmpd1 = sparse2th(pmpd1)
LG1 = G1.line_graph(backtracking=False)
z = model(G1, LG1, pmpd1)
_, pred = th.max(z, 1)
visualize(pred, nx_G1)
#######################################################################################
# Compare this with the ground truth below. Note that the colors might be swapped
# between the two communities, because the model only needs to predict the
# partitioning correctly, not the specific community labels.
visualize(label1, nx_G1)
#########################################
# Here is an animation to better understand the process. (40 epochs)
#
# .. figure:: https://i.imgur.com/KDUyE1S.gif
# :alt: lgnn-anim
#
# Batching graphs for parallelism
# --------------------------------
#
# LGNN takes a collection of different graphs.
# You might consider whether batching can be used for parallelism.
#
# Batching has been built into the data loader itself.
# In the ``collate_fn`` for the PyTorch data loader, graphs are batched using DGL's
# ``dgl.batch`` API. DGL batches graphs by merging them
# into a large graph, with each smaller graph's adjacency matrix being a block
# along the diagonal of the large graph's adjacency matrix. Correspondingly, the
# :math:`\{Pm, Pd\}` matrices are concatenated as a block diagonal matrix to match the
# batched graph.
import numpy as np
import scipy.sparse as sp

def collate_fn(batch):
graphs, pmpds, labels = zip(*batch)
batched_graphs = dgl.batch(graphs)
batched_pmpds = sp.block_diag(pmpds)
batched_labels = np.concatenate(labels, axis=0)
return batched_graphs, batched_pmpds, batched_labels
######################################################################################
# You can find the complete code on Github at
# `Community Detection with Graph Neural Networks (CDGNN) <https://github.com/dmlc/dgl/tree/master/examples/pytorch/line_graph>`_.
"""
###########################################################################################
#
# In this tutorial, you learn how to solve community detection tasks by implementing a line
# graph neural network (LGNN). Community detection, or graph clustering, consists of partitioning
# the vertices in a graph into clusters in which nodes are more similar to
# one another.
#
# In the :doc:`Graph convolutional network tutorial <1_gcn>`, you learned how to classify the nodes of an input
# graph in a semi-supervised setting. You used a graph convolutional neural network (GCN)
# as an embedding mechanism for graph features.
#
# To generalize a graph neural network (GNN) into supervised community detection, a line-graph based
# variation of GNN is introduced in the research paper
# `Supervised Community Detection with Line Graph Neural Networks <https://arxiv.org/abs/1705.08415>`__.
# One of the highlights of the model is
# to augment the straightforward GNN architecture so that it operates on
# a line graph of edge adjacencies, defined with a non-backtracking operator.
#
# A line graph neural network (LGNN) shows how DGL can implement an advanced graph algorithm by
# mixing basic tensor operations, sparse-matrix multiplication, and message-
# passing APIs.
#
# In the following sections, you learn about community detection, line
# graphs, LGNN, and its implementation.
#
# Supervised community detection task with the Cora dataset
# ------------------------------------------------------------
# Community detection
# ~~~~~~~~~~~~~~~~~~~~
# In a community detection task, you cluster similar nodes instead of
# labeling them. Node similarity is typically described as a higher connection
# density within each cluster than across clusters.
#
# What's the difference between community detection and node classification?
# Compared to node classification, community detection focuses on retrieving
# cluster information from the graph, rather than assigning a specific label to
# a node. For example, as long as a node is clustered with its community
# members, it doesn't matter whether the node is assigned "community A"
# or "community B", whereas assigning all "great movies" the label "bad movies"
# would be a disaster in a movie network classification task.
#
# What's the difference, then, between a community detection algorithm and
# another clustering algorithm such as k-means? A community detection algorithm operates on
# graph-structured data. Compared to k-means, community detection leverages the
# graph structure, instead of simply clustering nodes based on their
# features.
#
# Cora dataset
# ~~~~~~~~~~~~
# To be consistent with the GCN tutorial,
# you use the `Cora dataset <https://linqs.soe.ucsc.edu/data>`__
# to illustrate a simple community detection task. Cora is a scientific publication dataset,
# with 2708 papers belonging to seven
# different machine learning fields. Here, you formulate Cora as a
# directed graph, with each node being a paper, and each edge being a
# citation link (A->B means A cites B). Here is a visualization of the whole
# Cora dataset.
#
# .. figure:: https://i.imgur.com/X404Byc.png
# :alt: cora
# :height: 400px
# :width: 500px
# :align: center
#
# Cora naturally contains seven classes, and the statistics below show that each
# class does satisfy our assumption of a community, i.e. nodes of the same
# class have a higher probability of being connected to each other than to nodes of a different class.
# The following code snippet verifies that there are more intra-class edges
# than inter-class.
import os
os.environ["DGLBACKEND"] = "pytorch"
import dgl
import torch
import torch as th
import torch.nn as nn
import torch.nn.functional as F
from dgl.data import citation_graph as citegrh
data = citegrh.load_cora()
G = data[0]
labels = th.tensor(G.ndata["label"])
# find all the nodes labeled with class 0
label0_nodes = th.nonzero(labels == 0, as_tuple=False).squeeze()
# find all the edges pointing to class 0 nodes
src, _ = G.in_edges(label0_nodes)
src_labels = labels[src]
# find all the edges whose both endpoints are in class 0
intra_src = th.nonzero(src_labels == 0, as_tuple=False)
print("Intra-class edges percent: %.4f" % (len(intra_src) / len(src_labels)))
import matplotlib.pyplot as plt
###########################################################################################
# Binary community subgraph from Cora with a test dataset
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Without loss of generality, in this tutorial you limit the scope of the
# task to binary community detection.
#
# .. note::
#
# To create a practice binary-community dataset from Cora, first extract
# all two-class pairs from the original Cora seven classes. For each pair, you
# treat each class as one community, and find the largest subgraph that
# at least contains one cross-community edge as the training example. As
# a result, there are a total of 21 training samples in this small dataset.
#
# With the following code, you can visualize one of the training samples and its community structure.
import networkx as nx
train_set = dgl.data.CoraBinary()
G1, pmpd1, label1 = train_set[1]
nx_G1 = G1.to_networkx()
def visualize(labels, g):
pos = nx.spring_layout(g, seed=1)
plt.figure(figsize=(8, 8))
plt.axis("off")
nx.draw_networkx(
g,
pos=pos,
node_size=50,
cmap=plt.get_cmap("coolwarm"),
node_color=labels,
edge_color="k",
arrows=False,
width=0.5,
style="dotted",
with_labels=False,
)
visualize(label1, nx_G1)
###########################################################################################
# To learn more, go to the original research paper to see how to generalize
# to the multiple-community case.
#
# Community detection in a supervised setting
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# The community detection problem could be tackled with both supervised and
# unsupervised approaches. You can formulate
# community detection in a supervised setting as follows:
#
# - Each training example consists of :math:`(G, L)`, where :math:`G` is a
# directed graph :math:`(V, E)`. For each node :math:`v` in :math:`V`, we
# assign a ground truth community label :math:`z_v \in \{0,1\}`.
# - The parameterized model :math:`f(G, \theta)` predicts a label set
# :math:`\tilde{Z} = f(G)` for nodes :math:`V`.
# - For each example :math:`(G,L)`, the model learns to minimize a specially
#   designed loss function (the equivariant loss) :math:`L_{equivariant}(\tilde{Z}, Z)`.
#
# .. note::
#
# In this supervised setting, the model naturally predicts a label for
# each community. However, community assignment should be equivariant to
# label permutations. To achieve this, in each forward process, we take
# the minimum among losses calculated from all possible permutations of
# labels.
#
# Mathematically, this means
# :math:`L_{equivariant} = \underset{\pi \in S_c} {min}-\log(\hat{\pi}, \pi)`,
# where :math:`S_c` is the set of all permutations of labels, and
# :math:`\hat{\pi}` is the set of predicted labels,
# :math:`- \log(\hat{\pi},\pi)` denotes negative log likelihood.
#
#    For instance, for a sample graph with nodes :math:`\{1,2,3,4\}` and
#    community assignment :math:`\{A, A, A, B\}`, with each node's label
#    :math:`l \in \{0,1\}`, the set of all possible label permutations is
#    :math:`S_c = \{\{0,0,0,1\}, \{1,1,1,0\}\}`.
#
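# As a minimal sketch (for the binary case used later in this tutorial, with a
# hypothetical ``itertools``-based generalization), the permutation-minimum
# loss can be computed by relabeling the ground truth under every permutation
# and keeping the smallest cross-entropy:
#
# ::
#
#    import itertools
#    import torch
#    import torch.nn.functional as F
#
#    def permutation_min_loss(logits, labels, num_communities=2):
#        losses = []
#        for perm in itertools.permutations(range(num_communities)):
#            relabeled = torch.tensor(perm, device=labels.device)[labels]
#            losses.append(F.cross_entropy(logits, relabeled))
#        return torch.stack(losses).min()
#
# For two communities this reduces to ``min(F.cross_entropy(z, label),
# F.cross_entropy(z, 1 - label))``, which is exactly what the training loop
# below does.
#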
# Line graph neural network key ideas
# ------------------------------------
# A key innovation in this topic is the use of a line graph.
# Unlike models in previous tutorials, message passing happens not only on the
# original graph, e.g. the binary community subgraph from Cora, but also on the
# line graph associated with the original graph.
#
# What is a line-graph?
# ~~~~~~~~~~~~~~~~~~~~~
# In graph theory, a line graph is a graph representation that encodes the
# edge adjacency structure in the original graph.
#
# Specifically, a line-graph :math:`L(G)` turns an edge of the original graph `G`
# into a node. This is illustrated with the graph below (taken from the
# research paper).
#
# .. figure:: https://i.imgur.com/4WO5jEm.png
# :alt: lg
# :align: center
#
# Here, :math:`e_{A}:= (i\rightarrow j)` and :math:`e_{B}:= (j\rightarrow k)`
# are two edges in the original graph :math:`G`. In line graph :math:`G_L`,
# they correspond to nodes :math:`v^{l}_{A}, v^{l}_{B}`.
#
# The next natural question is, how to connect nodes in line-graph? How to
# connect two edges? Here, we use the following connection rule:
#
# Two nodes :math:`v^{l}_{A}`, :math:`v^{l}_{B}` in `lg` are connected if
# the corresponding two edges :math:`e_{A}, e_{B}` in `g` share one and only
# one node:
# :math:`e_{A}`'s destination node is :math:`e_{B}`'s source node
# (:math:`j`).
#
# .. note::
#
# Mathematically, this definition corresponds to a notion called non-backtracking
# operator:
# :math:`B_{(i \rightarrow j), (\hat{i} \rightarrow \hat{j})}`
# :math:`= \begin{cases}
# 1 \text{ if } j = \hat{i}, \hat{j} \neq i\\
# 0 \text{ otherwise} \end{cases}`
# where an edge is formed if :math:`B_{node1, node2} = 1`.
#
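# As a quick sanity check (a small sketch that reuses the same ``line_graph``
# call as the training code below), you can build a directed triangle and
# inspect its non-backtracking line graph; every edge of the original graph
# becomes a node:
#
# ::
#
#    import dgl
#    import torch as th
#
#    # directed 3-cycle: 0 -> 1 -> 2 -> 0
#    tri = dgl.graph((th.tensor([0, 1, 2]), th.tensor([1, 2, 0])))
#    tri_lg = tri.line_graph(backtracking=False)
#    print(tri_lg.num_nodes(), tri_lg.num_edges())  # 3 nodes, one per original edge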
#
# One layer in LGNN, algorithm structure
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# LGNN chains together a series of line graph neural network layers. The graph
# representation :math:`x` and its line graph companion :math:`y` evolve with
# the dataflow as follows.
#
# .. figure:: https://i.imgur.com/bZGGIGp.png
# :alt: alg
# :align: center
#
# At the :math:`k`-th layer, the :math:`i`-th neuron of the :math:`l`-th
# channel updates its embedding :math:`x^{(k+1)}_{i,l}` with:
#
# .. math::
# \begin{split}
# x^{(k+1)}_{i,l} ={}&\rho[x^{(k)}_{i}\theta^{(k)}_{1,l}
# +(Dx^{(k)})_{i}\theta^{(k)}_{2,l} \\
# &+\sum^{J-1}_{j=0}(A^{2^{j}}x^{k})_{i}\theta^{(k)}_{3+j,l}\\
# &+[\{\text{Pm},\text{Pd}\}y^{(k)}]_{i}\theta^{(k)}_{3+J,l}] \\
# &+\text{skip-connection}
# \qquad i \in V, l = 1,2,3, ... b_{k+1}/2
# \end{split}
#
# Then, the line-graph representation :math:`y^{(k+1)}_{i,l}` with,
#
# .. math::
#
# \begin{split}
# y^{(k+1)}_{i',l^{'}} = {}&\rho[y^{(k)}_{i^{'}}\gamma^{(k)}_{1,l^{'}}+
# (D_{L(G)}y^{(k)})_{i^{'}}\gamma^{(k)}_{2,l^{'}}\\
# &+\sum^{J-1}_{j=0}(A_{L(G)}^{2^{j}}y^{k})_{i}\gamma^{(k)}_{3+j,l^{'}}\\
# &+[\{\text{Pm},\text{Pd}\}^{T}x^{(k+1)}]_{i^{'}}\gamma^{(k)}_{3+J,l^{'}}]\\
# &+\text{skip-connection}
# \qquad i^{'} \in V_{l}, l^{'} = 1,2,3, ... b^{'}_{k+1}/2
# \end{split}
#
# Where :math:`\text{skip-connection}` refers to performing the same operation without the non-linearity
# :math:`\rho`, and with linear projection :math:`\theta_{\frac{b_{k+1}}{2} + 1, ..., b_{k+1}-1, b_{k+1}}`
# and :math:`\gamma_{\frac{b_{k+1}}{2} + 1, ..., b_{k+1}-1, b_{k+1}}`.
#
# Implement LGNN in DGL
# ---------------------
# Even though the equations in the previous section might seem intimidating,
# it helps to understand the following information before you implement the LGNN.
#
# The two equations are symmetric and can be implemented as two instances
# of the same class with different parameters.
# The first equation operates on graph representation :math:`x`,
# whereas the second operates on line-graph
# representation :math:`y`. Let us denote this abstraction as :math:`f`. Then
# the first is :math:`f(x,y; \theta_x)`, and the second
# is :math:`f(y,x, \theta_y)`. That is, they are parameterized to compute
# representations of the original graph and its
# companion line graph, respectively.
#
# Each equation consists of four terms. Take the first one as an example, which follows.
#
# - :math:`x^{(k)}\theta^{(k)}_{1,l}`, a linear projection of previous
# layer's output :math:`x^{(k)}`, denote as :math:`\text{prev}(x)`.
# - :math:`(Dx^{(k)})\theta^{(k)}_{2,l}`, a linear projection of degree
# operator on :math:`x^{(k)}`, denote as :math:`\text{deg}(x)`.
# - :math:`\sum^{J-1}_{j=0}(A^{2^{j}}x^{(k)})\theta^{(k)}_{3+j,l}`,
# a summation of :math:`2^{j}` adjacency operator on :math:`x^{(k)}`,
# denote as :math:`\text{radius}(x)`
# - :math:`[\{Pm,Pd\}y^{(k)}]\theta^{(k)}_{3+J,l}`, fusing another
# graph's embedding information using incidence matrix
# :math:`\{Pm, Pd\}`, followed with a linear projection,
# denote as :math:`\text{fuse}(y)`.
#
# Each of the terms are performed again with different
# parameters, and without the nonlinearity after the sum.
# Therefore, :math:`f` could be written as:
#
# .. math::
#    \begin{split}
#    f(x^{(k)},y^{(k)}) = {}\rho[&\text{prev}(x^{(k)}) + \text{deg}(x^{(k)}) +\text{radius}(x^{(k)})
#    +\text{fuse}(y^{(k)})]\\
#    +&\text{prev}(x^{(k)}) + \text{deg}(x^{(k)}) +\text{radius}(x^{(k)}) +\text{fuse}(y^{(k)})
#    \end{split}
#
# Two equations are chained-up in the following order:
#
# .. math::
# \begin{split}
# x^{(k+1)} = {}& f(x^{(k)}, y^{(k)})\\
# y^{(k+1)} = {}& f(y^{(k)}, x^{(k+1)})
# \end{split}
#
# Keep in mind the listed observations in this overview and proceed to implementation.
# An important point is that you use different strategies for the noted terms.
#
# .. note::
#    You can understand :math:`\{Pm, Pd\}` more thoroughly with this explanation.
#    Roughly speaking, the way :math:`g` and :math:`lg` (the line graph)
#    work together is related to loopy belief propagation.
#    Here, you implement :math:`\{Pm, Pd\}` as a SciPy COO sparse matrix in the dataset,
#    and stack them as tensors when batching. Another batching solution is to
#    treat :math:`\{Pm, Pd\}` as the adjacency matrix of a bipartite graph, which maps
#    the line graph's features to the graph's, and vice versa.
#
# Implementing :math:`\text{prev}` and :math:`\text{deg}` as tensor operation
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Linear projection and degree operation are both simply matrix
# multiplications. Write them as PyTorch tensor operations.
#
# In ``__init__``, you define the projection variables.
#
# ::
#
# self.linear_prev = nn.Linear(in_feats, out_feats)
# self.linear_deg = nn.Linear(in_feats, out_feats)
#
#
# In ``forward()``, :math:`\text{prev}` and :math:`\text{deg}` are the same
# as any other PyTorch tensor operations.
#
# ::
#
# prev_proj = self.linear_prev(feat_a)
# deg_proj = self.linear_deg(deg * feat_a)
#
# Implementing :math:`\text{radius}` as message passing in DGL
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# As discussed in the GCN tutorial, you can formulate one adjacency operator as
# one step of message passing. As a generalization, :math:`2^j` adjacency
# operations can be formulated as performing :math:`2^j` steps of message
# passing. Therefore, the summation is equivalent to summing nodes'
# representations after :math:`2^j, j=0, 1, 2..` steps of message passing, i.e.
# gathering information from the :math:`2^{j}`-hop neighborhood of each node.
#
# In ``__init__``, define the projection variables used in each
# :math:`2^j` steps of message passing.
#
# ::
#
# self.linear_radius = nn.ModuleList(
# [nn.Linear(in_feats, out_feats) for i in range(radius)])
#
# In ``forward()``, use the following function ``aggregate_radius()`` to
# gather data from multiple hops, as shown in the following code.
# Note that ``update_all`` is called multiple times, and the function
# returns a list containing the features gathered from each radius.
import dgl.function as fn
def aggregate_radius(radius, g, z):
# initializing list to collect message passing result
z_list = []
g.ndata["z"] = z
# pulling message from 1-hop neighbourhood
g.update_all(fn.copy_u(u="z", out="m"), fn.sum(msg="m", out="z"))
z_list.append(g.ndata["z"])
for i in range(radius - 1):
for j in range(2**i):
# pulling message from 2^j neighborhood
g.update_all(fn.copy_u(u="z", out="m"), fn.sum(msg="m", out="z"))
z_list.append(g.ndata["z"])
return z_list
#########################################################################
# Implementing :math:`\text{fuse}` as sparse matrix multiplication
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# :math:`\{Pm, Pd\}` is a sparse matrix with only two non-zero entries on
# each column. Therefore, you construct it as a sparse matrix in the dataset,
# and implement :math:`\text{fuse}` as a sparse matrix multiplication.
#
# In ``forward()``:
#
# ::
#
# fuse = self.linear_fuse(th.mm(pm_pd, feat_b))
#
# Completing :math:`f(x, y)`
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# Finally, the following shows how to sum all the terms together, then apply the
# skip connection and batch norm.
#
# ::
#
# result = prev_proj + deg_proj + radius_proj + fuse
#
# Apply the skip connection to the result.
#
# ::
#
# result = th.cat([result[:, :n], F.relu(result[:, n:])], 1)
#
# Then pass the result to batch norm.
#
# ::
#
# result = self.bn(result) #Batch Normalization.
#
#
# Here is the complete code for one LGNN layer's abstraction :math:`f(x,y)`
class LGNNCore(nn.Module):
def __init__(self, in_feats, out_feats, radius):
super(LGNNCore, self).__init__()
self.out_feats = out_feats
self.radius = radius
self.linear_prev = nn.Linear(in_feats, out_feats)
self.linear_deg = nn.Linear(in_feats, out_feats)
self.linear_radius = nn.ModuleList(
[nn.Linear(in_feats, out_feats) for i in range(radius)]
)
self.linear_fuse = nn.Linear(in_feats, out_feats)
self.bn = nn.BatchNorm1d(out_feats)
def forward(self, g, feat_a, feat_b, deg, pm_pd):
# term "prev"
prev_proj = self.linear_prev(feat_a)
# term "deg"
deg_proj = self.linear_deg(deg * feat_a)
# term "radius"
# aggregate 2^j-hop features
hop2j_list = aggregate_radius(self.radius, g, feat_a)
# apply linear transformation
hop2j_list = [
linear(x) for linear, x in zip(self.linear_radius, hop2j_list)
]
radius_proj = sum(hop2j_list)
# term "fuse"
fuse = self.linear_fuse(th.mm(pm_pd, feat_b))
# sum them together
result = prev_proj + deg_proj + radius_proj + fuse
# skip connection and batch norm
n = self.out_feats // 2
result = th.cat([result[:, :n], F.relu(result[:, n:])], 1)
result = self.bn(result)
return result
##############################################################################################################
# Chain-up LGNN abstractions as an LGNN layer
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# To implement:
#
# .. math::
# \begin{split}
# x^{(k+1)} = {}& f(x^{(k)}, y^{(k)})\\
# y^{(k+1)} = {}& f(y^{(k)}, x^{(k+1)})
# \end{split}
#
# Chain-up two ``LGNNCore`` instances, as in the example code, with different parameters in the forward pass.
class LGNNLayer(nn.Module):
def __init__(self, in_feats, out_feats, radius):
super(LGNNLayer, self).__init__()
self.g_layer = LGNNCore(in_feats, out_feats, radius)
self.lg_layer = LGNNCore(in_feats, out_feats, radius)
def forward(self, g, lg, x, lg_x, deg_g, deg_lg, pm_pd):
next_x = self.g_layer(g, x, lg_x, deg_g, pm_pd)
pm_pd_y = th.transpose(pm_pd, 0, 1)
next_lg_x = self.lg_layer(lg, lg_x, x, deg_lg, pm_pd_y)
return next_x, next_lg_x
########################################################################################
# Chain-up LGNN layers
# ~~~~~~~~~~~~~~~~~~~~
# Define an LGNN with three hidden layers, as in the following example.
class LGNN(nn.Module):
def __init__(self, radius):
super(LGNN, self).__init__()
self.layer1 = LGNNLayer(1, 16, radius) # input is scalar feature
self.layer2 = LGNNLayer(16, 16, radius) # hidden size is 16
self.layer3 = LGNNLayer(16, 16, radius)
        self.linear = nn.Linear(16, 2)  # predict two classes
def forward(self, g, lg, pm_pd):
# compute the degrees
deg_g = g.in_degrees().float().unsqueeze(1)
deg_lg = lg.in_degrees().float().unsqueeze(1)
# use degree as the input feature
x, lg_x = deg_g, deg_lg
x, lg_x = self.layer1(g, lg, x, lg_x, deg_g, deg_lg, pm_pd)
x, lg_x = self.layer2(g, lg, x, lg_x, deg_g, deg_lg, pm_pd)
x, lg_x = self.layer3(g, lg, x, lg_x, deg_g, deg_lg, pm_pd)
return self.linear(x)
#########################################################################################
# Training and inference
# -----------------------
# First load the data.
from torch.utils.data import DataLoader
training_loader = DataLoader(
train_set, batch_size=1, collate_fn=train_set.collate_fn, drop_last=True
)
#######################################################################################
# Next, define the main training loop. Note that each training sample contains
# three objects: A :class:`~dgl.DGLGraph`, a SciPy sparse matrix ``pmpd``, and a label
# array in ``numpy.ndarray``. Generate the line graph by using this command:
#
# ::
#
# lg = g.line_graph(backtracking=False)
#
# Note that ``backtracking=False`` is required to correctly simulate the non-backtracking
# operator. We also define a utility function to convert the SciPy sparse matrix to a
# torch sparse tensor.
# Create the model
model = LGNN(radius=3)
# define the optimizer
optimizer = th.optim.Adam(model.parameters(), lr=1e-2)
# A utility function to convert a scipy.sparse.coo_matrix to a torch sparse float tensor
def sparse2th(mat):
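    # mat is a scipy.sparse.coo_matrix; its row/col/data arrays are reused directly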
value = mat.data
indices = th.LongTensor([mat.row, mat.col])
tensor = th.sparse.FloatTensor(
indices, th.from_numpy(value).float(), mat.shape
)
return tensor
# Train for 20 epochs
for i in range(20):
all_loss = []
all_acc = []
for [g, pmpd, label] in training_loader:
# Generate the line graph.
lg = g.line_graph(backtracking=False)
# Create torch tensors
pmpd = sparse2th(pmpd)
label = th.from_numpy(label)
# Forward
z = model(g, lg, pmpd)
# Calculate loss:
# Since there are only two communities, there are only two permutations
# of the community labels.
loss_perm1 = F.cross_entropy(z, label)
loss_perm2 = F.cross_entropy(z, 1 - label)
loss = th.min(loss_perm1, loss_perm2)
# Calculate accuracy:
_, pred = th.max(z, 1)
acc_perm1 = (pred == label).float().mean()
acc_perm2 = (pred == 1 - label).float().mean()
acc = th.max(acc_perm1, acc_perm2)
all_loss.append(loss.item())
all_acc.append(acc.item())
optimizer.zero_grad()
loss.backward()
optimizer.step()
niters = len(all_loss)
print(
"Epoch %d | loss %.4f | accuracy %.4f"
% (i, sum(all_loss) / niters, sum(all_acc) / niters)
)
#######################################################################################
# Visualize training progress
# -----------------------------
# You can visualize the network's community prediction on one training example,
# together with the ground truth. Start this with the following code example.
pmpd1 = sparse2th(pmpd1)
LG1 = G1.line_graph(backtracking=False)
z = model(G1, LG1, pmpd1)
_, pred = th.max(z, 1)
visualize(pred, nx_G1)
#######################################################################################
# Compare this with the ground truth below. Note that the colors might be swapped
# between the two communities, because the model only needs to predict the
# partitioning correctly, not the specific community labels.
visualize(label1, nx_G1)
#########################################
# Here is an animation to better understand the process. (40 epochs)
#
# .. figure:: https://i.imgur.com/KDUyE1S.gif
# :alt: lgnn-anim
#
# Batching graphs for parallelism
# --------------------------------
#
# LGNN takes a collection of different graphs.
# You might consider whether batching can be used for parallelism.
#
# Batching has been built into the data loader itself.
# In the ``collate_fn`` for the PyTorch data loader, graphs are batched using DGL's
# ``dgl.batch`` API. DGL batches graphs by merging them
# into a large graph, with each smaller graph's adjacency matrix being a block
# along the diagonal of the large graph's adjacency matrix. Correspondingly, the
# :math:`\{Pm, Pd\}` matrices are concatenated as a block diagonal matrix to match the
# batched graph.
import numpy as np
import scipy.sparse as sp


def collate_fn(batch):
graphs, pmpds, labels = zip(*batch)
batched_graphs = dgl.batch(graphs)
batched_pmpds = sp.block_diag(pmpds)
batched_labels = np.concatenate(labels, axis=0)
return batched_graphs, batched_pmpds, batched_labels
######################################################################################
# You can find the complete code on Github at
# `Community Detection with Graph Neural Networks (CDGNN) <https://github.com/dmlc/dgl/tree/master/examples/pytorch/line_graph>`_.
......@@ -105,9 +105,8 @@ structure-free normalization, in the style of attention.
# subpackage. Simply import ``GATConv`` as follows.
import os
os.environ['DGLBACKEND'] = 'pytorch'
from dgl.nn.pytorch import GATConv
os.environ["DGLBACKEND"] = "pytorch"
###############################################################
# Readers can skip the following step-by-step explanation of the implementation and
# jump ahead to `Put everything together`_ for the training and visualization results.
......@@ -125,6 +124,7 @@ from dgl.nn.pytorch import GATConv
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn.pytorch import GATConv
class GATLayer(nn.Module):
......@@ -139,37 +139,38 @@ class GATLayer(nn.Module):
def reset_parameters(self):
"""Reinitialize learnable parameters."""
gain = nn.init.calculate_gain('relu')
gain = nn.init.calculate_gain("relu")
nn.init.xavier_normal_(self.fc.weight, gain=gain)
nn.init.xavier_normal_(self.attn_fc.weight, gain=gain)
def edge_attention(self, edges):
# edge UDF for equation (2)
z2 = torch.cat([edges.src['z'], edges.dst['z']], dim=1)
z2 = torch.cat([edges.src["z"], edges.dst["z"]], dim=1)
a = self.attn_fc(z2)
return {'e': F.leaky_relu(a)}
return {"e": F.leaky_relu(a)}
def message_func(self, edges):
# message UDF for equation (3) & (4)
return {'z': edges.src['z'], 'e': edges.data['e']}
return {"z": edges.src["z"], "e": edges.data["e"]}
def reduce_func(self, nodes):
# reduce UDF for equation (3) & (4)
# equation (3)
alpha = F.softmax(nodes.mailbox['e'], dim=1)
alpha = F.softmax(nodes.mailbox["e"], dim=1)
# equation (4)
h = torch.sum(alpha * nodes.mailbox['z'], dim=1)
return {'h': h}
h = torch.sum(alpha * nodes.mailbox["z"], dim=1)
return {"h": h}
def forward(self, h):
# equation (1)
z = self.fc(h)
self.g.ndata['z'] = z
self.g.ndata["z"] = z
# equation (2)
self.g.apply_edges(self.edge_attention)
# equation (3) & (4)
self.g.update_all(self.message_func, self.reduce_func)
return self.g.ndata.pop('h')
return self.g.ndata.pop("h")
##################################################################
# Equation (1)
......@@ -195,11 +196,13 @@ class GATLayer(nn.Module):
# ``apply_edges`` API. The argument to the ``apply_edges`` is an **Edge UDF**,
# which is defined as below:
def edge_attention(self, edges):
# edge UDF for equation (2)
z2 = torch.cat([edges.src['z'], edges.dst['z']], dim=1)
z2 = torch.cat([edges.src["z"], edges.dst["z"]], dim=1)
a = self.attn_fc(z2)
return {'e' : F.leaky_relu(a)}
return {"e": F.leaky_relu(a)}
########################################################################3
# Here, the dot product with the learnable weight vector :math:`\vec{a^{(l)}}`
......@@ -229,13 +232,15 @@ def edge_attention(self, edges):
# Both tasks first fetch data from the mailbox and then manipulate it on the
# second dimension (``dim=1``), on which the messages are batched.
def reduce_func(self, nodes):
# reduce UDF for equation (3) & (4)
# equation (3)
alpha = F.softmax(nodes.mailbox['e'], dim=1)
alpha = F.softmax(nodes.mailbox["e"], dim=1)
# equation (4)
h = torch.sum(alpha * nodes.mailbox['z'], dim=1)
return {'h' : h}
h = torch.sum(alpha * nodes.mailbox["z"], dim=1)
return {"h": h}
#####################################################################
# Multi-head attention
......@@ -258,8 +263,9 @@ def reduce_func(self, nodes):
# Use the above defined single-head ``GATLayer`` as the building block
# for the ``MultiHeadGATLayer`` below:
class MultiHeadGATLayer(nn.Module):
def __init__(self, g, in_dim, out_dim, num_heads, merge='cat'):
def __init__(self, g, in_dim, out_dim, num_heads, merge="cat"):
super(MultiHeadGATLayer, self).__init__()
self.heads = nn.ModuleList()
for i in range(num_heads):
......@@ -268,19 +274,21 @@ class MultiHeadGATLayer(nn.Module):
def forward(self, h):
head_outs = [attn_head(h) for attn_head in self.heads]
if self.merge == 'cat':
if self.merge == "cat":
# concat on the output feature dimension (dim=1)
return torch.cat(head_outs, dim=1)
else:
# merge using average
            return torch.mean(torch.stack(head_outs), dim=0)
###########################################################################
# Put everything together
# ^^^^^^^^^^^^^^^^^^^^^^^
#
# Now, you can define a two-layer GAT model.
class GAT(nn.Module):
def __init__(self, g, in_dim, hidden_dim, out_dim, num_heads):
super(GAT, self).__init__()
......@@ -296,33 +304,34 @@ class GAT(nn.Module):
h = self.layer2(h)
return h
import networkx as nx
#############################################################################
# We then load the Cora dataset using DGL's built-in data module.
from dgl import DGLGraph
from dgl.data import citation_graph as citegrh
import networkx as nx
def load_cora_data():
data = citegrh.load_cora()
g = data[0]
mask = torch.BoolTensor(g.ndata['train_mask'])
return g, g.ndata['feat'], g.ndata['label'], mask
mask = torch.BoolTensor(g.ndata["train_mask"])
return g, g.ndata["feat"], g.ndata["label"], mask
##############################################################################
# The training loop is exactly the same as in the GCN tutorial.
import time
import numpy as np
g, features, labels, mask = load_cora_data()
# create the model, 2 heads, each head has hidden size 8
net = GAT(g,
in_dim=features.size()[1],
hidden_dim=8,
out_dim=7,
num_heads=2)
net = GAT(g, in_dim=features.size()[1], hidden_dim=8, out_dim=7, num_heads=2)
# create optimizer
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)
......@@ -344,8 +353,11 @@ for epoch in range(30):
if epoch >= 3:
dur.append(time.time() - t0)
print("Epoch {:05d} | Loss {:.4f} | Time(s) {:.4f}".format(
epoch, loss.item(), np.mean(dur)))
print(
"Epoch {:05d} | Loss {:.4f} | Time(s) {:.4f}".format(
epoch, loss.item(), np.mean(dur)
)
)
#########################################################################
# Visualizing and understanding attention learned
......