Unverified commit dce89919 authored by Hongzhi (Steve), Chen and committed by GitHub

[Misc] Auto-reformat multiple python folders. (#5325)



* auto-reformat

* lintrunner

---------
Co-authored-by: Ubuntu <ubuntu@ip-172-31-28-63.ap-northeast-1.compute.internal>
parent ab812179
@@ -15,376 +15,408 @@ Tree-LSTM in DGL
efficiency. For recommended implementation, please refer to the `official
examples <https://github.com/dmlc/dgl/tree/master/examples>`_.
"""
##############################################################################
#
# In this tutorial, you learn to use Tree-LSTM networks for sentiment analysis.
# The Tree-LSTM is a generalization of long short-term memory (LSTM) networks to tree-structured network topologies.
#
# The Tree-LSTM structure was first introduced by Tai et al. in an ACL 2015
# paper: `Improved Semantic Representations From Tree-Structured Long
# Short-Term Memory Networks <https://arxiv.org/pdf/1503.00075.pdf>`__.
# The core idea is to introduce syntactic information for language tasks by
# extending the chain-structured LSTM to a tree-structured LSTM. The dependency
# tree and constituency tree techniques are leveraged to obtain a *latent tree*.
#
# The challenge in training Tree-LSTMs is batching, a standard
# technique in machine learning to accelerate optimization. However, since trees
# generally have different shapes by nature, parallelization is non-trivial.
# DGL offers an alternative: pool all the trees into one single graph, then
# induce message passing over them, guided by the structure of each tree.
#
# The task and the dataset
# ------------------------
#
# The steps here use the
# `Stanford Sentiment Treebank <https://nlp.stanford.edu/sentiment/>`__ in
# ``dgl.data``. The dataset provides a fine-grained, tree-level sentiment
# annotation. There are five classes: very negative, negative, neutral, positive, and
# very positive, which indicate the sentiment of the current subtree. Non-leaf
# nodes in a constituency tree do not contain words, so a special
# ``PAD_WORD`` token is used to denote them. During training and inference,
# their embeddings are masked to all-zero.
#
# .. figure:: https://i.loli.net/2018/11/08/5be3d4bfe031b.png
# :alt:
#
# The figure displays one sample of the SST dataset, which is a
# constituency parse tree with its nodes labeled with sentiment. To
# speed things up, build a tiny set with five sentences and take a look
# at the first one.
#
from collections import namedtuple
import os
os.environ['DGLBACKEND'] = 'pytorch'
import dgl
from dgl.data.tree import SSTDataset
SSTBatch = namedtuple('SSTBatch', ['graph', 'mask', 'wordid', 'label'])
# Each sample in the dataset is a constituency tree. The leaf nodes
# represent words. The word is an int value stored in the "x" field.
# The non-leaf nodes have a special word PAD_WORD. The sentiment
# label is stored in the "y" feature field.
trainset = SSTDataset(mode='tiny') # the "tiny" set has only five trees
tiny_sst = [tr for tr in trainset]
num_vocabs = trainset.vocab_size
num_classes = trainset.num_classes
vocab = trainset.vocab # vocabulary dict: key -> id
inv_vocab = {v: k for k, v in vocab.items()} # inverted vocabulary dict: id -> word
a_tree = tiny_sst[0]
for token in a_tree.ndata['x'].tolist():
if token != trainset.PAD_WORD:
print(inv_vocab[token], end=" ")
##############################################################################
# Step 1: Batching
# ----------------
#
# Add all the trees to one graph, using
# the :func:`~dgl.batched_graph.batch` API.
#
import networkx as nx
import matplotlib.pyplot as plt
graph = dgl.batch(tiny_sst)
def plot_tree(g):
# this plot requires pygraphviz package
pos = nx.nx_agraph.graphviz_layout(g, prog='dot')
nx.draw(g, pos, with_labels=False, node_size=10,
node_color=[[.5, .5, .5]], arrowsize=4)
plt.show()
plot_tree(graph.to_networkx())
#################################################################################
# You can read more about the definition of :func:`~dgl.batch`, or
# skip ahead to the next step:
# .. note::
#
# **Definition**: :func:`~dgl.batch` unions a list of :math:`B`
# :class:`~dgl.DGLGraph`\ s and returns a :class:`~dgl.DGLGraph` of batch
# size :math:`B`.
#
# - The union includes all the nodes,
# edges, and their features. The order of nodes, edges, and features is
# preserved.
#
# - Given that you have :math:`V_i` nodes for graph
# :math:`\mathcal{G}_i`, the node ID :math:`j` in graph
# :math:`\mathcal{G}_i` corresponds to node ID
# :math:`j + \sum_{k=1}^{i-1} V_k` in the batched graph.
#
# - Therefore, performing feature transformation and message passing on
# the batched graph is equivalent to doing those
# on all ``DGLGraph`` constituents in parallel.
#
# - Duplicate references to the same graph are
# treated as deep copies; the nodes, edges, and features are duplicated,
# and mutation on one reference does not affect the other.
# - The batched graph keeps track of the meta
# information of the constituents so it can be
# :func:`~dgl.batched_graph.unbatch`\ ed to a list of ``DGLGraph``\ s.
#
# Step 2: Tree-LSTM cell with message-passing APIs
# ------------------------------------------------
#
# Researchers have proposed two types of Tree-LSTMs: Child-Sum
# Tree-LSTMs, and :math:`N`-ary Tree-LSTMs. In this tutorial you focus
# on applying *Binary* Tree-LSTM to binarized constituency trees. This
# application is also known as *Constituency Tree-LSTM*. Use PyTorch
# as a backend framework to set up the network.
#
# In :math:`N`-ary Tree-LSTM, each unit at node :math:`j` maintains a hidden
# representation :math:`h_j` and a memory cell :math:`c_j`. The unit
# :math:`j` takes the input vector :math:`x_j` and the hidden
# representations of the child units: :math:`h_{jl}, 1\leq l\leq N` as
# input, then updates its hidden representation :math:`h_j` and memory
# cell :math:`c_j` by:
#
# .. math::
#
# i_j & = & \sigma\left(W^{(i)}x_j + \sum_{l=1}^{N}U^{(i)}_l h_{jl} + b^{(i)}\right), & (1)\\
# f_{jk} & = & \sigma\left(W^{(f)}x_j + \sum_{l=1}^{N}U_{kl}^{(f)} h_{jl} + b^{(f)} \right), & (2)\\
# o_j & = & \sigma\left(W^{(o)}x_j + \sum_{l=1}^{N}U_{l}^{(o)} h_{jl} + b^{(o)} \right), & (3) \\
# u_j & = & \textrm{tanh}\left(W^{(u)}x_j + \sum_{l=1}^{N} U_l^{(u)}h_{jl} + b^{(u)} \right), & (4)\\
# c_j & = & i_j \odot u_j + \sum_{l=1}^{N} f_{jl} \odot c_{jl}, &(5) \\
# h_j & = & o_j \cdot \textrm{tanh}(c_j), &(6) \\
#
# The computation can be decomposed into three phases: ``message_func``,
# ``reduce_func``, and ``apply_node_func``.
#
# .. note::
# ``apply_node_func`` is a new node UDF that has not been introduced before. In
# ``apply_node_func``, a user specifies what to do with node features,
# without considering edge features and messages. In the Tree-LSTM case,
# ``apply_node_func`` is a must, since there exist (leaf) nodes with
# :math:`0` incoming edges, which would not be updated by
# ``reduce_func``.
#
import torch as th
import torch.nn as nn
class TreeLSTMCell(nn.Module):
def __init__(self, x_size, h_size):
super(TreeLSTMCell, self).__init__()
self.W_iou = nn.Linear(x_size, 3 * h_size, bias=False)
self.U_iou = nn.Linear(2 * h_size, 3 * h_size, bias=False)
self.b_iou = nn.Parameter(th.zeros(1, 3 * h_size))
self.U_f = nn.Linear(2 * h_size, 2 * h_size)
def message_func(self, edges):
return {'h': edges.src['h'], 'c': edges.src['c']}
def reduce_func(self, nodes):
# concatenate h_jl for equation (1), (2), (3), (4)
h_cat = nodes.mailbox['h'].view(nodes.mailbox['h'].size(0), -1)
# equation (2)
f = th.sigmoid(self.U_f(h_cat)).view(*nodes.mailbox['h'].size())
# second term of equation (5)
c = th.sum(f * nodes.mailbox['c'], 1)
return {'iou': self.U_iou(h_cat), 'c': c}
def apply_node_func(self, nodes):
# equation (1), (3), (4)
iou = nodes.data['iou'] + self.b_iou
i, o, u = th.chunk(iou, 3, 1)
i, o, u = th.sigmoid(i), th.sigmoid(o), th.tanh(u)
# equation (5)
c = i * u + nodes.data['c']
# equation (6)
h = o * th.tanh(c)
return {'h' : h, 'c' : c}
##############################################################################
# Step 3: Define traversal
# ------------------------
#
# After you define the message-passing functions, induce the
# right order in which to trigger them. This is a significant departure from models
# such as GCN, where all nodes pull messages from their upstream neighbors
# *simultaneously*.
#
# In the case of Tree-LSTM, messages start from the leaves of the tree and
# are propagated and processed upwards until they reach the roots. A visualization
# is as follows:
#
# .. figure:: https://i.loli.net/2018/11/09/5be4b5d2df54d.gif
# :alt:
#
# DGL defines a generator to perform the topological sort; each item is a
# tensor recording the nodes from the bottom level to the roots. One can
# appreciate the degree of parallelism by comparing the following two
# outputs:
#
# to heterogeneous graph
trv_a_tree = dgl.graph(a_tree.edges())
print('Traversing one tree:')
print(dgl.topological_nodes_generator(trv_a_tree))
# to heterogeneous graph
trv_graph = dgl.graph(graph.edges())
print('Traversing many trees at the same time:')
print(dgl.topological_nodes_generator(trv_graph))
##############################################################################
# Call :meth:`~dgl.DGLGraph.prop_nodes` to trigger the message passing:
import dgl.function as fn
import torch as th
trv_graph.ndata['a'] = th.ones(graph.number_of_nodes(), 1)
traversal_order = dgl.topological_nodes_generator(trv_graph)
trv_graph.prop_nodes(traversal_order,
message_func=fn.copy_u('a', 'a'),
reduce_func=fn.sum('a', 'a'))
# the following is syntactic sugar that does the same
# dgl.prop_nodes_topo(graph)
##############################################################################
# .. note::
#
# Before you call :meth:`~dgl.DGLGraph.prop_nodes`, specify a
# ``message_func`` and a ``reduce_func``. This example uses the built-in
# copy-from-source function as the message function and the built-in
# sum function as the reduce function.
#
# Putting it together
# -------------------
#
# Here is the complete code that specifies the ``Tree-LSTM`` class.
#
class TreeLSTM(nn.Module):
def __init__(self,
num_vocabs,
x_size,
h_size,
num_classes,
dropout,
pretrained_emb=None):
super(TreeLSTM, self).__init__()
self.x_size = x_size
self.embedding = nn.Embedding(num_vocabs, x_size)
if pretrained_emb is not None:
print('Using glove')
self.embedding.weight.data.copy_(pretrained_emb)
self.embedding.weight.requires_grad = True
self.dropout = nn.Dropout(dropout)
self.linear = nn.Linear(h_size, num_classes)
self.cell = TreeLSTMCell(x_size, h_size)
def forward(self, batch, h, c):
"""Compute tree-lstm prediction given a batch.
Parameters
----------
batch : dgl.data.SSTBatch
The data batch.
h : Tensor
Initial hidden state.
c : Tensor
Initial cell state.
Returns
-------
logits : Tensor
The prediction of each node.
"""
g = batch.graph
# to heterogeneous graph
g = dgl.graph(g.edges())
# feed embedding
embeds = self.embedding(batch.wordid * batch.mask)
g.ndata['iou'] = self.cell.W_iou(self.dropout(embeds)) * batch.mask.float().unsqueeze(-1)
g.ndata['h'] = h
g.ndata['c'] = c
# propagate
dgl.prop_nodes_topo(g,
message_func=self.cell.message_func,
reduce_func=self.cell.reduce_func,
apply_node_func=self.cell.apply_node_func)
# compute logits
h = self.dropout(g.ndata.pop('h'))
logits = self.linear(h)
return logits
##############################################################################
# Main Loop
# ---------
#
# Finally, write the training loop in PyTorch.
#
from torch.utils.data import DataLoader
import torch.nn.functional as F
device = th.device('cpu')
# hyperparameters
x_size = 256
h_size = 256
dropout = 0.5
lr = 0.05
weight_decay = 1e-4
epochs = 10
# create the model
model = TreeLSTM(trainset.vocab_size,
x_size,
h_size,
trainset.num_classes,
dropout)
print(model)
# create the optimizer
optimizer = th.optim.Adagrad(model.parameters(),
lr=lr,
weight_decay=weight_decay)
def batcher(dev):
def batcher_dev(batch):
batch_trees = dgl.batch(batch)
return SSTBatch(graph=batch_trees,
mask=batch_trees.ndata['mask'].to(device),
wordid=batch_trees.ndata['x'].to(device),
label=batch_trees.ndata['y'].to(device))
return batcher_dev
train_loader = DataLoader(dataset=tiny_sst,
batch_size=5,
collate_fn=batcher(device),
shuffle=False,
num_workers=0)
# training loop
for epoch in range(epochs):
for step, batch in enumerate(train_loader):
g = batch.graph
n = g.number_of_nodes()
h = th.zeros((n, h_size))
c = th.zeros((n, h_size))
logits = model(batch, h, c)
logp = F.log_softmax(logits, 1)
loss = F.nll_loss(logp, batch.label, reduction='sum')
optimizer.zero_grad()
loss.backward()
optimizer.step()
pred = th.argmax(logits, 1)
acc = float(th.sum(th.eq(batch.label, pred))) / len(batch.label)
print("Epoch {:05d} | Step {:05d} | Loss {:.4f} | Acc {:.4f} |".format(
epoch, step, loss.item(), acc))
##############################################################################
# To train the model on a full dataset with different settings (such as CPU or GPU),
# refer to the `PyTorch example <https://github.com/dmlc/dgl/tree/master/examples/pytorch/tree_lstm>`__.
# There is also an implementation of the Child-Sum Tree-LSTM.
"""
import os
##############################################################################
#
# In this tutorial, you learn to use Tree-LSTM networks for sentiment analysis.
# The Tree-LSTM is a generalization of long short-term memory (LSTM) networks to tree-structured network topologies.
#
# The Tree-LSTM structure was first introduced by Tai et al. in an ACL 2015
# paper: `Improved Semantic Representations From Tree-Structured Long
# Short-Term Memory Networks <https://arxiv.org/pdf/1503.00075.pdf>`__.
# The core idea is to introduce syntactic information for language tasks by
# extending the chain-structured LSTM to a tree-structured LSTM. The dependency
# tree and constituency tree techniques are leveraged to obtain a *latent tree*.
#
# The challenge in training Tree-LSTMs is batching, a standard
# technique in machine learning to accelerate optimization. However, since trees
# generally have different shapes by nature, parallelization is non-trivial.
# DGL offers an alternative: pool all the trees into one single graph, then
# induce message passing over them, guided by the structure of each tree.
#
# The task and the dataset
# ------------------------
#
# The steps here use the
# `Stanford Sentiment Treebank <https://nlp.stanford.edu/sentiment/>`__ in
# ``dgl.data``. The dataset provides a fine-grained, tree-level sentiment
# annotation. There are five classes: very negative, negative, neutral, positive, and
# very positive, which indicate the sentiment of the current subtree. Non-leaf
# nodes in a constituency tree do not contain words, so a special
# ``PAD_WORD`` token is used to denote them. During training and inference,
# their embeddings are masked to all-zero.
#
# .. figure:: https://i.loli.net/2018/11/08/5be3d4bfe031b.png
# :alt:
#
# The figure displays one sample of the SST dataset, which is a
# constituency parse tree with its nodes labeled with sentiment. To
# speed things up, build a tiny set with five sentences and take a look
# at the first one.
#
from collections import namedtuple
os.environ["DGLBACKEND"] = "pytorch"
import dgl
from dgl.data.tree import SSTDataset
SSTBatch = namedtuple("SSTBatch", ["graph", "mask", "wordid", "label"])
# Each sample in the dataset is a constituency tree. The leaf nodes
# represent words. The word is an int value stored in the "x" field.
# The non-leaf nodes have a special word PAD_WORD. The sentiment
# label is stored in the "y" feature field.
trainset = SSTDataset(mode="tiny") # the "tiny" set has only five trees
tiny_sst = [tr for tr in trainset]
num_vocabs = trainset.vocab_size
num_classes = trainset.num_classes
vocab = trainset.vocab # vocabulary dict: key -> id
inv_vocab = {
v: k for k, v in vocab.items()
} # inverted vocabulary dict: id -> word
a_tree = tiny_sst[0]
for token in a_tree.ndata["x"].tolist():
if token != trainset.PAD_WORD:
print(inv_vocab[token], end=" ")
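# A quick look at the first tree's size and the dataset's dimensions
# (an illustrative check; it only uses values already loaded above).
print("\nNodes in the first tree:", a_tree.number_of_nodes())
print("Vocabulary size:", num_vocabs)
print("Number of classes:", num_classes)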
import matplotlib.pyplot as plt
##############################################################################
# Step 1: Batching
# ----------------
#
# Add all the trees to one graph, using
# the :func:`~dgl.batched_graph.batch` API.
#
import networkx as nx
graph = dgl.batch(tiny_sst)
def plot_tree(g):
# this plot requires pygraphviz package
pos = nx.nx_agraph.graphviz_layout(g, prog="dot")
nx.draw(
g,
pos,
with_labels=False,
node_size=10,
node_color=[[0.5, 0.5, 0.5]],
arrowsize=4,
)
plt.show()
plot_tree(graph.to_networkx())
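# A small sanity check on the batched graph (illustrative): it simply
# contains the nodes and edges of all five trees pooled together.
print("Number of trees batched:", len(tiny_sst))
print("Total nodes in the batched graph:", graph.number_of_nodes())
print("Total edges in the batched graph:", graph.number_of_edges())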
#################################################################################
# You can read more about the definition of :func:`~dgl.batch`, or
# skip ahead to the next step:
# .. note::
#
# **Definition**: :func:`~dgl.batch` unions a list of :math:`B`
# :class:`~dgl.DGLGraph`\ s and returns a :class:`~dgl.DGLGraph` of batch
# size :math:`B`.
#
# - The union includes all the nodes,
# edges, and their features. The order of nodes, edges, and features is
# preserved.
#
# - Given that you have :math:`V_i` nodes for graph
# :math:`\mathcal{G}_i`, the node ID :math:`j` in graph
# :math:`\mathcal{G}_i` corresponds to node ID
# :math:`j + \sum_{k=1}^{i-1} V_k` in the batched graph.
#
# - Therefore, performing feature transformation and message passing on
# the batched graph is equivalent to doing those
# on all ``DGLGraph`` constituents in parallel.
#
# - Duplicate references to the same graph are
# treated as deep copies; the nodes, edges, and features are duplicated,
# and mutation on one reference does not affect the other.
# - The batched graph keeps track of the meta
# information of the constituents so it can be
# :func:`~dgl.batched_graph.unbatch`\ ed to a list of ``DGLGraph``\ s.
#
# Step 2: Tree-LSTM cell with message-passing APIs
# ------------------------------------------------
#
# Researchers have proposed two types of Tree-LSTMs: Child-Sum
# Tree-LSTMs, and :math:`N`-ary Tree-LSTMs. In this tutorial you focus
# on applying *Binary* Tree-LSTM to binarized constituency trees. This
# application is also known as *Constituency Tree-LSTM*. Use PyTorch
# as a backend framework to set up the network.
#
# In :math:`N`-ary Tree-LSTM, each unit at node :math:`j` maintains a hidden
# representation :math:`h_j` and a memory cell :math:`c_j`. The unit
# :math:`j` takes the input vector :math:`x_j` and the hidden
# representations of the child units: :math:`h_{jl}, 1\leq l\leq N` as
# input, then updates its hidden representation :math:`h_j` and memory
# cell :math:`c_j` by:
#
# .. math::
#
# i_j & = & \sigma\left(W^{(i)}x_j + \sum_{l=1}^{N}U^{(i)}_l h_{jl} + b^{(i)}\right), & (1)\\
# f_{jk} & = & \sigma\left(W^{(f)}x_j + \sum_{l=1}^{N}U_{kl}^{(f)} h_{jl} + b^{(f)} \right), & (2)\\
# o_j & = & \sigma\left(W^{(o)}x_j + \sum_{l=1}^{N}U_{l}^{(o)} h_{jl} + b^{(o)} \right), & (3) \\
# u_j & = & \textrm{tanh}\left(W^{(u)}x_j + \sum_{l=1}^{N} U_l^{(u)}h_{jl} + b^{(u)} \right), & (4)\\
# c_j & = & i_j \odot u_j + \sum_{l=1}^{N} f_{jl} \odot c_{jl}, &(5) \\
# h_j & = & o_j \cdot \textrm{tanh}(c_j), &(6) \\
#
# The computation can be decomposed into three phases: ``message_func``,
# ``reduce_func``, and ``apply_node_func``.
#
# .. note::
# ``apply_node_func`` is a new node UDF that has not been introduced before. In
# ``apply_node_func``, a user specifies what to do with node features,
# without considering edge features and messages. In the Tree-LSTM case,
# ``apply_node_func`` is a must, since there exist (leaf) nodes with
# :math:`0` incoming edges, which would not be updated by
# ``reduce_func``.
#
import torch as th
import torch.nn as nn
class TreeLSTMCell(nn.Module):
def __init__(self, x_size, h_size):
super(TreeLSTMCell, self).__init__()
self.W_iou = nn.Linear(x_size, 3 * h_size, bias=False)
self.U_iou = nn.Linear(2 * h_size, 3 * h_size, bias=False)
self.b_iou = nn.Parameter(th.zeros(1, 3 * h_size))
self.U_f = nn.Linear(2 * h_size, 2 * h_size)
def message_func(self, edges):
return {"h": edges.src["h"], "c": edges.src["c"]}
def reduce_func(self, nodes):
# concatenate h_jl for equation (1), (2), (3), (4)
h_cat = nodes.mailbox["h"].view(nodes.mailbox["h"].size(0), -1)
# equation (2)
f = th.sigmoid(self.U_f(h_cat)).view(*nodes.mailbox["h"].size())
# second term of equation (5)
c = th.sum(f * nodes.mailbox["c"], 1)
return {"iou": self.U_iou(h_cat), "c": c}
def apply_node_func(self, nodes):
# equation (1), (3), (4)
iou = nodes.data["iou"] + self.b_iou
i, o, u = th.chunk(iou, 3, 1)
i, o, u = th.sigmoid(i), th.sigmoid(o), th.tanh(u)
# equation (5)
c = i * u + nodes.data["c"]
# equation (6)
h = o * th.tanh(c)
return {"h": h, "c": c}
##############################################################################
# Step 3: Define traversal
# ------------------------
#
# After you define the message-passing functions, induce the
# right order in which to trigger them. This is a significant departure from models
# such as GCN, where all nodes pull messages from their upstream neighbors
# *simultaneously*.
#
# In the case of Tree-LSTM, messages start from the leaves of the tree and
# are propagated and processed upwards until they reach the roots. A visualization
# is as follows:
#
# .. figure:: https://i.loli.net/2018/11/09/5be4b5d2df54d.gif
# :alt:
#
# DGL defines a generator to perform the topological sort; each item is a
# tensor recording the nodes from the bottom level to the roots. One can
# appreciate the degree of parallelism by comparing the following two
# outputs:
#
# to heterogeneous graph
trv_a_tree = dgl.graph(a_tree.edges())
print("Traversing one tree:")
print(dgl.topological_nodes_generator(trv_a_tree))
# to heterogeneous graph
trv_graph = dgl.graph(graph.edges())
print("Traversing many trees at the same time:")
print(dgl.topological_nodes_generator(trv_graph))
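# To quantify the parallelism (an illustrative check): the number of
# message-passing steps equals the number of levels, while the nodes
# within each level are processed in parallel.
levels = list(dgl.topological_nodes_generator(trv_graph))
print("Steps needed:", len(levels))
print("Nodes processed in total:", sum(len(level) for level in levels))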
##############################################################################
# Call :meth:`~dgl.DGLGraph.prop_nodes` to trigger the message passing:
import dgl.function as fn
import torch as th
trv_graph.ndata["a"] = th.ones(graph.number_of_nodes(), 1)
traversal_order = dgl.topological_nodes_generator(trv_graph)
trv_graph.prop_nodes(
traversal_order,
message_func=fn.copy_u("a", "a"),
reduce_func=fn.sum("a", "a"),
)
# the following is syntactic sugar that does the same
# dgl.prop_nodes_topo(graph)
##############################################################################
# .. note::
#
# Before you call :meth:`~dgl.DGLGraph.prop_nodes`, specify a
# ``message_func`` and a ``reduce_func``. This example uses the built-in
# copy-from-source function as the message function and the built-in
# sum function as the reduce function.
#
# Putting it together
# -------------------
#
# Here is the complete code that specifies the ``Tree-LSTM`` class.
#
class TreeLSTM(nn.Module):
def __init__(
self,
num_vocabs,
x_size,
h_size,
num_classes,
dropout,
pretrained_emb=None,
):
super(TreeLSTM, self).__init__()
self.x_size = x_size
self.embedding = nn.Embedding(num_vocabs, x_size)
if pretrained_emb is not None:
print("Using glove")
self.embedding.weight.data.copy_(pretrained_emb)
self.embedding.weight.requires_grad = True
self.dropout = nn.Dropout(dropout)
self.linear = nn.Linear(h_size, num_classes)
self.cell = TreeLSTMCell(x_size, h_size)
def forward(self, batch, h, c):
"""Compute tree-lstm prediction given a batch.
Parameters
----------
batch : dgl.data.SSTBatch
The data batch.
h : Tensor
Initial hidden state.
c : Tensor
Initial cell state.
Returns
-------
logits : Tensor
The prediction of each node.
"""
g = batch.graph
# to heterogeneous graph
g = dgl.graph(g.edges())
# feed embedding
embeds = self.embedding(batch.wordid * batch.mask)
g.ndata["iou"] = self.cell.W_iou(
self.dropout(embeds)
) * batch.mask.float().unsqueeze(-1)
g.ndata["h"] = h
g.ndata["c"] = c
# propagate
dgl.prop_nodes_topo(
g,
message_func=self.cell.message_func,
reduce_func=self.cell.reduce_func,
apply_node_func=self.cell.apply_node_func,
)
# compute logits
h = self.dropout(g.ndata.pop("h"))
logits = self.linear(h)
return logits
import torch.nn.functional as F
##############################################################################
# Main Loop
# ---------
#
# Finally, write the training loop in PyTorch.
#
from torch.utils.data import DataLoader
device = th.device("cpu")
# hyperparameters
x_size = 256
h_size = 256
dropout = 0.5
lr = 0.05
weight_decay = 1e-4
epochs = 10
# create the model
model = TreeLSTM(
trainset.vocab_size, x_size, h_size, trainset.num_classes, dropout
)
print(model)
# create the optimizer
optimizer = th.optim.Adagrad(
model.parameters(), lr=lr, weight_decay=weight_decay
)
def batcher(dev):
def batcher_dev(batch):
batch_trees = dgl.batch(batch)
return SSTBatch(
graph=batch_trees,
mask=batch_trees.ndata["mask"].to(device),
wordid=batch_trees.ndata["x"].to(device),
label=batch_trees.ndata["y"].to(device),
)
return batcher_dev
train_loader = DataLoader(
dataset=tiny_sst,
batch_size=5,
collate_fn=batcher(device),
shuffle=False,
num_workers=0,
)
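# Peek at one collated batch (illustrative): ``mask`` marks the leaf (word)
# nodes whose embeddings are used; internal nodes are masked out.
example_batch = next(iter(train_loader))
print("Nodes in the batch:", example_batch.graph.number_of_nodes())
print("Leaf (word) nodes:", int(example_batch.mask.sum()))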
# training loop
for epoch in range(epochs):
for step, batch in enumerate(train_loader):
g = batch.graph
n = g.number_of_nodes()
h = th.zeros((n, h_size))
c = th.zeros((n, h_size))
logits = model(batch, h, c)
logp = F.log_softmax(logits, 1)
loss = F.nll_loss(logp, batch.label, reduction="sum")
optimizer.zero_grad()
loss.backward()
optimizer.step()
pred = th.argmax(logits, 1)
acc = float(th.sum(th.eq(batch.label, pred))) / len(batch.label)
print(
"Epoch {:05d} | Step {:05d} | Loss {:.4f} | Acc {:.4f} |".format(
epoch, step, loss.item(), acc
)
)
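# After training on the tiny set, you can run the model in evaluation mode
# (a minimal inference sketch that reuses the same loader).
model.eval()
with th.no_grad():
    for batch in train_loader:
        n = batch.graph.number_of_nodes()
        h = th.zeros((n, h_size))
        c = th.zeros((n, h_size))
        pred = th.argmax(model(batch, h, c), 1)
        print("Predicted sentiment classes:", pred.tolist())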
##############################################################################
# To train the model on a full dataset with different settings (such as CPU or GPU),
# refer to the `PyTorch example <https://github.com/dmlc/dgl/tree/master/examples/pytorch/tree_lstm>`__.
# There is also an implementation of the Child-Sum Tree-LSTM.
@@ -14,764 +14,780 @@ Generative Models of Graphs
efficiency. For recommended implementation, please refer to the `official
examples <https://github.com/dmlc/dgl/tree/master/examples>`_.
"""
##############################################################################
#
# In this tutorial, you learn how to train and generate one graph at
# a time. You also explore parallelism within the graph embedding operation, which is an
# essential building block. The tutorial ends with a simple optimization that
# delivers double the speed by batching across graphs.
#
# Earlier tutorials showed how embedding a graph or
# a node enables you to work on tasks such as `semi-supervised classification for nodes
# <http://docs.dgl.ai/tutorials/models/1_gcn.html#sphx-glr-tutorials-models-1-gcn-py>`__
# or `sentiment analysis
# <http://docs.dgl.ai/tutorials/models/3_tree-lstm.html#sphx-glr-tutorials-models-3-tree-lstm-py>`__.
# Wouldn't it be interesting to predict the future evolution of the graph and
# perform the analysis iteratively?
#
# To address the evolution of the graphs, you generate a variety of graph samples. In other words, you need
# **generative models** of graphs. In addition to learning
# node and edge features, you need to model the distribution of arbitrary graphs.
# While general generative models can model the density function explicitly or
# implicitly and generate samples all at once or sequentially, here you focus
# only on explicit generative models with sequential generation. Typical applications
# include drug or materials discovery, chemical processes, or proteomics.
#
# Introduction
# --------------------
# The primitive actions of mutating a graph in Deep Graph Library (DGL) are nothing more than ``add_nodes``
# and ``add_edges``. That is, if you were to draw a circle of three nodes,
#
# .. figure:: https://user-images.githubusercontent.com/19576924/48313438-78baf000-e5f7-11e8-931e-cd00ab34fa50.gif
# :alt:
#
# you can write the code as follows.
#
import os
os.environ['DGLBACKEND'] = 'pytorch'
import dgl
g = dgl.DGLGraph()
g.add_nodes(1) # Add node 0
g.add_nodes(1) # Add node 1
# Edges in DGLGraph are directed by default.
# For undirected edges, add edges for both directions.
g.add_edges([1, 0], [0, 1]) # Add edges (1, 0), (0, 1)
g.add_nodes(1) # Add node 2
g.add_edges([2, 1], [1, 2]) # Add edges (2, 1), (1, 2)
g.add_edges([2, 0], [0, 2]) # Add edges (2, 0), (0, 2)
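# A quick inspection of the resulting cycle (illustrative): three nodes and
# six directed edges, since each undirected edge is stored in both directions.
print('Nodes:', g.number_of_nodes())
print('Edges:', g.number_of_edges())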
#######################################################################################
# Real-world graphs are much more complex. There are many families of graphs,
# with different sizes, topologies, node types, edge types, and the possibility
# of multigraphs. Besides, the same graph can be generated in many different
# orders. Regardless, the generative process entails a few steps.
#
# - Encode a changing graph.
# - Perform actions stochastically.
# - If you are training, collect error signals and optimize the model parameters.
#
# When it comes to implementation, another important aspect is speed. How do you
# parallelize the computation, given that generating a graph is fundamentally a
# sequential process?
#
# .. note::
#
# To be sure, this is not necessarily a hard constraint. Subgraphs can be
# built in parallel and then assembled. But this tutorial
# restricts itself to sequential processes.
#
#
# DGMG: The main flow
# --------------------
# For this tutorial, you use
# `Deep Generative Models of Graphs <https://arxiv.org/abs/1803.03324>`__
# (DGMG) to implement a graph generative model using DGL. Its algorithmic
# framework is general but also challenging to parallelize.
#
# .. note::
#
# While it's possible for DGMG to handle complex graphs with typed nodes,
# typed edges, and multigraphs, here you use a simplified version of it
# for generating graph topologies.
#
# DGMG generates a graph by following a state machine, which is basically a
# two-level loop. Generate one node at a time and connect it to a subset of
# the existing nodes, one at a time. This is similar to language modeling. The
# generative process is an iterative one that emits one word or character or sentence
# at a time, conditioned on the sequence generated so far.
#
# At each time step, you either:
#
# - Add a new node to the graph
# - Select two existing nodes and add an edge between them
#
# .. figure:: https://user-images.githubusercontent.com/19576924/48605003-7f11e900-e9b6-11e8-8880-87362348e154.png
# :alt:
#
# The Python code will look as follows. In fact, this is *exactly* how inference
# with DGMG is implemented in DGL.
#
def forward_inference(self):
stop = self.add_node_and_update()
while (not stop) and (self.g.number_of_nodes() < self.v_max + 1):
num_trials = 0
to_add_edge = self.add_edge_or_not()
while to_add_edge and (num_trials < self.g.number_of_nodes() - 1):
self.choose_dest_and_update()
num_trials += 1
to_add_edge = self.add_edge_or_not()
stop = self.add_node_and_update()
return self.g
#######################################################################################
# Assume you have a pre-trained model for generating cycles of 10-20 nodes.
# How does it generate a cycle on-the-fly during inference? Use the code below
# to create an animation with your own model.
#
# ::
#
# import torch
# import matplotlib.animation as animation
# import matplotlib.pyplot as plt
# import networkx as nx
# from copy import deepcopy
#
# if __name__ == '__main__':
# # pre-trained model saved with path ./model.pth
# model = torch.load('./model.pth')
# model.eval()
# g = model()
#
# src_list = g.edges()[1]
# dest_list = g.edges()[0]
#
# evolution = []
#
# nx_g = nx.Graph()
# evolution.append(deepcopy(nx_g))
#
# for i in range(0, len(src_list), 2):
# src = src_list[i].item()
# dest = dest_list[i].item()
# if src not in nx_g.nodes():
# nx_g.add_node(src)
# evolution.append(deepcopy(nx_g))
# if dest not in nx_g.nodes():
# nx_g.add_node(dest)
# evolution.append(deepcopy(nx_g))
# nx_g.add_edges_from([(src, dest), (dest, src)])
# evolution.append(deepcopy(nx_g))
#
# def animate(i):
# ax.cla()
# g_t = evolution[i]
# nx.draw_circular(g_t, with_labels=True, ax=ax,
# node_color=['#FEBD69'] * g_t.number_of_nodes())
#
# fig, ax = plt.subplots()
# ani = animation.FuncAnimation(fig, animate,
# frames=len(evolution),
# interval=600)
#
# .. figure:: https://user-images.githubusercontent.com/19576924/48928548-2644d200-ef1b-11e8-8591-da93345382ad.gif
# :alt:
#
# DGMG: Optimization objective
# ------------------------------
# Similar to language modeling, DGMG trains the model with *behavior cloning*,
# or *teacher forcing*. Assume for each graph there exists a sequence of
# *oracle actions* :math:`a_{1},\cdots,a_{T}` that generates it. What the model
# does is follow these actions, compute the joint probability of such an
# action sequence, and maximize it.
#
# By chain rule, the probability of taking :math:`a_{1},\cdots,a_{T}` is:
#
# .. math::
#
# p(a_{1},\cdots, a_{T}) = p(a_{1})p(a_{2}|a_{1})\cdots p(a_{T}|a_{1},\cdots,a_{T-1}).\\
#
# The optimization objective is then simply the typical MLE loss:
#
# .. math::
#
# -\log p(a_{1},\cdots,a_{T})=-\sum_{t=1}^{T}\log p(a_{t}|a_{1},\cdots, a_{t-1}).\\
#
def forward_train(self, actions):
"""
- actions: list
- Contains a_1, ..., a_T described above
- self.prepare_for_train()
- Initializes self.action_step to be 0, which will get
incremented by 1 every time it is called.
- Initializes objects recording log p(a_t|a_1,...a_{t-1})
Returns
-------
- self.get_log_prob(): log p(a_1, ..., a_T)
"""
self.prepare_for_train()
stop = self.add_node_and_update(a=actions[self.action_step])
while not stop:
to_add_edge = self.add_edge_or_not(a=actions[self.action_step])
while to_add_edge:
self.choose_dest_and_update(a=actions[self.action_step])
to_add_edge = self.add_edge_or_not(a=actions[self.action_step])
stop = self.add_node_and_update(a=actions[self.action_step])
return self.get_log_prob()
#######################################################################################
# The key difference between ``forward_train`` and ``forward_inference`` is
# that the training process takes oracle actions as input and returns log
# probabilities for evaluating the loss.
#
# DGMG: The implementation
# --------------------------
# The ``DGMG`` class
# ``````````````````````````
# Below you can find the skeleton code for the model. You gradually
# fill in the details for each function.
#
import torch.nn as nn
class DGMGSkeleton(nn.Module):
def __init__(self, v_max):
"""
Parameters
----------
v_max: int
Max number of nodes considered
"""
super(DGMGSkeleton, self).__init__()
# Graph configuration
self.v_max = v_max
def add_node_and_update(self, a=None):
"""Decide if to add a new node.
If a new node should be added, update the graph."""
return NotImplementedError
def add_edge_or_not(self, a=None):
"""Decide if a new edge should be added."""
return NotImplementedError
def choose_dest_and_update(self, a=None):
"""Choose destination and connect it to the latest node.
Add edges for both directions and update the graph."""
return NotImplementedError
def forward_train(self, actions):
"""Forward at training time. It records the probability
of generating a ground truth graph following the actions."""
return NotImplementedError
def forward_inference(self):
"""Forward at inference time.
It generates graphs on the fly."""
return NotImplementedError
def forward(self, actions=None):
# The graph you will work on
self.g = dgl.DGLGraph()
# If there are some features for nodes and edges,
# zero tensors will be set for those of new nodes and edges.
self.g.set_n_initializer(dgl.frame.zero_initializer)
self.g.set_e_initializer(dgl.frame.zero_initializer)
if self.training:
return self.forward_train(actions=actions)
else:
return self.forward_inference()
#######################################################################################
# Encoding a dynamic graph
# ``````````````````````````
# All the actions generating a graph are sampled from probability
# distributions. In order to do that, you project the structured data,
# namely the graph, onto a Euclidean space. The challenge is that such a
# process, called *embedding*, needs to be repeated as the graph mutates.
#
# Graph embedding
# ''''''''''''''''''''''''''
# Let :math:`G=(V,E)` be an arbitrary graph. Each node :math:`v` has an
# embedding vector :math:`\textbf{h}_{v} \in \mathbb{R}^{n}`. Similarly,
# the graph has an embedding vector :math:`\textbf{h}_{G} \in \mathbb{R}^{k}`.
# Typically, :math:`k > n` since a graph contains more information than
# an individual node.
#
# The graph embedding is a weighted sum of node embeddings under a linear
# transformation:
#
# .. math::
#
# \textbf{h}_{G} =\sum_{v\in V}\text{Sigmoid}(g_m(\textbf{h}_{v}))f_{m}(\textbf{h}_{v}),\\
#
# The first term, :math:`\text{Sigmoid}(g_m(\textbf{h}_{v}))`, computes a
# gating function and can be thought of as how much the overall graph embedding
# attends to each node. The second term :math:`f_{m}:\mathbb{R}^{n}\rightarrow\mathbb{R}^{k}`
# maps the node embeddings to the space of graph embeddings.
#
# Implement graph embedding as a ``GraphEmbed`` class.
#
import torch
class GraphEmbed(nn.Module):
def __init__(self, node_hidden_size):
super(GraphEmbed, self).__init__()
# Setting from the paper
self.graph_hidden_size = 2 * node_hidden_size
# Embed graphs
self.node_gating = nn.Sequential(
nn.Linear(node_hidden_size, 1),
nn.Sigmoid()
)
self.node_to_graph = nn.Linear(node_hidden_size,
self.graph_hidden_size)
def forward(self, g):
if g.number_of_nodes() == 0:
return torch.zeros(1, self.graph_hidden_size)
else:
# Node features are stored as hv in ndata.
hvs = g.ndata['hv']
return (self.node_gating(hvs) *
self.node_to_graph(hvs)).sum(0, keepdim=True)
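# A small illustrative check (hypothetical sizes): on an empty graph the
# module returns the all-zero embedding of size 2 * node_hidden_size.
_graph_embed = GraphEmbed(node_hidden_size=16)
print(_graph_embed(dgl.DGLGraph()).shape)  # torch.Size([1, 32])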
#######################################################################################
# Update node embeddings via graph propagation
# '''''''''''''''''''''''''''''''''''''''''''''
#
# The mechanism of updating node embeddings in DGMG is similar to that for
# graph convolutional networks. For a node :math:`v` in the graph, its
# neighbor :math:`u` sends a message to it with
#
# .. math::
#
# \textbf{m}_{u\rightarrow v}=\textbf{W}_{m}\text{concat}([\textbf{h}_{v}, \textbf{h}_{u}, \textbf{x}_{u, v}]) + \textbf{b}_{m},\\
#
# where :math:`\textbf{x}_{u,v}` is the embedding of the edge between
# :math:`u` and :math:`v`.
#
# After receiving messages from all its neighbors, :math:`v` summarizes them
# with a node activation vector
#
# .. math::
#
# \textbf{a}_{v} = \sum_{u: (u, v)\in E}\textbf{m}_{u\rightarrow v}\\
#
# and uses this information to update its own feature:
#
# .. math::
#
# \textbf{h}'_{v} = \textbf{GRU}(\textbf{h}_{v}, \textbf{a}_{v}).\\
#
# Performing all the operations above once for all nodes synchronously is
# called one round of graph propagation. The more rounds of graph propagation
# you perform, the farther messages travel throughout the graph.
#
# With DGL, you implement graph propagation with ``g.update_all``.
# The message notation here can be a bit confusing. Researchers can refer
# to :math:`\textbf{m}_{u\rightarrow v}` as messages; however, the message function
# below only passes :math:`\text{concat}([\textbf{h}_{u}, \textbf{x}_{u, v}])`.
# The operation :math:`\textbf{W}_{m}\text{concat}([\textbf{h}_{v}, \textbf{h}_{u}, \textbf{x}_{u, v}]) + \textbf{b}_{m}`
# is then performed across all edges at once for efficiency.
#
from functools import partial
class GraphProp(nn.Module):
def __init__(self, num_prop_rounds, node_hidden_size):
super(GraphProp, self).__init__()
self.num_prop_rounds = num_prop_rounds
# Setting from the paper
self.node_activation_hidden_size = 2 * node_hidden_size
message_funcs = []
node_update_funcs = []
self.reduce_funcs = []
for t in range(num_prop_rounds):
# input being [hv, hu, xuv]
message_funcs.append(nn.Linear(2 * node_hidden_size + 1,
self.node_activation_hidden_size))
self.reduce_funcs.append(partial(self.dgmg_reduce, round=t))
node_update_funcs.append(
nn.GRUCell(self.node_activation_hidden_size,
node_hidden_size))
self.message_funcs = nn.ModuleList(message_funcs)
self.node_update_funcs = nn.ModuleList(node_update_funcs)
def dgmg_msg(self, edges):
"""For an edge u->v, return concat([h_u, x_uv])"""
return {'m': torch.cat([edges.src['hv'],
edges.data['he']],
dim=1)}
def dgmg_reduce(self, nodes, round):
hv_old = nodes.data['hv']
m = nodes.mailbox['m']
message = torch.cat([
hv_old.unsqueeze(1).expand(-1, m.size(1), -1), m], dim=2)
node_activation = (self.message_funcs[round](message)).sum(1)
return {'a': node_activation}
def forward(self, g):
if g.number_of_edges() > 0:
for t in range(self.num_prop_rounds):
g.update_all(message_func=self.dgmg_msg,
reduce_func=self.reduce_funcs[t])
g.ndata['hv'] = self.node_update_funcs[t](
g.ndata['a'], g.ndata['hv'])
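# An illustrative run of GraphProp (a sketch with hypothetical feature sizes):
# reuse the three-node cycle `g` built earlier, attach random node features
# 'hv' and constant edge features 'he', then propagate for two rounds.
_prop = GraphProp(num_prop_rounds=2, node_hidden_size=16)
g.ndata['hv'] = torch.randn(g.number_of_nodes(), 16)
g.edata['he'] = torch.ones(g.number_of_edges(), 1)
_prop(g)
print(g.ndata['hv'].shape)  # torch.Size([3, 16]), the updated node embeddings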
#######################################################################################
# Actions
# ``````````````````````````
# All actions are sampled from distributions parameterized by neural networks.
# Here they are, in turn.
#
# Action 1: Add nodes
# ''''''''''''''''''''''''''
#
# Given the graph embedding vector :math:`\textbf{h}_{G}`, evaluate
#
# .. math::
#
# \text{Sigmoid}(\textbf{W}_{\text{add node}}\textbf{h}_{G}+b_{\text{add node}}),\\
#
# which is then used to parametrize a Bernoulli distribution for deciding whether
# to add a new node.
#
# If a new node is to be added, initialize its feature with
#
# .. math::
#
# \textbf{W}_{\text{init}}\text{concat}([\textbf{h}_{\text{init}} , \textbf{h}_{G}])+\textbf{b}_{\text{init}},\\
#
# where :math:`\textbf{h}_{\text{init}}` is a learnable embedding module for
# untyped nodes.
#
import torch.nn.functional as F
from torch.distributions import Bernoulli
def bernoulli_action_log_prob(logit, action):
"""Calculate the log p of an action with respect to a Bernoulli
distribution. Use logit rather than prob for numerical stability."""
if action == 0:
return F.logsigmoid(-logit)
else:
return F.logsigmoid(logit)
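# Sanity check (illustrative): with a logit of 0 both actions have
# probability 0.5, so the log-probability is log(0.5), roughly -0.693.
print(bernoulli_action_log_prob(torch.tensor([0.0]), 1))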
class AddNode(nn.Module):
def __init__(self, graph_embed_func, node_hidden_size):
super(AddNode, self).__init__()
self.graph_op = {'embed': graph_embed_func}
self.stop = 1
self.add_node = nn.Linear(graph_embed_func.graph_hidden_size, 1)
# If a node is to be added, initialize its hv
self.node_type_embed = nn.Embedding(1, node_hidden_size)
self.initialize_hv = nn.Linear(node_hidden_size + \
graph_embed_func.graph_hidden_size,
node_hidden_size)
self.init_node_activation = torch.zeros(1, 2 * node_hidden_size)
def _initialize_node_repr(self, g, node_type, graph_embed):
"""Whenver a node is added, initialize its representation."""
num_nodes = g.number_of_nodes()
hv_init = self.initialize_hv(
torch.cat([
self.node_type_embed(torch.LongTensor([node_type])),
graph_embed], dim=1))
g.nodes[num_nodes - 1].data['hv'] = hv_init
g.nodes[num_nodes - 1].data['a'] = self.init_node_activation
def prepare_training(self):
self.log_prob = []
def forward(self, g, action=None):
graph_embed = self.graph_op['embed'](g)
logit = self.add_node(graph_embed)
prob = torch.sigmoid(logit)
if not self.training:
action = Bernoulli(prob).sample().item()
stop = bool(action == self.stop)
if not stop:
g.add_nodes(1)
self._initialize_node_repr(g, action, graph_embed)
if self.training:
sample_log_prob = bernoulli_action_log_prob(logit, action)
self.log_prob.append(sample_log_prob)
return stop
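# A quick look at AddNode's decision layer (illustrative, hypothetical sizes):
# it maps the 2 * node_hidden_size graph embedding to a single add-node logit.
print(AddNode(GraphEmbed(node_hidden_size=16), node_hidden_size=16).add_node)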
#######################################################################################
# Action 2: Add edges
# ''''''''''''''''''''''''''
#
# Given the graph embedding vector :math:`\textbf{h}_{G}` and the node
# embedding vector :math:`\textbf{h}_{v}` for the latest node :math:`v`,
# you evaluate
#
# .. math::
#
# \text{Sigmoid}(\textbf{W}_{\text{add edge}}\text{concat}([\textbf{h}_{G}, \textbf{h}_{v}])+b_{\text{add edge}}),\\
#
# which is then used to parametrize a Bernoulli distribution for deciding
# whether to add a new edge starting from :math:`v`.
#
class AddEdge(nn.Module):
def __init__(self, graph_embed_func, node_hidden_size):
super(AddEdge, self).__init__()
self.graph_op = {'embed': graph_embed_func}
self.add_edge = nn.Linear(graph_embed_func.graph_hidden_size + \
node_hidden_size, 1)
def prepare_training(self):
self.log_prob = []
def forward(self, g, action=None):
graph_embed = self.graph_op['embed'](g)
src_embed = g.nodes[g.number_of_nodes() - 1].data['hv']
logit = self.add_edge(torch.cat(
[graph_embed, src_embed], dim=1))
prob = torch.sigmoid(logit)
if self.training:
sample_log_prob = bernoulli_action_log_prob(logit, action)
self.log_prob.append(sample_log_prob)
else:
action = Bernoulli(prob).sample().item()
to_add_edge = bool(action == 0)
return to_add_edge
#######################################################################################
# Action 3: Choose a destination
# '''''''''''''''''''''''''''''''''
#
# When action 2 returns `True`, choose a destination for the
# latest node :math:`v`.
#
# For each possible destination :math:`u\in\{0, \cdots, v-1\}`, the
# probability of choosing it is given by
#
# .. math::
#
# \frac{\text{exp}(\textbf{W}_{\text{dest}}\text{concat}([\textbf{h}_{u}, \textbf{h}_{v}])+\textbf{b}_{\text{dest}})}{\sum_{i=0}^{v-1}\text{exp}(\textbf{W}_{\text{dest}}\text{concat}([\textbf{h}_{i}, \textbf{h}_{v}])+\textbf{b}_{\text{dest}})}\\
#
from torch.distributions import Categorical
class ChooseDestAndUpdate(nn.Module):
def __init__(self, graph_prop_func, node_hidden_size):
super(ChooseDestAndUpdate, self).__init__()
self.graph_op = {'prop': graph_prop_func}
self.choose_dest = nn.Linear(2 * node_hidden_size, 1)
def _initialize_edge_repr(self, g, src_list, dest_list):
# For untyped edges, only add 1 to indicate its existence.
# For multiple edge types, use a one-hot representation
# or an embedding module.
edge_repr = torch.ones(len(src_list), 1)
g.edges[src_list, dest_list].data['he'] = edge_repr
def prepare_training(self):
self.log_prob = []
def forward(self, g, dest):
src = g.number_of_nodes() - 1
possible_dests = range(src)
src_embed_expand = g.nodes[src].data['hv'].expand(src, -1)
possible_dests_embed = g.nodes[possible_dests].data['hv']
dests_scores = self.choose_dest(
torch.cat([possible_dests_embed,
src_embed_expand], dim=1)).view(1, -1)
dests_probs = F.softmax(dests_scores, dim=1)
if not self.training:
dest = Categorical(dests_probs).sample().item()
if not g.has_edges_between(src, dest):
# For undirected graphs, add edges for both directions
# so that you can perform graph propagation.
src_list = [src, dest]
dest_list = [dest, src]
g.add_edges(src_list, dest_list)
self._initialize_edge_repr(g, src_list, dest_list)
self.graph_op['prop'](g)
if self.training:
if dests_probs.nelement() > 1:
self.log_prob.append(
F.log_softmax(dests_scores, dim=1)[:, dest: dest + 1])
#######################################################################################
# Putting it together
# ``````````````````````````
#
# You are now ready to have a complete implementation of the model class.
#
class DGMG(DGMGSkeleton):
def __init__(self, v_max, node_hidden_size,
num_prop_rounds):
super(DGMG, self).__init__(v_max)
# Graph embedding module
self.graph_embed = GraphEmbed(node_hidden_size)
# Graph propagation module
self.graph_prop = GraphProp(num_prop_rounds,
node_hidden_size)
# Actions
self.add_node_agent = AddNode(
self.graph_embed, node_hidden_size)
self.add_edge_agent = AddEdge(
self.graph_embed, node_hidden_size)
self.choose_dest_agent = ChooseDestAndUpdate(
self.graph_prop, node_hidden_size)
# Forward functions
self.forward_train = partial(forward_train, self=self)
self.forward_inference = partial(forward_inference, self=self)
@property
def action_step(self):
old_step_count = self.step_count
self.step_count += 1
return old_step_count
def prepare_for_train(self):
self.step_count = 0
self.add_node_agent.prepare_training()
self.add_edge_agent.prepare_training()
self.choose_dest_agent.prepare_training()
def add_node_and_update(self, a=None):
"""Decide if to add a new node.
If a new node should be added, update the graph."""
return self.add_node_agent(self.g, a)
def add_edge_or_not(self, a=None):
"""Decide if a new edge should be added."""
return self.add_edge_agent(self.g, a)
def choose_dest_and_update(self, a=None):
"""Choose destination and connect it to the latest node.
Add edges for both directions and update the graph."""
self.choose_dest_agent(self.g, a)
def get_log_prob(self):
add_node_log_p = torch.cat(self.add_node_agent.log_prob).sum()
add_edge_log_p = torch.cat(self.add_edge_agent.log_prob).sum()
choose_dest_log_p = torch.cat(self.choose_dest_agent.log_prob).sum()
return add_node_log_p + add_edge_log_p + choose_dest_log_p
#######################################################################################
# Below is an animation where a graph is generated on the fly
# after every 10 batches of training for the first 400 batches. You
# can see how the model improves over time and begins generating cycles.
#
# .. figure:: https://user-images.githubusercontent.com/19576924/48929291-60fe3880-ef22-11e8-832a-fbe56656559a.gif
# :alt:
#
# For generative models, you can evaluate performance by checking the percentage
# of valid graphs among the graphs it generates on the fly.
import torch.utils.model_zoo as model_zoo
# Download a pre-trained model state dict for generating cycles with 10-20 nodes.
state_dict = model_zoo.load_url('https://data.dgl.ai/model/dgmg_cycles-5a0c40be.pth')
model = DGMG(v_max=20, node_hidden_size=16, num_prop_rounds=2)
model.load_state_dict(state_dict)
model.eval()
def is_valid(g):
# Check if g is a cycle having 10-20 nodes.
def _get_previous(i, v_max):
if i == 0:
return v_max
else:
return i - 1
def _get_next(i, v_max):
if i == v_max:
return 0
else:
return i + 1
size = g.number_of_nodes()
if size < 10 or size > 20:
return False
for node in range(size):
neighbors = g.successors(node)
if len(neighbors) != 2:
return False
if _get_previous(node, size - 1) not in neighbors:
return False
if _get_next(node, size - 1) not in neighbors:
return False
return True
num_valid = 0
for i in range(100):
g = model()
num_valid += is_valid(g)
del model
print('Among 100 graphs generated, {}% are valid.'.format(num_valid))
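# As a further illustrative check, is_valid accepts a hand-built cycle of a
# valid size (here 12 nodes, with both edge directions added).
cycle = dgl.DGLGraph()
cycle.add_nodes(12)
src = list(range(12))
dst = [(i + 1) % 12 for i in range(12)]
cycle.add_edges(src + dst, dst + src)
print('Hand-built 12-node cycle is valid:', is_valid(cycle))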
#######################################################################################
# For the complete implementation, see the `DGL DGMG example
# <https://github.com/dmlc/dgl/tree/master/examples/pytorch/dgmg>`__.
#
"""
##############################################################################
#
# In this tutorial, you learn how to train and generate one graph at
# a time. You also explore parallelism within the graph embedding operation, which is an
# essential building block. The tutorial ends with a simple optimization that
# delivers double the speed by batching across graphs.
#
# Earlier tutorials showed how embedding a graph or
# a node enables you to work on tasks such as `semi-supervised classification for nodes
# <http://docs.dgl.ai/tutorials/models/1_gcn.html#sphx-glr-tutorials-models-1-gcn-py>`__
# or `sentiment analysis
# <http://docs.dgl.ai/tutorials/models/3_tree-lstm.html#sphx-glr-tutorials-models-3-tree-lstm-py>`__.
# Wouldn't it be interesting to predict the future evolution of the graph and
# perform the analysis iteratively?
#
# To address the evolution of the graphs, you generate a variety of graph samples. In other words, you need
# **generative models** of graphs. In addition to learning
# node and edge features, you need to model the distribution of arbitrary graphs.
# While general generative models can model the density function explicitly or
# implicitly and generate samples all at once or sequentially, here you focus
# only on explicit generative models with sequential generation. Typical applications
# include drug or materials discovery, chemical processes, or proteomics.
#
# Introduction
# --------------------
# The primitive actions of mutating a graph in Deep Graph Library (DGL) are nothing more than ``add_nodes``
# and ``add_edges``. That is, if you were to draw a circle of three nodes,
#
# .. figure:: https://user-images.githubusercontent.com/19576924/48313438-78baf000-e5f7-11e8-931e-cd00ab34fa50.gif
# :alt:
#
# you can write the code as follows.
#
import os
os.environ["DGLBACKEND"] = "pytorch"
import dgl
g = dgl.DGLGraph()
g.add_nodes(1) # Add node 0
g.add_nodes(1) # Add node 1
# Edges in DGLGraph are directed by default.
# For undirected edges, add edges for both directions.
g.add_edges([1, 0], [0, 1]) # Add edges (1, 0), (0, 1)
g.add_nodes(1) # Add node 2
g.add_edges([2, 1], [1, 2]) # Add edges (2, 1), (1, 2)
g.add_edges([2, 0], [0, 2]) # Add edges (2, 0), (0, 2)
#######################################################################################
# Real-world graphs are much more complex. There are many families of graphs,
# with different sizes, topologies, node types, edge types, and the possibility
# of multigraphs. Besides, the same graph can be generated in many different
# orders. Regardless, the generative process entails a few steps.
#
# - Encode a changing graph.
# - Perform actions stochastically.
# - If you are training, collect error signals and optimize the model parameters.
#
# When it comes to implementation, another important aspect is speed. How do you
# parallelize the computation, given that generating a graph is fundamentally a
# sequential process?
#
# .. note::
#
# To be sure, this is not necessarily a hard constraint. Subgraphs can be
# built in parallel and then assembled. But this tutorial
# restricts itself to sequential processes.
#
#
# DGMG: The main flow
# --------------------
# For this tutorial, you use
# `Deep Generative Models of Graphs <https://arxiv.org/abs/1803.03324>`__
# (DGMG) to implement a graph generative model using DGL. Its algorithmic
# framework is general but also challenging to parallelize.
#
# .. note::
#
# While it's possible for DGMG to handle complex graphs with typed nodes,
# typed edges, and multigraphs, here you use a simplified version of it
# for generating graph topologies.
#
# DGMG generates a graph by following a state machine, which is basically a
# two-level loop. Generate one node at a time and connect it to a subset of
# the existing nodes, one at a time. This is similar to language modeling. The
# generative process is an iterative one that emits one word or character or sentence
# at a time, conditioned on the sequence generated so far.
#
# At each time step, you either:
#
# - Add a new node to the graph
# - Select two existing nodes and add an edge between them
#
# .. figure:: https://user-images.githubusercontent.com/19576924/48605003-7f11e900-e9b6-11e8-8880-87362348e154.png
# :alt:
#
# The Python code will look as follows. In fact, this is *exactly* how inference
# with DGMG is implemented in DGL.
#
def forward_inference(self):
stop = self.add_node_and_update()
while (not stop) and (self.g.number_of_nodes() < self.v_max + 1):
num_trials = 0
to_add_edge = self.add_edge_or_not()
while to_add_edge and (num_trials < self.g.number_of_nodes() - 1):
self.choose_dest_and_update()
num_trials += 1
to_add_edge = self.add_edge_or_not()
stop = self.add_node_and_update()
return self.g
#######################################################################################
# Assume you have a pre-trained model for generating cycles of 10-20 nodes.
# How does it generate a cycle on-the-fly during inference? Use the code below
# to create an animation with your own model.
#
# ::
#
# import torch
# import matplotlib.animation as animation
# import matplotlib.pyplot as plt
# import networkx as nx
# from copy import deepcopy
#
# if __name__ == '__main__':
# # pre-trained model saved with path ./model.pth
# model = torch.load('./model.pth')
# model.eval()
# g = model()
#
# src_list = g.edges()[1]
# dest_list = g.edges()[0]
#
# evolution = []
#
# nx_g = nx.Graph()
# evolution.append(deepcopy(nx_g))
#
# for i in range(0, len(src_list), 2):
# src = src_list[i].item()
# dest = dest_list[i].item()
# if src not in nx_g.nodes():
# nx_g.add_node(src)
# evolution.append(deepcopy(nx_g))
# if dest not in nx_g.nodes():
# nx_g.add_node(dest)
# evolution.append(deepcopy(nx_g))
# nx_g.add_edges_from([(src, dest), (dest, src)])
# evolution.append(deepcopy(nx_g))
#
# def animate(i):
# ax.cla()
# g_t = evolution[i]
# nx.draw_circular(g_t, with_labels=True, ax=ax,
# node_color=['#FEBD69'] * g_t.number_of_nodes())
#
# fig, ax = plt.subplots()
# ani = animation.FuncAnimation(fig, animate,
# frames=len(evolution),
# interval=600)
#
# .. figure:: https://user-images.githubusercontent.com/19576924/48928548-2644d200-ef1b-11e8-8591-da93345382ad.gif
# :alt:
#
# DGMG: Optimization objective
# ------------------------------
# Similar to language modeling, DGMG trains the model with *behavior cloning*,
# or *teacher forcing*. Assume for each graph there exists a sequence of
# *oracle actions* :math:`a_{1},\cdots,a_{T}` that generates it. What the model
# does is follow these actions, compute the joint probability of such an
# action sequence, and maximize it.
#
# By the chain rule, the probability of taking :math:`a_{1},\cdots,a_{T}` is:
#
# .. math::
#
# p(a_{1},\cdots, a_{T}) = p(a_{1})p(a_{2}|a_{1})\cdots p(a_{T}|a_{1},\cdots,a_{T-1}).\\
#
# The optimization objective is then simply the typical MLE loss:
#
# .. math::
#
# -\log p(a_{1},\cdots,a_{T})=-\sum_{t=1}^{T}\log p(a_{t}|a_{1},\cdots, a_{t-1}).\\
#
def forward_train(self, actions):
"""
- actions: list
- Contains a_1, ..., a_T described above
- self.prepare_for_train()
- Initializes self.action_step to be 0, which will get
incremented by 1 every time it is called.
- Initializes objects recording log p(a_t|a_1,...a_{t-1})
Returns
-------
- self.get_log_prob(): log p(a_1, ..., a_T)
"""
self.prepare_for_train()
stop = self.add_node_and_update(a=actions[self.action_step])
while not stop:
to_add_edge = self.add_edge_or_not(a=actions[self.action_step])
while to_add_edge:
self.choose_dest_and_update(a=actions[self.action_step])
to_add_edge = self.add_edge_or_not(a=actions[self.action_step])
stop = self.add_node_and_update(a=actions[self.action_step])
return self.get_log_prob()
#######################################################################################
# The key difference between ``forward_train`` and ``forward_inference`` is
# that the training process takes oracle actions as input and returns log
# probabilities for evaluating the loss.
#
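# For concreteness, here is what an oracle action sequence could look like under
# the encoding used by the action modules defined later in this tutorial: ``0``
# means "add a node" or "add an edge", ``1`` means "stop", and the action right
# after an "add an edge" decision is the index of the chosen destination node.
# This is an illustrative sketch rather than data taken from a real dataset.
#
# ::
#
#     # A possible oracle sequence for a triangle over nodes 0, 1, 2:
#     # add node 0; no edge; add node 1; add an edge (to 0); stop adding edges;
#     # add node 2; add an edge (to 1); add an edge (to 0); stop adding edges;
#     # stop adding nodes.
#     actions = [0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1]
#
#     # Assuming ``model`` is an instance of the ``DGMG`` class assembled at the
#     # end of this tutorial:
#     log_prob = model(actions=actions)
#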
# DGMG: The implementation
# --------------------------
# The ``DGMG`` class
# ``````````````````````````
# Below you can find the skeleton code for the model. You gradually
# fill in the details for each function.
#
import torch.nn as nn
class DGMGSkeleton(nn.Module):
def __init__(self, v_max):
"""
Parameters
----------
v_max: int
Max number of nodes considered
"""
super(DGMGSkeleton, self).__init__()
# Graph configuration
self.v_max = v_max
def add_node_and_update(self, a=None):
"""Decide if to add a new node.
If a new node should be added, update the graph."""
raise NotImplementedError
def add_edge_or_not(self, a=None):
"""Decide if a new edge should be added."""
raise NotImplementedError
def choose_dest_and_update(self, a=None):
"""Choose destination and connect it to the latest node.
Add edges for both directions and update the graph."""
raise NotImplementedError
def forward_train(self, actions):
"""Forward at training time. It records the probability
of generating a ground truth graph following the actions."""
raise NotImplementedError
def forward_inference(self):
"""Forward at inference time.
It generates graphs on the fly."""
raise NotImplementedError
def forward(self, actions=None):
# The graph you will work on
self.g = dgl.DGLGraph()
# If nodes and edges have features, the features of newly added
# nodes and edges will be initialized as zero tensors.
self.g.set_n_initializer(dgl.frame.zero_initializer)
self.g.set_e_initializer(dgl.frame.zero_initializer)
if self.training:
return self.forward_train(actions=actions)
else:
return self.forward_inference()
#######################################################################################
# Encoding a dynamic graph
# ``````````````````````````
# All the actions generating a graph are sampled from probability
# distributions. In order to do that, you project the structured data,
# namely the graph, onto a Euclidean space. The challenge is that this
# process, called *embedding*, needs to be repeated as the graph mutates.
#
# Graph embedding
# ''''''''''''''''''''''''''
# Let :math:`G=(V,E)` be an arbitrary graph. Each node :math:`v` has an
# embedding vector :math:`\textbf{h}_{v} \in \mathbb{R}^{n}`. Similarly,
# the graph has an embedding vector :math:`\textbf{h}_{G} \in \mathbb{R}^{k}`.
# Typically, :math:`k > n` since a graph contains more information than
# an individual node.
#
# The graph embedding is a weighted sum of node embeddings under a linear
# transformation:
#
# .. math::
#
# \textbf{h}_{G} =\sum_{v\in V}\text{Sigmoid}(g_m(\textbf{h}_{v}))f_{m}(\textbf{h}_{v}),\\
#
# The first term, :math:`\text{Sigmoid}(g_m(\textbf{h}_{v}))`, computes a
# gating function and can be thought of as how much the overall graph embedding
# attends to each node. The second term :math:`f_{m}:\mathbb{R}^{n}\rightarrow\mathbb{R}^{k}`
# maps the node embeddings to the space of graph embeddings.
#
# Implement graph embedding as a ``GraphEmbed`` class.
#
import torch
class GraphEmbed(nn.Module):
def __init__(self, node_hidden_size):
super(GraphEmbed, self).__init__()
# Setting from the paper
self.graph_hidden_size = 2 * node_hidden_size
# Embed graphs
self.node_gating = nn.Sequential(
nn.Linear(node_hidden_size, 1), nn.Sigmoid()
)
self.node_to_graph = nn.Linear(node_hidden_size, self.graph_hidden_size)
def forward(self, g):
if g.number_of_nodes() == 0:
return torch.zeros(1, self.graph_hidden_size)
else:
# Node features are stored as hv in ndata.
hvs = g.ndata["hv"]
return (self.node_gating(hvs) * self.node_to_graph(hvs)).sum(
0, keepdim=True
)
#######################################################################################
# Update node embeddings via graph propagation
# '''''''''''''''''''''''''''''''''''''''''''''
#
# The mechanism of updating node embeddings in DGMG is similar to that for
# graph convolutional networks. For a node :math:`v` in the graph, its
# neighbor :math:`u` sends a message to it with
#
# .. math::
#
# \textbf{m}_{u\rightarrow v}=\textbf{W}_{m}\text{concat}([\textbf{h}_{v}, \textbf{h}_{u}, \textbf{x}_{u, v}]) + \textbf{b}_{m},\\
#
# where :math:`\textbf{x}_{u,v}` is the embedding of the edge between
# :math:`u` and :math:`v`.
#
# After receiving messages from all its neighbors, :math:`v` summarizes them
# with a node activation vector
#
# .. math::
#
# \textbf{a}_{v} = \sum_{u: (u, v)\in E}\textbf{m}_{u\rightarrow v}\\
#
# and uses this information to update its own feature:
#
# .. math::
#
# \textbf{h}'_{v} = \textbf{GRU}(\textbf{h}_{v}, \textbf{a}_{v}).\\
#
# Performing all the operations above once for all nodes synchronously is
# called one round of graph propagation. The more rounds of graph propagation
# you perform, the longer distance messages travel throughout the graph.
#
# With DGL, you implement graph propagation with ``g.update_all``.
# The message notation here can be a bit confusing. Researchers may refer
# to :math:`\textbf{m}_{u\rightarrow v}` as messages; however, the message function
# below only passes :math:`\text{concat}([\textbf{h}_{u}, \textbf{x}_{u, v}])`.
# The operation :math:`\textbf{W}_{m}\text{concat}([\textbf{h}_{v}, \textbf{h}_{u}, \textbf{x}_{u, v}]) + \textbf{b}_{m}`
# is then performed across all edges at once for efficiency.
#
from functools import partial
class GraphProp(nn.Module):
def __init__(self, num_prop_rounds, node_hidden_size):
super(GraphProp, self).__init__()
self.num_prop_rounds = num_prop_rounds
# Setting from the paper
self.node_activation_hidden_size = 2 * node_hidden_size
message_funcs = []
node_update_funcs = []
self.reduce_funcs = []
for t in range(num_prop_rounds):
# input being [hv, hu, xuv]
message_funcs.append(
nn.Linear(
2 * node_hidden_size + 1, self.node_activation_hidden_size
)
)
self.reduce_funcs.append(partial(self.dgmg_reduce, round=t))
node_update_funcs.append(
nn.GRUCell(self.node_activation_hidden_size, node_hidden_size)
)
self.message_funcs = nn.ModuleList(message_funcs)
self.node_update_funcs = nn.ModuleList(node_update_funcs)
def dgmg_msg(self, edges):
"""For an edge u->v, return concat([h_u, x_uv])"""
return {"m": torch.cat([edges.src["hv"], edges.data["he"]], dim=1)}
def dgmg_reduce(self, nodes, round):
hv_old = nodes.data["hv"]
m = nodes.mailbox["m"]
message = torch.cat(
[hv_old.unsqueeze(1).expand(-1, m.size(1), -1), m], dim=2
)
node_activation = (self.message_funcs[round](message)).sum(1)
return {"a": node_activation}
def forward(self, g):
if g.number_of_edges() > 0:
for t in range(self.num_prop_rounds):
g.update_all(
message_func=self.dgmg_msg, reduce_func=self.reduce_funcs[t]
)
g.ndata["hv"] = self.node_update_funcs[t](
g.ndata["a"], g.ndata["hv"]
)
#######################################################################################
# Actions
# ``````````````````````````
# All actions are sampled from distributions parameterized by neural networks.
# They are described in turn below.
#
# Action 1: Add nodes
# ''''''''''''''''''''''''''
#
# Given the graph embedding vector :math:`\textbf{h}_{G}`, evaluate
#
# .. math::
#
# \text{Sigmoid}(\textbf{W}_{\text{add node}}\textbf{h}_{G}+b_{\text{add node}}),\\
#
# which is then used to parametrize a Bernoulli distribution for deciding whether
# to add a new node.
#
# If a new node is to be added, initialize its feature with
#
# .. math::
#
# \textbf{W}_{\text{init}}\text{concat}([\textbf{h}_{\text{init}} , \textbf{h}_{G}])+\textbf{b}_{\text{init}},\\
#
# where :math:`\textbf{h}_{\text{init}}` is a learnable embedding module for
# untyped nodes.
#
import torch.nn.functional as F
from torch.distributions import Bernoulli
def bernoulli_action_log_prob(logit, action):
"""Calculate the log p of an action with respect to a Bernoulli
distribution. Use logit rather than prob for numerical stability."""
if action == 0:
return F.logsigmoid(-logit)
else:
return F.logsigmoid(logit)
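# For example, with logit = torch.tensor(-200.0), torch.log(torch.sigmoid(logit))
# underflows to -inf in float32, while F.logsigmoid(logit) returns -200.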
class AddNode(nn.Module):
def __init__(self, graph_embed_func, node_hidden_size):
super(AddNode, self).__init__()
self.graph_op = {"embed": graph_embed_func}
self.stop = 1
self.add_node = nn.Linear(graph_embed_func.graph_hidden_size, 1)
# If a node is to be added, initialize its hv
self.node_type_embed = nn.Embedding(1, node_hidden_size)
self.initialize_hv = nn.Linear(
node_hidden_size + graph_embed_func.graph_hidden_size,
node_hidden_size,
)
self.init_node_activation = torch.zeros(1, 2 * node_hidden_size)
def _initialize_node_repr(self, g, node_type, graph_embed):
"""Whenver a node is added, initialize its representation."""
num_nodes = g.number_of_nodes()
hv_init = self.initialize_hv(
torch.cat(
[
self.node_type_embed(torch.LongTensor([node_type])),
graph_embed,
],
dim=1,
)
)
g.nodes[num_nodes - 1].data["hv"] = hv_init
g.nodes[num_nodes - 1].data["a"] = self.init_node_activation
def prepare_training(self):
self.log_prob = []
def forward(self, g, action=None):
graph_embed = self.graph_op["embed"](g)
logit = self.add_node(graph_embed)
prob = torch.sigmoid(logit)
if not self.training:
action = Bernoulli(prob).sample().item()
stop = bool(action == self.stop)
if not stop:
g.add_nodes(1)
self._initialize_node_repr(g, action, graph_embed)
if self.training:
sample_log_prob = bernoulli_action_log_prob(logit, action)
self.log_prob.append(sample_log_prob)
return stop
#######################################################################################
# Action 2: Add edges
# ''''''''''''''''''''''''''
#
# Given the graph embedding vector :math:`\textbf{h}_{G}` and the node
# embedding vector :math:`\textbf{h}_{v}` for the latest node :math:`v`,
# you evaluate
#
# .. math::
#
# \text{Sigmoid}(\textbf{W}_{\text{add edge}}\text{concat}([\textbf{h}_{G}, \textbf{h}_{v}])+b_{\text{add edge}}),\\
#
# which is then used to parametrize a Bernoulli distribution for deciding
# whether to add a new edge starting from :math:`v`.
#
class AddEdge(nn.Module):
def __init__(self, graph_embed_func, node_hidden_size):
super(AddEdge, self).__init__()
self.graph_op = {"embed": graph_embed_func}
self.add_edge = nn.Linear(
graph_embed_func.graph_hidden_size + node_hidden_size, 1
)
def prepare_training(self):
self.log_prob = []
def forward(self, g, action=None):
graph_embed = self.graph_op["embed"](g)
src_embed = g.nodes[g.number_of_nodes() - 1].data["hv"]
logit = self.add_edge(torch.cat([graph_embed, src_embed], dim=1))
prob = torch.sigmoid(logit)
if self.training:
sample_log_prob = bernoulli_action_log_prob(logit, action)
self.log_prob.append(sample_log_prob)
else:
action = Bernoulli(prob).sample().item()
to_add_edge = bool(action == 0)
return to_add_edge
#######################################################################################
# Action 3: Choose a destination
# '''''''''''''''''''''''''''''''''
#
# When action 2 returns `True`, choose a destination for the
# latest node :math:`v`.
#
# For each possible destination :math:`u\in\{0, \cdots, v-1\}`, the
# probability of choosing it is given by
#
# .. math::
#
# \frac{\text{exp}(\textbf{W}_{\text{dest}}\text{concat}([\textbf{h}_{u}, \textbf{h}_{v}])+\textbf{b}_{\text{dest}})}{\sum_{i=0}^{v-1}\text{exp}(\textbf{W}_{\text{dest}}\text{concat}([\textbf{h}_{i}, \textbf{h}_{v}])+\textbf{b}_{\text{dest}})}\\
#
from torch.distributions import Categorical
class ChooseDestAndUpdate(nn.Module):
def __init__(self, graph_prop_func, node_hidden_size):
super(ChooseDestAndUpdate, self).__init__()
self.graph_op = {"prop": graph_prop_func}
self.choose_dest = nn.Linear(2 * node_hidden_size, 1)
def _initialize_edge_repr(self, g, src_list, dest_list):
# For untyped edges, only add 1 to indicate its existence.
# For multiple edge types, use a one-hot representation
# or an embedding module.
edge_repr = torch.ones(len(src_list), 1)
g.edges[src_list, dest_list].data["he"] = edge_repr
def prepare_training(self):
self.log_prob = []
def forward(self, g, dest):
src = g.number_of_nodes() - 1
possible_dests = range(src)
src_embed_expand = g.nodes[src].data["hv"].expand(src, -1)
possible_dests_embed = g.nodes[possible_dests].data["hv"]
dests_scores = self.choose_dest(
torch.cat([possible_dests_embed, src_embed_expand], dim=1)
).view(1, -1)
dests_probs = F.softmax(dests_scores, dim=1)
if not self.training:
dest = Categorical(dests_probs).sample().item()
if not g.has_edges_between(src, dest):
# For undirected graphs, add edges for both directions
# so that you can perform graph propagation.
src_list = [src, dest]
dest_list = [dest, src]
g.add_edges(src_list, dest_list)
self._initialize_edge_repr(g, src_list, dest_list)
self.graph_op["prop"](g)
if self.training:
if dests_probs.nelement() > 1:
self.log_prob.append(
F.log_softmax(dests_scores, dim=1)[:, dest : dest + 1]
)
#######################################################################################
# Putting it together
# ``````````````````````````
#
# You are now ready to have a complete implementation of the model class.
#
class DGMG(DGMGSkeleton):
def __init__(self, v_max, node_hidden_size, num_prop_rounds):
super(DGMG, self).__init__(v_max)
# Graph embedding module
self.graph_embed = GraphEmbed(node_hidden_size)
# Graph propagation module
self.graph_prop = GraphProp(num_prop_rounds, node_hidden_size)
# Actions
self.add_node_agent = AddNode(self.graph_embed, node_hidden_size)
self.add_edge_agent = AddEdge(self.graph_embed, node_hidden_size)
self.choose_dest_agent = ChooseDestAndUpdate(
self.graph_prop, node_hidden_size
)
# Forward functions
self.forward_train = partial(forward_train, self=self)
self.forward_inference = partial(forward_inference, self=self)
@property
def action_step(self):
old_step_count = self.step_count
self.step_count += 1
return old_step_count
def prepare_for_train(self):
self.step_count = 0
self.add_node_agent.prepare_training()
self.add_edge_agent.prepare_training()
self.choose_dest_agent.prepare_training()
def add_node_and_update(self, a=None):
"""Decide if to add a new node.
If a new node should be added, update the graph."""
return self.add_node_agent(self.g, a)
def add_edge_or_not(self, a=None):
"""Decide if a new edge should be added."""
return self.add_edge_agent(self.g, a)
def choose_dest_and_update(self, a=None):
"""Choose destination and connect it to the latest node.
Add edges for both directions and update the graph."""
self.choose_dest_agent(self.g, a)
def get_log_prob(self):
add_node_log_p = torch.cat(self.add_node_agent.log_prob).sum()
add_edge_log_p = torch.cat(self.add_edge_agent.log_prob).sum()
choose_dest_log_p = torch.cat(self.choose_dest_agent.log_prob).sum()
return add_node_log_p + add_edge_log_p + choose_dest_log_p
#######################################################################################
# Below is an animation where a graph is generated on the fly
# after every 10 batches of training for the first 400 batches. You
# can see how the model improves over time and begins generating cycles.
#
# .. figure:: https://user-images.githubusercontent.com/19576924/48929291-60fe3880-ef22-11e8-832a-fbe56656559a.gif
# :alt:
#
# For generative models, you can evaluate performance by checking the percentage
# of valid graphs among the graphs it generates on the fly.
import torch.utils.model_zoo as model_zoo
# Download a pre-trained model state dict for generating cycles with 10-20 nodes.
state_dict = model_zoo.load_url(
"https://data.dgl.ai/model/dgmg_cycles-5a0c40be.pth"
)
model = DGMG(v_max=20, node_hidden_size=16, num_prop_rounds=2)
model.load_state_dict(state_dict)
model.eval()
def is_valid(g):
# Check if g is a cycle having 10-20 nodes.
def _get_previous(i, v_max):
if i == 0:
return v_max
else:
return i - 1
def _get_next(i, v_max):
if i == v_max:
return 0
else:
return i + 1
size = g.number_of_nodes()
if size < 10 or size > 20:
return False
for node in range(size):
neighbors = g.successors(node)
if len(neighbors) != 2:
return False
if _get_previous(node, size - 1) not in neighbors:
return False
if _get_next(node, size - 1) not in neighbors:
return False
return True
num_valid = 0
for i in range(100):
g = model()
num_valid += is_valid(g)
del model
print("Among 100 graphs generated, {}% are valid.".format(num_valid))
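#######################################################################################
# Training itself is outside the scope of this tutorial, but with the
# behavior-cloning objective above it amounts to minimizing ``-log_prob`` over a
# dataset of oracle action sequences. The following is only a minimal sketch under
# that assumption; ``action_sequences``, the learning rate, and the number of
# epochs are placeholders rather than settings taken from the official example.
#
# ::
#
#     import torch.optim as optim
#
#     model = DGMG(v_max=20, node_hidden_size=16, num_prop_rounds=2)
#     optimizer = optim.Adam(model.parameters(), lr=1e-4)
#     model.train()
#     for epoch in range(25):
#         for actions in action_sequences:  # one oracle action sequence per graph
#             optimizer.zero_grad()
#             log_prob = model(actions=actions)  # dispatches to forward_train
#             loss = -log_prob
#             loss.backward()
#             optimizer.step()
#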
#######################################################################################
# For the complete implementation, see the `DGL DGMG example
# <https://github.com/dmlc/dgl/tree/master/examples/pytorch/dgmg>`__.
#
......@@ -17,276 +17,275 @@ offers a different perspective. The tutorial describes how to implement a Capsul
efficiency. For recommended implementation, please refer to the `official
examples <https://github.com/dmlc/dgl/tree/master/examples>`_.
"""
#######################################################################################
# Key ideas of Capsule
# --------------------
#
# The Capsule model offers two key ideas: richer representation and dynamic routing.
#
# **Richer representation** -- In classic convolutional networks, a scalar
# value represents the activation of a given feature. By contrast, a
# capsule outputs a vector. The vector's length represents the probability
# of a feature being present. The vector's orientation represents the
# various properties of the feature (such as pose, deformation, texture
# etc.).
#
# |image0|
#
# **Dynamic routing** -- The output of a capsule is sent to
# certain parents in the layer above based on how well the capsule's
# prediction agrees with that of a parent. Such dynamic
# routing-by-agreement generalizes the static routing of max-pooling.
#
# During training, routing is accomplished iteratively. Each iteration adjusts
# routing weights between capsules based on their observed agreements.
# The process is similar to the k-means algorithm or `competitive
# learning <https://en.wikipedia.org/wiki/Competitive_learning>`__.
#
# In this tutorial, you see how a capsule's dynamic routing algorithm can be
# naturally expressed as a graph algorithm. The implementation is adapted
# from `Cedric
# Chee <https://github.com/cedrickchee/capsule-net-pytorch>`__, replacing
# only the routing layer. This version achieves similar speed and accuracy.
#
# Model implementation
# ----------------------
# Step 1: Setup and graph initialization
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# The connectivity between two layers of capsules forms a directed,
# bipartite graph, as shown in the figure below.
#
# |image1|
#
# Each node :math:`j` is associated with feature :math:`v_j`,
# representing its capsule’s output. Each edge is associated with
# features :math:`b_{ij}` and :math:`\hat{u}_{j|i}`. :math:`b_{ij}`
# determines routing weights, and :math:`\hat{u}_{j|i}` represents the
# prediction of capsule :math:`i` for :math:`j`.
#
# Here's how we set up the graph and initialize node and edge features.
import os
os.environ['DGLBACKEND'] = 'pytorch'
import matplotlib.pyplot as plt
import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as F
import dgl
def init_graph(in_nodes, out_nodes, f_size):
u = np.repeat(np.arange(in_nodes), out_nodes)
v = np.tile(np.arange(in_nodes, in_nodes + out_nodes), in_nodes)
g = dgl.DGLGraph((u, v))
# init states
g.ndata["v"] = th.zeros(in_nodes + out_nodes, f_size)
g.edata["b"] = th.zeros(in_nodes * out_nodes, 1)
return g
#########################################################################################
# Step 2: Define message passing functions
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# This is the pseudocode for Capsule's routing algorithm.
#
# |image2|
# Implement pseudocode lines 4-7 in the class `DGLRoutingLayer` as the following steps:
#
# 1. Calculate coupling coefficients.
#
# - Coefficients are computed with a softmax over all out-edges of the in-capsules.
# :math:`\textbf{c}_{i,j} = \text{softmax}(\textbf{b}_{i,j})`.
#
# 2. Calculate weighted sum over all in-capsules.
#
# - Output of a capsule is equal to the weighted sum of its in-capsules
# :math:`s_j=\sum_i c_{ij}\hat{u}_{j|i}`
#
# 3. Squash outputs.
#
# - Squash the length of a Capsule's output vector to range (0,1), so it can represent the probability (of some feature being present).
# - :math:`v_j=\text{squash}(s_j)=\frac{||s_j||^2}{1+||s_j||^2}\frac{s_j}{||s_j||}`
#
# 4. Update weights by the amount of agreement.
#
# - The scalar product :math:`\hat{u}_{j|i}\cdot v_j` can be considered as how well capsule :math:`i` agrees with :math:`j`. It is used to update
# :math:`b_{ij}=b_{ij}+\hat{u}_{j|i}\cdot v_j`
import dgl.function as fn
class DGLRoutingLayer(nn.Module):
def __init__(self, in_nodes, out_nodes, f_size):
super(DGLRoutingLayer, self).__init__()
self.g = init_graph(in_nodes, out_nodes, f_size)
self.in_nodes = in_nodes
self.out_nodes = out_nodes
self.in_indx = list(range(in_nodes))
self.out_indx = list(range(in_nodes, in_nodes + out_nodes))
def forward(self, u_hat, routing_num=1):
self.g.edata["u_hat"] = u_hat
for r in range(routing_num):
# step 1 (line 4): normalize over out edges
edges_b = self.g.edata["b"].view(self.in_nodes, self.out_nodes)
self.g.edata["c"] = F.softmax(edges_b, dim=1).view(-1, 1)
self.g.edata["c u_hat"] = self.g.edata["c"] * self.g.edata["u_hat"]
# Execute step 1 & 2
self.g.update_all(fn.copy_e("c u_hat", "m"), fn.sum("m", "s"))
# step 3 (line 6)
self.g.nodes[self.out_indx].data["v"] = self.squash(
self.g.nodes[self.out_indx].data["s"], dim=1
)
# step 4 (line 7)
v = th.cat(
[self.g.nodes[self.out_indx].data["v"]] * self.in_nodes, dim=0
)
self.g.edata["b"] = self.g.edata["b"] + (
self.g.edata["u_hat"] * v
).sum(dim=1, keepdim=True)
@staticmethod
def squash(s, dim=1):
sq = th.sum(s**2, dim=dim, keepdim=True)
s_norm = th.sqrt(sq)
s = (sq / (1.0 + sq)) * (s / s_norm)
return s
############################################################################################################
# Step 3: Testing
# ~~~~~~~~~~~~~~~
#
# Make a simple 20x10 capsule layer.
in_nodes = 20
out_nodes = 10
f_size = 4
u_hat = th.randn(in_nodes * out_nodes, f_size)
routing = DGLRoutingLayer(in_nodes, out_nodes, f_size)
############################################################################################################
# You can visualize a Capsule network's behavior by monitoring the entropy
# of coupling coefficients. They should start high and then drop, as the
# weights gradually concentrate on fewer edges.
entropy_list = []
dist_list = []
for i in range(10):
routing(u_hat)
dist_matrix = routing.g.edata["c"].view(in_nodes, out_nodes)
entropy = (-dist_matrix * th.log(dist_matrix)).sum(dim=1)
entropy_list.append(entropy.data.numpy())
dist_list.append(dist_matrix.data.numpy())
stds = np.std(entropy_list, axis=1)
means = np.mean(entropy_list, axis=1)
plt.errorbar(np.arange(len(entropy_list)), means, stds, marker="o")
plt.ylabel("Entropy of Weight Distribution")
plt.xlabel("Number of Routing")
plt.xticks(np.arange(len(entropy_list)))
plt.close()
############################################################################################################
# |image3|
#
# Alternatively, we can also watch the evolution of histograms.
import matplotlib.animation as animation
import seaborn as sns
fig = plt.figure(dpi=150)
fig.clf()
ax = fig.subplots()
def dist_animate(i):
ax.cla()
sns.distplot(dist_list[i].reshape(-1), kde=False, ax=ax)
ax.set_xlabel("Weight Distribution Histogram")
ax.set_title("Routing: %d" % (i))
ani = animation.FuncAnimation(
fig, dist_animate, frames=len(entropy_list), interval=500
)
plt.close()
############################################################################################################
# |image4|
#
# You can monitor how lower-level capsules gradually attach to one of the
# higher-level ones.
import networkx as nx
from networkx.algorithms import bipartite
g = routing.g.to_networkx()
X, Y = bipartite.sets(g)
height_in = 10
height_out = height_in * 0.8
height_in_y = np.linspace(0, height_in, in_nodes)
height_out_y = np.linspace((height_in - height_out) / 2, height_out, out_nodes)
pos = dict()
fig2 = plt.figure(figsize=(8, 3), dpi=150)
fig2.clf()
ax = fig2.subplots()
pos.update(
(n, (i, 1)) for i, n in zip(height_in_y, X)
) # put nodes from X at x=1
pos.update(
(n, (i, 2)) for i, n in zip(height_out_y, Y)
) # put nodes from Y at x=2
def weight_animate(i):
ax.cla()
ax.axis("off")
ax.set_title("Routing: %d " % i)
dm = dist_list[i]
nx.draw_networkx_nodes(
g, pos, nodelist=range(in_nodes), node_color="r", node_size=100, ax=ax
)
nx.draw_networkx_nodes(
g,
pos,
nodelist=range(in_nodes, in_nodes + out_nodes),
node_color="b",
node_size=100,
ax=ax,
)
for edge in g.edges():
nx.draw_networkx_edges(
g,
pos,
edgelist=[edge],
width=dm[edge[0], edge[1] - in_nodes] * 1.5,
ax=ax,
)
ani2 = animation.FuncAnimation(
fig2, weight_animate, frames=len(dist_list), interval=500
)
plt.close()
############################################################################################################
# |image5|
#
# The full code of this visualization is provided on
# `GitHub <https://github.com/dmlc/dgl/blob/master/examples/pytorch/capsule/simple_routing.py>`__. The complete
# code that trains on MNIST is also on `GitHub <https://github.com/dmlc/dgl/tree/tutorial/examples/pytorch/capsule>`__.
#
# .. |image0| image:: https://i.imgur.com/55Ovkdh.png
# .. |image1| image:: https://i.imgur.com/9tc6GLl.png
# .. |image2| image:: https://i.imgur.com/mv1W9Rv.png
# .. |image3| image:: https://i.imgur.com/dMvu7p3.png
# .. |image4| image:: https://github.com/VoVAllen/DGL_Capsule/raw/master/routing_dist.gif
# .. |image5| image:: https://github.com/VoVAllen/DGL_Capsule/raw/master/routing_vis.gif
"""
#######################################################################################
# Key ideas of Capsule
# --------------------
#
# The Capsule model offers two key ideas: Richer representation and dynamic routing.
#
# **Richer representation** -- In classic convolutional networks, a scalar
# value represents the activation of a given feature. By contrast, a
# capsule outputs a vector. The vector's length represents the probability
# of a feature being present. The vector's orientation represents the
# various properties of the feature (such as pose, deformation, texture
# etc.).
#
# |image0|
#
# **Dynamic routing** -- The output of a capsule is sent to
# certain parents in the layer above based on how well the capsule's
# prediction agrees with that of a parent. Such dynamic
# routing-by-agreement generalizes the static routing of max-pooling.
#
# During training, routing is accomplished iteratively. Each iteration adjusts
# routing weights between capsules based on their observed agreements.
# It's a manner similar to a k-means algorithm or `competitive
# learning <https://en.wikipedia.org/wiki/Competitive_learning>`__.
#
# In this tutorial, you see how a capsule's dynamic routing algorithm can be
# naturally expressed as a graph algorithm. The implementation is adapted
# from `Cedric
# Chee <https://github.com/cedrickchee/capsule-net-pytorch>`__, replacing
# only the routing layer. This version achieves similar speed and accuracy.
#
# Model implementation
# ----------------------
# Step 1: Setup and graph initialization
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# The connectivity between two layers of capsules form a directed,
# bipartite graph, as shown in the Figure below.
#
# |image1|
#
# Each node :math:`j` is associated with feature :math:`v_j`,
# representing its capsule’s output. Each edge is associated with
# features :math:`b_{ij}` and :math:`\hat{u}_{j|i}`. :math:`b_{ij}`
# determines routing weights, and :math:`\hat{u}_{j|i}` represents the
# prediction of capsule :math:`i` for :math:`j`.
#
# Here's how we set up the graph and initialize node and edge features.
import os
os.environ["DGLBACKEND"] = "pytorch"
import dgl
import matplotlib.pyplot as plt
import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as F
def init_graph(in_nodes, out_nodes, f_size):
u = np.repeat(np.arange(in_nodes), out_nodes)
v = np.tile(np.arange(in_nodes, in_nodes + out_nodes), in_nodes)
g = dgl.DGLGraph((u, v))
# init states
g.ndata["v"] = th.zeros(in_nodes + out_nodes, f_size)
g.edata["b"] = th.zeros(in_nodes * out_nodes, 1)
return g
#########################################################################################
# Step 2: Define message passing functions
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# This is the pseudocode for Capsule's routing algorithm.
#
# |image2|
# Implement pseudocode lines 4-7 in the class `DGLRoutingLayer` as the following steps:
#
# 1. Calculate coupling coefficients.
#
# - Coefficients are the softmax over all out-edge of in-capsules.
# :math:`\textbf{c}_{i,j} = \text{softmax}(\textbf{b}_{i,j})`.
#
# 2. Calculate weighted sum over all in-capsules.
#
# - Output of a capsule is equal to the weighted sum of its in-capsules
# :math:`s_j=\sum_i c_{ij}\hat{u}_{j|i}`
#
# 3. Squash outputs.
#
# - Squash the length of a Capsule's output vector to range (0,1), so it can represent the probability (of some feature being present).
# - :math:`v_j=\text{squash}(s_j)=\frac{||s_j||^2}{1+||s_j||^2}\frac{s_j}{||s_j||}`
#
# 4. Update weights by the amount of agreement.
#
# - The scalar product :math:`\hat{u}_{j|i}\cdot v_j` can be considered as how well capsule :math:`i` agrees with :math:`j`. It is used to update
# :math:`b_{ij}=b_{ij}+\hat{u}_{j|i}\cdot v_j`
import dgl.function as fn
class DGLRoutingLayer(nn.Module):
def __init__(self, in_nodes, out_nodes, f_size):
super(DGLRoutingLayer, self).__init__()
self.g = init_graph(in_nodes, out_nodes, f_size)
self.in_nodes = in_nodes
self.out_nodes = out_nodes
self.in_indx = list(range(in_nodes))
self.out_indx = list(range(in_nodes, in_nodes + out_nodes))
def forward(self, u_hat, routing_num=1):
self.g.edata["u_hat"] = u_hat
for r in range(routing_num):
# step 1 (line 4): normalize over out edges
edges_b = self.g.edata["b"].view(self.in_nodes, self.out_nodes)
self.g.edata["c"] = F.softmax(edges_b, dim=1).view(-1, 1)
self.g.edata["c u_hat"] = self.g.edata["c"] * self.g.edata["u_hat"]
# Execute step 1 & 2
self.g.update_all(fn.copy_e("c u_hat", "m"), fn.sum("m", "s"))
# step 3 (line 6)
self.g.nodes[self.out_indx].data["v"] = self.squash(
self.g.nodes[self.out_indx].data["s"], dim=1
)
# step 4 (line 7)
v = th.cat(
[self.g.nodes[self.out_indx].data["v"]] * self.in_nodes, dim=0
)
self.g.edata["b"] = self.g.edata["b"] + (
self.g.edata["u_hat"] * v
).sum(dim=1, keepdim=True)
@staticmethod
def squash(s, dim=1):
sq = th.sum(s**2, dim=dim, keepdim=True)
s_norm = th.sqrt(sq)
s = (sq / (1.0 + sq)) * (s / s_norm)
return s
############################################################################################################
# Step 3: Testing
# ~~~~~~~~~~~~~~~
#
# Make a simple 20x10 capsule layer.
in_nodes = 20
out_nodes = 10
f_size = 4
u_hat = th.randn(in_nodes * out_nodes, f_size)
routing = DGLRoutingLayer(in_nodes, out_nodes, f_size)
############################################################################################################
# You can visualize a Capsule network's behavior by monitoring the entropy
# of coupling coefficients. They should start high and then drop, as the
# weights gradually concentrate on fewer edges.
entropy_list = []
dist_list = []
for i in range(10):
routing(u_hat)
dist_matrix = routing.g.edata["c"].view(in_nodes, out_nodes)
entropy = (-dist_matrix * th.log(dist_matrix)).sum(dim=1)
entropy_list.append(entropy.data.numpy())
dist_list.append(dist_matrix.data.numpy())
stds = np.std(entropy_list, axis=1)
means = np.mean(entropy_list, axis=1)
plt.errorbar(np.arange(len(entropy_list)), means, stds, marker="o")
plt.ylabel("Entropy of Weight Distribution")
plt.xlabel("Number of Routing")
plt.xticks(np.arange(len(entropy_list)))
plt.close()
############################################################################################################
# |image3|
#
# Alternatively, we can also watch the evolution of histograms.
import matplotlib.animation as animation
import seaborn as sns
fig = plt.figure(dpi=150)
fig.clf()
ax = fig.subplots()
def dist_animate(i):
ax.cla()
sns.distplot(dist_list[i].reshape(-1), kde=False, ax=ax)
ax.set_xlabel("Weight Distribution Histogram")
ax.set_title("Routing: %d" % (i))
ani = animation.FuncAnimation(
fig, dist_animate, frames=len(entropy_list), interval=500
)
plt.close()
############################################################################################################
# |image4|
#
# You can monitor the how lower-level Capsules gradually attach to one of the
# higher level ones.
import networkx as nx
from networkx.algorithms import bipartite
g = routing.g.to_networkx()
X, Y = bipartite.sets(g)
height_in = 10
height_out = height_in * 0.8
height_in_y = np.linspace(0, height_in, in_nodes)
height_out_y = np.linspace((height_in - height_out) / 2, height_out, out_nodes)
pos = dict()
fig2 = plt.figure(figsize=(8, 3), dpi=150)
fig2.clf()
ax = fig2.subplots()
pos.update(
(n, (i, 1)) for i, n in zip(height_in_y, X)
) # put nodes from X at x=1
pos.update(
(n, (i, 2)) for i, n in zip(height_out_y, Y)
) # put nodes from Y at x=2
def weight_animate(i):
ax.cla()
ax.axis("off")
ax.set_title("Routing: %d " % i)
dm = dist_list[i]
nx.draw_networkx_nodes(
g, pos, nodelist=range(in_nodes), node_color="r", node_size=100, ax=ax
)
nx.draw_networkx_nodes(
g,
pos,
nodelist=range(in_nodes, in_nodes + out_nodes),
node_color="b",
node_size=100,
ax=ax,
)
for edge in g.edges():
nx.draw_networkx_edges(
g,
pos,
edgelist=[edge],
width=dm[edge[0], edge[1] - in_nodes] * 1.5,
ax=ax,
)
ani2 = animation.FuncAnimation(
fig2, weight_animate, frames=len(dist_list), interval=500
)
plt.close()
############################################################################################################
# |image5|
#
# The full code of this visualization is provided on
# `GitHub <https://github.com/dmlc/dgl/blob/master/examples/pytorch/capsule/simple_routing.py>`__. The complete
# code that trains on MNIST is also on `GitHub <https://github.com/dmlc/dgl/tree/tutorial/examples/pytorch/capsule>`__.
#
# .. |image0| image:: https://i.imgur.com/55Ovkdh.png
# .. |image1| image:: https://i.imgur.com/9tc6GLl.png
# .. |image2| image:: https://i.imgur.com/mv1W9Rv.png
# .. |image3| image:: https://i.imgur.com/dMvu7p3.png
# .. |image4| image:: https://github.com/VoVAllen/DGL_Capsule/raw/master/routing_dist.gif
# .. |image5| image:: https://github.com/VoVAllen/DGL_Capsule/raw/master/routing_vis.gif
......@@ -104,7 +104,7 @@ Transformer as a Graph Neural Network
# - ``get_o`` maps the updated value after attention to the output
# :math:`o` for post-processing.
#
# .. code::
#
# class MultiHeadAttention(nn.Module):
# "Multi-Head Attention"
......@@ -146,14 +146,14 @@ Transformer as a Graph Neural Network
#
# Construct the graph by mapping tokens of the source and target
# sentence to nodes. The complete Transformer graph is made up of three
# subgraphs:
#
# **Source language graph**. This is a complete graph; each
# token :math:`s_i` can attend to any other token :math:`s_j` (including
# self-loops). |image0|
# **Target language graph**. The graph is
# half-complete, in that :math:`t_i` attends only to :math:`t_j` if
# :math:`i > j` (an output token cannot depend on future words). |image1|
# **Cross-language graph**. This is a bipartite graph, where there is
# an edge from every source token :math:`s_i` to every target token
# :math:`t_j`, meaning every target token can attend to source tokens.
......@@ -191,7 +191,7 @@ Transformer as a Graph Neural Network
#
# Compute ``score`` and send source node’s ``v`` to destination’s mailbox
#
# .. code::
#
# def message_func(edges):
# return {'score': ((edges.src['k'] * edges.dst['q'])
......@@ -203,7 +203,7 @@ Transformer as a Graph Neural Network
#
# Normalize over all in-edges and take the weighted sum to get the output
#
# .. code::
#
# import torch as th
# import torch.nn.functional as F
......@@ -216,7 +216,7 @@ Transformer as a Graph Neural Network
# Execute on specific edges
# '''''''''''''''''''''''''
#
# .. code::
#
# from functools import partial
# def naive_propagate_attention(self, g, eids):
......@@ -269,7 +269,7 @@ Transformer as a Graph Neural Network
#
# The normalization of :math:`\textrm{wv}` is left to post-processing.
#
# .. code::
#
# def src_dot_dst(src_field, dst_field, out_field):
# def func(edges):
......@@ -338,7 +338,7 @@ Transformer as a Graph Neural Network
#
# where :math:`\textrm{FFN}` refers to the feed forward function.
#
# .. code::
#
# class Encoder(nn.Module):
# def __init__(self, layer, N):
......@@ -501,7 +501,7 @@ Transformer as a Graph Neural Network
# Task and the dataset
# ~~~~~~~~~~~~~~~~~~~~
#
# The Transformer is a general framework for a variety of NLP tasks. This tutorial focuses
# on sequence-to-sequence learning: a typical case that illustrates how it works.
#
# As for the dataset, there are two example tasks: copy and sort, together
......@@ -729,7 +729,7 @@ Transformer as a Graph Neural Network
# with these nodes. The following code shows the Universal Transformer
# class in DGL:
#
# .. code::
#
# class UTransformer(nn.Module):
# "Universal Transformer(https://arxiv.org/pdf/1807.03819.pdf) with ACT(https://arxiv.org/pdf/1603.08983.pdf)."
......@@ -849,10 +849,10 @@ Transformer as a Graph Neural Network
# that are still active:
#
# .. note::
#
# - :func:`~dgl.DGLGraph.filter_nodes` takes a predicate and a node
# ID list/tensor as input, then returns a tensor of node IDs that satisfy
# the given predicate.
# - :func:`~dgl.DGLGraph.filter_edges` takes a predicate
# and an edge ID list/tensor as input, then returns a tensor of edge IDs
# that satisfy the given predicate.
......@@ -883,6 +883,6 @@ Transformer as a Graph Neural Network
#
# .. note::
# The notebook itself is not executable due to many dependencies.
# Download `7_transformer.py <https://data.dgl.ai/tutorial/7_transformer.py>`__,
# and copy the python script to directory ``examples/pytorch/transformer``
# then run ``python 7_transformer.py`` to see how it works.
......@@ -71,7 +71,8 @@ process ID, which should be an integer from `0` to `world_size - 1`.
"""
import os
os.environ["DGLBACKEND"] = "pytorch"
import torch.distributed as dist
......
......@@ -26,63 +26,65 @@ models with multi-GPU with ``DistributedDataParallel``.
######################################################################
# Loading Dataset
# ---------------
#
# OGB already prepared the data as a ``DGLGraph`` object. The following code is
# copy-pasted from the :doc:`Training GNN with Neighbor Sampling for Node
# Classification <../large/L1_large_node_classification>`
# tutorial.
#
import os
os.environ["DGLBACKEND"] = "pytorch"
import dgl
import torch
import numpy as np
import sklearn.metrics
import torch.nn as nn
import torch.nn.functional as F
import tqdm
from dgl.nn import SAGEConv
from ogb.nodeproppred import DglNodePropPredDataset
dataset = DglNodePropPredDataset("ogbn-arxiv")
graph, node_labels = dataset[0]
# Add reverse edges since ogbn-arxiv is unidirectional.
graph = dgl.add_reverse_edges(graph)
graph.ndata["label"] = node_labels[:, 0]
node_features = graph.ndata["feat"]
num_features = node_features.shape[1]
num_classes = (node_labels.max() + 1).item()
idx_split = dataset.get_idx_split()
train_nids = idx_split["train"]
valid_nids = idx_split["valid"]
test_nids = idx_split["test"] # Test node IDs, not used in the tutorial though.
######################################################################
# Defining Model
# --------------
#
# The model will be again identical to the :doc:`Training GNN with Neighbor
# Sampling for Node Classification <../large/L1_large_node_classification>`
# tutorial.
#
class Model(nn.Module):
def __init__(self, in_feats, h_feats, num_classes):
super(Model, self).__init__()
self.conv1 = SAGEConv(in_feats, h_feats, aggregator_type="mean")
self.conv2 = SAGEConv(h_feats, num_classes, aggregator_type="mean")
self.h_feats = h_feats
def forward(self, mfgs, x):
h_dst = x[: mfgs[0].num_dst_nodes()]
h = self.conv1(mfgs[0], (x, h_dst))
h = F.relu(h)
h_dst = h[: mfgs[1].num_dst_nodes()]
h = self.conv2(mfgs[1], (h, h_dst))
return h
......@@ -90,7 +92,7 @@ class Model(nn.Module):
######################################################################
# Defining Training Procedure
# ---------------------------
#
# The training procedure will be slightly different from what you saw
# previously, in the sense that you will need to
#
......@@ -98,45 +100,58 @@ class Model(nn.Module):
# * Wrap your model with ``torch.nn.parallel.DistributedDataParallel``.
# * Add a ``use_ddp=True`` argument to the DGL dataloader you wish to run
# together with DDP.
#
# You will also need to wrap the training loop inside a function so that
# you can spawn subprocesses to run it.
#
def run(proc_id, devices):
# Initialize distributed training context.
dev_id = devices[proc_id]
dist_init_method = "tcp://{master_ip}:{master_port}".format(
master_ip="127.0.0.1", master_port="12345"
)
if torch.cuda.device_count() < 1:
device = torch.device("cpu")
torch.distributed.init_process_group(
backend="gloo",
init_method=dist_init_method,
world_size=len(devices),
rank=proc_id,
)
else:
torch.cuda.set_device(dev_id)
device = torch.device("cuda:" + str(dev_id))
torch.distributed.init_process_group(
backend="nccl",
init_method=dist_init_method,
world_size=len(devices),
rank=proc_id,
)
# Define training and validation dataloader, copied from the previous tutorial
# but with one line of difference: use_ddp to enable distributed data parallel
# data loading.
sampler = dgl.dataloading.NeighborSampler([4, 4])
train_dataloader = dgl.dataloading.DataLoader(
# The following arguments are specific to DataLoader.
graph, # The graph
train_nids, # The node IDs to iterate over in minibatches
sampler, # The neighbor sampler
device=device, # Put the sampled MFGs on CPU or GPU
use_ddp=True, # Make it work with distributed data parallel
# The following arguments are inherited from PyTorch DataLoader.
batch_size=1024, # Per-device batch size.
# The effective batch size is this number times the number of GPUs.
shuffle=True, # Whether to shuffle the nodes for every epoch
drop_last=False, # Whether to drop the last incomplete batch
num_workers=0, # Number of sampler processes
)
valid_dataloader = dgl.dataloading.DataLoader(
graph,
valid_nids,
sampler,
device=device,
use_ddp=False,
batch_size=1024,
......@@ -144,20 +159,24 @@ def run(proc_id, devices):
drop_last=False,
num_workers=0,
)
model = Model(num_features, 128, num_classes).to(device)
# Wrap the model with distributed data parallel module.
if device == torch.device("cpu"):
model = torch.nn.parallel.DistributedDataParallel(
model, device_ids=None, output_device=None
)
else:
model = torch.nn.parallel.DistributedDataParallel(
model, device_ids=[device], output_device=device
)
# Define optimizer
opt = torch.optim.Adam(model.parameters())
best_accuracy = 0
best_model_path = "./model.pt"
# Copied from previous tutorial with changes highlighted.
for epoch in range(10):
model.train()
......@@ -165,8 +184,8 @@ def run(proc_id, devices):
with tqdm.tqdm(train_dataloader) as tq:
for step, (input_nodes, output_nodes, mfgs) in enumerate(tq):
# feature copy from CPU to GPU takes place here
inputs = mfgs[0].srcdata["feat"]
labels = mfgs[-1].dstdata["label"]
predictions = model(mfgs, inputs)
......@@ -175,9 +194,15 @@ def run(proc_id, devices):
loss.backward()
opt.step()
accuracy = sklearn.metrics.accuracy_score(
labels.cpu().numpy(),
predictions.argmax(1).detach().cpu().numpy(),
)
tq.set_postfix(
{"loss": "%.03f" % loss.item(), "acc": "%.03f" % accuracy},
refresh=False,
)
model.eval()
......@@ -187,13 +212,15 @@ def run(proc_id, devices):
labels = []
with tqdm.tqdm(valid_dataloader) as tq, torch.no_grad():
for input_nodes, output_nodes, mfgs in tq:
inputs = mfgs[0].srcdata["feat"]
labels.append(mfgs[-1].dstdata["label"].cpu().numpy())
predictions.append(
model(mfgs, inputs).argmax(1).cpu().numpy()
)
predictions = np.concatenate(predictions)
labels = np.concatenate(labels)
accuracy = sklearn.metrics.accuracy_score(labels, predictions)
print("Epoch {} Validation Accuracy {}".format(epoch, accuracy))
if best_accuracy < accuracy:
best_accuracy = accuracy
torch.save(model.state_dict(), best_model_path)
......@@ -205,7 +232,7 @@ def run(proc_id, devices):
######################################################################
# Spawning Trainer Processes
# --------------------------
#
# A typical scenario for multi-GPU training with DDP is to replicate the
# model once per GPU, and spawn one trainer process per GPU.
#
......@@ -219,15 +246,15 @@ def run(proc_id, devices):
# or ``out_degrees`` is called. To avoid this, you need to create
# all sparse matrix representations beforehand using the ``create_formats_``
# method:
#
graph.create_formats_()
######################################################################
# Then you can spawn the subprocesses to train with multiple GPUs.
#
#
# .. code:: python
#
# # Say you have four GPUs.
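#    # One way the processes could then be launched (a sketch assuming the
#    # run function defined above; not code taken verbatim from the example):
#    import torch.multiprocessing as mp
#
#    num_gpus = 4
#    devices = list(range(num_gpus))
#    # mp.spawn starts nprocs processes and calls run(proc_id, devices) in each
#    # of them, with proc_id ranging over 0 .. nprocs - 1.
#    mp.spawn(run, args=(devices,), nprocs=num_gpus)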
......