Unverified Commit ddc2faa5 authored by Minjie Wang, committed by GitHub

[Bugfix] Fix gat residual bug (#355)

* fix gat residual bug

* fix the residual addition; output heads; add some shape notations;

* minor

* fix the output head average

* add requests package in requirement
parent efae0f97
@@ -4,6 +4,14 @@ Graph Convolutional Networks (GCN)
Paper link: [https://arxiv.org/abs/1609.02907](https://arxiv.org/abs/1609.02907)
Author's code repo: [https://github.com/tkipf/gcn](https://github.com/tkipf/gcn)
Requirements
------------
- requests
```bash
pip install requests
```
Codes
-----
The folder contains two implementations of GCN. `gcn.py` uses user-defined
@@ -47,33 +55,33 @@ new information in the concatenations.
```
# Final accuracy 75.34% MLP without GCN
DGLBACKEND=mxnet python examples/mxnet/gcn/gcn_batch.py --dataset "citeseer" --n-epochs 200 --gpu 1 --n-layers 0
DGLBACKEND=mxnet python examples/mxnet/gcn/gcn_concat.py --dataset "citeseer" --n-epochs 200 --gpu 1 --n-layers 0
# Final accuracy 86.57% with 10-layer GCN (symmetric normalization)
DGLBACKEND=mxnet python examples/mxnet/gcn/gcn_batch.py --dataset "citeseer" --n-epochs 200 --gpu 1 --n-layers 10 --normalization 'sym' --self-loop
DGLBACKEND=mxnet python examples/mxnet/gcn/gcn_concat.py --dataset "citeseer" --n-epochs 200 --gpu 1 --n-layers 10 --normalization 'sym' --self-loop
# Final accuracy 84.42% with 10-layer GCN (unnormalized)
DGLBACKEND=mxnet python examples/mxnet/gcn/gcn_batch.py --dataset "citeseer" --n-epochs 200 --gpu 1 --n-layers 10
DGLBACKEND=mxnet python examples/mxnet/gcn/gcn_concat.py --dataset "citeseer" --n-epochs 200 --gpu 1 --n-layers 10
```
```
# Final accuracy 40.62% MLP without GCN
DGLBACKEND=mxnet python3 examples/mxnet/gcn/gcn_batch.py --dataset "cora" --n-epochs 200 --gpu 1 --n-layers 0
DGLBACKEND=mxnet python3 examples/mxnet/gcn/gcn_concat.py --dataset "cora" --n-epochs 200 --gpu 1 --n-layers 0
# Final accuracy 92.63% with 10-layer GCN (symmetric normalization)
DGLBACKEND=mxnet python3 examples/mxnet/gcn/gcn_batch.py --dataset "cora" --n-epochs 200 --gpu 1 --n-layers 10 --normalization 'sym' --self-loop
DGLBACKEND=mxnet python3 examples/mxnet/gcn/gcn_concat.py --dataset "cora" --n-epochs 200 --gpu 1 --n-layers 10 --normalization 'sym' --self-loop
# Final accuracy 86.60% with 10-layer GCN (unnormalized)
DGLBACKEND=mxnet python3 examples/mxnet/gcn/gcn_batch.py --dataset "cora" --n-epochs 200 --gpu 1 --n-layers 10
DGLBACKEND=mxnet python3 examples/mxnet/gcn/gcn_concat.py --dataset "cora" --n-epochs 200 --gpu 1 --n-layers 10
```
```
# Final accuracy 72.97% MLP without GCN
DGLBACKEND=mxnet python3 examples/mxnet/gcn/gcn_batch.py --dataset "pubmed" --n-epochs 200 --gpu 1 --n-layers 0
DGLBACKEND=mxnet python3 examples/mxnet/gcn/gcn_concat.py --dataset "pubmed" --n-epochs 200 --gpu 1 --n-layers 0
# Final accuracy 88.33% with 10-layer GCN (symmetric normalization)
DGLBACKEND=mxnet python3 examples/mxnet/gcn/gcn_batch.py --dataset "pubmed" --n-epochs 200 --gpu 1 --n-layers 10 --normalization 'sym' --self-loop
DGLBACKEND=mxnet python3 examples/mxnet/gcn/gcn_concat.py --dataset "pubmed" --n-epochs 200 --gpu 1 --n-layers 10 --normalization 'sym' --self-loop
# Final accuracy 83.80% with 10-layer GCN (unnormalized)
DGLBACKEND=mxnet python3 examples/mxnet/gcn/gcn_batch.py --dataset "pubmed" --n-epochs 200 --gpu 1 --n-layers 10
DGLBACKEND=mxnet python3 examples/mxnet/gcn/gcn_concat.py --dataset "pubmed" --n-epochs 200 --gpu 1 --n-layers 10
```
@@ -3,7 +3,7 @@ Graph Attention Networks in DGL using SPMV optimization.
Multiple heads are also batched together for faster training.
Compared with the original paper, this code does not implement
-multiple output attention heads.
+early stopping.
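As a side note on the "multiple heads are batched together" point above: a common way to batch H attention heads, and roughly what the `fc` projection and `reshape` in `forward` further down do, is a single linear layer whose output is split into per-head slices. A minimal sketch with hypothetical sizes (not the exact code in this file):

```python
import torch
import torch.nn as nn

# Hypothetical sizes: N nodes, D input dims, H heads, D' features per head.
N, D, H, Dp = 5, 16, 8, 4
fc = nn.Linear(D, H * Dp, bias=False)   # one weight matrix serves all heads

h = torch.randn(N, D)                   # NxD node features
ft = fc(h).reshape(N, H, Dp)            # NxHxD' per-head features, no Python loop over heads
head_ft = ft.transpose(0, 1)            # HxNxD' head-major layout for batched matmuls
print(head_ft.shape)                    # torch.Size([8, 5, 4])
```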
References
----------
@@ -53,37 +53,38 @@ class GraphAttention(nn.Module):
        self.residual = residual
        if residual:
            if in_dim != out_dim:
-                self.residual_fc = nn.Linear(in_dim, num_heads * out_dim, bias=False)
-                nn.init.xavier_normal_(self.fc.weight.data, gain=1.414)
+                self.res_fc = nn.Linear(in_dim, num_heads * out_dim, bias=False)
+                nn.init.xavier_normal_(self.res_fc.weight.data, gain=1.414)
            else:
-                self.residual_fc = None
+                self.res_fc = None
    def forward(self, inputs):
        # prepare
-        h = inputs
+        h = inputs  # NxD
        if self.feat_drop:
            h = self.feat_drop(h)
-        ft = self.fc(h).reshape((h.shape[0], self.num_heads, -1))
-        head_ft = ft.transpose(0, 1)
-        a1 = torch.bmm(head_ft, self.attn_l).transpose(0, 1)
-        a2 = torch.bmm(head_ft, self.attn_r).transpose(0, 1)
+        ft = self.fc(h).reshape((h.shape[0], self.num_heads, -1))  # NxHxD'
+        head_ft = ft.transpose(0, 1)  # HxNxD'
+        a1 = torch.bmm(head_ft, self.attn_l).transpose(0, 1)  # NxHx1
+        a2 = torch.bmm(head_ft, self.attn_r).transpose(0, 1)  # NxHx1
        if self.feat_drop:
            ft = self.feat_drop(ft)
        self.g.ndata.update({'ft' : ft, 'a1' : a1, 'a2' : a2})
        # 1. compute edge attention
        self.g.apply_edges(self.edge_attention)
-        # 2. compute two results, one is the node features scaled by the dropped,
-        # unnormalized attention values. Another is the normalizer of the attention values.
+        # 2. compute two results: one is the node features scaled by the dropped,
+        # unnormalized attention values; another is the normalizer of the attention values.
        self.g.update_all([fn.src_mul_edge('ft', 'a_drop', 'ft'), fn.copy_edge('a', 'a')],
                          [fn.sum('ft', 'ft'), fn.sum('a', 'z')])
        # 3. apply normalizer
-        ret = self.g.ndata['ft'] / self.g.ndata['z']
+        ret = self.g.ndata['ft'] / self.g.ndata['z']  # NxHxD'
        # 4. residual
        if self.residual:
-            if self.residual_fc:
-                ret = self.residual_fc(h) + ret
+            if self.res_fc is not None:
+                resval = self.res_fc(h).reshape((h.shape[0], self.num_heads, -1))  # NxHxD'
            else:
-                ret = h + ret
+                resval = torch.unsqueeze(h, 1)  # Nx1xD'
+            ret = resval + ret
        return ret

    def edge_attention(self, edges):
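The residual fix in `forward` above is essentially a shape fix: the attention output `ret` is NxHxD', so the residual must either be projected and reshaped to NxHxD' (`res_fc`) or left as Nx1xD' and broadcast across the head dimension (`torch.unsqueeze(h, 1)`). A small sketch of that broadcast, with hypothetical sizes:

```python
import torch

# Hypothetical sizes: N nodes, H heads, D' per-head output dim (equal to in_dim here).
N, H, Dp = 4, 8, 16
ret = torch.randn(N, H, Dp)        # attention output, NxHxD'
h = torch.randn(N, Dp)             # layer input, NxD

resval = torch.unsqueeze(h, 1)     # Nx1xD'; broadcasts over the head dimension
out = resval + ret                 # NxHxD': every head receives the same residual
print(out.shape)                   # torch.Size([4, 8, 16])
```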
@@ -101,7 +102,7 @@ class GAT(nn.Module):
                 in_dim,
                 num_hidden,
                 num_classes,
-                 num_heads,
+                 heads,
                 activation,
                 feat_drop,
                 attn_drop,
@@ -114,16 +115,16 @@ class GAT(nn.Module):
        self.activation = activation
        # input projection (no residual)
        self.gat_layers.append(GraphAttention(
-            g, in_dim, num_hidden, num_heads, feat_drop, attn_drop, alpha, False))
+            g, in_dim, num_hidden, heads[0], feat_drop, attn_drop, alpha, False))
        # hidden layers
-        for l in range(num_layers - 1):
+        for l in range(1, num_layers):
            # due to multi-head, the in_dim = num_hidden * num_heads
            self.gat_layers.append(GraphAttention(
-                g, num_hidden * num_heads, num_hidden, num_heads,
+                g, num_hidden * heads[l-1], num_hidden, heads[l],
                feat_drop, attn_drop, alpha, residual))
        # output projection
        self.gat_layers.append(GraphAttention(
-            g, num_hidden * num_heads, num_classes, 8,
+            g, num_hidden * heads[-2], num_classes, heads[-1],
            feat_drop, attn_drop, alpha, residual))

    def forward(self, inputs):
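With `heads` now a per-layer list instead of a single `num_heads`, each layer's input width is the previous layer's per-head size times the previous layer's head count (hidden layers concatenate their heads). A sketch of that arithmetic under hypothetical settings (`num_heads=8`, `num_out_heads=1`, `num_hidden=8`, two layers, Cora-like feature/class counts):

```python
# Hypothetical configuration; mirrors the in_dim arithmetic in GAT.__init__ above.
heads = [8, 8, 1]                  # [args.num_heads] * num_layers + [args.num_out_heads]
num_hidden, in_dim, num_classes = 8, 1433, 7

layers = [(in_dim, num_hidden, heads[0])]                              # input projection
for l in range(1, len(heads) - 1):                                     # hidden layers
    layers.append((num_hidden * heads[l - 1], num_hidden, heads[l]))
layers.append((num_hidden * heads[-2], num_classes, heads[-1]))        # output projection

for i, (d_in, d_out, k) in enumerate(layers):
    print("layer %d: in=%d, out=%d per head, heads=%d" % (i, d_in, d_out, k))
# layer 0: in=1433, out=8 per head, heads=8
# layer 1: in=64, out=8 per head, heads=8
# layer 2: in=64, out=7 per head, heads=1
```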
@@ -132,7 +133,7 @@ class GAT(nn.Module):
            h = self.gat_layers[l](h).flatten(1)
            h = self.activation(h)
        # output projection
-        logits = self.gat_layers[-1](h).sum(1)
+        logits = self.gat_layers[-1](h).mean(1)
        return logits

def accuracy(logits, labels):
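The `sum(1)` to `mean(1)` change above averages the output heads rather than summing them, which is how the GAT paper combines heads in the final prediction layer (averaging instead of concatenation). Shape-wise, with hypothetical sizes:

```python
import torch

# Hypothetical sizes: N nodes, K output heads, C classes.
N, K, C = 4, 3, 7
out = torch.randn(N, K, C)     # per-head class scores from the last GAT layer
logits = out.mean(1)           # NxC: average over the K output heads
print(logits.shape)            # torch.Size([4, 7])
```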
@@ -187,12 +188,13 @@ def main(args):
    # add self loop
    g.add_edges(g.nodes(), g.nodes())
    # create model
+    heads = ([args.num_heads] * args.num_layers) + [args.num_out_heads]
    model = GAT(g,
                args.num_layers,
                num_feats,
                args.num_hidden,
                n_classes,
-                args.num_heads,
+                heads,
                F.elu,
                args.in_drop,
                args.attn_drop,
@@ -5,9 +5,17 @@ Graph Convolutional Networks (GCN)
- Author's code repo: [https://github.com/tkipf/gcn](https://github.com/tkipf/gcn). Note that the original code is
implemented with Tensorflow for the paper.
Requirements
------------
- requests
```bash
pip install requests
```
Codes
-----
-The folder contains two implementations of GCN. `gcn_batch.py` uses user-defined
+The folder contains two implementations of GCN. `gcn.py` uses user-defined
message and reduce functions. `gcn_spmv.py` uses DGL's builtin functions so
that the SPMV optimization can be applied.
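For readers comparing the two files: a user-defined GCN layer in this style supplies its own message and reduce functions to `update_all`, along the lines of the hedged sketch below (illustrative only, not the exact contents of `gcn.py`); `gcn_spmv.py` would instead use DGL builtins such as `fn.copy_src`/`fn.sum` so the message passing can be fused into a sparse matrix-vector multiply.

```python
import torch
import torch.nn as nn

def gcn_msg(edges):
    # message: each edge carries its source node's feature
    return {'m': edges.src['h']}

def gcn_reduce(nodes):
    # reduce: sum the incoming messages at each node
    return {'h': torch.sum(nodes.mailbox['m'], dim=1)}

class GCNLayer(nn.Module):
    """One graph convolution built from user-defined message/reduce functions."""
    def __init__(self, in_dim, out_dim):
        super(GCNLayer, self).__init__()
        self.linear = nn.Linear(in_dim, out_dim)

    def forward(self, g, feature):
        g.ndata['h'] = feature
        g.update_all(gcn_msg, gcn_reduce)     # run message passing over the whole graph
        return self.linear(g.ndata.pop('h'))
```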
@@ -625,8 +625,6 @@ class GraphIndex(object):
            x = -F.ones((n_entries,), dtype=F.float32, ctx=ctx)
            y = F.ones((n_entries,), dtype=F.float32, ctx=ctx)
            dat = F.cat([x, y], dim=0)
-            print(idx)
-            print(dat)
            inc, shuffle_idx = F.sparse_matrix(dat, ('coo', idx), (n, m))
        else:
            raise DGLError('Invalid incidence matrix type: %s' % str(typestr))
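For context on the surrounding code: the cleaned-up block builds a signed incidence matrix in COO form, with a -1 entry at each edge's source row and a +1 at its destination row, before handing the triplets to the backend's `F.sparse_matrix`. A hedged standalone illustration using SciPy instead of the DGL backend, on a hypothetical tiny graph:

```python
import numpy as np
from scipy.sparse import coo_matrix

# Hypothetical 3-node graph with edges 0->1 and 1->2.
src = np.array([0, 1])
dst = np.array([1, 2])
n_nodes, n_edges = 3, 2

rows = np.concatenate([src, dst])                              # node index of each entry
cols = np.concatenate([np.arange(n_edges)] * 2)                # edge index of each entry
data = np.concatenate([-np.ones(n_edges), np.ones(n_edges)])   # -1 at source, +1 at destination
inc = coo_matrix((data, (rows, cols)), shape=(n_nodes, n_edges))
print(inc.toarray())
# [[-1.  0.]
#  [ 1. -1.]
#  [ 0.  1.]]
```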