Unverified Commit 4ef01dbb authored by xiang song(charlie.song), committed by GitHub

[Example] Rgcn support ogbn-mag dataset. (#1812)



* rgcn support ogbn-mag dataset

* upd

* multi-gpu val and test

* Fix

* fix

* Add support for ogbn-mag

* Fix

* Fix

* Fix

* Fix

* Add layer_norm

* update

* Fix merge

* Clean some code

* update Readme

* upd
Co-authored-by: Ubuntu <ubuntu@ip-172-31-68-185.ec2.internal>
Co-authored-by: Ubuntu <ubuntu@ip-172-31-87-240.ec2.internal>
Co-authored-by: Ubuntu <ubuntu@ip-172-31-51-214.ec2.internal>
parent e7515773
......@@ -40,7 +40,7 @@ python3 entity_classify.py -d am --n-bases=40 --n-hidden=10 --l2norm=5e-4 --test
### Entity Classification with minibatch
AIFB: accuracy avg(5 runs) 90.56%, best 94.44% (DGL)
```
python3 entity_classify_mp.py -d aifb --testing --gpu 0 --fanout=20 --batch-size 128
python3 entity_classify_mp.py -d aifb --testing --gpu 0 --fanout='20,20' --batch-size 128
```
MUTAG: accuracy avg(5 runs) 66.77%, best 69.12% (DGL)
......@@ -49,16 +49,30 @@ python3 entity_classify_mp.py -d mutag --l2norm 5e-4 --n-bases 30 --testing --gp
```
BGS: accuracy avg(5 runs) 91.72%, best 96.55% (DGL)
```
python3 entity_classify_mp.py -d bgs --l2norm 5e-4 --n-bases 40 --testing --gpu 0 --fanout 40 --n-epochs=40 --batch-size=128
python3 entity_classify_mp.py -d bgs --l2norm 5e-4 --n-bases 40 --testing --gpu 0 --fanout '40,40' --n-epochs=40 --batch-size=128
```
AM: accuracy avg(5 runs) 88.28%, best 90.40% (DGL)
```
python3 entity_classify_mp.py -d am --l2norm 5e-4 --n-bases 40 --testing --gpu 0 --fanout 35 --batch-size 256 --lr 1e-2 --n-hidden 16 --use-self-loop --n-epochs=40
python3 entity_classify_mp.py -d am --l2norm 5e-4 --n-bases 40 --testing --gpu 0 --fanout '35,35' --batch-size 256 --lr 1e-2 --n-hidden 16 --use-self-loop --n-epochs=40
```
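Note that the minibatch commands above now pass `--fanout` as a comma-separated list with one value per layer (e.g. `'20,20'` for a two-layer model) rather than a single integer. As a rough sketch (the exact parsing in `entity_classify_mp.py` may differ), such a string can be turned into a per-layer DGL neighbor sampler like this:
```
import dgl

# Hypothetical parsing of a per-layer fanout string such as '20,20'.
fanouts = [int(f) for f in '20,20'.split(',')]   # one fanout per RGCN layer

# MultiLayerNeighborSampler samples fanouts[i] neighbors at layer i.
sampler = dgl.dataloading.MultiLayerNeighborSampler(fanouts)
```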
### Entity Classification on OGBN-MAG
Test bed: P3-8xlarge
OGBN-MAG accuracy 46.22
```
python3 entity_classify_mp.py -d ogbn-mag --testing --fanout='25,30' --batch-size 512 --n-hidden 64 --lr 0.01 --num-worker 0 --eval-batch-size 8 --low-mem --gpu 0,1,2,3,4,5,6,7 --dropout 0.5 --use-self-loop --n-bases 2 --n-epochs 3 --mix-cpu-gpu --node-feats --layer-norm
```
OGBN-MAG without node-feats: accuracy 43.24
```
python3 entity_classify_mp.py -d ogbn-mag --testing --fanout='25,25' --batch-size 256 --n-hidden 64 --lr 0.01 --num-worker 0 --eval-batch-size 8 --low-mem --gpu 0,1,2,3,4,5,6,7 --dropout 0.5 --use-self-loop --n-bases 2 --n-epochs 3 --mix-cpu-gpu --layer-norm
```
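For reference, ogbn-mag is a heterogeneous citation graph distributed through the OGB package. A minimal sketch of loading it with OGB's DGL wrapper is shown below; the split handling inside `entity_classify_mp.py` may differ.
```
from ogb.nodeproppred import DglNodePropPredDataset

# Download (on first use) and load ogbn-mag as a DGL heterograph.
dataset = DglNodePropPredDataset(name='ogbn-mag')
graph, labels = dataset[0]            # labels is a dict keyed by node type ('paper')
split_idx = dataset.get_idx_split()   # train/valid/test indices, also per node type
train_idx = split_idx['train']['paper']
```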
Test bed: P2-8xlarge
### Link Prediction
FB15k-237: MRR 0.151 (DGL), 0.158 (paper)
```
......
This diff is collapsed.
......@@ -61,7 +61,7 @@ class RelGraphEmbedLayer(nn.Module):
num_of_ntype : int
Number of node types
input_size : list of int
A list of input feature sizes for each node type. If an entry is None, the
input feature of the corresponding node type is treated as a one-hot encoding feature.
embed_size : int
Output embed size
......@@ -91,16 +91,15 @@ class RelGraphEmbedLayer(nn.Module):
for ntype in range(num_of_ntype):
if input_size[ntype] is not None:
loc = node_tids == ntype
input_emb_size = node_tids[loc].shape[0]
input_emb_size = input_size[ntype].shape[1]
embed = nn.Parameter(th.Tensor(input_emb_size, self.embed_size))
nn.init.xavier_uniform_(embed, gain=nn.init.calculate_gain('relu'))
nn.init.xavier_uniform_(embed)
self.embeds[str(ntype)] = embed
self.node_embeds = th.nn.Embedding(node_tids.shape[0], self.embed_size, sparse=self.sparse_emb)
nn.init.uniform_(self.node_embeds.weight, -1.0, 1.0)
def forward(self, node_ids, node_tids, features):
def forward(self, node_ids, node_tids, type_ids, features):
"""Forward computation
Parameters
----------
......@@ -111,19 +110,21 @@ class RelGraphEmbedLayer(nn.Module):
features : list of features
list of initial features for nodes belonging to different node types.
If an entry is None, the corresponding feature is a one-hot encoding feature;
otherwise the features are used directly as input and multiplied by a
projection matrix.
Returns
-------
tensor
embeddings as the input of the next layer
"""
tsd_idx = node_ids < self.num_nodes
tsd_ids = node_ids[tsd_idx]
embeds = self.node_embeds(tsd_ids)
tsd_ids = node_ids.to(self.node_embeds.weight.device)
embeds = th.empty(node_ids.shape[0], self.embed_size, device=self.dev_id)
for ntype in range(self.num_of_ntype):
if features[ntype] is not None:
loc = node_tids == ntype
embeds[loc] = features[ntype] @ self.embeds[str(ntype)]
embeds[loc] = features[ntype][type_ids[loc]].to(self.dev_id) @ self.embeds[str(ntype)].to(self.dev_id)
else:
loc = node_tids == ntype
embeds[loc] = self.node_embeds(tsd_ids[loc]).to(self.dev_id)
return embeds.to(self.dev_id)
return embeds
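The reworked `forward` above builds the embedding tensor type by type: node types that carry raw features are projected through a learned per-type matrix indexed by `type_ids`, while featureless types fall back to a learnable `nn.Embedding` lookup. A condensed, single-device sketch of that pattern follows; the helper name `embed_nodes` and the example sizes are hypothetical, not the exact DGL example code.
```
import torch as th
import torch.nn as nn

def embed_nodes(node_ids, node_tids, type_ids, features,
                per_type_proj, node_embeds, embed_size):
    """Illustrative per-type embedding lookup (simplified sketch)."""
    embeds = th.empty(node_ids.shape[0], embed_size)
    for ntype, feat in enumerate(features):
        loc = node_tids == ntype
        if feat is not None:
            # node types with raw features: project them into the shared embedding space
            embeds[loc] = feat[type_ids[loc]] @ per_type_proj[str(ntype)]
        else:
            # featureless node types: learnable embedding indexed by node id
            embeds[loc] = node_embeds(node_ids[loc])
    return embeds

# Hypothetical wiring: 2 node types, type 0 has 128-d features, type 1 has none.
embed_size = 16
per_type_proj = nn.ParameterDict({'0': nn.Parameter(th.randn(128, embed_size))})
node_embeds = nn.Embedding(1000, embed_size)
```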
......@@ -61,6 +61,8 @@ class RelGraphConv(gluon.Block):
Default: False.
dropout : float, optional
Dropout rate. Default: 0.0
layer_norm : bool, optional
If True, apply layer normalization. Default: False
"""
def __init__(self,
in_feat,
......@@ -72,7 +74,8 @@ class RelGraphConv(gluon.Block):
activation=None,
self_loop=False,
low_mem=False,
dropout=0.0):
dropout=0.0,
layer_norm=False):
super(RelGraphConv, self).__init__()
self.in_feat = in_feat
self.out_feat = out_feat
......@@ -86,6 +89,7 @@ class RelGraphConv(gluon.Block):
self.self_loop = self_loop
assert low_mem is False, 'MXNet currently does not support low-memory implementation.'
assert layer_norm is False, 'MXNet currently does not support layer norm.'
if regularizer == "basis":
# add basis weights
......
......@@ -59,6 +59,8 @@ class RelGraphConv(nn.Module):
Turn it on when you encounter an OOM problem during training or evaluation.
dropout : float, optional
Dropout rate. Default: 0.0
layer_norm : bool, optional
If True, apply layer normalization. Default: False
"""
def __init__(self,
in_feat,
......@@ -70,7 +72,8 @@ class RelGraphConv(nn.Module):
activation=None,
self_loop=False,
low_mem=False,
dropout=0.0):
dropout=0.0,
layer_norm=False):
super(RelGraphConv, self).__init__()
self.in_feat = in_feat
self.out_feat = out_feat
......@@ -83,6 +86,7 @@ class RelGraphConv(nn.Module):
self.activation = activation
self.self_loop = self_loop
self.low_mem = low_mem
self.layer_norm = layer_norm
if regularizer == "basis":
# add basis weights
......@@ -120,6 +124,10 @@ class RelGraphConv(nn.Module):
self.h_bias = nn.Parameter(th.Tensor(out_feat))
nn.init.zeros_(self.h_bias)
# layer norm
if self.layer_norm:
self.layer_norm_weight = nn.LayerNorm(out_feat, elementwise_affine=True)
# weight for self loop
if self.self_loop:
self.loop_weight = nn.Parameter(th.Tensor(in_feat, out_feat))
......@@ -219,6 +227,8 @@ class RelGraphConv(nn.Module):
g.update_all(self.message_func, fn.sum(msg='msg', out='h'))
# apply bias and activation
node_repr = g.dstdata['h']
if self.layer_norm:
node_repr = self.layer_norm_weight(node_repr)
if self.bias:
node_repr = node_repr + self.h_bias
if self.self_loop:
......
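The PyTorch hunks above add an optional layer normalization step that is applied to the aggregated messages before the bias, self-loop, and activation terms. A minimal sketch of that ordering, assuming the normalized dimension is `out_feat` (the layer's output size), looks like this:
```
import torch as th
import torch.nn as nn

out_feat = 64
layer_norm_weight = nn.LayerNorm(out_feat, elementwise_affine=True)
h_bias = nn.Parameter(th.zeros(out_feat))

def finalize(node_repr, self_loop_term=None, activation=None, layer_norm=True):
    # Ordering used in the diff: layer norm -> bias -> self-loop -> activation.
    if layer_norm:
        node_repr = layer_norm_weight(node_repr)
    node_repr = node_repr + h_bias
    if self_loop_term is not None:
        node_repr = node_repr + self_loop_term
    if activation is not None:
        node_repr = activation(node_repr)
    return node_repr
```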
......@@ -59,6 +59,8 @@ class RelGraphConv(layers.Layer):
Turn it on when you encounter an OOM problem during training or evaluation.
dropout : float, optional
Dropout rate. Default: 0.0
layer_norm : bool, optional
If True, apply layer normalization. Default: False
"""
def __init__(self,
......@@ -71,7 +73,8 @@ class RelGraphConv(layers.Layer):
activation=None,
self_loop=False,
low_mem=False,
dropout=0.0):
dropout=0.0,
layer_norm=False):
super(RelGraphConv, self).__init__()
self.in_feat = in_feat
self.out_feat = out_feat
......@@ -85,6 +88,8 @@ class RelGraphConv(layers.Layer):
self.self_loop = self_loop
self.low_mem = low_mem
assert layer_norm is False, 'TensorFlow currently does not support layer norm.'
xinit = tf.keras.initializers.glorot_uniform()
zeroinit = tf.keras.initializers.zeros()
......