OpenDAS / dgl / Commits / 9836f78e

Unverified commit 9836f78e, authored Feb 19, 2023 by Hongzhi (Steve), Chen and committed by GitHub on Feb 19, 2023.

    autoformat (#5322)

    Co-authored-by: Ubuntu <ubuntu@ip-172-31-28-63.ap-northeast-1.compute.internal>

Parent: 704bcaf6

Changes: 57. Showing 17 changed files with 528 additions and 350 deletions (+528 -350).
Changed files:

  benchmarks/benchmarks/model_acc/bench_sage.py (+1, -2)
  benchmarks/benchmarks/model_acc/bench_sage_ns.py (+45, -35)
  benchmarks/benchmarks/model_speed/bench_gat.py (+2, -2)
  benchmarks/benchmarks/model_speed/bench_gat_ns.py (+73, -49)
  benchmarks/benchmarks/model_speed/bench_gcn_udf.py (+2, -2)
  benchmarks/benchmarks/model_speed/bench_pinsage.py (+3, -3)
  benchmarks/benchmarks/model_speed/bench_rgcn_hetero_ns.py (+155, -79)
  benchmarks/benchmarks/model_speed/bench_rgcn_homogeneous_ns.py (+143, -87)
  benchmarks/benchmarks/model_speed/bench_sage.py (+2, -2)
  benchmarks/benchmarks/model_speed/bench_sage_ns.py (+33, -29)
  benchmarks/benchmarks/model_speed/bench_sage_unsupervised_ns.py (+4, -4)
  benchmarks/benchmarks/multigpu/bench_multigpu_rgcn.py (+3, -3)
  benchmarks/benchmarks/multigpu/bench_multigpu_sage.py (+58, -46)
  benchmarks/benchmarks/multigpu/rgcn_model.py (+1, -2)
  benchmarks/benchmarks/rgcn.py (+1, -2)
  benchmarks/benchmarks/utils.py (+2, -2)
  benchmarks/scripts/replace_branch.py (+0, -1)
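The commit message says only "autoformat"; the page does not name the formatter, although the result is the kind of output typically produced by an import sorter plus a code formatter such as isort and black. Every hunk below follows the same mechanical pattern: single-quoted strings become double-quoted, long signatures and calls are re-wrapped with one argument per line and a trailing comma, and imports are sorted and regrouped. The sketch below illustrates that pattern on a made-up helper; the function name and arguments are hypothetical and do not appear in the diff.

# Hypothetical before/after sketch of the conventions applied in this commit.
#
# Before:
#   import dgl
#   import torch as th
#   import time
#
#   def make_loader(g, train_nid, sampler, batch_size, num_workers):
#       return dgl.dataloading.DataLoader(g, train_nid, sampler,
#                                         batch_size=batch_size,
#                                         shuffle=True,
#                                         drop_last=False,
#                                         num_workers=num_workers)

# After: imports sorted and grouped (stdlib, then third-party), double quotes,
# and calls that no longer fit on one line get one argument per line with a
# trailing comma.
import time

import dgl


def make_loader(g, train_nid, sampler, batch_size, num_workers):
    # Each keyword argument sits on its own line, closed by a dedented ")".
    return dgl.dataloading.DataLoader(
        g,
        train_nid,
        sampler,
        batch_size=batch_size,
        shuffle=True,
        drop_last=False,
        num_workers=num_workers,
    )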
benchmarks/benchmarks/model_acc/bench_sage.py

-import dgl
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+import dgl
 from dgl.nn.pytorch import SAGEConv
 from .. import utils
 ...
benchmarks/benchmarks/model_acc/bench_sage_ns.py

+import time
 import dgl
+import dgl.nn.pytorch as dglnn
 import torch as th
+import torch.multiprocessing as mp
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.optim as optim
-import torch.multiprocessing as mp
 from torch.utils.data import DataLoader
-import dgl.nn.pytorch as dglnn
-import time
 from .. import utils

 class SAGE(nn.Module):
-    def __init__(self,
-                 in_feats,
-                 n_hidden,
-                 n_classes,
-                 n_layers,
-                 activation,
-                 dropout):
+    def __init__(
+        self, in_feats, n_hidden, n_classes, n_layers, activation, dropout
+    ):
         super().__init__()
         self.n_layers = n_layers
         self.n_hidden = n_hidden
         self.n_classes = n_classes
         self.layers = nn.ModuleList()
-        self.layers.append(dglnn.SAGEConv(in_feats, n_hidden, 'mean'))
+        self.layers.append(dglnn.SAGEConv(in_feats, n_hidden, "mean"))
         for i in range(1, n_layers - 1):
-            self.layers.append(dglnn.SAGEConv(n_hidden, n_hidden, 'mean'))
-        self.layers.append(dglnn.SAGEConv(n_hidden, n_classes, 'mean'))
+            self.layers.append(dglnn.SAGEConv(n_hidden, n_hidden, "mean"))
+        self.layers.append(dglnn.SAGEConv(n_hidden, n_classes, "mean"))
         self.dropout = nn.Dropout(dropout)
         self.activation = activation
 ...
@@ -55,8 +52,10 @@ class SAGE(nn.Module):
         # on each layer are of course splitted in batches.
         # TODO: can we standardize this?
         for l, layer in enumerate(self.layers):
-            y = th.zeros(g.number_of_nodes(), self.n_hidden if l !=
-                         len(self.layers) - 1 else self.n_classes)
+            y = th.zeros(
+                g.number_of_nodes(),
+                self.n_hidden if l != len(self.layers) - 1 else self.n_classes,
+            )
             sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
             dataloader = dgl.dataloading.DataLoader(
 ...
@@ -66,7 +65,8 @@ class SAGE(nn.Module):
                 batch_size=batch_size,
                 shuffle=True,
                 drop_last=False,
-                num_workers=4)
+                num_workers=4,
+            )
             for input_nodes, output_nodes, blocks in dataloader:
                 block = blocks[0]
 ...
@@ -113,20 +113,20 @@ def load_subtensor(g, seeds, input_nodes, device):
     """
     Copys features and labels of a set of nodes onto GPU.
     """
-    batch_inputs = g.ndata['features'][input_nodes].to(device)
-    batch_labels = g.ndata['labels'][seeds].to(device)
+    batch_inputs = g.ndata["features"][input_nodes].to(device)
+    batch_labels = g.ndata["labels"][seeds].to(device)
     return batch_inputs, batch_labels

-@utils.benchmark('acc', 600)
-@utils.parametrize('data', ['ogbn-products', "reddit"])
+@utils.benchmark("acc", 600)
+@utils.parametrize("data", ["ogbn-products", "reddit"])
 def track_acc(data):
     data = utils.process_data(data)
     device = utils.get_bench_device()
     g = data[0]
-    g.ndata['features'] = g.ndata['feat']
-    g.ndata['labels'] = g.ndata['label']
-    in_feats = g.ndata['features'].shape[1]
+    g.ndata["features"] = g.ndata["feat"]
+    g.ndata["labels"] = g.ndata["label"]
+    in_feats = g.ndata["features"].shape[1]
     n_classes = data.num_classes
     # Create csr/coo/csc formats before launching training processes with multi-gpu.
 ...
@@ -136,17 +136,18 @@ def track_acc(data):
     num_epochs = 20
     num_hidden = 16
     num_layers = 2
-    fan_out = '5,10'
+    fan_out = "5,10"
     batch_size = 1024
     lr = 0.003
     dropout = 0.5
     num_workers = 4
-    train_nid = th.nonzero(g.ndata['train_mask'], as_tuple=True)[0]
+    train_nid = th.nonzero(g.ndata["train_mask"], as_tuple=True)[0]
     # Create PyTorch DataLoader for constructing blocks
     sampler = dgl.dataloading.MultiLayerNeighborSampler(
-        [int(fanout) for fanout in fan_out.split(',')])
+        [int(fanout) for fanout in fan_out.split(",")]
+    )
     dataloader = dgl.dataloading.DataLoader(
         g,
         train_nid,
 ...
@@ -154,7 +155,8 @@ def track_acc(data):
         batch_size=batch_size,
         shuffle=True,
         drop_last=False,
-        num_workers=num_workers)
+        num_workers=num_workers,
+    )
     # Define model and optimizer
     model = SAGE(in_feats, num_hidden, n_classes, num_layers, F.relu, dropout)
 ...
@@ -166,10 +168,10 @@ def track_acc(data):
     # dry run one epoch
     for step, (input_nodes, seeds, blocks) in enumerate(dataloader):
         # Load the input features as well as output labels
-        #batch_inputs, batch_labels = load_subtensor(g, seeds, input_nodes, device)
+        # batch_inputs, batch_labels = load_subtensor(g, seeds, input_nodes, device)
         blocks = [block.int().to(device) for block in blocks]
-        batch_inputs = blocks[0].srcdata['features']
-        batch_labels = blocks[-1].dstdata['labels']
+        batch_inputs = blocks[0].srcdata["features"]
+        batch_labels = blocks[-1].dstdata["labels"]
         # Compute loss and prediction
         batch_pred = model(blocks, batch_inputs)
 ...
@@ -184,10 +186,10 @@ def track_acc(data):
         # blocks.
         for step, (input_nodes, seeds, blocks) in enumerate(dataloader):
             # Load the input features as well as output labels
-            #batch_inputs, batch_labels = load_subtensor(g, seeds, input_nodes, device)
+            # batch_inputs, batch_labels = load_subtensor(g, seeds, input_nodes, device)
             blocks = [block.int().to(device) for block in blocks]
-            batch_inputs = blocks[0].srcdata['features']
-            batch_labels = blocks[-1].dstdata['labels']
+            batch_inputs = blocks[0].srcdata["features"]
+            batch_labels = blocks[-1].dstdata["labels"]
             # Compute loss and prediction
             batch_pred = model(blocks, batch_inputs)
 ...
@@ -198,8 +200,16 @@ def track_acc(data):
     test_g = g
     test_nid = th.nonzero(
-        ~(test_g.ndata['train_mask'] | test_g.ndata['val_mask']), as_tuple=True)[0]
+        ~(test_g.ndata["train_mask"] | test_g.ndata["val_mask"]), as_tuple=True
+    )[0]
     test_acc = evaluate(
-        model, test_g, test_g.ndata['features'], test_g.ndata['labels'], test_nid, batch_size, device)
+        model,
+        test_g,
+        test_g.ndata["features"],
+        test_g.ndata["labels"],
+        test_nid,
+        batch_size,
+        device,
+    )
     return test_acc.item()
benchmarks/benchmarks/model_speed/bench_gat.py

 import time
+import dgl
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-import dgl
 from dgl.nn.pytorch import GATConv
 from .. import utils
 ...
benchmarks/benchmarks/model_speed/bench_gat_ns.py

+import time
+import traceback
 import dgl
+import dgl.nn.pytorch as dglnn
 import torch as th
+import torch.multiprocessing as mp
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.optim as optim
-import torch.multiprocessing as mp
 from torch.utils.data import DataLoader
-import dgl.nn.pytorch as dglnn
-import time
-import traceback
 from .. import utils

 class GAT(nn.Module):
-    def __init__(self,
-                 in_feats,
-                 num_heads,
-                 n_hidden,
-                 n_classes,
-                 n_layers,
-                 activation,
-                 dropout=0.):
+    def __init__(
+        self,
+        in_feats,
+        num_heads,
+        n_hidden,
+        n_classes,
+        n_layers,
+        activation,
+        dropout=0.0,
+    ):
         super().__init__()
         self.n_layers = n_layers
         self.n_hidden = n_hidden
         self.n_classes = n_classes
         self.layers = nn.ModuleList()
         self.num_heads = num_heads
-        self.layers.append(dglnn.GATConv(in_feats,
-                                         n_hidden,
-                                         num_heads=num_heads,
-                                         feat_drop=dropout,
-                                         attn_drop=dropout,
-                                         activation=activation,
-                                         negative_slope=0.2))
+        self.layers.append(
+            dglnn.GATConv(
+                in_feats,
+                n_hidden,
+                num_heads=num_heads,
+                feat_drop=dropout,
+                attn_drop=dropout,
+                activation=activation,
+                negative_slope=0.2,
+            )
+        )
         for i in range(1, n_layers - 1):
-            self.layers.append(dglnn.GATConv(n_hidden * num_heads,
-                                             n_hidden,
-                                             num_heads=num_heads,
-                                             feat_drop=dropout,
-                                             attn_drop=dropout,
-                                             activation=activation,
-                                             negative_slope=0.2))
-        self.layers.append(dglnn.GATConv(n_hidden * num_heads,
-                                         n_classes,
-                                         num_heads=num_heads,
-                                         feat_drop=dropout,
-                                         attn_drop=dropout,
-                                         activation=None,
-                                         negative_slope=0.2))
+            self.layers.append(
+                dglnn.GATConv(
+                    n_hidden * num_heads,
+                    n_hidden,
+                    num_heads=num_heads,
+                    feat_drop=dropout,
+                    attn_drop=dropout,
+                    activation=activation,
+                    negative_slope=0.2,
+                )
+            )
+        self.layers.append(
+            dglnn.GATConv(
+                n_hidden * num_heads,
+                n_classes,
+                num_heads=num_heads,
+                feat_drop=dropout,
+                attn_drop=dropout,
+                activation=None,
+                negative_slope=0.2,
+            )
+        )

     def forward(self, blocks, x):
         h = x
 ...
@@ -58,24 +74,26 @@ class GAT(nn.Module):
         h = h.mean(1)
         return h.log_softmax(dim=-1)

 def load_subtensor(g, seeds, input_nodes, device):
     """
     Copys features and labels of a set of nodes onto GPU.
     """
-    batch_inputs = g.ndata['features'][input_nodes].to(device)
-    batch_labels = g.ndata['labels'][seeds].to(device)
+    batch_inputs = g.ndata["features"][input_nodes].to(device)
+    batch_labels = g.ndata["labels"][seeds].to(device)
     return batch_inputs, batch_labels

-@utils.benchmark('time', 600)
-@utils.parametrize('data', ['reddit', 'ogbn-products'])
+@utils.benchmark("time", 600)
+@utils.parametrize("data", ["reddit", "ogbn-products"])
 def track_time(data):
     data = utils.process_data(data)
     device = utils.get_bench_device()
     g = data[0]
-    g.ndata['features'] = g.ndata['feat']
-    g.ndata['labels'] = g.ndata['label']
+    g.ndata["features"] = g.ndata["feat"]
+    g.ndata["labels"] = g.ndata["label"]
     g = g.remove_self_loop().add_self_loop()
-    in_feats = g.ndata['features'].shape[1]
+    in_feats = g.ndata["features"].shape[1]
     n_classes = data.num_classes
     # Create csr/coo/csc formats before launching training processes with multi-gpu.
 ...
@@ -85,7 +103,7 @@ def track_time(data):
     num_hidden = 16
     num_heads = 8
     num_layers = 2
-    fan_out = '10,25'
+    fan_out = "10,25"
     batch_size = 1024
     lr = 0.003
     dropout = 0.5
 ...
@@ -93,11 +111,12 @@ def track_time(data):
     iter_start = 3
     iter_count = 10
-    train_nid = th.nonzero(g.ndata['train_mask'], as_tuple=True)[0]
+    train_nid = th.nonzero(g.ndata["train_mask"], as_tuple=True)[0]
     # Create PyTorch DataLoader for constructing blocks
     sampler = dgl.dataloading.MultiLayerNeighborSampler(
-        [int(fanout) for fanout in fan_out.split(',')])
+        [int(fanout) for fanout in fan_out.split(",")]
+    )
     dataloader = dgl.dataloading.DataLoader(
         g,
         train_nid,
 ...
@@ -105,10 +124,13 @@ def track_time(data):
         batch_size=batch_size,
         shuffle=True,
         drop_last=False,
-        num_workers=num_workers)
+        num_workers=num_workers,
+    )
     # Define model and optimizer
-    model = GAT(in_feats, num_heads, num_hidden, n_classes, num_layers, F.relu, dropout)
+    model = GAT(
+        in_feats, num_heads, num_hidden, n_classes, num_layers, F.relu, dropout
+    )
     model = model.to(device)
     loss_fcn = nn.CrossEntropyLoss()
     loss_fcn = loss_fcn.to(device)
 ...
@@ -118,15 +140,15 @@ def track_time(data):
     with dataloader.enable_cpu_affinity():
         # Loop over the dataloader to sample the computation dependency graph as a list of
         # blocks.
         # Training loop
         avg = 0
         iter_tput = []
         for step, (input_nodes, seeds, blocks) in enumerate(dataloader):
             # Load the input features as well as output labels
             blocks = [block.int().to(device) for block in blocks]
-            batch_inputs = blocks[0].srcdata['features']
-            batch_labels = blocks[-1].dstdata['labels']
+            batch_inputs = blocks[0].srcdata["features"]
+            batch_labels = blocks[-1].dstdata["labels"]
             # Compute loss and prediction
             batch_pred = model(blocks, batch_inputs)
 ...
@@ -138,7 +160,9 @@ def track_time(data):
             # start timer at before iter_start
             if step == iter_start - 1:
                 t0 = time.time()
-            elif step == iter_count + iter_start - 1:  # time iter_count iterations
+            elif (
+                step == iter_count + iter_start - 1
+            ):  # time iter_count iterations
                 break
     t1 = time.time()
 ...
benchmarks/benchmarks/model_speed/bench_gcn_udf.py

 import time
+import dgl
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-import dgl
 from .. import utils
 ...
benchmarks/benchmarks/model_speed/bench_pinsage.py

@@ -2,15 +2,15 @@ import argparse
 import pickle
 import time
+import dgl
+import dgl.function as fn
 import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch.utils.data import DataLoader, IterableDataset
-import dgl
-import dgl.function as fn
 from .. import utils
 ...
benchmarks/benchmarks/model_speed/bench_rgcn_hetero_ns.py

-import dgl
 import itertools
+import time
+import traceback
+import dgl
+import dgl.nn.pytorch as dglnn
 import torch as th
+import torch.multiprocessing as mp
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.optim as optim
-import torch.multiprocessing as mp
 from torch.utils.data import DataLoader
-import dgl.nn.pytorch as dglnn
-import time
-import traceback
 from .. import utils

 class RelGraphConvLayer(nn.Module):
     r"""Relational graph convolution layer.
 ...
@@ -36,17 +38,20 @@ class RelGraphConvLayer(nn.Module):
     dropout : float, optional
         Dropout rate. Default: 0.0
     """
-    def __init__(self, in_feat,
-                 out_feat,
-                 rel_names,
-                 num_bases,
-                 *,
-                 weight=True,
-                 bias=True,
-                 activation=None,
-                 self_loop=False,
-                 dropout=0.0):
+    def __init__(
+        self,
+        in_feat,
+        out_feat,
+        rel_names,
+        num_bases,
+        *,
+        weight=True,
+        bias=True,
+        activation=None,
+        self_loop=False,
+        dropout=0.0,
+    ):
         super(RelGraphConvLayer, self).__init__()
         self.in_feat = in_feat
         self.out_feat = out_feat
 ...
@@ -56,19 +61,29 @@ class RelGraphConvLayer(nn.Module):
         self.activation = activation
         self.self_loop = self_loop
-        self.conv = dglnn.HeteroGraphConv({
-                rel: dglnn.GraphConv(in_feat, out_feat, norm='right', weight=False, bias=False)
-                for rel in rel_names
-            })
+        self.conv = dglnn.HeteroGraphConv(
+            {
+                rel: dglnn.GraphConv(
+                    in_feat, out_feat, norm="right", weight=False, bias=False
+                )
+                for rel in rel_names
+            }
+        )
         self.use_weight = weight
         self.use_basis = num_bases < len(self.rel_names) and weight
         if self.use_weight:
             if self.use_basis:
-                self.basis = dglnn.WeightBasis((in_feat, out_feat), num_bases, len(self.rel_names))
+                self.basis = dglnn.WeightBasis(
+                    (in_feat, out_feat), num_bases, len(self.rel_names)
+                )
             else:
-                self.weight = nn.Parameter(th.Tensor(len(self.rel_names), in_feat, out_feat))
-                nn.init.xavier_uniform_(self.weight, gain=nn.init.calculate_gain('relu'))
+                self.weight = nn.Parameter(
+                    th.Tensor(len(self.rel_names), in_feat, out_feat)
+                )
+                nn.init.xavier_uniform_(
+                    self.weight, gain=nn.init.calculate_gain("relu")
+                )
         # bias
         if bias:
 ...
@@ -78,8 +93,9 @@ class RelGraphConvLayer(nn.Module):
         # weight for self loop
         if self.self_loop:
             self.loop_weight = nn.Parameter(th.Tensor(in_feat, out_feat))
-            nn.init.xavier_uniform_(self.loop_weight,
-                                    gain=nn.init.calculate_gain('relu'))
+            nn.init.xavier_uniform_(
+                self.loop_weight, gain=nn.init.calculate_gain("relu")
+            )
         self.dropout = nn.Dropout(dropout)
 ...
@@ -101,14 +117,18 @@ class RelGraphConvLayer(nn.Module):
         g = g.local_var()
         if self.use_weight:
             weight = self.basis() if self.use_basis else self.weight
-            wdict = {self.rel_names[i]: {'weight': w.squeeze(0)}
-                     for i, w in enumerate(th.split(weight, 1, dim=0))}
+            wdict = {
+                self.rel_names[i]: {"weight": w.squeeze(0)}
+                for i, w in enumerate(th.split(weight, 1, dim=0))
+            }
         else:
             wdict = {}
         if g.is_block:
             inputs_src = inputs
-            inputs_dst = {k: v[:g.number_of_dst_nodes(k)] for k, v in inputs.items()}
+            inputs_dst = {
+                k: v[: g.number_of_dst_nodes(k)] for k, v in inputs.items()
+            }
         else:
             inputs_src = inputs_dst = inputs
 ...
@@ -122,20 +142,24 @@ class RelGraphConvLayer(nn.Module):
             if self.activation:
                 h = self.activation(h)
             return self.dropout(h)
         return {ntype: _apply(ntype, h) for ntype, h in hs.items()}

 class RelGraphEmbed(nn.Module):
     r"""Embedding layer for featureless heterograph."""
-    def __init__(self, g,
-                 device,
-                 embed_size,
-                 num_nodes,
-                 node_feats,
-                 embed_name='embed',
-                 activation=None,
-                 dropout=0.0):
+    def __init__(
+        self,
+        g,
+        device,
+        embed_size,
+        num_nodes,
+        node_feats,
+        embed_name="embed",
+        activation=None,
+        dropout=0.0,
+    ):
         super(RelGraphEmbed, self).__init__()
         self.g = g
         self.device = device
 ...
@@ -150,7 +174,9 @@ class RelGraphEmbed(nn.Module):
         self.node_embeds = nn.ModuleDict()
         for ntype in g.ntypes:
             if node_feats[ntype] is None:
-                sparse_emb = th.nn.Embedding(num_nodes[ntype], embed_size, sparse=True)
+                sparse_emb = th.nn.Embedding(
+                    num_nodes[ntype], embed_size, sparse=True
+                )
                 nn.init.uniform_(sparse_emb.weight, -1.0, 1.0)
                 self.node_embeds[ntype] = sparse_emb
             else:
 ...
@@ -177,19 +203,28 @@ class RelGraphEmbed(nn.Module):
         embeds = {}
         for ntype in block.ntypes:
             if self.node_feats[ntype] is None:
-                embeds[ntype] = self.node_embeds[ntype](block.nodes(ntype)).to(self.device)
+                embeds[ntype] = self.node_embeds[ntype](block.nodes(ntype)).to(
+                    self.device
+                )
             else:
-                embeds[ntype] = self.node_feats[ntype][block.nodes(ntype)].to(self.device) @ self.embeds[ntype]
+                embeds[ntype] = (
+                    self.node_feats[ntype][block.nodes(ntype)].to(self.device)
+                    @ self.embeds[ntype]
+                )
         return embeds

 class EntityClassify(nn.Module):
-    def __init__(self,
-                 g,
-                 h_dim, out_dim,
-                 num_bases,
-                 num_hidden_layers=1,
-                 dropout=0,
-                 use_self_loop=False):
+    def __init__(
+        self,
+        g,
+        h_dim,
+        out_dim,
+        num_bases,
+        num_hidden_layers=1,
+        dropout=0,
+        use_self_loop=False,
+    ):
         super(EntityClassify, self).__init__()
         self.g = g
         self.h_dim = h_dim
 ...
@@ -206,34 +241,56 @@ class EntityClassify(nn.Module):
         self.layers = nn.ModuleList()
         # i2h
-        self.layers.append(RelGraphConvLayer(
-            self.h_dim, self.h_dim, self.rel_names,
-            self.num_bases, activation=F.relu, self_loop=self.use_self_loop,
-            dropout=self.dropout, weight=False))
+        self.layers.append(
+            RelGraphConvLayer(
+                self.h_dim,
+                self.h_dim,
+                self.rel_names,
+                self.num_bases,
+                activation=F.relu,
+                self_loop=self.use_self_loop,
+                dropout=self.dropout,
+                weight=False,
+            )
+        )
         # h2h
         for i in range(self.num_hidden_layers):
-            self.layers.append(RelGraphConvLayer(
-                self.h_dim, self.h_dim, self.rel_names,
-                self.num_bases, activation=F.relu, self_loop=self.use_self_loop,
-                dropout=self.dropout))
+            self.layers.append(
+                RelGraphConvLayer(
+                    self.h_dim,
+                    self.h_dim,
+                    self.rel_names,
+                    self.num_bases,
+                    activation=F.relu,
+                    self_loop=self.use_self_loop,
+                    dropout=self.dropout,
+                )
+            )
         # h2o
-        self.layers.append(RelGraphConvLayer(
-            self.h_dim, self.out_dim, self.rel_names,
-            self.num_bases, activation=None,
-            self_loop=self.use_self_loop))
+        self.layers.append(
+            RelGraphConvLayer(
+                self.h_dim,
+                self.out_dim,
+                self.rel_names,
+                self.num_bases,
+                activation=None,
+                self_loop=self.use_self_loop,
+            )
+        )

     def forward(self, h, blocks):
         for layer, block in zip(self.layers, blocks):
             h = layer(block, h)
         return h

-@utils.benchmark('time', 600)
-@utils.parametrize('data', ['ogbn-mag'])
+@utils.benchmark("time", 600)
+@utils.parametrize("data", ["ogbn-mag"])
 def track_time(data):
     dataset = utils.process_data(data)
     device = utils.get_bench_device()
-    if data == 'ogbn-mag':
+    if data == "ogbn-mag":
         n_bases = 2
         l2norm = 0
     else:
 ...
@@ -252,35 +309,50 @@ def track_time(data):
     hg = dataset[0]
     category = dataset.predict_category
     num_classes = dataset.num_classes
-    train_mask = hg.nodes[category].data.pop('train_mask')
+    train_mask = hg.nodes[category].data.pop("train_mask")
     train_idx = th.nonzero(train_mask, as_tuple=False).squeeze()
-    labels = hg.nodes[category].data.pop('labels')
+    labels = hg.nodes[category].data.pop("labels")
     node_feats = {}
     num_nodes = {}
     for ntype in hg.ntypes:
-        node_feats[ntype] = hg.nodes[ntype].data['feat'] if 'feat' in hg.nodes[ntype].data else None
+        node_feats[ntype] = (
+            hg.nodes[ntype].data["feat"]
+            if "feat" in hg.nodes[ntype].data
+            else None
+        )
         num_nodes[ntype] = hg.number_of_nodes(ntype)
     embed_layer = RelGraphEmbed(hg, device, n_hidden, num_nodes, node_feats)
-    model = EntityClassify(hg,
-                           n_hidden,
-                           num_classes,
-                           num_bases=n_bases,
-                           num_hidden_layers=n_layers - 2,
-                           dropout=dropout,
-                           use_self_loop=use_self_loop)
+    model = EntityClassify(
+        hg,
+        n_hidden,
+        num_classes,
+        num_bases=n_bases,
+        num_hidden_layers=n_layers - 2,
+        dropout=dropout,
+        use_self_loop=use_self_loop,
+    )
     embed_layer = embed_layer.to(device)
     model = model.to(device)
-    all_params = itertools.chain(model.parameters(), embed_layer.embeds.parameters())
+    all_params = itertools.chain(
+        model.parameters(), embed_layer.embeds.parameters()
+    )
     optimizer = th.optim.Adam(all_params, lr=lr, weight_decay=l2norm)
-    sparse_optimizer = th.optim.SparseAdam(list(embed_layer.node_embeds.parameters()), lr=lr)
+    sparse_optimizer = th.optim.SparseAdam(
+        list(embed_layer.node_embeds.parameters()), lr=lr
+    )
     sampler = dgl.dataloading.MultiLayerNeighborSampler([fanout] * n_layers)
     loader = dgl.dataloading.DataLoader(
-        hg, {category: train_idx}, sampler,
-        batch_size=batch_size, shuffle=True, num_workers=4)
+        hg,
+        {category: train_idx},
+        sampler,
+        batch_size=batch_size,
+        shuffle=True,
+        num_workers=4,
+    )
     print("start training...")
     model.train()
 ...
@@ -292,11 +364,13 @@ def track_time(data):
     with loader.enable_cpu_affinity():
         for step, (input_nodes, seeds, blocks) in enumerate(loader):
             blocks = [blk.to(device) for blk in blocks]
-            seeds = seeds[category]  # we only predict the nodes with type "category"
+            seeds = seeds[
+                category
+            ]  # we only predict the nodes with type "category"
             batch_tic = time.time()
             emb = embed_layer(blocks[0])
             lbl = labels[seeds].to(device)
             emb = {k: e.to(device) for k, e in emb.items()}
             logits = model(emb, blocks)[category]
             loss = F.cross_entropy(logits, lbl)
             loss.backward()
 ...
@@ -306,7 +380,9 @@ def track_time(data):
             # start timer at before iter_start
             if step == iter_start - 1:
                 t0 = time.time()
-            elif step == iter_count + iter_start - 1:  # time iter_count iterations
+            elif (
+                step == iter_count + iter_start - 1
+            ):  # time iter_count iterations
                 break
     t1 = time.time()
 ...
benchmarks/benchmarks/model_speed/bench_rgcn_homogeneous_ns.py

-import dgl
 import itertools
+import time
+import dgl
+import dgl.nn.pytorch as dglnn
 import torch as th
+import torch.multiprocessing as mp
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.optim as optim
-import torch.multiprocessing as mp
-from torch.utils.data import DataLoader
-import dgl.nn.pytorch as dglnn
 from dgl.nn import RelGraphConv
-import time
+from torch.utils.data import DataLoader
 from .. import utils

 class EntityClassify(nn.Module):
-    """
-    Entity classification class for RGCN
+    """Entity classification class for RGCN
     Parameters
     ----------
     device : int
 ...
@@ -35,17 +37,20 @@ class EntityClassify(nn.Module):
     use_self_loop : bool
         Use self loop if True, default False.
     """
-    def __init__(self, device,
-                 num_nodes,
-                 h_dim,
-                 out_dim,
-                 num_rels,
-                 num_bases=None,
-                 num_hidden_layers=1,
-                 dropout=0,
-                 use_self_loop=False,
-                 layer_norm=False):
+    def __init__(
+        self,
+        device,
+        num_nodes,
+        h_dim,
+        out_dim,
+        num_rels,
+        num_bases=None,
+        num_hidden_layers=1,
+        dropout=0,
+        use_self_loop=False,
+        layer_norm=False,
+    ):
         super(EntityClassify, self).__init__()
         self.device = device
         self.num_nodes = num_nodes
 ...
@@ -60,22 +65,47 @@ class EntityClassify(nn.Module):
         self.layers = nn.ModuleList()
         # i2h
-        self.layers.append(RelGraphConv(
-            self.h_dim, self.h_dim, self.num_rels, "basis",
-            self.num_bases, activation=F.relu, self_loop=self.use_self_loop,
-            dropout=self.dropout, layer_norm=layer_norm))
+        self.layers.append(
+            RelGraphConv(
+                self.h_dim,
+                self.h_dim,
+                self.num_rels,
+                "basis",
+                self.num_bases,
+                activation=F.relu,
+                self_loop=self.use_self_loop,
+                dropout=self.dropout,
+                layer_norm=layer_norm,
+            )
+        )
         # h2h
         for idx in range(self.num_hidden_layers):
-            self.layers.append(RelGraphConv(
-                self.h_dim, self.h_dim, self.num_rels, "basis",
-                self.num_bases, activation=F.relu, self_loop=self.use_self_loop,
-                dropout=self.dropout, layer_norm=layer_norm))
+            self.layers.append(
+                RelGraphConv(
+                    self.h_dim,
+                    self.h_dim,
+                    self.num_rels,
+                    "basis",
+                    self.num_bases,
+                    activation=F.relu,
+                    self_loop=self.use_self_loop,
+                    dropout=self.dropout,
+                    layer_norm=layer_norm,
+                )
+            )
         # h2o
-        self.layers.append(RelGraphConv(
-            self.h_dim, self.out_dim, self.num_rels, "basis",
-            self.num_bases, activation=None,
-            self_loop=self.use_self_loop,
-            layer_norm=layer_norm))
+        self.layers.append(
+            RelGraphConv(
+                self.h_dim,
+                self.out_dim,
+                self.num_rels,
+                "basis",
+                self.num_bases,
+                activation=None,
+                self_loop=self.use_self_loop,
+                layer_norm=layer_norm,
+            )
+        )

     def forward(self, blocks, feats, norm=None):
         if blocks is None:
 ...
@@ -84,9 +114,10 @@ class EntityClassify(nn.Module):
         h = feats
         for layer, block in zip(self.layers, blocks):
             block = block.to(self.device)
-            h = layer(block, h, block.edata['etype'], block.edata['norm'])
+            h = layer(block, h, block.edata["etype"], block.edata["norm"])
         return h

 class RelGraphEmbedLayer(nn.Module):
     r"""Embedding layer for featureless heterograph.
     Parameters
 ...
@@ -107,15 +138,18 @@ class RelGraphEmbedLayer(nn.Module):
     embed_name : str, optional
         Embed name
     """
-    def __init__(self, device,
-                 num_nodes,
-                 node_tids,
-                 num_of_ntype,
-                 input_size,
-                 embed_size,
-                 sparse_emb=False,
-                 embed_name='embed'):
+    def __init__(
+        self,
+        device,
+        num_nodes,
+        node_tids,
+        num_of_ntype,
+        input_size,
+        embed_size,
+        sparse_emb=False,
+        embed_name="embed",
+    ):
         super(RelGraphEmbedLayer, self).__init__()
         self.device = device
         self.embed_size = embed_size
 ...
@@ -135,7 +169,9 @@ class RelGraphEmbedLayer(nn.Module):
             nn.init.xavier_uniform_(embed)
             self.embeds[str(ntype)] = embed
-        self.node_embeds = th.nn.Embedding(node_tids.shape[0], self.embed_size, sparse=self.sparse_emb)
+        self.node_embeds = th.nn.Embedding(
+            node_tids.shape[0], self.embed_size, sparse=self.sparse_emb
+        )
         nn.init.uniform_(self.node_embeds.weight, -1.0, 1.0)

     def forward(self, node_ids, node_tids, type_ids, features):
 ...
@@ -157,35 +193,40 @@ class RelGraphEmbedLayer(nn.Module):
             embeddings as the input of the next layer
         """
         tsd_ids = node_ids.to(self.node_embeds.weight.device)
-        embeds = th.empty(node_ids.shape[0], self.embed_size, device=self.device)
+        embeds = th.empty(
+            node_ids.shape[0], self.embed_size, device=self.device
+        )
         for ntype in range(self.num_of_ntype):
             if features[ntype] is not None:
                 loc = node_tids == ntype
-                embeds[loc] = features[ntype][type_ids[loc]].to(self.device) @ self.embeds[str(ntype)].to(self.device)
+                embeds[loc] = features[ntype][type_ids[loc]].to(
+                    self.device
+                ) @ self.embeds[str(ntype)].to(self.device)
             else:
                 loc = node_tids == ntype
                 embeds[loc] = self.node_embeds(tsd_ids[loc]).to(self.device)
         return embeds

-@utils.benchmark('time', 600)
-@utils.parametrize('data', ['am', 'ogbn-mag'])
+@utils.benchmark("time", 600)
+@utils.parametrize("data", ["am", "ogbn-mag"])
 def track_time(data):
     dataset = utils.process_data(data)
     device = utils.get_bench_device()
-    if data == 'am':
+    if data == "am":
         batch_size = 64
         n_bases = 40
         l2norm = 5e-4
-    elif data == 'ogbn-mag':
+    elif data == "ogbn-mag":
         batch_size = 1024
         n_bases = 2
         l2norm = 0
     else:
         raise ValueError()
     fanouts = [25, 15]
     n_layers = 2
     n_hidden = 64
     dropout = 0.5
 ...
@@ -198,18 +239,18 @@ def track_time(data):
     hg = dataset[0]
     category = dataset.predict_category
     num_classes = dataset.num_classes
-    train_mask = hg.nodes[category].data.pop('train_mask')
+    train_mask = hg.nodes[category].data.pop("train_mask")
     train_idx = th.nonzero(train_mask, as_tuple=False).squeeze()
-    labels = hg.nodes[category].data.pop('labels').to(device)
+    labels = hg.nodes[category].data.pop("labels").to(device)
     num_of_ntype = len(hg.ntypes)
     num_rels = len(hg.canonical_etypes)
     node_feats = []
     for ntype in hg.ntypes:
-        if len(hg.nodes[ntype].data) == 0 or 'feat' not in hg.nodes[ntype].data:
+        if len(hg.nodes[ntype].data) == 0 or "feat" not in hg.nodes[ntype].data:
             node_feats.append(None)
         else:
-            feat = hg.nodes[ntype].data.pop('feat')
+            feat = hg.nodes[ntype].data.pop("feat")
             node_feats.append(feat.share_memory_())
     # get target category id
 ...
@@ -218,26 +259,28 @@ def track_time(data):
         if ntype == category:
             category_id = i
     g = dgl.to_homogeneous(hg)
-    u, v, eid = g.all_edges(form='all')
+    u, v, eid = g.all_edges(form="all")
     # global norm
-    _, inverse_index, count = th.unique(v, return_inverse=True, return_counts=True)
+    _, inverse_index, count = th.unique(
+        v, return_inverse=True, return_counts=True
+    )
     degrees = count[inverse_index]
     norm = th.ones(eid.shape[0]) / degrees
     norm = norm.unsqueeze(1)
-    g.edata['norm'] = norm
-    g.edata['etype'] = g.edata[dgl.ETYPE]
-    g.ndata['type_id'] = g.ndata[dgl.NID]
-    g.ndata['ntype'] = g.ndata[dgl.NTYPE]
+    g.edata["norm"] = norm
+    g.edata["etype"] = g.edata[dgl.ETYPE]
+    g.ndata["type_id"] = g.ndata[dgl.NID]
+    g.ndata["ntype"] = g.ndata[dgl.NTYPE]
     node_ids = th.arange(g.number_of_nodes())
     # find out the target node ids
     node_tids = g.ndata[dgl.NTYPE]
-    loc = (node_tids == category_id)
+    loc = node_tids == category_id
     target_nids = node_ids[loc]
     train_nids = target_nids[train_idx]
-    g = g.formats('csc')
+    g = g.formats("csc")
     sampler = dgl.dataloading.MultiLayerNeighborSampler(fanouts)
     loader = dgl.dataloading.DataLoader(
         g,
 ...
@@ -246,38 +289,47 @@ def track_time(data):
         batch_size=batch_size,
         shuffle=True,
         drop_last=False,
-        num_workers=num_workers)
+        num_workers=num_workers,
+    )
     # node features
     # None for one-hot feature, if not none, it should be the feature tensor.
     #
-    embed_layer = RelGraphEmbedLayer(device,
-                                     g.number_of_nodes(),
-                                     node_tids,
-                                     num_of_ntype,
-                                     node_feats,
-                                     n_hidden,
-                                     sparse_emb=True)
+    embed_layer = RelGraphEmbedLayer(
+        device,
+        g.number_of_nodes(),
+        node_tids,
+        num_of_ntype,
+        node_feats,
+        n_hidden,
+        sparse_emb=True,
+    )
     # create model
     # all model params are in device.
-    model = EntityClassify(device,
-                           g.number_of_nodes(),
-                           n_hidden,
-                           num_classes,
-                           num_rels,
-                           num_bases=n_bases,
-                           num_hidden_layers=n_layers - 2,
-                           dropout=dropout,
-                           use_self_loop=use_self_loop,
-                           layer_norm=False)
+    model = EntityClassify(
+        device,
+        g.number_of_nodes(),
+        n_hidden,
+        num_classes,
+        num_rels,
+        num_bases=n_bases,
+        num_hidden_layers=n_layers - 2,
+        dropout=dropout,
+        use_self_loop=use_self_loop,
+        layer_norm=False,
+    )
     embed_layer = embed_layer.to(device)
     model = model.to(device)
-    all_params = itertools.chain(model.parameters(), embed_layer.embeds.parameters())
+    all_params = itertools.chain(
+        model.parameters(), embed_layer.embeds.parameters()
+    )
     optimizer = th.optim.Adam(all_params, lr=lr, weight_decay=l2norm)
-    emb_optimizer = th.optim.SparseAdam(list(embed_layer.node_embeds.parameters()), lr=lr)
+    emb_optimizer = th.optim.SparseAdam(
+        list(embed_layer.node_embeds.parameters()), lr=lr
+    )
     print("start training...")
     model.train()
 ...
@@ -287,12 +339,14 @@ def track_time(data):
     with loader.enable_cpu_affinity():
         for step, sample_data in enumerate(loader):
             input_nodes, output_nodes, blocks = sample_data
-            feats = embed_layer(input_nodes,
-                                blocks[0].srcdata['ntype'],
-                                blocks[0].srcdata['type_id'],
-                                node_feats)
+            feats = embed_layer(
+                input_nodes,
+                blocks[0].srcdata["ntype"],
+                blocks[0].srcdata["type_id"],
+                node_feats,
+            )
             logits = model(blocks, feats)
-            seed_idx = blocks[-1].dstdata['type_id']
+            seed_idx = blocks[-1].dstdata["type_id"]
             loss = F.cross_entropy(logits, labels[seed_idx])
             optimizer.zero_grad()
             emb_optimizer.zero_grad()
 ...
@@ -300,11 +354,13 @@ def track_time(data):
             loss.backward()
             optimizer.step()
             emb_optimizer.step()
             # start timer at before iter_start
             if step == iter_start - 1:
                 t0 = time.time()
-            elif step == iter_count + iter_start - 1:  # time iter_count iterations
+            elif (
+                step == iter_count + iter_start - 1
+            ):  # time iter_count iterations
                 break
     t1 = time.time()
 ...
benchmarks/benchmarks/model_speed/bench_sage.py

 import time
+import dgl
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-import dgl
 from dgl.nn.pytorch import SAGEConv
 from .. import utils
 ...
benchmarks/benchmarks/model_speed/bench_sage_ns.py
View file @
9836f78e
import
time
import
dgl
import
dgl
import
dgl.nn.pytorch
as
dglnn
import
torch
as
th
import
torch
as
th
import
torch.multiprocessing
as
mp
import
torch.nn
as
nn
import
torch.nn
as
nn
import
torch.nn.functional
as
F
import
torch.nn.functional
as
F
import
torch.optim
as
optim
import
torch.optim
as
optim
import
torch.multiprocessing
as
mp
from
torch.utils.data
import
DataLoader
from
torch.utils.data
import
DataLoader
import
dgl.nn.pytorch
as
dglnn
import
time
from
..
import
utils
from
..
import
utils
class
SAGE
(
nn
.
Module
):
class
SAGE
(
nn
.
Module
):
def
__init__
(
self
,
def
__init__
(
in_feats
,
self
,
in_feats
,
n_hidden
,
n_classes
,
n_layers
,
activation
,
dropout
n_hidden
,
):
n_classes
,
n_layers
,
activation
,
dropout
):
super
().
__init__
()
super
().
__init__
()
self
.
n_layers
=
n_layers
self
.
n_layers
=
n_layers
self
.
n_hidden
=
n_hidden
self
.
n_hidden
=
n_hidden
self
.
n_classes
=
n_classes
self
.
n_classes
=
n_classes
self
.
layers
=
nn
.
ModuleList
()
self
.
layers
=
nn
.
ModuleList
()
self
.
layers
.
append
(
dglnn
.
SAGEConv
(
in_feats
,
n_hidden
,
'
mean
'
))
self
.
layers
.
append
(
dglnn
.
SAGEConv
(
in_feats
,
n_hidden
,
"
mean
"
))
for
i
in
range
(
1
,
n_layers
-
1
):
for
i
in
range
(
1
,
n_layers
-
1
):
self
.
layers
.
append
(
dglnn
.
SAGEConv
(
n_hidden
,
n_hidden
,
'
mean
'
))
self
.
layers
.
append
(
dglnn
.
SAGEConv
(
n_hidden
,
n_hidden
,
"
mean
"
))
self
.
layers
.
append
(
dglnn
.
SAGEConv
(
n_hidden
,
n_classes
,
'
mean
'
))
self
.
layers
.
append
(
dglnn
.
SAGEConv
(
n_hidden
,
n_classes
,
"
mean
"
))
self
.
dropout
=
nn
.
Dropout
(
dropout
)
self
.
dropout
=
nn
.
Dropout
(
dropout
)
self
.
activation
=
activation
self
.
activation
=
activation
...
@@ -39,23 +37,25 @@ class SAGE(nn.Module):
...
@@ -39,23 +37,25 @@ class SAGE(nn.Module):
h
=
self
.
dropout
(
h
)
h
=
self
.
dropout
(
h
)
return
h
return
h
def
load_subtensor
(
g
,
seeds
,
input_nodes
,
device
):
def
load_subtensor
(
g
,
seeds
,
input_nodes
,
device
):
"""
"""
Copys features and labels of a set of nodes onto GPU.
Copys features and labels of a set of nodes onto GPU.
"""
"""
batch_inputs
=
g
.
ndata
[
'
features
'
][
input_nodes
].
to
(
device
)
batch_inputs
=
g
.
ndata
[
"
features
"
][
input_nodes
].
to
(
device
)
batch_labels
=
g
.
ndata
[
'
labels
'
][
seeds
].
to
(
device
)
batch_labels
=
g
.
ndata
[
"
labels
"
][
seeds
].
to
(
device
)
return
batch_inputs
,
batch_labels
return
batch_inputs
,
batch_labels
@
utils
.
benchmark
(
'time'
,
600
)
@
utils
.
parametrize
(
'data'
,
[
'reddit'
,
'ogbn-products'
])
@
utils
.
benchmark
(
"time"
,
600
)
@
utils
.
parametrize
(
"data"
,
[
"reddit"
,
"ogbn-products"
])
def
track_time
(
data
):
def
track_time
(
data
):
data
=
utils
.
process_data
(
data
)
data
=
utils
.
process_data
(
data
)
device
=
utils
.
get_bench_device
()
device
=
utils
.
get_bench_device
()
g
=
data
[
0
]
g
=
data
[
0
]
g
.
ndata
[
'
features
'
]
=
g
.
ndata
[
'
feat
'
]
g
.
ndata
[
"
features
"
]
=
g
.
ndata
[
"
feat
"
]
g
.
ndata
[
'
labels
'
]
=
g
.
ndata
[
'
label
'
]
g
.
ndata
[
"
labels
"
]
=
g
.
ndata
[
"
label
"
]
in_feats
=
g
.
ndata
[
'
features
'
].
shape
[
1
]
in_feats
=
g
.
ndata
[
"
features
"
].
shape
[
1
]
n_classes
=
data
.
num_classes
n_classes
=
data
.
num_classes
     # Create csr/coo/csc formats before launching training processes with multi-gpu.
...
@@ -65,7 +65,7 @@ def track_time(data):
     num_epochs = 20
     num_hidden = 16
     num_layers = 2
-    fan_out = '10,25'
+    fan_out = "10,25"
     batch_size = 1024
     lr = 0.003
     dropout = 0.5
...
@@ -73,11 +73,12 @@ def track_time(data):
     iter_start = 3
     iter_count = 10
-    train_nid = th.nonzero(g.ndata['train_mask'], as_tuple=True)[0]
+    train_nid = th.nonzero(g.ndata["train_mask"], as_tuple=True)[0]
     # Create PyTorch DataLoader for constructing blocks
     sampler = dgl.dataloading.MultiLayerNeighborSampler(
-        [int(fanout) for fanout in fan_out.split(',')])
+        [int(fanout) for fanout in fan_out.split(",")]
+    )
     dataloader = dgl.dataloading.DataLoader(
         g,
         train_nid,
...
@@ -85,7 +86,8 @@ def track_time(data):
         batch_size=batch_size,
         shuffle=True,
         drop_last=False,
-        num_workers=num_workers)
+        num_workers=num_workers,
+    )
     # Define model and optimizer
     model = SAGE(in_feats, num_hidden, n_classes, num_layers, F.relu, dropout)
...
@@ -93,7 +95,7 @@ def track_time(data):
     loss_fcn = nn.CrossEntropyLoss()
     loss_fcn = loss_fcn.to(device)
     optimizer = optim.Adam(model.parameters(), lr=lr)
     # Enable dataloader cpu affinitization for cpu devices (no effect on gpu)
     with dataloader.enable_cpu_affinity():
         # Training loop
...
@@ -102,10 +104,10 @@ def track_time(data):
         for step, (input_nodes, seeds, blocks) in enumerate(dataloader):
             # Load the input features as well as output labels
-            #batch_inputs, batch_labels = load_subtensor(g, seeds, input_nodes, device)
+            # batch_inputs, batch_labels = load_subtensor(g, seeds, input_nodes, device)
             blocks = [block.int().to(device) for block in blocks]
-            batch_inputs = blocks[0].srcdata['features']
-            batch_labels = blocks[-1].dstdata['labels']
+            batch_inputs = blocks[0].srcdata["features"]
+            batch_labels = blocks[-1].dstdata["labels"]
             # Compute loss and prediction
             batch_pred = model(blocks, batch_inputs)
...
@@ -117,7 +119,9 @@ def track_time(data):
             # start timer at before iter_start
             if step == iter_start - 1:
                 t0 = time.time()
-            elif step == iter_count + iter_start - 1:  # time iter_count iterations
+            elif (
+                step == iter_count + iter_start - 1
+            ):  # time iter_count iterations
                 break
     t1 = time.time()
...
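The hunks above only change quoting and line wrapping; the timed neighbor-sampling loop itself is untouched. For orientation, below is a minimal, self-contained sketch of the same pattern. It assumes DGL >= 0.8 with PyTorch and substitutes a toy random graph and a two-layer toy SAGE model for the benchmark's dataset and model, so every name in it is illustrative rather than taken from the benchmark.

# Illustrative sketch only: a toy graph and toy model stand in for the
# benchmark's data; the dataloading calls mirror the pattern reformatted above.
import time

import dgl
import dgl.nn.pytorch as dglnn
import torch as th
import torch.nn as nn
import torch.optim as optim

g = dgl.rand_graph(1000, 10000)  # random graph, not the benchmark dataset
g.ndata["features"] = th.randn(g.num_nodes(), 16)
g.ndata["labels"] = th.randint(0, 5, (g.num_nodes(),))
train_nid = th.arange(g.num_nodes())

# Fan-outs of 10 and 25 mirror fan_out = "10,25" in the hunk above.
sampler = dgl.dataloading.MultiLayerNeighborSampler([10, 25])
dataloader = dgl.dataloading.DataLoader(
    g,
    train_nid,
    sampler,
    batch_size=128,
    shuffle=True,
    drop_last=False,
    num_workers=0,
)


class ToySAGE(nn.Module):
    def __init__(self, in_feats, n_hidden, n_classes):
        super().__init__()
        self.layers = nn.ModuleList(
            [
                dglnn.SAGEConv(in_feats, n_hidden, "mean"),
                dglnn.SAGEConv(n_hidden, n_classes, "mean"),
            ]
        )

    def forward(self, blocks, x):
        h = x
        for layer, block in zip(self.layers, blocks):
            h = layer(block, h)
        return h


model = ToySAGE(16, 16, 5)
loss_fcn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.003)

t0 = time.time()
for step, (input_nodes, seeds, blocks) in enumerate(dataloader):
    # Features and labels are read off the first/last block, as in the benchmark.
    batch_inputs = blocks[0].srcdata["features"]
    batch_labels = blocks[-1].dstdata["labels"]
    batch_pred = model(blocks, batch_inputs)
    loss = loss_fcn(batch_pred, batch_labels)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
print("seconds per iteration:", (time.time() - t0) / (step + 1))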
benchmarks/benchmarks/model_speed/bench_sage_unsupervised_ns.py  View file @ 9836f78e
 import time
+import dgl
+import dgl.function as fn
+import dgl.nn.pytorch as dglnn
 import numpy as np
 import torch as th
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.optim as optim
-import dgl
-import dgl.function as fn
-import dgl.nn.pytorch as dglnn
 from .. import utils
...
benchmarks/benchmarks/multigpu/bench_multigpu_rgcn.py  View file @ 9836f78e
...
@@ -13,18 +13,18 @@ import time
 from pathlib import Path
 from types import SimpleNamespace
+import dgl
 import numpy as np
 import torch as th
 import torch.multiprocessing as mp
 import torch.nn as nn
 import torch.nn.functional as F
+from dgl.nn import RelGraphConv
 from torch.multiprocessing import Queue
 from torch.nn.parallel import DistributedDataParallel
 from torch.utils.data import DataLoader
-import dgl
-from dgl.nn import RelGraphConv
 from .. import utils
 # import sys
...
benchmarks/benchmarks/multigpu/bench_multigpu_sage.py  View file @ 9836f78e
+import argparse
+import math
+import time
 from types import SimpleNamespace
 from typing import NamedTuple
 import dgl
+import dgl.nn.pytorch as dglnn
 import numpy as np
 import torch as th
+import torch.multiprocessing as mp
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.optim as optim
-import torch.multiprocessing as mp
-import dgl.nn.pytorch as dglnn
-import time
-import math
-import argparse
 from torch.nn.parallel import DistributedDataParallel
-import dgl.nn.pytorch as dglnn
 from .. import utils

 class SAGE(nn.Module):
-    def __init__(self,
-                 in_feats,
-                 n_hidden,
-                 n_classes,
-                 n_layers,
-                 activation,
-                 dropout
-                 ):
+    def __init__(
+        self, in_feats, n_hidden, n_classes, n_layers, activation, dropout
+    ):
         super().__init__()
         self.n_layers = n_layers
         self.n_hidden = n_hidden
         self.n_classes = n_classes
         self.layers = nn.ModuleList()
-        self.layers.append(dglnn.SAGEConv(in_feats, n_hidden, 'mean'))
+        self.layers.append(dglnn.SAGEConv(in_feats, n_hidden, "mean"))
         for i in range(1, n_layers - 1):
-            self.layers.append(dglnn.SAGEConv(n_hidden, n_hidden, 'mean'))
+            self.layers.append(dglnn.SAGEConv(n_hidden, n_hidden, "mean"))
-        self.layers.append(dglnn.SAGEConv(n_hidden, n_classes, 'mean'))
+        self.layers.append(dglnn.SAGEConv(n_hidden, n_classes, "mean"))
         self.dropout = nn.Dropout(dropout)
         self.activation = activation
...
@@ -54,6 +51,7 @@ def load_subtensor(nfeat, labels, seeds, input_nodes, dev_id):
     batch_labels = labels[seeds].to(dev_id)
     return batch_inputs, batch_labels

 # Entry point
...
@@ -61,35 +59,38 @@ def run(result_queue, proc_id, n_gpus, args, devices, data):
     dev_id = devices[proc_id]
     timing_records = []
     if n_gpus > 1:
-        dist_init_method = 'tcp://{master_ip}:{master_port}'.format(
-            master_ip='127.0.0.1', master_port='12345')
+        dist_init_method = "tcp://{master_ip}:{master_port}".format(
+            master_ip="127.0.0.1", master_port="12345"
+        )
         world_size = n_gpus
-        th.distributed.init_process_group(backend="nccl",
-                                          init_method=dist_init_method,
-                                          world_size=world_size,
-                                          rank=proc_id)
+        th.distributed.init_process_group(
+            backend="nccl",
+            init_method=dist_init_method,
+            world_size=world_size,
+            rank=proc_id,
+        )
     th.cuda.set_device(dev_id)
     n_classes, train_g, _, _ = data
-    train_nfeat = train_g.ndata.pop('feat')
-    train_labels = train_g.ndata.pop('label')
+    train_nfeat = train_g.ndata.pop("feat")
+    train_labels = train_g.ndata.pop("label")
     train_nfeat = train_nfeat.to(dev_id)
     train_labels = train_labels.to(dev_id)
     in_feats = train_nfeat.shape[1]
-    train_mask = train_g.ndata['train_mask']
+    train_mask = train_g.ndata["train_mask"]
     train_nid = train_mask.nonzero().squeeze()
     # Split train_nid
     train_nid = th.split(train_nid, math.ceil(
         len(train_nid) / n_gpus))[proc_id]
     # Create PyTorch DataLoader for constructing blocks
     sampler = dgl.dataloading.MultiLayerNeighborSampler(
-        [int(fanout) for fanout in args.fan_out.split(',')])
+        [int(fanout) for fanout in args.fan_out.split(",")]
+    )
     dataloader = dgl.dataloading.DataLoader(
         train_g,
         train_nid,
...
@@ -97,15 +98,23 @@ def run(result_queue, proc_id, n_gpus, args, devices, data):
         batch_size=args.batch_size,
         shuffle=True,
         drop_last=False,
-        num_workers=args.num_workers)
+        num_workers=args.num_workers,
+    )
     # Define model and optimizer
-    model = SAGE(in_feats, args.num_hidden, n_classes,
-                 args.num_layers, F.relu, args.dropout)
+    model = SAGE(
+        in_feats,
+        args.num_hidden,
+        n_classes,
+        args.num_layers,
+        F.relu,
+        args.dropout,
+    )
     model = model.to(dev_id)
     if n_gpus > 1:
         model = DistributedDataParallel(
-            model, device_ids=[dev_id], output_device=dev_id)
+            model, device_ids=[dev_id], output_device=dev_id
+        )
     loss_fcn = nn.CrossEntropyLoss()
     optimizer = optim.Adam(model.parameters(), lr=args.lr)
...
@@ -114,8 +123,9 @@ def run(result_queue, proc_id, n_gpus, args, devices, data):
         if proc_id == 0:
             tic_step = time.time()
-        batch_inputs, batch_labels = load_subtensor(train_nfeat, train_labels,
-                                                    seeds, input_nodes, dev_id)
+        batch_inputs, batch_labels = load_subtensor(
+            train_nfeat, train_labels, seeds, input_nodes, dev_id
+        )
         blocks = [block.int().to(dev_id) for block in blocks]
         batch_pred = model(blocks, batch_inputs)
         loss = loss_fcn(batch_pred, batch_labels)
...
@@ -135,18 +145,18 @@ def run(result_queue, proc_id, n_gpus, args, devices, data):
         result_queue.put(np.array(timing_records))

-@utils.benchmark('time', timeout=600)
+@utils.benchmark("time", timeout=600)
 @utils.skip_if_not_4gpu()
-@utils.parametrize('data', ['reddit', 'ogbn-products'])
+@utils.parametrize("data", ["reddit", "ogbn-products"])
 def track_time(data):
     args = SimpleNamespace(
         num_hidden=16,
         fan_out="10,25",
         batch_size=1000,
         lr=0.003,
         dropout=0.5,
         num_layers=2,
         num_workers=4,
     )
     devices = [0, 1, 2, 3]
...
@@ -167,15 +177,17 @@ def track_time(data):
     result_queue = mp.Queue()
     procs = []
     for proc_id in range(n_gpus):
-        p = mp.Process(target=utils.thread_wrapped_func(run),
-                       args=(result_queue, proc_id, n_gpus, args, devices, data))
+        p = mp.Process(
+            target=utils.thread_wrapped_func(run),
+            args=(result_queue, proc_id, n_gpus, args, devices, data),
+        )
         p.start()
         procs.append(p)
     for p in procs:
         p.join()
     time_records = result_queue.get(block=False)
     num_exclude = 10  # exclude first 10 iterations
     if len(time_records) < 15:
         # exclude less if less records
         num_exclude = int(len(time_records) * 0.3)
     return np.mean(time_records[num_exclude:])
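The run()/track_time() hunks above are again pure reformatting of the multi-GPU benchmark: one worker process per GPU, a NCCL process group, a DistributedDataParallel-wrapped model, and per-iteration timings pushed through a queue, with the first iterations discarded as warm-up. A minimal sketch of that shape follows; it assumes at least one visible CUDA device and uses a toy linear model and random tensors in place of the graph pipeline, so it illustrates the process/DDP plumbing only.

# Illustrative sketch only: the graph sampling is replaced by random tensors;
# the process-group / DDP / timing plumbing mirrors run() and track_time().
import time

import torch as th
import torch.multiprocessing as mp
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parallel import DistributedDataParallel


def worker(result_queue, proc_id, n_gpus):
    dist_init_method = "tcp://{master_ip}:{master_port}".format(
        master_ip="127.0.0.1", master_port="12345"
    )
    th.distributed.init_process_group(
        backend="nccl",
        init_method=dist_init_method,
        world_size=n_gpus,
        rank=proc_id,
    )
    th.cuda.set_device(proc_id)
    model = nn.Linear(16, 4).to(proc_id)  # stand-in for SAGE(...)
    model = DistributedDataParallel(
        model, device_ids=[proc_id], output_device=proc_id
    )
    optimizer = th.optim.Adam(model.parameters(), lr=0.003)
    timing_records = []
    for _ in range(20):
        tic_step = time.time()
        x = th.randn(1000, 16, device=proc_id)  # stand-in for a sampled batch
        y = th.randint(0, 4, (1000,), device=proc_id)
        loss = F.cross_entropy(model(x), y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        timing_records.append(time.time() - tic_step)
    if proc_id == 0:
        result_queue.put(timing_records)


if __name__ == "__main__":
    mp.set_start_method("spawn", force=True)  # required for CUDA in child processes
    n_gpus = th.cuda.device_count()
    result_queue = mp.Queue()
    procs = [
        mp.Process(target=worker, args=(result_queue, i, n_gpus))
        for i in range(n_gpus)
    ]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    # Drop warm-up iterations before averaging, as track_time() does above.
    records = result_queue.get(block=False)
    num_exclude = 10 if len(records) >= 15 else int(len(records) * 0.3)
    print(sum(records[num_exclude:]) / len(records[num_exclude:]))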
benchmarks/benchmarks/multigpu/rgcn_model.py  View file @ 9836f78e
+import dgl
 import torch as th
 import torch.nn as nn
-import dgl

 class BaseRGCN(nn.Module):
     def __init__(
...
benchmarks/benchmarks/rgcn.py  View file @ 9836f78e
+import dgl
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-import dgl
 from dgl.nn.pytorch import RelGraphConv
 from . import utils
...
benchmarks/benchmarks/utils.py  View file @ 9836f78e
...
@@ -8,14 +8,14 @@ import zipfile
 from functools import partial, reduce, wraps
 from timeit import default_timer
+import dgl
 import numpy as np
 import pandas
 import requests
 import torch
 from ogb.nodeproppred import DglNodePropPredDataset
-import dgl

 def _download(url, path, filename):
     fn = os.path.join(path, filename)
...
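Only the signature and first line of _download are visible in this hunk. For orientation, a plausible minimal sketch of a helper with the same signature is given below; the caching and streaming details are assumptions for illustration, not the benchmark's actual implementation.

# Hypothetical sketch only: same signature as _download(url, path, filename)
# above, but the body (caching, streaming, chunk size) is assumed.
import os

import requests


def download(url, path, filename):
    fn = os.path.join(path, filename)
    if os.path.exists(fn):  # reuse a previously downloaded file
        return fn
    os.makedirs(path, exist_ok=True)
    with requests.get(url, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(fn, "wb") as f:
            for chunk in r.iter_content(chunk_size=1 << 20):
                f.write(chunk)
    return fn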
benchmarks/scripts/replace_branch.py  View file @ 9836f78e
...
@@ -20,7 +20,6 @@ def json_minify(string, strip_space=True):
     index = 0
     for match in re.finditer(tokenizer, string):
         if not (in_multi or in_single):
             tmp = string[index:match.start()]
             if not in_string and strip_space:
...
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment