Unverified Commit 71283997 authored by Jinjing Zhou, committed by GitHub

Integrate Regression Test with Jenkins (#2448)

* add bench jenkins

* instance type

* fix

* fix

* fix

* 111

* test

* 111

* 111

* fix

* test

* run

* fix

* fix

* fix

* fix

* fix

* publish results

* 111

* regression

* launch ec2 script

* fix

* add

* run on master

* change

* rrr

* run gpu

* fix

* fix

* try fix

* fix

* ff

* fix

* fix

* fix

* refactor

* fix

* fix

* update

* fix

* fix

* fix

* fix

* remove import torchtext

* add shm size

* update

* fix

* fix

* fix

* fix

* fix this!!!!

* 111

* fix

* remove verbose

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* update readme

* fix

* fix

* fix

* change asv default to head

* commit sage and rgcn

* fix

* update
parent 4e7a646b
@@ -17,17 +17,19 @@
     // uninstalling the project. See asv.conf.json documentation.
     //
     "build_command": [
-        "/bin/bash {conf_dir}/build_dgl_asv.sh"
+        "/bin/bash {conf_dir}/scripts/build_dgl_asv.sh"
     ],
     "install_command": [
-        "/bin/bash {conf_dir}/install_dgl_asv.sh"
+        "/bin/bash {conf_dir}/scripts/install_dgl_asv.sh"
     ],
     "uninstall_command": [
        "return-code=any python -m pip uninstall -y dgl"
     ],
     // List of branches to benchmark. If not provided, defaults to "master"
     // (for git) or "default" (for mercurial).
-    "branches": ["HEAD", "master"],  // for git
+    "branches": [
+        "HEAD"
+    ],  // for git
     // The DVCS being used. If not set, it will be automatically
     // determined from "repo" by looking at the protocol in the URL
     // (if remote), or by looking for special directories, such as
...
import dgl
import itertools
import torch as th
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.multiprocessing as mp
from torch.utils.data import DataLoader
import dgl.nn.pytorch as dglnn
from dgl.nn import RelGraphConv
import time
import tqdm
from .. import utils
class EntityClassify(nn.Module):
""" Entity classification class for RGCN
Parameters
----------
device : int
Device to run the layer.
num_nodes : int
Number of nodes.
h_dim : int
Hidden dim size.
out_dim : int
Output dim size.
num_rels : int
        Number of relation types.
    num_bases : int
        Number of bases. If None, use the number of relations.
    num_hidden_layers : int
        Number of hidden RelGraphConv layers.
    dropout : float
        Dropout rate.
    use_self_loop : bool
        Use self loop if True, default False.
    low_mem : bool
        If True, use the low-memory implementation of the relation message
        passing function, trading speed for lower memory consumption.
"""
def __init__(self,
device,
num_nodes,
h_dim,
out_dim,
num_rels,
num_bases=None,
num_hidden_layers=1,
dropout=0,
use_self_loop=False,
low_mem=False,
layer_norm=False):
super(EntityClassify, self).__init__()
self.device = device
self.num_nodes = num_nodes
self.h_dim = h_dim
self.out_dim = out_dim
self.num_rels = num_rels
        self.num_bases = None if num_bases is None or num_bases < 0 else num_bases
self.num_hidden_layers = num_hidden_layers
self.dropout = dropout
self.use_self_loop = use_self_loop
self.low_mem = low_mem
self.layer_norm = layer_norm
self.layers = nn.ModuleList()
# i2h
self.layers.append(RelGraphConv(
self.h_dim, self.h_dim, self.num_rels, "basis",
self.num_bases, activation=F.relu, self_loop=self.use_self_loop,
low_mem=self.low_mem, dropout=self.dropout, layer_norm = layer_norm))
# h2h
for idx in range(self.num_hidden_layers):
self.layers.append(RelGraphConv(
self.h_dim, self.h_dim, self.num_rels, "basis",
self.num_bases, activation=F.relu, self_loop=self.use_self_loop,
low_mem=self.low_mem, dropout=self.dropout, layer_norm = layer_norm))
# h2o
self.layers.append(RelGraphConv(
self.h_dim, self.out_dim, self.num_rels, "basis",
self.num_bases, activation=None,
self_loop=self.use_self_loop,
low_mem=self.low_mem, layer_norm = layer_norm))
def forward(self, blocks, feats, norm=None):
if blocks is None:
# full graph training
blocks = [self.g] * len(self.layers)
h = feats
for layer, block in zip(self.layers, blocks):
block = block.to(self.device)
h = layer(block, h, block.edata['etype'], block.edata['norm'])
return h
class RelGraphEmbedLayer(nn.Module):
r"""Embedding layer for featureless heterograph.
Parameters
----------
device : int
Device to run the layer.
num_nodes : int
Number of nodes.
    node_tids : tensor
        Stores the node type id for each node, starting from 0.
    num_of_ntype : int
        Number of node types.
    input_size : list of int
        A list of input feature sizes for each node type. If None, the
        corresponding input feature is treated as a one-hot encoding feature.
embed_size : int
Output embed size
embed_name : str, optional
Embed name
"""
def __init__(self,
device,
num_nodes,
node_tids,
num_of_ntype,
input_size,
embed_size,
sparse_emb=False,
embed_name='embed'):
super(RelGraphEmbedLayer, self).__init__()
self.device = device
self.embed_size = embed_size
self.embed_name = embed_name
self.num_nodes = num_nodes
self.sparse_emb = sparse_emb
# create weight embeddings for each node for each relation
self.embeds = nn.ParameterDict()
self.num_of_ntype = num_of_ntype
self.idmap = th.empty(num_nodes).long()
for ntype in range(num_of_ntype):
if input_size[ntype] is not None:
input_emb_size = input_size[ntype].shape[1]
embed = nn.Parameter(th.Tensor(input_emb_size, self.embed_size))
nn.init.xavier_uniform_(embed)
self.embeds[str(ntype)] = embed
self.node_embeds = th.nn.Embedding(node_tids.shape[0], self.embed_size, sparse=self.sparse_emb)
nn.init.uniform_(self.node_embeds.weight, -1.0, 1.0)
def forward(self, node_ids, node_tids, type_ids, features):
"""Forward computation
Parameters
----------
        node_ids : tensor
            Node ids to generate embeddings for.
        node_tids : tensor
            Node type ids.
        type_ids : tensor
            Per-type node ids (indices into the per-type feature tensors).
        features : list of features
            List of initial features for nodes belonging to different node types.
            If an entry is None, the corresponding feature is a one-hot encoding
            feature; otherwise the features are used directly as input and
            multiplied by a projection matrix.
Returns
-------
tensor
embeddings as the input of the next layer
"""
tsd_ids = node_ids.to(self.node_embeds.weight.device)
embeds = th.empty(node_ids.shape[0], self.embed_size, device=self.device)
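        # For each node type, fill the corresponding rows: project real input
        # features when they exist, otherwise look up the learnable node embedding.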
for ntype in range(self.num_of_ntype):
if features[ntype] is not None:
loc = node_tids == ntype
embeds[loc] = features[ntype][type_ids[loc]].to(self.device) @ self.embeds[str(ntype)].to(self.device)
else:
loc = node_tids == ntype
embeds[loc] = self.node_embeds(tsd_ids[loc]).to(self.device)
return embeds
def evaluate(model, embed_layer, eval_loader, node_feats):
model.eval()
embed_layer.eval()
eval_logits = []
eval_seeds = []
with th.no_grad():
for sample_data in tqdm.tqdm(eval_loader):
th.cuda.empty_cache()
seeds, blocks = sample_data
feats = embed_layer(blocks[0].srcdata[dgl.NID],
blocks[0].srcdata[dgl.NTYPE],
blocks[0].srcdata['type_id'],
node_feats)
logits = model(blocks, feats)
eval_logits.append(logits.cpu().detach())
eval_seeds.append(seeds.cpu().detach())
eval_logits = th.cat(eval_logits)
eval_seeds = th.cat(eval_seeds)
return eval_logits, eval_seeds
@utils.benchmark('time', 3600)
@utils.parametrize('data', ['am', 'ogbn-mag'])
def track_acc(data):
dataset = utils.process_data(data)
device = utils.get_bench_device()
if data == 'am':
n_bases = 40
l2norm = 5e-4
elif data == 'ogbn-mag':
n_bases = 2
l2norm = 0
else:
        raise ValueError('Unsupported dataset: {}'.format(data))
fanouts = [25,15]
n_layers = 2
batch_size = 1024
n_hidden = 64
dropout = 0.5
use_self_loop = True
lr = 0.01
n_epochs = 20
low_mem = True
num_workers = 4
hg = dataset[0]
category = dataset.predict_category
num_classes = dataset.num_classes
train_mask = hg.nodes[category].data.pop('train_mask')
train_idx = th.nonzero(train_mask, as_tuple=False).squeeze()
test_mask = hg.nodes[category].data.pop('test_mask')
test_idx = th.nonzero(test_mask, as_tuple=False).squeeze()
labels = hg.nodes[category].data.pop('labels').to(device)
num_of_ntype = len(hg.ntypes)
num_rels = len(hg.canonical_etypes)
node_feats = []
for ntype in hg.ntypes:
if len(hg.nodes[ntype].data) == 0 or 'feat' not in hg.nodes[ntype].data:
node_feats.append(None)
else:
feat = hg.nodes[ntype].data.pop('feat')
node_feats.append(feat.share_memory_())
# get target category id
category_id = len(hg.ntypes)
for i, ntype in enumerate(hg.ntypes):
if ntype == category:
category_id = i
g = dgl.to_homogeneous(hg)
u, v, eid = g.all_edges(form='all')
# global norm
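    # Each edge is weighted by 1 / in-degree of its destination node.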
_, inverse_index, count = th.unique(v, return_inverse=True, return_counts=True)
degrees = count[inverse_index]
norm = th.ones(eid.shape[0]) / degrees
norm = norm.unsqueeze(1)
g.edata['norm'] = norm
g.edata['etype'] = g.edata[dgl.ETYPE]
g.ndata['type_id'] = g.ndata[dgl.NID]
g.ndata['ntype'] = g.ndata[dgl.NTYPE]
node_ids = th.arange(g.number_of_nodes())
# find out the target node ids
node_tids = g.ndata[dgl.NTYPE]
loc = (node_tids == category_id)
target_nids = node_ids[loc]
train_nids = target_nids[train_idx]
# Create csr/coo/csc formats before launching training processes with multi-gpu.
    # This avoids creating certain formats in each sub-process, which saves memory and CPU.
g.create_formats_()
sampler = dgl.dataloading.MultiLayerNeighborSampler(fanouts)
collator = dgl.dataloading.NodeCollator(g, train_nids, sampler, return_indices=True)
loader = dgl.dataloading.DataLoader(
collator.dataset, collate_fn=collator.collate,
batch_size=batch_size, shuffle=True, num_workers=4)
# test_sampler = dgl.dataloading.MultiLayerNeighborSampler(fanouts)
test_loader = DataLoader(dataset=test_idx.numpy(),
batch_size=batch_size,
collate_fn=collator.collate,
shuffle=False,
num_workers=4)
    # node features
    # None for a one-hot feature; if not None, it should be the feature tensor.
    #
embed_layer = RelGraphEmbedLayer(device,
g.number_of_nodes(),
node_tids,
num_of_ntype,
node_feats,
n_hidden,
sparse_emb=True)
# create model
# all model params are in device.
model = EntityClassify(device,
g.number_of_nodes(),
n_hidden,
num_classes,
num_rels,
num_bases=n_bases,
num_hidden_layers=n_layers - 2,
dropout=dropout,
use_self_loop=use_self_loop,
low_mem=low_mem,
layer_norm=False)
embed_layer = embed_layer.to(device)
model = model.to(device)
all_params = itertools.chain(model.parameters(), embed_layer.embeds.parameters())
optimizer = th.optim.Adam(all_params, lr=lr, weight_decay=l2norm)
emb_optimizer = th.optim.SparseAdam(list(embed_layer.node_embeds.parameters()), lr=lr)
print("start training...")
t0 = time.time()
for epoch in range(n_epochs):
model.train()
embed_layer.train()
for i, sample_data in enumerate(loader):
input_nodes, output_nodes, seed_idx, blocks = sample_data
feats = embed_layer(input_nodes,
blocks[0].srcdata['ntype'],
blocks[0].srcdata['type_id'],
node_feats)
logits = model(blocks, feats)
loss = F.cross_entropy(logits, labels[train_idx][seed_idx])
optimizer.zero_grad()
emb_optimizer.zero_grad()
loss.backward()
optimizer.step()
emb_optimizer.step()
test_logits, test_seeds = evaluate(model, embed_layer, test_loader, node_feats)
test_loss = F.cross_entropy(test_logits, labels[test_seeds].cpu()).item()
test_acc = th.sum(test_logits.argmax(dim=1) == labels[test_seeds].cpu()).item() / len(test_seeds)
t1 = time.time()
return test_acc
import dgl
import torch as th
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.multiprocessing as mp
from torch.utils.data import DataLoader
import dgl.nn.pytorch as dglnn
import time
from .. import utils
class SAGE(nn.Module):
def __init__(self,
in_feats,
n_hidden,
n_classes,
n_layers,
activation,
dropout):
super().__init__()
self.n_layers = n_layers
self.n_hidden = n_hidden
self.n_classes = n_classes
self.layers = nn.ModuleList()
self.layers.append(dglnn.SAGEConv(in_feats, n_hidden, 'mean'))
for i in range(1, n_layers - 1):
self.layers.append(dglnn.SAGEConv(n_hidden, n_hidden, 'mean'))
self.layers.append(dglnn.SAGEConv(n_hidden, n_classes, 'mean'))
self.dropout = nn.Dropout(dropout)
self.activation = activation
def forward(self, blocks, x):
h = x
for l, (layer, block) in enumerate(zip(self.layers, blocks)):
h = layer(block, h)
if l != len(self.layers) - 1:
h = self.activation(h)
h = self.dropout(h)
return h
def inference(self, g, x, batch_size, device):
"""
Inference with the GraphSAGE model on full neighbors (i.e. without neighbor sampling).
g : the entire graph.
x : the input of entire node set.
        The inference code is written so that it can handle any number of nodes
        and layers.
"""
# During inference with sampling, multi-layer blocks are very inefficient because
# lots of computations in the first few layers are repeated.
        # Therefore, we compute the representation of all nodes layer by layer.
        # The nodes on each layer are split into batches.
# TODO: can we standardize this?
for l, layer in enumerate(self.layers):
y = th.zeros(g.number_of_nodes(), self.n_hidden if l !=
len(self.layers) - 1 else self.n_classes)
sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
dataloader = dgl.dataloading.NodeDataLoader(
g,
th.arange(g.number_of_nodes()),
sampler,
batch_size=batch_size,
shuffle=True,
drop_last=False,
num_workers=4)
for input_nodes, output_nodes, blocks in dataloader:
block = blocks[0]
block = block.int().to(device)
h = x[input_nodes].to(device)
h = layer(block, h)
if l != len(self.layers) - 1:
h = self.activation(h)
h = self.dropout(h)
y[output_nodes] = h.cpu()
x = y
return y
def compute_acc(pred, labels):
"""
Compute the accuracy of prediction given the labels.
"""
labels = labels.long()
return (th.argmax(pred, dim=1) == labels).float().sum() / len(pred)
def evaluate(model, g, inputs, labels, val_nid, batch_size, device):
"""
Evaluate the model on the validation set specified by ``val_nid``.
g : The entire graph.
inputs : The features of all the nodes.
labels : The labels of all the nodes.
val_nid : the node Ids for validation.
batch_size : Number of nodes to compute at the same time.
device : The GPU device to evaluate on.
"""
model.eval()
with th.no_grad():
pred = model.inference(g, inputs, batch_size, device)
model.train()
return compute_acc(pred[val_nid], labels[val_nid])
def load_subtensor(g, seeds, input_nodes, device):
"""
    Copies the features and labels of a set of nodes onto the GPU.
"""
batch_inputs = g.ndata['features'][input_nodes].to(device)
batch_labels = g.ndata['labels'][seeds].to(device)
return batch_inputs, batch_labels
@utils.benchmark('acc', 3600)
@utils.parametrize('data', ['ogbn-products', "reddit"])
def track_acc(data):
data = utils.process_data(data)
device = utils.get_bench_device()
g = data[0]
g.ndata['features'] = g.ndata['feat']
g.ndata['labels'] = g.ndata['label']
in_feats = g.ndata['features'].shape[1]
n_classes = data.num_labels
# Create csr/coo/csc formats before launching training processes with multi-gpu.
    # This avoids creating certain formats in each sub-process, which saves memory and CPU.
g.create_formats_()
num_epochs = 20
num_hidden = 16
num_layers = 2
fan_out = '5,10'
batch_size = 1024
lr = 0.003
dropout = 0.5
num_workers = 4
train_nid = th.nonzero(g.ndata['train_mask'], as_tuple=True)[0]
# Create PyTorch DataLoader for constructing blocks
sampler = dgl.dataloading.MultiLayerNeighborSampler(
[int(fanout) for fanout in fan_out.split(',')])
dataloader = dgl.dataloading.NodeDataLoader(
g,
train_nid,
sampler,
batch_size=batch_size,
shuffle=True,
drop_last=False,
num_workers=num_workers)
# Define model and optimizer
model = SAGE(in_feats, num_hidden, n_classes, num_layers, F.relu, dropout)
model = model.to(device)
loss_fcn = nn.CrossEntropyLoss()
loss_fcn = loss_fcn.to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
# dry run one epoch
for step, (input_nodes, seeds, blocks) in enumerate(dataloader):
# Load the input features as well as output labels
#batch_inputs, batch_labels = load_subtensor(g, seeds, input_nodes, device)
blocks = [block.int().to(device) for block in blocks]
batch_inputs = blocks[0].srcdata['features']
batch_labels = blocks[-1].dstdata['labels']
# Compute loss and prediction
batch_pred = model(blocks, batch_inputs)
loss = loss_fcn(batch_pred, batch_labels)
optimizer.zero_grad()
loss.backward()
optimizer.step()
# Training loop
for epoch in range(num_epochs):
# Loop over the dataloader to sample the computation dependency graph as a list of
# blocks.
for step, (input_nodes, seeds, blocks) in enumerate(dataloader):
# Load the input features as well as output labels
#batch_inputs, batch_labels = load_subtensor(g, seeds, input_nodes, device)
blocks = [block.int().to(device) for block in blocks]
batch_inputs = blocks[0].srcdata['features']
batch_labels = blocks[-1].dstdata['labels']
# Compute loss and prediction
batch_pred = model(blocks, batch_inputs)
loss = loss_fcn(batch_pred, batch_labels)
optimizer.zero_grad()
loss.backward()
optimizer.step()
test_g = g
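    # Test nodes are those that belong to neither the training nor the validation set.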
test_nid = th.nonzero(
~(test_g.ndata['train_mask'] | test_g.ndata['val_mask']), as_tuple=True)[0]
test_acc = evaluate(
model, test_g, test_g.ndata['features'], test_g.ndata['labels'], test_nid, batch_size, device)
return test_acc.item()
@@ -5,7 +5,6 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch.utils.data import IterableDataset, DataLoader
-import torchtext
 import dgl
 import dgl.function as fn
...
-import os, pickle
-import shutil, zipfile
+import json
+import os
+import pickle
+import shutil
+import zipfile
 import requests
 import inspect
 import numpy as np
 import pandas
 import dgl
 import torch
-import torchtext

 def _download(url, path, filename):
     fn = os.path.join(path, filename)
@@ -22,15 +25,30 @@ def _download(url, path, filename):
             writer.write(chunk)
     print('Download finished.')

 def get_livejournal():
-    _download('https://snap.stanford.edu/data/soc-LiveJournal1.txt.gz',
-              '/tmp', 'soc-LiveJournal1.txt.gz')
-    df = pandas.read_csv('/tmp/soc-LiveJournal1.txt.gz', sep='\t', skiprows=4, header=None,
-                         names=['src', 'dst'], compression='gzip')
-    src = np.array(df['src'])
-    dst = np.array(df['dst'])
-    print('construct the graph')
-    return dgl.DGLGraph((src, dst), readonly=True)
+    # Same as https://snap.stanford.edu/data/soc-LiveJournal1.txt.gz
+    _download('https://dgl-asv-data.s3-us-west-2.amazonaws.com/dataset/livejournal/soc-LiveJournal1.txt.gz',
+              '/tmp/dataset', 'soc-LiveJournal1.txt.gz')
+    df = pandas.read_csv('/tmp/dataset/soc-LiveJournal1.txt.gz', sep='\t', skiprows=4, header=None,
+                         names=['src', 'dst'], compression='gzip')
+    src = df['src'].values
+    dst = df['dst'].values
+    print('construct the graph')
+    return dgl.graph((src, dst))
+
+
+def get_filmbaster():
+    # Same as https://snap.stanford.edu/data/bigdata/communities/com-friendster.ungraph.txt.gz
+    _download('https://dgl-asv-data.s3-us-west-2.amazonaws.com/dataset/friendster/com-friendster.ungraph.txt.gz',
+              '/tmp/dataset', 'com-friendster.ungraph.txt.gz')
+    df = pandas.read_csv('/tmp/dataset/com-friendster.ungraph.txt.gz', sep='\t', skiprows=4, header=None,
+                         names=['src', 'dst'], compression='gzip')
+    src = df['src'].values
+    dst = df['dst'].values
+    print('construct the graph')
+    return dgl.graph((src, dst))

 def get_graph(name):
     if name == 'livejournal':
@@ -39,6 +57,7 @@ def get_graph(name):
         print(name + " doesn't exist")
         return None

+
 class OGBDataset(object):
     def __init__(self, g, num_labels, predict_category=None):
         self._g = g
@@ -75,7 +94,8 @@ def load_ogb_product():
     graph.ndata['label'] = labels
     in_feats = graph.ndata['feat'].shape[1]
-    num_labels = len(torch.unique(labels[torch.logical_not(torch.isnan(labels))]))
+    num_labels = len(torch.unique(
+        labels[torch.logical_not(torch.isnan(labels))]))

     # Find the node IDs in the training, validation, and test set.
     train_nid, val_nid, test_nid = splitted_idx['train'], splitted_idx['valid'], splitted_idx['test']
@@ -148,12 +168,15 @@ class PinsageDataset:
     def __getitem__(self, idx):
         return self._g

 def load_nowplaying_rs():
-    name = 'nowplaying_rs.pkl'  # follow examples/pytorch/pinsage/README to create nowplaying_rs.pkl
+    import torchtext
+    # follow examples/pytorch/pinsage/README to create nowplaying_rs.pkl
+    name = 'nowplaying_rs.pkl'
     dataset_dir = os.path.join(os.getcwd(), 'dataset')
     os.symlink('/tmp/dataset/', dataset_dir)
-    dataset_path = os.path.join(dataset_dir, name)
+    dataset_path = os.path.join(dataset_dir, "nowplaying_rs", name)
     # Load dataset
     with open(dataset_path, 'rb') as f:
         dataset = pickle.load(f)
@@ -169,14 +192,17 @@ def load_nowplaying_rs():
     # Assign user and movie IDs and use them as features (to learn an individual trainable
     # embedding for each entity)
-    g.nodes[user_ntype].data['id'] = torch.arange(g.number_of_nodes(user_ntype))
-    g.nodes[item_ntype].data['id'] = torch.arange(g.number_of_nodes(item_ntype))
+    g.nodes[user_ntype].data['id'] = torch.arange(
+        g.number_of_nodes(user_ntype))
+    g.nodes[item_ntype].data['id'] = torch.arange(
+        g.number_of_nodes(item_ntype))

     # Prepare torchtext dataset and vocabulary
     fields = {}
     examples = []
     for key, texts in item_texts.items():
-        fields[key] = torchtext.data.Field(include_lengths=True, lower=True, batch_first=True)
+        fields[key] = torchtext.data.Field(
+            include_lengths=True, lower=True, batch_first=True)
     for i in range(g.number_of_nodes(item_ntype)):
         example = torchtext.data.Example.fromlist(
             [item_texts[key][i] for key in item_texts.keys()],
@@ -188,6 +214,7 @@ def load_nowplaying_rs():
     return PinsageDataset(g, user_ntype, item_ntype, textset)

+
 def process_data(name):
     if name == 'cora':
         return dgl.data.CoraGraphDataset()
@@ -212,29 +239,38 @@ def process_data(name):
     else:
         raise ValueError('Invalid dataset name:', name)

 def get_bench_device():
-    return os.environ.get('DGL_BENCH_DEVICE', 'cpu')
+    device = os.environ.get('DGL_BENCH_DEVICE', 'cpu')
+    if device.lower() == "gpu":
+        return "cuda:0"
+    else:
+        return device

 def setup_track_time(*args, **kwargs):
     # fix random seed
     np.random.seed(42)
     torch.random.manual_seed(42)

 def setup_track_acc(*args, **kwargs):
     # fix random seed
     np.random.seed(42)
     torch.random.manual_seed(42)

 TRACK_UNITS = {
-    'time' : 's',
-    'acc' : '%',
+    'time': 's',
+    'acc': '%',
 }

 TRACK_SETUP = {
-    'time' : setup_track_time,
-    'acc' : setup_track_acc,
+    'time': setup_track_time,
+    'acc': setup_track_acc,
 }

 def parametrize(param_name, params):
     """Decorator for benchmarking over a set of parameters.
@@ -297,6 +333,40 @@ def parametrize(param_name, params):
         return func
     return _wrapper

+
+class TestFilter:
+    def __init__(self):
+        self.conf = None
+        if "DGL_REG_CONF" in os.environ:
+            current_dir = os.path.dirname(os.path.abspath(__file__))
+            path = os.path.join(current_dir, "../../",
+                                os.environ["DGL_REG_CONF"])
+            with open(path, "r") as f:
+                self.conf = json.load(f)
+            if "INSTANCE_TYPE" in os.environ:
+                instance_type = os.environ["INSTANCE_TYPE"]
+            else:
+                raise Exception(
+                    "Must set both DGL_REG_CONF and INSTANCE_TYPE as env")
+            self.enabled_tests = self.conf[instance_type]["tests"]
+        else:
+            import logging
+            logging.warning("No regression test conf file specified")
+
+    def check(self, func):
+        funcfullname = inspect.getmodule(func).__name__ + "." + func.__name__
+        if self.conf is None:
+            return True
+        else:
+            for enabled_testname in self.enabled_tests:
+                if enabled_testname in funcfullname:
+                    return True
+        return False
+
+
+filter = TestFilter()
+
+
 def benchmark(track_type, timeout=60):
     """Decorator for indicating the benchmark type.
@@ -319,9 +389,13 @@ def benchmark(track_type, timeout=60):
             pass
     """
     assert track_type in ['time', 'acc']
+
     def _wrapper(func):
         func.unit = TRACK_UNITS[track_type]
         func.setup = TRACK_SETUP[track_type]
         func.timeout = timeout
+        if not filter.check(func):
+            # skip if not enabled
+            func.benchmark_name = "skip_" + func.__name__
         return func
     return _wrapper
@@ -13,6 +13,7 @@ pip install asv
 pip uninstall -y dgl
 export DGL_BENCH_DEVICE=$DEVICE
+echo "DGL_BENCH_DEVICE=$DGL_BENCH_DEVICE"

 pushd $ROOT/benchmarks
 cat asv.conf.json
 asv machine --yes
...
Regression Test Suite
========================
### Spec of task.json
```json
# Note: a test runs if any keyword specified below is a substring of its full test name.
# The full name of "benchmarks/model_acc/bench_sage_ns.track_acc" is "model_acc.bench_sage_ns.track_acc".
# For example, "model_acc" runs all tests under the "model_acc" folder,
# "bench_sage" runs both "bench_sage" and "bench_sage_ns",
# "bench_sage." runs only "bench_sage",
# "ns" runs any test whose name contains "ns",
# and "" runs all tests.
{
"c5.9xlarge": { # The instance type to run the test
"tests": [
"bench_sage" # The test to be run on this instance
],
"env": {
"DEVICE": "cpu" # The environment variable passed to publish.sh
}
},
"g4dn.2xlarge": {
...
}
}
```
### Environment variables
- `MOUNT_PATH`: host directory to be mounted into Docker. If set, `MOUNT_PATH` (on the host) is mapped to `/tmp/dataset` (inside the container).
- `INSTANCE_TYPE`: the current instance type.
- `DGL_REG_CONF`: path to `task.json`, relative to the repo root. If set, `INSTANCE_TYPE` must also be set. See the example invocation below.
\ No newline at end of file
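For example, a CPU regression run might be launched like this. This is only a sketch: the `task.json` path, machine name, and mount directory are placeholders, and `publish.sh` is assumed to be the Docker launch script that this suite's comments refer to.

```bash
# Hypothetical invocation; adjust the paths and instance type to your setup.
export DGL_REG_CONF=benchmarks/task.json   # path to task.json, relative to the repo root
export INSTANCE_TYPE=c5.9xlarge            # must match a key in task.json
export MOUNT_PATH=/data/dgl-regression     # optional: host dataset cache mapped to /tmp/dataset
bash publish.sh my-reg-machine cpu         # <machine name> <device>
```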
@@ -4,8 +4,15 @@ set -e
 . /opt/conda/etc/profile.d/conda.sh

+# Default building only with cpu
+DEVICE=${DGL_BENCH_DEVICE:-cpu}
+
 # build
-CMAKE_VARS="-DUSE_CUDA=ON"
+if [[ $DEVICE == "cpu" ]]; then
+    CMAKE_VARS=""
+else
+    CMAKE_VARS="-DUSE_CUDA=ON"
+fi
 mkdir -p build
 pushd build
 cmake $CMAKE_VARS ..
...
@@ -7,6 +7,7 @@ set -e
 pip install -r /asv/torch_gpu_pip.txt
 pip install pandas rdflib ogb

 # install
 pushd python
 rm -rf build *.egg-info dist
...
@@ -17,7 +17,6 @@
 # the host machine.
 #
-
 if [ $# -eq 2 ]; then
     MACHINE=$1
     DEVICE=$2
@@ -27,15 +26,51 @@ else
 fi

 WS_ROOT=/asv/dgl
-docker run --name dgl-reg \
-    --rm --runtime=nvidia \
-    --hostname=$MACHINE -dit dgllib/dgl-ci-gpu:conda /bin/bash
+docker pull dgllib/dgl-ci-gpu:conda
+
+if [ -z "$DGL_REG_CONF"]; then
+    DOCKER_ENV_OPT="$DOCKER_ENV_OPT"
+else
+    DOCKER_ENV_OPT=" -e DGL_REG_CONF=$DGL_REG_CONF $DOCKER_ENV_OPT"
+fi
+
+if [ -z "$INSTANCE_TYPE"]; then
+    DOCKER_ENV_OPT="$DOCKER_ENV_OPT"
+else
+    DOCKER_ENV_OPT=" -e INSTANCE_TYPE=$INSTANCE_TYPE $DOCKER_ENV_OPT"
+fi
+
+if [ -z "$MOUNT_PATH"]; then
+    DOCKER_MOUNT_OPT=""
+else
+    DOCKER_MOUNT_OPT="-v ${MOUNT_PATH}:/tmp/dataset -v ${MOUNT_PATH}/dgl_home/:/root/.dgl/"
+fi
+
+echo $HOME
+echo "Mount Point: ${DOCKER_MOUNT_OPT}"
+echo "Env opt: ${DOCKER_ENV_OPT}"
+echo "DEVICE: ${DEVICE}"
+
+if [[ $DEVICE == "cpu" ]]; then
+    docker run --name dgl-reg \
+        --rm \
+        $DOCKER_MOUNT_OPT \
+        $DOCKER_ENV_OPT \
+        --shm-size="4g" \
+        --hostname=$MACHINE -dit dgllib/dgl-ci-gpu:conda /bin/bash
+else
+    docker run --name dgl-reg \
+        --rm --runtime=nvidia \
+        $DOCKER_MOUNT_OPT \
+        $DOCKER_ENV_OPT \
+        --shm-size="4g" \
+        --hostname=$MACHINE -dit dgllib/dgl-ci-gpu:conda /bin/bash
+fi
+
 docker exec dgl-reg mkdir -p $WS_ROOT
-docker cp ../.git dgl-reg:$WS_ROOT
-docker cp . dgl-reg:$WS_ROOT/benchmarks/
+docker cp ../../.git dgl-reg:$WS_ROOT
+docker cp ../ dgl-reg:$WS_ROOT/benchmarks/
 docker cp torch_gpu_pip.txt dgl-reg:/asv
-docker exec dgl-reg bash $WS_ROOT/benchmarks/run.sh $DEVICE
-docker cp dgl-reg:$WS_ROOT/benchmarks/results .
-docker cp dgl-reg:$WS_ROOT/benchmarks/html .
+docker exec $DOCKER_ENV_OPT dgl-reg bash $WS_ROOT/benchmarks/run.sh $DEVICE
+docker cp dgl-reg:$WS_ROOT/benchmarks/results ../
+docker cp dgl-reg:$WS_ROOT/benchmarks/html ../
 docker stop dgl-reg
@@ -10,4 +10,7 @@ networkx
 matplotlib
 nltk
 requests[security]
-tqdm
\ No newline at end of file
+tqdm
+awscli
+# 0.6.0 is for pytorch 1.5
+torchtext==0.6.0
\ No newline at end of file
{
"c5.9xlarge": {
"tests": [
""
],
"env": {
"DEVICE": "cpu"
}
},
"g4dn.2xlarge": {
"tests": [
""
],
"env": {
"DEVICE": "gpu"
}
}
}
\ No newline at end of file
-import argparse, time
+import argparse
+import time
 import numpy as np
 import networkx as nx
 import torch
@@ -12,6 +13,7 @@ from gcn import GCN
 #from gcn_mp import GCN
 #from gcn_spmv import GCN

+
 def evaluate(model, features, labels, mask):
     model.eval()
     with torch.no_grad():
@@ -22,6 +24,7 @@ def evaluate(model, features, labels, mask):
         correct = torch.sum(indices == labels)
         return correct.item() * 1.0 / len(labels)

+
 def main(args):
     # load and preprocess dataset
     if args.dataset == 'cora':
@@ -122,21 +125,21 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='GCN')
     register_data_args(parser)
     parser.add_argument("--dropout", type=float, default=0.5,
                         help="dropout probability")
     parser.add_argument("--gpu", type=int, default=-1,
                         help="gpu")
     parser.add_argument("--lr", type=float, default=1e-2,
                         help="learning rate")
     parser.add_argument("--n-epochs", type=int, default=200,
                         help="number of training epochs")
     parser.add_argument("--n-hidden", type=int, default=16,
                         help="number of hidden gcn units")
     parser.add_argument("--n-layers", type=int, default=1,
                         help="number of hidden gcn layers")
     parser.add_argument("--weight-decay", type=float, default=5e-4,
                         help="Weight for L2 loss")
     parser.add_argument("--self-loop", action='store_true',
                         help="graph self-loop (default=False)")
     parser.set_defaults(self_loop=False)
     args = parser.parse_args()
     print(args)
...