"...text-generation-inference.git" did not exist on "7de8a377b067af5d9133874b88f5b0a37452a5eb"
Unverified commit 927d2b31, authored by Mufei Li, committed by GitHub

[Example] Move Data to GPU before Minibatch Training (#2453)

parent 72ef642f
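
The diff below applies the same pattern to every example script: node features and labels are popped out of `g.ndata` into plain tensors, optionally moved to the GPU once up front (controllable with the new `--data-cpu` flag), and then sliced per minibatch inside `load_subtensor`. A minimal sketch of that pattern, using a toy random graph and a `data_cpu` variable standing in for the scripts' argparse option (not part of the diff itself):

```python
import dgl
import torch as th

def load_subtensor(nfeat, labels, seeds, input_nodes, device):
    """Extract features/labels for a subset of nodes; .to() is a no-op if already on device."""
    batch_inputs = nfeat[input_nodes].to(device)
    batch_labels = labels[seeds].to(device)
    return batch_inputs, batch_labels

device = th.device('cuda' if th.cuda.is_available() else 'cpu')
data_cpu = False                      # stand-in for the new --data-cpu flag

g = dgl.rand_graph(1000, 5000)        # toy graph (assumption; the scripts load Reddit/OGB)
g.ndata['features'] = th.randn(g.num_nodes(), 16)
g.ndata['labels'] = th.randint(0, 4, (g.num_nodes(),))

# Detach features/labels from the graph; the graph itself stays on CPU for sampling.
nfeat = g.ndata.pop('features')
labels = g.ndata.pop('labels')
if not data_cpu:
    nfeat = nfeat.to(device)          # one bulk copy instead of one copy per minibatch
    labels = labels.to(device)

sampler = dgl.dataloading.MultiLayerNeighborSampler([10, 25])
dataloader = dgl.dataloading.NodeDataLoader(
    g, th.arange(g.num_nodes()), sampler,
    batch_size=256, shuffle=True, drop_last=False, num_workers=0)

for input_nodes, seeds, blocks in dataloader:
    blocks = [b.int().to(device) for b in blocks]
    batch_inputs, batch_labels = load_subtensor(nfeat, labels, seeds, input_nodes, device)
    # ... forward/backward on batch_inputs / batch_labels ...
```
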
......@@ -4,19 +4,12 @@ import torch as th
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.multiprocessing as mp
from torch.utils.data import DataLoader
import dgl.function as fn
import dgl.nn.pytorch as dglnn
import time
import argparse
from _thread import start_new_thread
from functools import wraps
from dgl.data import RedditDataset
import tqdm
import traceback
from load_graph import load_reddit, load_ogb, inductive_split
from load_graph import load_reddit, inductive_split
class SAGE(nn.Module):
def __init__(self,
......@@ -47,7 +40,7 @@ class SAGE(nn.Module):
h = self.dropout(h)
return h
def inference(self, g, x, batch_size, device):
def inference(self, g, x, device):
"""
Inference with the GraphSAGE model on full neighbors (i.e. without neighbor sampling).
g : the entire graph.
......@@ -62,12 +55,12 @@ class SAGE(nn.Module):
# on each layer are of course splitted in batches.
# TODO: can we standardize this?
for l, layer in enumerate(self.layers):
y = th.zeros(g.number_of_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes)
y = th.zeros(g.num_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes)
sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
dataloader = dgl.dataloading.NodeDataLoader(
g,
th.arange(g.number_of_nodes()),
th.arange(g.num_nodes()),
sampler,
batch_size=args.batch_size,
shuffle=True,
......@@ -96,34 +89,35 @@ def compute_acc(pred, labels):
labels = labels.long()
return (th.argmax(pred, dim=1) == labels).float().sum() / len(pred)
def evaluate(model, g, inputs, labels, val_nid, batch_size, device):
def evaluate(model, g, nfeat, labels, val_nid, device):
"""
Evaluate the model on the validation set specified by ``val_nid``.
g : The entire graph.
inputs : The features of all the nodes.
labels : The labels of all the nodes.
val_nid : the node Ids for validation.
batch_size : Number of nodes to compute at the same time.
device : The GPU device to evaluate on.
"""
model.eval()
with th.no_grad():
pred = model.inference(g, inputs, batch_size, device)
pred = model.inference(g, nfeat, device)
model.train()
return compute_acc(pred[val_nid], labels[val_nid])
return compute_acc(pred[val_nid], labels[val_nid].to(pred.device))
def load_subtensor(g, seeds, input_nodes, device):
def load_subtensor(nfeat, labels, seeds, input_nodes, device):
"""
Copys features and labels of a set of nodes onto GPU.
Extracts features and labels for a subset of nodes
"""
batch_inputs = g.ndata['features'][input_nodes].to(device)
batch_labels = g.ndata['labels'][seeds].to(device)
batch_inputs = nfeat[input_nodes].to(device)
batch_labels = labels[seeds].to(device)
return batch_inputs, batch_labels
#### Entry point
def run(args, device, data):
# Unpack data
in_feats, n_classes, train_g, val_g, test_g = data
n_classes, train_g, val_g, test_g, train_nfeat, train_labels, \
val_nfeat, val_labels, test_nfeat, test_labels = data
in_feats = train_nfeat.shape[1]
train_nid = th.nonzero(train_g.ndata['train_mask'], as_tuple=True)[0]
val_nid = th.nonzero(val_g.ndata['val_mask'], as_tuple=True)[0]
test_nid = th.nonzero(~(test_g.ndata['train_mask'] | test_g.ndata['val_mask']), as_tuple=True)[0]
......@@ -144,7 +138,6 @@ def run(args, device, data):
model = SAGE(in_feats, args.num_hidden, n_classes, args.num_layers, F.relu, args.dropout)
model = model.to(device)
loss_fcn = nn.CrossEntropyLoss()
loss_fcn = loss_fcn.to(device)
optimizer = optim.Adam(model.parameters(), lr=args.lr)
# Training loop
......@@ -158,10 +151,9 @@ def run(args, device, data):
tic_step = time.time()
for step, (input_nodes, seeds, blocks) in enumerate(dataloader):
# Load the input features as well as output labels
#batch_inputs, batch_labels = load_subtensor(train_g, seeds, input_nodes, device)
batch_inputs, batch_labels = load_subtensor(train_nfeat, train_labels,
seeds, input_nodes, device)
blocks = [block.int().to(device) for block in blocks]
batch_inputs = blocks[0].srcdata['features']
batch_labels = blocks[-1].dstdata['labels']
# Compute loss and prediction
batch_pred = model(blocks, batch_inputs)
......@@ -183,9 +175,9 @@ def run(args, device, data):
if epoch >= 5:
avg += toc - tic
if epoch % args.eval_every == 0 and epoch != 0:
eval_acc = evaluate(model, val_g, val_g.ndata['features'], val_g.ndata['labels'], val_nid, args.batch_size, device)
eval_acc = evaluate(model, val_g, val_nfeat, val_labels, val_nid, device)
print('Eval Acc {:.4f}'.format(eval_acc))
test_acc = evaluate(model, test_g, test_g.ndata['features'], test_g.ndata['labels'], test_nid, args.batch_size, device)
test_acc = evaluate(model, test_g, test_nfeat, test_labels, test_nid, device)
print('Test Acc: {:.4f}'.format(test_acc))
print('Avg epoch time: {}'.format(avg / (epoch - 4)))
......@@ -193,7 +185,7 @@ def run(args, device, data):
if __name__ == '__main__':
argparser = argparse.ArgumentParser("multi-gpu training")
argparser.add_argument('--gpu', type=int, default=0,
help="GPU device ID. Use -1 for CPU training")
help="GPU device ID. Use -1 for CPU training")
argparser.add_argument('--dataset', type=str, default='reddit')
argparser.add_argument('--num-epochs', type=int, default=20)
argparser.add_argument('--num-hidden', type=int, default=16)
......@@ -205,9 +197,14 @@ if __name__ == '__main__':
argparser.add_argument('--lr', type=float, default=0.003)
argparser.add_argument('--dropout', type=float, default=0.5)
argparser.add_argument('--num-workers', type=int, default=4,
help="Number of sampling processes. Use 0 for no extra process.")
help="Number of sampling processes. Use 0 for no extra process.")
argparser.add_argument('--inductive', action='store_true',
help="Inductive learning setting")
help="Inductive learning setting")
argparser.add_argument('--data-cpu', action='store_true',
help="By default the script puts all node features and labels "
"on GPU when using it to save time for data copy. This may "
"be undesired if they cannot fit in GPU memory at once. "
"This flag disables that.")
args = argparser.parse_args()
if args.gpu >= 0:
......@@ -217,17 +214,25 @@ if __name__ == '__main__':
if args.dataset == 'reddit':
g, n_classes = load_reddit()
elif args.dataset == 'ogb-product':
g, n_classes = load_ogb('ogbn-products')
else:
raise Exception('unknown dataset')
in_feats = g.ndata['features'].shape[1]
if args.inductive:
train_g, val_g, test_g = inductive_split(g)
train_nfeat = train_g.ndata.pop('features')
val_nfeat = val_g.ndata.pop('features')
test_nfeat = test_g.ndata.pop('features')
train_labels = train_g.ndata.pop('labels')
val_labels = val_g.ndata.pop('labels')
test_labels = test_g.ndata.pop('labels')
else:
train_g = val_g = test_g = g
train_nfeat = val_nfeat = test_nfeat = g.ndata.pop('features')
train_labels = val_labels = test_labels = g.ndata.pop('labels')
if not args.data_cpu:
train_nfeat = train_nfeat.to(device)
train_labels = train_labels.to(device)
# Create csr/coo/csc formats before launching training processes with multi-gpu.
# This avoids creating certain formats in each sub-process, which saves momory and CPU.
......@@ -235,6 +240,7 @@ if __name__ == '__main__':
val_g.create_formats_()
test_g.create_formats_()
# Pack data
data = in_feats, n_classes, train_g, val_g, test_g
data = n_classes, train_g, val_g, test_g, train_nfeat, train_labels, \
val_nfeat, val_labels, test_nfeat, test_labels
run(args, device, data)
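
One subtlety in this first script: `model.inference` now returns predictions on whatever device the node features live on, while the validation/test labels may still sit on the CPU (for instance when `--data-cpu` is set), hence the added `labels[val_nid].to(pred.device)` in `evaluate`. A small self-contained illustration of that guard, with toy tensors standing in for the real predictions and labels:

```python
import torch as th

def compute_acc(pred, labels):
    labels = labels.long()
    return (th.argmax(pred, dim=1) == labels).float().sum() / len(pred)

pred = th.randn(4, 3)                     # stand-in for model.inference(...) output
labels = th.tensor([0, 2, 1, 0])          # labels possibly left on the host
val_nid = th.tensor([1, 3])

# Move only the compared slice of labels onto the prediction's device.
acc = compute_acc(pred[val_nid], labels[val_nid].to(pred.device))
```
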
......@@ -5,16 +5,12 @@ import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.multiprocessing as mp
from torch.utils.data import DataLoader
import dgl.function as fn
import dgl.nn.pytorch as dglnn
import time
import math
import argparse
from dgl.data import RedditDataset
from torch.nn.parallel import DistributedDataParallel
import tqdm
import traceback
from utils import thread_wrapped_func
from load_graph import load_reddit, inductive_split
......@@ -48,7 +44,7 @@ class SAGE(nn.Module):
h = self.dropout(h)
return h
def inference(self, g, x, batch_size, device):
def inference(self, g, x, device):
"""
Inference with the GraphSAGE model on full neighbors (i.e. without neighbor sampling).
g : the entire graph.
......@@ -62,14 +58,13 @@ class SAGE(nn.Module):
# Therefore, we compute the representation of all nodes layer by layer. The nodes
# on each layer are of course splitted in batches.
# TODO: can we standardize this?
nodes = th.arange(g.number_of_nodes())
for l, layer in enumerate(self.layers):
y = th.zeros(g.number_of_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes)
y = th.zeros(g.num_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes)
sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
dataloader = dgl.dataloading.NodeDataLoader(
g,
th.arange(g.number_of_nodes()),
th.arange(g.num_nodes()),
sampler,
batch_size=args.batch_size,
shuffle=True,
......@@ -97,27 +92,26 @@ def compute_acc(pred, labels):
"""
return (th.argmax(pred, dim=1) == labels).float().sum() / len(pred)
def evaluate(model, g, inputs, labels, val_nid, batch_size, device):
def evaluate(model, g, nfeat, labels, val_nid, device):
"""
Evaluate the model on the validation set specified by ``val_nid``.
g : The entire graph.
inputs : The features of all the nodes.
labels : The labels of all the nodes.
val_nid : A node ID tensor indicating which nodes do we actually compute the accuracy for.
batch_size : Number of nodes to compute at the same time.
device : The GPU device to evaluate on.
"""
model.eval()
with th.no_grad():
pred = model.inference(g, inputs, batch_size, device)
pred = model.inference(g, nfeat, device)
model.train()
return compute_acc(pred[val_nid], labels[val_nid])
def load_subtensor(g, labels, seeds, input_nodes, dev_id):
def load_subtensor(nfeat, labels, seeds, input_nodes, dev_id):
"""
Copys features and labels of a set of nodes onto GPU.
Extracts features and labels for a subset of nodes.
"""
batch_inputs = g.ndata['features'][input_nodes].to(dev_id)
batch_inputs = nfeat[input_nodes].to(dev_id)
batch_labels = labels[seeds].to(dev_id)
return batch_inputs, batch_labels
......@@ -137,7 +131,25 @@ def run(proc_id, n_gpus, args, devices, data):
th.cuda.set_device(dev_id)
# Unpack data
in_feats, n_classes, train_g, val_g, test_g = data
n_classes, train_g, val_g, test_g = data
if args.inductive:
train_nfeat = train_g.ndata.pop('features')
val_nfeat = val_g.ndata.pop('features')
test_nfeat = test_g.ndata.pop('features')
train_labels = train_g.ndata.pop('labels')
val_labels = val_g.ndata.pop('labels')
test_labels = test_g.ndata.pop('labels')
else:
train_nfeat = val_nfeat = test_nfeat = g.ndata.pop('features')
train_labels = val_labels = test_labels = g.ndata.pop('labels')
if not args.data_cpu:
train_nfeat = train_nfeat.to(dev_id)
train_labels = train_labels.to(dev_id)
in_feats = train_nfeat.shape[1]
train_mask = train_g.ndata['train_mask']
val_mask = val_g.ndata['val_mask']
test_mask = ~(test_g.ndata['train_mask'] | test_g.ndata['val_mask'])
......@@ -166,7 +178,6 @@ def run(proc_id, n_gpus, args, devices, data):
if n_gpus > 1:
model = DistributedDataParallel(model, device_ids=[dev_id], output_device=dev_id)
loss_fcn = nn.CrossEntropyLoss()
loss_fcn = loss_fcn.to(dev_id)
optimizer = optim.Adam(model.parameters(), lr=args.lr)
# Training loop
......@@ -182,7 +193,8 @@ def run(proc_id, n_gpus, args, devices, data):
tic_step = time.time()
# Load the input features as well as output labels
batch_inputs, batch_labels = load_subtensor(train_g, train_g.ndata['labels'], seeds, input_nodes, dev_id)
batch_inputs, batch_labels = load_subtensor(train_nfeat, train_labels,
seeds, input_nodes, dev_id)
blocks = [block.int().to(dev_id) for block in blocks]
# Compute loss and prediction
batch_pred = model(blocks, batch_inputs)
......@@ -209,14 +221,14 @@ def run(proc_id, n_gpus, args, devices, data):
if epoch % args.eval_every == 0 and epoch != 0:
if n_gpus == 1:
eval_acc = evaluate(
model, val_g, val_g.ndata['features'], val_g.ndata['labels'], val_nid, args.batch_size, devices[0])
model, val_g, val_nfeat, val_labels, val_nid, devices[0])
test_acc = evaluate(
model, test_g, test_g.ndata['features'], test_g.ndata['labels'], test_nid, args.batch_size, devices[0])
model, test_g, test_nfeat, test_labels, test_nid, devices[0])
else:
eval_acc = evaluate(
model.module, val_g, val_g.ndata['features'], val_g.ndata['labels'], val_nid, args.batch_size, devices[0])
model.module, val_g, val_nfeat, val_labels, val_nid, devices[0])
test_acc = evaluate(
model.module, test_g, test_g.ndata['features'], test_g.ndata['labels'], test_nid, args.batch_size, devices[0])
model.module, test_g, test_nfeat, test_labels, test_nid, devices[0])
print('Eval Acc {:.4f}'.format(eval_acc))
print('Test Acc: {:.4f}'.format(test_acc))
......@@ -229,7 +241,7 @@ def run(proc_id, n_gpus, args, devices, data):
if __name__ == '__main__':
argparser = argparse.ArgumentParser("multi-gpu training")
argparser.add_argument('--gpu', type=str, default='0',
help="Comma separated list of GPU device IDs.")
help="Comma separated list of GPU device IDs.")
argparser.add_argument('--num-epochs', type=int, default=20)
argparser.add_argument('--num-hidden', type=int, default=16)
argparser.add_argument('--num-layers', type=int, default=2)
......@@ -240,9 +252,14 @@ if __name__ == '__main__':
argparser.add_argument('--lr', type=float, default=0.003)
argparser.add_argument('--dropout', type=float, default=0.5)
argparser.add_argument('--num-workers', type=int, default=0,
help="Number of sampling processes. Use 0 for no extra process.")
help="Number of sampling processes. Use 0 for no extra process.")
argparser.add_argument('--inductive', action='store_true',
help="Inductive learning setting")
help="Inductive learning setting")
argparser.add_argument('--data-cpu', action='store_true',
help="By default the script puts all node features and labels "
"on GPU when using it to save time for data copy. This may "
"be undesired if they cannot fit in GPU memory at once. "
"This flag disables that.")
args = argparser.parse_args()
devices = list(map(int, args.gpu.split(',')))
......@@ -251,7 +268,6 @@ if __name__ == '__main__':
g, n_classes = load_reddit()
# Construct graph
g = dgl.as_heterograph(g)
in_feats = g.ndata['features'].shape[1]
if args.inductive:
train_g, val_g, test_g = inductive_split(g)
......@@ -264,7 +280,7 @@ if __name__ == '__main__':
val_g.create_formats_()
test_g.create_formats_()
# Pack data
data = in_feats, n_classes, train_g, val_g, test_g
data = n_classes, train_g, val_g, test_g
if n_gpus == 1:
run(0, n_gpus, args, devices, data)
......
......@@ -5,17 +5,13 @@ import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.multiprocessing as mp
from torch.utils.data import DataLoader
import dgl.function as fn
import dgl.nn.pytorch as dglnn
import time
import argparse
from _thread import start_new_thread
from functools import wraps
from dgl.data import RedditDataset
from torch.nn.parallel import DistributedDataParallel
import tqdm
import traceback
import sklearn.linear_model as lm
import sklearn.metrics as skm
......@@ -38,13 +34,6 @@ class NegativeSampler(object):
src = src.repeat_interleave(self.k)
return src, dst
def load_subtensor(g, input_nodes, device):
"""
Copys features and labels of a set of nodes onto GPU.
"""
batch_inputs = g.ndata['features'][input_nodes].to(device)
return batch_inputs
class SAGE(nn.Module):
def __init__(self,
in_feats,
......@@ -74,7 +63,7 @@ class SAGE(nn.Module):
h = self.dropout(h)
return h
def inference(self, g, x, batch_size, device):
def inference(self, g, x, device):
"""
Inference with the GraphSAGE model on full neighbors (i.e. without neighbor sampling).
g : the entire graph.
......@@ -88,14 +77,13 @@ class SAGE(nn.Module):
# Therefore, we compute the representation of all nodes layer by layer. The nodes
# on each layer are of course splitted in batches.
# TODO: can we standardize this?
nodes = th.arange(g.number_of_nodes())
for l, layer in enumerate(self.layers):
y = th.zeros(g.number_of_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes)
y = th.zeros(g.num_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes)
sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
dataloader = dgl.dataloading.NodeDataLoader(
g,
th.arange(g.number_of_nodes()),
th.arange(g.num_nodes()),
sampler,
batch_size=args.batch_size,
shuffle=True,
......@@ -155,24 +143,23 @@ def compute_acc(emb, labels, train_nids, val_nids, test_nids):
f1_micro_test = skm.f1_score(test_labels, pred[test_nids], average='micro')
return f1_micro_eval, f1_micro_test
def evaluate(model, g, inputs, labels, train_nids, val_nids, test_nids, batch_size, device):
def evaluate(model, g, nfeat, labels, train_nids, val_nids, test_nids, device):
"""
Evaluate the model on the validation set specified by ``val_mask``.
g : The entire graph.
inputs : The features of all the nodes.
labels : The labels of all the nodes.
val_mask : A 0-1 mask indicating which nodes do we actually compute the accuracy for.
batch_size : Number of nodes to compute at the same time.
device : The GPU device to evaluate on.
"""
model.eval()
with th.no_grad():
# single gpu
if isinstance(model, SAGE):
pred = model.inference(g, inputs, batch_size, device)
pred = model.inference(g, nfeat, device)
# multi gpu
else:
pred = model.module.inference(g, inputs, batch_size, device)
pred = model.module.inference(g, nfeat, device)
model.train()
return compute_acc(pred, labels, train_nids, val_nids, test_nids)
......@@ -188,18 +175,17 @@ def run(proc_id, n_gpus, args, devices, data):
init_method=dist_init_method,
world_size=world_size,
rank=proc_id)
train_mask, val_mask, test_mask, in_feats, labels, n_classes, g = data
train_mask, val_mask, test_mask, n_classes, g = data
nfeat = g.ndata.pop('feat')
labels = g.ndata.pop('label')
in_feats = nfeat.shape[1]
train_nid = th.LongTensor(np.nonzero(train_mask)).squeeze()
val_nid = th.LongTensor(np.nonzero(val_mask)).squeeze()
test_nid = th.LongTensor(np.nonzero(test_mask)).squeeze()
#train_nid = th.LongTensor(np.nonzero(train_mask)[0])
#val_nid = th.LongTensor(np.nonzero(val_mask)[0])
#test_nid = th.LongTensor(np.nonzero(test_mask)[0])
# Create PyTorch DataLoader for constructing blocks
n_edges = g.number_of_edges()
n_edges = g.num_edges()
train_seeds = np.arange(n_edges)
if n_gpus > 0:
num_per_gpu = (train_seeds.shape[0] + n_gpus -1) // n_gpus
......@@ -230,7 +216,6 @@ def run(proc_id, n_gpus, args, devices, data):
if n_gpus > 1:
model = DistributedDataParallel(model, device_ids=[device], output_device=device)
loss_fcn = CrossEntropyLoss()
loss_fcn = loss_fcn.to(device)
optimizer = optim.Adam(model.parameters(), lr=args.lr)
# Training loop
......@@ -249,7 +234,7 @@ def run(proc_id, n_gpus, args, devices, data):
tic_step = time.time()
for step, (input_nodes, pos_graph, neg_graph, blocks) in enumerate(dataloader):
batch_inputs = load_subtensor(g, input_nodes, device)
batch_inputs = nfeat[input_nodes].to(device)
d_step = time.time()
pos_graph = pos_graph.to(device)
......@@ -263,8 +248,8 @@ def run(proc_id, n_gpus, args, devices, data):
optimizer.step()
t = time.time()
pos_edges = pos_graph.number_of_edges()
neg_edges = neg_graph.number_of_edges()
pos_edges = pos_graph.num_edges()
neg_edges = neg_graph.num_edges()
iter_pos.append(pos_edges / (t - tic_step))
iter_neg.append(neg_edges / (t - tic_step))
iter_d.append(d_step - tic_step)
......@@ -276,34 +261,37 @@ def run(proc_id, n_gpus, args, devices, data):
tic_step = time.time()
if step % args.eval_every == 0 and proc_id == 0:
eval_acc, test_acc = evaluate(model, g, g.ndata['features'], labels, train_nid, val_nid, test_nid, args.batch_size, device)
eval_acc, test_acc = evaluate(model, g, nfeat, labels, train_nid, val_nid, test_nid, device)
print('Eval Acc {:.4f} Test Acc {:.4f}'.format(eval_acc, test_acc))
if eval_acc > best_eval_acc:
best_eval_acc = eval_acc
best_test_acc = test_acc
print('Best Eval Acc {:.4f} Test Acc {:.4f}'.format(best_eval_acc, best_test_acc))
toc = time.time()
if proc_id == 0:
print('Epoch Time(s): {:.4f}'.format(toc - tic))
if epoch >= 5:
avg += toc - tic
if n_gpus > 1:
th.distributed.barrier()
print('Avg epoch time: {}'.format(avg / (epoch - 4)))
if proc_id == 0:
print('Avg epoch time: {}'.format(avg / (epoch - 4)))
def main(args, devices):
# load reddit data
data = RedditDataset(self_loop=False)
n_classes = data.num_classes
g = data[0]
features = g.ndata['feat']
in_feats = features.shape[1]
labels = g.ndata['label']
train_mask = g.ndata['train_mask']
val_mask = g.ndata['val_mask']
test_mask = g.ndata['test_mask']
g.ndata['features'] = features
# Create csr/coo/csc formats before launching training processes with multi-gpu.
# This avoids creating certain formats in each sub-process, which saves momory and CPU.
g.create_formats_()
# Pack data
data = train_mask, val_mask, test_mask, in_feats, labels, n_classes, g
data = train_mask, val_mask, test_mask, n_classes, g
n_gpus = len(devices)
if devices[0] == -1:
......@@ -324,13 +312,14 @@ def main(args, devices):
if __name__ == '__main__':
argparser = argparse.ArgumentParser("multi-gpu training")
argparser.add_argument("--gpu", type=str, default='0',
help="GPU, can be a list of gpus for multi-gpu trianing, e.g., 0,1,2,3; -1 for CPU")
help="GPU, can be a list of gpus for multi-gpu trianing,"
" e.g., 0,1,2,3; -1 for CPU")
argparser.add_argument('--num-epochs', type=int, default=20)
argparser.add_argument('--num-hidden', type=int, default=16)
argparser.add_argument('--num-layers', type=int, default=2)
argparser.add_argument('--num-negs', type=int, default=1)
argparser.add_argument('--neg-share', default=False, action='store_true',
help="sharing neg nodes for positive nodes")
help="sharing neg nodes for positive nodes")
argparser.add_argument('--fan-out', type=str, default='10,25')
argparser.add_argument('--batch-size', type=int, default=10000)
argparser.add_argument('--log-every', type=int, default=20)
......@@ -338,7 +327,7 @@ if __name__ == '__main__':
argparser.add_argument('--lr', type=float, default=0.003)
argparser.add_argument('--dropout', type=float, default=0.5)
argparser.add_argument('--num-workers', type=int, default=0,
help="Number of sampling processes. Use 0 for no extra process.")
help="Number of sampling processes. Use 0 for no extra process.")
args = argparser.parse_args()
devices = list(map(int, args.gpu.split(',')))
......
......@@ -5,24 +5,15 @@ import torch as th
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.multiprocessing as mp
from torch.utils.data import DataLoader
import dgl.function as fn
import dgl.nn.pytorch as dglnn
import time
import argparse
from _thread import start_new_thread
from functools import wraps
from dgl.data import RedditDataset
import tqdm
import traceback
from ogb.nodeproppred import DglNodePropPredDataset
from sampler import ClusterIter, subgraph_collate_fn
#### Neighbor sampler
class GAT(nn.Module):
def __init__(self,
in_feats,
......@@ -79,16 +70,15 @@ class GAT(nn.Module):
layers.
"""
num_heads = self.num_heads
nodes = th.arange(g.number_of_nodes())
for l, layer in enumerate(self.layers):
if l < self.n_layers - 1:
y = th.zeros(g.number_of_nodes(), self.n_hidden * num_heads if l != len(self.layers) - 1 else self.n_classes)
y = th.zeros(g.num_nodes(), self.n_hidden * num_heads if l != len(self.layers) - 1 else self.n_classes)
else:
y = th.zeros(g.number_of_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes)
y = th.zeros(g.num_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes)
sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
dataloader = dgl.dataloading.NodeDataLoader(
g,
th.arange(g.number_of_nodes()),
th.arange(g.num_nodes()),
sampler,
batch_size=batch_size,
shuffle=False,
......@@ -98,7 +88,6 @@ class GAT(nn.Module):
for input_nodes, output_nodes, blocks in tqdm.tqdm(dataloader):
block = blocks[0].int().to(device)
h = x[input_nodes].to(device)
h_dst = h[:block.number_of_dst_nodes()].to(device)
if l < self.n_layers - 1:
h = layer(block, h).flatten(1)
else:
......@@ -116,7 +105,7 @@ def compute_acc(pred, labels):
"""
return (th.argmax(pred, dim=1) == labels).float().sum() / len(pred)
def evaluate(model, g, labels, val_nid, test_nid, batch_size, device):
def evaluate(model, g, nfeat, labels, val_nid, test_nid, batch_size, device):
"""
Evaluate the model on the validation set specified by ``val_mask``.
g : The entire graph.
......@@ -128,8 +117,7 @@ def evaluate(model, g, labels, val_nid, test_nid, batch_size, device):
"""
model.eval()
with th.no_grad():
inputs = g.ndata['feat']
pred = model.inference(g, inputs, batch_size, device)
pred = model.inference(g, nfeat, batch_size, device)
model.train()
return compute_acc(pred[val_nid], labels[val_nid]), compute_acc(pred[test_nid], labels[test_nid]), pred
......@@ -142,7 +130,8 @@ def model_param_summary(model):
def run(args, device, data):
# Unpack data
train_nid, val_nid, test_nid, in_feats, labels, n_classes, g, cluster_iterator = data
labels = labels.to(device)
nfeat = g.ndata.pop('feat').to(device)
# Define model and optimizer
model = GAT(in_feats, args.num_heads, args.num_hidden, n_classes, args.num_layers, F.relu, args.dropout)
......@@ -164,16 +153,18 @@ def run(args, device, data):
# blocks.
tic_start = time.time()
for step, cluster in enumerate(cluster_iterator):
cluster = cluster.int().to(device)
mask = cluster.ndata['train_mask']
mask = cluster.ndata.pop('train_mask')
if mask.sum() == 0:
continue
feat = cluster.ndata['feat']
batch_labels = cluster.ndata['labels']
cluster.edata.pop(dgl.EID)
cluster = cluster.int().to(device)
input_nodes = cluster.ndata[dgl.NID]
batch_inputs = nfeat[input_nodes]
batch_labels = labels[input_nodes]
tic_step = time.time()
# Compute loss and prediction
batch_pred = model(cluster, feat)
batch_pred = model(cluster, batch_inputs)
batch_pred = batch_pred[mask]
batch_labels = batch_labels[mask]
loss = nn.functional.nll_loss(batch_pred, batch_labels)
......@@ -199,7 +190,7 @@ def run(args, device, data):
avg += toc - tic
if epoch % args.eval_every == 0 and epoch != 0:
eval_acc, test_acc, pred = evaluate(model, g, labels, val_nid, test_nid, args.val_batch_size, device)
eval_acc, test_acc, pred = evaluate(model, g, nfeat, labels, val_nid, test_nid, args.val_batch_size, device)
model = model.to(device)
if args.save_pred:
np.savetxt(args.save_pred + '%02d' % epoch, pred.argmax(1).cpu().numpy(), '%d')
......@@ -229,6 +220,11 @@ if __name__ == '__main__':
argparser.add_argument('--wd', type=float, default=0)
argparser.add_argument('--num_partitions', type=int, default=15000)
argparser.add_argument('--num-workers', type=int, default=0)
argparser.add_argument('--data-cpu', action='store_true',
help="By default the script puts all node features and labels "
"on GPU when using it to save time for data copy. This may "
"be undesired if they cannot fit in GPU memory at once. "
"This flag disables that.")
args = argparser.parse_args()
if args.gpu >= 0:
......@@ -242,22 +238,15 @@ if __name__ == '__main__':
train_idx, val_idx, test_idx = splitted_idx['train'], splitted_idx['valid'], splitted_idx['test']
graph, labels = data[0]
labels = labels[:, 0]
print('Total edges before adding self-loop {}'.format(graph.number_of_edges()))
print('Total edges before adding self-loop {}'.format(graph.num_edges()))
graph = dgl.remove_self_loop(graph)
graph = dgl.add_self_loop(graph)
print('Total edges after adding self-loop {}'.format(graph.number_of_edges()))
print('Total edges after adding self-loop {}'.format(graph.num_edges()))
num_nodes = train_idx.shape[0] + val_idx.shape[0] + test_idx.shape[0]
assert num_nodes == graph.number_of_nodes()
graph.ndata['labels'] = labels
assert num_nodes == graph.num_nodes()
mask = th.zeros(num_nodes, dtype=th.bool)
mask[train_idx] = True
graph.ndata['train_mask'] = mask
mask = th.zeros(num_nodes, dtype=th.bool)
mask[val_idx] = True
graph.ndata['valid_mask'] = mask
mask = th.zeros(num_nodes, dtype=th.bool)
mask[test_idx] = True
graph.ndata['test_mask'] = mask
graph.in_degrees(0)
graph.out_degrees(0)
......@@ -265,7 +254,9 @@ if __name__ == '__main__':
cluster_iter_data = ClusterIter(
'ogbn-products', graph, args.num_partitions, args.batch_size)
cluster_iterator = DataLoader(cluster_iter_data, batch_size=args.batch_size, shuffle=True, pin_memory=True, num_workers=4, collate_fn=partial(subgraph_collate_fn, graph))
cluster_iterator = DataLoader(cluster_iter_data, batch_size=args.batch_size, shuffle=True,
pin_memory=True, num_workers=4,
collate_fn=partial(subgraph_collate_fn, graph))
in_feats = graph.ndata['feat'].shape[1]
n_classes = (labels.max() + 1).item()
......
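
In the cluster-GAT script above, features and labels are no longer read from `cluster.ndata`; instead each partition's parent node IDs, which the subgraph machinery stores under `dgl.NID`, index into tensors that may already live on the GPU. A hedged sketch of that lookup on a toy graph (using `dgl.node_subgraph` as a stand-in for one cluster produced by the script's partitioner):

```python
import dgl
import torch as th

g = dgl.rand_graph(100, 400)                          # toy parent graph (assumption)
nfeat = th.randn(g.num_nodes(), 16)                   # features kept outside g.ndata
labels = th.randint(0, 3, (g.num_nodes(),))

cluster = dgl.node_subgraph(g, th.arange(0, 10))      # stand-in for one partition
parent_ids = cluster.ndata[dgl.NID]                   # original node IDs of this cluster
batch_inputs = nfeat[parent_ids]                      # gather from the (possibly GPU) tensor
batch_labels = labels[parent_ids]
```
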
import os
import random
import dgl.function as fn
import torch
import time
from partition_utils import *
......
......@@ -4,17 +4,10 @@ import torch as th
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.multiprocessing as mp
from torch.utils.data import DataLoader
import dgl.function as fn
import dgl.nn.pytorch as dglnn
import time
import argparse
from _thread import start_new_thread
from functools import wraps
from dgl.data import RedditDataset
import tqdm
import traceback
from ogb.nodeproppred import DglNodePropPredDataset
......@@ -25,17 +18,18 @@ class GAT(nn.Module):
n_classes,
n_layers,
num_heads,
activation,
dropout):
activation):
super().__init__()
self.n_layers = n_layers
self.n_hidden = n_hidden
self.n_classes = n_classes
self.layers = nn.ModuleList()
self.layers.append(dglnn.GATConv((in_feats, in_feats), n_hidden, num_heads=num_heads, feat_drop=0., attn_drop=0., activation=activation, negative_slope=0.2))
self.layers.append(dglnn.GATConv((in_feats, in_feats), n_hidden, num_heads=num_heads, activation=activation))
for i in range(1, n_layers - 1):
self.layers.append(dglnn.GATConv((n_hidden * num_heads, n_hidden * num_heads), n_hidden, num_heads=num_heads, feat_drop=0., attn_drop=0., activation=activation, negative_slope=0.2))
self.layers.append(dglnn.GATConv((n_hidden * num_heads, n_hidden * num_heads), n_classes, num_heads=num_heads, feat_drop=0., attn_drop=0., activation=None, negative_slope=0.2))
self.layers.append(dglnn.GATConv((n_hidden * num_heads, n_hidden * num_heads), n_hidden,
num_heads=num_heads, activation=activation))
self.layers.append(dglnn.GATConv((n_hidden * num_heads, n_hidden * num_heads), n_classes,
num_heads=num_heads, activation=None))
def forward(self, blocks, x):
h = x
......@@ -44,7 +38,7 @@ class GAT(nn.Module):
# appropriate nodes on the LHS.
# Note that the shape of h is (num_nodes_LHS, D) and the shape of h_dst
# would be (num_nodes_RHS, D)
h_dst = h[:block.number_of_dst_nodes()]
h_dst = h[:block.num_dst_nodes()]
# Then we compute the updated representation on the RHS.
# The shape of h now becomes (num_nodes_RHS, D)
if l < self.n_layers - 1:
......@@ -54,7 +48,7 @@ class GAT(nn.Module):
h = h.mean(1)
return h.log_softmax(dim=-1)
def inference(self, g, x, batch_size, num_heads, device):
def inference(self, g, x, num_heads, device):
"""
Inference with the GAT model on full neighbors (i.e. without neighbor sampling).
g : the entire graph.
......@@ -67,17 +61,16 @@ class GAT(nn.Module):
# Therefore, we compute the representation of all nodes layer by layer. The nodes
# on each layer are of course splitted in batches.
# TODO: can we standardize this?
nodes = th.arange(g.number_of_nodes())
for l, layer in enumerate(self.layers):
if l < self.n_layers - 1:
y = th.zeros(g.number_of_nodes(), self.n_hidden * num_heads if l != len(self.layers) - 1 else self.n_classes)
y = th.zeros(g.num_nodes(), self.n_hidden * num_heads if l != len(self.layers) - 1 else self.n_classes)
else:
y = th.zeros(g.number_of_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes)
y = th.zeros(g.num_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes)
sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
dataloader = dgl.dataloading.NodeDataLoader(
g,
th.arange(g.number_of_nodes()),
th.arange(g.num_nodes()),
sampler,
batch_size=args.batch_size,
shuffle=True,
......@@ -88,7 +81,7 @@ class GAT(nn.Module):
block = blocks[0].int().to(device)
h = x[input_nodes].to(device)
h_dst = h[:block.number_of_dst_nodes()]
h_dst = h[:block.num_dst_nodes()]
if l < self.n_layers - 1:
h = layer(block, (h, h_dst)).flatten(1)
else:
......@@ -99,7 +92,7 @@ class GAT(nn.Module):
y[output_nodes] = h.cpu()
x = y
return y
return y.to(device)
def compute_acc(pred, labels):
"""
......@@ -107,7 +100,7 @@ def compute_acc(pred, labels):
"""
return (th.argmax(pred, dim=1) == labels).float().sum() / len(pred)
def evaluate(model, g, labels, val_nid, test_nid, batch_size, num_heads, device):
def evaluate(model, g, nfeat, labels, val_nid, test_nid, num_heads, device):
"""
Evaluate the model on the validation set specified by ``val_mask``.
g : The entire graph.
......@@ -119,23 +112,22 @@ def evaluate(model, g, labels, val_nid, test_nid, batch_size, num_heads, device)
"""
model.eval()
with th.no_grad():
inputs = g.ndata['feat']
pred = model.inference(g, inputs, batch_size, num_heads, device)
pred = model.inference(g, nfeat, num_heads, device)
model.train()
return compute_acc(pred[val_nid], labels[val_nid]), compute_acc(pred[test_nid], labels[test_nid]), pred
def load_subtensor(g, labels, seeds, input_nodes, device):
def load_subtensor(nfeat, labels, seeds, input_nodes):
"""
Copys features and labels of a set of nodes onto GPU.
Extracts features and labels for a set of nodes.
"""
batch_inputs = g.ndata['feat'][input_nodes].to(device)
batch_labels = labels[seeds].to(device)
batch_inputs = nfeat[input_nodes]
batch_labels = labels[seeds]
return batch_inputs, batch_labels
#### Entry point
def run(args, device, data):
# Unpack data
train_nid, val_nid, test_nid, in_feats, labels, n_classes, g, num_heads = data
train_nid, val_nid, test_nid, in_feats, labels, n_classes, nfeat, g, num_heads = data
# Create PyTorch DataLoader for constructing blocks
sampler = dgl.dataloading.MultiLayerNeighborSampler(
......@@ -150,7 +142,7 @@ def run(args, device, data):
num_workers=args.num_workers)
# Define model and optimizer
model = GAT(in_feats, args.num_hidden, n_classes, args.num_layers, num_heads, F.relu, args.dropout)
model = GAT(in_feats, args.num_hidden, n_classes, args.num_layers, num_heads, F.relu)
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wd)
......@@ -171,7 +163,7 @@ def run(args, device, data):
blocks = [blk.to(device) for blk in blocks]
# Load the input features as well as output labels
batch_inputs, batch_labels = load_subtensor(g, labels, seeds, input_nodes, device)
batch_inputs, batch_labels = load_subtensor(nfeat, labels, seeds, input_nodes)
# Compute loss and prediction
batch_pred = model(blocks, batch_inputs)
......@@ -192,7 +184,7 @@ def run(args, device, data):
if epoch >= 5:
avg += toc - tic
if epoch % args.eval_every == 0 and epoch != 0:
eval_acc, test_acc, pred = evaluate(model, g, labels, val_nid, test_nid, args.val_batch_size, num_heads, device)
eval_acc, test_acc, pred = evaluate(model, g, nfeat, labels, val_nid, test_nid, num_heads, device)
if args.save_pred:
np.savetxt(args.save_pred + '%02d' % epoch, pred.argmax(1).cpu().numpy(), '%d')
print('Eval Acc {:.4f}'.format(eval_acc))
......@@ -217,7 +209,6 @@ if __name__ == '__main__':
argparser.add_argument('--log-every', type=int, default=20)
argparser.add_argument('--eval-every', type=int, default=1)
argparser.add_argument('--lr', type=float, default=0.001)
argparser.add_argument('--dropout', type=float, default=0.5)
argparser.add_argument('--num-workers', type=int, default=8,
help="Number of sampling processes. Use 0 for no extra process.")
argparser.add_argument('--save-pred', type=str, default='')
......@@ -235,20 +226,21 @@ if __name__ == '__main__':
splitted_idx = data.get_idx_split()
train_idx, val_idx, test_idx = splitted_idx['train'], splitted_idx['valid'], splitted_idx['test']
graph, labels = data[0]
labels = labels[:, 0]
nfeat = graph.ndata.pop('feat').to(device)
labels = labels[:, 0].to(device)
print('Total edges before adding self-loop {}'.format(graph.number_of_edges()))
print('Total edges before adding self-loop {}'.format(graph.num_edges()))
graph = graph.remove_self_loop().add_self_loop()
print('Total edges after adding self-loop {}'.format(graph.number_of_edges()))
print('Total edges after adding self-loop {}'.format(graph.num_edges()))
in_feats = graph.ndata['feat'].shape[1]
in_feats = nfeat.shape[1]
n_classes = (labels.max() + 1).item()
# Create csr/coo/csc formats before launching sampling processes
# This avoids creating certain formats in each data loader process, which saves momory and CPU.
graph.create_formats_()
# Pack data
data = train_idx, val_idx, test_idx, in_feats, labels, n_classes, graph, args.head
data = train_idx, val_idx, test_idx, in_feats, labels, n_classes, nfeat, graph, args.head
# Run 10 times
test_accs = []
......
......@@ -4,17 +4,10 @@ import torch as th
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.multiprocessing as mp
from torch.utils.data import DataLoader
import dgl.function as fn
import dgl.nn.pytorch as dglnn
import time
import argparse
from _thread import start_new_thread
from functools import wraps
from dgl.data import RedditDataset
import tqdm
import traceback
from ogb.nodeproppred import DglNodePropPredDataset
class SAGE(nn.Module):
......@@ -44,7 +37,7 @@ class SAGE(nn.Module):
# appropriate nodes on the LHS.
# Note that the shape of h is (num_nodes_LHS, D) and the shape of h_dst
# would be (num_nodes_RHS, D)
h_dst = h[:block.number_of_dst_nodes()]
h_dst = h[:block.num_dst_nodes()]
# Then we compute the updated representation on the RHS.
# The shape of h now becomes (num_nodes_RHS, D)
h = layer(block, (h, h_dst))
......@@ -53,7 +46,7 @@ class SAGE(nn.Module):
h = self.dropout(h)
return h
def inference(self, g, x, batch_size, device):
def inference(self, g, x, device):
"""
Inference with the GraphSAGE model on full neighbors (i.e. without neighbor sampling).
g : the entire graph.
......@@ -66,14 +59,13 @@ class SAGE(nn.Module):
# Therefore, we compute the representation of all nodes layer by layer. The nodes
# on each layer are of course splitted in batches.
# TODO: can we standardize this?
nodes = th.arange(g.number_of_nodes())
for l, layer in enumerate(self.layers):
y = th.zeros(g.number_of_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes)
y = th.zeros(g.num_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes).to(device)
sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
dataloader = dgl.dataloading.NodeDataLoader(
g,
th.arange(g.number_of_nodes()),
th.arange(g.num_nodes()),
sampler,
batch_size=args.batch_size,
shuffle=True,
......@@ -83,14 +75,14 @@ class SAGE(nn.Module):
for input_nodes, output_nodes, blocks in tqdm.tqdm(dataloader):
block = blocks[0].int().to(device)
h = x[input_nodes].to(device)
h_dst = h[:block.number_of_dst_nodes()]
h = x[input_nodes]
h_dst = h[:block.num_dst_nodes()]
h = layer(block, (h, h_dst))
if l != len(self.layers) - 1:
h = self.activation(h)
h = self.dropout(h)
y[output_nodes] = h.cpu()
y[output_nodes] = h
x = y
return y
......@@ -101,35 +93,33 @@ def compute_acc(pred, labels):
"""
return (th.argmax(pred, dim=1) == labels).float().sum() / len(pred)
def evaluate(model, g, labels, val_nid, test_nid, batch_size, device):
def evaluate(model, g, nfeat, labels, val_nid, test_nid, device):
"""
Evaluate the model on the validation set specified by ``val_mask``.
g : The entire graph.
inputs : The features of all the nodes.
labels : The labels of all the nodes.
val_mask : A 0-1 mask indicating which nodes do we actually compute the accuracy for.
batch_size : Number of nodes to compute at the same time.
device : The GPU device to evaluate on.
"""
model.eval()
with th.no_grad():
inputs = g.ndata['feat']
pred = model.inference(g, inputs, batch_size, device)
pred = model.inference(g, nfeat, device)
model.train()
return compute_acc(pred[val_nid], labels[val_nid]), compute_acc(pred[test_nid], labels[test_nid]), pred
def load_subtensor(g, labels, seeds, input_nodes, device):
def load_subtensor(nfeat, labels, seeds, input_nodes):
"""
Copys features and labels of a set of nodes onto GPU.
Extracts features and labels for a set of nodes.
"""
batch_inputs = g.ndata['feat'][input_nodes].to(device)
batch_labels = labels[seeds].to(device)
batch_inputs = nfeat[input_nodes]
batch_labels = labels[seeds]
return batch_inputs, batch_labels
#### Entry point
def run(args, device, data):
# Unpack data
train_nid, val_nid, test_nid, in_feats, labels, n_classes, g = data
train_nid, val_nid, test_nid, in_feats, labels, n_classes, nfeat, g = data
# Create PyTorch DataLoader for constructing blocks
sampler = dgl.dataloading.MultiLayerNeighborSampler(
......@@ -147,7 +137,6 @@ def run(args, device, data):
model = SAGE(in_feats, args.num_hidden, n_classes, args.num_layers, F.relu, args.dropout)
model = model.to(device)
loss_fcn = nn.CrossEntropyLoss()
loss_fcn = loss_fcn.to(device)
optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wd)
# Training loop
......@@ -167,7 +156,7 @@ def run(args, device, data):
blocks = [blk.int().to(device) for blk in blocks]
# Load the input features as well as output labels
batch_inputs, batch_labels = load_subtensor(g, labels, seeds, input_nodes, device)
batch_inputs, batch_labels = load_subtensor(nfeat, labels, seeds, input_nodes)
# Compute loss and prediction
batch_pred = model(blocks, batch_inputs)
......@@ -188,7 +177,7 @@ def run(args, device, data):
if epoch >= 5:
avg += toc - tic
if epoch % args.eval_every == 0 and epoch != 0:
eval_acc, test_acc, pred = evaluate(model, g, labels, val_nid, test_nid, args.val_batch_size, device)
eval_acc, test_acc, pred = evaluate(model, g, nfeat, labels, val_nid, test_nid, device)
if args.save_pred:
np.savetxt(args.save_pred + '%02d' % epoch, pred.argmax(1).cpu().numpy(), '%d')
print('Eval Acc {:.4f}'.format(eval_acc))
......@@ -230,15 +219,16 @@ if __name__ == '__main__':
splitted_idx = data.get_idx_split()
train_idx, val_idx, test_idx = splitted_idx['train'], splitted_idx['valid'], splitted_idx['test']
graph, labels = data[0]
labels = labels[:, 0]
nfeat = graph.ndata.pop('feat').to(device)
labels = labels[:, 0].to(device)
in_feats = graph.ndata['feat'].shape[1]
in_feats = nfeat.shape[1]
n_classes = (labels.max() + 1).item()
# Create csr/coo/csc formats before launching sampling processes
# This avoids creating certain formats in each data loader process, which saves momory and CPU.
graph.create_formats_()
# Pack data
data = train_idx, val_idx, test_idx, in_feats, labels, n_classes, graph
data = train_idx, val_idx, test_idx, in_feats, labels, n_classes, nfeat, graph
# Run 10 times
test_accs = []
......
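
In the ogbn-products GraphSAGE script, layer-wise inference now keeps the per-layer output buffer `y` on the GPU, so activations are written with a device-local copy instead of an `h.cpu()` round trip; this assumes the full feature matrix fits in GPU memory. A condensed sketch of one such layer pass, using a toy graph and a single SAGEConv layer rather than the script's full model:

```python
import dgl
import dgl.nn.pytorch as dglnn
import torch as th

device = th.device('cuda' if th.cuda.is_available() else 'cpu')
g = dgl.rand_graph(200, 2000)                          # toy graph (assumption)
x = th.randn(g.num_nodes(), 8, device=device)          # node features already on the GPU
layer = dglnn.SAGEConv(8, 16, 'mean').to(device)

y = th.zeros(g.num_nodes(), 16, device=device)         # output buffer on the GPU (was CPU)
sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
dataloader = dgl.dataloading.NodeDataLoader(
    g, th.arange(g.num_nodes()), sampler,
    batch_size=64, shuffle=False, drop_last=False, num_workers=0)

with th.no_grad():
    for input_nodes, output_nodes, blocks in dataloader:
        block = blocks[0].int().to(device)
        h = x[input_nodes]                              # GPU gather, no host copy
        h_dst = h[:block.num_dst_nodes()]
        y[output_nodes] = layer(block, (h, h_dst))      # stays on the GPU, no .cpu()
```
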
......@@ -7,10 +7,7 @@ import itertools
import numpy as np
import time
import torch as th
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from functools import partial
import dgl
from dgl.data.rdf import AIFBDataset, MUTAGDataset, BGSDataset, AMDataset
......@@ -32,7 +29,7 @@ def evaluate(model, loader, node_embed, labels, category, device):
blocks = [blk.to(device) for blk in blocks]
seeds = seeds[category]
emb = extract_embed(node_embed, input_nodes)
emb = {k : e.to(device) for k, e in emb.items()}
emb = {k: e.to(device) for k, e in emb.items()}
lbl = labels[seeds].to(device)
logits = model(emb, blocks)[category]
loss = F.cross_entropy(logits, lbl)
......@@ -43,6 +40,13 @@ def evaluate(model, loader, node_embed, labels, category, device):
return total_loss / count, total_acc / count
def main(args):
# check cuda
device = 'cpu'
use_cuda = args.gpu >= 0 and th.cuda.is_available()
if use_cuda:
th.cuda.set_device(args.gpu)
device = 'cuda:%d' % args.gpu
# load graph data
if args.dataset == 'aifb':
dataset = AIFBDataset()
......@@ -71,19 +75,13 @@ def main(args):
else:
val_idx = train_idx
# check cuda
device = 'cpu'
use_cuda = args.gpu >= 0 and th.cuda.is_available()
if use_cuda:
th.cuda.set_device(args.gpu)
device = 'cuda:%d' % args.gpu
train_label = labels[train_idx]
val_label = labels[val_idx]
test_label = labels[test_idx]
# create embeddings
embed_layer = RelGraphEmbed(g, args.n_hidden)
if not args.data_cpu:
labels = labels.to(device)
embed_layer = embed_layer.to(device)
node_embed = embed_layer()
# create model
model = EntityClassify(g,
......@@ -187,6 +185,11 @@ if __name__ == '__main__':
help="Mini-batch size. If -1, use full graph training.")
parser.add_argument("--fanout", type=int, default=4,
help="Fan-out of neighbor sampling.")
parser.add_argument('--data-cpu', action='store_true',
help="By default the script puts all node features and labels "
"on GPU when using it to save time for data copy. This may "
"be undesired if they cannot fit in GPU memory at once. "
"This flag disables that.")
fp = parser.add_mutually_exclusive_group(required=False)
fp.add_argument('--validation', dest='validation', action='store_true')
fp.add_argument('--testing', dest='validation', action='store_false')
......