Unverified Commit 3f138eba authored by Quan (Andy) Gan's avatar Quan (Andy) Gan Committed by GitHub
Browse files

[Bugfix] Bug fixes in new dataloader (#3727)



* fixes

* fix

* more fixes

* update

* oops

* lint?

* temporarily revert - will fix in another PR

* more fixes

* skipping mxnet test

* address comments

* fix DDP

* fix edge dataloader exclusion problems

* stupid bug

* fix

* use_uvm option

* fix

* fixes

* fixes

* fixes

* fixes

* add evaluation for cluster gcn and ddp

* stupid bug again

* fixes

* move sanity checks to only support DGLGraphs

* pytorch lightning compatibility fixes

* remove

* poke

* more fixes

* fix

* fix

* disable test

* docstrings

* why is it getting a memory leak?

* fix

* update

* updates and temporarily disable forkingpickler

* update

* fix?

* fix?

* oops

* oops

* fix

* lint

* huh

* uh

* update

* fix

* made it memory efficient

* refine exclude interface

* fix tutorial

* fix tutorial

* fix graph duplication in CPU dataloader workers

* lint

* lint

* Revert "lint"

This reverts commit 805484dd553695111b5fb37f2125214a6b7276e9.

* Revert "lint"

This reverts commit 0bce411b2b415c2ab770343949404498436dc8b2.

* Revert "fix graph duplication in CPU dataloader workers"

This reverts commit 9e3a8cf34c175d3093c773f6bb023b155f2bd27f.
Co-authored-by: default avatarxiny <xiny@nvidia.com>
Co-authored-by: default avatarJinjing Zhou <VoVAllen@users.noreply.github.com>
parent 7b9afbfa
......@@ -8,8 +8,6 @@ import time
import numpy as np
from ogb.nodeproppred import DglNodePropPredDataset
USE_WRAPPER = True
class SAGE(nn.Module):
def __init__(self, in_feats, n_hidden, n_classes):
super().__init__()
......@@ -40,11 +38,6 @@ graph.ndata['test_mask'] = torch.zeros(graph.num_nodes(), dtype=torch.bool).inde
model = SAGE(graph.ndata['feat'].shape[1], 256, dataset.num_classes).cuda()
opt = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)
if USE_WRAPPER:
import dglnew
graph.create_formats_()
graph = dglnew.graph.wrapper.DGLGraphStorage(graph)
num_partitions = 1000
sampler = dgl.dataloading.ClusterGCNSampler(
graph, num_partitions,
......@@ -61,14 +54,13 @@ dataloader = dgl.dataloading.DataLoader(
batch_size=100,
shuffle=True,
drop_last=False,
pin_memory=True,
num_workers=8,
persistent_workers=True,
use_prefetch_thread=True) # TBD: could probably remove this argument
num_workers=0,
use_uva=True)
durations = []
for _ in range(10):
t0 = time.time()
model.train()
for it, sg in enumerate(dataloader):
x = sg.ndata['feat']
y = sg.ndata['label'][:, 0]
......@@ -85,4 +77,27 @@ for _ in range(10):
tt = time.time()
print(tt - t0)
durations.append(tt - t0)
model.eval()
with torch.no_grad():
val_preds, test_preds = [], []
val_labels, test_labels = [], []
for it, sg in enumerate(dataloader):
x = sg.ndata['feat']
y = sg.ndata['label'][:, 0]
m_val = sg.ndata['valid_mask']
m_test = sg.ndata['test_mask']
y_hat = model(sg, x)
val_preds.append(y_hat[m_val])
val_labels.append(y[m_val])
test_preds.append(y_hat[m_test])
test_labels.append(y[m_test])
val_preds = torch.cat(val_preds, 0)
val_labels = torch.cat(val_labels, 0)
test_preds = torch.cat(test_preds, 0)
test_labels = torch.cat(test_labels, 0)
val_acc = MF.accuracy(val_preds, val_labels)
test_acc = MF.accuracy(test_preds, test_labels)
print('Validation acc:', val_acc.item(), 'Test acc:', test_acc.item())
print(np.mean(durations[4:]), np.std(durations[4:]))
......@@ -9,8 +9,9 @@ import dgl.nn as dglnn
import time
import numpy as np
from ogb.nodeproppred import DglNodePropPredDataset
import tqdm
USE_WRAPPER = False
USE_UVA = True
class SAGE(nn.Module):
def __init__(self, in_feats, n_hidden, n_classes):
......@@ -20,6 +21,8 @@ class SAGE(nn.Module):
self.layers.append(dglnn.SAGEConv(n_hidden, n_hidden, 'mean'))
self.layers.append(dglnn.SAGEConv(n_hidden, n_classes, 'mean'))
self.dropout = nn.Dropout(0.5)
self.n_hidden = n_hidden
self.n_classes = n_classes
def forward(self, blocks, x):
h = x
......@@ -30,41 +33,66 @@ class SAGE(nn.Module):
h = self.dropout(h)
return h
def inference(self, g, device, batch_size, num_workers, buffer_device=None):
# The difference between this inference function and the one in the official
# example is that the intermediate results can also benefit from prefetching.
g.ndata['h'] = g.ndata['feat']
sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1, prefetch_node_feats=['h'])
dataloader = dgl.dataloading.NodeDataLoader(
g, torch.arange(g.num_nodes()).to(g.device), sampler, device=device,
batch_size=1000, shuffle=False, drop_last=False, num_workers=num_workers,
persistent_workers=(num_workers > 0))
if buffer_device is None:
buffer_device = device
def train(rank, world_size, graph, num_classes, split_idx):
for l, layer in enumerate(self.layers):
y = torch.zeros(
g.num_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes,
device=buffer_device)
for input_nodes, output_nodes, blocks in tqdm.tqdm(dataloader):
x = blocks[0].srcdata['h']
h = layer(blocks[0], x)
if l != len(self.layers) - 1:
h = F.relu(h)
h = self.dropout(h)
y[output_nodes] = h.to(buffer_device)
g.ndata['h'] = y
return y
def train(rank, world_size, shared_memory_name, features, num_classes, split_idx):
torch.cuda.set_device(rank)
dist.init_process_group('nccl', 'tcp://127.0.0.1:12347', world_size=world_size, rank=rank)
graph = dgl.hetero_from_shared_memory(shared_memory_name)
feat, labels = features
graph.ndata['feat'] = feat
graph.ndata['label'] = labels
model = SAGE(graph.ndata['feat'].shape[1], 256, num_classes).cuda()
model = nn.parallel.DistributedDataParallel(model, device_ids=[rank], output_device=rank)
opt = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)
train_idx, valid_idx, test_idx = split_idx['train'], split_idx['valid'], split_idx['test']
if USE_WRAPPER:
import dglnew
graph = dglnew.graph.wrapper.DGLGraphStorage(graph)
if USE_UVA:
train_idx = train_idx.to('cuda')
sampler = dgl.dataloading.NeighborSampler(
[5, 5, 5], output_device='cpu', prefetch_node_feats=['feat'],
prefetch_labels=['label'])
dataloader = dgl.dataloading.NodeDataLoader(
graph,
train_idx,
sampler,
device='cuda',
batch_size=1000,
shuffle=True,
drop_last=False,
pin_memory=True,
num_workers=4,
persistent_workers=True,
use_ddp=True,
use_prefetch_thread=True) # TBD: could probably remove this argument
[15, 10, 5], prefetch_node_feats=['feat'], prefetch_labels=['label'])
train_dataloader = dgl.dataloading.NodeDataLoader(
graph, train_idx, sampler,
device='cuda', batch_size=1000, shuffle=True, drop_last=False,
num_workers=0, use_ddp=True, use_uva=USE_UVA)
valid_dataloader = dgl.dataloading.NodeDataLoader(
graph, valid_idx, sampler, device='cuda', batch_size=1024, shuffle=True,
drop_last=False, num_workers=0, use_uva=USE_UVA)
durations = []
for _ in range(10):
model.train()
t0 = time.time()
for it, (input_nodes, output_nodes, blocks) in enumerate(dataloader):
for it, (input_nodes, output_nodes, blocks) in enumerate(train_dataloader):
x = blocks[0].srcdata['feat']
y = blocks[-1].dstdata['label'][:, 0]
y_hat = model(blocks, x)
......@@ -80,27 +108,38 @@ def train(rank, world_size, graph, num_classes, split_idx):
if rank == 0:
print(tt - t0)
durations.append(tt - t0)
model.eval()
ys = []
y_hats = []
for it, (input_nodes, output_nodes, blocks) in enumerate(valid_dataloader):
with torch.no_grad():
x = blocks[0].srcdata['feat']
ys.append(blocks[-1].dstdata['label'])
y_hats.append(model.module(blocks, x))
acc = MF.accuracy(torch.cat(y_hats), torch.cat(ys))
print('Validation acc:', acc.item())
dist.barrier()
if rank == 0:
print(np.mean(durations[4:]), np.std(durations[4:]))
model.eval()
with torch.no_grad():
pred = model.module.inference(graph, 'cuda', 1000, 12, graph.device)
acc = MF.accuracy(pred.to(graph.device), graph.ndata['label'])
print('Test acc:', acc.item())
if __name__ == '__main__':
dataset = DglNodePropPredDataset('ogbn-products')
graph, labels = dataset[0]
graph.ndata['label'] = labels
graph.create_formats_()
shared_memory_name = 'shm' # can be any string
feat = graph.ndata['feat']
graph = graph.shared_memory(shared_memory_name)
split_idx = dataset.get_idx_split()
num_classes = dataset.num_classes
n_procs = 4
# Tested with mp.spawn and fork. Both worked and got 4s per epoch with 4 GPUs
# and 3.86s per epoch with 8 GPUs on p2.8x, compared to 5.2s from official examples.
#import torch.multiprocessing as mp
#mp.spawn(train, args=(n_procs, graph, num_classes, split_idx), nprocs=n_procs)
import dgl.multiprocessing as mp
procs = []
for i in range(n_procs):
p = mp.Process(target=train, args=(i, n_procs, graph, num_classes, split_idx))
p.start()
procs.append(p)
for p in procs:
p.join()
import torch.multiprocessing as mp
mp.spawn(train, args=(n_procs, shared_memory_name, (feat, labels), num_classes, split_idx), nprocs=n_procs)
......@@ -6,20 +6,54 @@ import dgl
import dgl.nn as dglnn
import time
import numpy as np
import tqdm
# OGB must follow DGL if both DGL and PyG are installed. Otherwise DataLoader will hang.
# (This is a long-standing issue)
from ogb.nodeproppred import DglNodePropPredDataset
from ogb.linkproppred import DglLinkPropPredDataset
USE_WRAPPER = True
USE_UVA = False
device = 'cuda'
def to_bidirected_with_reverse_mapping(g):
    """Makes a graph bidirectional, and returns a mapping array ``mapping`` where ``mapping[i]``
    is the reverse edge of edge ID ``i``.
    Does not work with graphs that have self-loops.
    """
    # Add reverse edges, then merge duplicate (u, v) pairs into single edges.
    # ``mapping[i]`` maps each edge of the doubled graph to its simple-graph edge ID;
    # ``count`` records how many doubled-graph edges collapsed into each simple edge.
    g_simple, mapping = dgl.to_simple(
        dgl.add_reverse_edges(g), return_counts='count', writeback_mapping=True)
    c = g_simple.edata['count']
    num_edges = g.num_edges()
    # Exclusive prefix sum of duplicate counts: mapping_offset[i] is the start of
    # the i-th simple edge's group when doubled-graph edge IDs are sorted by mapping.
    mapping_offset = torch.zeros(g_simple.num_edges() + 1, dtype=g_simple.idtype)
    mapping_offset[1:] = c.cumsum(0)
    # Sort doubled-graph edge IDs by their simple-graph edge ID, then pick one
    # representative doubled-graph edge per simple edge (the first in each group).
    idx = mapping.argsort()
    idx_uniq = idx[mapping_offset[:-1]]
    # In the doubled graph, edge e and edge e +/- num_edges are reverses of each
    # other (add_reverse_edges appends reverses after the original num_edges edges).
    reverse_idx = torch.where(idx_uniq >= num_edges, idx_uniq - num_edges, idx_uniq + num_edges)
    reverse_mapping = mapping[reverse_idx]
    # Correctness check: every edge's mapped reverse must connect dst back to src.
    src1, dst1 = g_simple.edges()
    src2, dst2 = g_simple.find_edges(reverse_mapping)
    assert torch.equal(src1, dst2)
    assert torch.equal(src2, dst1)
    return g_simple, reverse_mapping
class SAGE(nn.Module):
def __init__(self, in_feats, n_hidden, n_classes):
def __init__(self, in_feats, n_hidden):
super().__init__()
self.n_hidden = n_hidden
self.layers = nn.ModuleList()
self.layers.append(dglnn.SAGEConv(in_feats, n_hidden, 'mean'))
self.layers.append(dglnn.SAGEConv(n_hidden, n_hidden, 'mean'))
self.layers.append(dglnn.SAGEConv(n_hidden, n_classes, 'mean'))
self.dropout = nn.Dropout(0.5)
self.layers.append(dglnn.SAGEConv(n_hidden, n_hidden, 'mean'))
self.predictor = nn.Sequential(
nn.Linear(n_hidden, n_hidden),
nn.ReLU(),
nn.Linear(n_hidden, n_hidden),
nn.ReLU(),
nn.Linear(n_hidden, 1))
def predict(self, h_src, h_dst):
return self.predictor(h_src * h_dst)
def forward(self, pair_graph, neg_pair_graph, blocks, x):
h = x
......@@ -27,50 +61,88 @@ class SAGE(nn.Module):
h = layer(block, h)
if l != len(self.layers) - 1:
h = F.relu(h)
h = self.dropout(h)
with pair_graph.local_scope(), neg_pair_graph.local_scope():
pair_graph.ndata['h'] = neg_pair_graph.ndata['h'] = h
pair_graph.apply_edges(dgl.function.u_dot_v('h', 'h', 's'))
neg_pair_graph.apply_edges(dgl.function.u_dot_v('h', 'h', 's'))
return pair_graph.edata['s'], neg_pair_graph.edata['s']
dataset = DglNodePropPredDataset('ogbn-products')
graph, labels = dataset[0]
graph.ndata['label'] = labels
split_idx = dataset.get_idx_split()
train_idx, valid_idx, test_idx = split_idx['train'], split_idx['valid'], split_idx['test']
model = SAGE(graph.ndata['feat'].shape[1], 256, dataset.num_classes).cuda()
pos_src, pos_dst = pair_graph.edges()
neg_src, neg_dst = neg_pair_graph.edges()
h_pos = self.predict(h[pos_src], h[pos_dst])
h_neg = self.predict(h[neg_src], h[neg_dst])
return h_pos, h_neg
def inference(self, g, device, batch_size, num_workers, buffer_device=None):
    """Layer-wise offline inference of node embeddings over the full graph.

    The difference between this inference function and the one in the official
    example is that the intermediate results can also benefit from prefetching:
    each layer's output is written back to ``g.ndata['h']`` so the next layer's
    dataloader can prefetch it.

    Parameters
    ----------
    g : DGLGraph
        The full graph; must have node feature ``'feat'``.
    device : device
        Device on which layer computation runs.
    batch_size : int
        Number of output nodes per inference minibatch.
    num_workers : int
        Number of dataloader workers.
    buffer_device : device, optional
        Where intermediate per-layer outputs are stored; defaults to ``device``.

    Returns
    -------
    Tensor
        Final-layer embeddings for all nodes, on ``buffer_device``.
    """
    g.ndata['h'] = g.ndata['feat']
    # One full-neighbor layer at a time; prefetch the current representation 'h'.
    sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1, prefetch_node_feats=['h'])
    dataloader = dgl.dataloading.NodeDataLoader(
        g, torch.arange(g.num_nodes()).to(g.device), sampler, device=device,
        # Bug fix: honor the batch_size argument instead of a hard-coded 1000,
        # consistent with the other copy of this method in the node-classification
        # example which passes batch_size=batch_size.
        batch_size=batch_size, shuffle=False, drop_last=False, num_workers=num_workers)
    if buffer_device is None:
        buffer_device = device
    for l, layer in enumerate(self.layers):
        y = torch.zeros(g.num_nodes(), self.n_hidden, device=buffer_device)
        for input_nodes, output_nodes, blocks in tqdm.tqdm(dataloader):
            x = blocks[0].srcdata['h']
            h = layer(blocks[0], x)
            if l != len(self.layers) - 1:
                h = F.relu(h)
            y[output_nodes] = h.to(buffer_device)
        # Write back so the next layer's dataloader prefetches this layer's output.
        g.ndata['h'] = y
    return y
def compute_mrr(model, node_emb, src, dst, neg_dst, device, batch_size=500):
    """Compute per-edge reciprocal ranks of the true destination against
    negative candidates, in minibatches, and return their mean (MRR)."""
    num_queries = src.shape[0]
    rr = torch.zeros(num_queries)
    for lo in tqdm.trange(0, num_queries, batch_size):
        hi = min(lo + batch_size, num_queries)
        # Candidate destinations: the true target first, then the negatives.
        candidates = torch.cat([dst[lo:hi, None], neg_dst[lo:hi]], 1)
        src_emb = node_emb[src[lo:hi]][:, None, :].to(device)
        dst_emb = node_emb[candidates.view(-1)].view(*candidates.shape, -1).to(device)
        scores = model.predict(src_emb, dst_emb).squeeze(-1)
        # Only column 0 (the true target) is relevant.
        relevance = torch.zeros(*scores.shape, dtype=torch.bool)
        relevance[:, 0] = True
        rr[lo:hi] = MF.retrieval_reciprocal_rank(scores, relevance)
    return rr.mean()
def evaluate(model, edge_split, device, num_workers):
    # Evaluate link prediction MRR on the validation and test splits.
    # NOTE(review): reads the module-level ``graph`` rather than taking it as an
    # argument — relies on the script's global state; verify before reuse.
    with torch.no_grad():
        # Offline inference of all node embeddings, buffered on CPU.
        node_emb = model.inference(graph, device, 4096, num_workers, 'cpu')
        results = []
        for split in ['valid', 'test']:
            src = edge_split[split]['source_node'].to(device)
            dst = edge_split[split]['target_node'].to(device)
            neg_dst = edge_split[split]['target_node_neg'].to(device)
            results.append(compute_mrr(model, node_emb, src, dst, neg_dst, device))
    # Returns [valid_mrr, test_mrr] in split order.
    return results
dataset = DglLinkPropPredDataset('ogbl-citation2')
graph = dataset[0]
graph, reverse_eids = to_bidirected_with_reverse_mapping(graph)
seed_edges = torch.arange(graph.num_edges())
edge_split = dataset.get_edge_split()
model = SAGE(graph.ndata['feat'].shape[1], 256).to(device)
opt = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)
num_edges = graph.num_edges()
train_eids = torch.arange(num_edges)
if USE_WRAPPER:
import dglnew
graph.create_formats_()
graph = dglnew.graph.wrapper.DGLGraphStorage(graph)
if not USE_UVA:
graph = graph.to(device)
reverse_eids = reverse_eids.to(device)
seed_edges = torch.arange(graph.num_edges()).to(device)
sampler = dgl.dataloading.NeighborSampler(
[5, 5, 5], output_device='cpu', prefetch_node_feats=['feat'],
prefetch_labels=['label'])
sampler = dgl.dataloading.NeighborSampler([15, 10, 5], prefetch_node_feats=['feat'])
dataloader = dgl.dataloading.EdgeDataLoader(
graph,
train_eids,
sampler,
device='cuda',
batch_size=1000,
shuffle=True,
drop_last=False,
pin_memory=True,
num_workers=8,
persistent_workers=True,
use_prefetch_thread=True, # TBD: could probably remove this argument
graph, seed_edges, sampler,
device=device, batch_size=512, shuffle=True,
drop_last=False, num_workers=0,
exclude='reverse_id',
reverse_eids=torch.arange(num_edges) ^ 1,
negative_sampler=dgl.dataloading.negative_sampler.Uniform(5))
reverse_eids=reverse_eids,
negative_sampler=dgl.dataloading.negative_sampler.Uniform(1),
use_uva=USE_UVA)
durations = []
for _ in range(10):
for epoch in range(10):
model.train()
t0 = time.time()
for it, (input_nodes, pair_graph, neg_pair_graph, blocks) in enumerate(dataloader):
x = blocks[0].srcdata['feat']
......@@ -83,12 +155,16 @@ for _ in range(10):
opt.zero_grad()
loss.backward()
opt.step()
if it % 20 == 0:
acc = MF.auroc(score, labels.long())
if (it + 1) % 20 == 0:
mem = torch.cuda.max_memory_allocated() / 1000000
print('Loss', loss.item(), 'Acc', acc.item(), 'GPU Mem', mem, 'MB')
print('Loss', loss.item(), 'GPU Mem', mem, 'MB')
if (it + 1) == 1000:
tt = time.time()
print(tt - t0)
t0 = time.time()
durations.append(tt - t0)
break
if epoch % 10 == 0:
model.eval()
valid_mrr, test_mrr = evaluate(model, edge_split, device, 12)
print('Validation MRR:', valid_mrr.item(), 'Test MRR:', test_mrr.item())
print(np.mean(durations[4:]), np.std(durations[4:]))
......@@ -7,8 +7,10 @@ import dgl.nn as dglnn
import time
import numpy as np
from ogb.nodeproppred import DglNodePropPredDataset
import tqdm
import argparse
USE_WRAPPER = True
USE_UVA = True # Set to True for UVA sampling
class SAGE(nn.Module):
def __init__(self, in_feats, n_hidden, n_classes):
......@@ -18,6 +20,8 @@ class SAGE(nn.Module):
self.layers.append(dglnn.SAGEConv(n_hidden, n_hidden, 'mean'))
self.layers.append(dglnn.SAGEConv(n_hidden, n_classes, 'mean'))
self.dropout = nn.Dropout(0.5)
self.n_hidden = n_hidden
self.n_classes = n_classes
def forward(self, blocks, x):
h = x
......@@ -28,42 +32,64 @@ class SAGE(nn.Module):
h = self.dropout(h)
return h
def inference(self, g, device, batch_size, num_workers, buffer_device=None):
# The difference between this inference function and the one in the official
# example is that the intermediate results can also benefit from prefetching.
g.ndata['h'] = g.ndata['feat']
sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1, prefetch_node_feats=['h'])
dataloader = dgl.dataloading.NodeDataLoader(
g, torch.arange(g.num_nodes()).to(g.device), sampler, device=device,
batch_size=batch_size, shuffle=False, drop_last=False, num_workers=num_workers,
persistent_workers=(num_workers > 0))
if buffer_device is None:
buffer_device = device
for l, layer in enumerate(self.layers):
y = torch.zeros(
g.num_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes,
device=buffer_device)
for input_nodes, output_nodes, blocks in tqdm.tqdm(dataloader):
x = blocks[0].srcdata['h']
h = layer(blocks[0], x)
if l != len(self.layers) - 1:
h = F.relu(h)
h = self.dropout(h)
y[output_nodes] = h.to(buffer_device)
g.ndata['h'] = y
return y
dataset = DglNodePropPredDataset('ogbn-products')
graph, labels = dataset[0]
graph.ndata['label'] = labels
graph.ndata['label'] = labels.squeeze()
split_idx = dataset.get_idx_split()
train_idx, valid_idx, test_idx = split_idx['train'], split_idx['valid'], split_idx['test']
model = SAGE(graph.ndata['feat'].shape[1], 256, dataset.num_classes).cuda()
opt = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)
if not USE_UVA:
graph = graph.to('cuda')
train_idx = train_idx.to('cuda')
valid_idx = valid_idx.to('cuda')
test_idx = test_idx.to('cuda')
device = 'cuda'
if USE_WRAPPER:
import dglnew
graph.create_formats_()
graph = dglnew.graph.wrapper.DGLGraphStorage(graph)
model = SAGE(graph.ndata['feat'].shape[1], 256, dataset.num_classes).to(device)
opt = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)
sampler = dgl.dataloading.NeighborSampler(
[5, 5, 5], output_device='cpu', prefetch_node_feats=['feat'],
prefetch_labels=['label'])
dataloader = dgl.dataloading.NodeDataLoader(
graph,
train_idx,
sampler,
device='cuda',
batch_size=1000,
shuffle=True,
drop_last=False,
pin_memory=True,
num_workers=16,
persistent_workers=True,
use_prefetch_thread=True) # TBD: could probably remove this argument
[15, 10, 5], prefetch_node_feats=['feat'], prefetch_labels=['label'])
train_dataloader = dgl.dataloading.NodeDataLoader(
graph, train_idx, sampler, device=device, batch_size=1024, shuffle=True,
drop_last=False, num_workers=0, use_uva=USE_UVA)
valid_dataloader = dgl.dataloading.NodeDataLoader(
graph, valid_idx, sampler, device=device, batch_size=1024, shuffle=True,
drop_last=False, num_workers=0, use_uva=USE_UVA)
durations = []
for _ in range(10):
model.train()
t0 = time.time()
for it, (input_nodes, output_nodes, blocks) in enumerate(dataloader):
for it, (input_nodes, output_nodes, blocks) in enumerate(train_dataloader):
x = blocks[0].srcdata['feat']
y = blocks[-1].dstdata['label'][:, 0]
y = blocks[-1].dstdata['label']
y_hat = model(blocks, x)
loss = F.cross_entropy(y_hat, y)
opt.zero_grad()
......@@ -76,4 +102,23 @@ for _ in range(10):
tt = time.time()
print(tt - t0)
durations.append(tt - t0)
model.eval()
ys = []
y_hats = []
for it, (input_nodes, output_nodes, blocks) in enumerate(valid_dataloader):
with torch.no_grad():
x = blocks[0].srcdata['feat']
ys.append(blocks[-1].dstdata['label'])
y_hats.append(model(blocks, x))
acc = MF.accuracy(torch.cat(y_hats), torch.cat(ys))
print('Validation acc:', acc.item())
print(np.mean(durations[4:]), np.std(durations[4:]))
# Test accuracy and offline inference of all nodes
model.eval()
with torch.no_grad():
pred = model.inference(graph, device, 4096, 12 if USE_UVA else 0, graph.device)
acc = MF.accuracy(pred.to(graph.device), graph.ndata['label'])
print('Test acc:', acc.item())
......@@ -11,7 +11,7 @@ import numpy as np
from ogb.nodeproppred import DglNodePropPredDataset
import tqdm
USE_WRAPPER = True
USE_UVA = False
class HeteroGAT(nn.Module):
def __init__(self, etypes, in_feats, n_hidden, n_classes, n_heads=4):
......@@ -52,44 +52,62 @@ graph = dgl.AddReverse()(graph)
graph.update_all(fn.copy_u('feat', 'm'), fn.mean('m', 'feat'), etype='rev_writes')
graph.update_all(fn.copy_u('feat', 'm'), fn.mean('m', 'feat'), etype='has_topic')
graph.update_all(fn.copy_u('feat', 'm'), fn.mean('m', 'feat'), etype='affiliated_with')
graph.edges['cites'].data['weight'] = torch.ones(graph.num_edges('cites')) # dummy edge weights
model = HeteroGAT(graph.etypes, graph.ndata['feat']['paper'].shape[1], 256, dataset.num_classes).cuda()
opt = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)
if USE_WRAPPER:
import dglnew
graph.create_formats_()
graph = dglnew.graph.wrapper.DGLGraphStorage(graph)
split_idx = dataset.get_idx_split()
train_idx, valid_idx, test_idx = split_idx['train'], split_idx['valid'], split_idx['test']
sampler = dgl.dataloading.NeighborSampler(
[5, 5, 5], output_device='cpu',
if not USE_UVA:
graph = graph.to('cuda')
train_idx = recursive_apply(train_idx, lambda x: x.to('cuda'))
valid_idx = recursive_apply(valid_idx, lambda x: x.to('cuda'))
test_idx = recursive_apply(test_idx, lambda x: x.to('cuda'))
train_sampler = dgl.dataloading.NeighborSampler(
[5, 5, 5],
prefetch_node_feats={k: ['feat'] for k in graph.ntypes},
prefetch_labels={'paper': ['label']},
prefetch_edge_feats={'cites': ['weight']})
dataloader = dgl.dataloading.NodeDataLoader(
graph,
train_idx,
sampler,
device='cuda',
batch_size=1000,
shuffle=True,
drop_last=False,
pin_memory=True,
num_workers=8,
persistent_workers=True,
use_prefetch_thread=True) # TBD: could probably remove this argument
prefetch_labels={'paper': ['label']})
valid_sampler = dgl.dataloading.NeighborSampler(
[10, 10, 10], # Slightly more
prefetch_node_feats={k: ['feat'] for k in graph.ntypes},
prefetch_labels={'paper': ['label']})
train_dataloader = dgl.dataloading.NodeDataLoader(
graph, train_idx, train_sampler,
device='cuda', batch_size=1000, shuffle=True,
drop_last=False, num_workers=0, use_uva=USE_UVA)
valid_dataloader = dgl.dataloading.NodeDataLoader(
graph, valid_idx, valid_sampler,
device='cuda', batch_size=1000, shuffle=False,
drop_last=False, num_workers=0, use_uva=USE_UVA)
test_dataloader = dgl.dataloading.NodeDataLoader(
graph, test_idx, valid_sampler,
device='cuda', batch_size=1000, shuffle=False,
drop_last=False, num_workers=0, use_uva=USE_UVA)
def evaluate(model, dataloader):
    """Run the model over one dataloader and return classification accuracy
    on the 'paper' node labels."""
    all_preds, all_labels = [], []
    with torch.no_grad():
        for input_nodes, output_nodes, blocks in tqdm.tqdm(dataloader):
            batch_feat = blocks[0].srcdata['feat']
            batch_label = blocks[-1].dstdata['label']['paper'][:, 0]
            all_preds.append(model(blocks, batch_feat))
            all_labels.append(batch_label)
    return MF.accuracy(torch.cat(all_preds, 0), torch.cat(all_labels, 0))
durations = []
for _ in range(10):
model.train()
t0 = time.time()
for it, (input_nodes, output_nodes, blocks) in enumerate(dataloader):
for it, (input_nodes, output_nodes, blocks) in enumerate(train_dataloader):
x = blocks[0].srcdata['feat']
y = blocks[-1].dstdata['label']['paper'][:, 0]
assert y.min() >= 0 and y.max() < dataset.num_classes
y_hat = model(blocks, x)
loss = F.cross_entropy(y_hat, y)
opt.zero_grad()
......@@ -102,4 +120,9 @@ for _ in range(10):
tt = time.time()
print(tt - t0)
durations.append(tt - t0)
model.eval()
valid_acc = evaluate(model, valid_dataloader)
test_acc = evaluate(model, test_dataloader)
print('Validation acc:', valid_acc, 'Test acc:', test_acc)
print(np.mean(durations[4:]), np.std(durations[4:]))
......@@ -134,7 +134,7 @@ struct COOMatrix {
* \brief Pin the row, col and data (if not Null) of the matrix.
* \note This is an in-place method. Behavior depends on the current context,
* kDLCPU: will be pinned;
* kDLCPUPinned: directly return;
* IsPinned: directly return;
* kDLGPU: invalid, will throw an error.
* The context check is deferred to pinning the NDArray.
*/
......@@ -149,7 +149,7 @@ struct COOMatrix {
/*!
* \brief Unpin the row, col and data (if not Null) of the matrix.
* \note This is an in-place method. Behavior depends on the current context,
* kDLCPUPinned: will be unpinned;
* IsPinned: will be unpinned;
* others: directly return.
* The context check is deferred to unpinning the NDArray.
*/
......
......@@ -127,7 +127,7 @@ struct CSRMatrix {
* \brief Pin the indptr, indices and data (if not Null) of the matrix.
* \note This is an in-place method. Behavior depends on the current context,
* kDLCPU: will be pinned;
* kDLCPUPinned: directly return;
* IsPinned: directly return;
* kDLGPU: invalid, will throw an error.
* The context check is deferred to pinning the NDArray.
*/
......@@ -142,7 +142,7 @@ struct CSRMatrix {
/*!
* \brief Unpin the indptr, indices and data (if not Null) of the matrix.
* \note This is an in-place method. Behavior depends on the current context,
* kDLCPUPinned: will be unpinned;
* IsPinned: will be unpinned;
* others: directly return.
* The context check is deferred to unpinning the NDArray.
*/
......
......@@ -43,7 +43,7 @@
*/
#ifdef DGL_USE_CUDA
#define ATEN_XPU_SWITCH_CUDA(val, XPU, op, ...) do { \
if ((val) == kDLCPU || (val) == kDLCPUPinned) { \
if ((val) == kDLCPU) { \
constexpr auto XPU = kDLCPU; \
{__VA_ARGS__} \
} else if ((val) == kDLGPU) { \
......@@ -233,7 +233,7 @@
});
#define CHECK_VALID_CONTEXT(VAR1, VAR2) \
CHECK(((VAR1)->ctx == (VAR2)->ctx) || ((VAR1)->ctx.device_type == kDLCPUPinned)) \
CHECK(((VAR1)->ctx == (VAR2)->ctx) || (VAR1).IsPinned()) \
<< "Expected " << (#VAR2) << "(" << (VAR2)->ctx << ")" << " to have the same device " \
<< "context as " << (#VAR1) << "(" << (VAR1)->ctx << "). " \
<< "Or " << (#VAR1) << "(" << (VAR1)->ctx << ")" << " is pinned";
......
......@@ -23,6 +23,7 @@ namespace dgl {
// Forward declaration
class BaseHeteroGraph;
class HeteroPickleStates;
typedef std::shared_ptr<BaseHeteroGraph> HeteroGraphPtr;
struct FlattenedHeteroGraph;
......@@ -436,6 +437,21 @@ class BaseHeteroGraph : public runtime::Object {
*/
virtual aten::CSRMatrix GetCSCMatrix(dgl_type_t etype) const = 0;
/*!
* \brief Set the COO matrix representation for a given edge type.
*/
virtual void SetCOOMatrix(dgl_type_t etype, aten::COOMatrix coo) = 0;
/*!
* \brief Set the CSR matrix representation for a given edge type.
*/
virtual void SetCSRMatrix(dgl_type_t etype, aten::CSRMatrix csr) = 0;
/*!
* \brief Set the CSC matrix representation for a given edge type.
*/
virtual void SetCSCMatrix(dgl_type_t etype, aten::CSRMatrix csc) = 0;
/*!
* \brief Extract the induced subgraph by the given vertices.
*
......@@ -864,6 +880,25 @@ HeteroPickleStates HeteroPickle(HeteroGraphPtr graph);
*/
HeteroGraphPtr HeteroUnpickleOld(const HeteroPickleStates& states);
/*!
* \brief Create heterograph from pickling states pickled by ForkingPickler.
*
* This is different from HeteroUnpickle where
* (1) Backward compatibility is not required,
* (2) All graph formats are pickled instead of only one.
*/
HeteroGraphPtr HeteroForkingUnpickle(const HeteroPickleStates& states);
/*!
* \brief Get the pickling states of the relation graph structure in backend tensors for
* ForkingPickler.
*
* This is different from HeteroPickle where
* (1) Backward compatibility is not required,
* (2) All graph formats are pickled instead of only one.
*/
HeteroPickleStates HeteroForkingPickle(HeteroGraphPtr graph);
#define FORMAT_HAS_CSC(format) \
((format) & CSC_CODE)
......
......@@ -160,6 +160,13 @@ class DeviceAPI {
*/
DGL_DLL virtual void UnpinData(void* ptr);
/*!
* \brief Check whether the memory is in pinned memory.
*/
DGL_DLL virtual bool IsPinned(const void* ptr) {
return false;
}
/*!
* \brief Allocate temporal workspace for backend execution.
*
......
......@@ -176,7 +176,7 @@ class NDArray {
* on the underlying DLTensor.
* \note This is an in-place method. Behavior depends on the current context,
* kDLCPU: will be pinned;
* kDLCPUPinned: directly return;
* IsPinned: directly return;
* kDLGPU: invalid, will throw an error.
*/
inline void PinMemory_();
......@@ -184,7 +184,7 @@ class NDArray {
* \brief In-place method to unpin the current array by calling UnpinData
* on the underlying DLTensor.
* \note This is an in-place method. Behavior depends on the current context,
* kDLCPUPinned: will be unpinned;
* IsPinned: will be unpinned;
* others: directly return.
*/
inline void UnpinMemory_();
......@@ -299,7 +299,7 @@ class NDArray {
* \note Data of the given array will be pinned inplace.
* Behavior depends on the current context,
* kDLCPU: will be pinned;
* kDLCPUPinned: directly return;
* IsPinned: directly return;
* kDLGPU: invalid, will throw an error.
*/
DGL_DLL static void PinData(DLTensor* tensor);
......@@ -309,11 +309,18 @@ class NDArray {
* \param tensor The array to be unpinned.
* \note Data of the given array will be unpinned inplace.
* Behavior depends on the current context,
* kDLCPUPinned: will be unpinned;
* IsPinned: will be unpinned;
* others: directly return.
*/
DGL_DLL static void UnpinData(DLTensor* tensor);
/*!
* \brief Function check if the data of a DLTensor is pinned.
* \param tensor The array to be checked.
* \return true if pinned.
*/
DGL_DLL static bool IsDataPinned(DLTensor* tensor);
// internal namespace
struct Internal;
private:
......@@ -485,7 +492,7 @@ inline void NDArray::UnpinMemory_() {
inline bool NDArray::IsPinned() const {
CHECK(data_ != nullptr);
return data_->dl_tensor.ctx.device_type == kDLCPUPinned;
return IsDataPinned(&(data_->dl_tensor));
}
inline int NDArray::use_count() const {
......
......@@ -82,8 +82,6 @@ class NDArrayBase(object):
Indicates the alignment requirement when converting to dlpack. Will copy to a
new tensor if the alignment requirement is not satisfied.
0 means no alignment requirement.
Will copy to a new tensor if the array is pinned because some backends,
e.g., pytorch, do not support kDLCPUPinned device type.
Returns
......
......@@ -316,25 +316,15 @@ class NDArrayBase(_NDArrayBase):
raise ValueError("Unsupported target type %s" % str(type(target)))
return target
def pin_memory_(self, ctx):
def pin_memory_(self):
"""Pin host memory and map into GPU address space (in-place)
Parameters
----------
ctx : DGLContext
The target GPU to map the host memory space
"""
check_call(_LIB.DGLArrayPinData(self.handle, ctx))
check_call(_LIB.DGLArrayPinData(self.handle))
def unpin_memory_(self, ctx):
def unpin_memory_(self):
"""Unpin host memory pinned by pin_memory_()
Parameters
----------
ctx : DGLContext
The target GPU to map the host memory space
"""
check_call(_LIB.DGLArrayUnpinData(self.handle, ctx))
check_call(_LIB.DGLArrayUnpinData(self.handle))
def free_extension_handle(handle, type_code):
......
......@@ -330,6 +330,21 @@ def copy_to(input, ctx, **kwargs):
"""
pass
def is_pinned(input):
"""Check whether the tensor is in pinned memory.
Parameters
----------
input : Tensor
The tensor.
Returns
-------
bool
Whether the tensor is in pinned memory.
"""
pass
###############################################################################
# Tensor functions on feature data
# --------------------------------
......
......@@ -144,6 +144,9 @@ def asnumpy(input):
def copy_to(input, ctx, **kwargs):
return input.as_in_context(ctx)
def is_pinned(input):
return input.context == mx.cpu_pinned()
def sum(input, dim, keepdims=False):
if len(input) == 0:
return nd.array([0.], dtype=input.dtype, ctx=input.context)
......
......@@ -120,6 +120,9 @@ def copy_to(input, ctx, **kwargs):
else:
raise RuntimeError('Invalid context', ctx)
def is_pinned(input):
    """Tell whether the given tensor occupies page-locked host memory.

    Parameters
    ----------
    input : torch.Tensor
        The tensor to inspect.

    Returns
    -------
    bool
        True if the tensor's storage is pinned.
    """
    pinned = input.is_pinned()
    return pinned
def sum(input, dim, keepdims=False):
    """Sum tensor elements along the given dimension.

    Parameters
    ----------
    input : torch.Tensor
        The tensor to reduce.
    dim : int
        The dimension to reduce over.
    keepdims : bool
        If True, keep the reduced dimension with size 1.

    Returns
    -------
    torch.Tensor
        The reduced tensor.
    """
    # Method-call form of th.sum(input, dim=dim, keepdim=keepdims).
    return input.sum(dim=dim, keepdim=keepdims)
......
......@@ -162,6 +162,8 @@ def copy_to(input, ctx, **kwargs):
new_tensor = tf.identity(input)
return new_tensor
def is_pinned(input):
    """Report whether the tensor is in pinned memory.

    TensorFlow does not expose a public API to query pinned status,
    so this conservatively answers False for every tensor.

    Parameters
    ----------
    input : Tensor
        The tensor to inspect (unused).

    Returns
    -------
    bool
        Always False.
    """
    return False
def sum(input, dim, keepdims=False):
if input.dtype == tf.bool:
......
......@@ -75,7 +75,7 @@ class UnifiedTensor: #UnifiedTensor
self._array = F.zerocopy_to_dgl_ndarray(self._input)
self._device = device
self._array.pin_memory_(utils.to_dgl_context(self._device))
self._array.pin_memory_()
def __len__(self):
return len(self._array)
......@@ -105,7 +105,7 @@ class UnifiedTensor: #UnifiedTensor
def __del__(self):
if hasattr(self, '_array') and self._array != None:
self._array.unpin_memory_(utils.to_dgl_context(self._device))
self._array.unpin_memory_()
self._array = None
if hasattr(self, '_input'):
......
......@@ -124,6 +124,8 @@ def _find_exclude_eids_with_reverse_types(g, eids, reverse_etype_map):
def _find_exclude_eids(g, exclude_mode, eids, **kwargs):
if exclude_mode is None:
return None
elif callable(exclude_mode):
return exclude_mode(eids)
elif F.is_tensor(exclude_mode) or (
isinstance(exclude_mode, Mapping) and
all(F.is_tensor(v) for v in exclude_mode.values())):
......@@ -151,9 +153,6 @@ def find_exclude_eids(g, seed_edges, exclude, reverse_eids=None, reverse_etypes=
None (default)
Does not exclude any edge.
Tensor or dict[etype, Tensor]
Exclude the given edge IDs.
'self'
Exclude the given edges themselves but nothing else.
......@@ -176,6 +175,10 @@ def find_exclude_eids(g, seed_edges, exclude, reverse_eids=None, reverse_etypes=
This mode assumes that the reverse of an edge with ID ``e`` and type ``etype``
will have ID ``e`` and type ``reverse_etype_map[etype]``.
callable
Any function that takes in a single argument :attr:`seed_edges` and returns
a tensor or dict of tensors.
eids : Tensor or dict[etype, Tensor]
The edge IDs.
reverse_eids : Tensor or dict[etype, Tensor]
......@@ -191,9 +194,8 @@ def find_exclude_eids(g, seed_edges, exclude, reverse_eids=None, reverse_etypes=
seed_edges,
reverse_eid_map=reverse_eids,
reverse_etype_map=reverse_etypes)
if exclude_eids is not None:
exclude_eids = recursive_apply(
exclude_eids, lambda x: x.to(output_device))
if exclude_eids is not None and output_device is not None:
exclude_eids = recursive_apply(exclude_eids, lambda x: F.copy_to(x, output_device))
return exclude_eids
......@@ -202,8 +204,8 @@ class EdgeBlockSampler(object):
classification and link prediction.
"""
def __init__(self, block_sampler, exclude=None, reverse_eids=None,
reverse_etypes=None, negative_sampler=None, prefetch_node_feats=None,
prefetch_labels=None, prefetch_edge_feats=None):
reverse_etypes=None, negative_sampler=None,
prefetch_node_feats=None, prefetch_labels=None, prefetch_edge_feats=None,):
self.reverse_eids = reverse_eids
self.reverse_etypes = reverse_etypes
self.exclude = exclude
......@@ -249,6 +251,8 @@ class EdgeBlockSampler(object):
If :attr:`negative_sampler` is given, also returns another graph containing the
negative pairs as edges.
"""
if isinstance(seed_edges, Mapping):
seed_edges = {g.to_canonical_etype(k): v for k, v in seed_edges.items()}
exclude = self.exclude
pair_graph = g.edge_subgraph(
seed_edges, relabel_nodes=False, output_device=self.output_device)
......
......@@ -55,7 +55,7 @@ class ClusterGCNSampler(object):
partition_node_ids = np.argsort(partition_ids)
partition_size = F.zerocopy_from_numpy(np.bincount(partition_ids, minlength=k))
partition_offset = F.zerocopy_from_numpy(np.insert(np.cumsum(partition_size), 0, 0))
partition_node_ids = F.zerocopy_from_numpy(partition_ids)
partition_node_ids = F.zerocopy_from_numpy(partition_node_ids)
with open(cache_path, 'wb') as f:
pickle.dump((partition_offset, partition_node_ids), f)
self.partition_offset = partition_offset
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment