"docs/git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "818f760732aa541438279055d133b6afb7128311"
Unverified commit 34426a98 authored by Da Zheng, committed by GitHub

[Distributed] Distributed heterograph training (#3069)



* support hetero RGCN.

* fix.

* simplify code.

* sample_neighbors return heterograph directly.

* avoid using to_heterogeneous.

* compute canonical etypes in advance.

* fix tests.

* fix.

* fix distributed data loader for heterograph.

* use NodeDataLoader.

* fix bugs in partitioning on heterogeneous graphs.

* fix lint.

* fix tests.

* fix.

* fix.

* fix bugs.

* fix tests.

* fix.

* enable coo for distributed.

* fix.

* fix.

* fix.

* fix.

* fix.
Co-authored-by: Ubuntu <ubuntu@ip-172-31-71-112.ec2.internal>
Co-authored-by: Zheng <dzzhen@3c22fba32af5.ant.amazon.com>
parent 905c0aa5
@@ -21,15 +21,125 @@ from torch.multiprocessing import Queue
from torch.nn.parallel import DistributedDataParallel
from torch.utils.data import DataLoader
import dgl
from dgl import nn as dglnn
from dgl import DGLGraph
from dgl.distributed import DistDataLoader
from functools import partial
from dgl.nn import RelGraphConv
import tqdm
from ogb.nodeproppred import DglNodePropPredDataset
class RelGraphConvLayer(nn.Module):
r"""Relational graph convolution layer.
Parameters
----------
in_feat : int
Input feature size.
out_feat : int
Output feature size.
rel_names : list[str]
Relation names.
num_bases : int, optional
Number of bases. If None, use the number of relations. Default: None.
weight : bool, optional
True if a linear layer is applied after message passing. Default: True
bias : bool, optional
True if bias is added. Default: True
activation : callable, optional
Activation function. Default: None
self_loop : bool, optional
True to include self loop message. Default: False
dropout : float, optional
Dropout rate. Default: 0.0
"""
def __init__(self,
in_feat,
out_feat,
rel_names,
num_bases,
*,
weight=True,
bias=True,
activation=None,
self_loop=False,
dropout=0.0):
super(RelGraphConvLayer, self).__init__()
self.in_feat = in_feat
self.out_feat = out_feat
self.rel_names = rel_names
self.num_bases = num_bases
self.bias = bias
self.activation = activation
self.self_loop = self_loop
self.conv = dglnn.HeteroGraphConv({
rel : dglnn.GraphConv(in_feat, out_feat, norm='right', weight=False, bias=False)
for rel in rel_names
})
self.use_weight = weight
self.use_basis = num_bases < len(self.rel_names) and weight
if self.use_weight:
if self.use_basis:
self.basis = dglnn.WeightBasis((in_feat, out_feat), num_bases, len(self.rel_names))
else:
self.weight = nn.Parameter(th.Tensor(len(self.rel_names), in_feat, out_feat))
nn.init.xavier_uniform_(self.weight, gain=nn.init.calculate_gain('relu'))
# bias
if bias:
self.h_bias = nn.Parameter(th.Tensor(out_feat))
nn.init.zeros_(self.h_bias)
# weight for self loop
if self.self_loop:
self.loop_weight = nn.Parameter(th.Tensor(in_feat, out_feat))
nn.init.xavier_uniform_(self.loop_weight,
gain=nn.init.calculate_gain('relu'))
self.dropout = nn.Dropout(dropout)
def forward(self, g, inputs):
"""Forward computation
Parameters
----------
g : DGLHeteroGraph
Input graph.
inputs : dict[str, torch.Tensor]
Node feature for each node type.
Returns
-------
dict[str, torch.Tensor]
New node features for each node type.
"""
g = g.local_var()
if self.use_weight:
weight = self.basis() if self.use_basis else self.weight
wdict = {self.rel_names[i] : {'weight' : w.squeeze(0)}
for i, w in enumerate(th.split(weight, 1, dim=0))}
else:
wdict = {}
if g.is_block:
inputs_src = inputs
inputs_dst = {k: v[:g.number_of_dst_nodes(k)] for k, v in inputs.items()}
else:
inputs_src = inputs_dst = inputs
hs = self.conv(g, inputs, mod_kwargs=wdict)
def _apply(ntype, h):
if self.self_loop:
h = h + th.matmul(inputs_dst[ntype], self.loop_weight)
if self.bias:
h = h + self.h_bias
if self.activation:
h = self.activation(h)
return self.dropout(h)
return {ntype : _apply(ntype, h) for ntype, h in hs.items()}
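
A minimal usage sketch of the layer above (illustration only, not part of the patch; the toy heterograph, feature sizes, and num_bases are assumptions):

import torch as th
import dgl

# Toy two-relation heterograph. Every destination node has in-degree >= 1,
# which GraphConv(norm='right') requires.
g = dgl.heterograph({
    ('user', 'follows', 'user'): (th.tensor([0, 1, 2]), th.tensor([1, 2, 0])),
    ('user', 'plays', 'game'): (th.tensor([0, 2]), th.tensor([0, 1])),
})
layer = RelGraphConvLayer(in_feat=8, out_feat=4, rel_names=g.etypes,
                          num_bases=2, activation=th.relu, self_loop=True)
feats = {'user': th.randn(g.number_of_nodes('user'), 8),
         'game': th.randn(g.number_of_nodes('game'), 8)}
out = layer(g, feats)  # dict of per-type outputs: 'user' -> (3, 4), 'game' -> (2, 4)
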
class EntityClassify(nn.Module):
""" Entity classification class for RGCN
Parameters
@@ -42,8 +152,8 @@ class EntityClassify(nn.Module):
Hidden dim size.
out_dim : int
Output dim size.
num_rels : int
Number of relation types.
rel_names : list of str
A list of relation names.
num_bases : int
Number of bases. If None, use the number of relations.
num_hidden_layers : int
@@ -52,51 +162,43 @@ class EntityClassify(nn.Module):
Dropout
use_self_loop : bool
Use self loop if True, default False.
low_mem : bool
True to use the low-memory implementation of the relation message passing function,
trading speed for memory consumption.
"""
def __init__(self,
device,
h_dim,
out_dim,
num_rels,
rel_names,
num_bases=None,
num_hidden_layers=1,
dropout=0,
use_self_loop=False,
low_mem=False,
layer_norm=False):
super(EntityClassify, self).__init__()
self.device = device
self.h_dim = h_dim
self.out_dim = out_dim
self.num_rels = num_rels
self.num_bases = None if num_bases < 0 else num_bases
self.num_hidden_layers = num_hidden_layers
self.dropout = dropout
self.use_self_loop = use_self_loop
self.low_mem = low_mem
self.layer_norm = layer_norm
self.layers = nn.ModuleList()
# i2h
self.layers.append(RelGraphConv(
self.h_dim, self.h_dim, self.num_rels, "basis",
self.layers.append(RelGraphConvLayer(
self.h_dim, self.h_dim, rel_names,
self.num_bases, activation=F.relu, self_loop=self.use_self_loop,
low_mem=self.low_mem, dropout=self.dropout))
dropout=self.dropout))
# h2h
for idx in range(self.num_hidden_layers):
self.layers.append(RelGraphConv(
self.h_dim, self.h_dim, self.num_rels, "basis",
self.layers.append(RelGraphConvLayer(
self.h_dim, self.h_dim, rel_names,
self.num_bases, activation=F.relu, self_loop=self.use_self_loop,
low_mem=self.low_mem, dropout=self.dropout))
dropout=self.dropout))
# h2o
self.layers.append(RelGraphConv(
self.h_dim, self.out_dim, self.num_rels, "basis",
self.num_bases, activation=None,
self_loop=self.use_self_loop,
low_mem=self.low_mem))
self.layers.append(RelGraphConvLayer(
self.h_dim, self.out_dim, rel_names,
self.num_bases, activation=None, self_loop=self.use_self_loop))
def forward(self, blocks, feats, norm=None):
if blocks is None:
@@ -105,7 +207,7 @@ class EntityClassify(nn.Module):
h = feats
for layer, block in zip(self.layers, blocks):
block = block.to(self.device)
h = layer(block, h, block.edata[dgl.ETYPE], block.edata['norm'])
h = layer(block, h)
return h
def init_emb(shape, dtype):
@@ -182,27 +284,23 @@ class DistEmbedLayer(nn.Module):
self.node_embeds[ntype] = th.nn.Embedding(g.number_of_nodes(ntype), self.embed_size)
nn.init.uniform_(self.node_embeds[ntype].weight, -1.0, 1.0)
def forward(self, node_ids, ntype_ids):
def forward(self, node_ids):
"""Forward computation
Parameters
----------
node_ids : Tensor
node_ids : dict of Tensor
Node IDs to generate embeddings for, keyed by node type.
ntype_ids : Tensor
node type ids
Returns
-------
dict of Tensor
Embeddings per node type, used as the input of the next layer.
"""
embeds = th.empty(node_ids.shape[0], self.embed_size, device=self.dev_id)
for ntype_id in th.unique(ntype_ids).tolist():
ntype = self.ntype_id_map[int(ntype_id)]
loc = ntype_ids == ntype_id
embeds = {}
for ntype in node_ids:
if self.feat_name in self.g.nodes[ntype].data:
embeds[loc] = self.node_projs[ntype](self.g.nodes[ntype].data[self.feat_name][node_ids[ntype_ids == ntype_id]].to(self.dev_id))
embeds[ntype] = self.node_projs[ntype](self.g.nodes[ntype].data[self.feat_name][node_ids[ntype]].to(self.dev_id))
else:
embeds[loc] = self.node_embeds[ntype](node_ids[ntype_ids == ntype_id]).to(self.dev_id)
embeds[ntype] = self.node_embeds[ntype](node_ids[ntype]).to(self.dev_id)
return embeds
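
The per-type dispatch above can be pictured in plain PyTorch (a self-contained sketch with made-up names and sizes, not the DistGraph API): node types that carry stored features go through a linear projection, the rest hit a learnable embedding table.

import torch as th
import torch.nn as nn

feats = {'paper': th.randn(6, 8)}                 # only 'paper' has stored features
node_ids = {'paper': th.tensor([0, 2]), 'author': th.tensor([1, 3])}
proj = nn.Linear(8, 16)                           # stands in for node_projs['paper']
table = nn.Embedding(10, 16)                      # stands in for node_embeds['author']

embeds = {}
for ntype, ids in node_ids.items():
    if ntype in feats:
        embeds[ntype] = proj(feats[ntype][ids])   # project stored features
    else:
        embeds[ntype] = table(ids)                # look up learned embeddings
# embeds['paper'].shape == embeds['author'].shape == (2, 16)
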
def compute_acc(results, labels):
@@ -212,14 +310,6 @@ def compute_acc(results, labels):
labels = labels.long()
return (results == labels).float().sum() / len(results)
def gen_norm(g):
_, v, eid = g.all_edges(form='all')
_, inverse_index, count = th.unique(v, return_inverse=True, return_counts=True)
degrees = count[inverse_index]
norm = th.ones(eid.shape[0], device=eid.device) / degrees
norm = norm.unsqueeze(1)
g.edata['norm'] = norm
def evaluate(g, model, embed_layer, labels, eval_loader, test_loader, all_val_nid, all_test_nid):
model.eval()
embed_layer.eval()
@@ -231,11 +321,12 @@ def evaluate(g, model, embed_layer, labels, eval_loader, test_loader, all_val_ni
with th.no_grad():
th.cuda.empty_cache()
for sample_data in tqdm.tqdm(eval_loader):
seeds, blocks = sample_data
for block in blocks:
gen_norm(block)
feats = embed_layer(blocks[0].srcdata[dgl.NID], blocks[0].srcdata[dgl.NTYPE])
input_nodes, seeds, blocks = sample_data
seeds = seeds['paper']
feats = embed_layer(input_nodes)
logits = model(blocks, feats)
assert len(logits) == 1
logits = logits['paper']
eval_logits.append(logits.cpu().detach())
assert np.all(seeds.numpy() < g.number_of_nodes('paper'))
eval_seeds.append(seeds.cpu().detach())
@@ -248,11 +339,12 @@ def evaluate(g, model, embed_layer, labels, eval_loader, test_loader, all_val_ni
with th.no_grad():
th.cuda.empty_cache()
for sample_data in tqdm.tqdm(test_loader):
seeds, blocks = sample_data
for block in blocks:
gen_norm(block)
feats = embed_layer(blocks[0].srcdata[dgl.NID], blocks[0].srcdata[dgl.NTYPE])
input_nodes, seeds, blocks = sample_data
seeds = seeds['paper']
feats = embed_layer(input_nodes)
logits = model(blocks, feats)
assert len(logits) == 1
logits = logits['paper']
test_logits.append(logits.cpu().detach())
assert np.all(seeds.numpy() < g.number_of_nodes('paper'))
test_seeds.append(seeds.cpu().detach())
@@ -267,90 +359,36 @@ def evaluate(g, model, embed_layer, labels, eval_loader, test_loader, all_val_ni
else:
return -1, -1
class NeighborSampler:
"""Neighbor sampler
Parameters
----------
g : DGLHeterograph
Full graph
target_idx : tensor
The target training node IDs in g
fanouts : list of int
Fanout of each hop starting from the seed nodes. If a fanout is None,
sample full neighbors.
"""
def __init__(self, g, fanouts, sample_neighbors):
self.g = g
self.fanouts = fanouts
self.sample_neighbors = sample_neighbors
def sample_blocks(self, seeds):
"""Do neighbor sample
Parameters
----------
seeds :
Seed nodes
Returns
-------
tensor
Seed nodes, also known as target nodes
blocks
Sampled subgraphs
"""
blocks = []
etypes = []
norms = []
ntypes = []
seeds = th.LongTensor(np.asarray(seeds))
gpb = self.g.get_partition_book()
# We need to map the per-type node IDs to homogeneous IDs.
cur = gpb.map_to_homo_nid(seeds, 'paper')
for fanout in self.fanouts:
# For a heterogeneous input graph, the returned frontier is stored in
# the homogeneous graph format.
frontier = self.sample_neighbors(self.g, cur, fanout, replace=False)
block = dgl.to_block(frontier, cur)
cur = block.srcdata[dgl.NID]
block.edata[dgl.EID] = frontier.edata[dgl.EID]
# Map the homogeneous edge IDs to their edge type.
block.edata[dgl.ETYPE], block.edata[dgl.EID] = gpb.map_to_per_etype(block.edata[dgl.EID])
# Map the homogeneous node IDs to their node types and per-type IDs.
block.srcdata[dgl.NTYPE], block.srcdata[dgl.NID] = gpb.map_to_per_ntype(block.srcdata[dgl.NID])
block.dstdata[dgl.NTYPE], block.dstdata[dgl.NID] = gpb.map_to_per_ntype(block.dstdata[dgl.NID])
blocks.insert(0, block)
return seeds, blocks
def run(args, device, data):
g, num_classes, train_nid, val_nid, test_nid, labels, all_val_nid, all_test_nid = data
num_rels = len(g.etypes)
fanouts = [int(fanout) for fanout in args.fanout.split(',')]
val_fanouts = [int(fanout) for fanout in args.validation_fanout.split(',')]
sampler = NeighborSampler(g, fanouts, dgl.distributed.sample_neighbors)
# Create DataLoader for constructing blocks
dataloader = DistDataLoader(
dataset=train_nid,
sampler = dgl.dataloading.MultiLayerNeighborSampler(fanouts)
dataloader = dgl.dataloading.NodeDataLoader(
g,
{'paper': train_nid},
sampler,
batch_size=args.batch_size,
collate_fn=sampler.sample_blocks,
shuffle=True,
drop_last=False)
valid_sampler = NeighborSampler(g, val_fanouts, dgl.distributed.sample_neighbors)
# Create DataLoader for constructing blocks
valid_dataloader = DistDataLoader(
dataset=val_nid,
valid_sampler = dgl.dataloading.MultiLayerNeighborSampler(val_fanouts)
valid_dataloader = dgl.dataloading.NodeDataLoader(
g,
{'paper': val_nid},
valid_sampler,
batch_size=args.batch_size,
collate_fn=valid_sampler.sample_blocks,
shuffle=False,
drop_last=False)
test_sampler = NeighborSampler(g, [-1] * args.n_layers, dgl.distributed.sample_neighbors)
# Create DataLoader for constructing blocks
test_dataloader = DistDataLoader(
dataset=test_nid,
test_sampler = dgl.dataloading.MultiLayerNeighborSampler(val_fanouts)
test_dataloader = dgl.dataloading.NodeDataLoader(
g,
{'paper': test_nid},
test_sampler,
batch_size=args.eval_batch_size,
collate_fn=test_sampler.sample_blocks,
shuffle=False,
drop_last=False)
@@ -364,12 +402,11 @@ def run(args, device, data):
model = EntityClassify(device,
args.n_hidden,
num_classes,
num_rels,
g.etypes,
num_bases=args.n_bases,
num_hidden_layers=args.n_layers-2,
dropout=args.dropout,
use_self_loop=args.use_self_loop,
low_mem=args.low_mem,
layer_norm=args.layer_norm)
model = model.to(device)
@@ -442,22 +479,23 @@ def run(args, device, data):
# blocks.
step_time = []
for step, sample_data in enumerate(dataloader):
seeds, blocks = sample_data
input_nodes, seeds, blocks = sample_data
seeds = seeds['paper']
number_train += seeds.shape[0]
number_input += np.sum([blocks[0].num_src_nodes(ntype) for ntype in blocks[0].ntypes])
tic_step = time.time()
sample_time += tic_step - start
sample_t.append(tic_step - start)
for block in blocks:
gen_norm(block)
feats = embed_layer(blocks[0].srcdata[dgl.NID], blocks[0].srcdata[dgl.NTYPE])
feats = embed_layer(input_nodes)
label = labels[seeds].to(device)
copy_time = time.time()
feat_copy_t.append(copy_time - tic_step)
# forward
logits = model(blocks, feats)
assert len(logits) == 1
logits = logits['paper']
loss = F.cross_entropy(logits, label)
forward_end = time.time()
......
@@ -390,6 +390,7 @@ def zerocopy_to_numpy(arr):
return arr.asnumpy()
def zerocopy_from_numpy(np_data):
np_data = np.asarray(np_data, order='C')
return mx.nd.from_numpy(np_data, zero_copy=True)
def zerocopy_to_dgl_ndarray(arr):
......
@@ -361,7 +361,8 @@ class Collator(ABC):
def _prepare_tensor_dict(g, data, name, is_distributed):
if is_distributed:
x = F.tensor(next(iter(data.values())))
return {k: F.copy_to(F.astype(v, F.dtype(x)), F.context(x)) for k, v in data.items()}
return {k: F.copy_to(F.astype(F.tensor(v), F.dtype(x)), F.context(x)) \
for k, v in data.items()}
else:
return utils.prepare_tensor_dict(g, data, name)
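
In isolation, the distributed branch amounts to the following (a hedged sketch in plain PyTorch; F above is DGL's backend shim, so torch stands in here): every value in the dict is materialized as a tensor and cast to the dtype and device of the first value.

import torch as th

data = {'paper': [0, 1, 2], 'author': th.tensor([3, 4])}
x = th.tensor(next(iter(data.values())))          # reference dtype/device
out = {k: th.as_tensor(v).to(dtype=x.dtype, device=x.device)
       for k, v in data.items()}
# out['paper'] and out['author'] are both int64 tensors on the same device
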
......
@@ -64,7 +64,7 @@ class DistDataLoader:
Parameters
----------
dataset: a tensor
A tensor of node IDs or edge IDs.
Tensors of node IDs or edge IDs.
batch_size: int
The number of samples per batch to load.
shuffle: bool, optional
@@ -127,7 +127,8 @@ class DistDataLoader:
self.shuffle = shuffle
self.is_closed = False
self.dataset = F.tensor(dataset)
self.dataset = dataset
self.data_idx = F.arange(0, len(dataset))
self.expected_idxs = len(dataset) // self.batch_size
if not self.drop_last and len(dataset) % self.batch_size != 0:
self.expected_idxs += 1
@@ -176,7 +177,7 @@ class DistDataLoader:
def __iter__(self):
if self.shuffle:
self.dataset = F.rand_shuffle(self.dataset)
self.data_idx = F.rand_shuffle(self.data_idx)
self.recv_idxs = 0
self.current_pos = 0
self.num_pending = 0
@@ -205,6 +206,7 @@ class DistDataLoader:
end_pos = len(self.dataset)
else:
end_pos = self.current_pos + self.batch_size
ret = self.dataset[self.current_pos:end_pos]
idx = self.data_idx[self.current_pos:end_pos].tolist()
ret = [self.dataset[i] for i in idx]
self.current_pos = end_pos
return ret
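
The effect of this change is that shuffling permutes an index array instead of the dataset itself, so the dataset only needs __len__ and __getitem__. A standalone sketch of the batching loop (toy data, not the DistDataLoader API):

import torch as th

dataset = ['a', 'b', 'c', 'd', 'e']               # any indexable dataset
data_idx = th.randperm(len(dataset))              # shuffle indices, not data
batch_size, pos = 2, 0
while pos < len(dataset):
    end = min(pos + batch_size, len(dataset))
    idx = data_idx[pos:end].tolist()
    batch = [dataset[i] for i in idx]             # gather items by position
    pos = end
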
@@ -296,7 +296,7 @@ class DistGraphServer(KVServer):
'''
def __init__(self, server_id, ip_config, num_servers,
num_clients, part_config, disable_shared_mem=False,
graph_format='csc'):
graph_format=('csc', 'coo')):
super(DistGraphServer, self).__init__(server_id=server_id,
ip_config=ip_config,
num_servers=num_servers,
@@ -482,6 +482,25 @@ class DistGraph:
self._ntype_map = {ntype:i for i, ntype in enumerate(self.ntypes)}
self._etype_map = {etype:i for i, etype in enumerate(self.etypes)}
# Get canonical edge types.
# TODO(zhengda) this requires the server to store the graph with coo format.
eid = []
for etype in self.etypes:
type_eid = F.zeros((1,), F.int64, F.cpu())
eid.append(self._gpb.map_to_homo_eid(type_eid, etype))
eid = F.cat(eid, 0)
src, dst = dist_find_edges(self, eid)
src_tids, _ = self._gpb.map_to_per_ntype(src)
dst_tids, _ = self._gpb.map_to_per_ntype(dst)
self._canonical_etypes = []
etype_ids = F.arange(0, len(self.etypes))
for src_tid, etype_id, dst_tid in zip(src_tids, etype_ids, dst_tids):
src_tid = F.as_scalar(src_tid)
etype_id = F.as_scalar(etype_id)
dst_tid = F.as_scalar(dst_tid)
self._canonical_etypes.append((self.ntypes[src_tid], self.etypes[etype_id],
self.ntypes[dst_tid]))
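To make the probe concrete (a hypothetical outcome, not computed from a real partition book):

# With ntypes ['game', 'user'] and etypes ['follows', 'plays'], mapping edge 0
# of each type to its endpoints and the endpoints to node-type IDs might give
# src_tids = [1, 1] and dst_tids = [1, 0], which yields
# [('user', 'follows', 'user'), ('user', 'plays', 'game')].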
def _init(self):
self._client = get_kvstore()
assert self._client is not None, \
@@ -576,7 +595,7 @@ class DistGraph:
int
"""
# TODO(da?): describe when self._g is None and idtype shouldn't be called.
return self._g.idtype
return F.int64
@property
def device(self):
@@ -598,7 +617,7 @@ class DistGraph:
Device context object
"""
# TODO(da?): describe when self._g is None and device shouldn't be called.
return self._g.device
return F.cpu()
@property
def ntypes(self):
@@ -635,6 +654,42 @@ class DistGraph:
# Currently, we only support a graph with one edge type.
return self._gpb.etypes
@property
def canonical_etypes(self):
"""Return all the canonical edge types in the graph.
A canonical edge type is a string triplet ``(str, str, str)``
for source node type, edge type and destination node type.
Returns
-------
list[(str, str, str)]
All the canonical edge type triplets in a list.
Notes
-----
DGL internally assigns an integer ID for each edge type. The returned
edge type names are sorted according to their IDs.
See Also
--------
etypes
Examples
--------
The following example uses PyTorch backend.
>>> import dgl
>>> import torch
>>> g = DistGraph("test")
>>> g.canonical_etypes
[('user', 'follows', 'user'),
('user', 'follows', 'game'),
('user', 'plays', 'game')]
"""
return self._canonical_etypes
def get_ntype_id(self, ntype):
"""Return the ID of the given node type.
......
@@ -770,16 +770,20 @@ class RangePartitionBook(GraphPartitionBook):
"""
ids = utils.toindex(ids).tousertensor()
partids = self.nid2partid(ids, ntype)
end_diff = F.tensor(self._typed_max_node_ids[ntype])[partids] - ids
return F.tensor(self._typed_nid_range[ntype][:, 1])[partids] - end_diff
typed_max_nids = F.zerocopy_from_numpy(self._typed_max_node_ids[ntype])
end_diff = F.gather_row(typed_max_nids, partids) - ids
typed_nid_range = F.zerocopy_from_numpy(self._typed_nid_range[ntype][:, 1])
return F.gather_row(typed_nid_range, partids) - end_diff
def map_to_homo_eid(self, ids, etype):
"""Map per-edge-type IDs to global edge IDs in the homoenegeous format.
"""
ids = utils.toindex(ids).tousertensor()
partids = self.eid2partid(ids, etype)
end_diff = F.tensor(self._typed_max_edge_ids[etype][partids]) - ids
return F.tensor(self._typed_eid_range[etype][:, 1])[partids] - end_diff
typed_max_eids = F.zerocopy_from_numpy(self._typed_max_edge_ids[etype])
end_diff = F.gather_row(typed_max_eids, partids) - ids
typed_eid_range = F.zerocopy_from_numpy(self._typed_eid_range[etype][:, 1])
return F.gather_row(typed_eid_range, partids) - end_diff
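
The arithmetic here: the homogeneous ID is the end of this type's ID range in the edge's partition, minus the typed ID's distance from the end of the type's cumulative count in that partition. A numeric sketch with made-up ranges (map_to_homo_nid works the same way for nodes):

import numpy as np

# Assume etype 'r1' owns homogeneous IDs [100, 110) in partition 0 and
# [300, 315) in partition 1, i.e. typed IDs 0-9 and 10-24 respectively.
range_end = np.array([110, 315])   # per-partition end of the homogeneous ID range
max_eids = np.array([10, 25])      # cumulative per-type edge count per partition

ids = np.array([3, 12])            # per-type edge IDs to convert
partids = np.array([0, 1])         # partition holding each ID
homo = range_end[partids] - (max_eids[partids] - ids)
# homo == [103, 302]: 110 - (10 - 3) and 315 - (25 - 12)
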
def nid2partid(self, nids, ntype='_N'):
"""From global node IDs to partition IDs
......
@@ -5,7 +5,7 @@ from .rpc import Request, Response, send_requests_to_machine, recv_responses
from ..sampling import sample_neighbors as local_sample_neighbors
from ..subgraph import in_subgraph as local_in_subgraph
from .rpc import register_service
from ..convert import graph
from ..convert import graph, heterograph
from ..base import NID, EID
from ..utils import toindex
from .. import backend as F
@@ -337,19 +337,8 @@ def sample_neighbors(g, nodes, fanout, edge_dir='in', prob=None, replace=False):
Node/edge features are not preserved. The original IDs of
the sampled edges are stored as the `dgl.EID` feature in the returned graph.
This version provides experimental support for heterogeneous graphs.
When the input graph is heterogeneous, the sampled subgraph is still stored in
the homogeneous graph format. That is, all nodes and edges are assigned with
unique IDs (in contrast, we typically use a type name and a node/edge ID to
identify a node or an edge in ``DGLGraph``). We refer to this type of IDs
as *homogeneous ID*.
Users can use :func:`dgl.distributed.GraphPartitionBook.map_to_per_ntype`
and :func:`dgl.distributed.GraphPartitionBook.map_to_per_etype`
to identify their node/edge types and node/edge IDs of that type.
For heterogeneous graphs, ``nodes`` can be a dictionary whose key is node type
and the value is type-specific node IDs; ``nodes`` can also be a tensor of
*homogeneous ID*.
For heterogeneous graphs, ``nodes`` is a dictionary whose keys are node types
and whose values are type-specific node IDs.
Parameters
----------
@@ -388,7 +377,8 @@ def sample_neighbors(g, nodes, fanout, edge_dir='in', prob=None, replace=False):
A sampled subgraph containing only the sampled neighboring edges. It is on CPU.
"""
gpb = g.get_partition_book()
if isinstance(nodes, dict):
if len(gpb.etypes) > 1:
assert isinstance(nodes, dict)
homo_nids = []
for ntype in nodes:
assert ntype in g.ntypes, 'The sampled node type does not exist in the input graph'
@@ -398,13 +388,45 @@ def sample_neighbors(g, nodes, fanout, edge_dir='in', prob=None, replace=False):
typed_nodes = toindex(nodes[ntype]).tousertensor()
homo_nids.append(gpb.map_to_homo_nid(typed_nodes, ntype))
nodes = F.cat(homo_nids, 0)
elif isinstance(nodes, dict):
assert len(nodes) == 1
nodes = list(nodes.values())[0]
def issue_remote_req(node_ids):
return SamplingRequest(node_ids, fanout, edge_dir=edge_dir,
prob=prob, replace=replace)
def local_access(local_g, partition_book, local_nids):
return _sample_neighbors(local_g, partition_book, local_nids,
fanout, edge_dir, prob, replace)
return _distributed_access(g, nodes, issue_remote_req, local_access)
frontier = _distributed_access(g, nodes, issue_remote_req, local_access)
if len(gpb.etypes) > 1:
etype_ids, frontier.edata[EID] = gpb.map_to_per_etype(frontier.edata[EID])
src, dst = frontier.edges()
etype_ids, idx = F.sort_1d(etype_ids)
src, dst = F.gather_row(src, idx), F.gather_row(dst, idx)
eid = F.gather_row(frontier.edata[EID], idx)
_, src = gpb.map_to_per_ntype(src)
_, dst = gpb.map_to_per_ntype(dst)
data_dict = dict()
edge_ids = {}
for etid in range(len(g.etypes)):
etype = g.etypes[etid]
canonical_etype = g.canonical_etypes[etid]
type_idx = etype_ids == etid
if F.sum(type_idx, 0) > 0:
data_dict[canonical_etype] = (F.boolean_mask(src, type_idx), \
F.boolean_mask(dst, type_idx))
edge_ids[etype] = F.boolean_mask(eid, type_idx)
hg = heterograph(data_dict,
{ntype: g.number_of_nodes(ntype) for ntype in g.ntypes},
idtype=g.idtype)
for etype in edge_ids:
hg.edges[etype].data[EID] = edge_ids[etype]
return hg
else:
return frontier
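
Usage after this change (a sketch assuming an already-initialized DistGraph g; the node type and fanout are illustrative): callers pass per-type seed IDs and get a DGLHeteroGraph back, with the original edge IDs preserved per edge type.

import dgl
import torch as th

# g is an initialized dgl.distributed.DistGraph (setup elided).
seeds = {'paper': th.tensor([0, 1, 2])}
frontier = dgl.distributed.sample_neighbors(g, seeds, 10)
for c_etype in frontier.canonical_etypes:
    eids = frontier.edges[c_etype].data[dgl.EID]  # edge IDs in the original graph
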
def _distributed_edge_access(g, edges, issue_remote_req, local_access):
"""A routine that fetches local edges from distributed graph.
......
@@ -55,7 +55,8 @@ def create_random_graph(n):
def run_server(graph_name, server_id, server_count, num_clients, shared_mem):
g = DistGraphServer(server_id, "kv_ip_config.txt", server_count, num_clients,
'/tmp/dist_graph/{}.json'.format(graph_name),
disable_shared_mem=not shared_mem)
disable_shared_mem=not shared_mem,
graph_format=['csc', 'coo'])
print('start server', server_id)
g.start()
@@ -469,6 +470,13 @@ def check_dist_graph_hetero(g, num_clients, num_nodes, num_edges):
for etype in num_edges:
assert etype in g.etypes
assert num_edges[etype] == g.number_of_edges(etype)
etypes = [('n1', 'r1', 'n2'),
('n1', 'r2', 'n3'),
('n2', 'r3', 'n3')]
for i, etype in enumerate(g.canonical_etypes):
assert etype[0] == etypes[i][0]
assert etype[1] == etypes[i][1]
assert etype[2] == etypes[i][2]
assert g.number_of_nodes() == sum([num_nodes[ntype] for ntype in num_nodes])
assert g.number_of_edges() == sum([num_edges[etype] for etype in num_edges])
@@ -584,7 +592,6 @@ def test_server_client():
check_server_client(True, 1, 1)
check_server_client(False, 1, 1)
check_server_client(True, 2, 2)
check_server_client(False, 2, 2)
@unittest.skipIf(os.name == 'nt', reason='Do not support windows yet')
@unittest.skipIf(dgl.backend.backend_name == "tensorflow", reason="TF doesn't support distributed DistEmbedding")
......
@@ -16,7 +16,7 @@ from scipy import sparse as spsp
from dgl.distributed import DistGraphServer, DistGraph
def start_server(rank, tmpdir, disable_shared_mem, graph_name, graph_format='csc'):
def start_server(rank, tmpdir, disable_shared_mem, graph_name, graph_format=['csc', 'coo']):
g = DistGraphServer(rank, "rpc_ip_config.txt", 1, 1,
tmpdir / (graph_name + '.json'), disable_shared_mem=disable_shared_mem,
graph_format=graph_format)
@@ -284,7 +284,6 @@ def start_hetero_sample_client(rank, tmpdir, disable_shared_mem):
try:
nodes = {'n3': [0, 10, 99, 66, 124, 208]}
sampled_graph = sample_neighbors(dist_graph, nodes, 3)
nodes = gpb.map_to_homo_nid(nodes['n3'], 'n3')
block = dgl.to_block(sampled_graph, nodes)
block.edata[dgl.EID] = sampled_graph.edata[dgl.EID]
except Exception as e:
@@ -320,47 +319,36 @@ def check_rpc_hetero_sampling_shuffle(tmpdir, num_server):
for p in pserver_list:
p.join()
orig_nid_map = F.zeros((g.number_of_nodes(),), dtype=F.int64)
orig_eid_map = F.zeros((g.number_of_edges(),), dtype=F.int64)
orig_nid_map = {ntype: F.zeros((g.number_of_nodes(ntype),), dtype=F.int64) for ntype in g.ntypes}
orig_eid_map = {etype: F.zeros((g.number_of_edges(etype),), dtype=F.int64) for etype in g.etypes}
for i in range(num_server):
part, _, _, _, _, _, _ = load_partition(tmpdir / 'test_sampling.json', i)
F.scatter_row_inplace(orig_nid_map, part.ndata[dgl.NID], part.ndata['orig_id'])
F.scatter_row_inplace(orig_eid_map, part.edata[dgl.EID], part.edata['orig_id'])
src, dst = block.edges()
# These are global IDs after shuffling.
shuffled_src = F.gather_row(block.srcdata[dgl.NID], src)
shuffled_dst = F.gather_row(block.dstdata[dgl.NID], dst)
shuffled_eid = block.edata[dgl.EID]
# Get node/edge types.
etype, _ = gpb.map_to_per_etype(shuffled_eid)
src_type, _ = gpb.map_to_per_ntype(shuffled_src)
dst_type, _ = gpb.map_to_per_ntype(shuffled_dst)
etype = F.asnumpy(etype)
src_type = F.asnumpy(src_type)
dst_type = F.asnumpy(dst_type)
# These are global IDs in the original graph.
orig_src = F.asnumpy(F.gather_row(orig_nid_map, shuffled_src))
orig_dst = F.asnumpy(F.gather_row(orig_nid_map, shuffled_dst))
orig_eid = F.asnumpy(F.gather_row(orig_eid_map, shuffled_eid))
etype_map = {g.get_etype_id(etype):etype for etype in g.etypes}
etype_to_eptype = {g.get_etype_id(etype):(src_ntype, dst_ntype) for src_ntype, etype, dst_ntype in g.canonical_etypes}
for e in np.unique(etype):
src_t = src_type[etype == e]
dst_t = dst_type[etype == e]
assert np.all(src_t == src_t[0])
assert np.all(dst_t == dst_t[0])
ntype_ids, type_nids = gpb.map_to_per_ntype(part.ndata[dgl.NID])
for ntype_id, ntype in enumerate(g.ntypes):
idx = ntype_ids == ntype_id
F.scatter_row_inplace(orig_nid_map[ntype], F.boolean_mask(type_nids, idx),
F.boolean_mask(part.ndata['orig_id'], idx))
etype_ids, type_eids = gpb.map_to_per_etype(part.edata[dgl.EID])
for etype_id, etype in enumerate(g.etypes):
idx = etype_ids == etype_id
F.scatter_row_inplace(orig_eid_map[etype], F.boolean_mask(type_eids, idx),
F.boolean_mask(part.edata['orig_id'], idx))
for src_type, etype, dst_type in block.canonical_etypes:
src, dst = block.edges(etype=etype)
# These are global IDs after shuffling.
shuffled_src = F.gather_row(block.srcnodes[src_type].data[dgl.NID], src)
shuffled_dst = F.gather_row(block.dstnodes[dst_type].data[dgl.NID], dst)
shuffled_eid = block.edges[etype].data[dgl.EID]
orig_src = F.asnumpy(F.gather_row(orig_nid_map[src_type], shuffled_src))
orig_dst = F.asnumpy(F.gather_row(orig_nid_map[dst_type], shuffled_dst))
orig_eid = F.asnumpy(F.gather_row(orig_eid_map[etype], shuffled_eid))
# Check the node IDs and edge IDs.
orig_src1, orig_dst1 = g.find_edges(orig_eid[etype == e], etype=etype_map[e])
assert np.all(F.asnumpy(orig_src1) == orig_src[etype == e])
assert np.all(F.asnumpy(orig_dst1) == orig_dst[etype == e])
# Check the node types.
src_ntype, dst_ntype = etype_to_eptype[e]
assert np.all(src_t == g.get_ntype_id(src_ntype))
assert np.all(dst_t == g.get_ntype_id(dst_ntype))
orig_src1, orig_dst1 = g.find_edges(orig_eid, etype=etype)
assert np.all(F.asnumpy(orig_src1) == orig_src)
assert np.all(F.asnumpy(orig_dst1) == orig_dst)
# Wait non shared memory graph store
@unittest.skipIf(os.name == 'nt', reason='Do not support windows yet')
......
@@ -41,7 +41,8 @@ def start_server(rank, tmpdir, disable_shared_mem, num_clients):
import dgl
print('server: #clients=' + str(num_clients))
g = DistGraphServer(rank, "mp_ip_config.txt", 1, num_clients,
tmpdir / 'test_sampling.json', disable_shared_mem=disable_shared_mem)
tmpdir / 'test_sampling.json', disable_shared_mem=disable_shared_mem,
graph_format=['csc', 'coo'])
g.start()
......