Unverified Commit e9b624fe authored by Minjie Wang's avatar Minjie Wang Committed by GitHub

Merge branch 'master' into dist_part

parents 8086d1ed a88e7f7e
...@@ -360,6 +360,13 @@ class HeteroEmbedding(nn.Module): ...@@ -360,6 +360,13 @@ class HeteroEmbedding(nn.Module):
""" """
return {self.raw_keys[typ]: emb.weight for typ, emb in self.embeds.items()} return {self.raw_keys[typ]: emb.weight for typ, emb in self.embeds.items()}
def reset_parameters(self):
"""
Reinitialize the parameters with the Xavier uniform method from the ``nn.init`` module.
"""
for typ in self.embeds.keys():
nn.init.xavier_uniform_(self.embeds[typ].weight)
def forward(self, input_ids): def forward(self, input_ids):
"""Forward function """Forward function
......
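The new ``reset_parameters`` method above re-initializes every per-type embedding table with Xavier uniform. A minimal usage sketch (the type names and sizes are made up for illustration, assuming the constructor takes a dict of per-type counts and an embedding dimension):

import torch
from dgl.nn.pytorch import HeteroEmbedding

# Hypothetical sizes: 3 'user' and 4 'game' embeddings of dimension 16.
embed = HeteroEmbedding({'user': 3, 'game': 4}, 16)
embed.reset_parameters()                       # Xavier-uniform re-initialization per type
out = embed({'user': torch.tensor([0, 2])})    # look up a subset of one type
print(out['user'].shape)                       # torch.Size([2, 16])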
...@@ -135,6 +135,7 @@ def edge_softmax(graph, logits, eids=ALL, norm_by='dst'): ...@@ -135,6 +135,7 @@ def edge_softmax(graph, logits, eids=ALL, norm_by='dst'):
eids=eids, norm_by=norm_by) eids=eids, norm_by=norm_by)
else: else:
logits_list = [None] * graph._graph.number_of_etypes() logits_list = [None] * graph._graph.number_of_etypes()
logits = {graph.to_canonical_etype(k): v for k, v in logits.items()}
for rel in graph.canonical_etypes: for rel in graph.canonical_etypes:
etid = graph.get_etype_id(rel) etid = graph.get_etype_id(rel)
logits_list[etid] = logits[rel] logits_list[etid] = logits[rel]
......
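The hunk above converts the keys of a per-edge-type logits dict to canonical edge types, so plain edge-type names are accepted. A hedged sketch (graph and logits are illustrative):

import torch
import dgl
from dgl.ops import edge_softmax

g = dgl.heterograph({
    ('user', 'follows', 'user'): (torch.tensor([0, 1]), torch.tensor([1, 2])),
    ('user', 'plays', 'game'): (torch.tensor([0, 1]), torch.tensor([0, 0]))})
# Keys may be short edge-type names; they are canonicalized internally now.
logits = {'follows': torch.randn(2, 1), 'plays': torch.randn(2, 1)}
scores = edge_softmax(g, logits)   # softmax of logits over the incoming edges of each node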
"""Classes and functions for batching multiple graphs together.""" """Classes and functions for batching multiple graphs together."""
from __future__ import absolute_import from __future__ import absolute_import
from .base import DGLError from .base import DGLError, dgl_warning
from . import backend as F from . import backend as F
from .ops import segment from .ops import segment
...@@ -365,8 +365,9 @@ def broadcast_nodes(graph, graph_feat, *, ntype=None): ...@@ -365,8 +365,9 @@ def broadcast_nodes(graph, graph_feat, *, ntype=None):
graph : DGLGraph graph : DGLGraph
The graph. The graph.
graph_feat : tensor graph_feat : tensor
The feature to broadcast. Tensor shape is :math:`(*)` for single graph, and The feature to broadcast. Tensor shape is :math:`(B, *)` for batched graph,
:math:`(B, *)` for batched graph. where :math:`B` is the batch size.
ntype : str, optional ntype : str, optional
Node type. Can be omitted if there is only one node type. Node type. Can be omitted if there is only one node type.
...@@ -403,9 +404,11 @@ def broadcast_nodes(graph, graph_feat, *, ntype=None): ...@@ -403,9 +404,11 @@ def broadcast_nodes(graph, graph_feat, *, ntype=None):
[0.2721, 0.4629, 0.7269, 0.0724, 0.1014], [0.2721, 0.4629, 0.7269, 0.0724, 0.1014],
[0.2721, 0.4629, 0.7269, 0.0724, 0.1014]]) [0.2721, 0.4629, 0.7269, 0.0724, 0.1014]])
Broadcast feature to all nodes in the single graph. Broadcast feature to all nodes in the single graph (the feature tensor shape
to broadcast should be :math:`(1, *)`).
>>> dgl.broadcast_nodes(g1, feat[0]) >>> feat0 = th.unsqueeze(feat[0], 0)
>>> dgl.broadcast_nodes(g1, feat0)
tensor([[0.4325, 0.7710, 0.5541, 0.0544, 0.9368], tensor([[0.4325, 0.7710, 0.5541, 0.0544, 0.9368],
[0.4325, 0.7710, 0.5541, 0.0544, 0.9368]]) [0.4325, 0.7710, 0.5541, 0.0544, 0.9368]])
...@@ -413,7 +416,9 @@ def broadcast_nodes(graph, graph_feat, *, ntype=None): ...@@ -413,7 +416,9 @@ def broadcast_nodes(graph, graph_feat, *, ntype=None):
-------- --------
broadcast_edges broadcast_edges
""" """
if len(F.shape(graph_feat)) == 1: if (F.shape(graph_feat)[0] != graph.batch_size and graph.batch_size == 1):
dgl_warning('For a single graph, use a tensor of shape (1, *) for graph_feat.'
' The support of shape (*) will be deprecated.')
graph_feat = F.unsqueeze(graph_feat, dim=0) graph_feat = F.unsqueeze(graph_feat, dim=0)
return F.repeat(graph_feat, graph.batch_num_nodes(ntype), dim=0) return F.repeat(graph_feat, graph.batch_num_nodes(ntype), dim=0)
...@@ -434,8 +439,8 @@ def broadcast_edges(graph, graph_feat, *, etype=None): ...@@ -434,8 +439,8 @@ def broadcast_edges(graph, graph_feat, *, etype=None):
graph : DGLGraph graph : DGLGraph
The graph. The graph.
graph_feat : tensor graph_feat : tensor
The feature to broadcast. Tensor shape is :math:`(*)` for single graph, and The feature to broadcast. Tensor shape is :math:`(B, *)` for batched graph,
:math:`(B, *)` for batched graph. where :math:`B` is the batch size.
etype : str, tuple of str, optional etype : str, tuple of str, optional
Edge type. Can be omitted if there is only one edge type in the graph. Edge type. Can be omitted if there is only one edge type in the graph.
...@@ -470,9 +475,11 @@ def broadcast_edges(graph, graph_feat, *, etype=None): ...@@ -470,9 +475,11 @@ def broadcast_edges(graph, graph_feat, *, etype=None):
[0.2721, 0.4629, 0.7269, 0.0724, 0.1014], [0.2721, 0.4629, 0.7269, 0.0724, 0.1014],
[0.2721, 0.4629, 0.7269, 0.0724, 0.1014]]) [0.2721, 0.4629, 0.7269, 0.0724, 0.1014]])
Broadcast feature to all edges in the single graph. Broadcast feature to all edges in the single graph (the feature tensor shape
to broadcast should be :math:`(1, *)`).
>>> dgl.broadcast_edges(g2, feat[1]) >>> feat1 = th.unsqueeze(feat[1], 0)
>>> dgl.broadcast_edges(g2, feat1)
tensor([[0.2721, 0.4629, 0.7269, 0.0724, 0.1014], tensor([[0.2721, 0.4629, 0.7269, 0.0724, 0.1014],
[0.2721, 0.4629, 0.7269, 0.0724, 0.1014]]) [0.2721, 0.4629, 0.7269, 0.0724, 0.1014]])
...@@ -480,7 +487,9 @@ def broadcast_edges(graph, graph_feat, *, etype=None): ...@@ -480,7 +487,9 @@ def broadcast_edges(graph, graph_feat, *, etype=None):
-------- --------
broadcast_nodes broadcast_nodes
""" """
if len(F.shape(graph_feat)) == 1: if (F.shape(graph_feat)[0] != graph.batch_size and graph.batch_size == 1):
dgl_warning('For a single graph, use a tensor of shape (1, *) for graph_feat.'
' The support of shape (*) will be deprecated.')
graph_feat = F.unsqueeze(graph_feat, dim=0) graph_feat = F.unsqueeze(graph_feat, dim=0)
return F.repeat(graph_feat, graph.batch_num_edges(etype), dim=0) return F.repeat(graph_feat, graph.batch_num_edges(etype), dim=0)
......
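Per the revised docstrings, a single (non-batched) graph now expects the graph feature in shape :math:`(1, *)`; a 1-D tensor still works but triggers the new deprecation warning. A short sketch:

import torch
import dgl

g = dgl.graph((torch.tensor([0, 1]), torch.tensor([1, 2])))   # 3 nodes, 2 edges
graph_feat = torch.randn(1, 5)                    # shape (1, *) for a single graph
node_feat = dgl.broadcast_nodes(g, graph_feat)    # shape (3, 5)
edge_feat = dgl.broadcast_edges(g, graph_feat)    # shape (2, 5)
# dgl.broadcast_nodes(g, torch.randn(5)) still works but now emits a dgl_warning.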
...@@ -54,8 +54,6 @@ def sample_etype_neighbors(g, nodes, etype_field, fanout, edge_dir='in', prob=No ...@@ -54,8 +54,6 @@ def sample_etype_neighbors(g, nodes, etype_field, fanout, edge_dir='in', prob=No
The features must be non-negative floats, and the sum of the features of The features must be non-negative floats, and the sum of the features of
inbound/outbound edges for every node must be positive (though they don't have inbound/outbound edges for every node must be positive (though they don't have
to sum up to one). Otherwise, the result will be undefined. to sum up to one). Otherwise, the result will be undefined.
If :attr:`prob` is not None, GPU sampling is not supported.
replace : bool, optional replace : bool, optional
If True, sample with replacement. If True, sample with replacement.
copy_ndata: bool, optional copy_ndata: bool, optional
...@@ -163,6 +161,9 @@ def sample_neighbors(g, nodes, fanout, edge_dir='in', prob=None, replace=False, ...@@ -163,6 +161,9 @@ def sample_neighbors(g, nodes, fanout, edge_dir='in', prob=None, replace=False,
Node/edge features are not preserved. The original IDs of Node/edge features are not preserved. The original IDs of
the sampled edges are stored as the `dgl.EID` feature in the returned graph. the sampled edges are stored as the `dgl.EID` feature in the returned graph.
GPU sampling is supported for this function. Refer to :ref:`guide-minibatch-gpu-sampling`
for more details.
Parameters Parameters
---------- ----------
g : DGLGraph g : DGLGraph
...@@ -193,8 +194,6 @@ def sample_neighbors(g, nodes, fanout, edge_dir='in', prob=None, replace=False, ...@@ -193,8 +194,6 @@ def sample_neighbors(g, nodes, fanout, edge_dir='in', prob=None, replace=False,
The features must be non-negative floats, and the sum of the features of The features must be non-negative floats, and the sum of the features of
inbound/outbound edges for every node must be positive (though they don't have inbound/outbound edges for every node must be positive (though they don't have
to sum up to one). Otherwise, the result will be undefined. to sum up to one). Otherwise, the result will be undefined.
If :attr:`prob` is not None, GPU sampling is not supported.
exclude_edges: tensor or dict exclude_edges: tensor or dict
Edge IDs to exclude during sampling neighbors for the seed nodes. Edge IDs to exclude during sampling neighbors for the seed nodes.
......
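With the removal of the CPU-only caveat for :attr:`prob`, probability-weighted neighbor sampling can run on a CUDA graph. A hedged sketch, assuming a CUDA build of DGL:

import torch
import dgl

g = dgl.graph((torch.tensor([0, 0, 1, 2]), torch.tensor([1, 2, 2, 0])))
g.edata['p'] = torch.rand(g.num_edges())          # non-negative edge weights
g = g.to('cuda')                                  # GPU sampling
seeds = torch.tensor([2, 0], device='cuda')
sg = dgl.sampling.sample_neighbors(g, seeds, fanout=2, prob='p')
print(sg.edata[dgl.EID])                          # original IDs of the sampled edges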
...@@ -38,6 +38,9 @@ class RandomWalkNeighborSampler(object): ...@@ -38,6 +38,9 @@ class RandomWalkNeighborSampler(object):
This is a generalization of PinSAGE sampler which only works on bidirectional bipartite This is a generalization of PinSAGE sampler which only works on bidirectional bipartite
graphs. graphs.
UVA and GPU sampling are supported for this sampler.
Refer to :ref:`guide-minibatch-gpu-sampling` for more details.
Parameters Parameters
---------- ----------
G : DGLGraph G : DGLGraph
...@@ -104,13 +107,14 @@ class RandomWalkNeighborSampler(object): ...@@ -104,13 +107,14 @@ class RandomWalkNeighborSampler(object):
A tensor of given node IDs of node type ``ntype`` to generate neighbors from. The A tensor of given node IDs of node type ``ntype`` to generate neighbors from. The
node type ``ntype`` is the beginning and ending node type of the given metapath. node type ``ntype`` is the beginning and ending node type of the given metapath.
It must be on CPU and have the same dtype as the ID type of the graph. It must be on the same device as the graph and have the same dtype
as the ID type of the graph.
Returns Returns
------- -------
g : DGLGraph g : DGLGraph
A homogeneous graph constructed by selecting neighbors for each given node according A homogeneous graph constructed by selecting neighbors for each given node according
to the algorithm above. The returned graph is on CPU. to the algorithm above.
""" """
seed_nodes = utils.prepare_tensor(self.G, seed_nodes, 'seed_nodes') seed_nodes = utils.prepare_tensor(self.G, seed_nodes, 'seed_nodes')
self.restart_prob = F.copy_to(self.restart_prob, F.context(seed_nodes)) self.restart_prob = F.copy_to(self.restart_prob, F.context(seed_nodes))
...@@ -147,6 +151,9 @@ class PinSAGESampler(RandomWalkNeighborSampler): ...@@ -147,6 +151,9 @@ class PinSAGESampler(RandomWalkNeighborSampler):
The edges of the returned homogeneous graph will connect to the given nodes from their most The edges of the returned homogeneous graph will connect to the given nodes from their most
commonly visited nodes, with a feature indicating the number of visits. commonly visited nodes, with a feature indicating the number of visits.
UVA and GPU sampling are supported for this sampler.
Refer to :ref:`guide-minibatch-gpu-sampling` for more details.
Parameters Parameters
---------- ----------
G : DGLGraph G : DGLGraph
......
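A hedged sketch of the PinSAGE-style sampler with both the graph and the seed nodes on GPU, as the updated docstring now requires them to share a device (the bipartite graph below is illustrative; assumes a CUDA build):

import torch
import dgl

hg = dgl.heterograph({
    ('item', 'bought-by', 'user'): (torch.tensor([0, 1, 2]), torch.tensor([0, 0, 1])),
    ('user', 'bought', 'item'): (torch.tensor([0, 0, 1]), torch.tensor([0, 1, 2]))}).to('cuda')
sampler = dgl.sampling.PinSAGESampler(
    hg, 'item', 'user', num_traversals=3, termination_prob=0.5,
    num_random_walks=10, num_neighbors=2)
seeds = torch.tensor([0, 2], device='cuda')       # same device as the graph
frontier = sampler(seeds)                         # homogeneous 'item' graph of frequent co-visits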
...@@ -30,6 +30,8 @@ def random_walk(g, nodes, *, metapath=None, length=None, prob=None, restart_prob ...@@ -30,6 +30,8 @@ def random_walk(g, nodes, *, metapath=None, length=None, prob=None, restart_prob
If a random walk stops in advance, DGL pads the trace with -1 to have the same If a random walk stops in advance, DGL pads the trace with -1 to have the same
length. length.
This function supports graphs on GPU and UVA sampling.
Parameters Parameters
---------- ----------
g : DGLGraph g : DGLGraph
...@@ -37,8 +39,9 @@ def random_walk(g, nodes, *, metapath=None, length=None, prob=None, restart_prob ...@@ -37,8 +39,9 @@ def random_walk(g, nodes, *, metapath=None, length=None, prob=None, restart_prob
nodes : Tensor nodes : Tensor
Node ID tensor from which the random walk traces starts. Node ID tensor from which the random walk traces starts.
The tensor must have the same dtype as the ID type The tensor must have the same dtype as the ID type of the graph.
of the graph. The tensor must be on the same device as the graph or
on the GPU when the graph is pinned (UVA sampling).
metapath : list[str or tuple of str], optional metapath : list[str or tuple of str], optional
Metapath, specified as a list of edge types. Metapath, specified as a list of edge types.
...@@ -60,12 +63,15 @@ def random_walk(g, nodes, *, metapath=None, length=None, prob=None, restart_prob ...@@ -60,12 +63,15 @@ def random_walk(g, nodes, *, metapath=None, length=None, prob=None, restart_prob
must be positive for the outbound edges of all nodes (although they don't have must be positive for the outbound edges of all nodes (although they don't have
to sum up to one). The result will be undefined otherwise. to sum up to one). The result will be undefined otherwise.
The feature tensor must be on the same device as the graph.
If omitted, DGL assumes that the neighbors are picked uniformly. If omitted, DGL assumes that the neighbors are picked uniformly.
restart_prob : float or Tensor, optional restart_prob : float or Tensor, optional
Probability to terminate the current trace before each transition. Probability to terminate the current trace before each transition.
If a tensor is given, :attr:`restart_prob` should have the same length as If a tensor is given, :attr:`restart_prob` should be on the same device as the graph
:attr:`metapath` or :attr:`length`. or on the GPU when the graph is pinned (UVA sampling),
and have the same length as :attr:`metapath` or :attr:`length`.
return_eids : bool, optional return_eids : bool, optional
If True, additionally return the edge IDs traversed. If True, additionally return the edge IDs traversed.
...@@ -176,19 +182,16 @@ def random_walk(g, nodes, *, metapath=None, length=None, prob=None, restart_prob ...@@ -176,19 +182,16 @@ def random_walk(g, nodes, *, metapath=None, length=None, prob=None, restart_prob
metapath = F.to_dgl_nd(F.astype(F.tensor(metapath), g.idtype)) metapath = F.to_dgl_nd(F.astype(F.tensor(metapath), g.idtype))
# Load the probability tensor from the edge frames # Load the probability tensor from the edge frames
ctx = utils.to_dgl_context(g.device)
if prob is None: if prob is None:
p_nd = [nd.array([], ctx=nodes.ctx) for _ in g.canonical_etypes] p_nd = [nd.array([], ctx=ctx) for _ in g.canonical_etypes]
else: else:
p_nd = [] p_nd = []
for etype in g.canonical_etypes: for etype in g.canonical_etypes:
if prob in g.edges[etype].data: if prob in g.edges[etype].data:
prob_nd = F.to_dgl_nd(g.edges[etype].data[prob]) prob_nd = F.to_dgl_nd(g.edges[etype].data[prob])
if prob_nd.ctx != nodes.ctx:
raise ValueError(
'context of seed node array and edges[%s].data[%s] are different' %
(etype, prob))
else: else:
prob_nd = nd.array([], ctx=nodes.ctx) prob_nd = nd.array([], ctx=ctx)
p_nd.append(prob_nd) p_nd.append(prob_nd)
# Actual random walk # Actual random walk
...@@ -198,9 +201,11 @@ def random_walk(g, nodes, *, metapath=None, length=None, prob=None, restart_prob ...@@ -198,9 +201,11 @@ def random_walk(g, nodes, *, metapath=None, length=None, prob=None, restart_prob
restart_prob = F.to_dgl_nd(restart_prob) restart_prob = F.to_dgl_nd(restart_prob)
traces, eids, types = _CAPI_DGLSamplingRandomWalkWithStepwiseRestart( traces, eids, types = _CAPI_DGLSamplingRandomWalkWithStepwiseRestart(
gidx, nodes, metapath, p_nd, restart_prob) gidx, nodes, metapath, p_nd, restart_prob)
else: elif isinstance(restart_prob, float):
traces, eids, types = _CAPI_DGLSamplingRandomWalkWithRestart( traces, eids, types = _CAPI_DGLSamplingRandomWalkWithRestart(
gidx, nodes, metapath, p_nd, restart_prob) gidx, nodes, metapath, p_nd, restart_prob)
else:
raise TypeError("restart_prob should be float or Tensor.")
traces = F.from_dgl_nd(traces) traces = F.from_dgl_nd(traces)
types = F.from_dgl_nd(types) types = F.from_dgl_nd(types)
......
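A hedged sketch of the UVA path described above: the graph is pinned in host memory while the seed nodes live on GPU (assumes a CUDA build and that ``pin_memory_`` is available):

import torch
import dgl

g = dgl.graph((torch.tensor([0, 1, 1, 2, 3]), torch.tensor([1, 2, 3, 0, 0])))
g.create_formats_()                               # materialize sparse formats before pinning
g.pin_memory_()                                   # UVA: graph stays in pinned host memory
nodes = torch.tensor([0, 1, 2], device='cuda')    # seeds on GPU
traces, types = dgl.sampling.random_walk(g, nodes, length=4, restart_prob=0.2)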
...@@ -365,11 +365,11 @@ def _gspmm_hetero(gidx, op, reduce_op, u_len, u_and_e_tuple): ...@@ -365,11 +365,11 @@ def _gspmm_hetero(gidx, op, reduce_op, u_len, u_and_e_tuple):
for l, arg_u_nd in enumerate(list_arg_u_nd): for l, arg_u_nd in enumerate(list_arg_u_nd):
# TODO(Israt): l or src_id as index of lhs # TODO(Israt): l or src_id as index of lhs
list_arg_u[l] = None if list_arg_u[l] is None else F.zerocopy_from_dgl_ndarray(arg_u_nd) list_arg_u[l] = None if list_arg_u[l] is None else F.zerocopy_from_dgl_ndarray(arg_u_nd)
if expand_u and use_cmp: if list_arg_u[l] is not None and expand_u and use_cmp:
list_arg_u[l] = F.squeeze(list_arg_u[l], -1) list_arg_u[l] = F.squeeze(list_arg_u[l], -1)
for l, arg_e_nd in enumerate(list_arg_e_nd): for l, arg_e_nd in enumerate(list_arg_e_nd):
list_arg_e[l] = None if list_arg_e[l] is None else F.zerocopy_from_dgl_ndarray(arg_e_nd) list_arg_e[l] = None if list_arg_e[l] is None else F.zerocopy_from_dgl_ndarray(arg_e_nd)
if expand_e and use_cmp: if list_arg_e[l] is not None and expand_e and use_cmp:
list_arg_e[l] = F.squeeze(list_arg_e[l], -1) list_arg_e[l] = F.squeeze(list_arg_e[l], -1)
for l, arg_u_ntype_nd in enumerate(list_arg_u_ntype_nd): for l, arg_u_ntype_nd in enumerate(list_arg_u_ntype_nd):
list_arg_u_ntype[l] = None if arg_u_ntype_nd is None \ list_arg_u_ntype[l] = None if arg_u_ntype_nd is None \
...@@ -562,7 +562,7 @@ def _gsddmm_hetero(gidx, op, lhs_len, lhs_target='u', rhs_target='v', lhs_and_rh ...@@ -562,7 +562,7 @@ def _gsddmm_hetero(gidx, op, lhs_len, lhs_target='u', rhs_target='v', lhs_and_rh
e = out_list[l] e = out_list[l]
e = F.tensor([]) if e is None else e e = F.tensor([]) if e is None else e
if (expand_lhs or not use_lhs) and (expand_rhs or not use_rhs): if (expand_lhs or not use_lhs) and (expand_rhs or not use_rhs):
e = F.squeeze(v, -1) e = F.squeeze(e, -1)
out_list[l] = e out_list[l] = e
out = tuple(out_list) out = tuple(out_list)
return out return out
......
...@@ -40,6 +40,7 @@ from ..partition import metis_partition_assignment ...@@ -40,6 +40,7 @@ from ..partition import metis_partition_assignment
from ..partition import partition_graph_with_halo from ..partition import partition_graph_with_halo
from ..partition import metis_partition from ..partition import metis_partition
from .. import subgraph from .. import subgraph
from .. import function
# TO BE DEPRECATED # TO BE DEPRECATED
from .._deprecate.graph import DGLGraph as DGLGraphStale from .._deprecate.graph import DGLGraph as DGLGraphStale
...@@ -1764,13 +1765,24 @@ def remove_nodes(g, nids, ntype=None, store_ids=False): ...@@ -1764,13 +1765,24 @@ def remove_nodes(g, nids, ntype=None, store_ids=False):
g.remove_nodes(nids, ntype=ntype, store_ids=store_ids) g.remove_nodes(nids, ntype=ntype, store_ids=store_ids)
return g return g
def add_self_loop(g, etype=None): def add_self_loop(g, edge_feat_names=None, fill_data=1., etype=None):
r"""Add self-loops for each node in the graph and return a new graph. r"""Add self-loops for each node in the graph and return a new graph.
Parameters Parameters
---------- ----------
g : DGLGraph g : DGLGraph
The graph. The graph.
edge_feat_names : list[str], optional
The names of the self-loop features to which `fill_data` applies. If None, `fill_data` is
applied to all self-loop features. Default: None.
fill_data : int, float or str, optional
The value to fill the self-loop features. Default: 1.
* If ``fill_data`` is ``int`` or ``float``, self-loop features will be directly given by
``fill_data``.
* If ``fill_data`` is ``str``, self-loop features will be generated by aggregating the
features of the incoming edges of the corresponding nodes. The supported aggregations are:
``'mean'``, ``'sum'``, ``'max'``, ``'min'``.
etype : str or (str, str, str), optional etype : str or (str, str, str), optional
The type names of the edges. The allowed type name formats are: The type names of the edges. The allowed type name formats are:
...@@ -1792,7 +1804,6 @@ def add_self_loop(g, etype=None): ...@@ -1792,7 +1804,6 @@ def add_self_loop(g, etype=None):
* The function adds self-loops regardless of whether they already exist or not. * The function adds self-loops regardless of whether they already exist or not.
If one wishes to have exactly one self-loop for every node, If one wishes to have exactly one self-loop for every node,
call :func:`remove_self_loop` before invoking :func:`add_self_loop`. call :func:`remove_self_loop` before invoking :func:`add_self_loop`.
* Features of the new edges (self-loop edges) will be filled with zeros.
* This function discards the batch information. Please use * This function discards the batch information. Please use
:func:`dgl.DGLGraph.set_batch_num_nodes` :func:`dgl.DGLGraph.set_batch_num_nodes`
and :func:`dgl.DGLGraph.set_batch_num_edges` on the transformed graph and :func:`dgl.DGLGraph.set_batch_num_edges` on the transformed graph
...@@ -1808,7 +1819,7 @@ def add_self_loop(g, etype=None): ...@@ -1808,7 +1819,7 @@ def add_self_loop(g, etype=None):
>>> g = dgl.graph((torch.tensor([0, 0, 2]), torch.tensor([2, 1, 0]))) >>> g = dgl.graph((torch.tensor([0, 0, 2]), torch.tensor([2, 1, 0])))
>>> g.ndata['hv'] = torch.arange(3).float().reshape(-1, 1) >>> g.ndata['hv'] = torch.arange(3).float().reshape(-1, 1)
>>> g.edata['he'] = torch.arange(3).float().reshape(-1, 1) >>> g.edata['he'] = torch.arange(3).float().reshape(-1, 1)
>>> g = dgl.add_self_loop(g) >>> g = dgl.add_self_loop(g, fill_data='sum')
>>> g >>> g
Graph(num_nodes=3, num_edges=6, Graph(num_nodes=3, num_edges=6,
ndata_schemes={'hv': Scheme(shape=(1,), dtype=torch.float32)} ndata_schemes={'hv': Scheme(shape=(1,), dtype=torch.float32)}
...@@ -1817,8 +1828,8 @@ def add_self_loop(g, etype=None): ...@@ -1817,8 +1828,8 @@ def add_self_loop(g, etype=None):
tensor([[0.], tensor([[0.],
[1.], [1.],
[2.], [2.],
[0.], [2.],
[0.], [1.],
[0.]]) [0.]])
**Heterogeneous Graphs** **Heterogeneous Graphs**
...@@ -1831,17 +1842,49 @@ def add_self_loop(g, etype=None): ...@@ -1831,17 +1842,49 @@ def add_self_loop(g, etype=None):
>>> g = dgl.add_self_loop(g, etype='follows') >>> g = dgl.add_self_loop(g, etype='follows')
>>> g >>> g
Graph(num_nodes={'user': 3, 'game': 2}, Graph(num_nodes={'user': 3, 'game': 2},
num_edges={('user', 'plays', 'game'): 2, ('user', 'follows', 'user'): 5}, num_edges={('user', 'plays', 'game'): 2, ('user', 'follows', 'user'): 5},
metagraph=[('user', 'user'), ('user', 'game')]) metagraph=[('user', 'user'), ('user', 'game')])
""" """
etype = g.to_canonical_etype(etype) etype = g.to_canonical_etype(etype)
data = {}
reduce_funcs = {'sum': function.sum,
'mean': function.mean,
'max': function.max,
'min': function.min}
if edge_feat_names is None:
edge_feat_names = g.edges[etype].data.keys()
if etype[0] != etype[2]: if etype[0] != etype[2]:
raise DGLError( raise DGLError(
'add_self_loop does not support unidirectional bipartite graphs: {}.' \ 'add_self_loop does not support unidirectional bipartite graphs: {}.' \
'Please make sure the types of head node and tail node are identical.' \ 'Please make sure the types of head node and tail node are identical.' \
''.format(etype)) ''.format(etype))
for feat_name in edge_feat_names:
if isinstance(fill_data, (int, float)):
dtype = g.edges[etype].data[feat_name].dtype
dshape = g.edges[etype].data[feat_name].shape
tmp_fill_data = F.copy_to(F.astype(F.tensor([fill_data]), dtype), g.device)
if len(dshape) > 1:
data[feat_name] = F.zeros((g.num_nodes(etype[0]), *dshape[1:]), dtype,
g.device) + tmp_fill_data
else:
data[feat_name] = F.zeros((g.num_nodes(etype[0]),), dtype, g.device) + tmp_fill_data
elif isinstance(fill_data, str):
if fill_data not in reduce_funcs.keys():
raise DGLError('Unsupported aggregation: {}'.format(fill_data))
reducer = reduce_funcs[fill_data]
with g.local_scope():
g.update_all(function.copy_e(feat_name, "h"), reducer('h', 'h'), etype=etype)
data[feat_name] = g.nodes[etype[0]].data['h']
nodes = g.nodes(etype[0]) nodes = g.nodes(etype[0])
new_g = add_edges(g, nodes, nodes, etype=etype) if len(data):
new_g = add_edges(g, nodes, nodes, data=data, etype=etype)
else:
new_g = add_edges(g, nodes, nodes, etype=etype)
return new_g return new_g
DGLHeteroGraph.add_self_loop = utils.alias_func(add_self_loop) DGLHeteroGraph.add_self_loop = utils.alias_func(add_self_loop)
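Besides the ``fill_data='sum'`` example in the docstring, a numeric fill can be combined with ``edge_feat_names`` to restrict which features are filled; a short sketch (feature names are illustrative):

import torch
import dgl

g = dgl.graph((torch.tensor([0, 0, 2]), torch.tensor([2, 1, 0])))
g.edata['w'] = torch.ones(3, 1)
g.edata['e'] = torch.arange(3).float().reshape(-1, 1)
# Only 'w' is filled with 0.5 on the new self-loop edges; 'e' defaults to zeros there.
new_g = dgl.add_self_loop(g, edge_feat_names=['w'], fill_data=0.5)
print(new_g.edata['w'][-3:])   # the three self-loop rows, each equal to 0.5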
...@@ -2872,6 +2915,8 @@ def sort_csr_by_tag(g, tag, tag_offset_name='_TAG_OFFSET', tag_type='node'): ...@@ -2872,6 +2915,8 @@ def sort_csr_by_tag(g, tag, tag_offset_name='_TAG_OFFSET', tag_type='node'):
``tag_type`` is ``node``. ``tag_type`` is ``node``.
>>> import dgl >>> import dgl
>>> import torch
>>> g = dgl.graph(([0,0,0,0,0,1,1,1],[0,1,2,3,4,0,1,2])) >>> g = dgl.graph(([0,0,0,0,0,1,1,1],[0,1,2,3,4,0,1,2]))
>>> g.adjacency_matrix(scipy_fmt='csr').nonzero() >>> g.adjacency_matrix(scipy_fmt='csr').nonzero()
(array([0, 0, 0, 0, 0, 1, 1, 1], dtype=int32), (array([0, 0, 0, 0, 0, 1, 1, 1], dtype=int32),
...@@ -2890,11 +2935,10 @@ def sort_csr_by_tag(g, tag, tag_offset_name='_TAG_OFFSET', tag_type='node'): ...@@ -2890,11 +2935,10 @@ def sort_csr_by_tag(g, tag, tag_offset_name='_TAG_OFFSET', tag_type='node'):
``tag_type`` is ``edge``. ``tag_type`` is ``edge``.
>>> from dgl import backend as F
>>> g = dgl.graph(([0,0,0,0,0,1,1,1],[0,1,2,3,4,0,1,2])) >>> g = dgl.graph(([0,0,0,0,0,1,1,1],[0,1,2,3,4,0,1,2]))
>>> g.edges() >>> g.edges()
(tensor([0, 0, 0, 0, 0, 1, 1, 1]), tensor([0, 1, 2, 3, 4, 0, 1, 2])) (tensor([0, 0, 0, 0, 0, 1, 1, 1]), tensor([0, 1, 2, 3, 4, 0, 1, 2]))
>>> tag = F.tensor([1, 1, 0, 2, 0, 1, 1, 0]) >>> tag = torch.tensor([1, 1, 0, 2, 0, 1, 1, 0])
>>> g_sorted = dgl.sort_csr_by_tag(g, tag, tag_type='edge') >>> g_sorted = dgl.sort_csr_by_tag(g, tag, tag_type='edge')
>>> g_sorted.adj(scipy_fmt='csr').nonzero() >>> g_sorted.adj(scipy_fmt='csr').nonzero()
(array([0, 0, 0, 0, 0, 1, 1, 1], dtype=int32), array([2, 4, 0, 1, 3, 2, 0, 1], dtype=int32)) (array([0, 0, 0, 0, 0, 1, 1, 1], dtype=int32), array([2, 4, 0, 1, 3, 2, 0, 1], dtype=int32))
...@@ -2995,6 +3039,7 @@ def sort_csc_by_tag(g, tag, tag_offset_name='_TAG_OFFSET', tag_type='node'): ...@@ -2995,6 +3039,7 @@ def sort_csc_by_tag(g, tag, tag_offset_name='_TAG_OFFSET', tag_type='node'):
``tag_type`` is ``node``. ``tag_type`` is ``node``.
>>> import dgl >>> import dgl
>>> import torch
>>> g = dgl.graph(([0,1,2,3,4,0,1,2],[0,0,0,0,0,1,1,1])) >>> g = dgl.graph(([0,1,2,3,4,0,1,2],[0,0,0,0,0,1,1,1]))
>>> g.adjacency_matrix(scipy_fmt='csr', transpose=True).nonzero() >>> g.adjacency_matrix(scipy_fmt='csr', transpose=True).nonzero()
(array([0, 0, 0, 0, 0, 1, 1, 1], dtype=int32), (array([0, 0, 0, 0, 0, 1, 1, 1], dtype=int32),
...@@ -3013,9 +3058,8 @@ def sort_csc_by_tag(g, tag, tag_offset_name='_TAG_OFFSET', tag_type='node'): ...@@ -3013,9 +3058,8 @@ def sort_csc_by_tag(g, tag, tag_offset_name='_TAG_OFFSET', tag_type='node'):
``tag_type`` is ``edge``. ``tag_type`` is ``edge``.
>>> from dgl import backend as F
>>> g = dgl.graph(([0,1,2,3,4,0,1,2],[0,0,0,0,0,1,1,1])) >>> g = dgl.graph(([0,1,2,3,4,0,1,2],[0,0,0,0,0,1,1,1]))
>>> tag = F.tensor([1, 1, 0, 2, 0, 1, 1, 0]) >>> tag = torch.tensor([1, 1, 0, 2, 0, 1, 1, 0])
>>> g_sorted = dgl.sort_csc_by_tag(g, tag, tag_type='edge') >>> g_sorted = dgl.sort_csc_by_tag(g, tag, tag_type='edge')
>>> g_sorted.adj(scipy_fmt='csr', transpose=True).nonzero() >>> g_sorted.adj(scipy_fmt='csr', transpose=True).nonzero()
(array([0, 0, 0, 0, 0, 1, 1, 1], dtype=int32), array([2, 4, 0, 1, 3, 2, 0, 1], dtype=int32)) (array([0, 0, 0, 0, 0, 1, 1, 1], dtype=int32), array([2, 4, 0, 1, 3, 2, 0, 1], dtype=int32))
......
...@@ -415,6 +415,17 @@ class AddSelfLoop(BaseTransform): ...@@ -415,6 +415,17 @@ class AddSelfLoop(BaseTransform):
If False, it will first remove self-loops to prevent duplicate self-loops. If False, it will first remove self-loops to prevent duplicate self-loops.
new_etypes : bool, optional new_etypes : bool, optional
If True, it will add an edge type 'self' per node type, which holds self-loops. If True, it will add an edge type 'self' per node type, which holds self-loops.
edge_feat_names : list[str], optional
The names of the self-loop features to which `fill_data` applies. If None, `fill_data` is
applied to all self-loop features. Default: None.
fill_data : int, float or str, optional
The value to fill the self-loop features. Default: 1.
* If ``fill_data`` is ``int`` or ``float``, self-loop features will be directly given by
``fill_data``.
* If ``fill_data`` is ``str``, self-loop features will be generated by aggregating the
features of the incoming edges of the corresponding nodes. The supported aggregations are:
``'mean'``, ``'sum'``, ``'max'``, ``'min'``.
Example Example
------- -------
...@@ -424,23 +435,39 @@ class AddSelfLoop(BaseTransform): ...@@ -424,23 +435,39 @@ class AddSelfLoop(BaseTransform):
Case1: Add self-loops for a homogeneous graph Case1: Add self-loops for a homogeneous graph
>>> transform = AddSelfLoop() >>> transform = AddSelfLoop(fill_data='sum')
>>> g = dgl.graph(([1, 1], [1, 2])) >>> g = dgl.graph(([0, 0, 2], [2, 1, 0]))
>>> g.edata['he'] = torch.arange(3).float().reshape(-1, 1)
>>> new_g = transform(g) >>> new_g = transform(g)
>>> print(new_g.edges()) >>> print(new_g.edges())
(tensor([1, 0, 1, 2]), tensor([2, 0, 1, 2])) (tensor([0, 0, 2, 0, 1, 2]), tensor([2, 1, 0, 0, 1, 2]))
>>> print(new_g.edata['he'])
tensor([[0.],
[1.],
[2.],
[2.],
[1.],
[0.]])
Case2: Add self-loops for a heterogeneous graph Case2: Add self-loops for a heterogeneous graph
>>> transform = AddSelfLoop(fill_data='sum')
>>> g = dgl.heterograph({ >>> g = dgl.heterograph({
... ('user', 'plays', 'game'): ([0], [1]), ... ('user', 'follows', 'user'): (torch.tensor([1, 2]),
... ('user', 'follows', 'user'): ([1], [2]) ... torch.tensor([0, 1])),
... }) ... ('user', 'plays', 'game'): (torch.tensor([0, 1]),
... torch.tensor([0, 1]))})
>>> g.edata['feat'] = {('user', 'follows', 'user'): torch.randn(2, 5),
... ('user', 'plays', 'game'): torch.randn(2, 5)}
>>> g.edata['feat1'] = {('user', 'follows', 'user'): torch.randn(2, 15),
... ('user', 'plays', 'game'): torch.randn(2, 15)}
>>> new_g = transform(g) >>> new_g = transform(g)
>>> print(new_g.edges(etype='plays')) >>> print(new_g.edges(etype='plays'))
(tensor([0]), tensor([1])) (tensor([0, 1]), tensor([0, 1]))
>>> print(new_g.edges(etype='follows')) >>> print(new_g.edges(etype='follows'))
(tensor([1, 0, 1, 2]), tensor([2, 0, 1, 2])) (tensor([1, 2, 0, 1, 2]), tensor([0, 1, 0, 1, 2]))
>>> print(new_g.edata['feat'][('user', 'follows', 'user')].shape)
torch.Size([5, 5])
Case3: Add self-etypes for a heterogeneous graph Case3: Add self-etypes for a heterogeneous graph
...@@ -451,9 +478,12 @@ class AddSelfLoop(BaseTransform): ...@@ -451,9 +478,12 @@ class AddSelfLoop(BaseTransform):
>>> print(new_g.edges(etype=('game', 'self', 'game'))) >>> print(new_g.edges(etype=('game', 'self', 'game')))
(tensor([0, 1]), tensor([0, 1])) (tensor([0, 1]), tensor([0, 1]))
""" """
def __init__(self, allow_duplicate=False, new_etypes=False):
def __init__(self, allow_duplicate=False, new_etypes=False, edge_feat_names=None, fill_data=1.):
self.allow_duplicate = allow_duplicate self.allow_duplicate = allow_duplicate
self.new_etypes = new_etypes self.new_etypes = new_etypes
self.edge_feat_names = edge_feat_names
self.fill_data = fill_data
def transform_etype(self, c_etype, g): def transform_etype(self, c_etype, g):
r""" r"""
...@@ -480,7 +510,8 @@ class AddSelfLoop(BaseTransform): ...@@ -480,7 +510,8 @@ class AddSelfLoop(BaseTransform):
if not self.allow_duplicate: if not self.allow_duplicate:
g = functional.remove_self_loop(g, etype=c_etype) g = functional.remove_self_loop(g, etype=c_etype)
return functional.add_self_loop(g, etype=c_etype) return functional.add_self_loop(g, edge_feat_names=self.edge_feat_names,
fill_data=self.fill_data, etype=c_etype)
def __call__(self, g): def __call__(self, g):
for c_etype in g.canonical_etypes: for c_etype in g.canonical_etypes:
...@@ -501,6 +532,7 @@ class AddSelfLoop(BaseTransform): ...@@ -501,6 +532,7 @@ class AddSelfLoop(BaseTransform):
data_dict[c_etype] = g.edges(etype=c_etype) data_dict[c_etype] = g.edges(etype=c_etype)
g = update_graph_structure(g, data_dict) g = update_graph_structure(g, data_dict)
return g return g
class RemoveSelfLoop(BaseTransform): class RemoveSelfLoop(BaseTransform):
......
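A hedged usage sketch of the extended transform, forwarding ``edge_feat_names`` and ``fill_data`` to :func:`add_self_loop` (feature name is illustrative):

import torch
import dgl
from dgl.transforms import AddSelfLoop

g = dgl.graph((torch.tensor([0, 0, 2]), torch.tensor([2, 1, 0])))
g.edata['he'] = torch.arange(3).float().reshape(-1, 1)
# Aggregate each node's incoming 'he' values ('mean') onto its new self-loop edge.
transform = AddSelfLoop(edge_feat_names=['he'], fill_data='mean')
new_g = transform(g)
print(new_g.edata['he'])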
...@@ -549,11 +549,13 @@ COOMatrix CSRRowWiseSampling( ...@@ -549,11 +549,13 @@ COOMatrix CSRRowWiseSampling(
CSRMatrix mat, IdArray rows, int64_t num_samples, FloatArray prob, bool replace) { CSRMatrix mat, IdArray rows, int64_t num_samples, FloatArray prob, bool replace) {
COOMatrix ret; COOMatrix ret;
if (IsNullArray(prob)) { if (IsNullArray(prob)) {
ATEN_CSR_SWITCH_CUDA_UVA(mat, rows, XPU, IdType, "CSRRowWiseSampling", { ATEN_CSR_SWITCH_CUDA_UVA(mat, rows, XPU, IdType, "CSRRowWiseSamplingUniform", {
ret = impl::CSRRowWiseSamplingUniform<XPU, IdType>(mat, rows, num_samples, replace); ret = impl::CSRRowWiseSamplingUniform<XPU, IdType>(mat, rows, num_samples, replace);
}); });
} else { } else {
ATEN_CSR_SWITCH(mat, XPU, IdType, "CSRRowWiseSampling", { // prob is pinned and rows on GPU is valid
CHECK_VALID_CONTEXT(prob, rows);
ATEN_CSR_SWITCH_CUDA_UVA(mat, rows, XPU, IdType, "CSRRowWiseSampling", {
ATEN_FLOAT_TYPE_SWITCH(prob->dtype, FloatType, "probability", { ATEN_FLOAT_TYPE_SWITCH(prob->dtype, FloatType, "probability", {
ret = impl::CSRRowWiseSampling<XPU, IdType, FloatType>( ret = impl::CSRRowWiseSampling<XPU, IdType, FloatType>(
mat, rows, num_samples, prob, replace); mat, rows, num_samples, prob, replace);
......
...@@ -7,13 +7,11 @@ ...@@ -7,13 +7,11 @@
#ifndef DGL_ARRAY_CUDA_DGL_CUB_CUH_ #ifndef DGL_ARRAY_CUDA_DGL_CUB_CUH_
#define DGL_ARRAY_CUDA_DGL_CUB_CUH_ #define DGL_ARRAY_CUDA_DGL_CUB_CUH_
// include cub in a safe manner // This should be defined in CMakeLists.txt
#define CUB_NS_PREFIX namespace dgl { #ifndef THRUST_CUB_WRAPPED_NAMESPACE
#define CUB_NS_POSTFIX } static_assert(false, "THRUST_CUB_WRAPPED_NAMESPACE must be defined for DGL.");
#define CUB_NS_QUALIFIER ::dgl::cub #endif
#include "cub/cub.cuh" #include "cub/cub.cuh"
#undef CUB_NS_QUALIFIER
#undef CUB_NS_POSTFIX
#undef CUB_NS_PREFIX
#endif #endif
/*! /*!
* Copyright (c) 2021 by Contributors * Copyright (c) 2021 by Contributors
* \file array/cuda/rowwise_sampling.cu * \file array/cuda/rowwise_sampling.cu
* \brief rowwise sampling * \brief uniform rowwise sampling
*/ */
#include <dgl/random.h> #include <dgl/random.h>
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
#include "../../array/cuda/atomic.cuh" #include "../../array/cuda/atomic.cuh"
#include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_common.h"
using namespace dgl::aten::cuda; using namespace dgl::aten::cuda;
namespace dgl { namespace dgl {
...@@ -21,7 +22,7 @@ namespace impl { ...@@ -21,7 +22,7 @@ namespace impl {
namespace { namespace {
constexpr int CTA_SIZE = 128; constexpr int BLOCK_SIZE = 128;
/** /**
* @brief Compute the size of each row in the sampled CSR, without replacement. * @brief Compute the size of each row in the sampled CSR, without replacement.
...@@ -41,14 +42,14 @@ __global__ void _CSRRowWiseSampleDegreeKernel( ...@@ -41,14 +42,14 @@ __global__ void _CSRRowWiseSampleDegreeKernel(
const IdType * const in_rows, const IdType * const in_rows,
const IdType * const in_ptr, const IdType * const in_ptr,
IdType * const out_deg) { IdType * const out_deg) {
const int tIdx = threadIdx.x + blockIdx.x*blockDim.x; const int tIdx = threadIdx.x + blockIdx.x * blockDim.x;
if (tIdx < num_rows) { if (tIdx < num_rows) {
const int in_row = in_rows[tIdx]; const int in_row = in_rows[tIdx];
const int out_row = tIdx; const int out_row = tIdx;
out_deg[out_row] = min(static_cast<IdType>(num_picks), in_ptr[in_row+1]-in_ptr[in_row]); out_deg[out_row] = min(static_cast<IdType>(num_picks), in_ptr[in_row + 1] - in_ptr[in_row]);
if (out_row == num_rows-1) { if (out_row == num_rows - 1) {
// make the prefixsum work // make the prefixsum work
out_deg[num_rows] = 0; out_deg[num_rows] = 0;
} }
...@@ -73,19 +74,19 @@ __global__ void _CSRRowWiseSampleDegreeReplaceKernel( ...@@ -73,19 +74,19 @@ __global__ void _CSRRowWiseSampleDegreeReplaceKernel(
const IdType * const in_rows, const IdType * const in_rows,
const IdType * const in_ptr, const IdType * const in_ptr,
IdType * const out_deg) { IdType * const out_deg) {
const int tIdx = threadIdx.x + blockIdx.x*blockDim.x; const int tIdx = threadIdx.x + blockIdx.x * blockDim.x;
if (tIdx < num_rows) { if (tIdx < num_rows) {
const int64_t in_row = in_rows[tIdx]; const int64_t in_row = in_rows[tIdx];
const int64_t out_row = tIdx; const int64_t out_row = tIdx;
if (in_ptr[in_row+1]-in_ptr[in_row] == 0) { if (in_ptr[in_row + 1] - in_ptr[in_row] == 0) {
out_deg[out_row] = 0; out_deg[out_row] = 0;
} else { } else {
out_deg[out_row] = static_cast<IdType>(num_picks); out_deg[out_row] = static_cast<IdType>(num_picks);
} }
if (out_row == num_rows-1) { if (out_row == num_rows - 1) {
// make the prefixsum work // make the prefixsum work
out_deg[num_rows] = 0; out_deg[num_rows] = 0;
} }
...@@ -93,11 +94,10 @@ __global__ void _CSRRowWiseSampleDegreeReplaceKernel( ...@@ -93,11 +94,10 @@ __global__ void _CSRRowWiseSampleDegreeReplaceKernel(
} }
/** /**
* @brief Perform row-wise sampling on a CSR matrix, and generate a COO matrix, * @brief Perform row-wise uniform sampling on a CSR matrix,
* without replacement. * and generate a COO matrix, without replacement.
* *
* @tparam IdType The ID type used for matrices. * @tparam IdType The ID type used for matrices.
* @tparam BLOCK_CTAS The number of rows each thread block runs in parallel.
* @tparam TILE_SIZE The number of rows covered by each threadblock. * @tparam TILE_SIZE The number of rows covered by each threadblock.
* @param rand_seed The random seed to use. * @param rand_seed The random seed to use.
* @param num_picks The number of non-zeros to pick per row. * @param num_picks The number of non-zeros to pick per row.
...@@ -111,8 +111,8 @@ __global__ void _CSRRowWiseSampleDegreeReplaceKernel( ...@@ -111,8 +111,8 @@ __global__ void _CSRRowWiseSampleDegreeReplaceKernel(
* @param out_cols The columns of the output COO (output). * @param out_cols The columns of the output COO (output).
* @param out_idxs The data array of the output COO (output). * @param out_idxs The data array of the output COO (output).
*/ */
template<typename IdType, int BLOCK_CTAS, int TILE_SIZE> template<typename IdType, int TILE_SIZE>
__global__ void _CSRRowWiseSampleKernel( __global__ void _CSRRowWiseSampleUniformKernel(
const uint64_t rand_seed, const uint64_t rand_seed,
const int64_t num_picks, const int64_t num_picks,
const int64_t num_rows, const int64_t num_rows,
...@@ -125,68 +125,62 @@ __global__ void _CSRRowWiseSampleKernel( ...@@ -125,68 +125,62 @@ __global__ void _CSRRowWiseSampleKernel(
IdType * const out_cols, IdType * const out_cols,
IdType * const out_idxs) { IdType * const out_idxs) {
// we assign one warp per row // we assign one warp per row
assert(blockDim.x == CTA_SIZE); assert(blockDim.x == BLOCK_SIZE);
int64_t out_row = blockIdx.x*TILE_SIZE+threadIdx.y; int64_t out_row = blockIdx.x * TILE_SIZE;
const int64_t last_row = min(static_cast<int64_t>(blockIdx.x+1)*TILE_SIZE, num_rows); const int64_t last_row = min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
curandStatePhilox4_32_10_t rng; curandStatePhilox4_32_10_t rng;
curand_init((rand_seed*gridDim.x+blockIdx.x)*blockDim.y+threadIdx.y, threadIdx.x, 0, &rng); curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
while (out_row < last_row) { while (out_row < last_row) {
const int64_t row = in_rows[out_row]; const int64_t row = in_rows[out_row];
const int64_t in_row_start = in_ptr[row]; const int64_t in_row_start = in_ptr[row];
const int64_t deg = in_ptr[row+1] - in_row_start; const int64_t deg = in_ptr[row + 1] - in_row_start;
const int64_t out_row_start = out_ptr[out_row]; const int64_t out_row_start = out_ptr[out_row];
if (deg <= num_picks) { if (deg <= num_picks) {
// just copy row // just copy the row when there are not enough nodes to sample.
for (int idx = threadIdx.x; idx < deg; idx += CTA_SIZE) { for (int idx = threadIdx.x; idx < deg; idx += BLOCK_SIZE) {
const IdType in_idx = in_row_start+idx; const IdType in_idx = in_row_start + idx;
out_rows[out_row_start+idx] = row; out_rows[out_row_start + idx] = row;
out_cols[out_row_start+idx] = in_index[in_idx]; out_cols[out_row_start + idx] = in_index[in_idx];
out_idxs[out_row_start+idx] = data ? data[in_idx] : in_idx; out_idxs[out_row_start + idx] = data ? data[in_idx] : in_idx;
} }
} else { } else {
// generate permutation list via reservoir algorithm // generate permutation list via reservoir algorithm
for (int idx = threadIdx.x; idx < num_picks; idx+=CTA_SIZE) { for (int idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
out_idxs[out_row_start+idx] = idx; out_idxs[out_row_start + idx] = idx;
} }
__syncthreads(); __syncthreads();
for (int idx = num_picks+threadIdx.x; idx < deg; idx+=CTA_SIZE) { for (int idx = num_picks + threadIdx.x; idx < deg; idx += BLOCK_SIZE) {
const int num = curand(&rng)%(idx+1); const int num = curand(&rng) % (idx + 1);
if (num < num_picks) { if (num < num_picks) {
// use max so as to achieve the replacement order the serial // use max so as to achieve the replacement order the serial
// algorithm would have // algorithm would have
AtomicMax(out_idxs+out_row_start+num, idx); AtomicMax(out_idxs + out_row_start + num, idx);
} }
} }
__syncthreads(); __syncthreads();
// copy permutation over // copy permutation over
for (int idx = threadIdx.x; idx < num_picks; idx += CTA_SIZE) { for (int idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
const IdType perm_idx = out_idxs[out_row_start+idx]+in_row_start; const IdType perm_idx = out_idxs[out_row_start + idx] + in_row_start;
out_rows[out_row_start+idx] = row; out_rows[out_row_start + idx] = row;
out_cols[out_row_start+idx] = in_index[perm_idx]; out_cols[out_row_start + idx] = in_index[perm_idx];
if (data) { out_idxs[out_row_start + idx] = data ? data[perm_idx] : perm_idx;
out_idxs[out_row_start+idx] = data[perm_idx];
}
} }
} }
out_row += 1;
out_row += BLOCK_CTAS;
} }
} }
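The without-replacement branch above builds the pick permutation with a reservoir scheme, using ``AtomicMax`` so that concurrent threads converge to the order a serial pass would produce. A serial Python reference of the same idea (illustration only, not DGL code):

import random

def reservoir_pick(deg, num_picks, rng=random):
    """Return num_picks positions out of range(deg), mirroring the kernel's reservoir step."""
    out_idxs = list(range(num_picks))          # start with the first num_picks positions
    for idx in range(num_picks, deg):
        num = rng.randrange(idx + 1)           # curand(&rng) % (idx + 1) in the kernel
        if num < num_picks:
            # The kernel uses AtomicMax here; in a serial pass the max equals
            # the latest idx that landed in slot `num`.
            out_idxs[num] = max(out_idxs[num], idx)
    return out_idxs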
/** /**
* @brief Perform row-wise sampling on a CSR matrix, and generate a COO matrix, * @brief Perform row-wise uniform sampling on a CSR matrix,
* with replacement. * and generate a COO matrix, with replacement.
* *
* @tparam IdType The ID type used for matrices. * @tparam IdType The ID type used for matrices.
* @tparam BLOCK_CTAS The number of rows each thread block runs in parallel.
* @tparam TILE_SIZE The number of rows covered by each threadblock. * @tparam TILE_SIZE The number of rows covered by each threadblock.
* @param rand_seed The random seed to use. * @param rand_seed The random seed to use.
* @param num_picks The number of non-zeros to pick per row. * @param num_picks The number of non-zeros to pick per row.
...@@ -200,8 +194,8 @@ __global__ void _CSRRowWiseSampleKernel( ...@@ -200,8 +194,8 @@ __global__ void _CSRRowWiseSampleKernel(
* @param out_cols The columns of the output COO (output). * @param out_cols The columns of the output COO (output).
* @param out_idxs The data array of the output COO (output). * @param out_idxs The data array of the output COO (output).
*/ */
template<typename IdType, int BLOCK_CTAS, int TILE_SIZE> template<typename IdType, int TILE_SIZE>
__global__ void _CSRRowWiseSampleReplaceKernel( __global__ void _CSRRowWiseSampleUniformReplaceKernel(
const uint64_t rand_seed, const uint64_t rand_seed,
const int64_t num_picks, const int64_t num_picks,
const int64_t num_rows, const int64_t num_rows,
...@@ -214,39 +208,37 @@ __global__ void _CSRRowWiseSampleReplaceKernel( ...@@ -214,39 +208,37 @@ __global__ void _CSRRowWiseSampleReplaceKernel(
IdType * const out_cols, IdType * const out_cols,
IdType * const out_idxs) { IdType * const out_idxs) {
// we assign one warp per row // we assign one warp per row
assert(blockDim.x == CTA_SIZE); assert(blockDim.x == BLOCK_SIZE);
int64_t out_row = blockIdx.x*TILE_SIZE+threadIdx.y; int64_t out_row = blockIdx.x * TILE_SIZE;
const int64_t last_row = min(static_cast<int64_t>(blockIdx.x+1)*TILE_SIZE, num_rows); const int64_t last_row = min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
curandStatePhilox4_32_10_t rng; curandStatePhilox4_32_10_t rng;
curand_init((rand_seed*gridDim.x+blockIdx.x)*blockDim.y+threadIdx.y, threadIdx.x, 0, &rng); curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
while (out_row < last_row) { while (out_row < last_row) {
const int64_t row = in_rows[out_row]; const int64_t row = in_rows[out_row];
const int64_t in_row_start = in_ptr[row]; const int64_t in_row_start = in_ptr[row];
const int64_t out_row_start = out_ptr[out_row]; const int64_t out_row_start = out_ptr[out_row];
const int64_t deg = in_ptr[row + 1] - in_row_start;
const int64_t deg = in_ptr[row+1] - in_row_start;
if (deg > 0) { if (deg > 0) {
// each thread then blindly copies in rows only if deg > 0. // each thread then blindly copies in rows only if deg > 0.
for (int idx = threadIdx.x; idx < num_picks; idx += CTA_SIZE) { for (int idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
const int64_t edge = curand(&rng) % deg; const int64_t edge = curand(&rng) % deg;
const int64_t out_idx = out_row_start+idx; const int64_t out_idx = out_row_start + idx;
out_rows[out_idx] = row; out_rows[out_idx] = row;
out_cols[out_idx] = in_index[in_row_start+edge]; out_cols[out_idx] = in_index[in_row_start + edge];
out_idxs[out_idx] = data ? data[in_row_start+edge] : in_row_start+edge; out_idxs[out_idx] = data ? data[in_row_start + edge] : in_row_start + edge;
} }
} }
out_row += BLOCK_CTAS; out_row += 1;
} }
} }
} // namespace } // namespace
/////////////////////////////// CSR ///////////////////////////////
///////////////////////////// CSR sampling //////////////////////////
template <DLDeviceType XPU, typename IdType> template <DLDeviceType XPU, typename IdType>
COOMatrix CSRRowWiseSamplingUniform(CSRMatrix mat, COOMatrix CSRRowWiseSamplingUniform(CSRMatrix mat,
...@@ -277,22 +269,26 @@ COOMatrix CSRRowWiseSamplingUniform(CSRMatrix mat, ...@@ -277,22 +269,26 @@ COOMatrix CSRRowWiseSamplingUniform(CSRMatrix mat,
// compute degree // compute degree
IdType * out_deg = static_cast<IdType*>( IdType * out_deg = static_cast<IdType*>(
device->AllocWorkspace(ctx, (num_rows+1)*sizeof(IdType))); device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
if (replace) { if (replace) {
const dim3 block(512); const dim3 block(512);
const dim3 grid((num_rows+block.x-1)/block.x); const dim3 grid((num_rows + block.x - 1) / block.x);
_CSRRowWiseSampleDegreeReplaceKernel<<<grid, block, 0, stream>>>( CUDA_KERNEL_CALL(
_CSRRowWiseSampleDegreeReplaceKernel,
grid, block, 0, stream,
num_picks, num_rows, slice_rows, in_ptr, out_deg); num_picks, num_rows, slice_rows, in_ptr, out_deg);
} else { } else {
const dim3 block(512); const dim3 block(512);
const dim3 grid((num_rows+block.x-1)/block.x); const dim3 grid((num_rows + block.x - 1) / block.x);
_CSRRowWiseSampleDegreeKernel<<<grid, block, 0, stream>>>( CUDA_KERNEL_CALL(
_CSRRowWiseSampleDegreeKernel,
grid, block, 0, stream,
num_picks, num_rows, slice_rows, in_ptr, out_deg); num_picks, num_rows, slice_rows, in_ptr, out_deg);
} }
// fill out_ptr // fill out_ptr
IdType * out_ptr = static_cast<IdType*>( IdType * out_ptr = static_cast<IdType*>(
device->AllocWorkspace(ctx, (num_rows+1)*sizeof(IdType))); device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
size_t prefix_temp_size = 0; size_t prefix_temp_size = 0;
CUDA_CALL(cub::DeviceScan::ExclusiveSum(nullptr, prefix_temp_size, CUDA_CALL(cub::DeviceScan::ExclusiveSum(nullptr, prefix_temp_size,
out_deg, out_deg,
...@@ -314,24 +310,25 @@ COOMatrix CSRRowWiseSamplingUniform(CSRMatrix mat, ...@@ -314,24 +310,25 @@ COOMatrix CSRRowWiseSamplingUniform(CSRMatrix mat,
// TODO(dlasalle): use pinned memory to overlap with the actual sampling, and wait on // TODO(dlasalle): use pinned memory to overlap with the actual sampling, and wait on
// a cudaevent // a cudaevent
IdType new_len; IdType new_len;
device->CopyDataFromTo(out_ptr, num_rows*sizeof(new_len), &new_len, 0, device->CopyDataFromTo(out_ptr, num_rows * sizeof(new_len), &new_len, 0,
sizeof(new_len), sizeof(new_len),
ctx, ctx,
DGLContext{kDLCPU, 0}, DGLContext{kDLCPU, 0},
mat.indptr->dtype, mat.indptr->dtype,
stream); stream);
CUDA_CALL(cudaEventRecord(copyEvent, stream)); CUDA_CALL(cudaEventRecord(copyEvent, stream));
const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000); const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
// select edges // select edges
if (replace) { // the number of rows each thread block will cover
constexpr int BLOCK_CTAS = 128/CTA_SIZE; constexpr int TILE_SIZE = 128 / BLOCK_SIZE;
// the number of rows each thread block will cover if (replace) { // with replacement
constexpr int TILE_SIZE = BLOCK_CTAS; const dim3 block(BLOCK_SIZE);
const dim3 block(CTA_SIZE, BLOCK_CTAS); const dim3 grid((num_rows + TILE_SIZE - 1) / TILE_SIZE);
const dim3 grid((num_rows+TILE_SIZE-1)/TILE_SIZE); CUDA_KERNEL_CALL(
_CSRRowWiseSampleReplaceKernel<IdType, BLOCK_CTAS, TILE_SIZE><<<grid, block, 0, stream>>>( (_CSRRowWiseSampleUniformReplaceKernel<IdType, TILE_SIZE>),
grid, block, 0, stream,
random_seed, random_seed,
num_picks, num_picks,
num_rows, num_rows,
...@@ -343,13 +340,12 @@ COOMatrix CSRRowWiseSamplingUniform(CSRMatrix mat, ...@@ -343,13 +340,12 @@ COOMatrix CSRRowWiseSamplingUniform(CSRMatrix mat,
out_rows, out_rows,
out_cols, out_cols,
out_idxs); out_idxs);
} else { } else { // without replacement
constexpr int BLOCK_CTAS = 128/CTA_SIZE; const dim3 block(BLOCK_SIZE);
// the number of rows each thread block will cover const dim3 grid((num_rows + TILE_SIZE - 1) / TILE_SIZE);
constexpr int TILE_SIZE = BLOCK_CTAS; CUDA_KERNEL_CALL(
const dim3 block(CTA_SIZE, BLOCK_CTAS); (_CSRRowWiseSampleUniformKernel<IdType, TILE_SIZE>),
const dim3 grid((num_rows+TILE_SIZE-1)/TILE_SIZE); grid, block, 0, stream,
_CSRRowWiseSampleKernel<IdType, BLOCK_CTAS, TILE_SIZE><<<grid, block, 0, stream>>>(
random_seed, random_seed,
num_picks, num_picks,
num_rows, num_rows,
......
/*!
* Copyright (c) 2022 by Contributors
* \file array/cuda/rowwise_sampling_prob.cu
* \brief weighted rowwise sampling. The degree computing kernels and
* host-side functions are partially borrowed from the uniform rowwise
* sampling code rowwise_sampling.cu.
* \author pengqirong (OPPO), dlasalle and Xin from Nvidia.
*/
#include <dgl/random.h>
#include <dgl/runtime/device_api.h>
#include <curand_kernel.h>
#include <numeric>
#include "./dgl_cub.cuh"
#include "../../array/cuda/atomic.cuh"
#include "../../runtime/cuda/cuda_common.h"
// require CUB 1.17 to use DeviceSegmentedSort
static_assert(CUB_VERSION >= 101700, "Require CUB >= 1.17 to use DeviceSegmentedSort");
using namespace dgl::aten::cuda;
namespace dgl {
namespace aten {
namespace impl {
namespace {
constexpr int BLOCK_SIZE = 128;
/**
* @brief Compute the size of each row in the sampled CSR, without replacement.
* temp_deg is calculated for rows with deg > num_picks.
* For these rows, we will calculate their A-Res values and sort them to get top-num_picks.
*
* @tparam IdType The type of node and edge indexes.
* @param num_picks The number of non-zero entries to pick per row.
* @param num_rows The number of rows to pick.
* @param in_rows The set of rows to pick.
* @param in_ptr The index where each row's edges start.
* @param out_deg The size of each row in the sampled matrix, as indexed by `in_rows` (output).
* @param temp_deg The size of each row in the input matrix, as indexed by `in_rows` (output).
*/
template<typename IdType>
__global__ void _CSRRowWiseSampleDegreeKernel(
const int64_t num_picks,
const int64_t num_rows,
const IdType * const in_rows,
const IdType * const in_ptr,
IdType * const out_deg,
IdType * const temp_deg) {
const int64_t tIdx = threadIdx.x + blockIdx.x * blockDim.x;
if (tIdx < num_rows) {
const int64_t in_row = in_rows[tIdx];
const int64_t out_row = tIdx;
const IdType deg = in_ptr[in_row + 1] - in_ptr[in_row];
// temp_deg is used to generate ares_ptr
temp_deg[out_row] = deg > static_cast<IdType>(num_picks) ? deg : 0;
out_deg[out_row] = min(static_cast<IdType>(num_picks), deg);
if (out_row == num_rows - 1) {
// make the prefixsum work
out_deg[num_rows] = 0;
temp_deg[num_rows] = 0;
}
}
}
/**
* @brief Compute the size of each row in the sampled CSR, with replacement.
* We need the actual in degree of each row to store CDF values.
*
* @tparam IdType The type of node and edge indexes.
* @param num_picks The number of non-zero entries to pick per row.
* @param num_rows The number of rows to pick.
* @param in_rows The set of rows to pick.
* @param in_ptr The index where each row's edges start.
* @param out_deg The size of each row in the sampled matrix, as indexed by `in_rows` (output).
* @param temp_deg The size of each row in the input matrix, as indexed by `in_rows` (output).
*/
template<typename IdType>
__global__ void _CSRRowWiseSampleDegreeReplaceKernel(
const int64_t num_picks,
const int64_t num_rows,
const IdType * const in_rows,
const IdType * const in_ptr,
IdType * const out_deg,
IdType * const temp_deg) {
const int64_t tIdx = threadIdx.x + blockIdx.x * blockDim.x;
if (tIdx < num_rows) {
const int64_t in_row = in_rows[tIdx];
const int64_t out_row = tIdx;
const IdType deg = in_ptr[in_row + 1] - in_ptr[in_row];
temp_deg[out_row] = deg;
out_deg[out_row] = deg == 0 ? 0 : static_cast<IdType>(num_picks);
if (out_row == num_rows - 1) {
// make the prefixsum work
out_deg[num_rows] = 0;
temp_deg[num_rows] = 0;
}
}
}
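Both degree kernels write a trailing zero at ``out_deg[num_rows]`` so that the subsequent exclusive prefix sum over ``num_rows + 1`` entries yields the per-row output offsets and, in its last slot, the total number of sampled entries. A NumPy sketch of that step (values are illustrative):

import numpy as np

out_deg = np.array([2, 0, 3, 2, 0], dtype=np.int64)       # 4 rows plus the padding zero
out_ptr = np.cumsum(np.concatenate(([0], out_deg[:-1])))  # exclusive prefix sum
# out_ptr == [0, 2, 2, 5, 7]; out_ptr[-1] is the total number of picked entries.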
/**
* @brief Equivalent to numpy expression: array[idx[off:off + len]]
*
* @tparam IdType The ID type used for indices.
* @tparam FloatType The float type used for array values.
* @param array The array to be selected.
* @param idx_data The index mapping array.
* @param idx The index of the value to be selected.
* @param offset The offset to start.
* @param out The selected value (output).
*/
template<typename IdType, typename FloatType>
__device__ void _DoubleSlice(
const FloatType * const array,
const IdType * const idx_data,
const IdType idx,
const IdType offset,
FloatType* const out) {
if (idx_data) {
*out = array[idx_data[offset + idx]];
} else {
*out = array[offset + idx];
}
}
/**
* @brief Compute A-Res value. A-Res value needs to be calculated only if deg
* is greater than num_picks in weighted rowwise sampling without replacement.
*
* @tparam IdType The ID type used for matrices.
* @tparam FloatType The Float type used for matrices.
* @tparam TILE_SIZE The number of rows covered by each threadblock.
* @param rand_seed The random seed to use.
* @param num_picks The number of non-zeros to pick per row.
* @param num_rows The number of rows to pick.
* @param in_rows The set of rows to pick.
* @param in_ptr The indptr array of the input CSR.
* @param data The data array of the input CSR.
* @param prob The probability array of the input CSR.
* @param ares_ptr The offset to write each row to in the A-res array.
* @param ares_idxs The A-Res value corresponding index array, the index of input CSR (output).
* @param ares The A-Res value array (output).
* @author pengqirong (OPPO)
*/
template<typename IdType, typename FloatType, int TILE_SIZE>
__global__ void _CSRAResValueKernel(
const uint64_t rand_seed,
const int64_t num_picks,
const int64_t num_rows,
const IdType * const in_rows,
const IdType * const in_ptr,
const IdType * const data,
const FloatType * const prob,
const IdType * const ares_ptr,
IdType * const ares_idxs,
FloatType * const ares) {
int64_t out_row = blockIdx.x * TILE_SIZE;
const int64_t last_row = min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
curandStatePhilox4_32_10_t rng;
curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
while (out_row < last_row) {
const int64_t row = in_rows[out_row];
const int64_t in_row_start = in_ptr[row];
const int64_t deg = in_ptr[row + 1] - in_row_start;
// A-Res value needs to be calculated only if deg is greater than num_picks
// in weighted rowwise sampling without replacement
if (deg > num_picks) {
const int64_t ares_row_start = ares_ptr[out_row];
for (int64_t idx = threadIdx.x; idx < deg; idx += BLOCK_SIZE) {
const int64_t in_idx = in_row_start + idx;
const int64_t ares_idx = ares_row_start + idx;
FloatType item_prob;
_DoubleSlice<IdType, FloatType>(prob, data, idx, in_row_start, &item_prob);
// compute A-Res value
ares[ares_idx] = static_cast<FloatType>(__powf(curand_uniform(&rng), 1.0f / item_prob));
ares_idxs[ares_idx] = static_cast<IdType>(in_idx);
}
}
out_row += 1;
}
}
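The kernel above computes A-Res keys (u ** (1 / w) for a uniform random u) only for rows whose degree exceeds ``num_picks``; sorting each row's keys and keeping the top ``num_picks`` gives a weighted sample without replacement. A NumPy reference of the per-row idea (illustration only, not DGL code):

import numpy as np

def a_res_row(prob, num_picks, rng=None):
    """Weighted sampling without replacement for one row with positive weights `prob`."""
    rng = rng or np.random.default_rng()
    if len(prob) <= num_picks:
        return np.arange(len(prob))            # degenerate case: keep the whole row
    u = rng.random(len(prob))
    keys = u ** (1.0 / np.asarray(prob, dtype=np.float64))   # A-Res key per neighbor
    return np.argsort(-keys)[:num_picks]       # indices of the top-num_picks keys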
/**
* @brief Perform weighted row-wise sampling on a CSR matrix, and generate a COO matrix,
* without replacement. After sorting, we select top-num_picks items.
*
* @tparam IdType The ID type used for matrices.
* @tparam FloatType The Float type used for matrices.
* @tparam TILE_SIZE The number of rows covered by each threadblock.
* @param num_picks The number of non-zeros to pick per row.
* @param num_rows The number of rows to pick.
* @param in_rows The set of rows to pick.
* @param in_ptr The indptr array of the input CSR.
* @param in_cols The columns array of the input CSR.
* @param data The data array of the input CSR.
* @param out_ptr The offset to write each row to in the output COO.
* @param ares_ptr The offset to write each row to in the ares array.
 * @param sort_ares_idxs The input-CSR indices sorted by descending A-Res value.
* @param out_rows The rows of the output COO (output).
* @param out_cols The columns of the output COO (output).
* @param out_idxs The data array of the output COO (output).
* @author pengqirong (OPPO)
*/
template<typename IdType, typename FloatType, int TILE_SIZE>
__global__ void _CSRRowWiseSampleKernel(
const int64_t num_picks,
const int64_t num_rows,
const IdType * const in_rows,
const IdType * const in_ptr,
const IdType * const in_cols,
const IdType * const data,
const IdType * const out_ptr,
const IdType * const ares_ptr,
const IdType * const sort_ares_idxs,
IdType * const out_rows,
IdType * const out_cols,
IdType * const out_idxs) {
// we assign one warp per row
assert(blockDim.x == BLOCK_SIZE);
int64_t out_row = blockIdx.x * TILE_SIZE;
const int64_t last_row = min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
while (out_row < last_row) {
const int64_t row = in_rows[out_row];
const int64_t in_row_start = in_ptr[row];
const int64_t out_row_start = out_ptr[out_row];
const int64_t deg = in_ptr[row + 1] - in_row_start;
if (deg > num_picks) {
const int64_t ares_row_start = ares_ptr[out_row];
for (int64_t idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
        // get in and out index; in_idx is the input-CSR index of one of the
        // top num_picks A-Res values.
const int64_t out_idx = out_row_start + idx;
const int64_t ares_idx = ares_row_start + idx;
const int64_t in_idx = sort_ares_idxs[ares_idx];
// copy permutation over
out_rows[out_idx] = static_cast<IdType>(row);
out_cols[out_idx] = in_cols[in_idx];
out_idxs[out_idx] = static_cast<IdType>(data ? data[in_idx] : in_idx);
}
} else {
for (int64_t idx = threadIdx.x; idx < deg; idx += BLOCK_SIZE) {
// get in and out index
const int64_t out_idx = out_row_start + idx;
const int64_t in_idx = in_row_start + idx;
// copy permutation over
out_rows[out_idx] = static_cast<IdType>(row);
out_cols[out_idx] = in_cols[in_idx];
out_idxs[out_idx] = static_cast<IdType>(data ? data[in_idx] : in_idx);
}
}
out_row += 1;
}
}
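/*
 * Illustration (hypothetical numbers): with num_picks = 2 and a row of degree 4 whose A-Res keys
 * sort to input-CSR indices {7, 3, 9, 6}, the kernel above emits the entries at indices 7 and 3
 * for that row; a row with deg <= num_picks is copied verbatim by the else-branch.
 */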
// A stateful callback functor that maintains a running prefix to be applied
// during consecutive scan operations.
template<typename FloatType>
struct BlockPrefixCallbackOp {
// Running prefix
FloatType running_total;
// Constructor
__device__ BlockPrefixCallbackOp(FloatType running_total) : running_total(running_total) {}
// Callback operator to be entered by the first warp of threads in the block.
// Thread-0 is responsible for returning a value for seeding the block-wide scan.
__device__ FloatType operator()(FloatType block_aggregate) {
FloatType old_prefix = running_total;
running_total += block_aggregate;
return old_prefix;
}
};
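// Illustration (hypothetical numbers): scanning {1, 2, 3, 4} with a 2-thread block takes two
// passes. Pass 1 computes the inclusive sums {1, 3}; the callback returns the initial prefix 0
// and records the block aggregate 3. Pass 2 over {3, 4} is seeded with that prefix and yields
// {6, 10}, i.e. the inclusive scan of the full sequence.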
/**
 * @brief Perform weighted row-wise sampling on a CSR matrix with replacement,
 * and generate a COO matrix. The unnormalized CDF of each row's neighbors is stored
 * in global memory, and a binary search over it (inverse transform sampling) picks
 * each selected item.
*
* @tparam IdType The ID type used for matrices.
* @tparam FloatType The Float type used for matrices.
* @tparam TILE_SIZE The number of rows covered by each threadblock.
* @param rand_seed The random seed to use.
* @param num_picks The number of non-zeros to pick per row.
* @param num_rows The number of rows to pick.
* @param in_rows The set of rows to pick.
* @param in_ptr The indptr array of the input CSR.
* @param in_cols The columns array of the input CSR.
* @param data The data array of the input CSR.
* @param prob The probability array of the input CSR.
* @param out_ptr The offset to write each row to in the output COO.
* @param cdf_ptr The offset of each cdf segment.
* @param cdf The global buffer to store cdf segments.
* @param out_rows The rows of the output COO (output).
* @param out_cols The columns of the output COO (output).
* @param out_idxs The data array of the output COO (output).
* @author pengqirong (OPPO)
*/
template<typename IdType, typename FloatType, int TILE_SIZE>
__global__ void _CSRRowWiseSampleReplaceKernel(
const uint64_t rand_seed,
const int64_t num_picks,
const int64_t num_rows,
const IdType * const in_rows,
const IdType * const in_ptr,
const IdType * const in_cols,
const IdType * const data,
const FloatType * const prob,
const IdType * const out_ptr,
const IdType * const cdf_ptr,
FloatType * const cdf,
IdType * const out_rows,
IdType * const out_cols,
IdType * const out_idxs
) {
// we assign one warp per row
assert(blockDim.x == BLOCK_SIZE);
int64_t out_row = blockIdx.x * TILE_SIZE;
const int64_t last_row = min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
curandStatePhilox4_32_10_t rng;
curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
while (out_row < last_row) {
const int64_t row = in_rows[out_row];
const int64_t in_row_start = in_ptr[row];
const int64_t out_row_start = out_ptr[out_row];
const int64_t cdf_row_start = cdf_ptr[out_row];
const int64_t deg = in_ptr[row + 1] - in_row_start;
const FloatType MIN_THREAD_DATA = static_cast<FloatType>(0.0f);
if (deg > 0) {
// Specialize BlockScan for a 1D block of BLOCK_SIZE threads
typedef cub::BlockScan<FloatType, BLOCK_SIZE> BlockScan;
// Allocate shared memory for BlockScan
__shared__ typename BlockScan::TempStorage temp_storage;
// Initialize running total
BlockPrefixCallbackOp<FloatType> prefix_op(MIN_THREAD_DATA);
int64_t max_iter = (1 + (deg - 1) / BLOCK_SIZE) * BLOCK_SIZE;
// Have the block iterate over segments of items
for (int64_t idx = threadIdx.x; idx < max_iter; idx += BLOCK_SIZE) {
// Load a segment of consecutive items that are blocked across threads
FloatType thread_data;
if (idx < deg)
_DoubleSlice<IdType, FloatType>(prob, data, idx, in_row_start, &thread_data);
else
thread_data = MIN_THREAD_DATA;
thread_data = max(thread_data, MIN_THREAD_DATA);
// Collectively compute the block-wide inclusive prefix sum
BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, prefix_op);
__syncthreads();
// Store scanned items to cdf array
if (idx < deg) {
cdf[cdf_row_start + idx] = thread_data;
}
}
__syncthreads();
for (int64_t idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) {
// get random value
FloatType sum = cdf[cdf_row_start + deg - 1];
FloatType rand = static_cast<FloatType>(curand_uniform(&rng) * sum);
// get the offset of the first value within cdf array which is greater than random value.
int64_t item = cub::UpperBound<FloatType*, int64_t, FloatType>(
&cdf[cdf_row_start], deg, rand);
item = min(item, deg - 1);
// get in and out index
const int64_t in_idx = in_row_start + item;
const int64_t out_idx = out_row_start + idx;
// copy permutation over
out_rows[out_idx] = static_cast<IdType>(row);
out_cols[out_idx] = in_cols[in_idx];
out_idxs[out_idx] = static_cast<IdType>(data ? data[in_idx] : in_idx);
}
}
out_row += 1;
}
}
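/*
 * A rough host-side sketch of the inverse transform sampling the kernel above performs with
 * cub::UpperBound on the per-row CDF (an illustration only, assuming plain C++/STL; not part of
 * this file's build):
 *
 *   #include <algorithm>
 *   #include <random>
 *   #include <vector>
 *
 *   size_t SampleFromCdf(const std::vector<double>& cdf, std::mt19937_64* rng) {
 *     // cdf is the inclusive prefix sum of the (unnormalized) weights, e.g. {0.2, 1.0, 1.5}.
 *     std::uniform_real_distribution<double> uni(0.0, cdf.back());
 *     const double r = uni(*rng);
 *     // first position whose CDF value exceeds r, clamped like the kernel above
 *     auto it = std::upper_bound(cdf.begin(), cdf.end(), r);
 *     return std::min<size_t>(it - cdf.begin(), cdf.size() - 1);
 *   }
 */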
} // namespace
/////////////////////////////// CSR ///////////////////////////////
/**
* @brief Perform weighted row-wise sampling on a CSR matrix, and generate a COO matrix.
* Use CDF sampling algorithm for with replacement:
* 1) Calculate the CDF of all neighbor's prob.
* 2) For each [0, num_picks), generate a rand ~ U(0, 1).
* Use binary search to find its index in the CDF array as a chosen item.
* Use A-Res sampling algorithm for without replacement:
* 1) For rows with deg > num_picks, calculate A-Res values for all neighbors.
* 2) Sort the A-Res array and select top-num_picks as chosen items.
*
* @tparam XPU The device type used for matrices.
* @tparam IdType The ID type used for matrices.
* @tparam FloatType The Float type used for matrices.
* @param mat The CSR matrix.
* @param rows The set of rows to pick.
* @param num_picks The number of non-zeros to pick per row.
* @param prob The probability array of the input CSR.
 * @param replace Whether to sample with replacement.
* @author pengqirong (OPPO), dlasalle and Xin from Nvidia.
*/
template <DLDeviceType XPU, typename IdType, typename FloatType>
COOMatrix CSRRowWiseSampling(CSRMatrix mat,
IdArray rows,
int64_t num_picks,
FloatArray prob,
bool replace) {
const auto& ctx = rows->ctx;
auto device = runtime::DeviceAPI::Get(ctx);
// TODO(dlasalle): Once the device api supports getting the stream from the
// context, that should be used instead of the default stream here.
cudaStream_t stream = 0;
const int64_t num_rows = rows->shape[0];
const IdType * const slice_rows = static_cast<const IdType*>(rows->data);
IdArray picked_row = NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8);
IdArray picked_col = NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8);
IdArray picked_idx = NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8);
const IdType * const in_ptr = static_cast<const IdType*>(mat.indptr->data);
const IdType * const in_cols = static_cast<const IdType*>(mat.indices->data);
IdType* const out_rows = static_cast<IdType*>(picked_row->data);
IdType* const out_cols = static_cast<IdType*>(picked_col->data);
IdType* const out_idxs = static_cast<IdType*>(picked_idx->data);
const IdType* const data = CSRHasData(mat) ?
static_cast<IdType*>(mat.data->data) : nullptr;
const FloatType* const prob_data = static_cast<const FloatType*>(prob->data);
// compute degree
// out_deg: the size of each row in the sampled matrix
// temp_deg: the size of each row we will manipulate in sampling
// 1) for w/o replacement: in degree if it's greater than num_picks else 0
// 2) for w/ replacement: in degree
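  // Illustration (hypothetical numbers): with num_picks = 2 and row degrees {1, 3, 5},
  //   w/o replacement: out_deg = {1, 2, 2}, temp_deg = {0, 3, 5};
  //   w/  replacement: out_deg = {2, 2, 2}, temp_deg = {1, 3, 5}.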
IdType * out_deg = static_cast<IdType*>(
device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
IdType * temp_deg = static_cast<IdType*>(
device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
if (replace) {
const dim3 block(512);
const dim3 grid((num_rows + block.x - 1) / block.x);
CUDA_KERNEL_CALL(
_CSRRowWiseSampleDegreeReplaceKernel,
grid, block, 0, stream,
num_picks, num_rows, slice_rows, in_ptr, out_deg, temp_deg);
} else {
const dim3 block(512);
const dim3 grid((num_rows + block.x - 1) / block.x);
CUDA_KERNEL_CALL(
_CSRRowWiseSampleDegreeKernel,
grid, block, 0, stream,
num_picks, num_rows, slice_rows, in_ptr, out_deg, temp_deg);
}
// fill temp_ptr
IdType * temp_ptr = static_cast<IdType*>(
device->AllocWorkspace(ctx, (num_rows + 1)*sizeof(IdType)));
size_t prefix_temp_size = 0;
CUDA_CALL(cub::DeviceScan::ExclusiveSum(nullptr, prefix_temp_size,
temp_deg,
temp_ptr,
num_rows + 1,
stream));
void * prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
CUDA_CALL(cub::DeviceScan::ExclusiveSum(prefix_temp, prefix_temp_size,
temp_deg,
temp_ptr,
num_rows + 1,
stream));
device->FreeWorkspace(ctx, prefix_temp);
device->FreeWorkspace(ctx, temp_deg);
// TODO(Xin): The copy here is too small, and the overhead of creating
// cuda events cannot be ignored. Just use synchronized copy.
IdType temp_len;
device->CopyDataFromTo(temp_ptr, num_rows * sizeof(temp_len), &temp_len, 0,
sizeof(temp_len),
ctx,
DGLContext{kDLCPU, 0},
mat.indptr->dtype,
stream);
device->StreamSync(ctx, stream);
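  // Illustration (hypothetical numbers): for the w/o-replacement case above, an exclusive sum of
  // temp_deg = {0, 3, 5, *} gives temp_ptr = {0, 0, 3, 8} and temp_len = temp_ptr[num_rows] = 8,
  // i.e. the total scratch space needed for the A-Res (or CDF) segments.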
// fill out_ptr
IdType * out_ptr = static_cast<IdType*>(
device->AllocWorkspace(ctx, (num_rows+1)*sizeof(IdType)));
prefix_temp_size = 0;
CUDA_CALL(cub::DeviceScan::ExclusiveSum(nullptr, prefix_temp_size,
out_deg,
out_ptr,
num_rows+1,
stream));
prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
CUDA_CALL(cub::DeviceScan::ExclusiveSum(prefix_temp, prefix_temp_size,
out_deg,
out_ptr,
num_rows+1,
stream));
device->FreeWorkspace(ctx, prefix_temp);
device->FreeWorkspace(ctx, out_deg);
cudaEvent_t copyEvent;
CUDA_CALL(cudaEventCreate(&copyEvent));
// TODO(dlasalle): use pinned memory to overlap with the actual sampling, and wait on
// a cudaevent
IdType new_len;
device->CopyDataFromTo(out_ptr, num_rows * sizeof(new_len), &new_len, 0,
sizeof(new_len),
ctx,
DGLContext{kDLCPU, 0},
mat.indptr->dtype,
stream);
CUDA_CALL(cudaEventRecord(copyEvent, stream));
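  // The `new_len` copy is issued asynchronously and only waited on via `copyEvent` near the end
  // of this function, after the sampling kernels have been launched, so the device-to-host
  // transfer can overlap with the remaining setup and kernel launches.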
// allocate workspace
// 1) for w/ replacement, it's a global buffer to store cdf segments (one segment for each row).
// 2) for w/o replacement, it's used to store a-res segments (one segment for
// each row with degree > num_picks)
FloatType * temp = static_cast<FloatType*>(
device->AllocWorkspace(ctx, temp_len * sizeof(FloatType)));
const uint64_t rand_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
// select edges
// the number of rows each thread block will cover
constexpr int TILE_SIZE = 128 / BLOCK_SIZE;
if (replace) { // with replacement.
const dim3 block(BLOCK_SIZE);
const dim3 grid((num_rows + TILE_SIZE - 1) / TILE_SIZE);
CUDA_KERNEL_CALL(
(_CSRRowWiseSampleReplaceKernel<IdType, FloatType, TILE_SIZE>),
grid, block, 0, stream,
rand_seed,
num_picks,
num_rows,
slice_rows,
in_ptr,
in_cols,
data,
prob_data,
out_ptr,
temp_ptr,
temp,
out_rows,
out_cols,
out_idxs);
device->FreeWorkspace(ctx, temp);
} else { // without replacement
IdType* temp_idxs = static_cast<IdType*>(
device->AllocWorkspace(ctx, (temp_len) * sizeof(IdType)));
// Compute A-Res value. A-Res value needs to be calculated only if deg
// is greater than num_picks in weighted rowwise sampling without replacement.
const dim3 block(BLOCK_SIZE);
const dim3 grid((num_rows + TILE_SIZE - 1) / TILE_SIZE);
CUDA_KERNEL_CALL(
(_CSRAResValueKernel<IdType, FloatType, TILE_SIZE>),
grid, block, 0, stream,
rand_seed,
num_picks,
num_rows,
slice_rows,
in_ptr,
data,
prob_data,
temp_ptr,
temp_idxs,
temp);
// sort A-Res value array.
FloatType* sort_temp = static_cast<FloatType*>(
device->AllocWorkspace(ctx, temp_len * sizeof(FloatType)));
IdType* sort_temp_idxs = static_cast<IdType*>(
device->AllocWorkspace(ctx, temp_len * sizeof(IdType)));
cub::DoubleBuffer<FloatType> sort_keys(temp, sort_temp);
cub::DoubleBuffer<IdType> sort_values(temp_idxs, sort_temp_idxs);
void *d_temp_storage = nullptr;
size_t temp_storage_bytes = 0;
CUDA_CALL(cub::DeviceSegmentedSort::SortPairsDescending(
d_temp_storage,
temp_storage_bytes,
sort_keys,
sort_values,
temp_len,
num_rows,
temp_ptr,
temp_ptr + 1));
d_temp_storage = device->AllocWorkspace(ctx, temp_storage_bytes);
CUDA_CALL(cub::DeviceSegmentedSort::SortPairsDescending(
d_temp_storage,
temp_storage_bytes,
sort_keys,
sort_values,
temp_len,
num_rows,
temp_ptr,
temp_ptr + 1));
device->FreeWorkspace(ctx, d_temp_storage);
device->FreeWorkspace(ctx, temp);
device->FreeWorkspace(ctx, temp_idxs);
device->FreeWorkspace(ctx, sort_temp);
device->FreeWorkspace(ctx, sort_temp_idxs);
    // select the top num_picks items of each row as results
CUDA_KERNEL_CALL(
(_CSRRowWiseSampleKernel<IdType, FloatType, TILE_SIZE>),
grid, block, 0, stream,
num_picks,
num_rows,
slice_rows,
in_ptr,
in_cols,
data,
out_ptr,
temp_ptr,
sort_values.Current(),
out_rows,
out_cols,
out_idxs);
}
device->FreeWorkspace(ctx, temp_ptr);
device->FreeWorkspace(ctx, out_ptr);
// wait for copying `new_len` to finish
CUDA_CALL(cudaEventSynchronize(copyEvent));
CUDA_CALL(cudaEventDestroy(copyEvent));
picked_row = picked_row.CreateView({new_len}, picked_row->dtype);
picked_col = picked_col.CreateView({new_len}, picked_col->dtype);
picked_idx = picked_idx.CreateView({new_len}, picked_idx->dtype);
return COOMatrix(mat.num_rows, mat.num_cols, picked_row,
picked_col, picked_idx);
}
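// Example call (a sketch; `csr`, `seed_rows` and `edge_prob` are hypothetical names and are
// assumed to already reside on the GPU with matching id/float types):
//   COOMatrix sub = CSRRowWiseSampling<kDLGPU, int64_t, float>(
//       csr, seed_rows, /*num_picks=*/5, edge_prob, /*replace=*/false);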
template COOMatrix CSRRowWiseSampling<kDLGPU, int32_t, float>(
CSRMatrix, IdArray, int64_t, FloatArray, bool);
template COOMatrix CSRRowWiseSampling<kDLGPU, int64_t, float>(
CSRMatrix, IdArray, int64_t, FloatArray, bool);
template COOMatrix CSRRowWiseSampling<kDLGPU, int32_t, double>(
CSRMatrix, IdArray, int64_t, FloatArray, bool);
template COOMatrix CSRRowWiseSampling<kDLGPU, int64_t, double>(
CSRMatrix, IdArray, int64_t, FloatArray, bool);
} // namespace impl
} // namespace aten
} // namespace dgl
...@@ -9,39 +9,6 @@ ...@@ -9,39 +9,6 @@
namespace dgl { namespace dgl {
namespace aten { namespace aten {
/*!
* \brief CUDA implementation of g-SDDMM on heterograph using
Csr format.
*/
template <int XPU, typename IdType, int bits>
void SDDMMCsrHetero(const std::string& op,
const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr,
const std::vector<NDArray>& vec_lhs,
const std::vector<NDArray>& vec_rhs,
std::vector<NDArray> vec_out,
int lhs_target,
int rhs_target,
const std::vector<dgl_type_t>& lhs_eid,
const std::vector<dgl_type_t>& rhs_eid) {
SWITCH_BITS(bits, DType, {
SWITCH_OP(op, Op, {
SWITCH_TARGET(lhs_target, rhs_target, LhsTarget, RhsTarget, {
/* Call SDDMM CUDA kernel for each relation type sequentially */
for (dgl_type_t etype = 0; etype < lhs_eid.size(); ++etype) {
CSRMatrix csr = vec_csr[etype];
NDArray lhs = vec_lhs[lhs_eid[etype]];
NDArray rhs = vec_rhs[rhs_eid[etype]];
NDArray out = vec_out[etype];
cuda::SDDMMCsr<IdType, DType, Op, LhsTarget, RhsTarget>(
bcast, csr, lhs, rhs, out);
}
});
});
});
}
/*! /*!
* \brief CUDA implementation of g-SDDMM on heterograph using * \brief CUDA implementation of g-SDDMM on heterograph using
Csr format. Csr format.
...@@ -76,49 +43,6 @@ void SDDMMCooHetero(const std::string& op, ...@@ -76,49 +43,6 @@ void SDDMMCooHetero(const std::string& op,
} }
template void SDDMMCsrHetero<kDLGPU, int32_t, 16>(
const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr,
const std::vector<NDArray>& lhs, const std::vector<NDArray>& rhs,
std::vector<NDArray> out, int lhs_target, int rhs_target,
const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
template void SDDMMCsrHetero<kDLGPU, int64_t, 16>(
const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr,
const std::vector<NDArray>& lhs, const std::vector<NDArray>& rhs,
std::vector<NDArray> out, int lhs_target, int rhs_target,
const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
template void SDDMMCsrHetero<kDLGPU, int32_t, 32>(
const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr,
const std::vector<NDArray>& lhs, const std::vector<NDArray>& rhs,
std::vector<NDArray> out, int lhs_target, int rhs_target,
const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
template void SDDMMCsrHetero<kDLGPU, int64_t, 32>(
const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr,
const std::vector<NDArray>& lhs, const std::vector<NDArray>& rhs,
std::vector<NDArray> out, int lhs_target, int rhs_target,
const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
template void SDDMMCsrHetero<kDLGPU, int32_t, 64>(
const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr,
const std::vector<NDArray>& lhs, const std::vector<NDArray>& rhs,
std::vector<NDArray> out, int lhs_target, int rhs_target,
const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
template void SDDMMCsrHetero<kDLGPU, int64_t, 64>(
const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr,
const std::vector<NDArray>& lhs, const std::vector<NDArray>& rhs,
std::vector<NDArray> out, int lhs_target, int rhs_target,
const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
template void SDDMMCooHetero<kDLGPU, int32_t, 16>( template void SDDMMCooHetero<kDLGPU, int32_t, 16>(
const std::string& op, const BcastOff& bcast, const std::string& op, const BcastOff& bcast,
const std::vector<COOMatrix>& vec_coo, const std::vector<COOMatrix>& vec_coo,
......
/*!
* Copyright (c) 2020 by Contributors
* \file array/cuda/sddmm.cu
* \brief SDDMM C APIs and definitions.
*/
#include <dgl/array.h>
#include "./sddmm.cuh"
namespace dgl {
namespace aten {
/*!
* \brief CUDA implementation of g-SDDMM on heterograph using
Csr format.
*/
template <int XPU, typename IdType, int bits>
void SDDMMCsrHetero(const std::string& op,
const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr,
const std::vector<NDArray>& vec_lhs,
const std::vector<NDArray>& vec_rhs,
std::vector<NDArray> vec_out,
int lhs_target,
int rhs_target,
const std::vector<dgl_type_t>& lhs_eid,
const std::vector<dgl_type_t>& rhs_eid) {
SWITCH_BITS(bits, DType, {
SWITCH_OP(op, Op, {
SWITCH_TARGET(lhs_target, rhs_target, LhsTarget, RhsTarget, {
/* Call SDDMM CUDA kernel for each relation type sequentially */
for (dgl_type_t etype = 0; etype < lhs_eid.size(); ++etype) {
CSRMatrix csr = vec_csr[etype];
NDArray lhs = vec_lhs[lhs_eid[etype]];
NDArray rhs = vec_rhs[rhs_eid[etype]];
NDArray out = vec_out[etype];
cuda::SDDMMCsr<IdType, DType, Op, LhsTarget, RhsTarget>(
bcast, csr, lhs, rhs, out);
}
});
});
});
}
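/*
 * Illustration (hypothetical relation): for op = "dot" with lhs_target = src and
 * rhs_target = dst, relation r reads the source-node features vec_lhs[lhs_eid[r]] and the
 * destination-node features vec_rhs[rhs_eid[r]], and writes one edge-feature array vec_out[r];
 * the loop above simply repeats this per canonical edge type.
 */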
template void SDDMMCsrHetero<kDLGPU, int32_t, 16>(
const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr,
const std::vector<NDArray>& lhs, const std::vector<NDArray>& rhs,
std::vector<NDArray> out, int lhs_target, int rhs_target,
const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
template void SDDMMCsrHetero<kDLGPU, int64_t, 16>(
const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr,
const std::vector<NDArray>& lhs, const std::vector<NDArray>& rhs,
std::vector<NDArray> out, int lhs_target, int rhs_target,
const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
template void SDDMMCsrHetero<kDLGPU, int32_t, 32>(
const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr,
const std::vector<NDArray>& lhs, const std::vector<NDArray>& rhs,
std::vector<NDArray> out, int lhs_target, int rhs_target,
const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
template void SDDMMCsrHetero<kDLGPU, int64_t, 32>(
const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr,
const std::vector<NDArray>& lhs, const std::vector<NDArray>& rhs,
std::vector<NDArray> out, int lhs_target, int rhs_target,
const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
template void SDDMMCsrHetero<kDLGPU, int32_t, 64>(
const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr,
const std::vector<NDArray>& lhs, const std::vector<NDArray>& rhs,
std::vector<NDArray> out, int lhs_target, int rhs_target,
const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
template void SDDMMCsrHetero<kDLGPU, int64_t, 64>(
const std::string& op, const BcastOff& bcast,
const std::vector<CSRMatrix>& vec_csr,
const std::vector<NDArray>& lhs, const std::vector<NDArray>& rhs,
std::vector<NDArray> out, int lhs_target, int rhs_target,
const std::vector<dgl_type_t>& in_eid,
const std::vector<dgl_type_t>& out_eid);
} // namespace aten
} // namespace dgl
/*! /*!
* Copyright (c) 2021 by Contributors * Copyright (c) 2021-2022 by Contributors
* \file graph/sampling/randomwalk_gpu.cu * \file graph/sampling/randomwalk_gpu.cu
 * \brief DGL sampler * \brief CUDA random walk sampling
*/ */
#include <dgl/array.h> #include <dgl/array.h>
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
#include <utility> #include <utility>
#include <tuple> #include <tuple>
#include "../../../array/cuda/dgl_cub.cuh"
#include "../../../runtime/cuda/cuda_common.h" #include "../../../runtime/cuda/cuda_common.h"
#include "frequency_hashmap.cuh" #include "frequency_hashmap.cuh"
...@@ -89,6 +90,81 @@ __global__ void _RandomWalkKernel( ...@@ -89,6 +90,81 @@ __global__ void _RandomWalkKernel(
} }
} }
template <typename IdType, typename FloatType, int BLOCK_SIZE, int TILE_SIZE>
__global__ void _RandomWalkBiasedKernel(
const uint64_t rand_seed,
const IdType *seed_data,
const int64_t num_seeds,
const IdType *metapath_data,
const uint64_t max_num_steps,
const GraphKernelData<IdType> *graphs,
const FloatType **probs,
const FloatType **prob_sums,
const FloatType *restart_prob_data,
const int64_t restart_prob_size,
const int64_t max_nodes,
IdType *out_traces_data,
IdType *out_eids_data) {
assert(BLOCK_SIZE == blockDim.x);
int64_t idx = blockIdx.x * TILE_SIZE + threadIdx.x;
int64_t last_idx = min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_seeds);
int64_t trace_length = (max_num_steps + 1);
curandState rng;
// reference:
// https://docs.nvidia.com/cuda/curand/device-api-overview.html#performance-notes
curand_init(rand_seed + idx, 0, 0, &rng);
while (idx < last_idx) {
IdType curr = seed_data[idx];
assert(curr < max_nodes);
IdType *traces_data_ptr = &out_traces_data[idx * trace_length];
IdType *eids_data_ptr = &out_eids_data[idx * max_num_steps];
*(traces_data_ptr++) = curr;
int64_t step_idx;
for (step_idx = 0; step_idx < max_num_steps; ++step_idx) {
IdType metapath_id = metapath_data[step_idx];
const GraphKernelData<IdType> &graph = graphs[metapath_id];
const int64_t in_row_start = graph.in_ptr[curr];
const int64_t deg = graph.in_ptr[curr + 1] - graph.in_ptr[curr];
      if (deg == 0) {  // dead end: no out-neighbors to continue the walk
break;
}
// randomly select by weight
const FloatType *prob_sum = prob_sums[metapath_id];
const FloatType *prob = probs[metapath_id];
int64_t num;
if (prob == nullptr) {
num = curand(&rng) % deg;
} else {
auto rnd_sum_w = prob_sum[curr] * curand_uniform(&rng);
FloatType sum_w{0.};
        for (num = 0; num < deg; ++num) {
          sum_w += prob[in_row_start + num];
          if (sum_w >= rnd_sum_w) break;
        }
        // clamp in case floating-point round-off keeps sum_w below rnd_sum_w
        if (num == deg) num = deg - 1;
}
IdType pick = graph.in_cols[in_row_start + num];
IdType eid = (graph.data? graph.data[in_row_start + num] : in_row_start + num);
*traces_data_ptr = pick;
*eids_data_ptr = eid;
if ((restart_prob_size > 1) && (curand_uniform(&rng) < restart_prob_data[step_idx])) {
break;
} else if ((restart_prob_size == 1) && (curand_uniform(&rng) < restart_prob_data[0])) {
break;
}
++traces_data_ptr; ++eids_data_ptr;
curr = pick;
}
for (; step_idx < max_num_steps; ++step_idx) {
*(traces_data_ptr++) = -1;
*(eids_data_ptr++) = -1;
}
idx += BLOCK_SIZE;
}
}
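/*
 * A rough host-side sketch of the weighted next-step choice in the kernel above (an illustration
 * only, assuming plain C++/STL; not part of this file's build): draw r ~ U(0, W), where W is the
 * precomputed sum of the neighbor weights, then walk the weights until the running sum reaches r.
 *
 *   #include <random>
 *   #include <vector>
 *
 *   size_t PickNeighbor(const std::vector<double>& w, double w_sum, std::mt19937_64* rng) {
 *     std::uniform_real_distribution<double> uni(0.0, w_sum);
 *     const double r = uni(*rng);
 *     double acc = 0.0;
 *     size_t i = 0;
 *     for (; i + 1 < w.size(); ++i) {   // the last neighbor is picked if r is never reached
 *       acc += w[i];
 *       if (acc >= r) break;
 *     }
 *     return i;
 *   }
 */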
} // namespace } // namespace
// random walk for uniform choice // random walk for uniform choice
...@@ -167,6 +243,143 @@ std::pair<IdArray, IdArray> RandomWalkUniform( ...@@ -167,6 +243,143 @@ std::pair<IdArray, IdArray> RandomWalkUniform(
return std::make_pair(traces, eids); return std::make_pair(traces, eids);
} }
/**
* \brief Random walk for biased choice. We use inverse transform sampling to
* choose the next step.
*/
template <DLDeviceType XPU, typename FloatType, typename IdType>
std::pair<IdArray, IdArray> RandomWalkBiased(
const HeteroGraphPtr hg,
const IdArray seeds,
const TypeArray metapath,
const std::vector<FloatArray> &prob,
FloatArray restart_prob) {
const int64_t max_num_steps = metapath->shape[0];
const IdType *metapath_data = static_cast<IdType *>(metapath->data);
const int64_t begin_ntype = hg->meta_graph()->FindEdge(metapath_data[0]).first;
const int64_t max_nodes = hg->NumVertices(begin_ntype);
int64_t num_etypes = hg->NumEdgeTypes();
auto ctx = seeds->ctx;
const IdType *seed_data = static_cast<const IdType*>(seeds->data);
CHECK(seeds->ndim == 1) << "seeds shape is not one dimension.";
const int64_t num_seeds = seeds->shape[0];
int64_t trace_length = max_num_steps + 1;
IdArray traces = IdArray::Empty({num_seeds, trace_length}, seeds->dtype, ctx);
IdArray eids = IdArray::Empty({num_seeds, max_num_steps}, seeds->dtype, ctx);
IdType *traces_data = traces.Ptr<IdType>();
IdType *eids_data = eids.Ptr<IdType>();
cudaStream_t stream = 0;
auto device = DeviceAPI::Get(ctx);
// new probs and prob sums pointers
assert(num_etypes == static_cast<int64_t>(prob.size()));
std::unique_ptr<FloatType *[]> probs(new FloatType *[prob.size()]);
std::unique_ptr<FloatType *[]> prob_sums(new FloatType *[prob.size()]);
std::vector<FloatArray> prob_sums_arr;
prob_sums_arr.reserve(prob.size());
// graphs
std::vector<GraphKernelData<IdType>> h_graphs(num_etypes);
for (int64_t etype = 0; etype < num_etypes; ++etype) {
const CSRMatrix &csr = hg->GetCSRMatrix(etype);
h_graphs[etype].in_ptr = static_cast<const IdType*>(csr.indptr->data);
h_graphs[etype].in_cols = static_cast<const IdType*>(csr.indices->data);
h_graphs[etype].data = (CSRHasData(csr) ? static_cast<const IdType*>(csr.data->data) : nullptr);
int64_t num_segments = csr.indptr->shape[0] - 1;
// will handle empty probs in the kernel
if (IsNullArray(prob[etype])) {
probs[etype] = nullptr;
prob_sums[etype] = nullptr;
continue;
}
probs[etype] = prob[etype].Ptr<FloatType>();
prob_sums_arr.push_back(FloatArray::Empty({num_segments}, prob[etype]->dtype, ctx));
    prob_sums[etype] = prob_sums_arr.back().Ptr<FloatType>();
// calculate the sum of the neighbor weights
const IdType *d_offsets = static_cast<const IdType*>(csr.indptr->data);
size_t temp_storage_size = 0;
CUDA_CALL(cub::DeviceSegmentedReduce::Sum(nullptr, temp_storage_size,
probs[etype],
prob_sums[etype],
num_segments,
d_offsets,
d_offsets + 1));
void *temp_storage = device->AllocWorkspace(ctx, temp_storage_size);
CUDA_CALL(cub::DeviceSegmentedReduce::Sum(temp_storage, temp_storage_size,
probs[etype],
prob_sums[etype],
num_segments,
d_offsets,
d_offsets + 1));
device->FreeWorkspace(ctx, temp_storage);
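    // Illustration (hypothetical numbers): with indptr = {0, 2, 5} and
    // prob = {0.2, 0.8, 0.1, 0.1, 0.3}, the segmented reduction yields
    // prob_sums = {1.0, 0.5}, i.e. one unnormalized total per source node.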
}
// copy graph metadata pointers to GPU
auto d_graphs = static_cast<GraphKernelData<IdType>*>(
device->AllocWorkspace(ctx, (num_etypes) * sizeof(GraphKernelData<IdType>)));
device->CopyDataFromTo(h_graphs.data(), 0, d_graphs, 0,
(num_etypes) * sizeof(GraphKernelData<IdType>),
DGLContext{kDLCPU, 0},
ctx,
hg->GetCSRMatrix(0).indptr->dtype,
stream);
// copy probs pointers to GPU
const FloatType **probs_dev = static_cast<const FloatType **>(
device->AllocWorkspace(ctx, num_etypes * sizeof(FloatType *)));
device->CopyDataFromTo(probs.get(), 0, probs_dev, 0,
(num_etypes) * sizeof(FloatType *),
DGLContext{kDLCPU, 0},
ctx,
prob[0]->dtype,
stream);
// copy probs_sum pointers to GPU
const FloatType **prob_sums_dev = static_cast<const FloatType **>(
device->AllocWorkspace(ctx, num_etypes * sizeof(FloatType *)));
device->CopyDataFromTo(prob_sums.get(), 0, prob_sums_dev, 0,
(num_etypes) * sizeof(FloatType *),
DGLContext{kDLCPU, 0},
ctx,
prob[0]->dtype,
stream);
// copy metapath to GPU
auto d_metapath = metapath.CopyTo(ctx);
const IdType *d_metapath_data = static_cast<IdType *>(d_metapath->data);
constexpr int BLOCK_SIZE = 256;
constexpr int TILE_SIZE = BLOCK_SIZE * 4;
  dim3 block(BLOCK_SIZE);
dim3 grid((num_seeds + TILE_SIZE - 1) / TILE_SIZE);
const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
  CHECK(restart_prob->ctx.device_type == kDLGPU) << "restart_prob should be on the GPU.";
CHECK(restart_prob->ndim == 1) << "restart prob dimension should be 1.";
const FloatType *restart_prob_data = restart_prob.Ptr<FloatType>();
const int64_t restart_prob_size = restart_prob->shape[0];
CUDA_KERNEL_CALL(
(_RandomWalkBiasedKernel<IdType, FloatType, BLOCK_SIZE, TILE_SIZE>),
grid, block, 0, stream,
random_seed,
seed_data,
num_seeds,
d_metapath_data,
max_num_steps,
d_graphs,
probs_dev,
prob_sums_dev,
restart_prob_data,
restart_prob_size,
max_nodes,
traces_data,
eids_data);
device->FreeWorkspace(ctx, d_graphs);
device->FreeWorkspace(ctx, probs_dev);
device->FreeWorkspace(ctx, prob_sums_dev);
return std::make_pair(traces, eids);
}
template<DLDeviceType XPU, typename IdType> template<DLDeviceType XPU, typename IdType>
std::pair<IdArray, IdArray> RandomWalk( std::pair<IdArray, IdArray> RandomWalk(
const HeteroGraphPtr hg, const HeteroGraphPtr hg,
...@@ -174,16 +387,25 @@ std::pair<IdArray, IdArray> RandomWalk( ...@@ -174,16 +387,25 @@ std::pair<IdArray, IdArray> RandomWalk(
const TypeArray metapath, const TypeArray metapath,
const std::vector<FloatArray> &prob) { const std::vector<FloatArray> &prob) {
// not support no-uniform choice now bool isUniform = true;
for (const auto &etype_prob : prob) { for (const auto &etype_prob : prob) {
if (!IsNullArray(etype_prob)) { if (!IsNullArray(etype_prob)) {
LOG(FATAL) << "Non-uniform choice is not supported in GPU."; isUniform = false;
break;
} }
} }
auto restart_prob = NDArray::Empty( auto restart_prob = NDArray::Empty(
{0}, DLDataType{kDLFloat, 32, 1}, DGLContext{XPU, 0}); {0}, DLDataType{kDLFloat, 32, 1}, DGLContext{XPU, 0});
return RandomWalkUniform<XPU, IdType>(hg, seeds, metapath, restart_prob); if (!isUniform) {
std::pair<IdArray, IdArray> ret;
ATEN_FLOAT_TYPE_SWITCH(prob[0]->dtype, FloatType, "probability", {
ret = RandomWalkBiased<XPU, FloatType, IdType>(hg, seeds, metapath, prob, restart_prob);
});
return ret;
} else {
return RandomWalkUniform<XPU, IdType>(hg, seeds, metapath, restart_prob);
}
} }
template<DLDeviceType XPU, typename IdType> template<DLDeviceType XPU, typename IdType>
...@@ -194,12 +416,14 @@ std::pair<IdArray, IdArray> RandomWalkWithRestart( ...@@ -194,12 +416,14 @@ std::pair<IdArray, IdArray> RandomWalkWithRestart(
const std::vector<FloatArray> &prob, const std::vector<FloatArray> &prob,
double restart_prob) { double restart_prob) {
// not support no-uniform choice now bool isUniform = true;
for (const auto &etype_prob : prob) { for (const auto &etype_prob : prob) {
if (!IsNullArray(etype_prob)) { if (!IsNullArray(etype_prob)) {
LOG(FATAL) << "Non-uniform choice is not supported in GPU."; isUniform = false;
break;
} }
} }
auto device_ctx = seeds->ctx; auto device_ctx = seeds->ctx;
auto restart_prob_array = NDArray::Empty( auto restart_prob_array = NDArray::Empty(
{1}, DLDataType{kDLFloat, 64, 1}, device_ctx); {1}, DLDataType{kDLFloat, 64, 1}, device_ctx);
...@@ -214,7 +438,16 @@ std::pair<IdArray, IdArray> RandomWalkWithRestart( ...@@ -214,7 +438,16 @@ std::pair<IdArray, IdArray> RandomWalkWithRestart(
restart_prob_array->dtype, stream); restart_prob_array->dtype, stream);
device->StreamSync(device_ctx, stream); device->StreamSync(device_ctx, stream);
return RandomWalkUniform<XPU, IdType>(hg, seeds, metapath, restart_prob_array); if (!isUniform) {
std::pair<IdArray, IdArray> ret;
ATEN_FLOAT_TYPE_SWITCH(prob[0]->dtype, FloatType, "probability", {
ret = RandomWalkBiased<XPU, FloatType, IdType>(
hg, seeds, metapath, prob, restart_prob_array);
});
return ret;
} else {
return RandomWalkUniform<XPU, IdType>(hg, seeds, metapath, restart_prob_array);
}
} }
template<DLDeviceType XPU, typename IdType> template<DLDeviceType XPU, typename IdType>
...@@ -225,14 +458,23 @@ std::pair<IdArray, IdArray> RandomWalkWithStepwiseRestart( ...@@ -225,14 +458,23 @@ std::pair<IdArray, IdArray> RandomWalkWithStepwiseRestart(
const std::vector<FloatArray> &prob, const std::vector<FloatArray> &prob,
FloatArray restart_prob) { FloatArray restart_prob) {
// not support no-uniform choice now bool isUniform = true;
for (const auto &etype_prob : prob) { for (const auto &etype_prob : prob) {
if (!IsNullArray(etype_prob)) { if (!IsNullArray(etype_prob)) {
LOG(FATAL) << "Non-uniform choice is not supported in GPU."; isUniform = false;
break;
} }
} }
return RandomWalkUniform<XPU, IdType>(hg, seeds, metapath, restart_prob); if (!isUniform) {
std::pair<IdArray, IdArray> ret;
ATEN_FLOAT_TYPE_SWITCH(prob[0]->dtype, FloatType, "probability", {
ret = RandomWalkBiased<XPU, FloatType, IdType>(hg, seeds, metapath, prob, restart_prob);
});
return ret;
} else {
return RandomWalkUniform<XPU, IdType>(hg, seeds, metapath, restart_prob);
}
} }
template<DLDeviceType XPU, typename IdxType> template<DLDeviceType XPU, typename IdxType>
......
...@@ -44,9 +44,14 @@ void CheckRandomWalkInputs( ...@@ -44,9 +44,14 @@ void CheckRandomWalkInputs(
} }
for (uint64_t i = 0; i < prob.size(); ++i) { for (uint64_t i = 0; i < prob.size(); ++i) {
FloatArray p = prob[i]; FloatArray p = prob[i];
CHECK_EQ(hg->Context(), p->ctx) << "Expected prob (" << p->ctx << ")" << " to have the same " \
<< "context as graph (" << hg->Context() << ").";
CHECK_FLOAT(p, "probability"); CHECK_FLOAT(p, "probability");
if (p.GetSize() != 0) if (p.GetSize() != 0) {
CHECK_EQ(hg->IsPinned(), p.IsPinned())
<< "The prob array should have the same pinning status as the graph";
CHECK_NDIM(p, 1, "probability"); CHECK_NDIM(p, 1, "probability");
}
} }
} }
......
...@@ -29,13 +29,15 @@ DGL_REGISTER_GLOBAL("rng._CAPI_SetSeed") ...@@ -29,13 +29,15 @@ DGL_REGISTER_GLOBAL("rng._CAPI_SetSeed")
} }
}); });
#ifdef DGL_USE_CUDA #ifdef DGL_USE_CUDA
auto* thr_entry = CUDAThreadEntry::ThreadLocal(); if (DeviceAPI::Get(kDLGPU)->IsAvailable()) {
if (!thr_entry->curand_gen) { auto* thr_entry = CUDAThreadEntry::ThreadLocal();
CURAND_CALL(curandCreateGenerator(&thr_entry->curand_gen, CURAND_RNG_PSEUDO_DEFAULT)); if (!thr_entry->curand_gen) {
CURAND_CALL(curandCreateGenerator(&thr_entry->curand_gen, CURAND_RNG_PSEUDO_DEFAULT));
}
CURAND_CALL(curandSetPseudoRandomGeneratorSeed(
thr_entry->curand_gen,
static_cast<uint64_t>(seed)));
} }
CURAND_CALL(curandSetPseudoRandomGeneratorSeed(
thr_entry->curand_gen,
static_cast<uint64_t>(seed)));
#endif // DGL_USE_CUDA #endif // DGL_USE_CUDA
}); });
......
...@@ -64,14 +64,22 @@ bool TCPSocket::Bind(const char * ip, int port) { ...@@ -64,14 +64,22 @@ bool TCPSocket::Bind(const char * ip, int port) {
SAI sa_server; SAI sa_server;
sa_server.sin_family = AF_INET; sa_server.sin_family = AF_INET;
sa_server.sin_port = htons(port); sa_server.sin_port = htons(port);
int retval = 0; int ret = 0;
ret = inet_pton(AF_INET, ip, &sa_server.sin_addr);
if (ret == 0) {
LOG(ERROR) << "Invalid IP: " << ip;
return false;
} else if (ret < 0) {
LOG(ERROR) << "Failed to convert [" << ip
<< "] to binary form, error: " << strerror(errno);
return false;
}
do { // retry if EINTR failure appears do { // retry if EINTR failure appears
if (0 < inet_pton(AF_INET, ip, &sa_server.sin_addr) && if (0 <= (ret = bind(socket_, reinterpret_cast<SA *>(&sa_server),
0 <= (retval = bind(socket_, reinterpret_cast<SA*>(&sa_server), sizeof(sa_server)))) {
sizeof(sa_server)))) {
return true; return true;
} }
} while (retval == -1 && errno == EINTR); } while (ret == -1 && errno == EINTR);
LOG(ERROR) << "Failed bind on " << ip << ":" << port << " , error: " << strerror(errno); LOG(ERROR) << "Failed bind on " << ip << ":" << port << " , error: " << strerror(errno);
return false; return false;
......
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
* \brief GPU specific API * \brief GPU specific API
*/ */
#include <dgl/runtime/device_api.h> #include <dgl/runtime/device_api.h>
#include <dgl/runtime/tensordispatch.h>
#include <dmlc/thread_local.h> #include <dmlc/thread_local.h>
#include <dgl/runtime/registry.h> #include <dgl/runtime/registry.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
...@@ -15,6 +15,23 @@ namespace runtime { ...@@ -15,6 +15,23 @@ namespace runtime {
class CUDADeviceAPI final : public DeviceAPI { class CUDADeviceAPI final : public DeviceAPI {
public: public:
CUDADeviceAPI() {
int count;
auto err = cudaGetDeviceCount(&count);
switch (err) {
case cudaSuccess:
break;
default:
count = 0;
cudaGetLastError();
}
is_available_ = count > 0;
}
bool IsAvailable() final {
return is_available_;
}
void SetDevice(DGLContext ctx) final { void SetDevice(DGLContext ctx) final {
CUDA_CALL(cudaSetDevice(ctx.device_id)); CUDA_CALL(cudaSetDevice(ctx.device_id));
} }
...@@ -224,11 +241,21 @@ class CUDADeviceAPI final : public DeviceAPI { ...@@ -224,11 +241,21 @@ class CUDADeviceAPI final : public DeviceAPI {
} }
void* AllocWorkspace(DGLContext ctx, size_t size, DGLType type_hint) final { void* AllocWorkspace(DGLContext ctx, size_t size, DGLType type_hint) final {
return CUDAThreadEntry::ThreadLocal()->pool.AllocWorkspace(ctx, size); // Redirect to PyTorch's allocator when available.
SetDevice(ctx);
TensorDispatcher* td = TensorDispatcher::Global();
if (td->IsAvailable())
return td->AllocWorkspace(size);
else
return CUDAThreadEntry::ThreadLocal()->pool.AllocWorkspace(ctx, size);
} }
void FreeWorkspace(DGLContext ctx, void* data) final { void FreeWorkspace(DGLContext ctx, void* data) final {
CUDAThreadEntry::ThreadLocal()->pool.FreeWorkspace(ctx, data); TensorDispatcher* td = TensorDispatcher::Global();
if (td->IsAvailable())
td->FreeWorkspace(data);
else
CUDAThreadEntry::ThreadLocal()->pool.FreeWorkspace(ctx, data);
} }
static const std::shared_ptr<CUDADeviceAPI>& Global() { static const std::shared_ptr<CUDADeviceAPI>& Global() {
......