"server/vscode:/vscode.git/clone" did not exist on "ef5136a745138896d080bf5bcac13377f7672b77"
Unverified Commit 2cf4bd0a authored by Minjie Wang's avatar Minjie Wang Committed by GitHub
Browse files

Merge branch 'master' into dist_part

parents 2e8ae9f9 d077d371
"""DGL distributed module contains classes and functions to support
distributed graph neural network training and inference in a cluster of
machines.
This includes a few submodules:
* distributed data structures including distributed graph, distributed tensor
and distributed embeddings.
* distributed sampling.
* distributed workload split at runtime.
* graph partition.
"""
import os
import sys
"""DGL distributed module"""
from .dist_graph import DistGraphServer, DistGraph, node_split, edge_split
from .dist_tensor import DistTensor
from .partition import partition_graph, load_partition, load_partition_feats, load_partition_book
......@@ -28,4 +13,4 @@ from .dist_context import initialize, exit_client
from .kvstore import KVServer, KVClient
from .server_state import ServerState
from .dist_dataloader import DistDataLoader
from .graph_services import sample_neighbors, sample_etype_neighbors, in_subgraph
from .graph_services import *
......@@ -5,3 +5,6 @@ MAX_QUEUE_SIZE = 20*1024*1024*1024
SERVER_EXIT = "server_exit"
SERVER_KEEP_ALIVE = "server_keep_alive"
DEFAULT_NTYPE = '_N'
DEFAULT_ETYPE = (DEFAULT_NTYPE, '_E', DEFAULT_NTYPE)
......@@ -2,16 +2,25 @@
import pickle
from abc import ABC
from ast import literal_eval
import numpy as np
from .. import backend as F
from ..base import NID, EID
from ..base import NID, EID, DGLError, dgl_warning
from .. import utils
from .shared_mem_utils import _to_shared_mem, _get_ndata_path, _get_edata_path, DTYPE_DICT
from .._ffi.ndarray import empty_shared_mem
from ..ndarray import exist_shared_mem_array
from ..partition import NDArrayPartition
from .id_map import IdMap
from .constants import DEFAULT_NTYPE, DEFAULT_ETYPE
def _str_to_tuple(s):
try:
ret = literal_eval(s)
except ValueError:
ret = s
return ret
def _move_metadata_to_shared_mem(graph_name, num_nodes, num_edges, part_id,
num_partitions, node_map, edge_map, is_range_part):
......@@ -270,7 +279,7 @@ class GraphPartitionBook(ABC):
----------
eids : tensor
global edge IDs
etype : str
etype : str or (str, str, str)
The edge type
Returns
......@@ -302,7 +311,7 @@ class GraphPartitionBook(ABC):
----------
partid : int
partition id
etype : str
etype : str or (str, str, str)
The edge type
Returns
......@@ -338,7 +347,7 @@ class GraphPartitionBook(ABC):
global edge IDs
partid : int
partition ID
etype : str
etype : str or (str, str, str)
The edge type
Returns
......@@ -367,6 +376,16 @@ class GraphPartitionBook(ABC):
"""Get the list of edge types
"""
@property
def canonical_etypes(self):
"""Get the list of canonical edge types
Returns
-------
list[(str, str, str)]
A list of canonical etypes
"""
@property
def is_homogeneous(self):
"""check if homogeneous
......@@ -424,7 +443,7 @@ class GraphPartitionBook(ABC):
----------
ids : tensor
Type-wise edge Ids
etype : str
etype : str or (str, str, str)
edge type
Returns
......@@ -528,16 +547,17 @@ class BasicPartitionBook(GraphPartitionBook):
"""
return self._partition_meta_data
def _num_nodes(self, ntype='_N'):
def _num_nodes(self, ntype=DEFAULT_NTYPE):
""" The total number of nodes
"""
assert ntype == '_N', 'Base partition book only supports homogeneous graph.'
assert ntype == DEFAULT_NTYPE, 'Base partition book only supports homogeneous graph.'
return len(self._nid2partid)
def _num_edges(self, etype='_E'):
def _num_edges(self, etype=DEFAULT_ETYPE):
""" The total number of edges
"""
assert etype == '_E', 'Base partition book only supports homogeneous graph.'
assert etype in (DEFAULT_ETYPE, DEFAULT_ETYPE[1]), \
'Base partition book only supports homogeneous graph.'
return len(self._eid2partid)
def map_to_per_ntype(self, ids):
......@@ -554,55 +574,59 @@ class BasicPartitionBook(GraphPartitionBook):
"""
return F.zeros((len(ids),), F.int32, F.cpu()), ids
def map_to_homo_nid(self, ids, ntype):
def map_to_homo_nid(self, ids, ntype=DEFAULT_NTYPE):
"""Map per-node-type IDs to global node IDs in the homogeneous format.
"""
assert ntype == '_N', 'Base partition book only supports homogeneous graph.'
assert ntype == DEFAULT_NTYPE, 'Base partition book only supports homogeneous graph.'
return ids
def map_to_homo_eid(self, ids, etype):
def map_to_homo_eid(self, ids, etype=DEFAULT_ETYPE):
"""Map per-edge-type IDs to global edge IDs in the homoenegeous format.
"""
assert etype == '_E', 'Base partition book only supports homogeneous graph.'
assert etype in (DEFAULT_ETYPE, DEFAULT_ETYPE[1]), \
'Base partition book only supports homogeneous graph.'
return ids
def nid2partid(self, nids, ntype='_N'):
def nid2partid(self, nids, ntype=DEFAULT_NTYPE):
"""From global node IDs to partition IDs
"""
assert ntype == '_N', 'Base partition book only supports homogeneous graph.'
assert ntype == DEFAULT_NTYPE, 'Base partition book only supports homogeneous graph.'
return F.gather_row(self._nid2partid, nids)
def eid2partid(self, eids, etype='_E'):
def eid2partid(self, eids, etype=DEFAULT_ETYPE):
"""From global edge IDs to partition IDs
"""
assert etype == '_E', 'Base partition book only supports homogeneous graph.'
assert etype in (DEFAULT_ETYPE, DEFAULT_ETYPE[1]), \
'Base partition book only supports homogeneous graph.'
return F.gather_row(self._eid2partid, eids)
def partid2nids(self, partid, ntype='_N'):
def partid2nids(self, partid, ntype=DEFAULT_NTYPE):
"""From partition id to global node IDs
"""
assert ntype == '_N', 'Base partition book only supports homogeneous graph.'
assert ntype == DEFAULT_NTYPE, 'Base partition book only supports homogeneous graph.'
return self._partid2nids[partid]
def partid2eids(self, partid, etype='_E'):
def partid2eids(self, partid, etype=DEFAULT_ETYPE):
"""From partition id to global edge IDs
"""
assert etype == '_E', 'Base partition book only supports homogeneous graph.'
assert etype in (DEFAULT_ETYPE, DEFAULT_ETYPE[1]), \
'Base partition book only supports homogeneous graph.'
return self._partid2eids[partid]
def nid2localnid(self, nids, partid, ntype='_N'):
def nid2localnid(self, nids, partid, ntype=DEFAULT_NTYPE):
"""Get local node IDs within the given partition.
"""
assert ntype == '_N', 'Base partition book only supports homogeneous graph.'
assert ntype == DEFAULT_NTYPE, 'Base partition book only supports homogeneous graph.'
if partid != self._part_id:
raise RuntimeError('Now GraphPartitionBook does not support \
getting remote tensor of nid2localnid.')
return F.gather_row(self._nidg2l[partid], nids)
def eid2localeid(self, eids, partid, etype='_E'):
def eid2localeid(self, eids, partid, etype=DEFAULT_ETYPE):
"""Get the local edge ids within the given partition.
"""
assert etype == '_E', 'Base partition book only supports homogeneous graph.'
assert etype in (DEFAULT_ETYPE, DEFAULT_ETYPE[1]), \
'Base partition book only supports homogeneous graph.'
if partid != self._part_id:
raise RuntimeError('Now GraphPartitionBook does not support \
getting remote tensor of eid2localeid.')
......@@ -618,13 +642,24 @@ class BasicPartitionBook(GraphPartitionBook):
def ntypes(self):
"""Get the list of node types
"""
return ['_N']
return [DEFAULT_NTYPE]
@property
def etypes(self):
"""Get the list of edge types
"""
return ['_E']
return [DEFAULT_ETYPE[1]]
@property
def canonical_etypes(self):
    """The canonical edge types of the homogeneous graph.

    Returns
    -------
    list[(str, str, str)]
        A single-element list holding the default canonical etype.
    """
    return [DEFAULT_ETYPE]
class RangePartitionBook(GraphPartitionBook):
......@@ -632,7 +667,8 @@ class RangePartitionBook(GraphPartitionBook):
This partition book is used if the nodes and edges of a graph partition are assigned
with contiguous IDs. It uses very small amount of memory to store the partition
information.
information. Canonical etypes are available only when the keys of argument ``etypes``
are canonical etypes.
Parameters
----------
......@@ -646,7 +682,7 @@ class RangePartitionBook(GraphPartitionBook):
the number of partitions. Each row has two integers: the starting and the ending IDs
for a particular node type in a partition. For example, all nodes of type ``"T"`` in
partition ``i`` has ID range ``node_map["T"][i][0]`` to ``node_map["T"][i][1]``.
edge_map : dict[str, Tensor]
edge_map : dict[str, Tensor] or dict[(str, str, str), Tensor]
Global edge ID ranges within partitions for each edge type. The key is the edge type
name in string. The value is a tensor of shape :math:`(K, 2)`, where :math:`K` is
the number of partitions. Each row has two integers: the starting and the ending IDs
......@@ -654,8 +690,13 @@ class RangePartitionBook(GraphPartitionBook):
partition ``i`` has ID range ``edge_map["T"][i][0]`` to ``edge_map["T"][i][1]``.
ntypes : dict[str, int]
map ntype strings to ntype IDs.
etypes : dict[str, int]
etypes : dict[str, int] or dict[(str, str, str), int]
map etype strings to etype IDs.
.. deprecated:: 0.9.1
Single string format for keys of ``edge_map`` and ``etypes`` is deprecated.
``(str, str, str)`` will be the only format supported in the future.
"""
def __init__(self, part_id, num_parts, node_map, edge_map, ntypes, etypes):
assert part_id >= 0, 'part_id cannot be a negative number.'
......@@ -664,14 +705,34 @@ class RangePartitionBook(GraphPartitionBook):
self._num_partitions = num_parts
self._ntypes = [None] * len(ntypes)
self._etypes = [None] * len(etypes)
self._canonical_etypes = [None] * len(etypes)
# map etypes to canonical ones
self._etype2canonical = {}
for ntype in ntypes:
ntype_id = ntypes[ntype]
self._ntypes[ntype_id] = ntype
assert all(ntype is not None for ntype in self._ntypes), \
"The node types have invalid IDs."
for etype in etypes:
etype_id = etypes[etype]
self._etypes[etype_id] = etype
for etype, etype_id in etypes.items():
if isinstance(etype, tuple):
assert len(etype) == 3, \
'Canonical etype should be in format of (str, str, str).'
c_etype = etype
etype = etype[1]
self._etypes[etype_id] = etype
self._canonical_etypes[etype_id] = c_etype
if etype in self._etype2canonical:
# If one etype maps to multiple canonical etypes, empty
# tuple is used to indicate such ambiguity caused by etype.
# See more details in self._to_canonical_etype().
self._etype2canonical[etype] = tuple()
else:
self._etype2canonical[etype] = c_etype
else:
dgl_warning(
"Etype with 'str' format is deprecated. Please use '(str, str, str)'.")
self._etypes[etype_id] = etype
self._canonical_etypes[etype_id] = None
assert all(etype is not None for etype in self._etypes), \
"The edge types have invalid IDs."
......@@ -686,6 +747,7 @@ class RangePartitionBook(GraphPartitionBook):
self._typed_max_node_ids = {}
max_node_map = np.zeros((num_parts,), dtype=np.int64)
for key in node_map:
assert key in ntypes, 'Unexpected ntype: {}.'.format(key)
if not isinstance(node_map[key], np.ndarray):
node_map[key] = F.asnumpy(node_map[key])
assert node_map[key].shape == (num_parts, 2)
......@@ -705,6 +767,7 @@ class RangePartitionBook(GraphPartitionBook):
self._typed_max_edge_ids = {}
max_edge_map = np.zeros((num_parts,), dtype=np.int64)
for key in edge_map:
assert key in etypes, 'Unexpected etype: {}.'.format(key)
if not isinstance(edge_map[key], np.ndarray):
edge_map[key] = F.asnumpy(edge_map[key])
assert edge_map[key].shape == (num_parts, 2)
......@@ -750,7 +813,8 @@ class RangePartitionBook(GraphPartitionBook):
eid_range = [None] * len(self.etypes)
for i, etype in enumerate(self.etypes):
eid_range[i] = (etype, self._typed_eid_range[etype])
c_etype = self._to_canonical_etype(etype)
eid_range[i] = (c_etype, self._typed_eid_range[c_etype])
eid_range_pickle = list(pickle.dumps(eid_range))
self._meta = _move_metadata_to_shared_mem(graph_name,
......@@ -767,21 +831,22 @@ class RangePartitionBook(GraphPartitionBook):
return self._num_partitions
def _num_nodes(self, ntype='_N'):
def _num_nodes(self, ntype=DEFAULT_NTYPE):
""" The total number of nodes
"""
if ntype == '_N':
if ntype == DEFAULT_NTYPE:
return int(self._max_node_ids[-1])
else:
return int(self._typed_max_node_ids[ntype][-1])
def _num_edges(self, etype='_E'):
def _num_edges(self, etype=DEFAULT_ETYPE):
""" The total number of edges
"""
if etype == '_E':
if etype in (DEFAULT_ETYPE, DEFAULT_ETYPE[1]):
return int(self._max_edge_ids[-1])
else:
return int(self._typed_max_edge_ids[etype][-1])
c_etype = self._to_canonical_etype(etype)
return int(self._typed_max_edge_ids[c_etype][-1])
def metadata(self):
"""Return the partition meta data.
......@@ -816,40 +881,42 @@ class RangePartitionBook(GraphPartitionBook):
"""Map per-edge-type IDs to global edge IDs in the homoenegeous format.
"""
ids = utils.toindex(ids).tousertensor()
partids = self.eid2partid(ids, etype)
typed_max_eids = F.zerocopy_from_numpy(self._typed_max_edge_ids[etype])
c_etype = self._to_canonical_etype(etype)
partids = self.eid2partid(ids, c_etype)
typed_max_eids = F.zerocopy_from_numpy(self._typed_max_edge_ids[c_etype])
end_diff = F.gather_row(typed_max_eids, partids) - ids
typed_eid_range = F.zerocopy_from_numpy(self._typed_eid_range[etype][:, 1])
typed_eid_range = F.zerocopy_from_numpy(self._typed_eid_range[c_etype][:, 1])
return F.gather_row(typed_eid_range, partids) - end_diff
def nid2partid(self, nids, ntype='_N'):
def nid2partid(self, nids, ntype=DEFAULT_NTYPE):
"""From global node IDs to partition IDs
"""
nids = utils.toindex(nids)
if ntype == '_N':
if ntype == DEFAULT_NTYPE:
ret = np.searchsorted(self._max_node_ids, nids.tonumpy(), side='right')
else:
ret = np.searchsorted(self._typed_max_node_ids[ntype], nids.tonumpy(), side='right')
ret = utils.toindex(ret)
return ret.tousertensor()
def eid2partid(self, eids, etype='_E'):
def eid2partid(self, eids, etype=DEFAULT_ETYPE):
"""From global edge IDs to partition IDs
"""
eids = utils.toindex(eids)
if etype == '_E':
if etype in (DEFAULT_ETYPE, DEFAULT_ETYPE[1]):
ret = np.searchsorted(self._max_edge_ids, eids.tonumpy(), side='right')
else:
ret = np.searchsorted(self._typed_max_edge_ids[etype], eids.tonumpy(), side='right')
c_etype = self._to_canonical_etype(etype)
ret = np.searchsorted(self._typed_max_edge_ids[c_etype], eids.tonumpy(), side='right')
ret = utils.toindex(ret)
return ret.tousertensor()
def partid2nids(self, partid, ntype='_N'):
def partid2nids(self, partid, ntype=DEFAULT_NTYPE):
"""From partition ID to global node IDs
"""
# TODO do we need to cache it?
if ntype == '_N':
if ntype == DEFAULT_NTYPE:
start = self._max_node_ids[partid - 1] if partid > 0 else 0
end = self._max_node_ids[partid]
return F.arange(start, end)
......@@ -859,21 +926,22 @@ class RangePartitionBook(GraphPartitionBook):
return F.arange(start, end)
def partid2eids(self, partid, etype='_E'):
def partid2eids(self, partid, etype=DEFAULT_ETYPE):
"""From partition ID to global edge IDs
"""
# TODO do we need to cache it?
if etype == '_E':
if etype in (DEFAULT_ETYPE, DEFAULT_ETYPE[1]):
start = self._max_edge_ids[partid - 1] if partid > 0 else 0
end = self._max_edge_ids[partid]
return F.arange(start, end)
else:
start = self._typed_max_edge_ids[etype][partid - 1] if partid > 0 else 0
end = self._typed_max_edge_ids[etype][partid]
c_etype = self._to_canonical_etype(etype)
start = self._typed_max_edge_ids[c_etype][partid - 1] if partid > 0 else 0
end = self._typed_max_edge_ids[c_etype][partid]
return F.arange(start, end)
def nid2localnid(self, nids, partid, ntype='_N'):
def nid2localnid(self, nids, partid, ntype=DEFAULT_NTYPE):
"""Get local node IDs within the given partition.
"""
if partid != self._partid:
......@@ -882,14 +950,14 @@ class RangePartitionBook(GraphPartitionBook):
nids = utils.toindex(nids)
nids = nids.tousertensor()
if ntype == '_N':
if ntype == DEFAULT_NTYPE:
start = self._max_node_ids[partid - 1] if partid > 0 else 0
else:
start = self._typed_max_node_ids[ntype][partid - 1] if partid > 0 else 0
return nids - int(start)
def eid2localeid(self, eids, partid, etype='_E'):
def eid2localeid(self, eids, partid, etype=DEFAULT_ETYPE):
"""Get the local edge IDs within the given partition.
"""
if partid != self._partid:
......@@ -898,10 +966,11 @@ class RangePartitionBook(GraphPartitionBook):
eids = utils.toindex(eids)
eids = eids.tousertensor()
if etype == '_E':
if etype in (DEFAULT_ETYPE, DEFAULT_ETYPE[1]):
start = self._max_edge_ids[partid - 1] if partid > 0 else 0
else:
start = self._typed_max_edge_ids[etype][partid - 1] if partid > 0 else 0
c_etype = self._to_canonical_etype(etype)
start = self._typed_max_edge_ids[c_etype][partid - 1] if partid > 0 else 0
return eids - int(start)
......@@ -923,6 +992,41 @@ class RangePartitionBook(GraphPartitionBook):
"""
return self._etypes
@property
def canonical_etypes(self):
    """The canonical edge types known to this partition book.

    Returns
    -------
    list[(str, str, str)] or list[None]
        One entry per etype ID. Entries are ``None`` when the book was
        constructed with plain-string ``edge_map``/``etypes`` keys, in
        which case canonical etypes are unavailable.
    """
    return self._canonical_etypes
def _to_canonical_etype(self, etype):
    """Resolve ``etype`` to its canonical ``(srctype, etype, dsttype)`` form.

    When canonical etypes are unavailable (string-keyed construction),
    the input is returned unchanged.
    """
    if isinstance(etype, tuple):
        # Already canonical; only validate that it is a known etype.
        if etype in self.canonical_etypes:
            return etype
        raise DGLError('Edge type "{}" does not exist.'.format(etype))
    if not self._etype2canonical:
        # canonical etype is not available, no conversion is applied.
        # This is the case that 'etypes' passed in when instantiating
        # are in format of str instead of (str, str, str).
        # [TODO] Deprecate support for str etypes.
        return etype
    canonical = self._etype2canonical.get(etype, None)
    if canonical is None:
        raise DGLError('Edge type "{}" does not exist.'.format(etype))
    if not canonical:
        # An empty tuple marks an etype shared by several canonical etypes.
        raise DGLError('Edge type "%s" is ambiguous. Please use canonical edge type '
                       'in the form of (srctype, etype, dsttype)' % etype)
    return canonical
NODE_PART_POLICY = 'node'
EDGE_PART_POLICY = 'edge'
......@@ -949,12 +1053,13 @@ class PartitionPolicy(object):
assert policy_str in (EDGE_PART_POLICY, NODE_PART_POLICY), \
'policy_str must contain \'edge\' or \'node\'.'
if NODE_PART_POLICY == policy_str:
policy_str = NODE_PART_POLICY + ":_N"
policy_str = NODE_PART_POLICY + ":" + DEFAULT_NTYPE
else:
policy_str = EDGE_PART_POLICY + ":_E"
policy_str = EDGE_PART_POLICY + ":" + DEFAULT_ETYPE[1]
self._policy_str = policy_str
self._part_id = partition_book.partid
self._partition_book = partition_book
self._type_name = _str_to_tuple(self.policy_str[5:])
@property
def policy_str(self):
......@@ -967,6 +1072,17 @@ class PartitionPolicy(object):
"""
return self._policy_str
@property
def type_name(self):
"""Get the type name: ntype or etype
Returns
-------
str or (str, str, str)
The ntype or etype.
"""
return self._type_name
@property
def part_id(self):
"""Get partition ID
......@@ -992,8 +1108,8 @@ class PartitionPolicy(object):
def get_data_name(self, name):
"""Get HeteroDataName
"""
is_node = NODE_PART_POLICY in self._policy_str
return HeteroDataName(is_node, self._policy_str[5:], name)
is_node = NODE_PART_POLICY in self.policy_str
return HeteroDataName(is_node, self.type_name, name)
def to_local(self, id_tensor):
"""Mapping global ID to local ID.
......@@ -1008,12 +1124,12 @@ class PartitionPolicy(object):
tensor
local ID tensor
"""
if EDGE_PART_POLICY in self._policy_str:
return self._partition_book.eid2localeid(id_tensor, self._part_id, self._policy_str[5:])
elif NODE_PART_POLICY in self._policy_str:
return self._partition_book.nid2localnid(id_tensor, self._part_id, self._policy_str[5:])
if EDGE_PART_POLICY in self.policy_str:
return self._partition_book.eid2localeid(id_tensor, self._part_id, self.type_name)
elif NODE_PART_POLICY in self.policy_str:
return self._partition_book.nid2localnid(id_tensor, self._part_id, self.type_name)
else:
raise RuntimeError('Cannot support policy: %s ' % self._policy_str)
raise RuntimeError('Cannot support policy: %s ' % self.policy_str)
def to_partid(self, id_tensor):
"""Mapping global ID to partition ID.
......@@ -1028,12 +1144,12 @@ class PartitionPolicy(object):
tensor
partition ID
"""
if EDGE_PART_POLICY in self._policy_str:
return self._partition_book.eid2partid(id_tensor, self._policy_str[5:])
elif NODE_PART_POLICY in self._policy_str:
return self._partition_book.nid2partid(id_tensor, self._policy_str[5:])
if EDGE_PART_POLICY in self.policy_str:
return self._partition_book.eid2partid(id_tensor, self.type_name)
elif NODE_PART_POLICY in self.policy_str:
return self._partition_book.nid2partid(id_tensor, self.type_name)
else:
raise RuntimeError('Cannot support policy: %s ' % self._policy_str)
raise RuntimeError('Cannot support policy: %s ' % self.policy_str)
def get_part_size(self):
"""Get data size of current partition.
......@@ -1043,12 +1159,12 @@ class PartitionPolicy(object):
int
data size
"""
if EDGE_PART_POLICY in self._policy_str:
return len(self._partition_book.partid2eids(self._part_id, self._policy_str[5:]))
elif NODE_PART_POLICY in self._policy_str:
return len(self._partition_book.partid2nids(self._part_id, self._policy_str[5:]))
if EDGE_PART_POLICY in self.policy_str:
return len(self._partition_book.partid2eids(self._part_id, self.type_name))
elif NODE_PART_POLICY in self.policy_str:
return len(self._partition_book.partid2nids(self._part_id, self.type_name))
else:
raise RuntimeError('Cannot support policy: %s ' % self._policy_str)
raise RuntimeError('Cannot support policy: %s ' % self.policy_str)
def get_size(self):
"""Get the full size of the data.
......@@ -1058,24 +1174,26 @@ class PartitionPolicy(object):
int
data size
"""
if EDGE_PART_POLICY in self._policy_str:
return self._partition_book._num_edges(self._policy_str[5:])
elif NODE_PART_POLICY in self._policy_str:
return self._partition_book._num_nodes(self._policy_str[5:])
if EDGE_PART_POLICY in self.policy_str:
return self._partition_book._num_edges(self.type_name)
elif NODE_PART_POLICY in self.policy_str:
return self._partition_book._num_nodes(self.type_name)
else:
raise RuntimeError('Cannot support policy: %s ' % self._policy_str)
raise RuntimeError('Cannot support policy: %s ' % self.policy_str)
class NodePartitionPolicy(PartitionPolicy):
'''Partition policy for nodes.
'''
def __init__(self, partition_book, ntype='_N'):
super(NodePartitionPolicy, self).__init__(NODE_PART_POLICY + ':' + ntype, partition_book)
def __init__(self, partition_book, ntype=DEFAULT_NTYPE):
super(NodePartitionPolicy, self).__init__(
NODE_PART_POLICY + ':' + ntype, partition_book)
class EdgePartitionPolicy(PartitionPolicy):
'''Partition policy for edges.
'''
def __init__(self, partition_book, etype='_E'):
super(EdgePartitionPolicy, self).__init__(EDGE_PART_POLICY + ':' + etype, partition_book)
def __init__(self, partition_book, etype=DEFAULT_ETYPE):
super(EdgePartitionPolicy, self).__init__(
EDGE_PART_POLICY + ':' + str(etype), partition_book)
class HeteroDataName(object):
''' The data name in a heterogeneous graph.
......@@ -1089,16 +1207,22 @@ class HeteroDataName(object):
----------
is_node : bool
Indicate whether it's node data or edge data.
entity_type : str
entity_type : str or (str, str, str)
The type of the node/edge.
data_name : str
The name of the data.
'''
def __init__(self, is_node, entity_type, data_name):
self.policy_str = NODE_PART_POLICY if is_node else EDGE_PART_POLICY
self.policy_str = self.policy_str + ':' + entity_type
self._policy = NODE_PART_POLICY if is_node else EDGE_PART_POLICY
self._entity_type = entity_type
self.data_name = data_name
@property
def policy_str(self):
''' concatenate policy and entity type into string
'''
return self._policy + ':' + str(self.get_type())
def is_node(self):
''' Is this the name of node data
'''
......@@ -1114,7 +1238,7 @@ class HeteroDataName(object):
This is only meaningful in a heterogeneous graph.
In homogeneous graph, type is '_N' for a node and '_E' for an edge.
'''
return self.policy_str[5:]
return self._entity_type
def get_name(self):
''' The name of the data.
......@@ -1148,4 +1272,4 @@ def parse_hetero_data_name(name):
assert len(names) == 3, '{} is not a valid heterograph data name'.format(name)
assert names[0] in (NODE_PART_POLICY, EDGE_PART_POLICY), \
'{} is not a valid heterograph data name'.format(name)
return HeteroDataName(names[0] == NODE_PART_POLICY, names[1], names[2])
return HeteroDataName(names[0] == NODE_PART_POLICY, _str_to_tuple(names[1]), names[2])
......@@ -12,7 +12,10 @@ from ..base import NID, EID
from ..utils import toindex
from .. import backend as F
__all__ = ['sample_neighbors', 'in_subgraph', 'find_edges']
__all__ = [
'sample_neighbors', 'sample_etype_neighbors',
'in_subgraph', 'find_edges'
]
SAMPLING_SERVICE_ID = 6657
INSUBGRAPH_SERVICE_ID = 6658
......
......@@ -173,7 +173,7 @@ def load_partition_feats(part_config, part_id):
return node_feats, edge_feats
def load_partition_book(part_config, part_id, graph=None):
''' Load a graph partition book from the partition config file.
'''Load a graph partition book from the partition config file.
Parameters
----------
......
......@@ -4,6 +4,8 @@ from __future__ import absolute_import, division
from collections.abc import Mapping, Iterable, Sequence
from collections import defaultdict
from functools import wraps
import glob
import os
import numpy as np
from ..base import DGLError, dgl_warning, NID, EID
......@@ -914,6 +916,46 @@ def set_num_threads(num_threads):
"""
_CAPI_DGLSetOMPThreads(num_threads)
def get_num_threads():
    """Return the number of OMP threads used by the process.

    Returns
    -------
    int
        The current OMP thread count reported by the DGL C backend.
    """
    num_threads = _CAPI_DGLGetOMPThreads()
    return num_threads
def get_numa_nodes_cores():
    """Return NUMA topology info read from Linux sysfs.

    Returns
    -------
    dict
        ``{node_id: [(core_id, [sibling_thread_id, ...]), ...], ...}`` with
        both the core list and each sibling list sorted, e.g.
        ``{0: [(0, [0, 4]), (1, [1, 5])], 1: [(2, [2, 6]), (3, [3, 7])]}``.
        Returns ``{}`` when the info is unavailable (e.g. non-Linux systems)
        or cannot be parsed.
    """
    numa_node_paths = glob.glob('/sys/devices/system/node/node[0-9]*')
    if not numa_node_paths:
        # No sysfs NUMA layout; topology cannot be determined.
        return {}
    nodes = {}
    try:
        for node_path in numa_node_paths:
            numa_node_id = int(os.path.basename(node_path)[4:])
            # Map physical core ID -> CPU (hardware thread) IDs on that core.
            thread_siblings = {}
            for cpu_dir in glob.glob(os.path.join(node_path, 'cpu[0-9]*')):
                cpu_id = int(os.path.basename(cpu_dir)[3:])
                with open(os.path.join(cpu_dir, 'topology', 'core_id')) as core_id_file:
                    core_id = int(core_id_file.read().strip())
                thread_siblings.setdefault(core_id, []).append(cpu_id)
            nodes[numa_node_id] = sorted(
                (k, sorted(v)) for k, v in thread_siblings.items())
    except (OSError, ValueError, IndexError):
        # Note: IOError is an alias of OSError since Python 3.3, so OSError
        # already covers file-access failures. Best-effort: warn and give up.
        dgl_warning('Failed to read NUMA info')
        return {}
    return nodes
def alias_func(func):
"""Return an alias function with proper docstring."""
@wraps(func)
......
......@@ -548,11 +548,11 @@ __global__ void SpMMCsrKernel(
const int64_t* __restrict__ ebcast_off,
int64_t ufeat_len, int64_t efeat_len, int64_t out_len) {
// SPMM with CSR.
int ty = blockIdx.y * blockDim.y + threadIdx.y;
const Idx stride_y = blockDim.y * gridDim.y;
const int stride_x = blockDim.x * gridDim.x;
int ty = blockIdx.x * blockDim.y + threadIdx.y;
const Idx stride_y = blockDim.y * gridDim.x;
const int stride_x = blockDim.x * gridDim.y;
while (ty < num_rows) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
int tx = blockIdx.y * blockDim.x + threadIdx.x;
while (tx < out_len) {
DType local_accum = ReduceOp::zero();
Idx local_argu = 0, local_arge = 0;
......@@ -759,8 +759,8 @@ void SpMMCsr(
rhs_len = bcast.rhs_len;
const int ntx = FindNumThreads(len);
const int nty = CUDA_MAX_NUM_THREADS / ntx;
const int nbx = (len + ntx - 1) / ntx;
const int nby = FindNumBlocks<'y'>((csr.num_rows + nty - 1) / nty);
const int nby= (len + ntx - 1) / ntx;
const int nbx = FindNumBlocks<'x'>((csr.num_rows + nty - 1) / nty);
//LOG(INFO) << "nblks=(" << nbx << ", " << nby << ") nthrs=(" << ntx << ", " << nty << ")";
const dim3 nblks(nbx, nby);
const dim3 nthrs(ntx, nty);
......
......@@ -18,7 +18,8 @@ namespace cuda {
#define CUDA_MAX_NUM_BLOCKS_X 0x7FFFFFFF
#define CUDA_MAX_NUM_BLOCKS_Y 0xFFFF
#define CUDA_MAX_NUM_BLOCKS_Z 0xFFFF
#define CUDA_MAX_NUM_THREADS 1024
// The max number of threads per block
#define CUDA_MAX_NUM_THREADS 256
#ifdef USE_FP16
#define SWITCH_BITS(bits, DType, ...) \
......
......@@ -197,10 +197,15 @@ class CUDADeviceAPI final : public DeviceAPI {
* not just the one that performed the allocation
*/
// Page-lock (pin) a host memory range so the GPU can access it directly.
// NOTE(review): per the (partially visible) comment above, any context may
// unpin, not just the one that pinned — confirm against the full class doc.
void PinData(void* ptr, size_t nbytes) {
  // prevent users from pinning empty tensors or graphs
  // (cudaHostRegister errors out on a null pointer or zero-size range)
  if (ptr == nullptr || nbytes == 0)
    return;
  CUDA_CALL(cudaHostRegister(ptr, nbytes, cudaHostRegisterDefault));
}
// Undo PinData: unregister a previously page-locked host range.
void UnpinData(void* ptr) {
  // Nothing was pinned for a null pointer; ignore it, mirroring PinData.
  if (ptr == nullptr)
    return;
  CUDA_CALL(cudaHostUnregister(ptr));
}
......
......@@ -26,6 +26,10 @@ DGL_REGISTER_GLOBAL("utils.internal._CAPI_DGLSetOMPThreads")
omp_set_num_threads(num_threads);
});
// Expose the current OMP max-thread count to the Python FFI
// (counterpart of the _CAPI_DGLSetOMPThreads setter registered above).
DGL_REGISTER_GLOBAL("utils.internal._CAPI_DGLGetOMPThreads")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
  *rv = omp_get_max_threads();
});
DGL_REGISTER_GLOBAL("utils.checks._CAPI_DGLCOOIsSorted")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
......
......@@ -1008,52 +1008,70 @@ def test_pin_memory_(idtype):
g = g.to(F.cpu())
assert not g.is_pinned()
if F.is_cuda_available():
# unpin an unpinned CPU graph, directly return
g.unpin_memory_()
assert not g.is_pinned()
assert g.device == F.cpu()
# unpin an unpinned CPU graph, directly return
g.unpin_memory_()
assert not g.is_pinned()
assert g.device == F.cpu()
# pin a CPU graph
g.pin_memory_()
assert g.is_pinned()
assert g.device == F.cpu()
assert F.context(g.nodes['user'].data['h']) == F.cpu()
assert F.context(g.nodes['game'].data['i']) == F.cpu()
assert F.context(g.edges['plays'].data['e']) == F.cpu()
for ntype in g.ntypes:
assert F.context(g.batch_num_nodes(ntype)) == F.cpu()
for etype in g.canonical_etypes:
assert F.context(g.batch_num_edges(etype)) == F.cpu()
# pin a CPU graph
g.pin_memory_()
assert g.is_pinned()
assert g.device == F.cpu()
assert g.nodes['user'].data['h'].is_pinned()
assert g.nodes['game'].data['i'].is_pinned()
assert g.edges['plays'].data['e'].is_pinned()
assert F.context(g.nodes['user'].data['h']) == F.cpu()
assert F.context(g.nodes['game'].data['i']) == F.cpu()
assert F.context(g.edges['plays'].data['e']) == F.cpu()
for ntype in g.ntypes:
assert F.context(g.batch_num_nodes(ntype)) == F.cpu()
for etype in g.canonical_etypes:
assert F.context(g.batch_num_edges(etype)) == F.cpu()
# it's fine to clone with new formats, but new graphs are not pinned
# >>> g.formats()
# {'created': ['coo'], 'not created': ['csr', 'csc']}
assert not g.formats('csc').is_pinned()
assert not g.formats('csr').is_pinned()
# 'coo' formats is already created and thus not cloned
assert g.formats('coo').is_pinned()
# pin a pinned graph, directly return
g.pin_memory_()
assert g.is_pinned()
assert g.device == F.cpu()
# it's fine to clone with new formats, but new graphs are not pinned
# >>> g.formats()
# {'created': ['coo'], 'not created': ['csr', 'csc']}
assert not g.formats('csc').is_pinned()
assert not g.formats('csr').is_pinned()
# 'coo' formats is already created and thus not cloned
assert g.formats('coo').is_pinned()
# pin a pinned graph, directly return
g.pin_memory_()
assert g.is_pinned()
assert g.device == F.cpu()
# unpin a pinned graph
g.unpin_memory_()
assert not g.is_pinned()
assert g.device == F.cpu()
# unpin a pinned graph
g.unpin_memory_()
assert not g.is_pinned()
assert g.device == F.cpu()
g1 = g.to(F.cuda())
g1 = g.to(F.cuda())
# unpin an unpinned GPU graph, directly return
g1.unpin_memory_()
assert not g1.is_pinned()
assert g1.device == F.cuda()
# unpin an unpinned GPU graph, directly return
g1.unpin_memory_()
assert not g1.is_pinned()
assert g1.device == F.cuda()
# error pinning a GPU graph
with pytest.raises(DGLError):
g1.pin_memory_()
# error pinning a GPU graph
with pytest.raises(DGLError):
g1.pin_memory_()
# test pin empty homograph
g2 = dgl.graph(([], []))
g2.pin_memory_()
assert g2.is_pinned()
g2.unpin_memory_()
assert not g2.is_pinned()
# test pin heterograph with 0 edge of one relation type
g3 = dgl.heterograph({
('a','b','c'): ([0, 1], [1, 2]),
('c','d','c'): ([], [])}).astype(idtype)
g3.pin_memory_()
assert g3.is_pinned()
g3.unpin_memory_()
assert not g3.is_pinned()
@parametrize_idtype
def test_convert_bound(idtype):
......
......@@ -25,7 +25,7 @@ def test_pin_unpin():
F.to_dgl_nd(t_pin).unpin_memory_()
else:
with pytest.raises(dgl.DGLError):
# tensorflow and mxnet should throw an erro
# tensorflow and mxnet should throw an error
dgl.utils.pin_memory_inplace(t)
if __name__ == "__main__":
......
......@@ -3,14 +3,12 @@ import sys
import os
import numpy as np
from scipy import sparse as spsp
from numpy.testing import assert_array_equal
from dgl.heterograph_index import create_unitgraph_from_coo
from dgl.distributed import partition_graph, load_partition, load_partition_feats
from dgl.distributed.graph_partition_book import BasicPartitionBook, RangePartitionBook, \
NodePartitionPolicy, EdgePartitionPolicy, HeteroDataName
from dgl import function as fn
import backend as F
import unittest
import pickle
import random
import tempfile
def _get_inner_node_mask(graph, ntype_id):
......@@ -426,8 +424,100 @@ def test_hetero_partition():
check_hetero_partition(hg, 'random')
check_hetero_partition(hg, 'metis', 4, 8, load_feats=False)
@unittest.skipIf(os.name == 'nt', reason='Do not support windows yet')
def test_BasicPartitionBook():
    """Check etype metadata and partition policies of ``BasicPartitionBook``
    built from a random homogeneous graph split into two parts."""
    n_nodes, n_edges = 1000, 5000
    n_parts = 2
    # Randomly assign every node/edge to one of the two partitions.
    nid2part = np.random.choice(n_parts, n_nodes)
    eid2part = np.random.choice(n_parts, n_edges)
    g = dgl.rand_graph(n_nodes, n_edges)
    # Full node subgraph of the same graph — presumably taken so the graph
    # carries the NID/EID mappings the partition book expects; verify
    # against BasicPartitionBook's requirements.
    g = dgl.node_subgraph(g, F.arange(0, g.num_nodes()))
    book = BasicPartitionBook(0, n_parts, nid2part, eid2part, g)
    # A homogeneous graph exposes only the default edge type.
    assert book.etypes == ['_E']
    assert book.canonical_etypes == [('_N', '_E', '_N')]
    # Each policy echoes back the type name it was created with.
    assert NodePartitionPolicy(book, '_N').type_name == '_N'
    assert EdgePartitionPolicy(book, '_E').type_name == '_E'
@unittest.skipIf(os.name == 'nt', reason='Do not support windows yet')
def test_RangePartitionBook():
    """Check etype handling of ``RangePartitionBook`` for a homogeneous
    graph and for heterogeneous graphs initialized with either plain edge
    type names or canonical edge types.

    Fix over the previous version: the expected-exception checks used bare
    ``except:`` clauses, which also swallow ``KeyboardInterrupt`` and
    ``SystemExit``; they now catch ``Exception`` only, via a shared helper.
    """
    def _assert_raises(fn, *args):
        # Call ``fn`` and assert that it raises. Catch ``Exception`` (not
        # bare ``except:``) so interpreter-exit signals still propagate.
        raised = False
        try:
            fn(*args)
        except Exception:
            raised = True
        assert raised

    part_id = 0
    num_parts = 2

    # --- homogeneous: default '_N'/'_E' types -------------------------
    node_map = {'_N': F.tensor([[0, 1000], [1000, 2000]])}
    edge_map = {'_E': F.tensor([[0, 5000], [5000, 10000]])}
    ntypes = {'_N': 0}
    etypes = {'_E': 0}
    gpb = RangePartitionBook(
        part_id, num_parts, node_map, edge_map, ntypes, etypes)
    assert gpb.etypes == ['_E']
    # Plain etype initialization cannot recover canonical etypes.
    assert gpb.canonical_etypes == [None]
    assert gpb._to_canonical_etype('_E') == '_E'
    node_policy = NodePartitionPolicy(gpb, '_N')
    assert node_policy.type_name == '_N'
    edge_policy = EdgePartitionPolicy(gpb, '_E')
    assert edge_policy.type_name == '_E'

    # --- heterogeneous, init via plain etype names --------------------
    node_map = {'node1': F.tensor([[0, 1000], [1000, 2000]]),
                'node2': F.tensor([[0, 1000], [1000, 2000]])}
    edge_map = {'edge1': F.tensor([[0, 5000], [5000, 10000]])}
    ntypes = {'node1': 0, 'node2': 1}
    etypes = {'edge1': 0}
    gpb = RangePartitionBook(
        part_id, num_parts, node_map, edge_map, ntypes, etypes)
    assert gpb.etypes == ['edge1']
    assert gpb.canonical_etypes == [None]
    assert gpb._to_canonical_etype('edge1') == 'edge1'
    node_policy = NodePartitionPolicy(gpb, 'node1')
    assert node_policy.type_name == 'node1'
    edge_policy = EdgePartitionPolicy(gpb, 'edge1')
    assert edge_policy.type_name == 'edge1'

    # --- heterogeneous, init via canonical etypes ---------------------
    node_map = {'node1': F.tensor([[0, 1000], [1000, 2000]]),
                'node2': F.tensor([[0, 1000], [1000, 2000]])}
    edge_map = {('node1', 'edge1', 'node2'): F.tensor([[0, 5000], [5000, 10000]])}
    ntypes = {'node1': 0, 'node2': 1}
    etypes = {('node1', 'edge1', 'node2'): 0}
    c_etype = list(etypes.keys())[0]
    gpb = RangePartitionBook(
        part_id, num_parts, node_map, edge_map, ntypes, etypes)
    assert gpb.etypes == ['edge1']
    assert gpb.canonical_etypes == [c_etype]
    # Both the plain name and the full triplet resolve to the canonical etype.
    assert gpb._to_canonical_etype('edge1') == c_etype
    assert gpb._to_canonical_etype(c_etype) == c_etype
    # Unknown etypes must be rejected.
    _assert_raises(gpb._to_canonical_etype, ('node1', 'edge2', 'node2'))
    _assert_raises(gpb._to_canonical_etype, 'edge2')
    node_policy = NodePartitionPolicy(gpb, 'node1')
    assert node_policy.type_name == 'node1'
    edge_policy = EdgePartitionPolicy(gpb, c_etype)
    assert edge_policy.type_name == c_etype
    # HeteroDataName keeps whichever type form it was given.
    data_name = HeteroDataName(False, 'edge1', 'edge1')
    assert data_name.get_type() == 'edge1'
    data_name = HeteroDataName(False, c_etype, 'edge1')
    assert data_name.get_type() == c_etype
if __name__ == '__main__':
    # Allow running this test module directly (outside pytest/unittest).
    # The partition tests write their partitioned graph data under
    # /tmp/partition, so make sure that directory exists first.
    os.makedirs('/tmp/partition', exist_ok=True)
    test_partition()
    test_hetero_partition()
    test_BasicPartitionBook()
    test_RangePartitionBook()
......@@ -57,6 +57,17 @@ def test_pin_unpin_column():
assert col._data_nd is None
assert not g.ndata['x'].is_pinned()
@pytest.mark.skipif(F._default_context_str == 'cpu', reason='Need gpu for this test.')
def test_pin_empty():
    """Pinning an empty tensor must be a silent no-op, mirroring PyTorch."""
    empty = torch.tensor([])
    assert not empty.is_pinned()
    # An empty tensor is neither pinned nor unpinned; the call only needs
    # to complete without raising (this matches PyTorch's own behavior).
    # NOTE(review): the returned NDArray is bound on purpose — elsewhere in
    # these tests it appears to keep pinned memory alive; confirm before
    # dropping the binding.
    nd = dgl.utils.pin_memory_inplace(empty)
    assert not empty.is_pinned()
if __name__ == "__main__":
test_pin_noncontiguous()
test_pin_view()
......
......@@ -22,25 +22,39 @@ OpenMP settings
During training on CPU, the training and dataloading part need to be maintained simultaneously.
Best performance of parallelization in OpenMP
can be achieved by setting up the optimal number of working threads and dataloading workers.
Nodes with a high number of CPU cores may benefit from a higher number of dataloading workers.
A good starting point could be setting num_threads=4 in the Dataloader constructor for nodes with 32 cores or more.
If the number of cores is rather small, the best performance might be achieved with just one
dataloader worker, or even with dataloader num_threads=0, so that dataloading and training are performed
in the same process.
**GNU OpenMP**
Default BKM for setting the number of OMP threads with Pytorch backend:
**Dataloader CPU affinity**
``OMP_NUM_THREADS`` = number of physical cores – ``num_workers``
If the number of dataloader workers is more than 0, please consider using the **use_cpu_affinity()** method
of the DGL Dataloader class; it will generally result in a significant performance improvement for training.
Number of physical cores can be checked by using ``lscpu`` ("Core(s) per socket")
or ``nproc`` command in Linux command line.
Below simple bash script example for setting the OMP threads and ``pytorch`` backend dataloader workers:
*use_cpu_affinity* will set the proper OpenMP thread count (equal to the number of CPU cores allocated for main process),
affinitize dataloader workers for separate CPU cores and restrict the main process to remaining cores
.. code:: bash
In setups with multiple NUMA nodes, *use_cpu_affinity* will only use cores of NUMA node 0 by default,
under the assumption that the workload scales poorly across multiple NUMA nodes. If you believe
your workload will perform better utilizing more than one NUMA node, you can pass
the lists of cores to use for dataloading (loader_cores) and for compute (compute_cores).
cores=`nproc`
num_workers=4
export OMP_NUM_THREADS=$(($cores-$num_workers))
python script.py --gpu -1 --num_workers=$num_workers
loader_cores and compute_cores arguments (list of CPU cores) can be passed to *enable_cpu_affinity* for more
control over which cores should be used, e.g. in case a workload scales well across multiple NUMA nodes.
Depending on the dataset, model and CPU, the optimal number of dataloader workers and OpenMP threads may vary,
but it should stay close to the general default advice presented above [#f4]_ .
Usage:
.. code:: python
dataloader = dgl.dataloading.DataLoader(...)
...
with dataloader.enable_cpu_affinity():
<training loop or inferencing>
**Manual control**
For advanced and more fine-grained control over OpenMP settings please refer to Maximize Performance of Intel® Optimization for PyTorch* on CPU [#f4]_ article
.. rubric:: Footnotes
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment