Unverified commit a7e941c3 authored by xiang song(charlie.song), committed by GitHub

[Feature] Add support for sparse embedding (#2451)



* Add sparse embedding for dgl and update rgcn example

* upd

* Fix

* Revert "Fix"

This reverts commit 4da87cdfb8b8c3506b7fc7376cd2385ba8045c2a.

* Fix

* upd

* upd

* Fix

* Add unitest and update impl

* fix

* Clean up rgcn example code

* upd

* upd

* update

* Fix

* update score

* sparse for sage

* remove model sparse

* upd

* upd

* remove global norm

* revert delete model_sparse.py

* update according to comments

* Fix doc

* upd

* Fix test

* upd

* lint

* lint

* lint

* upd

* upd

* clean up
Co-authored-by: Ubuntu <ubuntu@ip-172-31-56-220.ec2.internal>
parent 362f72cb
.. _apioptim:
dgl.optim
=========
.. automodule:: dgl.optim
Node embedding optimizer
-------------------------
.. currentmodule:: dgl.optim.pytorch
.. autoclass:: SparseAdagrad
.. autoclass:: SparseAdam
\ No newline at end of file
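For orientation, here is a minimal sketch of how the two optimizers documented above are driven from user code. This is an assumption-laden illustration, not part of the commit: it assumes the PyTorch backend, `g` is an existing DGLGraph, `dataloader` yields tensors of seed node IDs, and the loss is a placeholder.

```python
# Minimal usage sketch (assumptions: PyTorch backend; `g` is an existing DGLGraph and
# `dataloader` yields tensors of seed node IDs; the loss below is a placeholder).
import torch as th
import dgl

def initializer(emb):
    th.nn.init.xavier_uniform_(emb)
    return emb

emb = dgl.nn.NodeEmbedding(g.number_of_nodes(), 16, 'node_emb', init_func=initializer)
optimizer = dgl.optim.SparseAdam([emb], lr=0.01)

for nids in dataloader:
    feats = emb(nids, th.device('cpu'))  # gather rows; ids/rows are traced for the sparse update
    loss = (feats * feats).mean()        # placeholder objective
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()                     # updates only the rows touched in this mini-batch
```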
@@ -268,3 +268,9 @@ SegmentedKNNGraph
    :members:
    :show-inheritance:

+NodeEmbedding
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+.. autoclass:: dgl.nn.pytorch.sparse_emb.NodeEmbedding
+    :members:
+    :show-inheritance:
@@ -43,32 +43,32 @@ AIFB: accuracy avg(5 runs) 90.56%, best 94.44% (DGL)
python3 entity_classify_mp.py -d aifb --testing --gpu 0 --fanout='20,20' --batch-size 128
```
-MUTAG: accuracy avg(5 runs) 66.77%, best 69.12% (DGL)
+MUTAG: accuracy avg(5 runs) 70.00%, best 73.53% (DGL)
```
-python3 entity_classify_mp.py -d mutag --l2norm 5e-4 --n-bases 30 --testing --gpu 0 --batch-size 256 --use-self-loop --n-epochs 40
+python3 entity_classify_mp.py -d mutag --l2norm 5e-4 --n-bases 30 --testing --gpu 0 --batch-size 64 --fanout '50,40' --use-self-loop --dgl-sparse --n-epochs 30 --sparse-lr 0.01 --dropout 0.7
```
-BGS: accuracy avg(5 runs) 91.72%, best 96.55% (DGL)
+BGS: accuracy avg(5 runs) 84.83%, best 89.66% (DGL)
```
-python3 entity_classify_mp.py -d bgs --l2norm 5e-4 --n-bases 40 --testing --gpu 0 --fanout '40,40' --n-epochs=40 --batch-size=128
+python3 entity_classify_mp.py -d bgs --l2norm 5e-4 --n-bases 40 --testing --gpu 0 --fanout '50,40' --n-epochs=20 --batch-size=32 --dgl-sparse --lr 0.01 --sparse-lr 0.01 --dropout 0.3
```
-AM: accuracy avg(5 runs) 88.28%, best 90.40% (DGL)
+AM: accuracy avg(5 runs) 88.59%, best 88.89% (DGL)
```
-python3 entity_classify_mp.py -d am --l2norm 5e-4 --n-bases 40 --testing --gpu 0 --fanout '35,35' --batch-size 256 --lr 1e-2 --n-hidden 16 --use-self-loop --n-epochs=40
+python3 entity_classify_mp.py -d am --l2norm 5e-4 --n-bases 40 --testing --gpu 0 --fanout '35,35' --batch-size 64 --n-hidden 16 --use-self-loop --n-epochs=20 --dgl-sparse --lr 0.01 --sparse-lr 0.02 --dropout 0.7
```
### Entity Classification on OGBN-MAG
Test bed: P3-8xlarge
-OGBN-MAG accuracy 46.22
+OGBN-MAG accuracy 45.5 (3 runs)
```
-python3 entity_classify_mp.py -d ogbn-mag --testing --fanout='25,30' --batch-size 512 --n-hidden 64 --lr 0.01 --num-worker 0 --eval-batch-size 8 --low-mem --gpu 0,1,2,3,4,5,6,7 --dropout 0.5 --use-self-loop --n-bases 2 --n-epochs 3 --mix-cpu-gpu --node-feats
+python3 entity_classify_mp.py -d ogbn-mag --testing --fanout='30,30' --batch-size 1024 --n-hidden 128 --lr 0.01 --num-worker 4 --eval-batch-size 8 --low-mem --gpu 0,1,2,3 --dropout 0.7 --use-self-loop --n-bases 2 --n-epochs 3 --node-feats --dgl-sparse --sparse-lr 0.08
```
-OGBN-MAG without node-feats 43.63
+OGBN-MAG without node-feats 42.79
```
-python3 entity_classify_mp.py -d ogbn-mag --testing --fanout='25,25' --batch-size 256 --n-hidden 64 --lr 0.01 --num-worker 0 --eval-batch-size 8 --low-mem --gpu 0,1,2,3,4,5,6,7 --dropout 0.5 --use-self-loop --n-bases 2 --n-epochs 3 --mix-cpu-gpu --layer-norm
+python3 entity_classify_mp.py -d ogbn-mag --testing --fanout='30,30' --batch-size 1024 --n-hidden 128 --lr 0.01 --num-worker 4 --eval-batch-size 8 --low-mem --gpu 0,1,2,3 --dropout 0.7 --use-self-loop --n-bases 2 --n-epochs 3 --dgl-sparse --sparse-lr 0.0
```
Test bed: P2-8xlarge
...
This diff is collapsed.
import torch as th
import torch.nn as nn
+import dgl

class BaseRGCN(nn.Module):
    def __init__(self, num_nodes, h_dim, out_dim, num_rels, num_bases,
                 num_hidden_layers=1, dropout=0,
@@ -48,6 +50,10 @@ class BaseRGCN(nn.Module):
            h = layer(g, h, r, norm)
        return h

+def initializer(emb):
+    emb.uniform_(-1.0, 1.0)
+    return emb

class RelGraphEmbedLayer(nn.Module):
    r"""Embedding layer for featureless heterograph.
    Parameters
@@ -65,8 +71,8 @@ class RelGraphEmbedLayer(nn.Module):
        treat certain input feature as a one-hot encoding feature.
    embed_size : int
        Output embed size
-    embed_name : str, optional
-        Embed name
+    dgl_sparse : bool, optional
+        If true, use dgl.nn.NodeEmbedding; otherwise use torch.nn.Embedding.
    """
    def __init__(self,
                 dev_id,
@@ -75,29 +81,42 @@ class RelGraphEmbedLayer(nn.Module):
                 num_of_ntype,
                 input_size,
                 embed_size,
-                 sparse_emb=False,
-                 embed_name='embed'):
+                 dgl_sparse=False):
        super(RelGraphEmbedLayer, self).__init__()
        self.dev_id = th.device(dev_id if dev_id >= 0 else 'cpu')
        self.embed_size = embed_size
-        self.embed_name = embed_name
        self.num_nodes = num_nodes
-        self.sparse_emb = sparse_emb
+        self.dgl_sparse = dgl_sparse

        # create weight embeddings for each node for each relation
        self.embeds = nn.ParameterDict()
+        self.node_embeds = {} if dgl_sparse else nn.ModuleDict()
        self.num_of_ntype = num_of_ntype
-        self.idmap = th.empty(num_nodes).long()

        for ntype in range(num_of_ntype):
-            if input_size[ntype] is not None:
+            if isinstance(input_size[ntype], int):
+                if dgl_sparse:
+                    self.node_embeds[str(ntype)] = dgl.nn.NodeEmbedding(
+                        input_size[ntype], embed_size, name=str(ntype), init_func=initializer)
+                else:
+                    sparse_emb = th.nn.Embedding(input_size[ntype], embed_size, sparse=True)
+                    nn.init.uniform_(sparse_emb.weight, -1.0, 1.0)
+                    self.node_embeds[str(ntype)] = sparse_emb
+            else:
                input_emb_size = input_size[ntype].shape[1]
                embed = nn.Parameter(th.Tensor(input_emb_size, self.embed_size))
                nn.init.xavier_uniform_(embed)
                self.embeds[str(ntype)] = embed

-        self.node_embeds = th.nn.Embedding(node_tids.shape[0], self.embed_size, sparse=self.sparse_emb)
-        nn.init.uniform_(self.node_embeds.weight, -1.0, 1.0)
+    @property
+    def dgl_emb(self):
+        """Return the dgl.nn.NodeEmbedding objects (an empty list if dgl_sparse is False)."""
+        if self.dgl_sparse:
+            embs = [emb for emb in self.node_embeds.values()]
+            return embs
+        else:
+            return []

    def forward(self, node_ids, node_tids, type_ids, features):
        """Forward computation
@@ -117,14 +136,16 @@ class RelGraphEmbedLayer(nn.Module):
        tensor
            embeddings as the input of the next layer
        """
-        tsd_ids = node_ids.to(self.node_embeds.weight.device)
+        tsd_ids = node_ids.to(self.dev_id)
        embeds = th.empty(node_ids.shape[0], self.embed_size, device=self.dev_id)
        for ntype in range(self.num_of_ntype):
-            if features[ntype] is not None:
-                loc = node_tids == ntype
-                embeds[loc] = features[ntype][type_ids[loc]].to(self.dev_id) @ self.embeds[str(ntype)].to(self.dev_id)
-            else:
-                loc = node_tids == ntype
-                embeds[loc] = self.node_embeds(tsd_ids[loc]).to(self.dev_id)
+            loc = node_tids == ntype
+            if isinstance(features[ntype], int):
+                if self.dgl_sparse:
+                    embeds[loc] = self.node_embeds[str(ntype)](type_ids[loc], self.dev_id)
+                else:
+                    embeds[loc] = self.node_embeds[str(ntype)](type_ids[loc]).to(self.dev_id)
+            else:
+                embeds[loc] = features[ntype][type_ids[loc]].to(self.dev_id) @ self.embeds[str(ntype)].to(self.dev_id)

        return embeds
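To show how the refactored layer is meant to be consumed, here is a hedged sketch of the optimizer wiring. The layer's full constructor arguments are elided in the hunk above, so `embed_layer`, `model`, `dataloader`, and the per-batch tensors below are placeholders, not code from this commit.

```python
# Hedged sketch: with dgl_sparse=True, the dgl.nn.NodeEmbedding parameters go to the new
# dgl.optim sparse optimizer, while ordinary nn.Parameter weights stay with torch.optim.
import itertools
import torch as th
import dgl

emb_optimizer = dgl.optim.SparseAdam(embed_layer.dgl_emb, lr=0.01)
dense_optimizer = th.optim.Adam(
    itertools.chain(model.parameters(), embed_layer.embeds.parameters()), lr=0.01)

for batch in dataloader:                                   # placeholder mini-batch loop
    node_ids, node_tids, type_ids, features, blocks, labels = batch
    feats = embed_layer(node_ids, node_tids, type_ids, features)
    loss = th.nn.functional.cross_entropy(model(blocks, feats), labels)
    emb_optimizer.zero_grad()
    dense_optimizer.zero_grad()
    loss.backward()
    emb_optimizer.step()                                   # sparse update of touched rows
    dense_optimizer.step()                                 # dense update of the rest
```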
@@ -38,6 +38,7 @@ from .transform import *
from .propagate import *
from .random import *
from .data.utils import save_graphs, load_graphs
+from . import optim
from ._deprecate.graph import DGLGraph as DGLGraphStale
from ._deprecate.nodeflow import *
@@ -73,7 +73,6 @@ def load_backend(mod_name):
    else:
        setattr(thismod, api, _gen_missing_api(api, mod_name))

def get_preferred_backend():
    config_path = os.path.join(os.path.expanduser('~'), '.dgl', 'config.json')
    backend_name = None
...
@@ -1615,3 +1615,14 @@ class no_grad(object):
    def __exit__(self, exc_type, exc_value, exc_traceback):
        pass

+class NodeEmbedding(object):
+    """Sparse node embeddings"""
+    def __init__(self):
+        pass
+
+    def __enter__(self):
+        pass
+
+    def __exit__(self, exc_type, exc_value, exc_traceback):
+        pass
"""Sparse optimizer is not supported for mxnet"""
\ No newline at end of file
"""Sparse optimizer is not supported for tensorflow"""
\ No newline at end of file
@@ -5,3 +5,4 @@ from .softmax import *
from .factory import *
from .hetero import *
from .utils import Sequential, WeightBasis
+from .sparse_emb import NodeEmbedding
"""Torch NodeEmbedding."""
from datetime import timedelta
import torch as th
from ...backend import pytorch as F
from ...utils import get_shared_mem_array, create_shared_mem_array
_STORE = None
class NodeEmbedding:
    '''Class for storing node embeddings.

    The class is optimized for training large-scale node embeddings. It updates the embeddings
    in a sparse way and can scale to graphs with millions of nodes. It also supports partitioning
    the embeddings across multiple GPUs (on a single machine) for further acceleration. It does
    not support partitioning across machines.

    Currently, DGL provides two optimizers that work with this NodeEmbedding
    class: ``SparseAdagrad`` and ``SparseAdam``.

    The implementation is based on the torch.distributed package. It depends on the PyTorch
    default distributed process group to collect multi-process information and uses
    ``torch.distributed.TCPStore`` to share metadata across multiple GPU processes.
    It uses the local address '127.0.0.1:12346' to initialize the TCPStore.
Parameters
----------
num_embeddings : int
The number of embeddings. Currently, the number of embeddings has to be the same as
the number of nodes.
embedding_dim : int
The dimension size of embeddings.
name : str
The name of the embeddings. The name should uniquely identify the embeddings in the system.
init_func : callable, optional
The function to create the initial data. If the init function is not provided,
the values of the embeddings are initialized to zero.
Examples
--------
    Before launching multiple GPU processes

    >>> def initializer(emb):
    ...     th.nn.init.xavier_uniform_(emb)
    ...     return emb
In each training process
>>> emb = dgl.nn.NodeEmbedding(g.number_of_nodes(), 10, 'emb', init_func=initializer)
>>> optimizer = dgl.optim.SparseAdam([emb], lr=0.001)
>>> for blocks in dataloader:
... ...
... feats = emb(nids, gpu_0)
... loss = F.sum(feats + 1, 0)
... loss.backward()
... optimizer.step()
'''
def __init__(self, num_embeddings, embedding_dim, name,
init_func=None):
global _STORE
# Check whether it is multi-gpu training or not.
if th.distributed.is_initialized():
rank = th.distributed.get_rank()
world_size = th.distributed.get_world_size()
else:
rank = -1
world_size = 0
self._rank = rank
self._world_size = world_size
host_name = '127.0.0.1'
port = 12346
if rank <= 0:
emb = create_shared_mem_array(name, (num_embeddings, embedding_dim), th.float32)
if init_func is not None:
emb = init_func(emb)
if rank == 0:
if world_size > 1:
                # for multi-GPU training, set up a TCPStore for
                # embedding status synchronization across GPU processes
if _STORE is None:
_STORE = th.distributed.TCPStore(
host_name, port, world_size, True, timedelta(seconds=30))
for _ in range(1, world_size):
                    # notify peer processes that the shared-memory embedding is ready
_STORE.set(name, name)
elif rank > 0:
# receive
if _STORE is None:
_STORE = th.distributed.TCPStore(
host_name, port, world_size, False, timedelta(seconds=30))
_STORE.wait([name])
emb = get_shared_mem_array(name, (num_embeddings, embedding_dim), th.float32)
self._store = _STORE
self._tensor = emb
self._num_embeddings = num_embeddings
self._embedding_dim = embedding_dim
self._name = name
self._optm_state = None # track optimizer state
self._trace = [] # track minibatch
    def __call__(self, node_ids, device=th.device('cpu')):
        """Return the embeddings of the given node IDs.

        Parameters
        ----------
        node_ids : th.tensor
            Index of the embeddings to collect.
        device : th.device
            Target device to put the collected embeddings.
        """
emb = self._tensor[node_ids].to(device)
if F.is_recording():
emb = F.attach_grad(emb)
self._trace.append((node_ids.to(device, non_blocking=True), emb))
return emb
@property
def store(self):
"""Return torch.distributed.TCPStore for
meta data sharing across processes.
Returns
-------
torch.distributed.TCPStore
KVStore used for meta data sharing.
"""
return self._store
@property
def rank(self):
"""Return rank of current process.
Returns
-------
int
The rank of current process.
"""
return self._rank
@property
def world_size(self):
"""Return world size of the pytorch distributed training env.
Returns
-------
int
The world size of the pytorch distributed training env.
"""
return self._world_size
@property
def name(self):
"""Return the name of NodeEmbedding.
Returns
-------
str
The name of NodeEmbedding.
"""
return self._name
@property
def num_embeddings(self):
"""Return the number of embeddings.
Returns
-------
int
The number of embeddings.
"""
return self._num_embeddings
def set_optm_state(self, state):
"""Store the optimizer related state tensor.
Parameters
----------
state : tuple of torch.Tensor
Optimizer related state.
"""
self._optm_state = state
@property
def optm_state(self):
"""Return the optimizer related state tensor.
Returns
-------
tuple of torch.Tensor
The optimizer related state.
"""
return self._optm_state
@property
def trace(self):
"""Return a trace of the indices of embeddings
used in the training step(s).
Returns
-------
[torch.Tensor]
The indices of embeddings used in the training step(s).
"""
return self._trace
def reset_trace(self):
"""Clean up the trace of the indices of embeddings
used in the training step(s).
"""
self._trace = []
@property
def emb_tensor(self):
"""Return the tensor storing the node embeddings
Returns
-------
torch.Tensor
The tensor storing the node embeddings
"""
return self._tensor
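The gradient "trace" that the sparse optimizers consume can be seen in isolation with a small single-process example. This is an illustration with made-up names and sizes, not part of the commit; no torch.distributed initialization is needed in the single-process case.

```python
# Single-process illustration of the trace mechanism (names and sizes are illustrative).
import torch as th
import dgl

emb = dgl.nn.NodeEmbedding(100, 8, 'demo_emb')  # rows start at zero when no init_func is given
nids = th.tensor([0, 3, 3, 7])

feats = emb(nids)                # gathers rows and appends (ids, rows) to emb.trace
loss = feats.sum()
loss.backward()

ids, rows = emb.trace[0]
print(ids.tolist(), rows.grad.shape)  # [0, 3, 3, 7] and torch.Size([4, 8])
emb.reset_trace()                # the sparse optimizers call this after applying the update
```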
"""dgl optims."""
import importlib
import sys
import os
from ..backend import backend_name
from ..utils import expand_as_pair
def _load_backend(mod_name):
mod = importlib.import_module('.%s' % mod_name, __name__)
thismod = sys.modules[__name__]
for api, obj in mod.__dict__.items():
setattr(thismod, api, obj)
_load_backend(backend_name)
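After `_load_backend` runs, the backend-specific classes are re-exported at the package level. A quick sanity check (assuming the PyTorch backend is active; the printed path is indicative, not verbatim output from this commit):

```python
# Quick check of the backend dispatch above (PyTorch backend assumed).
import dgl.optim
print(dgl.optim.SparseAdam)   # e.g. <class 'dgl.optim.pytorch.sparse_optim.SparseAdam'>
```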
"""dgl optims for pytorch."""
from .sparse_optim import SparseAdagrad, SparseAdam
"""Node embedding optimizers"""
import abc
from abc import abstractmethod
import torch as th
from ...utils import get_shared_mem_array, create_shared_mem_array
from ...nn.pytorch import NodeEmbedding
class SparseGradOptimizer(abc.ABC):
    r''' The abstract sparse optimizer.

    Note: DGL's sparse optimizers only work with dgl.nn.NodeEmbedding.
Parameters
----------
params : list of NodeEmbedding
The list of NodeEmbeddings.
lr : float
The learning rate.
'''
def __init__(self, params, lr):
self._params = params
self._lr = lr
self._rank = None
self._world_size = None
self._shared_cache = {}
self._clean_grad = False
def step(self):
''' The step function.
The step function is invoked at the end of every batch to update embeddings
'''
with th.no_grad():
            # Frequently allocating and freeing shared memory to hold intermediate tensors is
            # expensive, so shared-memory buffers are cached in self._shared_cache; shared_emb
            # collects the (indices, gradients) gathered for each embedding in this step.
shared_emb = {emb.name: ([], []) for emb in self._params}
# Go through all sparse embeddings
for emb in self._params: # pylint: disable=too-many-nested-blocks
num_embeddings = emb.num_embeddings
emb_name = emb.name
                # Each GPU process takes responsibility for updating a disjoint range of
                # each sparse embedding, so the gradient updates can run in parallel.
range_size = (num_embeddings + self._world_size - 1) // self._world_size \
if self._world_size > 0 else 0
for idx, data in emb._trace:
grad = data.grad.data
device = grad.device
idx_dtype = idx.dtype
grad_dtype = grad.dtype
grad_dim = grad.shape[1]
if self._world_size > 0:
if emb_name not in self._shared_cache:
self._shared_cache[emb_name] = {}
for i in range(self._world_size):
start = i * range_size
end = (i + 1) * range_size \
if (i + 1) * range_size < num_embeddings \
else num_embeddings
if i == 0:
mask = idx < end
elif i + 1 == self._world_size:
mask = idx >= start
else:
mask = th.logical_and((idx >= start), (idx < end))
idx_i = idx[mask]
grad_i = grad[mask]
if i == self._rank:
shared_emb[emb_name][0].append(idx_i)
shared_emb[emb_name][1].append(grad_i)
else:
# currently nccl does not support Alltoallv operation
# we need to use CPU shared memory to share gradient
# across processes
idx_i = idx_i.to(th.device('cpu'))
grad_i = grad_i.to(th.device('cpu'))
idx_shmem_name = 'idx_{}_{}_{}'.format(emb_name, self._rank, i)
grad_shmem_name = 'grad_{}_{}_{}'.format(emb_name, self._rank, i)
if idx_shmem_name not in self._shared_cache[emb_name] or \
self._shared_cache[emb_name][idx_shmem_name].shape[0] \
< idx_i.shape[0]:
# in case idx_i.shape[0] is 0
idx_shmem = create_shared_mem_array(idx_shmem_name, \
(idx_i.shape[0] * 2 + 2,), idx_dtype)
grad_shmem = create_shared_mem_array(grad_shmem_name, \
(idx_i.shape[0] * 2 + 2, grad_dim), grad_dtype)
self._shared_cache[emb_name][idx_shmem_name] = idx_shmem
self._shared_cache[emb_name][grad_shmem_name] = grad_shmem
self._shared_cache[emb_name][idx_shmem_name][:idx_i.shape[0]] \
= idx_i
self._shared_cache[emb_name][grad_shmem_name][:idx_i.shape[0]] \
= grad_i
emb.store.set(idx_shmem_name, str(idx_i.shape[0]))
# gather gradients from all other processes
for i in range(self._world_size):
if i != self._rank:
idx_shmem_name = 'idx_{}_{}_{}'.format(emb_name, i, self._rank)
grad_shmem_name = 'grad_{}_{}_{}'.format(emb_name, i, self._rank)
size = int(emb.store.get(idx_shmem_name))
if idx_shmem_name not in self._shared_cache[emb_name] or \
self._shared_cache[emb_name][idx_shmem_name].shape[0] < size:
idx_shmem = get_shared_mem_array(idx_shmem_name, \
(size * 2 + 2,), idx_dtype)
grad_shmem = get_shared_mem_array(grad_shmem_name, \
(size * 2 + 2, grad_dim), grad_dtype)
self._shared_cache[emb_name][idx_shmem_name] = idx_shmem
self._shared_cache[emb_name][grad_shmem_name] = grad_shmem
idx_i = self._shared_cache[emb_name][idx_shmem_name][:size]
grad_i = self._shared_cache[emb_name][grad_shmem_name][:size]
shared_emb[emb_name][0].append(idx_i.to(device,
non_blocking=True))
shared_emb[emb_name][1].append(grad_i.to(device,
non_blocking=True))
else:
shared_emb[emb_name][0].append(idx)
shared_emb[emb_name][1].append(grad)
if self._clean_grad:
# clean gradient track
for emb in self._params:
emb.reset_trace()
self._clean_grad = False
for emb in self._params:
emb_name = emb.name
idx = th.cat(shared_emb[emb_name][0], dim=0)
grad = th.cat(shared_emb[emb_name][1], dim=0)
self.update(idx, grad, emb)
# synchronized gradient update
if self._world_size > 1:
th.distributed.barrier()
@abstractmethod
def update(self, idx, grad, emb):
""" Update embeddings in a sparse manner
Sparse embeddings are updated in mini batches. we maintains gradient states for
each embedding so they can be updated separately.
Parameters
----------
idx : tensor
Index of the embeddings to be updated.
grad : tensor
Gradient of each embedding.
emb : dgl.nn.NodeEmbedding
Sparse node embedding to update.
"""
def zero_grad(self):
"""clean grad cache
"""
self._clean_grad = True
class SparseAdagrad(SparseGradOptimizer):
r''' Node embedding optimizer using the Adagrad algorithm.
This optimizer implements a sparse version of Adagrad algorithm for
optimizing :class:`dgl.nn.NodeEmbedding`. Being sparse means it only updates
the embeddings whose gradients have updates, which are usually a very
small portion of the total embeddings.
Adagrad maintains a :math:`G_{t,i,j}` for every parameter in the embeddings, where
:math:`G_{t,i,j}=G_{t-1,i,j} + g_{t,i,j}^2` and :math:`g_{t,i,j}` is the gradient of
the dimension :math:`j` of embedding :math:`i` at step :math:`t`.
Parameters
----------
params : list[dgl.nn.NodeEmbedding]
The list of dgl.nn.NodeEmbedding.
lr : float
The learning rate.
eps : float, Optional
The term added to the denominator to improve numerical stability
Default: 1e-10
Examples
--------
    >>> def initializer(emb):
    ...     th.nn.init.xavier_uniform_(emb)
    ...     return emb
>>> emb = dgl.nn.NodeEmbedding(g.number_of_nodes(), 10, 'emb', init_func=initializer)
>>> optimizer = dgl.optim.SparseAdagrad([emb], lr=0.001)
>>> for blocks in dataloader:
... ...
... feats = emb(nids, gpu_0)
... loss = F.sum(feats + 1, 0)
... loss.backward()
... optimizer.step()
'''
def __init__(self, params, lr, eps=1e-10):
super(SparseAdagrad, self).__init__(params, lr)
self._eps = eps
        # We need to register a gradient-state tensor (in shared memory) for each embedding.
for emb in params:
assert isinstance(emb, NodeEmbedding), \
'SparseAdagrad only supports dgl.nn.NodeEmbedding'
if self._rank is None:
self._rank = emb.rank
self._world_size = emb.world_size
else:
assert self._rank == emb.rank, \
'MultiGPU rank for each embedding should be same.'
assert self._world_size == emb.world_size, \
'MultiGPU world_size for each embedding should be same.'
if self._rank <= 0:
emb_name = emb.name
state = create_shared_mem_array(emb_name+'_state', \
emb.emb_tensor.shape, th.float32).zero_()
            if self._rank == 0:
                for _ in range(1, self._world_size):
                    # notify peer processes that the optimizer state is ready
                    emb.store.set(emb_name+'_opt', emb_name)
elif self._rank > 0:
# receive
emb_name = emb.name
emb.store.wait([emb_name+'_opt'])
state = get_shared_mem_array(emb_name+'_state', \
emb.emb_tensor.shape, th.float32)
emb.set_optm_state(state)
def update(self, idx, grad, emb):
""" Update embeddings in a sparse manner
Sparse embeddings are updated in mini batches. we maintains gradient states for
each embedding so they can be updated separately.
Parameters
----------
idx : tensor
Index of the embeddings to be updated.
grad : tensor
Gradient of each embedding.
emb : dgl.nn.NodeEmbedding
Sparse embedding to update.
"""
eps = self._eps
clr = self._lr
# the update is non-linear so indices must be unique
grad_indices, inverse, cnt = th.unique(idx, return_inverse=True, return_counts=True)
grad_values = th.zeros((grad_indices.shape[0], grad.shape[1]), device=grad.device)
grad_values.index_add_(0, inverse, grad)
grad_values = grad_values / cnt.unsqueeze(1)
grad_sum = (grad_values * grad_values)
state = emb.optm_state
state_dev = state.device
state_idx = grad_indices.to(state_dev)
grad_state = state[state_idx].to(grad.device)
grad_state += grad_sum
state[state_idx] = grad_state.to(state_dev)
std_values = grad_state.add_(eps).sqrt_()
tmp = clr * grad_values / std_values
emb.emb_tensor[state_idx] -= tmp.to(state_dev)
class SparseAdam(SparseGradOptimizer):
r''' Node embedding optimizer using the Adam algorithm.
    This optimizer implements a sparse version of the Adam algorithm for
    optimizing :class:`dgl.nn.NodeEmbedding`. Being sparse means it only
    updates the embeddings whose gradients have updates, which are usually
    a very small portion of the total embeddings.

    Adam maintains :math:`Gm_{t,i,j}` and :math:`Gp_{t,i,j}` for every parameter
    in the embeddings, where
    :math:`Gm_{t,i,j} = beta1 * Gm_{t-1,i,j} + (1-beta1) * g_{t,i,j}`,
    :math:`Gp_{t,i,j} = beta2 * Gp_{t-1,i,j} + (1-beta2) * g_{t,i,j}^2`, and the
    update applied at step :math:`t` is
    :math:`u_{t,i,j} = lr * (Gm_{t,i,j} / (1 - beta1^t)) / (\sqrt{Gp_{t,i,j} / (1 - beta2^t)} + eps)`,
    where :math:`g_{t,i,j}` is the gradient of the dimension :math:`j` of embedding :math:`i`
    at step :math:`t`.
Parameters
----------
params : list[dgl.nn.NodeEmbedding]
The list of dgl.nn.NodeEmbeddings.
lr : float
The learning rate.
betas : tuple[float, float], Optional
Coefficients used for computing running averages of gradient and its square.
Default: (0.9, 0.999)
eps : float, Optional
The term added to the denominator to improve numerical stability
Default: 1e-8
    Examples
    --------
    >>> def initializer(emb):
    ...     th.nn.init.xavier_uniform_(emb)
    ...     return emb
>>> emb = dgl.nn.NodeEmbedding(g.number_of_nodes(), 10, 'emb', init_func=initializer)
>>> optimizer = dgl.optim.SparseAdam([emb], lr=0.001)
>>> for blocks in dataloader:
... ...
... feats = emb(nids, gpu_0)
... loss = F.sum(feats + 1, 0)
... loss.backward()
... optimizer.step()
'''
def __init__(self, params, lr, betas=(0.9, 0.999), eps=1e-08):
super(SparseAdam, self).__init__(params, lr)
self._lr = lr
self._beta1 = betas[0]
self._beta2 = betas[1]
self._eps = eps
        # We need to register state tensors (step count, first and second moments) in shared
        # memory for each embedding.
for emb in params:
assert isinstance(emb, NodeEmbedding), \
'SparseAdam only supports dgl.nn.NodeEmbedding'
if self._rank is None:
self._rank = emb.rank
self._world_size = emb.world_size
else:
assert self._rank == emb.rank, \
'MultiGPU rank for each embedding should be same.'
assert self._world_size == emb.world_size, \
'MultiGPU world_size for each embedding should be same.'
if self._rank <= 0:
emb_name = emb.name
state_step = create_shared_mem_array(emb_name+'_step', \
(emb.emb_tensor.shape[0],), th.float32).zero_()
state_mem = create_shared_mem_array(emb_name+'_mem', \
emb.emb_tensor.shape, th.float32).zero_()
state_power = create_shared_mem_array(emb_name+'_power', \
emb.emb_tensor.shape, th.float32).zero_()
if self._rank == 0:
state = (state_step, state_mem, state_power)
emb_name = emb.name
for _ in range(1, self._world_size):
                    # notify peer processes that the optimizer state is ready
emb.store.set(emb_name+'_opt', emb_name)
elif self._rank > 0:
# receive
emb_name = emb.name
emb.store.wait([emb_name+'_opt'])
state_step = get_shared_mem_array(emb_name+'_step', \
(emb.emb_tensor.shape[0],), th.float32)
state_mem = get_shared_mem_array(emb_name+'_mem', \
emb.emb_tensor.shape, th.float32)
state_power = get_shared_mem_array(emb_name+'_power', \
emb.emb_tensor.shape, th.float32)
state = (state_step, state_mem, state_power)
emb.set_optm_state(state)
def update(self, idx, grad, emb):
""" Update embeddings in a sparse manner
Sparse embeddings are updated in mini batches. we maintains gradient states for
each embedding so they can be updated separately.
Parameters
----------
idx : tensor
Index of the embeddings to be updated.
grad : tensor
Gradient of each embedding.
emb : dgl.nn.NodeEmbedding
Sparse embedding to update.
"""
with th.no_grad():
beta1 = self._beta1
beta2 = self._beta2
eps = self._eps
clr = self._lr
state_step, state_mem, state_power = emb.optm_state
exec_dev = grad.device
state_dev = state_step.device
# There can be duplicated indices due to sampling.
# Thus unique them here and average the gradient here.
grad_indices, inverse, cnt = th.unique(idx,
return_inverse=True,
return_counts=True)
state_idx = grad_indices.to(state_dev)
state_step[state_idx] += 1
state_step = state_step[state_idx].to(exec_dev, non_blocking=True)
orig_mem = state_mem[state_idx].to(exec_dev, non_blocking=True)
orig_power = state_power[state_idx].to(exec_dev, non_blocking=True)
grad_values = th.zeros((grad_indices.shape[0], grad.shape[1]), device=exec_dev)
grad_values.index_add_(0, inverse, grad)
grad_values = grad_values / cnt.unsqueeze(1)
grad_mem = grad_values
grad_power = grad_values * grad_values
update_mem = beta1 * orig_mem + (1.-beta1) * grad_mem
update_power = beta2 * orig_power + (1.-beta2) * grad_power
state_mem[state_idx] = update_mem.to(state_dev, non_blocking=True)
state_power[state_idx] = update_power.to(state_dev, non_blocking=True)
update_mem_corr = update_mem / (1. - th.pow(th.tensor(beta1, device=exec_dev),
state_step)).unsqueeze(1)
update_power_corr = update_power / (1. - th.pow(th.tensor(beta2, device=exec_dev),
state_step)).unsqueeze(1)
std_values = clr * update_mem_corr / (th.sqrt(update_power_corr) + eps)
emb.emb_tensor[state_idx] -= std_values.to(state_dev)
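To make the ownership logic in ``step()`` easier to follow, here is a standalone sketch of how each rank selects the gradient rows it is responsible for. The numbers are illustrative only; no shared memory or TCPStore is involved.

```python
# Standalone illustration of the per-rank range partitioning used in step().
import torch as th

num_embeddings, world_size = 10, 3
range_size = (num_embeddings + world_size - 1) // world_size   # ceiling division -> 4

idx = th.tensor([0, 2, 3, 5, 9, 9])        # embedding rows touched in this mini-batch
for rank in range(world_size):
    start = rank * range_size
    end = min((rank + 1) * range_size, num_embeddings)
    if rank == 0:
        mask = idx < end
    elif rank + 1 == world_size:
        mask = idx >= start
    else:
        mask = th.logical_and(idx >= start, idx < end)
    print(rank, idx[mask].tolist())
# 0 [0, 2, 3]   (owns rows [0, 4))
# 1 [5]         (owns rows [4, 8))
# 2 [9, 9]      (owns rows [8, 10))
```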
@@ -2,3 +2,4 @@
from .internal import *
from .data import *
from .checks import *
+from .shared_mem import *
"""Shared memory utilities."""
from .. import backend as F
from .._ffi.ndarray import empty_shared_mem
def get_shared_mem_array(name, shape, dtype):
""" Get a tensor from shared memory with specific name
Parameters
----------
name : str
The unique name of the shared memory
shape : tuple of int
The shape of the returned tensor
dtype : F.dtype
The dtype of the returned tensor
Returns
-------
F.tensor
The tensor got from shared memory.
"""
name = 'DGL_'+name
new_arr = empty_shared_mem(name, False, shape, F.reverse_data_type_dict[dtype])
dlpack = new_arr.to_dlpack()
return F.zerocopy_from_dlpack(dlpack)
def create_shared_mem_array(name, shape, dtype):
""" Create a tensor from shared memory with the specific name
Parameters
----------
name : str
The unique name of the shared memory
shape : tuple of int
The shape of the returned tensor
dtype : F.dtype
The dtype of the returned tensor
Returns
-------
F.tensor
The created tensor.
"""
name = 'DGL_'+name
new_arr = empty_shared_mem(name, True, shape, F.reverse_data_type_dict[dtype])
dlpack = new_arr.to_dlpack()
return F.zerocopy_from_dlpack(dlpack)
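A hedged round-trip sketch of the two helpers above (single machine; the name and shape are arbitrary examples, not values used in this commit):

```python
# Round-trip sketch: one side creates the shared array, the other attaches to it by name.
# On Linux the buffer typically appears as /dev/shm/DGL_demo_state.
import torch as th
from dgl.utils import create_shared_mem_array, get_shared_mem_array

arr = create_shared_mem_array('demo_state', (4, 2), th.float32)
arr[:] = 1.0

view = get_shared_mem_array('demo_state', (4, 2), th.float32)   # same underlying memory
view[0, 0] = 5.0
print(arr[0, 0])   # tensor(5.)
```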
@@ -32,7 +32,7 @@ class SharedMemoryResource: public Resource {
  }

  void Destroy() {
-    LOG(INFO) << "remove " << name << " for shared memory";
+    // LOG(INFO) << "remove " << name << " for shared memory";
    shm_unlink(name.c_str());
  }
};
@@ -55,7 +55,7 @@ SharedMemory::~SharedMemory() {
  CHECK(munmap(ptr_, size_) != -1) << strerror(errno);
  close(fd_);
  if (own_) {
-    LOG(INFO) << "remove " << name << " for shared memory";
+    // LOG(INFO) << "remove " << name << " for shared memory";
    shm_unlink(name.c_str());
    // The resource has been deleted. We don't need to keep track of it any more.
    DeleteResource(name);
...