Unverified commit a7e941c3 authored by xiang song(charlie.song), committed by GitHub

[Feature] Add support for sparse embedding (#2451)



* Add sparse embedding for dgl and update rgcn example

* upd

* Fix

* Revert "Fix"

This reverts commit 4da87cdfb8b8c3506b7fc7376cd2385ba8045c2a.

* Fix

* upd

* upd

* Fix

* Add unitest and update impl

* fix

* Clean up rgcn example code

* upd

* upd

* update

* Fix

* update score

* sparse for sage

* remove model sparse

* upd

* upd

* remove global norm

* revert delete model_sparse.py

* update according to comments

* Fix doc

* upd

* Fix test

* upd

* lint

* lint

* lint

* upd

* upd

* clean up
Co-authored-by: Ubuntu <ubuntu@ip-172-31-56-220.ec2.internal>
parent 362f72cb
.. _apioptim:
dgl.optim
=========
.. automodule:: dgl.optim
Node embedding optimizer
-------------------------
.. currentmodule:: dgl.optim.pytorch
.. autoclass:: SparseAdagrad
.. autoclass:: SparseAdam
\ No newline at end of file
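For orientation, here is a minimal sketch of how the two optimizers documented above are driven from user code. This is an assumption-laden illustration, not part of the commit: it assumes the PyTorch backend, `g` is an existing DGLGraph, `dataloader` yields tensors of seed node IDs, and the loss is a placeholder.

```python
# Minimal usage sketch (assumptions: PyTorch backend; `g` is an existing DGLGraph and
# `dataloader` yields tensors of seed node IDs; the loss below is a placeholder).
import torch as th
import dgl

def initializer(emb):
    th.nn.init.xavier_uniform_(emb)
    return emb

emb = dgl.nn.NodeEmbedding(g.number_of_nodes(), 16, 'node_emb', init_func=initializer)
optimizer = dgl.optim.SparseAdam([emb], lr=0.01)

for nids in dataloader:
    feats = emb(nids, th.device('cpu'))  # gather rows; ids/rows are traced for the sparse update
    loss = (feats * feats).mean()        # placeholder objective
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()                     # updates only the rows touched in this mini-batch
```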
@@ -268,3 +268,9 @@ SegmentedKNNGraph
    :members:
    :show-inheritance:

+NodeEmbedding
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+.. autoclass:: dgl.nn.pytorch.sparse_emb.NodeEmbedding
+    :members:
+    :show-inheritance:
@@ -43,32 +43,32 @@ AIFB: accuracy avg(5 runs) 90.56%, best 94.44% (DGL)
python3 entity_classify_mp.py -d aifb --testing --gpu 0 --fanout='20,20' --batch-size 128
```
-MUTAG: accuracy avg(5 runs) 66.77%, best 69.12% (DGL)
+MUTAG: accuracy avg(5 runs) 70.00%, best 73.53% (DGL)
```
-python3 entity_classify_mp.py -d mutag --l2norm 5e-4 --n-bases 30 --testing --gpu 0 --batch-size 256 --use-self-loop --n-epochs 40
+python3 entity_classify_mp.py -d mutag --l2norm 5e-4 --n-bases 30 --testing --gpu 0 --batch-size 64 --fanout '50,40' --use-self-loop --dgl-sparse --n-epochs 30 --sparse-lr 0.01 --dropout 0.7
```
-BGS: accuracy avg(5 runs) 91.72%, best 96.55% (DGL)
+BGS: accuracy avg(5 runs) 84.83%, best 89.66% (DGL)
```
-python3 entity_classify_mp.py -d bgs --l2norm 5e-4 --n-bases 40 --testing --gpu 0 --fanout '40,40' --n-epochs=40 --batch-size=128
+python3 entity_classify_mp.py -d bgs --l2norm 5e-4 --n-bases 40 --testing --gpu 0 --fanout '50,40' --n-epochs=20 --batch-size=32 --dgl-sparse --lr 0.01 --sparse-lr 0.01 --dropout 0.3
```
-AM: accuracy avg(5 runs) 88.28%, best 90.40% (DGL)
+AM: accuracy avg(5 runs) 88.59%, best 88.89% (DGL)
```
-python3 entity_classify_mp.py -d am --l2norm 5e-4 --n-bases 40 --testing --gpu 0 --fanout '35,35' --batch-size 256 --lr 1e-2 --n-hidden 16 --use-self-loop --n-epochs=40
+python3 entity_classify_mp.py -d am --l2norm 5e-4 --n-bases 40 --testing --gpu 0 --fanout '35,35' --batch-size 64 --n-hidden 16 --use-self-loop --n-epochs=20 --dgl-sparse --lr 0.01 --sparse-lr 0.02 --dropout 0.7
```
### Entity Classification on OGBN-MAG
Test bed: P3-8xlarge
-OGBN-MAG accuracy 46.22
+OGBN-MAG accuracy 45.5 (3 runs)
```
-python3 entity_classify_mp.py -d ogbn-mag --testing --fanout='25,30' --batch-size 512 --n-hidden 64 --lr 0.01 --num-worker 0 --eval-batch-size 8 --low-mem --gpu 0,1,2,3,4,5,6,7 --dropout 0.5 --use-self-loop --n-bases 2 --n-epochs 3 --mix-cpu-gpu --node-feats
+python3 entity_classify_mp.py -d ogbn-mag --testing --fanout='30,30' --batch-size 1024 --n-hidden 128 --lr 0.01 --num-worker 4 --eval-batch-size 8 --low-mem --gpu 0,1,2,3 --dropout 0.7 --use-self-loop --n-bases 2 --n-epochs 3 --node-feats --dgl-sparse --sparse-lr 0.08
```
-OGBN-MAG without node-feats 43.63
+OGBN-MAG without node-feats 42.79
```
-python3 entity_classify_mp.py -d ogbn-mag --testing --fanout='25,25' --batch-size 256 --n-hidden 64 --lr 0.01 --num-worker 0 --eval-batch-size 8 --low-mem --gpu 0,1,2,3,4,5,6,7 --dropout 0.5 --use-self-loop --n-bases 2 --n-epochs 3 --mix-cpu-gpu --layer-norm
+python3 entity_classify_mp.py -d ogbn-mag --testing --fanout='30,30' --batch-size 1024 --n-hidden 128 --lr 0.01 --num-worker 4 --eval-batch-size 8 --low-mem --gpu 0,1,2,3 --dropout 0.7 --use-self-loop --n-bases 2 --n-epochs 3 --dgl-sparse --sparse-lr 0.0
```
Test bed: P2-8xlarge
...
This diff is collapsed.
import torch as th
import torch.nn as nn
+import dgl

class BaseRGCN(nn.Module):
    def __init__(self, num_nodes, h_dim, out_dim, num_rels, num_bases,
                 num_hidden_layers=1, dropout=0,
@@ -48,6 +50,10 @@ class BaseRGCN(nn.Module):
            h = layer(g, h, r, norm)
        return h

+def initializer(emb):
+    emb.uniform_(-1.0, 1.0)
+    return emb

class RelGraphEmbedLayer(nn.Module):
    r"""Embedding layer for featureless heterograph.
    Parameters
@@ -65,8 +71,8 @@ class RelGraphEmbedLayer(nn.Module):
        treat certain input feature as a one-hot encoding feature.
    embed_size : int
        Output embed size
-    embed_name : str, optional
-        Embed name
+    dgl_sparse : bool, optional
+        If true, use dgl.nn.NodeEmbedding; otherwise use torch.nn.Embedding.
    """
    def __init__(self,
                 dev_id,
@@ -75,29 +81,42 @@ class RelGraphEmbedLayer(nn.Module):
                 num_of_ntype,
                 input_size,
                 embed_size,
-                 sparse_emb=False,
-                 embed_name='embed'):
+                 dgl_sparse=False):
        super(RelGraphEmbedLayer, self).__init__()
        self.dev_id = th.device(dev_id if dev_id >= 0 else 'cpu')
        self.embed_size = embed_size
-        self.embed_name = embed_name
        self.num_nodes = num_nodes
-        self.sparse_emb = sparse_emb
+        self.dgl_sparse = dgl_sparse

        # create weight embeddings for each node for each relation
        self.embeds = nn.ParameterDict()
+        self.node_embeds = {} if dgl_sparse else nn.ModuleDict()
        self.num_of_ntype = num_of_ntype
-        self.idmap = th.empty(num_nodes).long()

        for ntype in range(num_of_ntype):
-            if input_size[ntype] is not None:
+            if isinstance(input_size[ntype], int):
+                if dgl_sparse:
+                    self.node_embeds[str(ntype)] = dgl.nn.NodeEmbedding(
+                        input_size[ntype], embed_size, name=str(ntype), init_func=initializer)
+                else:
+                    sparse_emb = th.nn.Embedding(input_size[ntype], embed_size, sparse=True)
+                    nn.init.uniform_(sparse_emb.weight, -1.0, 1.0)
+                    self.node_embeds[str(ntype)] = sparse_emb
+            else:
                input_emb_size = input_size[ntype].shape[1]
                embed = nn.Parameter(th.Tensor(input_emb_size, self.embed_size))
                nn.init.xavier_uniform_(embed)
                self.embeds[str(ntype)] = embed

-        self.node_embeds = th.nn.Embedding(node_tids.shape[0], self.embed_size, sparse=self.sparse_emb)
-        nn.init.uniform_(self.node_embeds.weight, -1.0, 1.0)
+    @property
+    def dgl_emb(self):
+        """Return the dgl.nn.NodeEmbedding objects (an empty list if dgl_sparse is False)."""
+        if self.dgl_sparse:
+            embs = [emb for emb in self.node_embeds.values()]
+            return embs
+        else:
+            return []

    def forward(self, node_ids, node_tids, type_ids, features):
        """Forward computation
@@ -117,14 +136,16 @@ class RelGraphEmbedLayer(nn.Module):
        tensor
            embeddings as the input of the next layer
        """
-        tsd_ids = node_ids.to(self.node_embeds.weight.device)
+        tsd_ids = node_ids.to(self.dev_id)
        embeds = th.empty(node_ids.shape[0], self.embed_size, device=self.dev_id)
        for ntype in range(self.num_of_ntype):
-            if features[ntype] is not None:
-                loc = node_tids == ntype
-                embeds[loc] = features[ntype][type_ids[loc]].to(self.dev_id) @ self.embeds[str(ntype)].to(self.dev_id)
-            else:
-                loc = node_tids == ntype
-                embeds[loc] = self.node_embeds(tsd_ids[loc]).to(self.dev_id)
+            loc = node_tids == ntype
+            if isinstance(features[ntype], int):
+                if self.dgl_sparse:
+                    embeds[loc] = self.node_embeds[str(ntype)](type_ids[loc], self.dev_id)
+                else:
+                    embeds[loc] = self.node_embeds[str(ntype)](type_ids[loc]).to(self.dev_id)
+            else:
+                embeds[loc] = features[ntype][type_ids[loc]].to(self.dev_id) @ self.embeds[str(ntype)].to(self.dev_id)

        return embeds
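To show how the refactored layer is meant to be consumed, here is a hedged sketch of the optimizer wiring. The layer's full constructor arguments are elided in the hunk above, so `embed_layer`, `model`, `dataloader`, and the per-batch tensors below are placeholders, not code from this commit.

```python
# Hedged sketch: with dgl_sparse=True, the dgl.nn.NodeEmbedding parameters go to the new
# dgl.optim sparse optimizer, while ordinary nn.Parameter weights stay with torch.optim.
import itertools
import torch as th
import dgl

emb_optimizer = dgl.optim.SparseAdam(embed_layer.dgl_emb, lr=0.01)
dense_optimizer = th.optim.Adam(
    itertools.chain(model.parameters(), embed_layer.embeds.parameters()), lr=0.01)

for batch in dataloader:                                   # placeholder mini-batch loop
    node_ids, node_tids, type_ids, features, blocks, labels = batch
    feats = embed_layer(node_ids, node_tids, type_ids, features)
    loss = th.nn.functional.cross_entropy(model(blocks, feats), labels)
    emb_optimizer.zero_grad()
    dense_optimizer.zero_grad()
    loss.backward()
    emb_optimizer.step()                                   # sparse update of touched rows
    dense_optimizer.step()                                 # dense update of the rest
```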
@@ -38,6 +38,7 @@ from .transform import *
from .propagate import *
from .random import *
from .data.utils import save_graphs, load_graphs
+from . import optim
from ._deprecate.graph import DGLGraph as DGLGraphStale
from ._deprecate.nodeflow import *
@@ -73,7 +73,6 @@ def load_backend(mod_name):
    else:
        setattr(thismod, api, _gen_missing_api(api, mod_name))

def get_preferred_backend():
    config_path = os.path.join(os.path.expanduser('~'), '.dgl', 'config.json')
    backend_name = None
...
@@ -1615,3 +1615,14 @@ class no_grad(object):
    def __exit__(self, exc_type, exc_value, exc_traceback):
        pass

+class NodeEmbedding(object):
+    """Sparse node embeddings"""
+    def __init__(self):
+        pass
+
+    def __enter__(self):
+        pass
+
+    def __exit__(self, exc_type, exc_value, exc_traceback):
+        pass
"""Sparse optimizer is not supported for mxnet"""
\ No newline at end of file
"""Sparse optimizer is not supported for tensorflow"""
\ No newline at end of file
@@ -5,3 +5,4 @@ from .softmax import *
from .factory import *
from .hetero import *
from .utils import Sequential, WeightBasis
+from .sparse_emb import NodeEmbedding
"""Torch NodeEmbedding."""
from datetime import timedelta
import torch as th
from ...backend import pytorch as F
from ...utils import get_shared_mem_array, create_shared_mem_array
_STORE = None
class NodeEmbedding:
    '''Class for storing node embeddings.

    The class is optimized for training large-scale node embeddings. It updates the embeddings
    in a sparse way and can scale to graphs with millions of nodes. It also supports partitioning
    the embeddings across multiple GPUs (on a single machine) for further acceleration. It does
    not support partitioning across machines.

    Currently, DGL provides two optimizers that work with this NodeEmbedding
    class: ``SparseAdagrad`` and ``SparseAdam``.

    The implementation is based on the torch.distributed package. It depends on the PyTorch
    default distributed process group to collect multi-process information and uses
    ``torch.distributed.TCPStore`` to share metadata across multiple GPU processes.
    It uses the local address '127.0.0.1:12346' to initialize the TCPStore.
Parameters
----------
num_embeddings : int
The number of embeddings. Currently, the number of embeddings has to be the same as
the number of nodes.
embedding_dim : int
The dimension size of embeddings.
name : str
The name of the embeddings. The name should uniquely identify the embeddings in the system.
init_func : callable, optional
The function to create the initial data. If the init function is not provided,
the values of the embeddings are initialized to zero.
Examples
--------
    Before launching multiple GPU processes

    >>> def initializer(emb):
    ...     th.nn.init.xavier_uniform_(emb)
    ...     return emb
In each training process
>>> emb = dgl.nn.NodeEmbedding(g.number_of_nodes(), 10, 'emb', init_func=initializer)
>>> optimizer = dgl.optim.SparseAdam([emb], lr=0.001)
>>> for blocks in dataloader:
... ...
... feats = emb(nids, gpu_0)
... loss = F.sum(feats + 1, 0)
... loss.backward()
... optimizer.step()
'''
def __init__(self, num_embeddings, embedding_dim, name,
init_func=None):
global _STORE
# Check whether it is multi-gpu training or not.
if th.distributed.is_initialized():
rank = th.distributed.get_rank()
world_size = th.distributed.get_world_size()
else:
rank = -1
world_size = 0
self._rank = rank
self._world_size = world_size
host_name = '127.0.0.1'
port = 12346
if rank <= 0:
emb = create_shared_mem_array(name, (num_embeddings, embedding_dim), th.float32)
if init_func is not None:
emb = init_func(emb)
if rank == 0:
if world_size > 1:
                # for multi-GPU training, set up a TCPStore for
                # embedding status synchronization across GPU processes
if _STORE is None:
_STORE = th.distributed.TCPStore(
host_name, port, world_size, True, timedelta(seconds=30))
for _ in range(1, world_size):
                    # notify peer processes that the shared-memory embedding is ready
_STORE.set(name, name)
elif rank > 0:
# receive
if _STORE is None:
_STORE = th.distributed.TCPStore(
host_name, port, world_size, False, timedelta(seconds=30))
_STORE.wait([name])
emb = get_shared_mem_array(name, (num_embeddings, embedding_dim), th.float32)
self._store = _STORE
self._tensor = emb
self._num_embeddings = num_embeddings
self._embedding_dim = embedding_dim
self._name = name
self._optm_state = None # track optimizer state
self._trace = [] # track minibatch
    def __call__(self, node_ids, device=th.device('cpu')):
        """Return the embeddings of the given node IDs.

        Parameters
        ----------
        node_ids : th.tensor
            Index of the embeddings to collect.
        device : th.device
            Target device to put the collected embeddings.
        """
emb = self._tensor[node_ids].to(device)
if F.is_recording():
emb = F.attach_grad(emb)
self._trace.append((node_ids.to(device, non_blocking=True), emb))
return emb
@property
def store(self):
"""Return torch.distributed.TCPStore for
meta data sharing across processes.
Returns
-------
torch.distributed.TCPStore
KVStore used for meta data sharing.
"""
return self._store
@property
def rank(self):
"""Return rank of current process.
Returns
-------
int
The rank of current process.
"""
return self._rank
@property
def world_size(self):
"""Return world size of the pytorch distributed training env.
Returns
-------
int
The world size of the pytorch distributed training env.
"""
return self._world_size
@property
def name(self):
"""Return the name of NodeEmbedding.
Returns
-------
str
The name of NodeEmbedding.
"""
return self._name
@property
def num_embeddings(self):
"""Return the number of embeddings.
Returns
-------
int
The number of embeddings.
"""
return self._num_embeddings
def set_optm_state(self, state):
"""Store the optimizer related state tensor.
Parameters
----------
state : tuple of torch.Tensor
Optimizer related state.
"""
self._optm_state = state
@property
def optm_state(self):
"""Return the optimizer related state tensor.
Returns
-------
tuple of torch.Tensor
The optimizer related state.
"""
return self._optm_state
@property
def trace(self):
"""Return a trace of the indices of embeddings
used in the training step(s).
Returns
-------
[torch.Tensor]
The indices of embeddings used in the training step(s).
"""
return self._trace
def reset_trace(self):
"""Clean up the trace of the indices of embeddings
used in the training step(s).
"""
self._trace = []
@property
def emb_tensor(self):
"""Return the tensor storing the node embeddings
Returns
-------
torch.Tensor
The tensor storing the node embeddings
"""
return self._tensor
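The gradient "trace" that the sparse optimizers consume can be seen in isolation with a small single-process example. This is an illustration with made-up names and sizes, not part of the commit; no torch.distributed initialization is needed in the single-process case.

```python
# Single-process illustration of the trace mechanism (names and sizes are illustrative).
import torch as th
import dgl

emb = dgl.nn.NodeEmbedding(100, 8, 'demo_emb')  # rows start at zero when no init_func is given
nids = th.tensor([0, 3, 3, 7])

feats = emb(nids)                # gathers rows and appends (ids, rows) to emb.trace
loss = feats.sum()
loss.backward()

ids, rows = emb.trace[0]
print(ids.tolist(), rows.grad.shape)  # [0, 3, 3, 7] and torch.Size([4, 8])
emb.reset_trace()                # the sparse optimizers call this after applying the update
```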
"""dgl optims."""
import importlib
import sys
import os
from ..backend import backend_name
from ..utils import expand_as_pair
def _load_backend(mod_name):
mod = importlib.import_module('.%s' % mod_name, __name__)
thismod = sys.modules[__name__]
for api, obj in mod.__dict__.items():
setattr(thismod, api, obj)
_load_backend(backend_name)
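After `_load_backend` runs, the backend-specific classes are re-exported at the package level. A quick sanity check (assuming the PyTorch backend is active; the printed path is indicative, not verbatim output from this commit):

```python
# Quick check of the backend dispatch above (PyTorch backend assumed).
import dgl.optim
print(dgl.optim.SparseAdam)   # e.g. <class 'dgl.optim.pytorch.sparse_optim.SparseAdam'>
```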
"""dgl optims for pytorch."""
from .sparse_optim import SparseAdagrad, SparseAdam
"""Node embedding optimizers"""
import abc
from abc import abstractmethod
import torch as th
from ...utils import get_shared_mem_array, create_shared_mem_array
from ...nn.pytorch import NodeEmbedding
class SparseGradOptimizer(abc.ABC):
    r''' The abstract sparse optimizer.

    Note: DGL's sparse optimizers only work with dgl.nn.NodeEmbedding.
Parameters
----------
params : list of NodeEmbedding
The list of NodeEmbeddings.
lr : float
The learning rate.
'''
def __init__(self, params, lr):
self._params = params
self._lr = lr
self._rank = None
self._world_size = None
self._shared_cache = {}
self._clean_grad = False
def step(self):
''' The step function.
The step function is invoked at the end of every batch to update embeddings
'''
with th.no_grad():
            # Frequently allocating and freeing shared memory to hold intermediate tensors is
            # expensive, so shared-memory buffers are cached in self._shared_cache; shared_emb
            # collects the (indices, gradients) gathered for each embedding in this step.
shared_emb = {emb.name: ([], []) for emb in self._params}
# Go through all sparse embeddings
for emb in self._params: # pylint: disable=too-many-nested-blocks
num_embeddings = emb.num_embeddings
emb_name = emb.name
                # Each GPU process takes responsibility for updating a disjoint range of
                # each sparse embedding, so the gradient updates can run in parallel.
range_size = (num_embeddings + self._world_size - 1) // self._world_size \
if self._world_size > 0 else 0
for idx, data in emb._trace:
grad = data.grad.data
device = grad.device
idx_dtype = idx.dtype
grad_dtype = grad.dtype
grad_dim = grad.shape[1]
if self._world_size > 0:
if emb_name not in self._shared_cache:
self._shared_cache[emb_name] = {}
for i in range(self._world_size):
start = i * range_size
end = (i + 1) * range_size \
if (i + 1) * range_size < num_embeddings \
else num_embeddings
if i == 0:
mask = idx < end
elif i + 1 == self._world_size:
mask = idx >= start
else:
mask = th.logical_and((idx >= start), (idx < end))
idx_i = idx[mask]
grad_i = grad[mask]
if i == self._rank:
shared_emb[emb_name][0].append(idx_i)
shared_emb[emb_name][1].append(grad_i)
else:
# currently nccl does not support Alltoallv operation
# we need to use CPU shared memory to share gradient
# across processes
idx_i = idx_i.to(th.device('cpu'))
grad_i = grad_i.to(th.device('cpu'))
idx_shmem_name = 'idx_{}_{}_{}'.format(emb_name, self._rank, i)
grad_shmem_name = 'grad_{}_{}_{}'.format(emb_name, self._rank, i)
if idx_shmem_name not in self._shared_cache[emb_name] or \
self._shared_cache[emb_name][idx_shmem_name].shape[0] \
< idx_i.shape[0]:
# in case idx_i.shape[0] is 0
idx_shmem = create_shared_mem_array(idx_shmem_name, \
(idx_i.shape[0] * 2 + 2,), idx_dtype)
grad_shmem = create_shared_mem_array(grad_shmem_name, \
(idx_i.shape[0] * 2 + 2, grad_dim), grad_dtype)
self._shared_cache[emb_name][idx_shmem_name] = idx_shmem
self._shared_cache[emb_name][grad_shmem_name] = grad_shmem
self._shared_cache[emb_name][idx_shmem_name][:idx_i.shape[0]] \
= idx_i
self._shared_cache[emb_name][grad_shmem_name][:idx_i.shape[0]] \
= grad_i
emb.store.set(idx_shmem_name, str(idx_i.shape[0]))
# gather gradients from all other processes
for i in range(self._world_size):
if i != self._rank:
idx_shmem_name = 'idx_{}_{}_{}'.format(emb_name, i, self._rank)
grad_shmem_name = 'grad_{}_{}_{}'.format(emb_name, i, self._rank)
size = int(emb.store.get(idx_shmem_name))
if idx_shmem_name not in self._shared_cache[emb_name] or \
self._shared_cache[emb_name][idx_shmem_name].shape[0] < size:
idx_shmem = get_shared_mem_array(idx_shmem_name, \
(size * 2 + 2,), idx_dtype)
grad_shmem = get_shared_mem_array(grad_shmem_name, \
(size * 2 + 2, grad_dim), grad_dtype)
self._shared_cache[emb_name][idx_shmem_name] = idx_shmem
self._shared_cache[emb_name][grad_shmem_name] = grad_shmem
idx_i = self._shared_cache[emb_name][idx_shmem_name][:size]
grad_i = self._shared_cache[emb_name][grad_shmem_name][:size]
shared_emb[emb_name][0].append(idx_i.to(device,
non_blocking=True))
shared_emb[emb_name][1].append(grad_i.to(device,
non_blocking=True))
else:
shared_emb[emb_name][0].append(idx)
shared_emb[emb_name][1].append(grad)
if self._clean_grad:
# clean gradient track
for emb in self._params:
emb.reset_trace()
self._clean_grad = False
for emb in self._params:
emb_name = emb.name
idx = th.cat(shared_emb[emb_name][0], dim=0)
grad = th.cat(shared_emb[emb_name][1], dim=0)
self.update(idx, grad, emb)
# synchronized gradient update
if self._world_size > 1:
th.distributed.barrier()
@abstractmethod
def update(self, idx, grad, emb):
""" Update embeddings in a sparse manner
Sparse embeddings are updated in mini batches. we maintains gradient states for
each embedding so they can be updated separately.
Parameters
----------
idx : tensor
Index of the embeddings to be updated.
grad : tensor
Gradient of each embedding.
emb : dgl.nn.NodeEmbedding
Sparse node embedding to update.
"""
def zero_grad(self):
"""clean grad cache
"""
self._clean_grad = True
class SparseAdagrad(SparseGradOptimizer):
r''' Node embedding optimizer using the Adagrad algorithm.
This optimizer implements a sparse version of Adagrad algorithm for
optimizing :class:`dgl.nn.NodeEmbedding`. Being sparse means it only updates
the embeddings whose gradients have updates, which are usually a very
small portion of the total embeddings.
Adagrad maintains a :math:`G_{t,i,j}` for every parameter in the embeddings, where
:math:`G_{t,i,j}=G_{t-1,i,j} + g_{t,i,j}^2` and :math:`g_{t,i,j}` is the gradient of
the dimension :math:`j` of embedding :math:`i` at step :math:`t`.
Parameters
----------
params : list[dgl.nn.NodeEmbedding]
The list of dgl.nn.NodeEmbedding.
lr : float
The learning rate.
eps : float, Optional
The term added to the denominator to improve numerical stability
Default: 1e-10
Examples
--------
    >>> def initializer(emb):
    ...     th.nn.init.xavier_uniform_(emb)
    ...     return emb
>>> emb = dgl.nn.NodeEmbedding(g.number_of_nodes(), 10, 'emb', init_func=initializer)
>>> optimizer = dgl.optim.SparseAdagrad([emb], lr=0.001)
>>> for blocks in dataloader:
... ...
... feats = emb(nids, gpu_0)
... loss = F.sum(feats + 1, 0)
... loss.backward()
... optimizer.step()
'''
def __init__(self, params, lr, eps=1e-10):
super(SparseAdagrad, self).__init__(params, lr)
self._eps = eps
        # We need to register a gradient-state tensor (in shared memory) for each embedding.
for emb in params:
assert isinstance(emb, NodeEmbedding), \
'SparseAdagrad only supports dgl.nn.NodeEmbedding'
if self._rank is None:
self._rank = emb.rank
self._world_size = emb.world_size
else:
assert self._rank == emb.rank, \
'MultiGPU rank for each embedding should be same.'
assert self._world_size == emb.world_size, \
'MultiGPU world_size for each embedding should be same.'
if self._rank <= 0:
emb_name = emb.name
state = create_shared_mem_array(emb_name+'_state', \
emb.emb_tensor.shape, th.float32).zero_()
            if self._rank == 0:
                for _ in range(1, self._world_size):
                    # notify peer processes that the optimizer state is ready
                    emb.store.set(emb_name+'_opt', emb_name)
elif self._rank > 0:
# receive
emb_name = emb.name
emb.store.wait([emb_name+'_opt'])
state = get_shared_mem_array(emb_name+'_state', \
emb.emb_tensor.shape, th.float32)
emb.set_optm_state(state)
def update(self, idx, grad, emb):
""" Update embeddings in a sparse manner
Sparse embeddings are updated in mini batches. we maintains gradient states for
each embedding so they can be updated separately.
Parameters
----------
idx : tensor
Index of the embeddings to be updated.
grad : tensor
Gradient of each embedding.
emb : dgl.nn.NodeEmbedding
Sparse embedding to update.
"""
eps = self._eps
clr = self._lr
# the update is non-linear so indices must be unique
grad_indices, inverse, cnt = th.unique(idx, return_inverse=True, return_counts=True)
grad_values = th.zeros((grad_indices.shape[0], grad.shape[1]), device=grad.device)
grad_values.index_add_(0, inverse, grad)
grad_values = grad_values / cnt.unsqueeze(1)
grad_sum = (grad_values * grad_values)
state = emb.optm_state
state_dev = state.device
state_idx = grad_indices.to(state_dev)
grad_state = state[state_idx].to(grad.device)
grad_state += grad_sum
state[state_idx] = grad_state.to(state_dev)
std_values = grad_state.add_(eps).sqrt_()
tmp = clr * grad_values / std_values
emb.emb_tensor[state_idx] -= tmp.to(state_dev)
class SparseAdam(SparseGradOptimizer):
r''' Node embedding optimizer using the Adam algorithm.
    This optimizer implements a sparse version of the Adam algorithm for
    optimizing :class:`dgl.nn.NodeEmbedding`. Being sparse means it only
    updates the embeddings whose gradients have updates, which are usually
    a very small portion of the total embeddings.

    Adam maintains :math:`Gm_{t,i,j}` and :math:`Gp_{t,i,j}` for every parameter
    in the embeddings, where
    :math:`Gm_{t,i,j} = beta1 * Gm_{t-1,i,j} + (1-beta1) * g_{t,i,j}`,
    :math:`Gp_{t,i,j} = beta2 * Gp_{t-1,i,j} + (1-beta2) * g_{t,i,j}^2`, and the
    update applied at step :math:`t` is
    :math:`u_{t,i,j} = lr * (Gm_{t,i,j} / (1 - beta1^t)) / (\sqrt{Gp_{t,i,j} / (1 - beta2^t)} + eps)`,
    where :math:`g_{t,i,j}` is the gradient of the dimension :math:`j` of embedding :math:`i`
    at step :math:`t`.
Parameters
----------
params : list[dgl.nn.NodeEmbedding]
The list of dgl.nn.NodeEmbeddings.
lr : float
The learning rate.
betas : tuple[float, float], Optional
Coefficients used for computing running averages of gradient and its square.
Default: (0.9, 0.999)
eps : float, Optional
The term added to the denominator to improve numerical stability
Default: 1e-8
    Examples
    --------
    >>> def initializer(emb):
    ...     th.nn.init.xavier_uniform_(emb)
    ...     return emb
>>> emb = dgl.nn.NodeEmbedding(g.number_of_nodes(), 10, 'emb', init_func=initializer)
>>> optimizer = dgl.optim.SparseAdam([emb], lr=0.001)
>>> for blocks in dataloader:
... ...
... feats = emb(nids, gpu_0)
... loss = F.sum(feats + 1, 0)
... loss.backward()
... optimizer.step()
'''
def __init__(self, params, lr, betas=(0.9, 0.999), eps=1e-08):
super(SparseAdam, self).__init__(params, lr)
self._lr = lr
self._beta1 = betas[0]
self._beta2 = betas[1]
self._eps = eps
        # We need to register state tensors (step count, first and second moments) in shared
        # memory for each embedding.
for emb in params:
assert isinstance(emb, NodeEmbedding), \
'SparseAdam only supports dgl.nn.NodeEmbedding'
if self._rank is None:
self._rank = emb.rank
self._world_size = emb.world_size
else:
assert self._rank == emb.rank, \
'MultiGPU rank for each embedding should be same.'
assert self._world_size == emb.world_size, \
'MultiGPU world_size for each embedding should be same.'
if self._rank <= 0:
emb_name = emb.name
state_step = create_shared_mem_array(emb_name+'_step', \
(emb.emb_tensor.shape[0],), th.float32).zero_()
state_mem = create_shared_mem_array(emb_name+'_mem', \
emb.emb_tensor.shape, th.float32).zero_()
state_power = create_shared_mem_array(emb_name+'_power', \
emb.emb_tensor.shape, th.float32).zero_()
if self._rank == 0:
state = (state_step, state_mem, state_power)
emb_name = emb.name
for _ in range(1, self._world_size):
                    # notify peer processes that the optimizer state is ready
emb.store.set(emb_name+'_opt', emb_name)
elif self._rank > 0:
# receive
emb_name = emb.name
emb.store.wait([emb_name+'_opt'])
state_step = get_shared_mem_array(emb_name+'_step', \
(emb.emb_tensor.shape[0],), th.float32)
state_mem = get_shared_mem_array(emb_name+'_mem', \
emb.emb_tensor.shape, th.float32)
state_power = get_shared_mem_array(emb_name+'_power', \
emb.emb_tensor.shape, th.float32)
state = (state_step, state_mem, state_power)
emb.set_optm_state(state)
def update(self, idx, grad, emb):
""" Update embeddings in a sparse manner
Sparse embeddings are updated in mini batches. we maintains gradient states for
each embedding so they can be updated separately.
Parameters
----------
idx : tensor
Index of the embeddings to be updated.
grad : tensor
Gradient of each embedding.
emb : dgl.nn.NodeEmbedding
Sparse embedding to update.
"""
with th.no_grad():
beta1 = self._beta1
beta2 = self._beta2
eps = self._eps
clr = self._lr
state_step, state_mem, state_power = emb.optm_state
exec_dev = grad.device
state_dev = state_step.device
# There can be duplicated indices due to sampling.
# Thus unique them here and average the gradient here.
grad_indices, inverse, cnt = th.unique(idx,
return_inverse=True,
return_counts=True)
state_idx = grad_indices.to(state_dev)
state_step[state_idx] += 1
state_step = state_step[state_idx].to(exec_dev, non_blocking=True)
orig_mem = state_mem[state_idx].to(exec_dev, non_blocking=True)
orig_power = state_power[state_idx].to(exec_dev, non_blocking=True)
grad_values = th.zeros((grad_indices.shape[0], grad.shape[1]), device=exec_dev)
grad_values.index_add_(0, inverse, grad)
grad_values = grad_values / cnt.unsqueeze(1)
grad_mem = grad_values
grad_power = grad_values * grad_values
update_mem = beta1 * orig_mem + (1.-beta1) * grad_mem
update_power = beta2 * orig_power + (1.-beta2) * grad_power
state_mem[state_idx] = update_mem.to(state_dev, non_blocking=True)
state_power[state_idx] = update_power.to(state_dev, non_blocking=True)
update_mem_corr = update_mem / (1. - th.pow(th.tensor(beta1, device=exec_dev),
state_step)).unsqueeze(1)
update_power_corr = update_power / (1. - th.pow(th.tensor(beta2, device=exec_dev),
state_step)).unsqueeze(1)
std_values = clr * update_mem_corr / (th.sqrt(update_power_corr) + eps)
emb.emb_tensor[state_idx] -= std_values.to(state_dev)
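To make the ownership logic in ``step()`` easier to follow, here is a standalone sketch of how each rank selects the gradient rows it is responsible for. The numbers are illustrative only; no shared memory or TCPStore is involved.

```python
# Standalone illustration of the per-rank range partitioning used in step().
import torch as th

num_embeddings, world_size = 10, 3
range_size = (num_embeddings + world_size - 1) // world_size   # ceiling division -> 4

idx = th.tensor([0, 2, 3, 5, 9, 9])        # embedding rows touched in this mini-batch
for rank in range(world_size):
    start = rank * range_size
    end = min((rank + 1) * range_size, num_embeddings)
    if rank == 0:
        mask = idx < end
    elif rank + 1 == world_size:
        mask = idx >= start
    else:
        mask = th.logical_and(idx >= start, idx < end)
    print(rank, idx[mask].tolist())
# 0 [0, 2, 3]   (owns rows [0, 4))
# 1 [5]         (owns rows [4, 8))
# 2 [9, 9]      (owns rows [8, 10))
```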
@@ -2,3 +2,4 @@
from .internal import *
from .data import *
from .checks import *
+from .shared_mem import *
"""Shared memory utilities."""
from .. import backend as F
from .._ffi.ndarray import empty_shared_mem
def get_shared_mem_array(name, shape, dtype):
""" Get a tensor from shared memory with specific name
Parameters
----------
name : str
The unique name of the shared memory
shape : tuple of int
The shape of the returned tensor
dtype : F.dtype
The dtype of the returned tensor
Returns
-------
F.tensor
The tensor got from shared memory.
"""
name = 'DGL_'+name
new_arr = empty_shared_mem(name, False, shape, F.reverse_data_type_dict[dtype])
dlpack = new_arr.to_dlpack()
return F.zerocopy_from_dlpack(dlpack)
def create_shared_mem_array(name, shape, dtype):
""" Create a tensor from shared memory with the specific name
Parameters
----------
name : str
The unique name of the shared memory
shape : tuple of int
The shape of the returned tensor
dtype : F.dtype
The dtype of the returned tensor
Returns
-------
F.tensor
The created tensor.
"""
name = 'DGL_'+name
new_arr = empty_shared_mem(name, True, shape, F.reverse_data_type_dict[dtype])
dlpack = new_arr.to_dlpack()
return F.zerocopy_from_dlpack(dlpack)
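A hedged round-trip sketch of the two helpers above (single machine; the name and shape are arbitrary examples, not values used in this commit):

```python
# Round-trip sketch: one side creates the shared array, the other attaches to it by name.
# On Linux the buffer typically appears as /dev/shm/DGL_demo_state.
import torch as th
from dgl.utils import create_shared_mem_array, get_shared_mem_array

arr = create_shared_mem_array('demo_state', (4, 2), th.float32)
arr[:] = 1.0

view = get_shared_mem_array('demo_state', (4, 2), th.float32)   # same underlying memory
view[0, 0] = 5.0
print(arr[0, 0])   # tensor(5.)
```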
@@ -32,7 +32,7 @@ class SharedMemoryResource: public Resource {
  }

  void Destroy() {
-    LOG(INFO) << "remove " << name << " for shared memory";
+    // LOG(INFO) << "remove " << name << " for shared memory";
    shm_unlink(name.c_str());
  }
};
@@ -55,7 +55,7 @@ SharedMemory::~SharedMemory() {
  CHECK(munmap(ptr_, size_) != -1) << strerror(errno);
  close(fd_);
  if (own_) {
-    LOG(INFO) << "remove " << name << " for shared memory";
+    // LOG(INFO) << "remove " << name << " for shared memory";
    shm_unlink(name.c_str());
    // The resource has been deleted. We don't need to keep track of it any more.
    DeleteResource(name);
...