"git@developer.sourcefind.cn:OpenDAS/vision.git" did not exist on "90a2402b11b58705884b564b7bfce325f1c7d7c7"
Unverified commit 23afe911, authored by xiang song (charlie.song), committed by GitHub

[Bug Fix] Fix several sparse optimizer bugs (#2596)



* Fix pytorch TCP kvstore bug

* lint

* Fix

* upd

* Fix lint

* Fix

* trigger

* fix
Co-authored-by: Ubuntu <ubuntu@ip-172-31-56-220.ec2.internal>
parent 2f71bc50
@@ -68,7 +68,7 @@ python3 entity_classify_mp.py -d ogbn-mag --testing --fanout='30,30' --batch-siz
 OGBN-MAG without node-feats    42.79
 ```
-python3 entity_classify_mp.py -d ogbn-mag --testing --fanout='30,30' --batch-size 1024 --n-hidden 128 --lr 0.01 --num-worker 4 --eval-batch-size 8 --low-mem --gpu 0,1,2,3 --dropout 0.7 --use-self-loop --n-bases 2 --n-epochs 3 --dgl-sparse --sparse-lr 0.0
+python3 entity_classify_mp.py -d ogbn-mag --testing --fanout='30,30' --batch-size 1024 --n-hidden 128 --lr 0.01 --num-worker 4 --eval-batch-size 8 --low-mem --gpu 0,1,2,3 --dropout 0.7 --use-self-loop --n-bases 2 --n-epochs 3 --dgl-sparse --sparse-lr 0.08
 ```
 Test-bd: P2-8xlarge
......
@@ -324,6 +324,7 @@ def run(proc_id, n_gpus, n_cpus, args, devices, dataset, split, queue=None):
     validation_time = 0
     test_time = 0
     last_val_acc = 0.0
+    do_test = False
     if n_gpus > 1 and n_cpus - args.num_workers > 0:
         th.set_num_threads(n_cpus-args.num_workers)
     for epoch in range(args.n_epochs):
@@ -405,7 +406,7 @@ def run(proc_id, n_gpus, n_cpus, args, devices, dataset, split, queue=None):
             vend = time.time()
             validation_time += (vend - vstart)
-        if (epoch + 1) > (args.n_epochs / 2) and do_test:
+        if epoch > 0 and do_test:
             tstart = time.time()
             if (queue is not None) or (proc_id == 0):
                 test_logits, test_seeds = evaluate(model, embed_layer, test_loader, node_feats)
......
@@ -75,8 +75,7 @@ class NodeEmbedding: # NodeEmbedding
            emb = create_shared_mem_array(name, (num_embeddings, embedding_dim), th.float32)
            if init_func is not None:
                emb = init_func(emb)
-        if rank == 0:
-            if world_size > 1:
+        if rank == 0: # the master gpu process
            # for multi-gpu training, setup a TCPStore for
            # embeding status synchronization across GPU processes
            if _STORE is None:
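
The `emb.store` object used by the optimizer hunks below (via `set`, `wait`, and `get`) is the `torch.distributed.TCPStore` referenced in the comment above: rank 0 hosts it and publishes keys, while the other ranks block until those keys appear; the hunk also drops the old `world_size > 1` guard, so the store is now created whenever the process is rank 0. A minimal standalone sketch of that handshake, with a made-up host, port, and key name (not DGL's actual setup code):

```python
import torch.distributed as dist

def create_store(rank, world_size, host="127.0.0.1", port=29500):
    # Rank 0 hosts the TCPStore; every other rank connects to it as a client.
    return dist.TCPStore(host, port, world_size, rank == 0)

def sync_key(store, rank, key="emb_opt_meta"):
    if rank == 0:
        # The master announces that the shared state behind `key` is ready.
        store.set(key, "ready")
    else:
        # Workers block until the master has published the key, then read it.
        store.wait([key])
        return store.get(key)
```
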
......
"""Node embedding optimizers""" """Node embedding optimizers"""
import abc import abc
from abc import abstractmethod from abc import abstractmethod
import gc
import torch as th import torch as th
from ...utils import get_shared_mem_array, create_shared_mem_array from ...utils import get_shared_mem_array, create_shared_mem_array
@@ -25,6 +26,34 @@ class SparseGradOptimizer(abc.ABC):
         self._world_size = None
         self._shared_cache = {}
         self._clean_grad = False
+        self._opt_meta = {}
+
+        for emb in params:
+            assert isinstance(emb, NodeEmbedding), \
+                'DGL SparseOptimizer only supports dgl.nn.NodeEmbedding'
+
+            if self._rank is None:
+                self._rank = emb.rank
+                self._world_size = emb.world_size
+            else:
+                assert self._rank == emb.rank, \
+                    'MultiGPU rank for each embedding should be same.'
+                assert self._world_size == emb.world_size, \
+                    'MultiGPU world_size for each embedding should be same.'
+
+            emb_name = emb.name
+            if self._rank == 0: # the master gpu process
+                opt_meta = create_shared_mem_array(emb_name+'_opt_meta', \
+                    (self._world_size, self._world_size), th.int32).zero_()
+                if self._rank == 0:
+                    emb.store.set(emb_name+'_opt_meta', emb_name)
+                self._opt_meta[emb_name] = opt_meta
+            elif self._rank > 0:
+                # receive
+                emb.store.wait([emb_name+'_opt_meta'])
+                opt_meta = get_shared_mem_array(emb_name+'_opt_meta', \
+                    (self._world_size, self._world_size), th.int32)
+                self._opt_meta[emb_name] = opt_meta

     def step(self):
         ''' The step function.
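
The new `_opt_meta` entry created above is a `world_size x world_size` int32 array in shared memory; the `step()` hunks below write `idx_i.shape[0]` into `opt_meta[rank][i]`, so after the barrier each receiver can read how many gradient rows its peers staged for it. A toy sketch of that bookkeeping using a plain shared PyTorch tensor in place of DGL's `create_shared_mem_array` (single machine; names and sizes are illustrative only):

```python
import torch as th

WORLD_SIZE = 4

# opt_meta[i][j] = number of gradient rows process i has staged for process j.
# In a real multi-process run this tensor must exist before the workers fork
# (or be attached by name, as DGL's shared-memory helpers do); share_memory_()
# is the toy stand-in here.
opt_meta = th.zeros(WORLD_SIZE, WORLD_SIZE, dtype=th.int32).share_memory_()

def stage_sizes(rank, idx):
    # Sender side: count how many touched indices belong to each destination rank.
    owner = th.remainder(idx, WORLD_SIZE)
    for dst in range(WORLD_SIZE):
        opt_meta[rank][dst] = int((owner == dst).sum())

def rows_to_receive(rank, src):
    # Receiver side (after a barrier): how many rows to pull from `src`.
    return int(opt_meta[src][rank])
```
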
@@ -36,37 +65,49 @@ class SparseGradOptimizer(abc.ABC):
         # We cache shared memory buffers in shared_emb.
         shared_emb = {emb.name: ([], []) for emb in self._params}
+        # hold released shared memory to let other process to munmap it first
+        # unless it will crash the training
+        shmem_ptr_holder = []
         # Go through all sparse embeddings
         for emb in self._params: # pylint: disable=too-many-nested-blocks
-            num_embeddings = emb.num_embeddings
             emb_name = emb.name
-            # Each gpu process takes the resposibility of update a range of sparse embedding,
-            # thus we can parallel the gradient update.
-            range_size = (num_embeddings + self._world_size - 1) // self._world_size \
-                if self._world_size > 0 else 0
-            for idx, data in emb._trace:
-                grad = data.grad.data
+            # we need to combine gradients from multiple forward paths
+            idx = []
+            grad = []
+            for i, data in emb._trace:
+                idx.append(i)
+                grad.append(data.grad.data)
+            idx = th.cat(idx, dim=0)
+            grad = th.cat(grad, dim=0)
             device = grad.device
             idx_dtype = idx.dtype
             grad_dtype = grad.dtype
             grad_dim = grad.shape[1]
-            if self._world_size > 0:
+            if self._world_size > 1:
                 if emb_name not in self._shared_cache:
                     self._shared_cache[emb_name] = {}
+                # Each training process takes the resposibility of updating a range
+                # of node embeddings, thus we can parallel the gradient update.
+                # The overall progress includes:
+                #   1. In each training process:
+                #     1.a Deciding which process a node embedding belongs to according
+                #         to the formula: process_id = node_idx mod num_of_process(N)
+                #     1.b Split the node index tensor and gradient tensor into N parts
+                #         according to step 1.
+                #     1.c Write each node index sub-tensor and gradient sub-tensor into
+                #         different DGL shared memory buffers.
+                #   2. Cross training process synchronization
+                #   3. In each traning process:
+                #     3.a Collect node index sub-tensors and gradient sub-tensors
+                #     3.b Do gradient update
+                #   4. Done
+                idx_split = th.remainder(idx, self._world_size).long()
                 for i in range(self._world_size):
-                    start = i * range_size
-                    end = (i + 1) * range_size \
-                        if (i + 1) * range_size < num_embeddings \
-                        else num_embeddings
-                    if i == 0:
-                        mask = idx < end
-                    elif i + 1 == self._world_size:
-                        mask = idx >= start
-                    else:
-                        mask = th.logical_and((idx >= start), (idx < end))
+                    mask = idx_split == i
                     idx_i = idx[mask]
                     grad_i = grad[mask]
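
The long comment block added above describes the partition rule behind the new code: the embedding row with index `node_idx` is owned and updated by process `node_idx mod N`, so every trainer splits its local index and gradient tensors into `N` buckets before the exchange. A self-contained sketch of just that split, with toy tensors and no shared memory:

```python
import torch as th

def split_by_owner(idx, grad, world_size):
    """Bucket (index, gradient) rows by owning rank, owner = idx % world_size."""
    owner = th.remainder(idx, world_size).long()
    buckets = []
    for dst in range(world_size):
        mask = owner == dst
        buckets.append((idx[mask], grad[mask]))
    return buckets

# Example: 6 touched embedding rows, 2 trainer processes, 16-dim gradients.
idx = th.tensor([0, 3, 4, 7, 8, 9])
grad = th.randn(6, 16)
buckets = split_by_owner(idx, grad, world_size=2)
# buckets[0] holds rows 0, 4, 8 (even indices); buckets[1] holds rows 3, 7, 9.
```
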
@@ -82,9 +123,18 @@ class SparseGradOptimizer(abc.ABC):
                    idx_shmem_name = 'idx_{}_{}_{}'.format(emb_name, self._rank, i)
                    grad_shmem_name = 'grad_{}_{}_{}'.format(emb_name, self._rank, i)
+                    # Create shared memory to hold temporary index and gradient tensor for
+                    # cross-process send and recv.
                    if idx_shmem_name not in self._shared_cache[emb_name] or \
                        self._shared_cache[emb_name][idx_shmem_name].shape[0] \
                        < idx_i.shape[0]:
+                        if idx_shmem_name in self._shared_cache[emb_name]:
+                            shmem_ptr_holder.append(
+                                self._shared_cache[emb_name][idx_shmem_name])
+                            shmem_ptr_holder.append(
+                                self._shared_cache[emb_name][grad_shmem_name])
                        # in case idx_i.shape[0] is 0
                        idx_shmem = create_shared_mem_array(idx_shmem_name, \
                            (idx_i.shape[0] * 2 + 2,), idx_dtype)
@@ -93,18 +143,31 @@ class SparseGradOptimizer(abc.ABC):
                        self._shared_cache[emb_name][idx_shmem_name] = idx_shmem
                        self._shared_cache[emb_name][grad_shmem_name] = grad_shmem
+                    # Fill shared memory with temporal index tensor and gradient tensor
                    self._shared_cache[emb_name][idx_shmem_name][:idx_i.shape[0]] \
                        = idx_i
                    self._shared_cache[emb_name][grad_shmem_name][:idx_i.shape[0]] \
                        = grad_i
-                    emb.store.set(idx_shmem_name, str(idx_i.shape[0]))
+                    self._opt_meta[emb_name][self._rank][i] = idx_i.shape[0]
+            else:
+                shared_emb[emb_name][0].append(idx)
+                shared_emb[emb_name][1].append(grad)
+
+        # make sure the idx shape is passed to each process through opt_meta
+        if self._world_size > 1:
+            th.distributed.barrier()
+        for emb in self._params: # pylint: disable=too-many-nested-blocks
+            emb_name = emb.name
+            if self._world_size > 1:
                # gather gradients from all other processes
                for i in range(self._world_size):
                    if i != self._rank:
                        idx_shmem_name = 'idx_{}_{}_{}'.format(emb_name, i, self._rank)
                        grad_shmem_name = 'grad_{}_{}_{}'.format(emb_name, i, self._rank)
-                        size = int(emb.store.get(idx_shmem_name))
+                        size = self._opt_meta[emb_name][i][self._rank]
+                        # Retrive shared memory holding the temporal index and gradient
+                        # tensor that is sent to current training process
                        if idx_shmem_name not in self._shared_cache[emb_name] or \
                            self._shared_cache[emb_name][idx_shmem_name].shape[0] < size:
                            idx_shmem = get_shared_mem_array(idx_shmem_name, \
@@ -113,15 +176,16 @@ class SparseGradOptimizer(abc.ABC):
                                (size * 2 + 2, grad_dim), grad_dtype)
                            self._shared_cache[emb_name][idx_shmem_name] = idx_shmem
                            self._shared_cache[emb_name][grad_shmem_name] = grad_shmem
+                            # make sure shared memory are released in child process first
+                            # This will not be called frequently
+                            # TODO(xiangsx) Provide API to mumap shared memory directly
+                            gc.collect()
                        idx_i = self._shared_cache[emb_name][idx_shmem_name][:size]
                        grad_i = self._shared_cache[emb_name][grad_shmem_name][:size]
                        shared_emb[emb_name][0].append(idx_i.to(device,
                                                                non_blocking=True))
                        shared_emb[emb_name][1].append(grad_i.to(device,
                                                                 non_blocking=True))
-            else:
-                shared_emb[emb_name][0].append(idx)
-                shared_emb[emb_name][1].append(grad)

        if self._clean_grad:
            # clean gradient track
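
Putting the pieces of the last few hunks together: after every process has filled its outgoing buffers and `th.distributed.barrier()` has run, each rank attaches to the buffers its peers named `idx_{emb}_{src}_{dst}` / `grad_{emb}_{src}_{dst}` and slices off the first `size` rows, where `size` comes from the shared `opt_meta` matrix. A simplified walk-through of that receive pass, assuming DGL's shared-memory helper is importable as `dgl.utils.get_shared_mem_array` and that `opt_meta` and the dtypes were set up as in the `__init__` hunk (a sketch, not the library's exact code path):

```python
import torch as th
from dgl.utils import get_shared_mem_array  # assumed import path for DGL's helper

def gather_remote_grads(emb_name, rank, world_size, opt_meta,
                        idx_dtype, grad_dtype, grad_dim):
    """Collect the index/gradient rows that every other rank staged for `rank`."""
    idx_parts, grad_parts = [], []
    for src in range(world_size):
        if src == rank:
            continue
        size = int(opt_meta[src][rank])      # rows that `src` wrote for us
        if size == 0:
            continue
        # Attach to the sender's buffers by name, mirroring the naming scheme above.
        idx_shm = get_shared_mem_array('idx_{}_{}_{}'.format(emb_name, src, rank),
                                       (size * 2 + 2,), idx_dtype)
        grad_shm = get_shared_mem_array('grad_{}_{}_{}'.format(emb_name, src, rank),
                                        (size * 2 + 2, grad_dim), grad_dtype)
        # Copy out of shared memory before the sender reuses the buffer.
        idx_parts.append(idx_shm[:size].clone())
        grad_parts.append(grad_shm[:size].clone())
    return idx_parts, grad_parts
```
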
@@ -205,21 +269,12 @@ class SparseAdagrad(SparseGradOptimizer):
            assert isinstance(emb, NodeEmbedding), \
                'SparseAdagrad only supports dgl.nn.NodeEmbedding'
-            if self._rank is None:
-                self._rank = emb.rank
-                self._world_size = emb.world_size
-            else:
-                assert self._rank == emb.rank, \
-                    'MultiGPU rank for each embedding should be same.'
-                assert self._world_size == emb.world_size, \
-                    'MultiGPU world_size for each embedding should be same.'
            if self._rank <= 0:
                emb_name = emb.name
                state = create_shared_mem_array(emb_name+'_state', \
                    emb.emb_tensor.shape, th.float32).zero_()
            if self._rank == 0:
-                for _ in range(1, world_size):
-                    # send embs
+                if self._world_size > 1:
                    emb.store.set(emb_name+'_opt', emb_name)
            elif self._rank > 0:
                # receive
@@ -318,14 +373,6 @@ class SparseAdam(SparseGradOptimizer):
            assert isinstance(emb, NodeEmbedding), \
                'SparseAdam only supports dgl.nn.NodeEmbedding'
-            if self._rank is None:
-                self._rank = emb.rank
-                self._world_size = emb.world_size
-            else:
-                assert self._rank == emb.rank, \
-                    'MultiGPU rank for each embedding should be same.'
-                assert self._world_size == emb.world_size, \
-                    'MultiGPU world_size for each embedding should be same.'
            if self._rank <= 0:
                emb_name = emb.name
                state_step = create_shared_mem_array(emb_name+'_step', \
@@ -335,10 +382,8 @@ class SparseAdam(SparseGradOptimizer):
                state_power = create_shared_mem_array(emb_name+'_power', \
                    emb.emb_tensor.shape, th.float32).zero_()
            if self._rank == 0:
-                state = (state_step, state_mem, state_power)
                emb_name = emb.name
-                for _ in range(1, self._world_size):
-                    # send embs
+                if self._world_size > 1:
                    emb.store.set(emb_name+'_opt', emb_name)
            elif self._rank > 0:
                # receive
......