"...git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "2120b4eee35bcc0db5f3acd3900fb31188ed0160"
Unverified commit 23afe911, authored by xiang song (charlie.song), committed by GitHub

[Bug Fix] Fix several sparse optimizer bugs (#2596)



* Fix pytorch TCP kvstore bug

* lint

* Fix

* upd

* Fix lint

* Fix

* trigger

* fix
Co-authored-by: Ubuntu <ubuntu@ip-172-31-56-220.ec2.internal>
parent 2f71bc50
@@ -68,7 +68,7 @@ python3 entity_classify_mp.py -d ogbn-mag --testing --fanout='30,30' --batch-siz
 OGBN-MAG without node-feats 42.79
 ```
-python3 entity_classify_mp.py -d ogbn-mag --testing --fanout='30,30' --batch-size 1024 --n-hidden 128 --lr 0.01 --num-worker 4 --eval-batch-size 8 --low-mem --gpu 0,1,2,3 --dropout 0.7 --use-self-loop --n-bases 2 --n-epochs 3 --dgl-sparse --sparse-lr 0.0
+python3 entity_classify_mp.py -d ogbn-mag --testing --fanout='30,30' --batch-size 1024 --n-hidden 128 --lr 0.01 --num-worker 4 --eval-batch-size 8 --low-mem --gpu 0,1,2,3 --dropout 0.7 --use-self-loop --n-bases 2 --n-epochs 3 --dgl-sparse --sparse-lr 0.08
 ```
 Test-bd: P2-8xlarge
......
@@ -324,6 +324,7 @@ def run(proc_id, n_gpus, n_cpus, args, devices, dataset, split, queue=None):
     validation_time = 0
     test_time = 0
     last_val_acc = 0.0
+    do_test = False
     if n_gpus > 1 and n_cpus - args.num_workers > 0:
         th.set_num_threads(n_cpus-args.num_workers)
     for epoch in range(args.n_epochs):
@@ -405,7 +406,7 @@ def run(proc_id, n_gpus, n_cpus, args, devices, dataset, split, queue=None):
             vend = time.time()
             validation_time += (vend - vstart)
-        if (epoch + 1) > (args.n_epochs / 2) and do_test:
+        if epoch > 0 and do_test:
            tstart = time.time()
            if (queue is not None) or (proc_id == 0):
                test_logits, test_seeds = evaluate(model, embed_layer, test_loader, node_feats)
......
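The hunks above give `do_test` an explicit `False` value before the epoch loop and relax the test guard from "past half of the epochs" to "any epoch after the first". Below is a minimal, hypothetical sketch of that gating pattern, not the example script itself; the improvement-based trigger is an assumption, and only the `do_test = False` initialization and the `epoch > 0 and do_test` guard come from the diff.

```python
# Hypothetical sketch of the test-gating pattern. Assumption: do_test is
# flipped when validation accuracy improves; the diff only shows the
# initialization and the guard.
do_test = False            # initialized before the loop, as the fix does
last_val_acc = 0.0
n_epochs = 3

for epoch in range(n_epochs):
    val_acc = 0.1 * (epoch + 1)      # stand-in for a real validation pass
    if val_acc > last_val_acc:       # assumed trigger for running the test set
        last_val_acc = val_acc
        do_test = True
    if epoch > 0 and do_test:        # the updated guard from the diff
        print("running test pass at epoch", epoch)
        do_test = False
```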
@@ -75,16 +75,15 @@ class NodeEmbedding: # NodeEmbedding
         emb = create_shared_mem_array(name, (num_embeddings, embedding_dim), th.float32)
         if init_func is not None:
             emb = init_func(emb)
-        if rank == 0:
-            if world_size > 1:
-                # for multi-gpu training, setup a TCPStore for
-                # embeding status synchronization across GPU processes
-                if _STORE is None:
-                    _STORE = th.distributed.TCPStore(
-                        host_name, port, world_size, True, timedelta(seconds=30))
-                for _ in range(1, world_size):
-                    # send embs
-                    _STORE.set(name, name)
+        if rank == 0:  # the master gpu process
+            # for multi-gpu training, setup a TCPStore for
+            # embedding status synchronization across GPU processes
+            if _STORE is None:
+                _STORE = th.distributed.TCPStore(
+                    host_name, port, world_size, True, timedelta(seconds=30))
+            for _ in range(1, world_size):
+                # send embs
+                _STORE.set(name, name)
         elif rank > 0:
             # receive
             if _STORE is None:
......
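The change above makes rank 0 set up the `TCPStore` unconditionally instead of only when `world_size > 1`. For readers unfamiliar with the handshake, here is a standalone sketch (not DGL code) of the store-based announce/wait pattern the embedding setup relies on; the host name and port are placeholders.

```python
# Standalone sketch of the TCPStore announce/wait handshake (placeholder
# host/port). Rank 0 hosts the store and publishes a key; other ranks attach
# to the same address and block until the key appears.
from datetime import timedelta
import torch as th

def make_store(rank, world_size, host_name="127.0.0.1", port=12345):
    is_master = rank == 0
    return th.distributed.TCPStore(
        host_name, port, world_size, is_master, timedelta(seconds=30))

# rank 0 (master) process:
#     store = make_store(0, world_size)
#     store.set("node_emb", "node_emb")   # announce that the embedding exists
# rank > 0 processes:
#     store = make_store(rank, world_size)
#     store.wait(["node_emb"])            # block until rank 0 announces it
#     name = store.get("node_emb")
```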
"""Node embedding optimizers""" """Node embedding optimizers"""
import abc import abc
from abc import abstractmethod from abc import abstractmethod
import gc
import torch as th import torch as th
from ...utils import get_shared_mem_array, create_shared_mem_array from ...utils import get_shared_mem_array, create_shared_mem_array
@@ -25,6 +26,34 @@ class SparseGradOptimizer(abc.ABC):
         self._world_size = None
         self._shared_cache = {}
         self._clean_grad = False
+        self._opt_meta = {}
+
+        for emb in params:
+            assert isinstance(emb, NodeEmbedding), \
+                'DGL SparseOptimizer only supports dgl.nn.NodeEmbedding'
+            if self._rank is None:
+                self._rank = emb.rank
+                self._world_size = emb.world_size
+            else:
+                assert self._rank == emb.rank, \
+                    'MultiGPU rank for each embedding should be same.'
+                assert self._world_size == emb.world_size, \
+                    'MultiGPU world_size for each embedding should be same.'
+            emb_name = emb.name
+            if self._rank == 0:  # the master gpu process
+                opt_meta = create_shared_mem_array(emb_name+'_opt_meta', \
+                    (self._world_size, self._world_size), th.int32).zero_()
+                if self._rank == 0:
+                    emb.store.set(emb_name+'_opt_meta', emb_name)
+                    self._opt_meta[emb_name] = opt_meta
+            elif self._rank > 0:
+                # receive
+                emb.store.wait([emb_name+'_opt_meta'])
+                opt_meta = get_shared_mem_array(emb_name+'_opt_meta', \
+                    (self._world_size, self._world_size), th.int32)
+                self._opt_meta[emb_name] = opt_meta
+
     def step(self):
         ''' The step function.
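The constructor now also allocates a per-embedding `opt_meta` matrix of shape `(world_size, world_size)` in shared memory. A toy illustration of the bookkeeping it enables (plain tensors standing in for the DGL shared-memory array):

```python
# Toy illustration of the opt_meta matrix (plain torch tensor standing in for
# the shared-memory array). opt_meta[src][dst] records how many gradient rows
# process `src` staged for process `dst` in the current step.
import torch as th

world_size = 4
opt_meta = th.zeros((world_size, world_size), dtype=th.int32)

# during step(), rank 1 stages 7 gradient rows destined for rank 3:
opt_meta[1][3] = 7

# after the barrier, rank 3 reads its own column to learn how much to copy
# out of rank 1's shared-memory buffer:
size = int(opt_meta[1][3])
print(size)  # prints 7
```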
@@ -36,92 +65,127 @@ class SparseGradOptimizer(abc.ABC):
         # We cache shared memory buffers in shared_emb.
         shared_emb = {emb.name: ([], []) for emb in self._params}
+        # hold released shared memory to let the other processes munmap it first,
+        # otherwise it will crash the training
+        shmem_ptr_holder = []
         # Go through all sparse embeddings
         for emb in self._params: # pylint: disable=too-many-nested-blocks
-            num_embeddings = emb.num_embeddings
             emb_name = emb.name
-            # Each gpu process takes the resposibility of update a range of sparse embedding,
-            # thus we can parallel the gradient update.
-            range_size = (num_embeddings + self._world_size - 1) // self._world_size \
-                if self._world_size > 0 else 0
-            for idx, data in emb._trace:
-                grad = data.grad.data
-                device = grad.device
-                idx_dtype = idx.dtype
-                grad_dtype = grad.dtype
-                grad_dim = grad.shape[1]
-                if self._world_size > 0:
-                    if emb_name not in self._shared_cache:
-                        self._shared_cache[emb_name] = {}
-                    for i in range(self._world_size):
-                        start = i * range_size
-                        end = (i + 1) * range_size \
-                            if (i + 1) * range_size < num_embeddings \
-                            else num_embeddings
-                        if i == 0:
-                            mask = idx < end
-                        elif i + 1 == self._world_size:
-                            mask = idx >= start
-                        else:
-                            mask = th.logical_and((idx >= start), (idx < end))
-                        idx_i = idx[mask]
-                        grad_i = grad[mask]
-                        if i == self._rank:
-                            shared_emb[emb_name][0].append(idx_i)
-                            shared_emb[emb_name][1].append(grad_i)
-                        else:
-                            # currently nccl does not support Alltoallv operation
-                            # we need to use CPU shared memory to share gradient
-                            # across processes
-                            idx_i = idx_i.to(th.device('cpu'))
-                            grad_i = grad_i.to(th.device('cpu'))
-                            idx_shmem_name = 'idx_{}_{}_{}'.format(emb_name, self._rank, i)
-                            grad_shmem_name = 'grad_{}_{}_{}'.format(emb_name, self._rank, i)
-                            if idx_shmem_name not in self._shared_cache[emb_name] or \
-                                self._shared_cache[emb_name][idx_shmem_name].shape[0] \
-                                < idx_i.shape[0]:
-                                # in case idx_i.shape[0] is 0
-                                idx_shmem = create_shared_mem_array(idx_shmem_name, \
-                                    (idx_i.shape[0] * 2 + 2,), idx_dtype)
-                                grad_shmem = create_shared_mem_array(grad_shmem_name, \
-                                    (idx_i.shape[0] * 2 + 2, grad_dim), grad_dtype)
-                                self._shared_cache[emb_name][idx_shmem_name] = idx_shmem
-                                self._shared_cache[emb_name][grad_shmem_name] = grad_shmem
-                            self._shared_cache[emb_name][idx_shmem_name][:idx_i.shape[0]] \
-                                = idx_i
-                            self._shared_cache[emb_name][grad_shmem_name][:idx_i.shape[0]] \
-                                = grad_i
-                            emb.store.set(idx_shmem_name, str(idx_i.shape[0]))
-                    # gather gradients from all other processes
-                    for i in range(self._world_size):
-                        if i != self._rank:
-                            idx_shmem_name = 'idx_{}_{}_{}'.format(emb_name, i, self._rank)
-                            grad_shmem_name = 'grad_{}_{}_{}'.format(emb_name, i, self._rank)
-                            size = int(emb.store.get(idx_shmem_name))
-                            if idx_shmem_name not in self._shared_cache[emb_name] or \
-                                self._shared_cache[emb_name][idx_shmem_name].shape[0] < size:
-                                idx_shmem = get_shared_mem_array(idx_shmem_name, \
-                                    (size * 2 + 2,), idx_dtype)
-                                grad_shmem = get_shared_mem_array(grad_shmem_name, \
-                                    (size * 2 + 2, grad_dim), grad_dtype)
-                                self._shared_cache[emb_name][idx_shmem_name] = idx_shmem
-                                self._shared_cache[emb_name][grad_shmem_name] = grad_shmem
-                            idx_i = self._shared_cache[emb_name][idx_shmem_name][:size]
-                            grad_i = self._shared_cache[emb_name][grad_shmem_name][:size]
-                            shared_emb[emb_name][0].append(idx_i.to(device,
-                                non_blocking=True))
-                            shared_emb[emb_name][1].append(grad_i.to(device,
-                                non_blocking=True))
-                else:
-                    shared_emb[emb_name][0].append(idx)
-                    shared_emb[emb_name][1].append(grad)
+            # we need to combine gradients from multiple forward paths
+            idx = []
+            grad = []
+            for i, data in emb._trace:
+                idx.append(i)
+                grad.append(data.grad.data)
+            idx = th.cat(idx, dim=0)
+            grad = th.cat(grad, dim=0)
+            device = grad.device
+            idx_dtype = idx.dtype
+            grad_dtype = grad.dtype
+            grad_dim = grad.shape[1]
+            if self._world_size > 1:
+                if emb_name not in self._shared_cache:
+                    self._shared_cache[emb_name] = {}
+                # Each training process takes the responsibility of updating a range
+                # of node embeddings, thus we can parallelize the gradient update.
+                # The overall progress includes:
+                # 1. In each training process:
+                #   1.a Deciding which process a node embedding belongs to according
+                #       to the formula: process_id = node_idx mod num_of_process(N)
+                #   1.b Split the node index tensor and gradient tensor into N parts
+                #       according to step 1.a.
+                #   1.c Write each node index sub-tensor and gradient sub-tensor into
+                #       different DGL shared memory buffers.
+                # 2. Cross training process synchronization
+                # 3. In each training process:
+                #   3.a Collect node index sub-tensors and gradient sub-tensors
+                #   3.b Do gradient update
+                # 4. Done
+                idx_split = th.remainder(idx, self._world_size).long()
+                for i in range(self._world_size):
+                    mask = idx_split == i
+                    idx_i = idx[mask]
+                    grad_i = grad[mask]
+                    if i == self._rank:
+                        shared_emb[emb_name][0].append(idx_i)
+                        shared_emb[emb_name][1].append(grad_i)
+                    else:
+                        # currently nccl does not support Alltoallv operation
+                        # we need to use CPU shared memory to share gradient
+                        # across processes
+                        idx_i = idx_i.to(th.device('cpu'))
+                        grad_i = grad_i.to(th.device('cpu'))
+                        idx_shmem_name = 'idx_{}_{}_{}'.format(emb_name, self._rank, i)
+                        grad_shmem_name = 'grad_{}_{}_{}'.format(emb_name, self._rank, i)
+                        # Create shared memory to hold the temporary index and gradient
+                        # tensors for cross-process send and recv.
+                        if idx_shmem_name not in self._shared_cache[emb_name] or \
+                            self._shared_cache[emb_name][idx_shmem_name].shape[0] \
+                            < idx_i.shape[0]:
+                            if idx_shmem_name in self._shared_cache[emb_name]:
+                                shmem_ptr_holder.append(
+                                    self._shared_cache[emb_name][idx_shmem_name])
+                                shmem_ptr_holder.append(
+                                    self._shared_cache[emb_name][grad_shmem_name])
+                            # in case idx_i.shape[0] is 0
+                            idx_shmem = create_shared_mem_array(idx_shmem_name, \
+                                (idx_i.shape[0] * 2 + 2,), idx_dtype)
+                            grad_shmem = create_shared_mem_array(grad_shmem_name, \
+                                (idx_i.shape[0] * 2 + 2, grad_dim), grad_dtype)
+                            self._shared_cache[emb_name][idx_shmem_name] = idx_shmem
+                            self._shared_cache[emb_name][grad_shmem_name] = grad_shmem
+                        # Fill the shared memory with the temporary index and gradient tensors
+                        self._shared_cache[emb_name][idx_shmem_name][:idx_i.shape[0]] \
+                            = idx_i
+                        self._shared_cache[emb_name][grad_shmem_name][:idx_i.shape[0]] \
+                            = grad_i
+                        self._opt_meta[emb_name][self._rank][i] = idx_i.shape[0]
+            else:
+                shared_emb[emb_name][0].append(idx)
+                shared_emb[emb_name][1].append(grad)
+
+        # make sure the idx shape is passed to each process through opt_meta
+        if self._world_size > 1:
+            th.distributed.barrier()
+        for emb in self._params: # pylint: disable=too-many-nested-blocks
+            emb_name = emb.name
+            if self._world_size > 1:
+                # gather gradients from all other processes
+                for i in range(self._world_size):
+                    if i != self._rank:
+                        idx_shmem_name = 'idx_{}_{}_{}'.format(emb_name, i, self._rank)
+                        grad_shmem_name = 'grad_{}_{}_{}'.format(emb_name, i, self._rank)
+                        size = self._opt_meta[emb_name][i][self._rank]
+                        # Retrieve the shared memory holding the temporary index and gradient
+                        # tensors that were sent to the current training process
+                        if idx_shmem_name not in self._shared_cache[emb_name] or \
+                            self._shared_cache[emb_name][idx_shmem_name].shape[0] < size:
+                            idx_shmem = get_shared_mem_array(idx_shmem_name, \
+                                (size * 2 + 2,), idx_dtype)
+                            grad_shmem = get_shared_mem_array(grad_shmem_name, \
+                                (size * 2 + 2, grad_dim), grad_dtype)
+                            self._shared_cache[emb_name][idx_shmem_name] = idx_shmem
+                            self._shared_cache[emb_name][grad_shmem_name] = grad_shmem
+                            # make sure shared memory is released in the child process first.
+                            # This will not be called frequently.
+                            # TODO(xiangsx) Provide API to munmap shared memory directly
+                            gc.collect()
+                        idx_i = self._shared_cache[emb_name][idx_shmem_name][:size]
+                        grad_i = self._shared_cache[emb_name][grad_shmem_name][:size]
+                        shared_emb[emb_name][0].append(idx_i.to(device,
+                            non_blocking=True))
+                        shared_emb[emb_name][1].append(grad_i.to(device,
+                            non_blocking=True))
         if self._clean_grad:
             # clean gradient track
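The partitioning rule in the rewritten `step()` is worth a concrete look: gradients from all forward paths are concatenated first and then routed to the process whose id equals `node_idx mod world_size`, instead of being split by contiguous index ranges. A toy, self-contained sketch of step 1 from the comment block above (plain tensors, no shared memory):

```python
# Toy illustration of the mod-based routing rule used in step 1 above.
import torch as th

world_size = 3
idx = th.tensor([0, 1, 2, 3, 4, 5, 7, 9])
grad = th.randn(idx.shape[0], 4)          # stand-in gradient rows, dim = 4

owner = th.remainder(idx, world_size).long()
for i in range(world_size):
    mask = owner == i
    idx_i, grad_i = idx[mask], grad[mask]
    print(f"rank {i} owns nodes {idx_i.tolist()}")
# rank 0 owns nodes [0, 3, 9], rank 1 owns [1, 4, 7], rank 2 owns [2, 5]
```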
@@ -205,21 +269,12 @@ class SparseAdagrad(SparseGradOptimizer):
             assert isinstance(emb, NodeEmbedding), \
                 'SparseAdagrad only supports dgl.nn.NodeEmbedding'
-            if self._rank is None:
-                self._rank = emb.rank
-                self._world_size = emb.world_size
-            else:
-                assert self._rank == emb.rank, \
-                    'MultiGPU rank for each embedding should be same.'
-                assert self._world_size == emb.world_size, \
-                    'MultiGPU world_size for each embedding should be same.'
             if self._rank <= 0:
                 emb_name = emb.name
                 state = create_shared_mem_array(emb_name+'_state', \
                     emb.emb_tensor.shape, th.float32).zero_()
             if self._rank == 0:
-                for _ in range(1, world_size):
-                    # send embs
+                if self._world_size > 1:
                     emb.store.set(emb_name+'_opt', emb_name)
             elif self._rank > 0:
                 # receive
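The SparseAdagrad fix removes the duplicated rank/world-size checks (they now live in the base class constructor) and replaces the old announcement loop, which iterated over a bare `world_size` name rather than `self._world_size` and re-set the same key several times, with a single `store.set` guarded by `if self._world_size > 1`. A hedged sketch of the resulting state handshake, using the `create_shared_mem_array`/`get_shared_mem_array` helpers imported at the top of this file (the `dgl.utils` import path is an assumption and may differ between DGL versions):

```python
# Hedged sketch of the optimizer-state handshake, mirroring the calls in the
# diff. Assumption: the shared-memory helpers are importable from dgl.utils.
import torch as th
from dgl.utils import create_shared_mem_array, get_shared_mem_array

def setup_state(emb_name, shape, rank, world_size, store):
    if rank <= 0:
        # rank 0 (or a single-process run) allocates the zeroed optimizer state
        state = create_shared_mem_array(emb_name + '_state', shape, th.float32).zero_()
        if rank == 0 and world_size > 1:
            # announce once; repeating the set only overwrites the same key
            store.set(emb_name + '_opt', emb_name)
    else:
        # other ranks block until the announcement, then attach to the array
        store.wait([emb_name + '_opt'])
        state = get_shared_mem_array(emb_name + '_state', shape, th.float32)
    return state
```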
@@ -318,14 +373,6 @@ class SparseAdam(SparseGradOptimizer):
             assert isinstance(emb, NodeEmbedding), \
                 'SparseAdam only supports dgl.nn.NodeEmbedding'
-            if self._rank is None:
-                self._rank = emb.rank
-                self._world_size = emb.world_size
-            else:
-                assert self._rank == emb.rank, \
-                    'MultiGPU rank for each embedding should be same.'
-                assert self._world_size == emb.world_size, \
-                    'MultiGPU world_size for each embedding should be same.'
             if self._rank <= 0:
                 emb_name = emb.name
                 state_step = create_shared_mem_array(emb_name+'_step', \
@@ -335,10 +382,8 @@ class SparseAdam(SparseGradOptimizer):
                 state_power = create_shared_mem_array(emb_name+'_power', \
                     emb.emb_tensor.shape, th.float32).zero_()
             if self._rank == 0:
-                state = (state_step, state_mem, state_power)
                 emb_name = emb.name
-                for _ in range(1, self._world_size):
-                    # send embs
+                if self._world_size > 1:
                     emb.store.set(emb_name+'_opt', emb_name)
             elif self._rank > 0:
                 # receive
......
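Taken together, the rank/world-size bookkeeping now lives once in `SparseGradOptimizer.__init__`, and both optimizers announce their shared state through the embedding's store. For orientation, a hedged usage sketch of how these pieces are driven from user code, based on the public DGL API of this era (`dgl.nn.NodeEmbedding`, `dgl.optim.SparseAdam`); exact signatures may differ between versions:

```python
# Hedged single-process usage sketch; dgl.nn.NodeEmbedding and
# dgl.optim.SparseAdam are the documented entry points, but signatures may
# vary across DGL versions.
import torch as th
import dgl

num_nodes, dim = 100, 16
emb = dgl.nn.NodeEmbedding(num_nodes, dim, name='node_emb')
opt = dgl.optim.SparseAdam(params=[emb], lr=0.01)

nids = th.tensor([0, 5, 7])
feat = emb(nids, th.device('cpu'))   # fetch rows; gradients are traced
loss = feat.pow(2).sum()
loss.backward()
opt.step()                           # sparse update of only the touched rows
```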