"tests/vscode:/vscode.git/clone" did not exist on "6694f7b9451692f1c91c595c56eb2ce5e218524f"
Unverified commit 020f0249, authored by nv-dlasalle, committed by GitHub

[Performance][Optimizer] Enable using UVA and FP16 with SparseAdam Optimizer (#3885)



* Add UVA by default to embedding

* More updates

* Update optimizer

* Add new UVA functions

* Expose new pinned memory function

* Add unit tests

* Update formatting

* Fix unit test

* Handle auto UVA case when training is on CPU

* Allow per-embedding decisions for whether to use UVA

* Address sparse_optim.py comments

* Remove unused templates

* Update unit test

* Use DGL-allocated memory for pinning

* Allow automatic unpinning

* Work around D2H copies with a different dtype

* Fix linting

* Update error message

* Update copyright
Co-authored-by: Xin Yao <xiny@nvidia.com>
Co-authored-by: Minjie Wang <wmjlyjemaine@gmail.com>
parent 548c85ff
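
For orientation, here is a minimal, hypothetical training sketch using the new options, assuming the dgl.nn.NodeEmbedding and dgl.optim.SparseAdam APIs shown in the diff below; the sizes and the embedding name 'feat' are made up.

import torch as th
import dgl

emb = dgl.nn.NodeEmbedding(10000, 128, 'feat')       # embedding kept in CPU memory
opt = dgl.optim.SparseAdam([emb], lr=0.01,
                           use_uva=True,             # pin the 'mem'/'power' state for direct GPU access
                           dtype=th.float16)         # store the optimizer state in FP16
idx = th.randint(0, 10000, (1024,))
feat = emb(idx, th.device('cuda'))                   # gather embedding rows onto the GPU
loss = feat.sum()
opt.zero_grad()
loss.backward()
opt.step()                                           # state is read/written through UVA gather/scatter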
......@@ -3,7 +3,9 @@ import abc
from abc import abstractmethod
import torch as th
from ...utils import get_shared_mem_array, create_shared_mem_array
from ...utils import get_shared_mem_array, create_shared_mem_array, \
pin_memory_inplace, gather_pinned_tensor_rows, \
scatter_pinned_tensor_rows
from ...nn.pytorch import NodeEmbedding
from ...cuda import nccl
from ...partition import NDArrayPartition
......@@ -434,7 +436,7 @@ class SparseAdagrad(SparseGradOptimizer):
state = th.empty(
emb.weight.shape,
dtype=th.float32,
device=eth.device('cpu')).zero_()
device=th.device('cpu')).zero_()
elif self._rank == 0:
state = create_shared_mem_array(emb_name+'_state', \
emb.weight.shape, th.float32).zero_()
......@@ -519,6 +521,16 @@ class SparseAdam(SparseGradOptimizer):
eps : float, Optional
The term added to the denominator to improve numerical stability
Default: 1e-8
use_uva : bool, Optional
Whether to use pinned memory for storing the 'mem' and 'power' parameters
when the embedding is stored on the CPU. This will improve training
speed, but requires locking a large number of virtual memory pages.
For embeddings stored in GPU memory, this setting has no effect.
Default: True if the gradients are generated on the GPU, and False
if the gradients are on the CPU.
dtype : torch.dtype, Optional
The type to store optimizer state with. Default: th.float32.
Examples
--------
......@@ -534,12 +546,26 @@ class SparseAdam(SparseGradOptimizer):
... loss.backward()
... optimizer.step()
'''
def __init__(self, params, lr, betas=(0.9, 0.999), eps=1e-08):
def __init__(self, params, lr, betas=(0.9, 0.999), eps=1e-08, \
use_uva=None, dtype=th.float32):
super(SparseAdam, self).__init__(params, lr)
self._lr = lr
self._beta1 = betas[0]
self._beta2 = betas[1]
self._eps = eps
self._use_uva = use_uva
self._nd_handle = {}
self._is_using_uva = {}
assert dtype in [th.float16, th.float32], \
"Unsupported dtype {}. Valid choices are th.float32 " \
"and th.float32".format(dtype)
self._dtype = dtype
def _setup_uva(self, name, mem, power):
self._is_using_uva[name] = True
mem_nd = pin_memory_inplace(mem)
power_nd = pin_memory_inplace(power)
self._nd_handle[name] = [mem_nd, power_nd]
def setup(self, params):
# We need to register a state sum for each embedding in the kvstore.
......@@ -547,28 +573,29 @@ class SparseAdam(SparseGradOptimizer):
assert isinstance(emb, NodeEmbedding), \
'SparseAdam only supports dgl.nn.NodeEmbedding'
emb_name = emb.name
self._is_using_uva[emb_name] = self._use_uva
if th.device(emb.emb_tensor.device) == th.device('cpu'):
# if our embedding is on the CPU, our state also has to be
if self._rank < 0:
state_step = th.empty(
(emb.weight.shape[0],),
dtype=th.float32,
dtype=th.int32,
device=th.device('cpu')).zero_()
state_mem = th.empty(
emb.weight.shape,
dtype=th.float32,
dtype=self._dtype,
device=th.device('cpu')).zero_()
state_power = th.empty(
emb.weight.shape,
dtype=th.float32,
dtype=self._dtype,
device=th.device('cpu')).zero_()
elif self._rank == 0:
state_step = create_shared_mem_array(emb_name+'_step', \
(emb.weight.shape[0],), th.float32).zero_()
(emb.weight.shape[0],), th.int32).zero_()
state_mem = create_shared_mem_array(emb_name+'_mem', \
emb.weight.shape, th.float32).zero_()
emb.weight.shape, self._dtype).zero_()
state_power = create_shared_mem_array(emb_name+'_power', \
emb.weight.shape, th.float32).zero_()
emb.weight.shape, self._dtype).zero_()
if self._world_size > 1:
emb.store.set(emb_name+'_opt', emb_name)
......@@ -576,24 +603,32 @@ class SparseAdam(SparseGradOptimizer):
# receive
emb.store.wait([emb_name+'_opt'])
state_step = get_shared_mem_array(emb_name+'_step', \
(emb.weight.shape[0],), th.float32)
(emb.weight.shape[0],), th.int32)
state_mem = get_shared_mem_array(emb_name+'_mem', \
emb.weight.shape, th.float32)
emb.weight.shape, self._dtype)
state_power = get_shared_mem_array(emb_name+'_power', \
emb.weight.shape, th.float32)
emb.weight.shape, self._dtype)
if self._is_using_uva[emb_name]:
# use_uva was explicitly set to True; if it is None,
# wait until the first step to decide
self._setup_uva(emb_name, state_mem, state_power)
else:
# make sure we don't use UVA when data is on the GPU
self._is_using_uva[emb_name] = False
# distributed state on gpu
state_step = th.empty(
[emb.emb_tensor.shape[0]],
dtype=th.float32,
dtype=th.int32,
device=emb.emb_tensor.device).zero_()
state_mem = th.empty(
emb.emb_tensor.shape,
dtype=th.float32,
dtype=self._dtype,
device=emb.emb_tensor.device).zero_()
state_power = th.empty(
emb.emb_tensor.shape,
dtype=th.float32,
dtype=self._dtype,
device=emb.emb_tensor.device).zero_()
state = (state_step, state_mem, state_power)
emb.set_optm_state(state)
......@@ -613,20 +648,34 @@ class SparseAdam(SparseGradOptimizer):
Sparse embedding to update.
"""
with th.no_grad():
beta1 = self._beta1
beta2 = self._beta2
eps = self._eps
clr = self._lr
state_step, state_mem, state_power = emb.optm_state
exec_dtype = grad.dtype
exec_dev = grad.device
state_dev = state_step.device
# whether or not we need to transfer data from the GPU to the CPU
# while updating the weights
is_d2h = state_dev.type == 'cpu' and exec_dev.type == 'cuda'
# only perform async copies cpu -> gpu or gpu -> gpu, but block
# when copying to the cpu, so as to ensure the copy is finished
# before operating on the data on the cpu
state_block = state_dev == th.device('cpu') and exec_dev != state_dev
state_block = is_d2h
if self._is_using_uva[emb.name] is None and is_d2h:
# we should use UVA going forward
self._setup_uva(emb.name, state_mem, state_power)
elif self._is_using_uva[emb.name] is None:
# we shouldn't use UVA going forward
self._is_using_uva[emb.name] = False
use_uva = self._is_using_uva[emb.name]
beta1 = self._beta1
beta2 = self._beta2
eps = self._eps
clr = self._lr
# There can be duplicated indices due to sampling.
# Thus unique them here and average the gradient here.
grad_indices, inverse, cnt = th.unique(idx,
......@@ -635,8 +684,16 @@ class SparseAdam(SparseGradOptimizer):
state_idx = grad_indices.to(state_dev)
state_step[state_idx] += 1
state_step = state_step[state_idx].to(exec_dev)
if use_uva:
orig_mem = gather_pinned_tensor_rows(state_mem, grad_indices)
orig_power = gather_pinned_tensor_rows(state_power, grad_indices)
else:
orig_mem = state_mem[state_idx].to(exec_dev)
orig_power = state_power[state_idx].to(exec_dev)
# convert to exec dtype
orig_mem = orig_mem.to(dtype=exec_dtype)
orig_power = orig_power.to(dtype=exec_dtype)
grad_values = th.zeros((grad_indices.shape[0], grad.shape[1]), device=exec_dev)
grad_values.index_add_(0, inverse, grad)
......@@ -647,8 +704,19 @@ class SparseAdam(SparseGradOptimizer):
update_mem = beta1 * orig_mem + (1.-beta1) * grad_mem
update_power = beta2 * orig_power + (1.-beta2) * grad_power
update_mem_dst = update_mem.to(state_dev, non_blocking=True)
update_power_dst = update_power.to(state_dev, non_blocking=True)
if use_uva:
scatter_pinned_tensor_rows(state_mem, \
grad_indices, \
update_mem.to(dtype=self._dtype))
scatter_pinned_tensor_rows(state_power, \
grad_indices, \
update_power.to(dtype=self._dtype))
else:
update_mem_dst = update_mem.to(dtype=self._dtype).to(
state_dev, non_blocking=True)
update_power_dst = update_power.to(dtype=self._dtype).to(
state_dev, non_blocking=True)
if state_block:
# use events to try and overlap CPU and GPU as much as possible
update_event = th.cuda.Event()
......@@ -664,6 +732,9 @@ class SparseAdam(SparseGradOptimizer):
if state_block:
std_event = th.cuda.Event()
std_event.record()
if not use_uva:
if state_block:
# wait for our transfers from exec_dev to state_dev to finish
# before we can use them
update_event.wait()
......
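
The comments above describe overlapping the asynchronous device-to-host state copy with the remaining GPU work. A standalone, hypothetical sketch of that event pattern in plain PyTorch (independent of the optimizer internals):

import torch as th

if th.cuda.is_available():
    gpu_t = th.randn(1024, 128, device='cuda')
    cpu_t = th.empty(1024, 128, pin_memory=True)
    cpu_t.copy_(gpu_t, non_blocking=True)   # async D2H copy into pinned host memory
    copy_done = th.cuda.Event()
    copy_done.record()                      # mark the point where the copy was enqueued
    extra = (gpu_t * 2.0).sum()             # unrelated GPU work overlaps the copy
    copy_done.synchronize()                 # block the host until the copy has finished
    print(cpu_t[0, 0].item(), extra.item())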
......@@ -49,4 +49,21 @@ def gather_pinned_tensor_rows(tensor, rows):
"""
return F.from_dgl_nd(_CAPI_DGLIndexSelectCPUFromGPU(F.to_dgl_nd(tensor), F.to_dgl_nd(rows)))
def scatter_pinned_tensor_rows(dest, rows, source):
"""Directly scatter rows from a GPU tensor given an indices array on CUDA devices,
to a pinned tensor on the CPU.
Parameters
----------
dest : Tensor
The tensor on the CPU to scatter rows to. Must be in pinned memory.
rows : Tensor
The rows to scatter. Must be a CUDA tensor with unique entries.
source : Tensor
The tensor on the GPU to scatter rows from.
"""
_CAPI_DGLIndexScatterGPUToCPU(F.to_dgl_nd(dest), F.to_dgl_nd(rows),
F.to_dgl_nd(source))
_init_api("dgl.ndarray.uvm", __name__)
/*!
* Copyright (c) 2021 by Contributors
* \file array/cpu/array_index_select.cuh
* Copyright (c) 2021-2022 by Contributors
* \file array/cuda/array_index_select.cuh
* \brief Array index select GPU kernel implementation
*/
......@@ -50,6 +50,45 @@ __global__ void IndexSelectMultiKernel(
}
}
template <typename DType, typename IdType>
__global__ void IndexScatterSingleKernel(const DType* array,
const IdType* index,
const int64_t length,
const int64_t arr_len,
DType* out) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
int stride_x = gridDim.x * blockDim.x;
while (tx < length) {
assert(index[tx] >= 0 && index[tx] < arr_len);
out[index[tx]] = array[tx];
tx += stride_x;
}
}
template <typename DType, typename IdType>
__global__ void IndexScatterMultiKernel(
const DType* const array,
const int64_t num_feat,
const IdType* const index,
const int64_t length,
const int64_t arr_len,
DType* const out) {
int64_t in_row = blockIdx.x*blockDim.y+threadIdx.y;
const int64_t stride = blockDim.y*gridDim.x;
while (in_row < length) {
int64_t col = threadIdx.x;
const int64_t out_row = index[in_row];
assert(out_row >= 0 && out_row < arr_len);
while (col < num_feat) {
out[out_row*num_feat+col] = array[in_row*num_feat+col];
col += blockDim.x;
}
in_row += stride;
}
}
} // namespace impl
} // namespace aten
} // namespace dgl
......
/*!
* Copyright (c) 2019 by Contributors
* \file array/cpu/array_index_select_uvm.cu
* Copyright (c) 2019-2022 by Contributors
* \file array/cuda/uvm/array_index_select_uvm.cu
* \brief Array index select GPU implementation
*/
#include <dgl/array.h>
......@@ -62,6 +62,7 @@ NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) {
return ret;
}
// floating point types are treated as their equal width integer types
template NDArray IndexSelectCPUFromGPU<int8_t, int32_t>(NDArray, IdArray);
template NDArray IndexSelectCPUFromGPU<int8_t, int64_t>(NDArray, IdArray);
template NDArray IndexSelectCPUFromGPU<int16_t, int32_t>(NDArray, IdArray);
......@@ -70,10 +71,57 @@ template NDArray IndexSelectCPUFromGPU<int32_t, int32_t>(NDArray, IdArray);
template NDArray IndexSelectCPUFromGPU<int32_t, int64_t>(NDArray, IdArray);
template NDArray IndexSelectCPUFromGPU<int64_t, int32_t>(NDArray, IdArray);
template NDArray IndexSelectCPUFromGPU<int64_t, int64_t>(NDArray, IdArray);
template NDArray IndexSelectCPUFromGPU<float, int32_t>(NDArray, IdArray);
template NDArray IndexSelectCPUFromGPU<float, int64_t>(NDArray, IdArray);
template NDArray IndexSelectCPUFromGPU<double, int32_t>(NDArray, IdArray);
template NDArray IndexSelectCPUFromGPU<double, int64_t>(NDArray, IdArray);
template<typename DType, typename IdType>
void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) {
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
DType* dest_data = static_cast<DType*>(dest->data);
const DType* source_data = static_cast<DType*>(source->data);
const IdType* idx_data = static_cast<IdType*>(index->data);
const int64_t arr_len = dest->shape[0];
const int64_t len = index->shape[0];
int64_t num_feat = 1;
std::vector<int64_t> shape{len};
CHECK(dest.IsPinned());
CHECK_EQ(index->ctx.device_type, kDLGPU);
CHECK_EQ(source->ctx.device_type, kDLGPU);
for (int d = 1; d < source->ndim; ++d) {
num_feat *= source->shape[d];
}
if (len == 0)
return;
if (num_feat == 1) {
const int nt = cuda::FindNumThreads(len);
const int nb = (len + nt - 1) / nt;
CUDA_KERNEL_CALL(IndexScatterSingleKernel, nb, nt, 0,
thr_entry->stream, source_data, idx_data, len, arr_len, dest_data);
} else {
dim3 block(256, 1);
while (static_cast<int64_t>(block.x) >= 2*num_feat) {
block.x /= 2;
block.y *= 2;
}
const dim3 grid((len+block.y-1)/block.y);
CUDA_KERNEL_CALL(IndexScatterMultiKernel, grid, block, 0,
thr_entry->stream, source_data, num_feat, idx_data,
len, arr_len, dest_data);
}
}
// floating point types are treated as their equal width integer types
template void IndexScatterGPUToCPU<int8_t, int32_t>(NDArray, IdArray, NDArray);
template void IndexScatterGPUToCPU<int8_t, int64_t>(NDArray, IdArray, NDArray);
template void IndexScatterGPUToCPU<int16_t, int32_t>(NDArray, IdArray, NDArray);
template void IndexScatterGPUToCPU<int16_t, int64_t>(NDArray, IdArray, NDArray);
template void IndexScatterGPUToCPU<int32_t, int32_t>(NDArray, IdArray, NDArray);
template void IndexScatterGPUToCPU<int32_t, int64_t>(NDArray, IdArray, NDArray);
template void IndexScatterGPUToCPU<int64_t, int32_t>(NDArray, IdArray, NDArray);
template void IndexScatterGPUToCPU<int64_t, int64_t>(NDArray, IdArray, NDArray);
} // namespace impl
} // namespace aten
......
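
For the multi-feature kernel, the host code above trades x-threads (features) for y-rows until the block width roughly matches num_feat. A hypothetical Python transcription of that sizing loop, for illustration only:

def pick_block(num_feat, max_threads=256):
    # Mirrors the loop in IndexScatterGPUToCPU: start at (256, 1) and halve
    # block.x / double block.y while block.x is at least twice num_feat.
    bx, by = max_threads, 1
    while bx >= 2 * num_feat:
        bx //= 2
        by *= 2
    return bx, by

print(pick_block(1))    # (1, 256): one feature column, 256 rows per block
print(pick_block(128))  # (128, 2): wide rows keep most threads on features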
/*!
* Copyright (c) 2019 by Contributors
* Copyright (c) 2019-2022 by Contributors
* \file array/uvm_array.cc
* \brief DGL array utilities implementation
*/
......@@ -15,24 +15,42 @@ namespace aten {
NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) {
#ifdef DGL_USE_CUDA
CHECK(array.IsPinned())
<< "Only the CPUPinned device type input array is supported";
CHECK_EQ(index->ctx.device_type, kDLGPU)
<< "Only the GPU device type input index is supported";
CHECK(array.IsPinned()) << "Input array must be in pinned memory.";
CHECK_EQ(index->ctx.device_type, kDLGPU) << "Index must be on the GPU.";
CHECK_GE(array->ndim, 1) << "Input array must have at least 1 dimension.";
CHECK_EQ(index->ndim, 1) << "Index must be a 1D array.";
CHECK_GE(array->ndim, 1) << "Only support array with at least 1 dimension";
CHECK_EQ(index->ndim, 1) << "Index array must be an 1D array.";
ATEN_DTYPE_BITS_ONLY_SWITCH(array->dtype, DType, "values", {
ATEN_ID_TYPE_SWITCH(index->dtype, IdType, {
return impl::IndexSelectCPUFromGPU<DType, IdType>(array, index);
});
});
#endif
LOG(FATAL) << "IndexSelectCPUFromGPU requires CUDA";
LOG(FATAL) << "IndexSelectCPUFromGPU requires CUDA.";
// Should be unreachable
return NDArray{};
}
void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) {
#ifdef DGL_USE_CUDA
CHECK(dest.IsPinned()) << "Destination array must be in pinned memory.";
CHECK_EQ(index->ctx.device_type, kDLGPU) << "Index must be on the GPU.";
CHECK_EQ(source->ctx.device_type, kDLGPU) << "Source array must be on the GPU.";
CHECK_EQ(dest->dtype, source->dtype) << "Destination array and source "
"array must have the same dtype.";
CHECK_GE(dest->ndim, 1) << "Destination array must have at least 1 dimension.";
CHECK_EQ(index->ndim, 1) << "Index must be a 1D array.";
ATEN_DTYPE_BITS_ONLY_SWITCH(source->dtype, DType, "values", {
ATEN_ID_TYPE_SWITCH(index->dtype, IdType, {
impl::IndexScatterGPUToCPU<DType, IdType>(dest, index, source);
});
});
#else
LOG(FATAL) << "IndexScatterGPUToCPU requires CUDA.";
#endif
}
DGL_REGISTER_GLOBAL("ndarray.uvm._CAPI_DGLIndexSelectCPUFromGPU")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
NDArray array = args[0];
......@@ -40,5 +58,15 @@ DGL_REGISTER_GLOBAL("ndarray.uvm._CAPI_DGLIndexSelectCPUFromGPU")
*rv = IndexSelectCPUFromGPU(array, index);
});
DGL_REGISTER_GLOBAL("ndarray.uvm._CAPI_DGLIndexScatterGPUToCPU")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
NDArray dest = args[0];
IdArray index = args[1];
NDArray source = args[2];
IndexScatterGPUToCPU(dest, index, source);
});
} // namespace aten
} // namespace dgl
/*!
* Copyright (c) 2019 by Contributors
* \file array/array_op.h
* Copyright (c) 2019-2022 by Contributors
* \file array/uvm_array_op.h
* \brief Array operator templates
*/
#ifndef DGL_ARRAY_UVM_ARRAY_OP_H_
......@@ -17,6 +17,9 @@ namespace impl {
template <typename DType, typename IdType>
NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index);
template <typename DType, typename IdType>
void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source);
} // namespace impl
} // namespace aten
} // namespace dgl
......
......@@ -10,9 +10,9 @@ from dgl.nn import NodeEmbedding
from dgl.optim import SparseAdam, SparseAdagrad
@unittest.skipIf(os.name == 'nt', reason='Do not support windows yet')
def test_sparse_adam():
@pytest.mark.parametrize('emb_dim', [1, 4, 101, 1024])
def test_sparse_adam(emb_dim):
num_embs = 10
emb_dim = 4
device=F.ctx()
dgl_emb = NodeEmbedding(num_embs, emb_dim, 'test')
torch_emb = th.nn.Embedding(num_embs, emb_dim, sparse=True)
......@@ -28,7 +28,50 @@ def test_sparse_adam():
idx = th.randint(0, num_embs, size=(4,))
dgl_value = dgl_emb(idx, device).to(th.device('cpu'))
torch_value = torch_emb(idx)
labels = th.ones((4,)).long()
labels = th.zeros((4,)).long()
print("dgl_value = {}".format(dgl_value))
print("labels = {}".format(labels))
dgl_adam.zero_grad()
torch_adam.zero_grad()
dgl_loss = th.nn.functional.cross_entropy(dgl_value, labels)
torch_loss = th.nn.functional.cross_entropy(torch_value, labels)
dgl_loss.backward()
torch_loss.backward()
dgl_adam.step()
torch_adam.step()
assert F.allclose(dgl_emb.weight, torch_emb.weight)
# Can not test second step
# Pytorch sparseAdam maintains a global step
# DGL sparseAdam uses a per embedding step
@unittest.skipIf(os.name == 'nt', reason='Do not support windows yet')
@pytest.mark.parametrize('use_uva', [False, True, None])
@pytest.mark.parametrize('emb_dim', [1, 4, 101, 1024])
def test_sparse_adam_uva(use_uva, emb_dim):
if F.ctx().type == 'cpu' and use_uva == True:
# we want to only test values of False and None when not using GPU
pytest.skip("UVA cannot be used without GPUs.")
num_embs = 10
device=F.ctx()
dgl_emb = NodeEmbedding(num_embs, emb_dim, 'test_uva{}'.format(use_uva))
torch_emb = th.nn.Embedding(num_embs, emb_dim, sparse=True)
th.manual_seed(0)
th.nn.init.uniform_(torch_emb.weight, 0, 1.0)
th.manual_seed(0)
th.nn.init.uniform_(dgl_emb.weight, 0, 1.0)
dgl_adam = SparseAdam(params=[dgl_emb], lr=0.01, use_uva=use_uva)
torch_adam = th.optim.SparseAdam(list(torch_emb.parameters()), lr=0.01)
# first step
idx = th.randint(0, num_embs, size=(4,))
dgl_value = dgl_emb(idx, device).to(th.device('cpu'))
torch_value = torch_emb(idx)
labels = th.zeros((4,)).long()
dgl_adam.zero_grad()
torch_adam.zero_grad()
......@@ -45,6 +88,45 @@ def test_sparse_adam():
# Pytorch sparseAdam maintains a global step
# DGL sparseAdam uses a per embedding step
@unittest.skipIf(os.name == 'nt', reason='Do not support windows yet')
@pytest.mark.parametrize('dtype', [th.float32, th.float16])
@pytest.mark.parametrize('emb_dim', [1, 4, 101, 1024])
def test_sparse_adam_dtype(dtype, emb_dim):
num_embs = 10
device=F.ctx()
dgl_emb = NodeEmbedding(num_embs, emb_dim, 'test_dtype{}'.format(dtype))
torch_emb = th.nn.Embedding(num_embs, emb_dim, sparse=True)
th.manual_seed(0)
th.nn.init.uniform_(torch_emb.weight, 0, 1.0)
th.manual_seed(0)
th.nn.init.uniform_(dgl_emb.weight, 0, 1.0)
dgl_adam = SparseAdam(params=[dgl_emb], lr=0.01, dtype=dtype)
torch_adam = th.optim.SparseAdam(list(torch_emb.parameters()), lr=0.01)
# first step
idx = th.randint(0, num_embs, size=(4,))
dgl_value = dgl_emb(idx, device).to(th.device('cpu'))
torch_value = torch_emb(idx)
labels = th.zeros((4,)).long()
dgl_adam.zero_grad()
torch_adam.zero_grad()
dgl_loss = th.nn.functional.cross_entropy(dgl_value, labels)
torch_loss = th.nn.functional.cross_entropy(torch_value, labels)
dgl_loss.backward()
torch_loss.backward()
dgl_adam.step()
torch_adam.step()
assert F.allclose(dgl_emb.weight, torch_emb.weight)
# Can not test second step
# Pytorch sparseAdam maintains a global step
# DGL sparseAdam uses a per embedding step
@unittest.skipIf(os.name == 'nt', reason='Do not support windows yet')
def test_sparse_adam_zero_step():
num_embs = 10
......@@ -385,7 +467,10 @@ def test_multiprocess_sparse_adam_zero_step_cuda_tensor(num_workers):
assert F.allclose(dgl_weight, torch_weight)
if __name__ == '__main__':
test_sparse_adam()
test_sparse_adam(1)
test_sparse_adam(4)
test_sparse_adam(101)
test_sparse_adam(1024)
test_sparse_adam_zero_step()
test_multiprocess_cpu_sparse_adam(2)
......