Commit dcc7b513 authored by Jeff Daily

Merge remote-tracking branch 'upstream/master'

Conflicts:
csrc/multi_tensor_apply.cuh
setup.py
tests/L0/run_optimizers/test_adagrad.py
tests/L0/run_optimizers/test_fused_optimizer.py
tests/L0/run_optimizers/test_lamb.py
parents d061bf20 154c6336
......@@ -21,33 +21,31 @@ class SyncBatchnormFunction(Function):
if channel_last:
count = int(input.numel()/input.size(-1))
mean, var_biased = syncbn.welford_mean_var_c_last(input)
num_channels = input.size(-1)
else:
count = int(input.numel()/input.size(1))
mean, var_biased = syncbn.welford_mean_var(input)
num_channels = input.size(1)
if torch.distributed.is_initialized():
if not process_group:
process_group = torch.distributed.group.WORLD
device = mean.device
world_size = torch.distributed.get_world_size(process_group)
mean_all = torch.empty(world_size, mean.size(0), dtype=mean.dtype, device=device)
var_all = torch.empty(world_size, var_biased.size(0), dtype=var_biased.dtype, device=device)
count_all = torch.cuda.IntTensor(world_size, device=device)
mean_l = [mean_all.narrow(0, i, 1) for i in range(world_size)]
var_l = [var_all.narrow(0, i, 1) for i in range(world_size)]
count_l = [count_all.narrow(0, i, 1) for i in range(world_size)]
torch.distributed.all_gather(mean_l, mean, process_group)
torch.distributed.all_gather(var_l, var_biased, process_group)
torch.distributed.all_gather(
count_l,
torch.cuda.IntTensor([count], device=device),
process_group)
mean, var, inv_std = syncbn.welford_parallel(mean_all, var_all, count_all, eps)
count_t = torch.empty(1, dtype=mean.dtype, device=mean.device).fill_(count)
combined = torch.cat([mean.view(-1), var_biased.view(-1), count_t], dim=0)
combined_list = [torch.empty_like(combined) for k in range(world_size)]
torch.distributed.all_gather(combined_list, combined, process_group)
combined = torch.stack(combined_list, dim=0)
mean_all, invstd_all, count_all = torch.split(combined, num_channels, dim=1)
count_all = count_all.view(-1)
mean, var, inv_std = syncbn.welford_parallel(mean_all, invstd_all, count_all.to(torch.int32), eps)
else:
device = mean.device
count_all = torch.cuda.IntTensor([count], device=device)
inv_std = 1.0 / torch.sqrt(var_biased + eps)
var = var_biased * (count) / (count-1)
if count == 1 and world_size < 2:
raise ValueError('Expected more than 1 value per channel when training, got input size{}'.format(input.size()))
......@@ -60,7 +58,7 @@ class SyncBatchnormFunction(Function):
mean = running_mean.data
inv_std = 1.0 / torch.sqrt(running_variance.data + eps)
ctx.save_for_backward(input, weight, mean, inv_std, z, bias, count_all)
ctx.save_for_backward(input, weight, mean, inv_std, z, bias, count_all.to(torch.int32))
ctx.process_group = process_group
ctx.channel_last = channel_last
ctx.world_size = world_size
......@@ -101,10 +99,12 @@ class SyncBatchnormFunction(Function):
if ctx.needs_input_grad[0]:
if torch.distributed.is_initialized():
num_channels = sum_dy.shape[0]
combined = torch.cat([sum_dy, sum_dy_xmu], dim=0)
torch.distributed.all_reduce(
sum_dy, ReduceOp.SUM, process_group)
torch.distributed.all_reduce(
sum_dy_xmu, ReduceOp.SUM, process_group)
combined, torch.distributed.ReduceOp.SUM, process_group, async_op=False)
sum_dy, sum_dy_xmu = torch.split(combined, num_channels)
if channel_last:
grad_input = syncbn.batchnorm_backward_c_last(grad_output, saved_input, mean, inv_std, weight, sum_dy, sum_dy_xmu, count)
else:
......
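Both the forward change above and the backward change share one idea: pack the per-channel statistics into a single flat tensor, exchange it with one collective (all_gather in forward, all_reduce in backward), and split it back apart, instead of issuing a separate collective per statistic. A minimal sketch of the forward-side packing, with an illustrative helper name rather than apex's own API:

    import torch
    import torch.distributed as dist

    def gather_channel_stats(mean, var_biased, count, process_group):
        # Pack mean, biased variance and the element count into one flat tensor
        # so a single all_gather replaces three separate collectives.
        num_channels = mean.numel()
        count_t = torch.full((1,), count, dtype=mean.dtype, device=mean.device)
        combined = torch.cat([mean.view(-1), var_biased.view(-1), count_t])

        world_size = dist.get_world_size(process_group)
        gathered = [torch.empty_like(combined) for _ in range(world_size)]
        dist.all_gather(gathered, combined, process_group)

        # One row per rank: [mean | var_biased | count]
        combined = torch.stack(gathered, dim=0)
        mean_all, var_all, count_all = torch.split(combined, num_channels, dim=1)
        return mean_all, var_all, count_all.view(-1)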
......@@ -35,7 +35,7 @@ __global__ void multi_tensor_apply_kernel(
ArgTypes... args)
{
// Hand the chunk information to the user-supplied functor to process however it likes.
callable(chunk_size, noop_flag, tl, args...);
}
template<int depth, typename T, typename... ArgTypes>
......@@ -50,8 +50,9 @@ void multi_tensor_apply(
TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
int len0 = tensor_lists[0].size();
TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
for(int l = 0; l < tensor_lists.size(); l++) // No range-based for because I need indices
auto ref_device = tensor_lists[0][0].device();
TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda");
for (int l = 0; l < tensor_lists.size(); l++) // No range-based for because I need indices
{
TORCH_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists");
for(int t = 0; t < tensor_lists[l].size(); t++)
......@@ -62,7 +63,7 @@ void multi_tensor_apply(
contiguous_memory = (contiguous_memory || tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast));
#endif
TORCH_CHECK(contiguous_memory, "A tensor was not contiguous.");
TORCH_CHECK(tensor_lists[l][t].is_cuda(), "A tensor was not cuda.");
TORCH_CHECK(tensor_lists[l][t].device() == ref_device, "A tensor was not on the same device as the first tensor");
TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch");
}
}
......@@ -71,8 +72,9 @@ void multi_tensor_apply(
TensorListMetadata<depth> tl;
const at::cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0]));
auto stream = at::cuda::getCurrentCUDAStream();
tl.start_tensor_this_launch = 0;
int loc_block_info = 0;
int loc_tensor_info = 0;
......@@ -98,7 +100,7 @@ void multi_tensor_apply(
tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
tl.block_to_chunk[loc_block_info] = chunk;
loc_block_info++;
bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth-1] &&
chunk == chunks_this_tensor - 1);
bool blocks_full = (loc_block_info == depth_to_max_blocks[depth-1]);
......@@ -124,7 +126,7 @@ void multi_tensor_apply(
if(chunk == chunks_this_tensor - 1)
{
// std::cout << "Hit case 1 " << cond1 << " " << cond2 << " " << cond3 << std::endl;
loc_tensor_info = 0;
tl.start_tensor_this_launch = t + 1;
}
else
......
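The checks added above compare every tensor against the device of the first tensor and launch under an OptionalCUDAGuard for that device, so multi-tensor ops now run on non-default GPUs as long as all lists share a single device. A hedged Python-side illustration of the new behavior (assumes the amp_C extension is built and at least two GPUs are visible):

    import torch
    import amp_C
    from apex.multi_tensor_apply import multi_tensor_applier

    overflow_buf = torch.zeros(1, dtype=torch.int, device="cuda:1")
    src = [torch.randn(1024, device="cuda:1") for _ in range(3)]
    dst = [torch.empty_like(t) for t in src]

    # Everything lives on cuda:1, so the kernel is launched under a device
    # guard for cuda:1 and succeeds even though cuda:0 is the default device.
    multi_tensor_applier(amp_C.multi_tensor_scale, overflow_buf, [src, dst], 0.5)

    # Mixing devices now trips the "same device as the first tensor" check.
    bad = [torch.randn(1024, device="cuda:0")] + src[1:]
    try:
        multi_tensor_applier(amp_C.multi_tensor_scale, overflow_buf, [bad, dst], 0.5)
    except RuntimeError as err:
        print(err)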
......@@ -2,6 +2,7 @@
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
#include <c10/cuda/CUDAGuard.h>
// Another possibility:
// #include <torch/all.h>
......@@ -343,13 +344,13 @@ std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_cuda(
max_chunks_per_tensor);)
AT_CUDA_CHECK(cudaGetLastError());
// AT_CUDA_CHECK(cudaDeviceSynchronize());
// This involves one more small kernel launch, but will be negligible end to end.
// I could get rid of these by hacking the functor + multi tensor harness with persistence
// logic, but keeping it simple for now
auto ret = at::empty({1}, output.options());
const at::cuda::OptionalCUDAGuard device_guard(device_of(output));
auto stream = at::cuda::getCurrentCUDAStream();
cleanup<<<per_tensor ? ntensors : 1, 512, 0, stream>>>(
output.DATA_PTR<float>(),
......@@ -377,7 +378,7 @@ void multi_tensor_norm_out_cuda(
const int norm_type)
{
auto float_options = tensor_lists[0][0].options().dtype(at::kFloat);
TORCH_CHECK(tensor_lists[0][0].device() == noop_flag.device(), "noop flag should be on the same device as tensors");
// we don't need global thus uses empty here
auto output = at::empty({320}, float_options);
......
......@@ -160,6 +160,8 @@ void multi_tensor_sgd_cuda(
TORCH_CHECK(tensor_lists[3][i].scalar_type() == at::ScalarType::Half,
"Additional output tensors should always be fp16.");
TORCH_CHECK(noop_flag.device() == tensor_lists[0][0].device(), "expected noop flag to be on the same device as tensors");
// We have 3 possibilities to handle here, in terms of
// grad_type, param_type, momentum_type, requires_fp16_copy
// 1. fp16, fp16, fp16, No
......
......@@ -1164,6 +1164,10 @@ std::vector<at::Tensor> welford_parallel_CUDA(const at::Tensor mean_feature_node
at::Tensor inv_std = at::empty_like(out_var);
at::Tensor out_mean = at::empty_like(out_var);
at::Tensor mean_feature_nodes_ = mean_feature_nodes.contiguous();
at::Tensor var_biased_ = var_biased.contiguous();
at::Tensor numel_ = numel.contiguous();
// TODO(jie): tile this for memory coalescing!
const int block = std::min(h_last_pow2(feature_size), MAX_BLOCK_SIZE);
const int grid = std::max<int>(1, feature_size / block);
......@@ -1174,9 +1178,9 @@ std::vector<at::Tensor> welford_parallel_CUDA(const at::Tensor mean_feature_node
using namespace at;
DISPATCH_FLOAT_AND_HALF(mean_feature_nodes.scalar_type(), 0, "welford_parallel_kernel",
welford_kernel_parallel<scalar_t_0><<<grid, block, 0, stream>>>(
mean_feature_nodes.DATA_PTR<scalar_t_0>(),
var_biased.DATA_PTR<scalar_t_0>(),
numel.DATA_PTR<int>(),
mean_feature_nodes_.DATA_PTR<scalar_t_0>(),
var_biased_.DATA_PTR<scalar_t_0>(),
numel_.DATA_PTR<int>(),
out_mean.DATA_PTR<scalar_t_0>(),
out_var.DATA_PTR<scalar_t_0>(),
inv_std.DATA_PTR<scalar_t_0>(),
......
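The welford_parallel_CUDA change above takes .contiguous() copies before handing raw pointers to the kernel: the per-rank statistics produced by the new all_gather path are views created by torch.split on a stacked tensor, so a kernel walking a bare data_ptr() would otherwise read them with the wrong strides. A tiny illustration of the hazard from the Python side:

    import torch

    combined = torch.arange(12.).reshape(3, 4)           # 3 ranks, 4 values each
    mean_all, var_all = torch.split(combined, 2, dim=1)  # views into `combined`
    print(mean_all.is_contiguous())                       # False: stride is (4, 1)
    # A CUDA kernel that walks a raw data_ptr() assumes densely packed rows,
    # so the wrapper materializes that layout first.
    print(mean_all.contiguous().is_contiguous())          # True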
......@@ -182,6 +182,7 @@ def main():
print("=> loading checkpoint '{}'".format(args.resume))
checkpoint = torch.load(args.resume, map_location = lambda storage, loc: storage.cuda(args.gpu))
args.start_epoch = checkpoint['epoch']
global best_prec1
best_prec1 = checkpoint['best_prec1']
model.load_state_dict(checkpoint['state_dict'])
optimizer.load_state_dict(checkpoint['optimizer'])
......@@ -527,7 +528,7 @@ def accuracy(output, target, topk=(1,)):
res = []
for k in topk:
correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
res.append(correct_k.mul_(100.0 / batch_size))
return res
......
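The accuracy() fix above swaps view(-1) for reshape(-1): correct comes from comparing against a transposed pred, so the sliced tensor is non-contiguous and view raises on recent PyTorch, whereas reshape falls back to a copy when the strides do not allow a view. A small standalone repro of the distinction:

    import torch

    t = torch.arange(6).reshape(2, 3).t()   # transpose makes it non-contiguous
    print(t.is_contiguous())                # False

    try:
        t.view(-1)                          # view needs compatible strides
    except RuntimeError as err:
        print("view failed:", err)

    print(t.reshape(-1))                    # reshape copies into a flat tensor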
import torch
from torch.utils import cpp_extension
from setuptools import setup, find_packages
import subprocess
......@@ -9,6 +10,16 @@ import os
# ninja build does not work unless include_dirs are abs path
this_dir = os.path.dirname(os.path.abspath(__file__))
def get_cuda_bare_metal_version(cuda_dir):
raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
output = raw_output.split()
release_idx = output.index("release") + 1
release = output[release_idx].split(".")
bare_metal_major = release[0]
bare_metal_minor = release[1][0]
return raw_output, bare_metal_major, bare_metal_minor
if not torch.cuda.is_available():
# https://github.com/NVIDIA/apex/issues/486
# Extension builds after https://github.com/pytorch/pytorch/pull/23408 attempt to query torch.cuda.get_device_capability(),
......@@ -16,11 +27,16 @@ if not torch.cuda.is_available():
print('\nWarning: Torch did not find available GPUs on this system.\n',
'If your intention is to cross-compile, this is not an error.\n'
'By default, Apex will cross-compile for Pascal (compute capabilities 6.0, 6.1, 6.2),\n'
'Volta (compute capability 7.0), and Turing (compute capability 7.5).\n'
'Volta (compute capability 7.0), Turing (compute capability 7.5),\n'
'and, if the CUDA version is >= 11.0, Ampere (compute capability 8.0).\n'
'If you wish to cross-compile for a single specific architecture,\n'
'export TORCH_CUDA_ARCH_LIST="compute capability" before running setup.py.\n')
if os.environ.get("TORCH_CUDA_ARCH_LIST", None) is None:
os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5"
_, bare_metal_major, _ = get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
if int(bare_metal_major) == 11:
os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5;8.0"
else:
os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5"
print("\n\ntorch.__version__ = {}\n\n".format(torch.__version__))
TORCH_MAJOR = int(torch.__version__.split('.')[0])
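get_cuda_bare_metal_version() above scrapes the toolkit version out of nvcc -V so that the default TORCH_CUDA_ARCH_LIST only gains compute capability 8.0 when the bare-metal CUDA is 11.x. A rough sketch of what it returns, assuming a toolkit installed under /usr/local/cuda and the usual banner format:

    # nvcc -V typically ends with a line such as:
    #   Cuda compilation tools, release 11.0, V11.0.194
    # The helper splits that text, takes the token after "release" ("11.0,"),
    # and returns the full banner plus the major/minor version strings.
    raw, major, minor = get_cuda_bare_metal_version("/usr/local/cuda")
    print(major, minor)   # e.g. "11" "0"  -> TORCH_CUDA_ARCH_LIST gains "8.0"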
......@@ -64,13 +80,18 @@ if "--cpp_ext" in sys.argv:
CppExtension('apex_C',
['csrc/flatten_unflatten.cpp',]))
def check_cuda_torch_binary_vs_bare_metal(cuda_dir):
def get_cuda_bare_metal_version(cuda_dir):
raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
output = raw_output.split()
release_idx = output.index("release") + 1
release = output[release_idx].split(".")
bare_metal_major = release[0]
bare_metal_minor = release[1][0]
return raw_output, bare_metal_major, bare_metal_minor
def check_cuda_torch_binary_vs_bare_metal(cuda_dir):
raw_output, bare_metal_major, bare_metal_minor = get_cuda_bare_metal_version(cuda_dir)
torch_binary_major = torch.version.cuda.split(".")[0]
torch_binary_minor = torch.version.cuda.split(".")[1]
......@@ -109,6 +130,25 @@ if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4):
version_ge_1_5 = ['-DVERSION_GE_1_5']
version_dependent_macros = version_ge_1_1 + version_ge_1_3 + version_ge_1_5
if "--distributed_adam" in sys.argv:
from torch.utils.cpp_extension import CUDAExtension
sys.argv.remove("--distributed_adam")
from torch.utils.cpp_extension import BuildExtension
cmdclass['build_ext'] = BuildExtension
if torch.utils.cpp_extension.CUDA_HOME is None:
raise RuntimeError("--distributed_adam was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")
else:
ext_modules.append(
CUDAExtension(name='distributed_adam_cuda',
sources=['apex/contrib/csrc/optimizers/multi_tensor_distopt_adam.cpp',
'apex/contrib/csrc/optimizers/multi_tensor_distopt_adam_kernel.cu'],
include_dirs=[os.path.join(this_dir, 'csrc')],
extra_compile_args={'cxx': ['-O3',] + version_dependent_macros,
'nvcc':['-O3',
'--use_fast_math'] + version_dependent_macros}))
if "--distributed_lamb" in sys.argv:
from torch.utils.cpp_extension import CUDAExtension
sys.argv.remove("--distributed_lamb")
......@@ -228,7 +268,7 @@ if "--xentropy" in sys.argv:
'apex/contrib/csrc/xentropy/xentropy_kernel.cu'],
include_dirs=[os.path.join(this_dir, 'csrc')],
extra_compile_args=['-O3'] + version_dependent_macros))
if "--deprecated_fused_adam" in sys.argv:
from torch.utils.cpp_extension import CUDAExtension
......@@ -281,6 +321,7 @@ torch_dir = torch.__path__[0]
if os.path.exists(os.path.join(torch_dir, 'include', 'ATen', 'CUDAGenerator.h')):
generator_flag = ['-DOLD_GENERATOR']
if "--fast_multihead_attn" in sys.argv:
from torch.utils.cpp_extension import CUDAExtension
sys.argv.remove("--fast_multihead_attn")
......@@ -291,6 +332,13 @@ if "--fast_multihead_attn" in sys.argv:
if torch.utils.cpp_extension.CUDA_HOME is None:
raise RuntimeError("--fast_multihead_attn was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")
else:
# Check if CUDA 11 is installed for compute capability 8.0
cc_flag = []
_, bare_metal_major, _ = get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
if int(bare_metal_major) >= 11:
cc_flag.append('-gencode')
cc_flag.append('arch=compute_80,code=sm_80')
subprocess.run(["git", "submodule", "update", "--init", "apex/contrib/csrc/multihead_attn/cutlass"])
ext_modules.append(
CUDAExtension(name='fast_additive_mask_softmax_dropout',
......@@ -304,7 +352,7 @@ if "--fast_multihead_attn" in sys.argv:
'-U__CUDA_NO_HALF_CONVERSIONS__',
'--expt-relaxed-constexpr',
'--expt-extended-lambda',
'--use_fast_math'] + version_dependent_macros + generator_flag}))
'--use_fast_math'] + version_dependent_macros + generator_flag + cc_flag}))
ext_modules.append(
CUDAExtension(name='fast_mask_softmax_dropout',
sources=['apex/contrib/csrc/multihead_attn/masked_softmax_dropout.cpp',
......@@ -317,7 +365,7 @@ if "--fast_multihead_attn" in sys.argv:
'-U__CUDA_NO_HALF_CONVERSIONS__',
'--expt-relaxed-constexpr',
'--expt-extended-lambda',
'--use_fast_math'] + version_dependent_macros + generator_flag}))
'--use_fast_math'] + version_dependent_macros + generator_flag + cc_flag}))
ext_modules.append(
CUDAExtension(name='fast_self_multihead_attn_bias_additive_mask',
sources=['apex/contrib/csrc/multihead_attn/self_multihead_attn_bias_additive_mask.cpp',
......@@ -330,7 +378,7 @@ if "--fast_multihead_attn" in sys.argv:
'-U__CUDA_NO_HALF_CONVERSIONS__',
'--expt-relaxed-constexpr',
'--expt-extended-lambda',
'--use_fast_math'] + version_dependent_macros + generator_flag}))
'--use_fast_math'] + version_dependent_macros + generator_flag + cc_flag}))
ext_modules.append(
CUDAExtension(name='fast_self_multihead_attn_bias',
sources=['apex/contrib/csrc/multihead_attn/self_multihead_attn_bias.cpp',
......@@ -343,7 +391,7 @@ if "--fast_multihead_attn" in sys.argv:
'-U__CUDA_NO_HALF_CONVERSIONS__',
'--expt-relaxed-constexpr',
'--expt-extended-lambda',
'--use_fast_math'] + version_dependent_macros + generator_flag}))
'--use_fast_math'] + version_dependent_macros + generator_flag + cc_flag}))
ext_modules.append(
CUDAExtension(name='fast_self_multihead_attn',
sources=['apex/contrib/csrc/multihead_attn/self_multihead_attn.cpp',
......@@ -356,7 +404,7 @@ if "--fast_multihead_attn" in sys.argv:
'-U__CUDA_NO_HALF_CONVERSIONS__',
'--expt-relaxed-constexpr',
'--expt-extended-lambda',
'--use_fast_math'] + version_dependent_macros + generator_flag}))
'--use_fast_math'] + version_dependent_macros + generator_flag + cc_flag}))
ext_modules.append(
CUDAExtension(name='fast_self_multihead_attn_norm_add',
sources=['apex/contrib/csrc/multihead_attn/self_multihead_attn_norm_add.cpp',
......@@ -369,7 +417,7 @@ if "--fast_multihead_attn" in sys.argv:
'-U__CUDA_NO_HALF_CONVERSIONS__',
'--expt-relaxed-constexpr',
'--expt-extended-lambda',
'--use_fast_math'] + version_dependent_macros + generator_flag}))
'--use_fast_math'] + version_dependent_macros + generator_flag + cc_flag}))
ext_modules.append(
CUDAExtension(name='fast_encdec_multihead_attn',
sources=['apex/contrib/csrc/multihead_attn/encdec_multihead_attn.cpp',
......@@ -382,7 +430,7 @@ if "--fast_multihead_attn" in sys.argv:
'-U__CUDA_NO_HALF_CONVERSIONS__',
'--expt-relaxed-constexpr',
'--expt-extended-lambda',
'--use_fast_math'] + version_dependent_macros + generator_flag}))
'--use_fast_math'] + version_dependent_macros + generator_flag + cc_flag}))
ext_modules.append(
CUDAExtension(name='fast_encdec_multihead_attn_norm_add',
sources=['apex/contrib/csrc/multihead_attn/encdec_multihead_attn_norm_add.cpp',
......@@ -395,7 +443,7 @@ if "--fast_multihead_attn" in sys.argv:
'-U__CUDA_NO_HALF_CONVERSIONS__',
'--expt-relaxed-constexpr',
'--expt-extended-lambda',
'--use_fast_math'] + version_dependent_macros + generator_flag}))
'--use_fast_math'] + version_dependent_macros + generator_flag + cc_flag}))
setup(
name='apex',
......
import unittest
import apex
import torch
from apex.testing.common_utils import skipIfRocm
class TestFusedAdagrad(unittest.TestCase):
def setUp(self, max_abs_diff=1e-6, max_rel_diff=1, iters=7):
self.max_abs_diff = max_abs_diff
self.max_rel_diff = max_rel_diff
self.iters = iters
torch.cuda.manual_seed(9876)
def tearDown(self):
pass
def gen_param_optim(self, tensors, adagrad_option, apex_only=False):
ref_param = []
tst_param = []
for tensor in tensors:
if apex_only:
ref_param.append(torch.nn.Parameter(tensor.clone().float()))
else:
ref_param.append(torch.nn.Parameter(tensor.clone()))
tst_param.append(torch.nn.Parameter(tensor.clone()))
if apex_only:
ref_optim = apex.optimizers.FusedAdagrad(ref_param, **adagrad_option)
else:
ref_optim = torch.optim.Adagrad(ref_param, **adagrad_option)
tst_optim = apex.optimizers.FusedAdagrad(tst_param, **adagrad_option)
return (ref_param, tst_param, ref_optim, tst_optim)
def gen_grad(self, ref_param, tst_param, apex_only=False):
for p_ref, p_tst in zip(ref_param, tst_param):
p_tst.grad = torch.rand_like(p_tst)
p_ref.grad = p_tst.grad.detach().float() if apex_only else p_tst.grad
def gen_mixed_grad(self, ref_param, tst_param, scale=1.0):
half_grads = []
for p_ref, _ in zip(ref_param, tst_param):
half_grads.append(torch.rand_like(p_ref).half())
p_ref.grad = half_grads[-1].float() / scale
return half_grads
def get_max_diff(self, ref_param, tst_param, apex_only=False):
max_abs_diff = max_rel_diff = 0
for p_ref, p_tst in zip(ref_param, tst_param):
if apex_only:
p_tst = p_tst.float()
max_abs_diff_p = (p_ref - p_tst).abs().max().item()
max_rel_diff_p = ((p_ref - p_tst) / p_ref).abs().max().item()
if max_abs_diff_p > max_abs_diff:
max_abs_diff = max_abs_diff_p
if max_rel_diff_p > max_rel_diff:
max_rel_diff = max_rel_diff_p
return max_abs_diff, max_rel_diff
def gen_single_type_test(self, param_type=torch.float, apex_only=False):
nelem = 278011
adagrad_option = {"lr": 5e-4, "eps": 1e-08, "weight_decay": 1.0e-5}
tensor = torch.rand(nelem, dtype=param_type, device="cuda")
ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(
[tensor], adagrad_option, apex_only=apex_only
)
for _ in range(self.iters):
self.gen_grad(ref_param, tst_param, apex_only=apex_only)
ref_optim.step()
tst_optim.step()
max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param, apex_only=apex_only)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
if not apex_only:
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
@skipIfRocm
def test_float(self):
self.gen_single_type_test(param_type=torch.float)
@unittest.skip("PyTorch optimizer is not numerically correct for fp16")
def test_half(self):
self.gen_single_type_test(param_type=torch.float16)
# Compares bfloat16 computation against float32 as the gold standard.
# Uses apex optimizers (controlled by the apex_only flag) for both types.
# Doesn't use an upstream optimizer like the other tests, as they seem to be
# numerically unstable for half types (see the skip note for the test above).
@skipIfRocm
def test_bfloat16(self):
self.max_abs_diff = 1e-2
self.gen_single_type_test(param_type=torch.bfloat16, apex_only=True)
@skipIfRocm
def test_multi_params(self):
sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
adagrad_option = {"lr": 5e-4, "eps": 1e-08, "weight_decay": 0}
tensors = []
for size in sizes:
tensors.append(torch.rand(size, dtype=torch.float, device="cuda"))
ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(
tensors, adagrad_option
)
for _ in range(self.iters):
self.gen_grad(ref_param, tst_param)
ref_optim.step()
tst_optim.step()
max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
def test_adagrad_option(self):
nelem = 1
adagrad_option = {"lr": 0.01, "eps": 3e-06, "weight_decay": 0}
tensor = torch.rand(nelem, dtype=torch.float, device="cuda")
ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(
[tensor], adagrad_option
)
for _ in range(self.iters):
self.gen_grad(ref_param, tst_param)
ref_optim.step()
tst_optim.step()
max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
import argparse
import random
import sys
import torch
from torch.nn.parallel import DistributedDataParallel as DDP
from apex import amp
from apex.optimizers import FusedAdam
from apex.contrib.optimizers.distributed_fused_adam import DistributedFusedAdam
class TestModel(torch.nn.Module):
def __init__(self, args):
super(TestModel, self).__init__()
self.linear = torch.nn.Sequential(*[torch.nn.Linear(args.dim, args.dim, bias=args.bias) for _ in range(args.layers)])
def forward(self, x):
return self.linear(x)
def setup(args):
## Model
ref_model = TestModel(args).cuda()
dist_model = TestModel(args).cuda()
# Same weights
with torch.no_grad():
for dp, rp in zip(dist_model.parameters(), ref_model.parameters()):
dp.data.copy_(rp.data)
dist_model = dist_model.half()
## Optimizer
# same hyperparameters
ref_opt_args = { 'lr': 1e-3, 'eps': 1e-6, 'weight_decay': 0.01 }
ref_opt = FusedAdam(ref_model.parameters(), **ref_opt_args)
dist_opt_args = ref_opt_args.copy()
dist_opt_args.update( {'overlap_reductions' : False} )
dist_opt_args.update( {'process_group_size' : args.n_gpu} )
dist_opt_args.update( {'dwu_group_size' : args.dwu_group_size} )
dist_opt_args.update( {'dwu_num_blocks' : 1} )
dist_opt_args.update( {'dwu_num_chunks' : 1} )
dist_opt = DistributedFusedAdam(dist_model.parameters(), **dist_opt_args)
dist_opt.set_global_scale(1.)
## amp-init
amp_args = { 'loss_scale' : 'dynamic' , 'opt_level' : 'O2'}
ref_model, ref_opt = amp.initialize(ref_model, ref_opt, **amp_args)
## DDP
ref_model = DDP(ref_model, device_ids=[args.rank])
with torch.no_grad():
for dp in dist_model.parameters():
torch.distributed.broadcast(dp.data, src=0)
for rp in ref_model.parameters():
torch.distributed.broadcast(rp.data, src=0)
torch.cuda.synchronize()
torch.distributed.barrier()
if get_rank() == 0:
print(f'dist opt with {args.n_gpu} GPUs')
return ref_model, ref_opt, dist_model, dist_opt
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--local_rank', type=int, default=-1)
parser.add_argument('--steps', type=int, default=20)
parser.add_argument('--batch', type=int, default=32)
parser.add_argument('--dim', type=int, default=4)
parser.add_argument('--layers', type=int, default=2)
parser.add_argument('--bias', action='store_true')
parser.add_argument('--atol', type=float, default=1e-3)
parser.add_argument('--rtol', type=float, default=1)
parser.add_argument('--dwu_group_size', type=float, default=1)
args = parser.parse_args()
return args
def setup_env(args):
torch.cuda.set_device(args.local_rank)
torch.distributed.init_process_group(backend='nccl', init_method='env://')
args.rank = torch.distributed.get_rank()
args.n_gpu = torch.distributed.get_world_size()
seed = 42 + get_rank()
random.seed(seed)
torch.manual_seed(seed)
return args
def get_rank():
return torch.distributed.get_rank()
def main():
args = parse_args()
args = setup_env(args)
tol_args = { 'atol' : args.atol, 'rtol' : args.rtol }
torch.set_printoptions(precision=16)
ref_model, ref_opt, dist_model, dist_opt = setup(args)
# lazy_init not called yet, initialize stash
stash = ref_opt._amp_stash
stash.all_fp16_params, stash.all_fp32_from_fp16_params = [], []
# make sure everything from _first_step_init_ is ready before training
# e.g. registering allreduce_hook
# so that gradients are copied/reduced when necessary
dist_opt._init_everything()
for i in range(args.steps):
x_ref = torch.randn(args.batch, args.dim, dtype=torch.half).cuda().requires_grad_(True)
x_dist = x_ref.clone().detach().requires_grad_(True)
if get_rank() == 0:
print(f'[{i}] Checking input')
#print("x_ref:", x_ref.flatten()[:10])
#print("x_dist:", x_dist.flatten()[:10])
assert(torch.allclose(x_ref, x_dist, **tol_args))
y_ref = ref_model(x_ref).half()
y_dist = dist_model(x_dist)
if get_rank() == 0:
print(f'[{i}] Checking output')
#print("y_ref:", y_ref.flatten()[:10])
#print("y_dist:", y_dist.flatten()[:10])
assert(torch.allclose(y_ref, y_dist, **tol_args))
dy = torch.randn_like(y_ref)
y_ref.backward(dy)
y_dist.backward(dy)
if get_rank() == 0:
print(f'[{i}] Checking gradients')
torch.distributed.barrier()
torch.cuda.synchronize()
assert(torch.allclose(x_ref.grad, x_dist.grad, **tol_args))
# gradient all-reduce within distributed optimizer
dist_opt.complete_reductions()
if get_rank() == 0:
print(f'[{i}] Stepping')
ref_opt.step()
dist_opt.step()
torch.cuda.synchronize()
torch.distributed.barrier()
print('Checking new weights')
if get_rank() == 0:
print("ref param:", ref_model.module.linear[0].weight)
print("dist param:", dist_model.linear[0].weight)
for i, (rp, dp) in enumerate(zip(ref_model.parameters(), dist_model.parameters())):
if not torch.allclose(rp, dp, **tol_args):
if get_rank() == 0:
print(f'Rank: {get_rank()}, Param: {i}')
print(f'ref: {rp.sum().item()}, dist: {dp.sum().item()}')
print(rp)
print(dp)
print(torch.abs(rp-dp) > tol_args['atol'])
sys.exit(0)
# zero grads
for rp, dp in zip(ref_model.parameters(), dist_model.parameters()):
rp.grad = None
dp.grad = None
if __name__ == "__main__":
main()
......@@ -4,10 +4,11 @@ import random
import torch
import apex
from itertools import product
from apex.testing.common_utils import skipIfRocm
class TestFusedAdam(unittest.TestCase):
class TestFusedOptimizer(unittest.TestCase):
def setUp(self, max_abs_diff=1e-3, max_rel_diff=1, iters=7):
self.max_abs_diff = max_abs_diff
self.max_rel_diff = max_rel_diff
......@@ -17,7 +18,7 @@ class TestFusedAdam(unittest.TestCase):
def tearDown(self):
pass
def gen_param_optim(self, tensors, adam_option, apex_only=False):
def gen_param_optim(self, tensors, options, apex_only=False):
ref_param = []
tst_param = []
for tensor in tensors:
......@@ -28,10 +29,10 @@ class TestFusedAdam(unittest.TestCase):
tst_param.append(torch.nn.Parameter(tensor.clone()))
if apex_only:
ref_optim = apex.optimizers.FusedAdam(ref_param, **adam_option)
ref_optim = self.fused_optim(ref_param, **options)
else:
ref_optim = torch.optim.Adam(ref_param, **adam_option)
tst_optim = apex.optimizers.FusedAdam(tst_param, **adam_option)
ref_optim = self.ref_optim(ref_param, **options)
tst_optim = self.fused_optim(tst_param, **options)
return (ref_param, tst_param, ref_optim, tst_optim)
......@@ -60,26 +61,32 @@ class TestFusedAdam(unittest.TestCase):
return max_abs_diff, max_rel_diff
def gen_single_type_test(self, param_type=torch.float, apex_only=False):
def gen_single_type_test(self, param_type=torch.float, apex_only=False, device='cuda'):
nelem = 278011
adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
'weight_decay':0, 'amsgrad':False}
tensor = torch.rand(nelem, dtype=param_type, device='cuda')
tensor = torch.rand(nelem, dtype=param_type, device=device)
ref_param, tst_param, ref_optim, tst_optim = \
self.gen_param_optim([tensor], adam_option, apex_only=apex_only)
self.gen_param_optim([tensor], self.options, apex_only=apex_only)
for i in range(self.iters):
self.gen_grad(ref_param, tst_param, apex_only=apex_only)
ref_optim.step()
tst_optim.step()
max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param, apex_only=apex_only)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
if not apex_only:
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
@skipIfRocm
class TestFusedAdam(TestFusedOptimizer):
def __init__(self, *args, **kwargs):
super(TestFusedAdam, self).__init__(*args, **kwargs)
self.options = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
'weight_decay': 0, 'amsgrad': False}
self.ref_optim = torch.optim.Adam
self.fused_optim = apex.optimizers.FusedAdam
def test_float(self):
self.gen_single_type_test(param_type=torch.float)
......@@ -95,17 +102,22 @@ class TestFusedAdam(unittest.TestCase):
self.max_abs_diff = 1e-2
self.gen_single_type_test(param_type=torch.bfloat16, apex_only=True)
@unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
def test_multi_device(self):
devices = ("cuda:0", "cuda:1")
for current_dev, tensor_dev in product(devices, devices):
with torch.cuda.device(current_dev):
self.gen_single_type_test(param_type=torch.float, device=tensor_dev)
@unittest.skip('Disable until 8/1/2019 adam/adamw upstream picked')
def test_multi_params(self):
sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
'weight_decay':0, 'amsgrad':False}
tensors = []
for size in sizes:
tensors.append(torch.rand(size, dtype=torch.float, device='cuda'))
ref_param, tst_param, ref_optim, tst_optim = \
self.gen_param_optim(tensors, adam_option)
self.gen_param_optim(tensors, self.options)
for i in range(self.iters):
self.gen_grad(ref_param, tst_param)
......@@ -118,12 +130,9 @@ class TestFusedAdam(unittest.TestCase):
@unittest.skip('No longer support fuse scaling')
def test_scale(self):
nelem = 278011
adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
'weight_decay':0, 'amsgrad':False}
tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
ref_param, tst_param, ref_optim, tst_optim = \
self.gen_param_optim([tensor], adam_option)
self.gen_param_optim([tensor], self.options)
for i in range(self.iters):
scale = random.random() * 1000
......@@ -138,12 +147,10 @@ class TestFusedAdam(unittest.TestCase):
@unittest.skip('No longer support output fp16 param')
def test_fp16_output(self):
nelem = 278011
adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
'weight_decay':0, 'amsgrad':False}
tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
ref_param, tst_param, ref_optim, tst_optim = \
self.gen_param_optim([tensor], adam_option)
self.gen_param_optim([tensor], self.options)
fp16_param = torch.nn.Parameter(tensor.clone().half())
......@@ -180,6 +187,103 @@ class TestFusedAdam(unittest.TestCase):
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
class TestFusedAdagrad(TestFusedOptimizer):
def __init__(self, *args, **kwargs):
super(TestFusedAdagrad, self).__init__(*args, **kwargs)
self.options = {"lr": 5e-4, "eps": 1e-08, "weight_decay": 1.0e-5}
self.ref_optim = torch.optim.Adagrad
self.fused_optim = apex.optimizers.FusedAdagrad
def test_float(self):
self.gen_single_type_test(param_type=torch.float)
@unittest.skip("PyTorch optimizer is not numerically correct for fp16")
def test_half(self):
self.gen_single_type_test(param_type=torch.float16)
@unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
def test_multi_device(self):
devices = ("cuda:0", "cuda:1")
for current_dev, tensor_dev in product(devices, devices):
with torch.cuda.device(current_dev):
self.gen_single_type_test(param_type=torch.float, device=tensor_dev)
def test_multi_params(self):
sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
adagrad_option = {"lr": 5e-4, "eps": 1e-08, "weight_decay": 0}
tensors = []
for size in sizes:
tensors.append(torch.rand(size, dtype=torch.float, device="cuda"))
ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(
tensors, adagrad_option
)
for _ in range(self.iters):
self.gen_grad(ref_param, tst_param)
ref_optim.step()
tst_optim.step()
max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
@unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
def test_multi_params_different_devices_throws(self):
sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
adagrad_option = {"lr": 5e-4, "eps": 1e-08, "weight_decay": 0}
tensors = []
for i, size in enumerate(sizes):
tensors.append(torch.rand(size, dtype=torch.float, device="cuda:"+str(i % 2)))
ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(
tensors, adagrad_option
)
self.gen_grad(ref_param, tst_param)
with self.assertRaisesRegex(RuntimeError, "not on the same device"):
tst_optim.step()
def test_adagrad_option(self):
nelem = 1
adagrad_option = {"lr": 0.01, "eps": 3e-06, "weight_decay": 0}
tensor = torch.rand(nelem, dtype=torch.float, device="cuda")
ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(
[tensor], adagrad_option
)
for _ in range(self.iters):
self.gen_grad(ref_param, tst_param)
ref_optim.step()
tst_optim.step()
max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
class TestFusedSGD(TestFusedOptimizer):
def __init__(self, *args, **kwargs):
super(TestFusedSGD, self).__init__(*args, **kwargs)
self.options = {"lr": .25, "momentum": .125}
self.ref_optim = torch.optim.SGD
self.fused_optim = apex.optimizers.FusedSGD
def test_float(self):
self.gen_single_type_test(param_type=torch.float)
def test_half(self):
self.gen_single_type_test(param_type=torch.float16)
@unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
def test_multi_device(self):
devices = ("cuda:0", "cuda:1")
for current_dev, tensor_dev in product(devices, devices):
with torch.cuda.device(current_dev):
self.gen_single_type_test(param_type=torch.float, device=tensor_dev)
if __name__ == '__main__':
script_path = os.path.dirname(os.path.realpath(__file__))
unittest.main()
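The refactor above folds the Adam, Adagrad and SGD tests into one TestFusedOptimizer harness that only needs options, ref_optim and fused_optim. Adding coverage for another fused configuration then reduces to a subclass like the following (a hypothetical sketch reusing the file's existing imports, shown only to illustrate the pattern):

    class TestFusedSGDNesterov(TestFusedOptimizer):
        # Hypothetical: exercise FusedSGD with Nesterov momentum against
        # torch.optim.SGD using the shared single-type test.
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.options = {"lr": 0.25, "momentum": 0.125, "nesterov": True}
            self.ref_optim = torch.optim.SGD
            self.fused_optim = apex.optimizers.FusedSGD

        def test_float(self):
            self.gen_single_type_test(param_type=torch.float)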
......@@ -6,6 +6,7 @@ from torch.optim import Optimizer
import apex
from apex.multi_tensor_apply import multi_tensor_applier
from apex.testing.common_utils import skipIfRocm
from itertools import product
class RefLAMB(Optimizer):
r"""Implements Lamb algorithm.
......@@ -41,7 +42,7 @@ class RefLAMB(Optimizer):
import amp_C
self.multi_tensor_l2norm=amp_C.multi_tensor_l2norm
# Skip buffer
self._dummy_overflow_buf = torch.cuda.IntTensor([0])
self._dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device=self.param_groups[0]["params"][0].device)
self.multi_tensor_lamb = amp_C.multi_tensor_lamb
else:
raise RuntimeError('apex.optimizers.FusedLAMB requires cuda extensions')
......@@ -69,7 +70,8 @@ class RefLAMB(Optimizer):
else:
raise RuntimeError('FusedLAMB only support fp16 and fp32.')
g_norm_32, g_norm_16 = torch.zeros(1, device='cuda'), torch.zeros(1, device='cuda')
device = self.param_groups[0]["params"][0].device
g_norm_32, g_norm_16 = torch.zeros(1, device=device), torch.zeros(1, device=device)
# compute grad norm for two lists
if len(g_all_32) > 0:
g_norm_32 = multi_tensor_applier(self.multi_tensor_l2norm,
......@@ -85,7 +87,7 @@ class RefLAMB(Optimizer):
self._dummy_overflow_buf,
[[g_norm_32, g_norm_16]],
False)[0]
max_grad_norm = 1.0
clipped_ratio = max_grad_norm / max(global_grad_norm, max_grad_norm)
......@@ -94,7 +96,7 @@ class RefLAMB(Optimizer):
if p.grad is None:
continue
p.grad.data *= clipped_ratio
grad = p.grad.data
if grad.is_sparse:
raise RuntimeError('Lamb does not support sparse gradients, consider SparseAdam instead.')
......@@ -137,7 +139,7 @@ class RefLAMB(Optimizer):
state['g_norm'] = g_norm
state['trust_ratio'] = trust_ratio
step_size = group['lr']
p.data.add_(update, alpha=-step_size*trust_ratio)
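Between the gradient clipping shown above and the final p.data.add_ call, the reference optimizer computes Adam-style moments and a layer-wise trust ratio; the diff elides that middle section. The following is therefore only a sketch of a single-parameter LAMB step (bias correction omitted, helper name illustrative), not apex's exact implementation:

    import torch

    def lamb_step_sketch(p, grad, exp_avg, exp_avg_sq,
                         lr, beta1, beta2, eps, weight_decay):
        exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)                 # m_t
        exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)    # v_t
        update = exp_avg / (exp_avg_sq.sqrt() + eps)
        if weight_decay != 0:
            update = update + weight_decay * p.data
        w_norm = p.data.norm().item()
        u_norm = update.norm().item()
        # Layer-wise trust ratio: scale the step by ||w|| / ||update||.
        trust_ratio = w_norm / u_norm if w_norm > 0 and u_norm > 0 else 1.0
        p.data.add_(update, alpha=-lr * trust_ratio)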
......@@ -189,11 +191,11 @@ class TestFusedLAMB(unittest.TestCase):
return max_abs_diff, max_rel_diff
def gen_single_type_test(self, param_type=torch.float):
def gen_single_type_test(self, param_type=torch.float, device="cuda"):
nelem = 278011
tensor = torch.rand(nelem, dtype=param_type, device='cuda')
tensor = torch.rand(nelem, dtype=param_type, device=device)
weight_decay = [0, 0.01]
for wd in weight_decay:
lamb_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08, 'weight_decay':wd}
ref_param, tst_param, ref_optim, tst_optim = \
......@@ -202,7 +204,9 @@ class TestFusedLAMB(unittest.TestCase):
for i in range(self.iters):
self.gen_grad(ref_param, tst_param)
ref_optim.step()
torch.cuda.synchronize()
tst_optim.step()
torch.cuda.synchronize()
max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
......@@ -217,10 +221,17 @@ class TestFusedLAMB(unittest.TestCase):
self.gen_single_type_test(param_type=torch.float16)
@skipIfRocm
@unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
def test_multi_device(self):
devices = ("cuda:0", "cuda:1")
for current_dev, tensor_dev in product(devices, devices):
with torch.cuda.device(current_dev):
self.gen_single_type_test(param_type=torch.float, device=tensor_dev)
def test_multi_params(self):
sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
weight_decay = [0, 0.01]
for wd in weight_decay:
lamb_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08, 'weight_decay':wd}
tensors = []
......@@ -242,7 +253,7 @@ class TestFusedLAMB(unittest.TestCase):
nelem = 1
tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
weight_decay = [0, 0.01]
for wd in weight_decay:
lamb_option = {'lr':0.01, 'betas':(0.6, 0.9), 'eps':3e-06, 'weight_decay':wd}
ref_param, tst_param, ref_optim, tst_optim = \
......