Unverified Commit 85b56d01 authored by Jeff Daily, committed by GitHub

Merge pull request #43 from ROCmSoftwarePlatform/IFU-2021-01-18

IFU-2021-01-18
parents d061bf20 13c8d152
@@ -21,33 +21,31 @@ class SyncBatchnormFunction(Function):
        if channel_last:
            count = int(input.numel()/input.size(-1))
            mean, var_biased = syncbn.welford_mean_var_c_last(input)
+            num_channels = input.size(-1)
        else:
            count = int(input.numel()/input.size(1))
            mean, var_biased = syncbn.welford_mean_var(input)
+            num_channels = input.size(1)

        if torch.distributed.is_initialized():
            if not process_group:
                process_group = torch.distributed.group.WORLD
            device = mean.device
            world_size = torch.distributed.get_world_size(process_group)
-            mean_all = torch.empty(world_size, mean.size(0), dtype=mean.dtype, device=device)
-            var_all = torch.empty(world_size, var_biased.size(0), dtype=var_biased.dtype, device=device)
-            count_all = torch.cuda.IntTensor(world_size, device=device)
-            mean_l = [mean_all.narrow(0, i, 1) for i in range(world_size)]
-            var_l = [var_all.narrow(0, i, 1) for i in range(world_size)]
-            count_l = [count_all.narrow(0, i, 1) for i in range(world_size)]
-            torch.distributed.all_gather(mean_l, mean, process_group)
-            torch.distributed.all_gather(var_l, var_biased, process_group)
-            torch.distributed.all_gather(
-                count_l,
-                torch.cuda.IntTensor([count], device=device),
-                process_group)
-            mean, var, inv_std = syncbn.welford_parallel(mean_all, var_all, count_all, eps)
+            count_t = torch.empty(1, dtype=mean.dtype, device=mean.device).fill_(count)
+            combined = torch.cat([mean.view(-1), var_biased.view(-1), count_t], dim=0)
+            combined_list = [torch.empty_like(combined) for k in range(world_size)]
+            torch.distributed.all_gather(combined_list, combined, process_group)
+            combined = torch.stack(combined_list, dim=0)
+            mean_all, invstd_all, count_all = torch.split(combined, num_channels, dim=1)
+            count_all = count_all.view(-1)
+            mean, var, inv_std = syncbn.welford_parallel(mean_all, invstd_all, count_all.to(torch.int32), eps)
        else:
            device = mean.device
            count_all = torch.cuda.IntTensor([count], device=device)
            inv_std = 1.0 / torch.sqrt(var_biased + eps)
            var = var_biased * (count) / (count-1)

            if count == 1 and world_size < 2:
                raise ValueError('Expected more than 1 value per channel when training, got input size{}'.format(input.size()))
@@ -60,7 +58,7 @@ class SyncBatchnormFunction(Function):
            mean = running_mean.data
            inv_std = 1.0 / torch.sqrt(running_variance.data + eps)

-        ctx.save_for_backward(input, weight, mean, inv_std, z, bias, count_all)
+        ctx.save_for_backward(input, weight, mean, inv_std, z, bias, count_all.to(torch.int32))
        ctx.process_group = process_group
        ctx.channel_last = channel_last
        ctx.world_size = world_size
@@ -101,10 +99,12 @@ class SyncBatchnormFunction(Function):
        if ctx.needs_input_grad[0]:
            if torch.distributed.is_initialized():
+                num_channels = sum_dy.shape[0]
+                combined = torch.cat([sum_dy, sum_dy_xmu], dim=0)
-                torch.distributed.all_reduce(
-                    sum_dy, ReduceOp.SUM, process_group)
-                torch.distributed.all_reduce(
-                    sum_dy_xmu, ReduceOp.SUM, process_group)
+                torch.distributed.all_reduce(
+                    combined, torch.distributed.ReduceOp.SUM, process_group, async_op=False)
+                sum_dy, sum_dy_xmu = torch.split(combined, num_channels)

            if channel_last:
                grad_input = syncbn.batchnorm_backward_c_last(grad_output, saved_input, mean, inv_std, weight, sum_dy, sum_dy_xmu, count)
            else:
...
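
The forward change above collapses three all_gather calls (per-channel mean, biased variance, and the element count) into a single collective by packing everything into one flat tensor, and the backward change does the same for the two all_reduce calls. A minimal sketch of that pattern, with a hypothetical helper name (gather_channel_stats), assuming torch.distributed is already initialized:

# Sketch only: mirrors the combined all_gather used in the diff above.
import torch
import torch.distributed as dist

def gather_channel_stats(mean, var_biased, count, process_group):
    # mean, var_biased: 1-D tensors of length C on the local rank; count: python int
    count_t = torch.empty(1, dtype=mean.dtype, device=mean.device).fill_(count)
    combined = torch.cat([mean.view(-1), var_biased.view(-1), count_t], dim=0)
    world_size = dist.get_world_size(process_group)
    gathered = [torch.empty_like(combined) for _ in range(world_size)]
    dist.all_gather(gathered, combined, process_group)   # one collective instead of three
    stacked = torch.stack(gathered, dim=0)                # shape (world_size, 2*C + 1)
    mean_all, var_all, count_all = torch.split(stacked, mean.numel(), dim=1)
    return mean_all, var_all, count_all.view(-1)
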
@@ -2,6 +2,7 @@
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
+#include <c10/cuda/CUDAGuard.h>
#include <THC/THC.h>

#include "compat.h"

@@ -35,7 +36,7 @@ __global__ void multi_tensor_apply_kernel(
  ArgTypes... args)
{
  // Hand the chunk information to the user-supplied functor to process however it likes.
  callable(chunk_size, noop_flag, tl, args...);
}

template<int depth, typename T, typename... ArgTypes>

@@ -50,8 +51,9 @@ void multi_tensor_apply(
  TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
  int len0 = tensor_lists[0].size();
  TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
+  auto ref_device = tensor_lists[0][0].device();
+  TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda");
-  for(int l = 0; l < tensor_lists.size(); l++) // No range-based for because I need indices
+  for (int l = 0; l < tensor_lists.size(); l++) // No range-based for because I need indices
  {
    TORCH_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists");
    for(int t = 0; t < tensor_lists[l].size(); t++)

@@ -62,7 +64,7 @@ void multi_tensor_apply(
      contiguous_memory = (contiguous_memory || tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast));
#endif
      TORCH_CHECK(contiguous_memory, "A tensor was not contiguous.");
-      TORCH_CHECK(tensor_lists[l][t].is_cuda(), "A tensor was not cuda.");
+      TORCH_CHECK(tensor_lists[l][t].device() == ref_device, "A tensor was not on the same device as the first tensor");
      TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch");
    }
  }

@@ -71,8 +73,9 @@ void multi_tensor_apply(
  TensorListMetadata<depth> tl;

+  const at::cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0]));
  auto stream = at::cuda::getCurrentCUDAStream();

  tl.start_tensor_this_launch = 0;
  int loc_block_info = 0;
  int loc_tensor_info = 0;

@@ -98,7 +101,7 @@ void multi_tensor_apply(
      tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
      tl.block_to_chunk[loc_block_info] = chunk;
      loc_block_info++;

      bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth-1] &&
                           chunk == chunks_this_tensor - 1);
      bool blocks_full = (loc_block_info == depth_to_max_blocks[depth-1]);

@@ -124,7 +127,7 @@ void multi_tensor_apply(
        if(chunk == chunks_this_tensor - 1)
        {
          // std::cout << "Hit case 1 " << cond1 << " " << cond2 << " " << cond3 << std::endl;
          loc_tensor_info = 0;
          tl.start_tensor_this_launch = t + 1;
        }
        else
...
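
The checks added above make multi_tensor_apply reject tensor lists that mix devices, and the OptionalCUDAGuard keeps the kernel launch on the device of the first tensor rather than on the thread's current device. A hedged Python-level illustration of the misuse the new TORCH_CHECK catches (the call is left commented out because it needs two GPUs and is expected to raise):

import torch
import amp_C
from apex.multi_tensor_apply import multi_tensor_applier

overflow_buf = torch.zeros(1, dtype=torch.int, device="cuda:0")
grads = [torch.randn(1024, device="cuda:0"),
         torch.randn(1024, device="cuda:1")]   # tensors on different devices
# multi_tensor_applier(amp_C.multi_tensor_l2norm, overflow_buf, [grads], False)
# -> expected to raise: "A tensor was not on the same device as the first tensor"
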
@@ -2,6 +2,7 @@
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
+#include <c10/cuda/CUDAGuard.h>

// Another possibility:
// #include <torch/all.h>

@@ -343,13 +344,13 @@ std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_cuda(
      max_chunks_per_tensor);)

  AT_CUDA_CHECK(cudaGetLastError());
  // AT_CUDA_CHECK(cudaDeviceSynchronize());

  // This involves one more small kernel launches, but will be negligible end to end.
  // I could get rid of these by hacking the functor + multi tensor harness with persistence
  // logic, but keeping it simple for now
  auto ret = at::empty({1}, output.options());
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(output));
  auto stream = at::cuda::getCurrentCUDAStream();
  cleanup<<<per_tensor ? ntensors : 1, 512, 0, stream>>>(
    output.DATA_PTR<float>(),

@@ -377,7 +378,7 @@ void multi_tensor_norm_out_cuda(
  const int norm_type)
{
  auto float_options = tensor_lists[0][0].options().dtype(at::kFloat);
+  TORCH_CHECK(tensor_lists[0][0].device() == noop_flag.device(), "noop flag should be on the same device as tensors");
  // we don't need global thus uses empty here
  auto output = at::empty({320}, float_options);
...
@@ -160,6 +160,8 @@ void multi_tensor_sgd_cuda(
      TORCH_CHECK(tensor_lists[3][i].scalar_type() == at::ScalarType::Half,
        "Additional output tensors should always be fp16.");

+  TORCH_CHECK(noop_flag.device() == tensor_lists[0][0].device(), "expected noop flag to be on the same device as tensors");

  // We have 3 possibilities to handle here, in terms of
  // grad_type, param_type, momentum_type, requires_fp16_copy
  // 1. fp16, fp16, fp16, No
...
@@ -1164,6 +1164,10 @@ std::vector<at::Tensor> welford_parallel_CUDA(const at::Tensor mean_feature_node
  at::Tensor inv_std = at::empty_like(out_var);
  at::Tensor out_mean = at::empty_like(out_var);

+  at::Tensor mean_feature_nodes_ = mean_feature_nodes.contiguous();
+  at::Tensor var_biased_ = var_biased.contiguous();
+  at::Tensor numel_ = numel.contiguous();

  // TODO(jie): tile this for memory coalescing!
  const int block = std::min(h_last_pow2(feature_size), MAX_BLOCK_SIZE);
  const int grid = std::max<int>(1, feature_size / block);

@@ -1174,9 +1178,9 @@ std::vector<at::Tensor> welford_parallel_CUDA(const at::Tensor mean_feature_node
    using namespace at;
    DISPATCH_FLOAT_AND_HALF(mean_feature_nodes.scalar_type(), 0, "welford_parallel_kernel",
      welford_kernel_parallel<scalar_t_0><<<grid, block, 0, stream>>>(
-        mean_feature_nodes.DATA_PTR<scalar_t_0>(),
-        var_biased.DATA_PTR<scalar_t_0>(),
-        numel.DATA_PTR<int>(),
+        mean_feature_nodes_.DATA_PTR<scalar_t_0>(),
+        var_biased_.DATA_PTR<scalar_t_0>(),
+        numel_.DATA_PTR<int>(),
        out_mean.DATA_PTR<scalar_t_0>(),
        out_var.DATA_PTR<scalar_t_0>(),
        inv_std.DATA_PTR<scalar_t_0>(),
...
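
The .contiguous() copies added above guard the raw DATA_PTR accesses: after the Python-side change, the statistics arriving here are slices of a split tensor and may be non-contiguous views. A small sketch of why that matters:

import torch

combined = torch.arange(12.).reshape(3, 4)        # stand-in for gathered stats
mean_all, var_all = torch.split(combined, 2, dim=1)
print(mean_all.is_contiguous())                   # False: a strided view
print(mean_all.contiguous().is_contiguous())      # True: safe to hand to a raw-pointer kernel
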
@@ -182,6 +182,7 @@ def main():
                print("=> loading checkpoint '{}'".format(args.resume))
                checkpoint = torch.load(args.resume, map_location = lambda storage, loc: storage.cuda(args.gpu))
                args.start_epoch = checkpoint['epoch']
+                global best_prec1
                best_prec1 = checkpoint['best_prec1']
                model.load_state_dict(checkpoint['state_dict'])
                optimizer.load_state_dict(checkpoint['optimizer'])

@@ -527,7 +528,7 @@ def accuracy(output, target, topk=(1,)):
    res = []
    for k in topk:
-        correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
+        correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res
...
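
The accuracy() change swaps .view(-1) for .reshape(-1) because correct[:k] can be a non-contiguous slice (for example after a transpose), and .view refuses to operate on such a tensor while .reshape copies only when needed. A minimal reproduction, independent of the example script:

import torch

correct = torch.ones(5, 8, dtype=torch.bool).t()   # transpose -> non-contiguous
try:
    correct[:3].view(-1)
except RuntimeError as err:
    print("view failed:", err)
correct_k = correct[:3].reshape(-1).float().sum(0, keepdim=True)   # works
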
import unittest

import apex
import torch
from apex.testing.common_utils import skipIfRocm


class TestFusedAdagrad(unittest.TestCase):
    def setUp(self, max_abs_diff=1e-6, max_rel_diff=1, iters=7):
        self.max_abs_diff = max_abs_diff
        self.max_rel_diff = max_rel_diff
        self.iters = iters
        torch.cuda.manual_seed(9876)

    def tearDown(self):
        pass

    def gen_param_optim(self, tensors, adagrad_option, apex_only=False):
        ref_param = []
        tst_param = []
        for tensor in tensors:
            if apex_only:
                ref_param.append(torch.nn.Parameter(tensor.clone().float()))
            else:
                ref_param.append(torch.nn.Parameter(tensor.clone()))
            tst_param.append(torch.nn.Parameter(tensor.clone()))

        if apex_only:
            ref_optim = apex.optimizers.FusedAdagrad(ref_param, **adagrad_option)
        else:
            ref_optim = torch.optim.Adagrad(ref_param, **adagrad_option)
        tst_optim = apex.optimizers.FusedAdagrad(tst_param, **adagrad_option)

        return (ref_param, tst_param, ref_optim, tst_optim)

    def gen_grad(self, ref_param, tst_param, apex_only=False):
        for p_ref, p_tst in zip(ref_param, tst_param):
            p_tst.grad = torch.rand_like(p_tst)
            p_ref.grad = p_tst.grad.detach().float() if apex_only else p_tst.grad

    def gen_mixed_grad(self, ref_param, tst_param, scale=1.0):
        half_grads = []
        for p_ref, _ in zip(ref_param, tst_param):
            half_grads.append(torch.rand_like(p_ref).half())
            p_ref.grad = half_grads[-1].float() / scale
        return half_grads

    def get_max_diff(self, ref_param, tst_param, apex_only=False):
        max_abs_diff = max_rel_diff = 0
        for p_ref, p_tst in zip(ref_param, tst_param):
            if apex_only:
                p_tst = p_tst.float()
            max_abs_diff_p = (p_ref - p_tst).abs().max().item()
            max_rel_diff_p = ((p_ref - p_tst) / p_ref).abs().max().item()

            if max_abs_diff_p > max_abs_diff:
                max_abs_diff = max_abs_diff_p
            if max_rel_diff_p > max_rel_diff:
                max_rel_diff = max_rel_diff_p

        return max_abs_diff, max_rel_diff

    def gen_single_type_test(self, param_type=torch.float, apex_only=False):
        nelem = 278011
        adagrad_option = {"lr": 5e-4, "eps": 1e-08, "weight_decay": 1.0e-5}

        tensor = torch.rand(nelem, dtype=param_type, device="cuda")
        ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(
            [tensor], adagrad_option, apex_only=apex_only
        )

        for _ in range(self.iters):
            self.gen_grad(ref_param, tst_param, apex_only=apex_only)
            ref_optim.step()
            tst_optim.step()
            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param, apex_only=apex_only)
            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            if not apex_only:
                self.assertLessEqual(max_rel_diff, self.max_rel_diff)

    @skipIfRocm
    def test_float(self):
        self.gen_single_type_test(param_type=torch.float)

    @unittest.skip("PyTorch optimizer is not numerically correct for fp16")
    def test_half(self):
        self.gen_single_type_test(param_type=torch.float16)

    # Compares bfloat16 computation against float32 as gold standard.
    # Uses apex optimizers(controlled by apex_only flag) for both types.
    # Doesn't use upstream optimizer like other tests as they seem to be
    # numerically unstable for half types(see skip note for test above).
    @skipIfRocm
    def test_bfloat16(self):
        self.max_abs_diff = 1e-2
        self.gen_single_type_test(param_type=torch.bfloat16, apex_only=True)

    @skipIfRocm
    def test_multi_params(self):
        sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
        adagrad_option = {"lr": 5e-4, "eps": 1e-08, "weight_decay": 0}

        tensors = []
        for size in sizes:
            tensors.append(torch.rand(size, dtype=torch.float, device="cuda"))
        ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(
            tensors, adagrad_option
        )

        for _ in range(self.iters):
            self.gen_grad(ref_param, tst_param)
            ref_optim.step()
            tst_optim.step()
            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)

    def test_adagrad_option(self):
        nelem = 1
        adagrad_option = {"lr": 0.01, "eps": 3e-06, "weight_decay": 0}

        tensor = torch.rand(nelem, dtype=torch.float, device="cuda")
        ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(
            [tensor], adagrad_option
        )

        for _ in range(self.iters):
            self.gen_grad(ref_param, tst_param)
            ref_optim.step()
            tst_optim.step()
            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)
import argparse
import random
import sys

import torch
from torch.nn.parallel import DistributedDataParallel as DDP

from apex import amp
from apex.optimizers import FusedAdam
from apex.contrib.optimizers.distributed_fused_adam import DistributedFusedAdam


class TestModel(torch.nn.Module):
    def __init__(self, args):
        super(TestModel, self).__init__()
        self.linear = torch.nn.Sequential(*[torch.nn.Linear(args.dim, args.dim, bias=args.bias) for _ in range(args.layers)])

    def forward(self, x):
        return self.linear(x)


def setup(args):
    ## Model
    ref_model = TestModel(args).cuda()
    dist_model = TestModel(args).cuda()

    # Same weights
    with torch.no_grad():
        for dp, rp in zip(dist_model.parameters(), ref_model.parameters()):
            dp.data.copy_(rp.data)

    dist_model = dist_model.half()

    ## Optimizer
    # same hyperparameters
    ref_opt_args = { 'lr': 1e-3, 'eps': 1e-6, 'weight_decay': 0.01 }
    ref_opt = FusedAdam(ref_model.parameters(), **ref_opt_args)

    dist_opt_args = ref_opt_args.copy()
    dist_opt_args.update( {'overlap_reductions' : False} )
    dist_opt_args.update( {'process_group_size' : args.n_gpu} )
    dist_opt_args.update( {'dwu_group_size' : args.dwu_group_size} )
    dist_opt_args.update( {'dwu_num_blocks' : 1} )
    dist_opt_args.update( {'dwu_num_chunks' : 1} )
    dist_opt = DistributedFusedAdam(dist_model.parameters(), **dist_opt_args)
    dist_opt.set_global_scale(1.)

    ## amp-init
    amp_args = { 'loss_scale' : 'dynamic' , 'opt_level' : 'O2'}
    ref_model, ref_opt = amp.initialize(ref_model, ref_opt, **amp_args)

    ## DDP
    ref_model = DDP(ref_model, device_ids=[args.rank])

    with torch.no_grad():
        for dp in dist_model.parameters():
            torch.distributed.broadcast(dp.data, src=0)
        for rp in ref_model.parameters():
            torch.distributed.broadcast(rp.data, src=0)
    torch.cuda.synchronize()
    torch.distributed.barrier()

    if get_rank() == 0:
        print(f'dist opt with {args.n_gpu} GPUs')

    return ref_model, ref_opt, dist_model, dist_opt


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--local_rank', type=int, default=-1)
    parser.add_argument('--steps', type=int, default=20)
    parser.add_argument('--batch', type=int, default=32)
    parser.add_argument('--dim', type=int, default=4)
    parser.add_argument('--layers', type=int, default=2)
    parser.add_argument('--bias', action='store_true')
    parser.add_argument('--atol', type=float, default=1e-3)
    parser.add_argument('--rtol', type=float, default=1)
    parser.add_argument('--dwu_group_size', type=float, default=1)
    args = parser.parse_args()
    return args


def setup_env(args):
    torch.cuda.set_device(args.local_rank)
    torch.distributed.init_process_group(backend='nccl', init_method='env://')
    args.rank = torch.distributed.get_rank()
    args.n_gpu = torch.distributed.get_world_size()
    seed = 42 + get_rank()

    random.seed(seed)
    torch.manual_seed(seed)

    return args


def get_rank():
    return torch.distributed.get_rank()


def main():
    args = parse_args()
    args = setup_env(args)
    tol_args = { 'atol' : args.atol, 'rtol' : args.rtol }
    torch.set_printoptions(precision=16)

    ref_model, ref_opt, dist_model, dist_opt = setup(args)

    # lazy_init not called yet, initialize stash
    stash = ref_opt._amp_stash
    stash.all_fp16_params, stash.all_fp32_from_fp16_params = [], []

    # make sure everything from _first_step_init_ is ready before training
    # e.g. registering allreduce_hook
    # so that gradients are copied/reduced when necessary
    dist_opt._init_everything()

    for i in range(args.steps):
        x_ref = torch.randn(args.batch, args.dim, dtype=torch.half).cuda().requires_grad_(True)
        x_dist = x_ref.clone().detach().requires_grad_(True)

        if get_rank() == 0:
            print(f'[{i}] Checking input')
            #print("x_ref:", x_ref.flatten()[:10])
            #print("x_dist:", x_dist.flatten()[:10])
        assert(torch.allclose(x_ref, x_dist, **tol_args))

        y_ref = ref_model(x_ref).half()
        y_dist = dist_model(x_dist)
        if get_rank() == 0:
            print(f'[{i}] Checking output')
            #print("y_ref:", y_ref.flatten()[:10])
            #print("y_dist:", y_dist.flatten()[:10])
        assert(torch.allclose(y_ref, y_dist, **tol_args))

        dy = torch.randn_like(y_ref)
        y_ref.backward(dy)
        y_dist.backward(dy)

        if get_rank() == 0:
            print(f'[{i}] Checking gradients')
        torch.distributed.barrier()
        torch.cuda.synchronize()
        assert(torch.allclose(x_ref.grad, x_dist.grad, **tol_args))

        # gradient all-reduce within distributed optimizer
        dist_opt.complete_reductions()

        if get_rank() == 0:
            print(f'[{i}] Stepping')
        ref_opt.step()
        dist_opt.step()

        torch.cuda.synchronize()
        torch.distributed.barrier()

        print('Checking new weights')
        if get_rank() == 0:
            print("ref param:", ref_model.module.linear[0].weight)
            print("dist param:", dist_model.linear[0].weight)

        for i, (rp, dp) in enumerate(zip(ref_model.parameters(), dist_model.parameters())):
            if not torch.allclose(rp, dp, **tol_args):
                if get_rank() == 0:
                    print(f'Rank: {get_rank()}, Param: {i}')
                    print(f'ref: {rp.sum().item()}, dist: {dp.sum().item()}')
                    print(rp)
                    print(dp)
                    print(torch.abs(rp-dp) > tol_args['atol'])
                sys.exit(0)

        # zero grads
        for rp, dp in zip(ref_model.parameters(), dist_model.parameters()):
            rp.grad = None
            dp.grad = None


if __name__ == "__main__":
    main()
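
This script parses --local_rank and initializes the process group with backend='nccl' and init_method='env://', so it is presumably meant to be launched with the standard PyTorch launcher, for example: python -m torch.distributed.launch --nproc_per_node=2 <path to this script> --steps 20 (the path and flag values here are illustrative, not taken from the diff).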
@@ -4,10 +4,11 @@ import random
import torch
import apex
+from itertools import product
from apex.testing.common_utils import skipIfRocm

-class TestFusedAdam(unittest.TestCase):
+class TestFusedOptimizer(unittest.TestCase):
    def setUp(self, max_abs_diff=1e-3, max_rel_diff=1, iters=7):
        self.max_abs_diff = max_abs_diff
        self.max_rel_diff = max_rel_diff
@@ -17,7 +18,7 @@ class TestFusedAdam(unittest.TestCase):
    def tearDown(self):
        pass

-    def gen_param_optim(self, tensors, adam_option, apex_only=False):
+    def gen_param_optim(self, tensors, options, apex_only=False):
        ref_param = []
        tst_param = []
        for tensor in tensors:
@@ -28,10 +29,10 @@ class TestFusedAdam(unittest.TestCase):
            tst_param.append(torch.nn.Parameter(tensor.clone()))

        if apex_only:
-            ref_optim = apex.optimizers.FusedAdam(ref_param, **adam_option)
+            ref_optim = self.fused_optim(ref_param, **options)
        else:
-            ref_optim = torch.optim.Adam(ref_param, **adam_option)
-        tst_optim = apex.optimizers.FusedAdam(tst_param, **adam_option)
+            ref_optim = self.ref_optim(ref_param, **options)
+        tst_optim = self.fused_optim(tst_param, **options)

        return (ref_param, tst_param, ref_optim, tst_optim)
@@ -60,25 +61,32 @@ class TestFusedAdam(unittest.TestCase):
        return max_abs_diff, max_rel_diff

-    def gen_single_type_test(self, param_type=torch.float, apex_only=False):
+    def gen_single_type_test(self, param_type=torch.float, apex_only=False, device='cuda'):
        nelem = 278011
-        adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
-                       'weight_decay':0, 'amsgrad':False}

-        tensor = torch.rand(nelem, dtype=param_type, device='cuda')
+        tensor = torch.rand(nelem, dtype=param_type, device=device)
        ref_param, tst_param, ref_optim, tst_optim = \
-            self.gen_param_optim([tensor], adam_option, apex_only=apex_only)
+            self.gen_param_optim([tensor], self.options, apex_only=apex_only)

        for i in range(self.iters):
            self.gen_grad(ref_param, tst_param, apex_only=apex_only)
            ref_optim.step()
            tst_optim.step()
            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param, apex_only=apex_only)
            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            if not apex_only:
                self.assertLessEqual(max_rel_diff, self.max_rel_diff)

+class TestFusedAdam(TestFusedOptimizer):
+    def __init__(self, *args, **kwargs):
+        super(TestFusedAdam, self).__init__(*args, **kwargs)
+        self.options = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
+                        'weight_decay': 0, 'amsgrad': False}
+        self.ref_optim = torch.optim.Adam
+        self.fused_optim = apex.optimizers.FusedAdam

    @skipIfRocm
    def test_float(self):
        self.gen_single_type_test(param_type=torch.float)
@@ -95,17 +103,23 @@ class TestFusedAdam(unittest.TestCase):
        self.max_abs_diff = 1e-2
        self.gen_single_type_test(param_type=torch.bfloat16, apex_only=True)

+    @skipIfRocm
+    @unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
+    def test_multi_device(self):
+        devices = ("cuda:0", "cuda:1")
+        for current_dev, tensor_dev in product(devices, devices):
+            with torch.cuda.device(current_dev):
+                self.gen_single_type_test(param_type=torch.float, device=tensor_dev)

    @unittest.skip('Disable until 8/1/2019 adam/adamw upstream picked')
    def test_multi_params(self):
        sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
-        adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
-                       'weight_decay':0, 'amsgrad':False}

        tensors = []
        for size in sizes:
            tensors.append(torch.rand(size, dtype=torch.float, device='cuda'))
        ref_param, tst_param, ref_optim, tst_optim = \
-            self.gen_param_optim(tensors, adam_option)
+            self.gen_param_optim(tensors, self.options)

        for i in range(self.iters):
            self.gen_grad(ref_param, tst_param)
@@ -118,12 +132,9 @@ class TestFusedAdam(unittest.TestCase):
    @unittest.skip('No longer support fuse scaling')
    def test_scale(self):
        nelem = 278011
-        adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
-                       'weight_decay':0, 'amsgrad':False}
        tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
        ref_param, tst_param, ref_optim, tst_optim = \
-            self.gen_param_optim([tensor], adam_option)
+            self.gen_param_optim([tensor], self.options)

        for i in range(self.iters):
            scale = random.random() * 1000
@@ -138,12 +149,10 @@ class TestFusedAdam(unittest.TestCase):
    @unittest.skip('No longer support output fp16 param')
    def test_fp16_output(self):
        nelem = 278011
-        adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
-                       'weight_decay':0, 'amsgrad':False}
        tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
        ref_param, tst_param, ref_optim, tst_optim = \
-            self.gen_param_optim([tensor], adam_option)
+            self.gen_param_optim([tensor], self.options)

        fp16_param = torch.nn.Parameter(tensor.clone().half())
@@ -180,6 +189,106 @@ class TestFusedAdam(unittest.TestCase):
        self.assertLessEqual(max_rel_diff, self.max_rel_diff)

+class TestFusedAdagrad(TestFusedOptimizer):
+    def __init__(self, *args, **kwargs):
+        super(TestFusedAdagrad, self).__init__(*args, **kwargs)
+        self.options = {"lr": 5e-4, "eps": 1e-08, "weight_decay": 1.0e-5}
+        self.ref_optim = torch.optim.Adagrad
+        self.fused_optim = apex.optimizers.FusedAdagrad
+
+    @skipIfRocm
+    def test_float(self):
+        self.gen_single_type_test(param_type=torch.float)
+
+    @unittest.skip("PyTorch optimizer is not numerically correct for fp16")
+    def test_half(self):
+        self.gen_single_type_test(param_type=torch.float16)
+
+    @skipIfRocm
+    @unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
+    def test_multi_device(self):
+        devices = ("cuda:0", "cuda:1")
+        for current_dev, tensor_dev in product(devices, devices):
+            with torch.cuda.device(current_dev):
+                self.gen_single_type_test(param_type=torch.float, device=tensor_dev)
+
+    @skipIfRocm
+    def test_multi_params(self):
+        sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
+        adagrad_option = {"lr": 5e-4, "eps": 1e-08, "weight_decay": 0}
+
+        tensors = []
+        for size in sizes:
+            tensors.append(torch.rand(size, dtype=torch.float, device="cuda"))
+        ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(
+            tensors, adagrad_option
+        )
+
+        for _ in range(self.iters):
+            self.gen_grad(ref_param, tst_param)
+            ref_optim.step()
+            tst_optim.step()
+            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
+            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
+            self.assertLessEqual(max_rel_diff, self.max_rel_diff)
+
+    @unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
+    def test_multi_params_different_devices_throws(self):
+        sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
+        adagrad_option = {"lr": 5e-4, "eps": 1e-08, "weight_decay": 0}
+
+        tensors = []
+        for i, size in enumerate(sizes):
+            tensors.append(torch.rand(size, dtype=torch.float, device="cuda:"+str(i % 2)))
+        ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(
+            tensors, adagrad_option
+        )
+        self.gen_grad(ref_param, tst_param)
+        with self.assertRaisesRegex(RuntimeError, "not on the same device"):
+            tst_optim.step()
+
+    def test_adagrad_option(self):
+        nelem = 1
+        adagrad_option = {"lr": 0.01, "eps": 3e-06, "weight_decay": 0}
+
+        tensor = torch.rand(nelem, dtype=torch.float, device="cuda")
+        ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(
+            [tensor], adagrad_option
+        )
+
+        for _ in range(self.iters):
+            self.gen_grad(ref_param, tst_param)
+            ref_optim.step()
+            tst_optim.step()
+            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
+            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
+            self.assertLessEqual(max_rel_diff, self.max_rel_diff)
+
+class TestFusedSGD(TestFusedOptimizer):
+    def __init__(self, *args, **kwargs):
+        super(TestFusedSGD, self).__init__(*args, **kwargs)
+        self.options = {"lr": .25, "momentum": .125}
+        self.ref_optim = torch.optim.SGD
+        self.fused_optim = apex.optimizers.FusedSGD
+
+    def test_float(self):
+        self.gen_single_type_test(param_type=torch.float)
+
+    def test_half(self):
+        self.gen_single_type_test(param_type=torch.float16)
+
+    @unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
+    def test_multi_device(self):
+        devices = ("cuda:0", "cuda:1")
+        for current_dev, tensor_dev in product(devices, devices):
+            with torch.cuda.device(current_dev):
+                self.gen_single_type_test(param_type=torch.float, device=tensor_dev)

if __name__ == '__main__':
+    script_path = os.path.dirname(os.path.realpath(__file__))
    unittest.main()
@@ -6,6 +6,7 @@ from torch.optim import Optimizer
import apex
from apex.multi_tensor_apply import multi_tensor_applier
from apex.testing.common_utils import skipIfRocm
+from itertools import product

class RefLAMB(Optimizer):
    r"""Implements Lamb algorithm.

@@ -41,7 +42,7 @@ class RefLAMB(Optimizer):
            import amp_C
            self.multi_tensor_l2norm=amp_C.multi_tensor_l2norm
            # Skip buffer
-            self._dummy_overflow_buf = torch.cuda.IntTensor([0])
+            self._dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device=self.param_groups[0]["params"][0].device)
            self.multi_tensor_lamb = amp_C.multi_tensor_lamb
        else:
            raise RuntimeError('apex.optimizers.FusedLAMB requires cuda extensions')
@@ -69,7 +70,8 @@ class RefLAMB(Optimizer):
            else:
                raise RuntimeError('FusedLAMB only support fp16 and fp32.')

-        g_norm_32, g_norm_16 = torch.zeros(1, device='cuda'), torch.zeros(1, device='cuda')
+        device = self.param_groups[0]["params"][0].device
+        g_norm_32, g_norm_16 = torch.zeros(1, device=device), torch.zeros(1, device=device)
        # compute grad norm for two lists
        if len(g_all_32) > 0:
            g_norm_32 = multi_tensor_applier(self.multi_tensor_l2norm,

@@ -85,7 +87,7 @@ class RefLAMB(Optimizer):
                                            self._dummy_overflow_buf,
                                            [[g_norm_32, g_norm_16]],
                                            False)[0]

        max_grad_norm = 1.0
        clipped_ratio = max_grad_norm / max(global_grad_norm, max_grad_norm)

@@ -94,7 +96,7 @@ class RefLAMB(Optimizer):
                if p.grad is None:
                    continue
                p.grad.data *= clipped_ratio

                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Lamb does not support sparse gradients, consider SparseAdam instad.')

@@ -137,7 +139,7 @@ class RefLAMB(Optimizer):
                state['g_norm'] = g_norm
                state['trust_ratio'] = trust_ratio

                step_size = group['lr']
                p.data.add_(update, alpha=-step_size*trust_ratio)
@@ -189,11 +191,11 @@ class TestFusedLAMB(unittest.TestCase):
        return max_abs_diff, max_rel_diff

-    def gen_single_type_test(self, param_type=torch.float):
+    def gen_single_type_test(self, param_type=torch.float, device="cuda"):
        nelem = 278011
-        tensor = torch.rand(nelem, dtype=param_type, device='cuda')
+        tensor = torch.rand(nelem, dtype=param_type, device=device)
        weight_decay = [0, 0.01]

        for wd in weight_decay:
            lamb_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08, 'weight_decay':wd}
            ref_param, tst_param, ref_optim, tst_optim = \

@@ -202,7 +204,9 @@ class TestFusedLAMB(unittest.TestCase):
            for i in range(self.iters):
                self.gen_grad(ref_param, tst_param)
                ref_optim.step()
+                torch.cuda.synchronize()
                tst_optim.step()
+                torch.cuda.synchronize()
                max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)

                self.assertLessEqual(max_abs_diff, self.max_abs_diff)

@@ -216,11 +220,19 @@ class TestFusedLAMB(unittest.TestCase):
    def test_half(self):
        self.gen_single_type_test(param_type=torch.float16)

+    @skipIfRocm
+    @unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
+    def test_multi_device(self):
+        devices = ("cuda:0", "cuda:1")
+        for current_dev, tensor_dev in product(devices, devices):
+            with torch.cuda.device(current_dev):
+                self.gen_single_type_test(param_type=torch.float, device=tensor_dev)

    @skipIfRocm
    def test_multi_params(self):
        sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
        weight_decay = [0, 0.01]

        for wd in weight_decay:
            lamb_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08, 'weight_decay':wd}
            tensors = []

@@ -242,7 +254,7 @@ class TestFusedLAMB(unittest.TestCase):
        nelem = 1
        tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
        weight_decay = [0, 0.01]

        for wd in weight_decay:
            lamb_option = {'lr':0.01, 'betas':(0.6, 0.9), 'eps':3e-06, 'weight_decay':wd}
            ref_param, tst_param, ref_optim, tst_optim = \
...
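
The RefLAMB and test changes above follow one pattern: allocate helper buffers on the device of the parameters (self.param_groups[0]["params"][0].device) instead of a hard-coded 'cuda' default, so the optimizer still works when torch.cuda.current_device() differs from where the parameters live, which is what the new test_multi_device cases exercise. A minimal sketch of that pattern (hypothetical helper name; the cross-device check below needs two GPUs, like the new tests):

import torch

def make_overflow_buf(params):
    # Allocate on the parameters' device, not the current CUDA device.
    return torch.tensor([0], dtype=torch.int, device=params[0].device)

with torch.cuda.device("cuda:0"):                             # current device is cuda:0
    p = torch.nn.Parameter(torch.randn(4, device="cuda:1"))   # parameters live elsewhere
    buf = make_overflow_buf([p])
    assert buf.device == p.device
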