Commit dcc7b513 authored by Jeff Daily

Merge remote-tracking branch 'upstream/master'

Conflicts:
csrc/multi_tensor_apply.cuh
setup.py
tests/L0/run_optimizers/test_adagrad.py
tests/L0/run_optimizers/test_fused_optimizer.py
tests/L0/run_optimizers/test_lamb.py
parents d061bf20 154c6336
......@@ -21,33 +21,31 @@ class SyncBatchnormFunction(Function):
if channel_last:
count = int(input.numel()/input.size(-1))
mean, var_biased = syncbn.welford_mean_var_c_last(input)
num_channels = input.size(-1)
else:
count = int(input.numel()/input.size(1))
mean, var_biased = syncbn.welford_mean_var(input)
num_channels = input.size(1)
if torch.distributed.is_initialized():
if not process_group:
process_group = torch.distributed.group.WORLD
device = mean.device
world_size = torch.distributed.get_world_size(process_group)
mean_all = torch.empty(world_size, mean.size(0), dtype=mean.dtype, device=device)
var_all = torch.empty(world_size, var_biased.size(0), dtype=var_biased.dtype, device=device)
count_all = torch.cuda.IntTensor(world_size, device=device)
mean_l = [mean_all.narrow(0, i, 1) for i in range(world_size)]
var_l = [var_all.narrow(0, i, 1) for i in range(world_size)]
count_l = [count_all.narrow(0, i, 1) for i in range(world_size)]
torch.distributed.all_gather(mean_l, mean, process_group)
torch.distributed.all_gather(var_l, var_biased, process_group)
torch.distributed.all_gather(
count_l,
torch.cuda.IntTensor([count], device=device),
process_group)
mean, var, inv_std = syncbn.welford_parallel(mean_all, var_all, count_all, eps)
count_t = torch.empty(1, dtype=mean.dtype, device=mean.device).fill_(count)
combined = torch.cat([mean.view(-1), var_biased.view(-1), count_t], dim=0)
combined_list = [torch.empty_like(combined) for k in range(world_size)]
torch.distributed.all_gather(combined_list, combined, process_group)
combined = torch.stack(combined_list, dim=0)
mean_all, invstd_all, count_all = torch.split(combined, num_channels, dim=1)
count_all = count_all.view(-1)
mean, var, inv_std = syncbn.welford_parallel(mean_all, invstd_all, count_all.to(torch.int32), eps)
else:
device = mean.device
count_all = torch.cuda.IntTensor([count], device=device)
inv_std = 1.0 / torch.sqrt(var_biased + eps)
var = var_biased * (count) / (count-1)
if count == 1 and world_size < 2:
raise ValueError('Expected more than 1 value per channel when training, got input size{}'.format(input.size()))
......@@ -60,7 +58,7 @@ class SyncBatchnormFunction(Function):
mean = running_mean.data
inv_std = 1.0 / torch.sqrt(running_variance.data + eps)
ctx.save_for_backward(input, weight, mean, inv_std, z, bias, count_all)
ctx.save_for_backward(input, weight, mean, inv_std, z, bias, count_all.to(torch.int32))
ctx.process_group = process_group
ctx.channel_last = channel_last
ctx.world_size = world_size
......@@ -101,10 +99,12 @@ class SyncBatchnormFunction(Function):
if ctx.needs_input_grad[0]:
if torch.distributed.is_initialized():
num_channels = sum_dy.shape[0]
combined = torch.cat([sum_dy, sum_dy_xmu], dim=0)
torch.distributed.all_reduce(
sum_dy, ReduceOp.SUM, process_group)
torch.distributed.all_reduce(
sum_dy_xmu, ReduceOp.SUM, process_group)
combined, torch.distributed.ReduceOp.SUM, process_group, async_op=False)
sum_dy, sum_dy_xmu = torch.split(combined, num_channels)
if channel_last:
grad_input = syncbn.batchnorm_backward_c_last(grad_output, saved_input, mean, inv_std, weight, sum_dy, sum_dy_xmu, count)
else:
......
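Both the forward change above and the backward change share one idea: pack the per-channel statistics into a single flat tensor, exchange it with one collective (all_gather in forward, all_reduce in backward), and split it back apart, instead of issuing a separate collective per statistic. A minimal sketch of the forward-side packing, with an illustrative helper name rather than apex's own API:

    import torch
    import torch.distributed as dist

    def gather_channel_stats(mean, var_biased, count, process_group):
        # Pack mean, biased variance and the element count into one flat tensor
        # so a single all_gather replaces three separate collectives.
        num_channels = mean.numel()
        count_t = torch.full((1,), count, dtype=mean.dtype, device=mean.device)
        combined = torch.cat([mean.view(-1), var_biased.view(-1), count_t])

        world_size = dist.get_world_size(process_group)
        gathered = [torch.empty_like(combined) for _ in range(world_size)]
        dist.all_gather(gathered, combined, process_group)

        # One row per rank: [mean | var_biased | count]
        combined = torch.stack(gathered, dim=0)
        mean_all, var_all, count_all = torch.split(combined, num_channels, dim=1)
        return mean_all, var_all, count_all.view(-1)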
......@@ -35,7 +35,7 @@ __global__ void multi_tensor_apply_kernel(
ArgTypes... args)
{
// Hand the chunk information to the user-supplied functor to process however it likes.
callable(chunk_size, noop_flag, tl, args...);
}
template<int depth, typename T, typename... ArgTypes>
......@@ -50,8 +50,9 @@ void multi_tensor_apply(
TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
int len0 = tensor_lists[0].size();
TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
for(int l = 0; l < tensor_lists.size(); l++) // No range-based for because I need indices
auto ref_device = tensor_lists[0][0].device();
TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda");
for (int l = 0; l < tensor_lists.size(); l++) // No range-based for because I need indices
{
TORCH_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists");
for(int t = 0; t < tensor_lists[l].size(); t++)
......@@ -62,7 +63,7 @@ void multi_tensor_apply(
contiguous_memory = (contiguous_memory || tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast));
#endif
TORCH_CHECK(contiguous_memory, "A tensor was not contiguous.");
TORCH_CHECK(tensor_lists[l][t].is_cuda(), "A tensor was not cuda.");
TORCH_CHECK(tensor_lists[l][t].device() == ref_device, "A tensor was not on the same device as the first tensor");
TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch");
}
}
......@@ -71,8 +72,9 @@ void multi_tensor_apply(
TensorListMetadata<depth> tl;
const at::cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0]));
auto stream = at::cuda::getCurrentCUDAStream();
tl.start_tensor_this_launch = 0;
int loc_block_info = 0;
int loc_tensor_info = 0;
......@@ -98,7 +100,7 @@ void multi_tensor_apply(
tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
tl.block_to_chunk[loc_block_info] = chunk;
loc_block_info++;
bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth-1] &&
chunk == chunks_this_tensor - 1);
bool blocks_full = (loc_block_info == depth_to_max_blocks[depth-1]);
......@@ -124,7 +126,7 @@ void multi_tensor_apply(
if(chunk == chunks_this_tensor - 1)
{
// std::cout << "Hit case 1 " << cond1 << " " << cond2 << " " << cond3 << std::endl;
loc_tensor_info = 0;
tl.start_tensor_this_launch = t + 1;
}
else
......
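The checks added above compare every tensor against the device of the first tensor and launch under an OptionalCUDAGuard for that device, so multi-tensor ops now run on non-default GPUs as long as all lists share a single device. A hedged Python-side illustration of the new behavior (assumes the amp_C extension is built and at least two GPUs are visible):

    import torch
    import amp_C
    from apex.multi_tensor_apply import multi_tensor_applier

    overflow_buf = torch.zeros(1, dtype=torch.int, device="cuda:1")
    src = [torch.randn(1024, device="cuda:1") for _ in range(3)]
    dst = [torch.empty_like(t) for t in src]

    # Everything lives on cuda:1, so the kernel is launched under a device
    # guard for cuda:1 and succeeds even though cuda:0 is the default device.
    multi_tensor_applier(amp_C.multi_tensor_scale, overflow_buf, [src, dst], 0.5)

    # Mixing devices now trips the "same device as the first tensor" check.
    bad = [torch.randn(1024, device="cuda:0")] + src[1:]
    try:
        multi_tensor_applier(amp_C.multi_tensor_scale, overflow_buf, [bad, dst], 0.5)
    except RuntimeError as err:
        print(err)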
......@@ -2,6 +2,7 @@
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
#include <c10/cuda/CUDAGuard.h>
// Another possibility:
// #include <torch/all.h>
......@@ -343,13 +344,13 @@ std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_cuda(
max_chunks_per_tensor);)
AT_CUDA_CHECK(cudaGetLastError());
// AT_CUDA_CHECK(cudaDeviceSynchronize());
// This involves one more small kernel launch, but will be negligible end to end.
// I could get rid of these by hacking the functor + multi tensor harness with persistence
// logic, but keeping it simple for now
auto ret = at::empty({1}, output.options());
const at::cuda::OptionalCUDAGuard device_guard(device_of(output));
auto stream = at::cuda::getCurrentCUDAStream();
cleanup<<<per_tensor ? ntensors : 1, 512, 0, stream>>>(
output.DATA_PTR<float>(),
......@@ -377,7 +378,7 @@ void multi_tensor_norm_out_cuda(
const int norm_type)
{
auto float_options = tensor_lists[0][0].options().dtype(at::kFloat);
TORCH_CHECK(tensor_lists[0][0].device() == noop_flag.device(), "noop flag should be on the same device as tensors");
// we don't need global thus uses empty here
auto output = at::empty({320}, float_options);
......
......@@ -160,6 +160,8 @@ void multi_tensor_sgd_cuda(
TORCH_CHECK(tensor_lists[3][i].scalar_type() == at::ScalarType::Half,
"Additional output tensors should always be fp16.");
TORCH_CHECK(noop_flag.device() == tensor_lists[0][0].device(), "expected noop flag to be on the same device as tensors");
// We have 3 possibilities to handle here, in terms of
// grad_type, param_type, momentum_type, requires_fp16_copy
// 1. fp16, fp16, fp16, No
......
......@@ -1164,6 +1164,10 @@ std::vector<at::Tensor> welford_parallel_CUDA(const at::Tensor mean_feature_node
at::Tensor inv_std = at::empty_like(out_var);
at::Tensor out_mean = at::empty_like(out_var);
at::Tensor mean_feature_nodes_ = mean_feature_nodes.contiguous();
at::Tensor var_biased_ = var_biased.contiguous();
at::Tensor numel_ = numel.contiguous();
// TODO(jie): tile this for memory coalescing!
const int block = std::min(h_last_pow2(feature_size), MAX_BLOCK_SIZE);
const int grid = std::max<int>(1, feature_size / block);
......@@ -1174,9 +1178,9 @@ std::vector<at::Tensor> welford_parallel_CUDA(const at::Tensor mean_feature_node
using namespace at;
DISPATCH_FLOAT_AND_HALF(mean_feature_nodes.scalar_type(), 0, "welford_parallel_kernel",
welford_kernel_parallel<scalar_t_0><<<grid, block, 0, stream>>>(
mean_feature_nodes.DATA_PTR<scalar_t_0>(),
var_biased.DATA_PTR<scalar_t_0>(),
numel.DATA_PTR<int>(),
mean_feature_nodes_.DATA_PTR<scalar_t_0>(),
var_biased_.DATA_PTR<scalar_t_0>(),
numel_.DATA_PTR<int>(),
out_mean.DATA_PTR<scalar_t_0>(),
out_var.DATA_PTR<scalar_t_0>(),
inv_std.DATA_PTR<scalar_t_0>(),
......
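The welford_parallel_CUDA change above takes .contiguous() copies before handing raw pointers to the kernel: the per-rank statistics produced by the new all_gather path are views created by torch.split on a stacked tensor, so a kernel walking a bare data_ptr() would otherwise read them with the wrong strides. A tiny illustration of the hazard from the Python side:

    import torch

    combined = torch.arange(12.).reshape(3, 4)           # 3 ranks, 4 values each
    mean_all, var_all = torch.split(combined, 2, dim=1)  # views into `combined`
    print(mean_all.is_contiguous())                       # False: stride is (4, 1)
    # A CUDA kernel that walks a raw data_ptr() assumes densely packed rows,
    # so the wrapper materializes that layout first.
    print(mean_all.contiguous().is_contiguous())          # True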
......@@ -182,6 +182,7 @@ def main():
print("=> loading checkpoint '{}'".format(args.resume))
checkpoint = torch.load(args.resume, map_location = lambda storage, loc: storage.cuda(args.gpu))
args.start_epoch = checkpoint['epoch']
global best_prec1
best_prec1 = checkpoint['best_prec1']
model.load_state_dict(checkpoint['state_dict'])
optimizer.load_state_dict(checkpoint['optimizer'])
......@@ -527,7 +528,7 @@ def accuracy(output, target, topk=(1,)):
res = []
for k in topk:
correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
res.append(correct_k.mul_(100.0 / batch_size))
return res
......
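The accuracy() fix above swaps view(-1) for reshape(-1): correct comes from comparing against a transposed pred, so the sliced tensor is non-contiguous and view raises on recent PyTorch, whereas reshape falls back to a copy when the strides do not allow a view. A small standalone repro of the distinction:

    import torch

    t = torch.arange(6).reshape(2, 3).t()   # transpose makes it non-contiguous
    print(t.is_contiguous())                # False

    try:
        t.view(-1)                          # view needs compatible strides
    except RuntimeError as err:
        print("view failed:", err)

    print(t.reshape(-1))                    # reshape copies into a flat tensor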
import torch
from torch.utils import cpp_extension
from setuptools import setup, find_packages
import subprocess
......@@ -9,6 +10,16 @@ import os
# ninja build does not work unless include_dirs are abs path
this_dir = os.path.dirname(os.path.abspath(__file__))
def get_cuda_bare_metal_version(cuda_dir):
raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
output = raw_output.split()
release_idx = output.index("release") + 1
release = output[release_idx].split(".")
bare_metal_major = release[0]
bare_metal_minor = release[1][0]
return raw_output, bare_metal_major, bare_metal_minor
if not torch.cuda.is_available():
# https://github.com/NVIDIA/apex/issues/486
# Extension builds after https://github.com/pytorch/pytorch/pull/23408 attempt to query torch.cuda.get_device_capability(),
......@@ -16,11 +27,16 @@ if not torch.cuda.is_available():
print('\nWarning: Torch did not find available GPUs on this system.\n',
'If your intention is to cross-compile, this is not an error.\n'
'By default, Apex will cross-compile for Pascal (compute capabilities 6.0, 6.1, 6.2),\n'
'Volta (compute capability 7.0), and Turing (compute capability 7.5).\n'
'Volta (compute capability 7.0), Turing (compute capability 7.5),\n'
'and, if the CUDA version is >= 11.0, Ampere (compute capability 8.0).\n'
'If you wish to cross-compile for a single specific architecture,\n'
'export TORCH_CUDA_ARCH_LIST="compute capability" before running setup.py.\n')
if os.environ.get("TORCH_CUDA_ARCH_LIST", None) is None:
os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5"
_, bare_metal_major, _ = get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
if int(bare_metal_major) == 11:
os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5;8.0"
else:
os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5"
print("\n\ntorch.__version__ = {}\n\n".format(torch.__version__))
TORCH_MAJOR = int(torch.__version__.split('.')[0])
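get_cuda_bare_metal_version() above scrapes the toolkit version out of nvcc -V so that the default TORCH_CUDA_ARCH_LIST only gains compute capability 8.0 when the bare-metal CUDA is 11.x. A rough sketch of what it returns, assuming a toolkit installed under /usr/local/cuda and the usual banner format:

    # nvcc -V typically ends with a line such as:
    #   Cuda compilation tools, release 11.0, V11.0.194
    # The helper splits that text, takes the token after "release" ("11.0,"),
    # and returns the full banner plus the major/minor version strings.
    raw, major, minor = get_cuda_bare_metal_version("/usr/local/cuda")
    print(major, minor)   # e.g. "11" "0"  -> TORCH_CUDA_ARCH_LIST gains "8.0"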
......@@ -64,13 +80,18 @@ if "--cpp_ext" in sys.argv:
CppExtension('apex_C',
['csrc/flatten_unflatten.cpp',]))
def check_cuda_torch_binary_vs_bare_metal(cuda_dir):
def get_cuda_bare_metal_version(cuda_dir):
raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
output = raw_output.split()
release_idx = output.index("release") + 1
release = output[release_idx].split(".")
bare_metal_major = release[0]
bare_metal_minor = release[1][0]
return raw_output, bare_metal_major, bare_metal_minor
def check_cuda_torch_binary_vs_bare_metal(cuda_dir):
raw_output, bare_metal_major, bare_metal_minor = get_cuda_bare_metal_version(cuda_dir)
torch_binary_major = torch.version.cuda.split(".")[0]
torch_binary_minor = torch.version.cuda.split(".")[1]
......@@ -109,6 +130,25 @@ if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4):
version_ge_1_5 = ['-DVERSION_GE_1_5']
version_dependent_macros = version_ge_1_1 + version_ge_1_3 + version_ge_1_5
if "--distributed_adam" in sys.argv:
from torch.utils.cpp_extension import CUDAExtension
sys.argv.remove("--distributed_adam")
from torch.utils.cpp_extension import BuildExtension
cmdclass['build_ext'] = BuildExtension
if torch.utils.cpp_extension.CUDA_HOME is None:
raise RuntimeError("--distributed_adam was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")
else:
ext_modules.append(
CUDAExtension(name='distributed_adam_cuda',
sources=['apex/contrib/csrc/optimizers/multi_tensor_distopt_adam.cpp',
'apex/contrib/csrc/optimizers/multi_tensor_distopt_adam_kernel.cu'],
include_dirs=[os.path.join(this_dir, 'csrc')],
extra_compile_args={'cxx': ['-O3',] + version_dependent_macros,
'nvcc':['-O3',
'--use_fast_math'] + version_dependent_macros}))
if "--distributed_lamb" in sys.argv:
from torch.utils.cpp_extension import CUDAExtension
sys.argv.remove("--distributed_lamb")
......@@ -228,7 +268,7 @@ if "--xentropy" in sys.argv:
'apex/contrib/csrc/xentropy/xentropy_kernel.cu'],
include_dirs=[os.path.join(this_dir, 'csrc')],
extra_compile_args=['-O3'] + version_dependent_macros))
if "--deprecated_fused_adam" in sys.argv:
from torch.utils.cpp_extension import CUDAExtension
......@@ -281,6 +321,7 @@ torch_dir = torch.__path__[0]
if os.path.exists(os.path.join(torch_dir, 'include', 'ATen', 'CUDAGenerator.h')):
generator_flag = ['-DOLD_GENERATOR']
if "--fast_multihead_attn" in sys.argv:
from torch.utils.cpp_extension import CUDAExtension
sys.argv.remove("--fast_multihead_attn")
......@@ -291,6 +332,13 @@ if "--fast_multihead_attn" in sys.argv:
if torch.utils.cpp_extension.CUDA_HOME is None:
raise RuntimeError("--fast_multihead_attn was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")
else:
# Check if CUDA 11 is installed for compute capability 8.0
cc_flag = []
_, bare_metal_major, _ = get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
if int(bare_metal_major) >= 11:
cc_flag.append('-gencode')
cc_flag.append('arch=compute_80,code=sm_80')
subprocess.run(["git", "submodule", "update", "--init", "apex/contrib/csrc/multihead_attn/cutlass"])
ext_modules.append(
CUDAExtension(name='fast_additive_mask_softmax_dropout',
......@@ -304,7 +352,7 @@ if "--fast_multihead_attn" in sys.argv:
'-U__CUDA_NO_HALF_CONVERSIONS__',
'--expt-relaxed-constexpr',
'--expt-extended-lambda',
'--use_fast_math'] + version_dependent_macros + generator_flag}))
'--use_fast_math'] + version_dependent_macros + generator_flag + cc_flag}))
ext_modules.append(
CUDAExtension(name='fast_mask_softmax_dropout',
sources=['apex/contrib/csrc/multihead_attn/masked_softmax_dropout.cpp',
......@@ -317,7 +365,7 @@ if "--fast_multihead_attn" in sys.argv:
'-U__CUDA_NO_HALF_CONVERSIONS__',
'--expt-relaxed-constexpr',
'--expt-extended-lambda',
'--use_fast_math'] + version_dependent_macros + generator_flag}))
'--use_fast_math'] + version_dependent_macros + generator_flag + cc_flag}))
ext_modules.append(
CUDAExtension(name='fast_self_multihead_attn_bias_additive_mask',
sources=['apex/contrib/csrc/multihead_attn/self_multihead_attn_bias_additive_mask.cpp',
......@@ -330,7 +378,7 @@ if "--fast_multihead_attn" in sys.argv:
'-U__CUDA_NO_HALF_CONVERSIONS__',
'--expt-relaxed-constexpr',
'--expt-extended-lambda',
'--use_fast_math'] + version_dependent_macros + generator_flag}))
'--use_fast_math'] + version_dependent_macros + generator_flag + cc_flag}))
ext_modules.append(
CUDAExtension(name='fast_self_multihead_attn_bias',
sources=['apex/contrib/csrc/multihead_attn/self_multihead_attn_bias.cpp',
......@@ -343,7 +391,7 @@ if "--fast_multihead_attn" in sys.argv:
'-U__CUDA_NO_HALF_CONVERSIONS__',
'--expt-relaxed-constexpr',
'--expt-extended-lambda',
'--use_fast_math'] + version_dependent_macros + generator_flag}))
'--use_fast_math'] + version_dependent_macros + generator_flag + cc_flag}))
ext_modules.append(
CUDAExtension(name='fast_self_multihead_attn',
sources=['apex/contrib/csrc/multihead_attn/self_multihead_attn.cpp',
......@@ -356,7 +404,7 @@ if "--fast_multihead_attn" in sys.argv:
'-U__CUDA_NO_HALF_CONVERSIONS__',
'--expt-relaxed-constexpr',
'--expt-extended-lambda',
'--use_fast_math'] + version_dependent_macros + generator_flag}))
'--use_fast_math'] + version_dependent_macros + generator_flag + cc_flag}))
ext_modules.append(
CUDAExtension(name='fast_self_multihead_attn_norm_add',
sources=['apex/contrib/csrc/multihead_attn/self_multihead_attn_norm_add.cpp',
......@@ -369,7 +417,7 @@ if "--fast_multihead_attn" in sys.argv:
'-U__CUDA_NO_HALF_CONVERSIONS__',
'--expt-relaxed-constexpr',
'--expt-extended-lambda',
'--use_fast_math'] + version_dependent_macros + generator_flag}))
'--use_fast_math'] + version_dependent_macros + generator_flag + cc_flag}))
ext_modules.append(
CUDAExtension(name='fast_encdec_multihead_attn',
sources=['apex/contrib/csrc/multihead_attn/encdec_multihead_attn.cpp',
......@@ -382,7 +430,7 @@ if "--fast_multihead_attn" in sys.argv:
'-U__CUDA_NO_HALF_CONVERSIONS__',
'--expt-relaxed-constexpr',
'--expt-extended-lambda',
'--use_fast_math'] + version_dependent_macros + generator_flag}))
'--use_fast_math'] + version_dependent_macros + generator_flag + cc_flag}))
ext_modules.append(
CUDAExtension(name='fast_encdec_multihead_attn_norm_add',
sources=['apex/contrib/csrc/multihead_attn/encdec_multihead_attn_norm_add.cpp',
......@@ -395,7 +443,7 @@ if "--fast_multihead_attn" in sys.argv:
'-U__CUDA_NO_HALF_CONVERSIONS__',
'--expt-relaxed-constexpr',
'--expt-extended-lambda',
'--use_fast_math'] + version_dependent_macros + generator_flag}))
'--use_fast_math'] + version_dependent_macros + generator_flag + cc_flag}))
setup(
name='apex',
......
import unittest
import apex
import torch
from apex.testing.common_utils import skipIfRocm
class TestFusedAdagrad(unittest.TestCase):
def setUp(self, max_abs_diff=1e-6, max_rel_diff=1, iters=7):
self.max_abs_diff = max_abs_diff
self.max_rel_diff = max_rel_diff
self.iters = iters
torch.cuda.manual_seed(9876)
def tearDown(self):
pass
def gen_param_optim(self, tensors, adagrad_option, apex_only=False):
ref_param = []
tst_param = []
for tensor in tensors:
if apex_only:
ref_param.append(torch.nn.Parameter(tensor.clone().float()))
else:
ref_param.append(torch.nn.Parameter(tensor.clone()))
tst_param.append(torch.nn.Parameter(tensor.clone()))
if apex_only:
ref_optim = apex.optimizers.FusedAdagrad(ref_param, **adagrad_option)
else:
ref_optim = torch.optim.Adagrad(ref_param, **adagrad_option)
tst_optim = apex.optimizers.FusedAdagrad(tst_param, **adagrad_option)
return (ref_param, tst_param, ref_optim, tst_optim)
def gen_grad(self, ref_param, tst_param, apex_only=False):
for p_ref, p_tst in zip(ref_param, tst_param):
p_tst.grad = torch.rand_like(p_tst)
p_ref.grad = p_tst.grad.detach().float() if apex_only else p_tst.grad
def gen_mixed_grad(self, ref_param, tst_param, scale=1.0):
half_grads = []
for p_ref, _ in zip(ref_param, tst_param):
half_grads.append(torch.rand_like(p_ref).half())
p_ref.grad = half_grads[-1].float() / scale
return half_grads
def get_max_diff(self, ref_param, tst_param, apex_only=False):
max_abs_diff = max_rel_diff = 0
for p_ref, p_tst in zip(ref_param, tst_param):
if apex_only:
p_tst = p_tst.float()
max_abs_diff_p = (p_ref - p_tst).abs().max().item()
max_rel_diff_p = ((p_ref - p_tst) / p_ref).abs().max().item()
if max_abs_diff_p > max_abs_diff:
max_abs_diff = max_abs_diff_p
if max_rel_diff_p > max_rel_diff:
max_rel_diff = max_rel_diff_p
return max_abs_diff, max_rel_diff
def gen_single_type_test(self, param_type=torch.float, apex_only=False):
nelem = 278011
adagrad_option = {"lr": 5e-4, "eps": 1e-08, "weight_decay": 1.0e-5}
tensor = torch.rand(nelem, dtype=param_type, device="cuda")
ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(
[tensor], adagrad_option, apex_only=apex_only
)
for _ in range(self.iters):
self.gen_grad(ref_param, tst_param, apex_only=apex_only)
ref_optim.step()
tst_optim.step()
max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param, apex_only=apex_only)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
if not apex_only:
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
@skipIfRocm
def test_float(self):
self.gen_single_type_test(param_type=torch.float)
@unittest.skip("PyTorch optimizer is not numerically correct for fp16")
def test_half(self):
self.gen_single_type_test(param_type=torch.float16)
# Compares bfloat16 computation against float32 as the gold standard.
# Uses apex optimizers (controlled by the apex_only flag) for both types.
# Doesn't use an upstream optimizer like the other tests, as they seem to be
# numerically unstable for half types (see the skip note for the test above).
@skipIfRocm
def test_bfloat16(self):
self.max_abs_diff = 1e-2
self.gen_single_type_test(param_type=torch.bfloat16, apex_only=True)
@skipIfRocm
def test_multi_params(self):
sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
adagrad_option = {"lr": 5e-4, "eps": 1e-08, "weight_decay": 0}
tensors = []
for size in sizes:
tensors.append(torch.rand(size, dtype=torch.float, device="cuda"))
ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(
tensors, adagrad_option
)
for _ in range(self.iters):
self.gen_grad(ref_param, tst_param)
ref_optim.step()
tst_optim.step()
max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
def test_adagrad_option(self):
nelem = 1
adagrad_option = {"lr": 0.01, "eps": 3e-06, "weight_decay": 0}
tensor = torch.rand(nelem, dtype=torch.float, device="cuda")
ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(
[tensor], adagrad_option
)
for _ in range(self.iters):
self.gen_grad(ref_param, tst_param)
ref_optim.step()
tst_optim.step()
max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
import argparse
import random
import sys
import torch
from torch.nn.parallel import DistributedDataParallel as DDP
from apex import amp
from apex.optimizers import FusedAdam
from apex.contrib.optimizers.distributed_fused_adam import DistributedFusedAdam
class TestModel(torch.nn.Module):
def __init__(self, args):
super(TestModel, self).__init__()
self.linear = torch.nn.Sequential(*[torch.nn.Linear(args.dim, args.dim, bias=args.bias) for _ in range(args.layers)])
def forward(self, x):
return self.linear(x)
def setup(args):
## Model
ref_model = TestModel(args).cuda()
dist_model = TestModel(args).cuda()
# Same weights
with torch.no_grad():
for dp, rp in zip(dist_model.parameters(), ref_model.parameters()):
dp.data.copy_(rp.data)
dist_model = dist_model.half()
## Optimizer
# same hyperparameters
ref_opt_args = { 'lr': 1e-3, 'eps': 1e-6, 'weight_decay': 0.01 }
ref_opt = FusedAdam(ref_model.parameters(), **ref_opt_args)
dist_opt_args = ref_opt_args.copy()
dist_opt_args.update( {'overlap_reductions' : False} )
dist_opt_args.update( {'process_group_size' : args.n_gpu} )
dist_opt_args.update( {'dwu_group_size' : args.dwu_group_size} )
dist_opt_args.update( {'dwu_num_blocks' : 1} )
dist_opt_args.update( {'dwu_num_chunks' : 1} )
dist_opt = DistributedFusedAdam(dist_model.parameters(), **dist_opt_args)
dist_opt.set_global_scale(1.)
## amp-init
amp_args = { 'loss_scale' : 'dynamic' , 'opt_level' : 'O2'}
ref_model, ref_opt = amp.initialize(ref_model, ref_opt, **amp_args)
## DDP
ref_model = DDP(ref_model, device_ids=[args.rank])
with torch.no_grad():
for dp in dist_model.parameters():
torch.distributed.broadcast(dp.data, src=0)
for rp in ref_model.parameters():
torch.distributed.broadcast(rp.data, src=0)
torch.cuda.synchronize()
torch.distributed.barrier()
if get_rank() == 0:
print(f'dist opt with {args.n_gpu} GPUs')
return ref_model, ref_opt, dist_model, dist_opt
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--local_rank', type=int, default=-1)
parser.add_argument('--steps', type=int, default=20)
parser.add_argument('--batch', type=int, default=32)
parser.add_argument('--dim', type=int, default=4)
parser.add_argument('--layers', type=int, default=2)
parser.add_argument('--bias', action='store_true')
parser.add_argument('--atol', type=float, default=1e-3)
parser.add_argument('--rtol', type=float, default=1)
parser.add_argument('--dwu_group_size', type=float, default=1)
args = parser.parse_args()
return args
def setup_env(args):
torch.cuda.set_device(args.local_rank)
torch.distributed.init_process_group(backend='nccl', init_method='env://')
args.rank = torch.distributed.get_rank()
args.n_gpu = torch.distributed.get_world_size()
seed = 42 + get_rank()
random.seed(seed)
torch.manual_seed(seed)
return args
def get_rank():
return torch.distributed.get_rank()
def main():
args = parse_args()
args = setup_env(args)
tol_args = { 'atol' : args.atol, 'rtol' : args.rtol }
torch.set_printoptions(precision=16)
ref_model, ref_opt, dist_model, dist_opt = setup(args)
# lazy_init not called yet, initialize stash
stash = ref_opt._amp_stash
stash.all_fp16_params, stash.all_fp32_from_fp16_params = [], []
# make sure everything from _first_step_init_ is ready before training
# e.g. registering allreduce_hook
# so that gradients are copied/reduced when necessary
dist_opt._init_everything()
for i in range(args.steps):
x_ref = torch.randn(args.batch, args.dim, dtype=torch.half).cuda().requires_grad_(True)
x_dist = x_ref.clone().detach().requires_grad_(True)
if get_rank() == 0:
print(f'[{i}] Checking input')
#print("x_ref:", x_ref.flatten()[:10])
#print("x_dist:", x_dist.flatten()[:10])
assert(torch.allclose(x_ref, x_dist, **tol_args))
y_ref = ref_model(x_ref).half()
y_dist = dist_model(x_dist)
if get_rank() == 0:
print(f'[{i}] Checking output')
#print("y_ref:", y_ref.flatten()[:10])
#print("y_dist:", y_dist.flatten()[:10])
assert(torch.allclose(y_ref, y_dist, **tol_args))
dy = torch.randn_like(y_ref)
y_ref.backward(dy)
y_dist.backward(dy)
if get_rank() == 0:
print(f'[{i}] Checking gradients')
torch.distributed.barrier()
torch.cuda.synchronize()
assert(torch.allclose(x_ref.grad, x_dist.grad, **tol_args))
# gradient all-reduce within distributed optimizer
dist_opt.complete_reductions()
if get_rank() == 0:
print(f'[{i}] Stepping')
ref_opt.step()
dist_opt.step()
torch.cuda.synchronize()
torch.distributed.barrier()
print('Checking new weights')
if get_rank() == 0:
print("ref param:", ref_model.module.linear[0].weight)
print("dist param:", dist_model.linear[0].weight)
for i, (rp, dp) in enumerate(zip(ref_model.parameters(), dist_model.parameters())):
if not torch.allclose(rp, dp, **tol_args):
if get_rank() == 0:
print(f'Rank: {get_rank()}, Param: {i}')
print(f'ref: {rp.sum().item()}, dist: {dp.sum().item()}')
print(rp)
print(dp)
print(torch.abs(rp-dp) > tol_args['atol'])
sys.exit(0)
# zero grads
for rp, dp in zip(ref_model.parameters(), dist_model.parameters()):
rp.grad = None
dp.grad = None
if __name__ == "__main__":
main()
......@@ -4,10 +4,11 @@ import random
import torch
import apex
from itertools import product
from apex.testing.common_utils import skipIfRocm
class TestFusedAdam(unittest.TestCase):
class TestFusedOptimizer(unittest.TestCase):
def setUp(self, max_abs_diff=1e-3, max_rel_diff=1, iters=7):
self.max_abs_diff = max_abs_diff
self.max_rel_diff = max_rel_diff
......@@ -17,7 +18,7 @@ class TestFusedAdam(unittest.TestCase):
def tearDown(self):
pass
def gen_param_optim(self, tensors, adam_option, apex_only=False):
def gen_param_optim(self, tensors, options, apex_only=False):
ref_param = []
tst_param = []
for tensor in tensors:
......@@ -28,10 +29,10 @@ class TestFusedAdam(unittest.TestCase):
tst_param.append(torch.nn.Parameter(tensor.clone()))
if apex_only:
ref_optim = apex.optimizers.FusedAdam(ref_param, **adam_option)
ref_optim = self.fused_optim(ref_param, **options)
else:
ref_optim = torch.optim.Adam(ref_param, **adam_option)
tst_optim = apex.optimizers.FusedAdam(tst_param, **adam_option)
ref_optim = self.ref_optim(ref_param, **options)
tst_optim = self.fused_optim(tst_param, **options)
return (ref_param, tst_param, ref_optim, tst_optim)
......@@ -60,26 +61,32 @@ class TestFusedAdam(unittest.TestCase):
return max_abs_diff, max_rel_diff
def gen_single_type_test(self, param_type=torch.float, apex_only=False):
def gen_single_type_test(self, param_type=torch.float, apex_only=False, device='cuda'):
nelem = 278011
adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
'weight_decay':0, 'amsgrad':False}
tensor = torch.rand(nelem, dtype=param_type, device='cuda')
tensor = torch.rand(nelem, dtype=param_type, device=device)
ref_param, tst_param, ref_optim, tst_optim = \
self.gen_param_optim([tensor], adam_option, apex_only=apex_only)
self.gen_param_optim([tensor], self.options, apex_only=apex_only)
for i in range(self.iters):
self.gen_grad(ref_param, tst_param, apex_only=apex_only)
ref_optim.step()
tst_optim.step()
max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param, apex_only=apex_only)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
if not apex_only:
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
@skipIfRocm
class TestFusedAdam(TestFusedOptimizer):
def __init__(self, *args, **kwargs):
super(TestFusedAdam, self).__init__(*args, **kwargs)
self.options = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
'weight_decay': 0, 'amsgrad': False}
self.ref_optim = torch.optim.Adam
self.fused_optim = apex.optimizers.FusedAdam
def test_float(self):
self.gen_single_type_test(param_type=torch.float)
......@@ -95,17 +102,22 @@ class TestFusedAdam(unittest.TestCase):
self.max_abs_diff = 1e-2
self.gen_single_type_test(param_type=torch.bfloat16, apex_only=True)
@unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
def test_multi_device(self):
devices = ("cuda:0", "cuda:1")
for current_dev, tensor_dev in product(devices, devices):
with torch.cuda.device(current_dev):
self.gen_single_type_test(param_type=torch.float, device=tensor_dev)
@unittest.skip('Disable until 8/1/2019 adam/adamw upstream picked')
def test_multi_params(self):
sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
'weight_decay':0, 'amsgrad':False}
tensors = []
for size in sizes:
tensors.append(torch.rand(size, dtype=torch.float, device='cuda'))
ref_param, tst_param, ref_optim, tst_optim = \
self.gen_param_optim(tensors, adam_option)
self.gen_param_optim(tensors, self.options)
for i in range(self.iters):
self.gen_grad(ref_param, tst_param)
......@@ -118,12 +130,9 @@ class TestFusedAdam(unittest.TestCase):
@unittest.skip('No longer support fuse scaling')
def test_scale(self):
nelem = 278011
adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
'weight_decay':0, 'amsgrad':False}
tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
ref_param, tst_param, ref_optim, tst_optim = \
self.gen_param_optim([tensor], adam_option)
self.gen_param_optim([tensor], self.options)
for i in range(self.iters):
scale = random.random() * 1000
......@@ -138,12 +147,10 @@ class TestFusedAdam(unittest.TestCase):
@unittest.skip('No longer support output fp16 param')
def test_fp16_output(self):
nelem = 278011
adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
'weight_decay':0, 'amsgrad':False}
tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
ref_param, tst_param, ref_optim, tst_optim = \
self.gen_param_optim([tensor], adam_option)
self.gen_param_optim([tensor], self.options)
fp16_param = torch.nn.Parameter(tensor.clone().half())
......@@ -180,6 +187,103 @@ class TestFusedAdam(unittest.TestCase):
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
class TestFusedAdagrad(TestFusedOptimizer):
def __init__(self, *args, **kwargs):
super(TestFusedAdagrad, self).__init__(*args, **kwargs)
self.options = {"lr": 5e-4, "eps": 1e-08, "weight_decay": 1.0e-5}
self.ref_optim = torch.optim.Adagrad
self.fused_optim = apex.optimizers.FusedAdagrad
def test_float(self):
self.gen_single_type_test(param_type=torch.float)
@unittest.skip("PyTorch optimizer is not numerically correct for fp16")
def test_half(self):
self.gen_single_type_test(param_type=torch.float16)
@unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
def test_multi_device(self):
devices = ("cuda:0", "cuda:1")
for current_dev, tensor_dev in product(devices, devices):
with torch.cuda.device(current_dev):
self.gen_single_type_test(param_type=torch.float, device=tensor_dev)
def test_multi_params(self):
sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
adagrad_option = {"lr": 5e-4, "eps": 1e-08, "weight_decay": 0}
tensors = []
for size in sizes:
tensors.append(torch.rand(size, dtype=torch.float, device="cuda"))
ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(
tensors, adagrad_option
)
for _ in range(self.iters):
self.gen_grad(ref_param, tst_param)
ref_optim.step()
tst_optim.step()
max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
@unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
def test_multi_params_different_devices_throws(self):
sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
adagrad_option = {"lr": 5e-4, "eps": 1e-08, "weight_decay": 0}
tensors = []
for i, size in enumerate(sizes):
tensors.append(torch.rand(size, dtype=torch.float, device="cuda:"+str(i % 2)))
ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(
tensors, adagrad_option
)
self.gen_grad(ref_param, tst_param)
with self.assertRaisesRegex(RuntimeError, "not on the same device"):
tst_optim.step()
def test_adagrad_option(self):
nelem = 1
adagrad_option = {"lr": 0.01, "eps": 3e-06, "weight_decay": 0}
tensor = torch.rand(nelem, dtype=torch.float, device="cuda")
ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(
[tensor], adagrad_option
)
for _ in range(self.iters):
self.gen_grad(ref_param, tst_param)
ref_optim.step()
tst_optim.step()
max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
class TestFusedSGD(TestFusedOptimizer):
def __init__(self, *args, **kwargs):
super(TestFusedSGD, self).__init__(*args, **kwargs)
self.options = {"lr": .25, "momentum": .125}
self.ref_optim = torch.optim.SGD
self.fused_optim = apex.optimizers.FusedSGD
def test_float(self):
self.gen_single_type_test(param_type=torch.float)
def test_half(self):
self.gen_single_type_test(param_type=torch.float16)
@unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
def test_multi_device(self):
devices = ("cuda:0", "cuda:1")
for current_dev, tensor_dev in product(devices, devices):
with torch.cuda.device(current_dev):
self.gen_single_type_test(param_type=torch.float, device=tensor_dev)
if __name__ == '__main__':
script_path = os.path.dirname(os.path.realpath(__file__))
unittest.main()
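The refactor above folds the Adam, Adagrad and SGD tests into one TestFusedOptimizer harness that only needs options, ref_optim and fused_optim. Adding coverage for another fused configuration then reduces to a subclass like the following (a hypothetical sketch reusing the file's existing imports, shown only to illustrate the pattern):

    class TestFusedSGDNesterov(TestFusedOptimizer):
        # Hypothetical: exercise FusedSGD with Nesterov momentum against
        # torch.optim.SGD using the shared single-type test.
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.options = {"lr": 0.25, "momentum": 0.125, "nesterov": True}
            self.ref_optim = torch.optim.SGD
            self.fused_optim = apex.optimizers.FusedSGD

        def test_float(self):
            self.gen_single_type_test(param_type=torch.float)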
......@@ -6,6 +6,7 @@ from torch.optim import Optimizer
import apex
from apex.multi_tensor_apply import multi_tensor_applier
from apex.testing.common_utils import skipIfRocm
from itertools import product
class RefLAMB(Optimizer):
r"""Implements Lamb algorithm.
......@@ -41,7 +42,7 @@ class RefLAMB(Optimizer):
import amp_C
self.multi_tensor_l2norm=amp_C.multi_tensor_l2norm
# Skip buffer
self._dummy_overflow_buf = torch.cuda.IntTensor([0])
self._dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device=self.param_groups[0]["params"][0].device)
self.multi_tensor_lamb = amp_C.multi_tensor_lamb
else:
raise RuntimeError('apex.optimizers.FusedLAMB requires cuda extensions')
......@@ -69,7 +70,8 @@ class RefLAMB(Optimizer):
else:
raise RuntimeError('FusedLAMB only support fp16 and fp32.')
g_norm_32, g_norm_16 = torch.zeros(1, device='cuda'), torch.zeros(1, device='cuda')
device = self.param_groups[0]["params"][0].device
g_norm_32, g_norm_16 = torch.zeros(1, device=device), torch.zeros(1, device=device)
# compute grad norm for two lists
if len(g_all_32) > 0:
g_norm_32 = multi_tensor_applier(self.multi_tensor_l2norm,
......@@ -85,7 +87,7 @@ class RefLAMB(Optimizer):
self._dummy_overflow_buf,
[[g_norm_32, g_norm_16]],
False)[0]
max_grad_norm = 1.0
clipped_ratio = max_grad_norm / max(global_grad_norm, max_grad_norm)
......@@ -94,7 +96,7 @@ class RefLAMB(Optimizer):
if p.grad is None:
continue
p.grad.data *= clipped_ratio
grad = p.grad.data
if grad.is_sparse:
raise RuntimeError('Lamb does not support sparse gradients, consider SparseAdam instead.')
......@@ -137,7 +139,7 @@ class RefLAMB(Optimizer):
state['g_norm'] = g_norm
state['trust_ratio'] = trust_ratio
step_size = group['lr']
p.data.add_(update, alpha=-step_size*trust_ratio)
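Between the gradient clipping shown above and the final p.data.add_ call, the reference optimizer computes Adam-style moments and a layer-wise trust ratio; the diff elides that middle section. The following is therefore only a sketch of a single-parameter LAMB step (bias correction omitted, helper name illustrative), not apex's exact implementation:

    import torch

    def lamb_step_sketch(p, grad, exp_avg, exp_avg_sq,
                         lr, beta1, beta2, eps, weight_decay):
        exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)                 # m_t
        exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)    # v_t
        update = exp_avg / (exp_avg_sq.sqrt() + eps)
        if weight_decay != 0:
            update = update + weight_decay * p.data
        w_norm = p.data.norm().item()
        u_norm = update.norm().item()
        # Layer-wise trust ratio: scale the step by ||w|| / ||update||.
        trust_ratio = w_norm / u_norm if w_norm > 0 and u_norm > 0 else 1.0
        p.data.add_(update, alpha=-lr * trust_ratio)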
......@@ -189,11 +191,11 @@ class TestFusedLAMB(unittest.TestCase):
return max_abs_diff, max_rel_diff
def gen_single_type_test(self, param_type=torch.float):
def gen_single_type_test(self, param_type=torch.float, device="cuda"):
nelem = 278011
tensor = torch.rand(nelem, dtype=param_type, device='cuda')
tensor = torch.rand(nelem, dtype=param_type, device=device)
weight_decay = [0, 0.01]
for wd in weight_decay:
lamb_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08, 'weight_decay':wd}
ref_param, tst_param, ref_optim, tst_optim = \
......@@ -202,7 +204,9 @@ class TestFusedLAMB(unittest.TestCase):
for i in range(self.iters):
self.gen_grad(ref_param, tst_param)
ref_optim.step()
torch.cuda.synchronize()
tst_optim.step()
torch.cuda.synchronize()
max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
......@@ -217,10 +221,17 @@ class TestFusedLAMB(unittest.TestCase):
self.gen_single_type_test(param_type=torch.float16)
@skipIfRocm
@unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
def test_multi_device(self):
devices = ("cuda:0", "cuda:1")
for current_dev, tensor_dev in product(devices, devices):
with torch.cuda.device(current_dev):
self.gen_single_type_test(param_type=torch.float, device=tensor_dev)
def test_multi_params(self):
sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
weight_decay = [0, 0.01]
for wd in weight_decay:
lamb_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08, 'weight_decay':wd}
tensors = []
......@@ -242,7 +253,7 @@ class TestFusedLAMB(unittest.TestCase):
nelem = 1
tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
weight_decay = [0, 0.01]
for wd in weight_decay:
lamb_option = {'lr':0.01, 'betas':(0.6, 0.9), 'eps':3e-06, 'weight_decay':wd}
ref_param, tst_param, ref_optim, tst_optim = \
......