Unverified commit 5baa68d3 authored by Jeff Daily, committed by GitHub

use __launch_bounds__ for multi_tensor_apply (#44)

use __launch_bounds__(1024) for multi_tensor_apply, re-enable skipped tests
parent 85b56d01
......@@ -28,6 +28,9 @@ template<int n> struct TensorListMetadata
template<typename T, typename U, typename... ArgTypes>
#ifdef __HIP_PLATFORM_HCC__
__launch_bounds__(1024)
#endif
__global__ void multi_tensor_apply_kernel(
int chunk_size,
volatile int* noop_flag,
......
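For readers unfamiliar with the attribute: __launch_bounds__(N) promises the compiler that the kernel will never be launched with more than N threads per block, letting it budget registers for that block size; under HIP the attribute maps to the AMD flat work-group size limit. The sketch below is not part of this diff; scale_kernel and the launch shown in the trailing comment are hypothetical, illustrative assumptions that only demonstrate the same HIP-guarded pattern applied above to multi_tensor_apply_kernel.

// Minimal sketch (toy kernel, not apex code): cap the launch bounds only when
// compiling for the HIP platform, exactly as the diff above does.
template<typename T>
#ifdef __HIP_PLATFORM_HCC__
__launch_bounds__(1024)   // promise: never launched with more than 1024 threads per block
#endif
__global__ void scale_kernel(T* x, T alpha, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    x[i] *= alpha;
}

// Any launch must respect the declared bound, e.g.:
//   int block = 1024;
//   int grid  = (n + block - 1) / block;
//   scale_kernel<float><<<grid, block>>>(d_x, 2.0f, n);

Guarding the attribute behind __HIP_PLATFORM_HCC__ leaves CUDA builds untouched while giving the ROCm compiler an explicit 1024-thread upper bound, which appears to be what lets the previously skipped ROCm tests be re-enabled in this change.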
......@@ -6,7 +6,6 @@ import torch.nn.functional as F
import torch.optim as optim
from apex import amp
from apex.testing.common_utils import skipIfRocm
from utils import common_init, FLOAT
......@@ -44,7 +43,7 @@ class TestCheckpointing(unittest.TestCase):
'Parameter in state_dict not FLOAT')
def train_step(self, model, optimizer, data, loss_ids):
optimizer.zero_grad()
output = model(data)
......@@ -102,12 +101,12 @@ class TestCheckpointing(unittest.TestCase):
if opt_level == res_opt_level:
# train for nb_epochs and restore after nb_epochs_restore
for epoch in range(nb_epochs):
x = torch.randn(16, 3, 24, 24, device='cuda')
output = self.train_step(
model, optimizer, x, range(num_losses))
# Initialize model one step before comparing.
# Otherwise the batchnorm layers will be updated
# additionally in restore_model
if epoch == (nb_epochs_restore - 1):
# Load model and optimizer
......@@ -161,7 +160,6 @@ class TestCheckpointing(unittest.TestCase):
# skip tests for different opt_levels
continue
@skipIfRocm
def test_loss_scale_decrease(self):
num_losses = 3
nb_decrease_loss_scales = [0, 1, 2]
......@@ -171,10 +169,10 @@ class TestCheckpointing(unittest.TestCase):
nb_decrease_loss_scales_tmp = list(nb_decrease_loss_scales)
model = MyModel().to('cuda')
optimizer = optim.SGD(model.parameters(),
lr=self.initial_lr)
model, optimizer = amp.initialize(
model, optimizer, opt_level=opt_level, num_losses=num_losses,
verbosity=0)
......@@ -182,26 +180,26 @@ class TestCheckpointing(unittest.TestCase):
if amp._amp_state.opt_properties.loss_scale != 'dynamic':
#print('Static loss scale set. Skipping opt_level.')
continue
# force to skip some updates to decrease the loss_scale
initial_loss_scales = []
for idx in range(num_losses):
initial_loss_scales.append(
amp._amp_state.loss_scalers[idx].loss_scale())
for _ in range(len(nb_decrease_loss_scales)):
x = torch.randn(16, 3, 24, 24, device='cuda')
for idx in range(num_losses):
while nb_decrease_loss_scales_tmp[idx] > 0:
optimizer.zero_grad()
output = model(x * 2**17)
loss = output.mean()
with amp.scale_loss(loss, optimizer, loss_id=idx) as scaled_loss:
scaled_loss.backward(retain_graph=True)
optimizer.step()
nb_decrease_loss_scales_tmp[idx] -= 1
# Check loss scales afterwards
updated_loss_scales = []
for idx in range(num_losses):
......@@ -243,7 +241,7 @@ class TestCheckpointing(unittest.TestCase):
# Create dummy data
data = torch.randn(10, 3, 4, 4, device='cuda')
target = torch.randn(10, 6, 4, 4, device='cuda')
# Get initial loss
optimizer.zero_grad()
output = model(data)
......@@ -266,4 +264,4 @@ class TestCheckpointing(unittest.TestCase):
if __name__=='__main__':
unittest.main()
......@@ -13,8 +13,6 @@ from torch.nn import Parameter
from utils import common_init, HALF, FLOAT,\
ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT
from apex.testing.common_utils import skipIfRocm
try:
import amp_C
disabled = False
......
......@@ -12,8 +12,6 @@ from math import floor
from utils import common_init, HALF, FLOAT,\
ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT
from apex.testing.common_utils import skipIfRocm
try:
import amp_C
from amp_C import multi_tensor_axpby
......@@ -103,7 +101,6 @@ class TestMultiTensorAxpby(unittest.TestCase):
# self.assertTrue(self.overflow_buf.item())
@unittest.skipIf(disabled, "amp_C is unavailable")
@skipIfRocm
def test_fuzz(self):
input_size_pairs = (
(7777*77, 555*555),
......@@ -143,7 +140,6 @@ class TestMultiTensorAxpby(unittest.TestCase):
@unittest.skipIf(disabled, "amp_C is unavailable")
@unittest.skipIf(not try_nhwc, "torch version is 1.4 or earlier, may not support nhwc")
@skipIfRocm
def test_fuzz_nhwc(self):
input_size_pairs = (
((7, 77, 7, 77), (5, 55, 5, 55)),
......
......@@ -11,8 +11,6 @@ import torch.nn.functional as F
from utils import common_init, HALF, FLOAT,\
ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT
from apex.testing.common_utils import skipIfRocm
try:
import amp_C
from amp_C import multi_tensor_l2norm
......@@ -69,7 +67,7 @@ class TestMultiTensorL2Norm(unittest.TestCase):
(33333, 555),
(555, 33333))
appliers = (
MultiTensorApply(2048*32),
MultiTensorApply(333),
MultiTensorApply(33333))
repeat_tensors = (
......
......@@ -11,11 +11,9 @@ import torch.nn.functional as F
from utils import common_init, HALF, FLOAT,\
ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT
from apex.testing.common_utils import skipIfRocm
try:
import amp_C
from amp_C import multi_tensor_scale
from apex.multi_tensor_apply import MultiTensorApply
disabled = False
except ImportError as err:
......@@ -56,7 +54,7 @@ class TestMultiTensorScale(unittest.TestCase):
out_list = [out.float() for out in out_list]
self.assertTrue(all([torch.allclose(out, self.ref.to(out.dtype)) for out in out_list]))
self.assertTrue(self.overflow_buf.item() == 0)
def find_inf(self, sizea, sizeb, applier, repeat_tensors, in_type, out_type, t, ind, val, inplace=False):
self.overflow_buf.zero_()
a = torch.cuda.FloatTensor(sizea).fill_(self.scale)
......@@ -84,13 +82,12 @@ class TestMultiTensorScale(unittest.TestCase):
# @unittest.skipIf(disabled, "amp_C is unavailable")
# def test_fp16_to_fp16(self):
# self.downscale(self.fp16, self.fp16, self.fp16_ref)
#
# @unittest.skipIf(disabled, "amp_C is unavailable")
# def test_fp32_to_fp16(self):
# self.downscale(self.fp32, self.fp16, self.fp16_ref)
@unittest.skipIf(disabled, "amp_C is unavailable")
@skipIfRocm
def test_fuzz(self):
input_size_pairs = (
(7777*77, 555*555),
......@@ -102,7 +99,7 @@ class TestMultiTensorScale(unittest.TestCase):
(33333, 555),
(555, 33333))
appliers = (
MultiTensorApply(2048*32),
MultiTensorApply(333),
MultiTensorApply(33333))
repeat_tensors = (
......
......@@ -13,8 +13,6 @@ from torch.nn import Parameter
from utils import common_init, HALF, FLOAT,\
ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT
from apex.testing.common_utils import skipIfRocm
class MyModel(torch.nn.Module):
def __init__(self, unique):
super(MyModel, self).__init__()
......@@ -43,7 +41,7 @@ class TestMultipleModelsOptimizersLosses(unittest.TestCase):
def tearDown(self):
pass
def test_2models2losses1optimizer(self):
model0 = MyModel(1)
model1 = MyModel(2)
......
......@@ -6,7 +6,7 @@ import torch
import apex
from torch.autograd import Variable
class TestFusedLayerNorm(unittest.TestCase):
def setUp(self):
# bias and weight are set to 0 and 1 respectively, so no need to copy parameters from cpu module to the gpu one
......@@ -33,10 +33,13 @@ class TestFusedLayerNorm(unittest.TestCase):
def test_large_batch(self):
self._test_same_output(65536)
class TestFusedLayerNormElemWise(TestFusedLayerNorm):
def setUp(self):
self.module_cpu_ = apex.normalization.FusedLayerNorm(normalized_shape=[32, 16], elementwise_affine=True).cpu()
self.module_cuda_ = apex.normalization.FusedLayerNorm(normalized_shape=[32, 16], elementwise_affine=True).cuda()
if __name__ == '__main__':
unittest.main()
......@@ -6,8 +6,6 @@ import torch
import apex
from itertools import product
from apex.testing.common_utils import skipIfRocm
class TestFusedOptimizer(unittest.TestCase):
def setUp(self, max_abs_diff=1e-3, max_rel_diff=1, iters=7):
self.max_abs_diff = max_abs_diff
......@@ -87,7 +85,6 @@ class TestFusedAdam(TestFusedOptimizer):
self.ref_optim = torch.optim.Adam
self.fused_optim = apex.optimizers.FusedAdam
@skipIfRocm
def test_float(self):
self.gen_single_type_test(param_type=torch.float)
......@@ -98,12 +95,10 @@ class TestFusedAdam(TestFusedOptimizer):
# Uses apex optimizers(controlled by apex_only flag) for both types.
# Doesn't use upstream optimizer like other tests as they seem to be
# numerically unstable for half types
@skipIfRocm
def test_bfloat16(self):
self.max_abs_diff = 1e-2
self.gen_single_type_test(param_type=torch.bfloat16, apex_only=True)
@skipIfRocm
@unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
def test_multi_device(self):
devices = ("cuda:0", "cuda:1")
......@@ -196,7 +191,6 @@ class TestFusedAdagrad(TestFusedOptimizer):
self.ref_optim = torch.optim.Adagrad
self.fused_optim = apex.optimizers.FusedAdagrad
@skipIfRocm
def test_float(self):
self.gen_single_type_test(param_type=torch.float)
......@@ -204,7 +198,6 @@ class TestFusedAdagrad(TestFusedOptimizer):
def test_half(self):
self.gen_single_type_test(param_type=torch.float16)
@skipIfRocm
@unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
def test_multi_device(self):
devices = ("cuda:0", "cuda:1")
......@@ -213,7 +206,6 @@ class TestFusedAdagrad(TestFusedOptimizer):
self.gen_single_type_test(param_type=torch.float, device=tensor_dev)
@skipIfRocm
def test_multi_params(self):
sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
adagrad_option = {"lr": 5e-4, "eps": 1e-08, "weight_decay": 0}
......
......@@ -5,7 +5,6 @@ import torch
from torch.optim import Optimizer
import apex
from apex.multi_tensor_apply import multi_tensor_applier
from apex.testing.common_utils import skipIfRocm
from itertools import product
class RefLAMB(Optimizer):
......@@ -212,7 +211,6 @@ class TestFusedLAMB(unittest.TestCase):
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
@skipIfRocm
def test_float(self):
self.gen_single_type_test(param_type=torch.float)
......@@ -220,7 +218,6 @@ class TestFusedLAMB(unittest.TestCase):
def test_half(self):
self.gen_single_type_test(param_type=torch.float16)
@skipIfRocm
@unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
def test_multi_device(self):
devices = ("cuda:0", "cuda:1")
......@@ -228,7 +225,6 @@ class TestFusedLAMB(unittest.TestCase):
with torch.cuda.device(current_dev):
self.gen_single_type_test(param_type=torch.float, device=tensor_dev)
@skipIfRocm
def test_multi_params(self):
sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
weight_decay = [0, 0.01]
......@@ -249,7 +245,6 @@ class TestFusedLAMB(unittest.TestCase):
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
@skipIfRocm
def test_lamb_option(self):
nelem = 1
tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
......
import unittest
import sys
from apex.testing.common_utils import TEST_WITH_ROCM, skipIfRocm
from apex.testing.common_utils import TEST_WITH_ROCM
test_dirs = ["run_amp", "run_fp16util", "run_optimizers", "run_fused_layer_norm", "run_pyprof_nvtx", "run_pyprof_data", "run_mlp"]
......