Commit f1f28ff6 authored by hubertlu-tw

Merge remote-tracking branch 'origin/dev/hubertlu/flaky_tests' into IFU-master-2022-07-29

parents 57dea7f2 4d567459
@@ -10,6 +10,7 @@ import unittest
 TEST_WITH_ROCM = os.getenv('APEX_TEST_WITH_ROCM', '0') == '1'
+SKIP_FLAKY_TEST = os.getenv('APEX_SKIP_FLAKY_TEST', '0') == '1'
 
 ## Wrapper to skip the unit tests.
 def skipIfRocm(fn):
@@ -20,3 +21,13 @@ def skipIfRocm(fn):
         else:
             fn(*args, **kwargs)
     return wrapper
+
+## Wrapper to skip the flaky unit tests.
+def skipFlakyTest(fn):
+    @wraps(fn)
+    def wrapper(*args, **kwargs):
+        if SKIP_FLAKY_TEST:
+            raise unittest.SkipTest("Test is flaky.")
+        else:
+            fn(*args, **kwargs)
+    return wrapper
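The new decorator mirrors skipIfRocm: tests opt in per function, and the skip is toggled at run time through APEX_SKIP_FLAKY_TEST. Note that the flag is read with os.getenv at import time, so it must be set before apex.testing.common_utils is first imported. A minimal usage sketch (the test class and body below are illustrative, not part of this commit):

    import os
    os.environ["APEX_SKIP_FLAKY_TEST"] = "1"  # must be set before the apex import below

    import unittest
    from apex.testing.common_utils import skipFlakyTest

    class ExampleTest(unittest.TestCase):
        @skipFlakyTest
        def test_sometimes_flaky(self):
            self.assertTrue(True)  # reported as skipped with reason "Test is flaky."

    if __name__ == "__main__":
        unittest.main()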
@@ -8,7 +8,7 @@ import torch.optim as optim
 from apex import amp
 from utils import common_init, FLOAT
+from apex.testing.common_utils import skipFlakyTest
 
 class MyModel(torch.nn.Module):
     def __init__(self):
@@ -161,6 +161,7 @@ class TestCheckpointing(unittest.TestCase):
                 # skip tests for different opt_levels
                 continue
 
+    @skipFlakyTest
     def test_loss_scale_decrease(self):
         num_losses = 3
         nb_decrease_loss_scales = [0, 1, 2]
...
@@ -4,7 +4,7 @@ import unittest
 import torch
 import apex
+from apex.testing.common_utils import skipFlakyTest
 
 class TestFusedLayerNorm(unittest.TestCase):
     dtype = torch.float
@@ -180,6 +180,7 @@ class TestMixedFusedRMSNormElemWise(TestFusedRMSNorm):
     elementwise_affine = True
     mixed_fused = True
 
+@skipFlakyTest
 class TestFusedRMSNormElemWiseHalf(TestFusedRMSNormElemWise):
     dtype = torch.half
     bwd_thresholds = dict(rtol=1.6e-2, atol=3e-3)
@@ -188,6 +189,7 @@ class TestFusedRMSNormElemWiseHalf(TestFusedRMSNormElemWise):
         self.skipTest("Skip to save time")
 
+@skipFlakyTest
 class TestFusedLayerNormElemWiseBFloat16(TestFusedLayerNormElemWise):
     dtype = torch.bfloat16
     # NOTE (mkozuki): [BFloat16 Layer Norm flakiness]
...
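One caveat worth flagging: here skipFlakyTest is applied to whole TestCase classes, but as written it is a function-returning decorator (@wraps(fn) around wrapper), so the decorated class name ends up bound to a plain function and unittest discovery no longer collects it as a TestCase, regardless of APEX_SKIP_FLAKY_TEST. The stock idiom for conditional class-level skipping is unittest.skipIf, which preserves the class; a sketch under that assumption (the class and test body are illustrative):

    import unittest
    from apex.testing.common_utils import SKIP_FLAKY_TEST

    # unittest.skipIf keeps the decorated object a TestCase subclass,
    # so discovery still finds it and reports each test as skipped.
    @unittest.skipIf(SKIP_FLAKY_TEST, "Test is flaky.")
    class ExampleHalfPrecisionTest(unittest.TestCase):
        def test_placeholder(self):
            self.assertTrue(True)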
@@ -7,7 +7,7 @@ import torch
 from torch import nn
 from apex.mlp import MLP
-from apex.testing.common_utils import skipIfRocm
+from apex.testing.common_utils import skipFlakyTest
 
 batch_size = 1024
 mlp_sizes = [480, 1024, 1024, 512, 256, 1]
@@ -18,7 +18,7 @@ class TestMLP(unittest.TestCase):
     def test_creation(self):
         MLP(mlp_sizes)
 
-    @skipIfRocm
+    @skipFlakyTest
     def test_numeric(self):
         mlp = MLP(mlp_sizes).cuda()
@@ -53,7 +53,7 @@ class TestMLP(unittest.TestCase):
             ref_mlp[0].bias.grad.detach().cpu().numpy(),
             atol=1e-7, rtol=1e-5)
 
-    @skipIfRocm
+    @skipFlakyTest
     def test_no_bias(self):
         for use_activation in ['none', 'relu', 'sigmoid']:
             mlp = MLP(mlp_sizes, bias=False, activation=use_activation).cuda()
@@ -91,7 +91,7 @@ class TestMLP(unittest.TestCase):
             ref_mlp[0].weight.grad.detach().cpu().numpy(),
             atol=1e-7, rtol=100)
 
-    @skipIfRocm
+    @skipFlakyTest
     def test_with_bias(self):
         for use_activation in ['none', 'relu', 'sigmoid']:
             mlp = MLP(mlp_sizes, bias=True, activation=use_activation).cuda()
@@ -134,7 +134,7 @@ class TestMLP(unittest.TestCase):
             ref_mlp[0].bias.grad.detach().cpu().numpy(),
             atol=1e-7, rtol=1e-5)
 
-    @skipIfRocm
+    @skipFlakyTest
     def test_no_grad(self):
         mlp = MLP(mlp_sizes).cuda()
@@ -165,7 +165,6 @@ class TestMLP(unittest.TestCase):
             ref_mlp[0].weight.grad.detach().cpu().numpy(),
             atol=1e-7, rtol=1e-5)
 
-    @skipIfRocm
     def test_performance_half(self):
         mlp = MLP(mlp_sizes).cuda().half()
@@ -195,7 +194,7 @@ class TestMLP(unittest.TestCase):
         mlp.zero_grad()
         test_loss.backward()
 
-        torch.cuda.profiler.start()
+        #torch.cuda.profiler.start()
         torch.cuda.synchronize()
         start_time = time()
         for _ in range(num_iters):
@@ -217,7 +216,7 @@ class TestMLP(unittest.TestCase):
         torch.cuda.synchronize()
         stop_time = time()
         print(F"C++ MLP time {(stop_time - start_time) * 1000. / num_iters:.4f} ms")
-        torch.cuda.profiler.stop()
+        #torch.cuda.profiler.stop()
 
 if __name__ == '__main__':
     unittest.main()
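The test_performance_half hunks keep the standard pattern for wall-clocking asynchronous CUDA work: synchronize, take a timestamp, run the loop, synchronize again (only the explicit profiler start/stop markers are commented out). A standalone sketch of the same pattern, with illustrative names:

    import torch
    from time import time

    def time_cuda_ms(fn, num_iters=100):
        fn()                        # warm-up so one-time initialization is excluded
        torch.cuda.synchronize()    # drain already-queued kernels before starting the clock
        start = time()
        for _ in range(num_iters):
            fn()
        torch.cuda.synchronize()    # wait for all queued iterations to finish
        return (time() - start) * 1000. / num_iters

    # usage, e.g.: print(f"MLP time {time_cuda_ms(lambda: mlp(x)):.4f} ms")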
@@ -95,6 +95,7 @@ class TestFusedAdam(TestFusedOptimizer):
         self.ref_optim = torch.optim.Adam
         self.fused_optim = apex.optimizers.FusedAdam
 
+    @unittest.skip("Skipped due to a regression introduced by PyTorch upstream (https://github.com/pytorch/pytorch/issues/80809#issuecomment-1175211598). Please also refer to https://github.com/ROCmSoftwarePlatform/apex/issues/82")
     def test_float(self):
         self.gen_single_type_test(param_type=torch.float)
@@ -107,6 +108,7 @@ class TestFusedAdam(TestFusedOptimizer):
     def test_bfloat16(self):
         self.gen_single_type_test(param_type=torch.bfloat16, skip_assert=True)
 
+    @unittest.skip("Skipped due to a regression introduced by PyTorch upstream (https://github.com/pytorch/pytorch/issues/80809#issuecomment-1175211598). Please also refer to https://github.com/ROCmSoftwarePlatform/apex/issues/82")
     @unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
     def test_multi_device(self):
         devices = ("cuda:0", "cuda:1")
@@ -173,6 +175,7 @@ class TestFusedAdam(TestFusedOptimizer):
         self.assertLessEqual(max_abs_diff, self.max_abs_diff)
         self.assertLessEqual(max_rel_diff, self.max_rel_diff)
 
+    @unittest.skip("Skipped due to a regression introduced by PyTorch upstream (https://github.com/pytorch/pytorch/issues/80809#issuecomment-1175211598). Please also refer to https://github.com/ROCmSoftwarePlatform/apex/issues/82")
     def test_adam_option(self):
         nelem = 1
         adam_option = {'lr':0.01, 'betas':(0.6, 0.9), 'eps':3e-06,
...
@@ -285,6 +285,7 @@ class TestFusedMixedPrecisionLamb(TestLamb):
     def test_half(self):
         self.gen_single_type_test(param_type=torch.float16)
 
+    @unittest.skip("Skipped since it failed the accuracy test on PyTorch as of 8/1/2022. Please refer to https://github.com/ROCmSoftwarePlatform/apex/issues/83")
     @unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
     def test_multi_device(self):
         devices = ("cuda:0", "cuda:1")
...
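The device-count guard above is evaluated once, when the class body executes: torch.cuda.device_count() runs at decoration time, so single-GPU machines report the test as skipped rather than failing. A self-contained sketch of the idiom (the test body is illustrative):

    import unittest
    import torch

    class MultiDeviceExample(unittest.TestCase):
        # The condition is computed when the decorator is applied, not per run.
        @unittest.skipIf(torch.cuda.device_count() < 2, "more than 1 GPU required")
        def test_multi_device(self):
            devices = ("cuda:0", "cuda:1")
            tensors = [torch.ones(8, device=d) for d in devices]
            self.assertEqual(len(tensors), 2)

    if __name__ == "__main__":
        unittest.main()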
 #!/bin/bash
-APEX_TEST_WITH_ROCM=1 python run_test.py
+APEX_TEST_WITH_ROCM=1 APEX_SKIP_FLAKY_TEST=1 python run_test.py
"""
import unittest import unittest
import sys import sys
from apex.testing.common_utils import TEST_WITH_ROCM from apex.testing.common_utils import TEST_WITH_ROCM
from apex.testing.common_utils import SKIP_FLAKY_TEST
test_dirs = ["run_amp", "run_fp16util", "run_optimizers", "run_fused_layer_norm", "run_pyprof_nvtx", "run_pyprof_data", "run_mlp", "run_transformer"] test_dirs = ["run_amp", "run_fp16util", "run_optimizers", "run_fused_layer_norm", "run_pyprof_nvtx", "run_pyprof_data", "run_mlp", "run_transformer"]
...@@ -15,7 +17,7 @@ runner = unittest.TextTestRunner(verbosity=2) ...@@ -15,7 +17,7 @@ runner = unittest.TextTestRunner(verbosity=2)
errcode = 0 errcode = 0
for test_dir in test_dirs: for test_dir in test_dirs:
if (test_dir in ROCM_BLACKLIST) and TEST_WITH_ROCM: if (test_dir in ROCM_BLACKLIST) and TEST_WITH_ROCM and SKIP_FLAKY_TEST:
continue continue
suite = unittest.TestLoader().discover(test_dir) suite = unittest.TestLoader().discover(test_dir)
...@@ -27,3 +29,79 @@ for test_dir in test_dirs: ...@@ -27,3 +29,79 @@ for test_dir in test_dirs:
errcode = 1 errcode = 1
sys.exit(errcode) sys.exit(errcode)
"""
+############################################################
+"""L0 Tests Runner.
+
+How to run this script?
+
+1. Run all the tests: `python /path/to/apex/tests/L0/run_test.py`
+2. Run one of the tests (e.g. fused layer norm):
+   `python /path/to/apex/tests/L0/run_test.py --include run_fused_layer_norm`
+3. Run two or more of the tests (e.g. optimizers and fused layer norm):
+   `python /path/to/apex/tests/L0/run_test.py --include run_optimizers run_fused_layer_norm`
+"""
+import argparse
+import os
+import unittest
+import sys
+
+from apex.testing.common_utils import TEST_WITH_ROCM
+from apex.testing.common_utils import SKIP_FLAKY_TEST
+
+TEST_ROOT = os.path.dirname(os.path.abspath(__file__))
+TEST_DIRS = [
+    "run_amp",
+    "run_fp16util",
+    "run_optimizers",
+    "run_fused_layer_norm",
+    "run_mlp",
+    "run_transformer",  # not fully supported on ROCm
+]
+DEFAULT_TEST_DIRS = [
+    "run_amp",
+    "run_fp16util",
+    "run_optimizers",
+    "run_fused_layer_norm",
+    "run_mlp",
+]
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="L0 test runner",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument(
+        "--include",
+        nargs="+",
+        choices=TEST_DIRS,
+        default=DEFAULT_TEST_DIRS,
+        help="select a set of tests to run (the default excludes run_transformer).",
+    )
+    args, _ = parser.parse_known_args()
+    return args
+
+
+def main(args):
+    runner = unittest.TextTestRunner(verbosity=2)
+    errcode = 0
+    for test_dir in args.include:
+        test_dir = os.path.join(TEST_ROOT, test_dir)
+        print(test_dir)
+        suite = unittest.TestLoader().discover(test_dir)
+        print("\nExecuting tests from " + test_dir)
+        result = runner.run(suite)
+        if not result.wasSuccessful():
+            errcode = 1
+    sys.exit(errcode)
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    main(args)
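A small detail in parse_args(): it uses parse_known_args(), so flags the runner does not define are ignored instead of raising an error, which lets CI wrappers pass extra options straight through. A minimal standalone illustration (the flag names are invented for the example):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--include", nargs="+", default=["run_amp"])
    # parse_known_args returns (namespace, leftovers) instead of erroring on unknowns
    args, leftover = parser.parse_known_args(["--include", "run_mlp", "--ci-shard", "2"])
    print(args.include)   # ['run_mlp']
    print(leftover)       # ['--ci-shard', '2']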