Commit f1f28ff6 authored by hubertlu-tw

Merge remote-tracking branch 'origin/dev/hubertlu/flaky_tests' into IFU-master-2022-07-29

parents 57dea7f2 4d567459
@@ -10,6 +10,7 @@ import unittest
 TEST_WITH_ROCM = os.getenv('APEX_TEST_WITH_ROCM', '0') == '1'
+SKIP_FLAKY_TEST = os.getenv('APEX_SKIP_FLAKY_TEST', '0') == '1'
 
 ## Wrapper to skip the unit tests.
 def skipIfRocm(fn):
@@ -20,3 +21,13 @@ def skipIfRocm(fn):
         else:
             fn(*args, **kwargs)
     return wrapper
+
+## Wrapper to skip the flaky unit tests.
+def skipFlakyTest(fn):
+    @wraps(fn)
+    def wrapper(*args, **kwargs):
+        if SKIP_FLAKY_TEST:
+            raise unittest.SkipTest("Test is flaky.")
+        else:
+            fn(*args, **kwargs)
+    return wrapper
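The new decorator mirrors skipIfRocm: tests opt in per function, and the skip is toggled at run time through APEX_SKIP_FLAKY_TEST. Note that the flag is read with os.getenv at import time, so it must be set before apex.testing.common_utils is first imported. A minimal usage sketch (the test class and body below are illustrative, not part of this commit):

    import os
    os.environ["APEX_SKIP_FLAKY_TEST"] = "1"  # must be set before the apex import below

    import unittest
    from apex.testing.common_utils import skipFlakyTest

    class ExampleTest(unittest.TestCase):
        @skipFlakyTest
        def test_sometimes_flaky(self):
            self.assertTrue(True)  # reported as skipped with reason "Test is flaky."

    if __name__ == "__main__":
        unittest.main()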
@@ -8,7 +8,7 @@ import torch.optim as optim
 from apex import amp
 from utils import common_init, FLOAT
+from apex.testing.common_utils import skipFlakyTest
 
 class MyModel(torch.nn.Module):
     def __init__(self):
@@ -161,6 +161,7 @@ class TestCheckpointing(unittest.TestCase):
                 # skip tests for different opt_levels
                 continue
 
+    @skipFlakyTest
     def test_loss_scale_decrease(self):
         num_losses = 3
         nb_decrease_loss_scales = [0, 1, 2]
...
@@ -4,7 +4,7 @@ import unittest
 import torch
 import apex
+from apex.testing.common_utils import skipFlakyTest
 
 class TestFusedLayerNorm(unittest.TestCase):
     dtype = torch.float
@@ -180,6 +180,7 @@ class TestMixedFusedRMSNormElemWise(TestFusedRMSNorm):
     elementwise_affine = True
     mixed_fused = True
 
+@skipFlakyTest
 class TestFusedRMSNormElemWiseHalf(TestFusedRMSNormElemWise):
     dtype = torch.half
     bwd_thresholds = dict(rtol=1.6e-2, atol=3e-3)
@@ -188,6 +189,7 @@ class TestFusedRMSNormElemWiseHalf(TestFusedRMSNormElemWise):
         self.skipTest("Skip to save time")
 
+@skipFlakyTest
 class TestFusedLayerNormElemWiseBFloat16(TestFusedLayerNormElemWise):
     dtype = torch.bfloat16
     # NOTE (mkozuki): [BFloat16 Layer Norm flakiness]
...
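One caveat worth flagging: here skipFlakyTest is applied to whole TestCase classes, but as written it is a function-returning decorator (@wraps(fn) around wrapper), so the decorated class name ends up bound to a plain function and unittest discovery no longer collects it as a TestCase, regardless of APEX_SKIP_FLAKY_TEST. The stock idiom for conditional class-level skipping is unittest.skipIf, which preserves the class; a sketch under that assumption (the class and test body are illustrative):

    import unittest
    from apex.testing.common_utils import SKIP_FLAKY_TEST

    # unittest.skipIf keeps the decorated object a TestCase subclass,
    # so discovery still finds it and reports each test as skipped.
    @unittest.skipIf(SKIP_FLAKY_TEST, "Test is flaky.")
    class ExampleHalfPrecisionTest(unittest.TestCase):
        def test_placeholder(self):
            self.assertTrue(True)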
@@ -7,7 +7,7 @@ import torch
 from torch import nn
 from apex.mlp import MLP
-from apex.testing.common_utils import skipIfRocm
+from apex.testing.common_utils import skipFlakyTest
 
 batch_size = 1024
 mlp_sizes = [480, 1024, 1024, 512, 256, 1]
@@ -18,7 +18,7 @@ class TestMLP(unittest.TestCase):
     def test_creation(self):
         MLP(mlp_sizes)
 
-    @skipIfRocm
+    @skipFlakyTest
     def test_numeric(self):
         mlp = MLP(mlp_sizes).cuda()
@@ -53,7 +53,7 @@ class TestMLP(unittest.TestCase):
             ref_mlp[0].bias.grad.detach().cpu().numpy(),
             atol=1e-7, rtol=1e-5)
 
-    @skipIfRocm
+    @skipFlakyTest
     def test_no_bias(self):
         for use_activation in ['none', 'relu', 'sigmoid']:
             mlp = MLP(mlp_sizes, bias=False, activation=use_activation).cuda()
@@ -91,7 +91,7 @@ class TestMLP(unittest.TestCase):
             ref_mlp[0].weight.grad.detach().cpu().numpy(),
             atol=1e-7, rtol=100)
 
-    @skipIfRocm
+    @skipFlakyTest
     def test_with_bias(self):
         for use_activation in ['none', 'relu', 'sigmoid']:
             mlp = MLP(mlp_sizes, bias=True, activation=use_activation).cuda()
@@ -134,7 +134,7 @@ class TestMLP(unittest.TestCase):
             ref_mlp[0].bias.grad.detach().cpu().numpy(),
             atol=1e-7, rtol=1e-5)
 
-    @skipIfRocm
+    @skipFlakyTest
     def test_no_grad(self):
         mlp = MLP(mlp_sizes).cuda()
@@ -165,7 +165,6 @@ class TestMLP(unittest.TestCase):
             ref_mlp[0].weight.grad.detach().cpu().numpy(),
             atol=1e-7, rtol=1e-5)
 
-    @skipIfRocm
     def test_performance_half(self):
         mlp = MLP(mlp_sizes).cuda().half()
@@ -195,7 +194,7 @@ class TestMLP(unittest.TestCase):
         mlp.zero_grad()
         test_loss.backward()
 
-        torch.cuda.profiler.start()
+        #torch.cuda.profiler.start()
         torch.cuda.synchronize()
         start_time = time()
         for _ in range(num_iters):
@@ -217,7 +216,7 @@ class TestMLP(unittest.TestCase):
         torch.cuda.synchronize()
         stop_time = time()
         print(F"C++ MLP time {(stop_time - start_time) * 1000. / num_iters:.4f} ms")
-        torch.cuda.profiler.stop()
+        #torch.cuda.profiler.stop()
 
 if __name__ == '__main__':
     unittest.main()
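The test_performance_half hunks keep the standard pattern for wall-clocking asynchronous CUDA work: synchronize, take a timestamp, run the loop, synchronize again (only the explicit profiler start/stop markers are commented out). A standalone sketch of the same pattern, with illustrative names:

    import torch
    from time import time

    def time_cuda_ms(fn, num_iters=100):
        fn()                        # warm-up so one-time initialization is excluded
        torch.cuda.synchronize()    # drain already-queued kernels before starting the clock
        start = time()
        for _ in range(num_iters):
            fn()
        torch.cuda.synchronize()    # wait for all queued iterations to finish
        return (time() - start) * 1000. / num_iters

    # usage, e.g.: print(f"MLP time {time_cuda_ms(lambda: mlp(x)):.4f} ms")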
@@ -95,6 +95,7 @@ class TestFusedAdam(TestFusedOptimizer):
         self.ref_optim = torch.optim.Adam
         self.fused_optim = apex.optimizers.FusedAdam
 
+    @unittest.skip("Skipped due to a regression introduced by PyTorch upstream (https://github.com/pytorch/pytorch/issues/80809#issuecomment-1175211598). Please also refer to https://github.com/ROCmSoftwarePlatform/apex/issues/82")
     def test_float(self):
         self.gen_single_type_test(param_type=torch.float)
@@ -107,6 +108,7 @@ class TestFusedAdam(TestFusedOptimizer):
     def test_bfloat16(self):
         self.gen_single_type_test(param_type=torch.bfloat16, skip_assert=True)
 
+    @unittest.skip("Skipped due to a regression introduced by PyTorch upstream (https://github.com/pytorch/pytorch/issues/80809#issuecomment-1175211598). Please also refer to https://github.com/ROCmSoftwarePlatform/apex/issues/82")
     @unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
     def test_multi_device(self):
         devices = ("cuda:0", "cuda:1")
@@ -173,6 +175,7 @@ class TestFusedAdam(TestFusedOptimizer):
         self.assertLessEqual(max_abs_diff, self.max_abs_diff)
         self.assertLessEqual(max_rel_diff, self.max_rel_diff)
 
+    @unittest.skip("Skipped due to a regression introduced by PyTorch upstream (https://github.com/pytorch/pytorch/issues/80809#issuecomment-1175211598). Please also refer to https://github.com/ROCmSoftwarePlatform/apex/issues/82")
     def test_adam_option(self):
         nelem = 1
         adam_option = {'lr':0.01, 'betas':(0.6, 0.9), 'eps':3e-06,
...
@@ -285,6 +285,7 @@ class TestFusedMixedPrecisionLamb(TestLamb):
     def test_half(self):
         self.gen_single_type_test(param_type=torch.float16)
 
+    @unittest.skip("Skipped since it failed the accuracy test on PyTorch as of 8/1/2022. Please refer to https://github.com/ROCmSoftwarePlatform/apex/issues/83")
     @unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
     def test_multi_device(self):
         devices = ("cuda:0", "cuda:1")
...
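The device-count guard above is evaluated once, when the class body executes: torch.cuda.device_count() runs at decoration time, so single-GPU machines report the test as skipped rather than failing. A self-contained sketch of the idiom (the test body is illustrative):

    import unittest
    import torch

    class MultiDeviceExample(unittest.TestCase):
        # The condition is computed when the decorator is applied, not per run.
        @unittest.skipIf(torch.cuda.device_count() < 2, "more than 1 GPU required")
        def test_multi_device(self):
            devices = ("cuda:0", "cuda:1")
            tensors = [torch.ones(8, device=d) for d in devices]
            self.assertEqual(len(tensors), 2)

    if __name__ == "__main__":
        unittest.main()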
 #!/bin/bash
-APEX_TEST_WITH_ROCM=1 python run_test.py
+APEX_TEST_WITH_ROCM=1 APEX_SKIP_FLAKY_TEST=1 python run_test.py
"""
import unittest import unittest
import sys import sys
from apex.testing.common_utils import TEST_WITH_ROCM from apex.testing.common_utils import TEST_WITH_ROCM
from apex.testing.common_utils import SKIP_FLAKY_TEST
test_dirs = ["run_amp", "run_fp16util", "run_optimizers", "run_fused_layer_norm", "run_pyprof_nvtx", "run_pyprof_data", "run_mlp", "run_transformer"] test_dirs = ["run_amp", "run_fp16util", "run_optimizers", "run_fused_layer_norm", "run_pyprof_nvtx", "run_pyprof_data", "run_mlp", "run_transformer"]
...@@ -15,7 +17,7 @@ runner = unittest.TextTestRunner(verbosity=2) ...@@ -15,7 +17,7 @@ runner = unittest.TextTestRunner(verbosity=2)
errcode = 0 errcode = 0
for test_dir in test_dirs: for test_dir in test_dirs:
if (test_dir in ROCM_BLACKLIST) and TEST_WITH_ROCM: if (test_dir in ROCM_BLACKLIST) and TEST_WITH_ROCM and SKIP_FLAKY_TEST:
continue continue
suite = unittest.TestLoader().discover(test_dir) suite = unittest.TestLoader().discover(test_dir)
...@@ -27,3 +29,79 @@ for test_dir in test_dirs: ...@@ -27,3 +29,79 @@ for test_dir in test_dirs:
errcode = 1 errcode = 1
sys.exit(errcode) sys.exit(errcode)
"""
+############################################################
+"""L0 Tests Runner.
+
+How to run this script?
+
+1. Run all the tests: `python /path/to/apex/tests/L0/run_test.py`
+2. Run one of the tests (e.g. fused layer norm):
+   `python /path/to/apex/tests/L0/run_test.py --include run_fused_layer_norm`
+3. Run two or more of the tests (e.g. optimizers and fused layer norm):
+   `python /path/to/apex/tests/L0/run_test.py --include run_optimizers run_fused_layer_norm`
+"""
+import argparse
+import os
+import unittest
+import sys
+
+from apex.testing.common_utils import TEST_WITH_ROCM
+from apex.testing.common_utils import SKIP_FLAKY_TEST
+
+TEST_ROOT = os.path.dirname(os.path.abspath(__file__))
+TEST_DIRS = [
+    "run_amp",
+    "run_fp16util",
+    "run_optimizers",
+    "run_fused_layer_norm",
+    "run_mlp",
+    "run_transformer",  # not fully supported on ROCm
+]
+DEFAULT_TEST_DIRS = [
+    "run_amp",
+    "run_fp16util",
+    "run_optimizers",
+    "run_fused_layer_norm",
+    "run_mlp",
+]
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="L0 test runner",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument(
+        "--include",
+        nargs="+",
+        choices=TEST_DIRS,
+        default=DEFAULT_TEST_DIRS,
+        help="select a set of tests to run (the default excludes run_transformer).",
+    )
+    args, _ = parser.parse_known_args()
+    return args
+
+
+def main(args):
+    runner = unittest.TextTestRunner(verbosity=2)
+    errcode = 0
+    for test_dir in args.include:
+        test_dir = os.path.join(TEST_ROOT, test_dir)
+        print(test_dir)
+        suite = unittest.TestLoader().discover(test_dir)
+        print("\nExecuting tests from " + test_dir)
+        result = runner.run(suite)
+        if not result.wasSuccessful():
+            errcode = 1
+    sys.exit(errcode)
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    main(args)
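A small detail in parse_args(): it uses parse_known_args(), so flags the runner does not define are ignored instead of raising an error, which lets CI wrappers pass extra options straight through. A minimal standalone illustration (the flag names are invented for the example):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--include", nargs="+", default=["run_amp"])
    # parse_known_args returns (namespace, leftovers) instead of erroring on unknowns
    args, leftover = parser.parse_known_args(["--include", "run_mlp", "--ci-shard", "2"])
    print(args.include)   # ['run_mlp']
    print(leftover)       # ['--ci-shard', '2']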