Unverified commit 7e1068b3, authored by Tim Moon, committed by GitHub

[PyTorch] Port fused optimizer tests to pytest (#1185)



Port optimizer tests to pytest
Signed-off-by: Tim Moon <tmoon@nvidia.com>
parent eb60b1ab
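
The port applies the standard unittest-to-pytest translation throughout the file: test classes drop the unittest.TestCase base, setUp/tearDown become pytest's xunit-style setup_method hook, and @unittest.skipIf decorators become @pytest.mark.skipif with an explicit reason keyword. A condensed sketch of the pattern (illustrative, not part of the commit):

    import pytest
    import torch

    # Plain classes named Test* are collected by pytest; no base class needed.
    class TestFusedOptimizer:

        def setup_method(self) -> None:
            # Runs before every test method, like unittest's setUp.
            torch.manual_seed(9876)

        # skipif takes the reason as a keyword, unlike unittest.skipIf.
        @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="more than 1 GPU required")
        def test_multi_device(self):
            ...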
@@ -3,9 +3,9 @@
 # See LICENSE for license information.
 
 from itertools import product
-import unittest
 import copy
 
+import pytest
 import torch
 from torch import nn
 from torch.testing._internal.common_device_type import largeTensorTest
@@ -19,14 +19,12 @@ from transformer_engine.pytorch.fp8 import FP8GlobalStateManager
 fp8_available, reason_for_no_fp8 = FP8GlobalStateManager.is_fp8_available()
 
 
-class TestFusedOptimizer(unittest.TestCase):
-    def setUp(self, iters=7):
+class TestFusedOptimizer:
+
+    def setup_method(self, *, iters: int = 7) -> None:
         self.iters = iters
         torch.manual_seed(9876)
 
-    def tearDown(self):
-        pass
-
     def gen_param_optim(self, tensors, options, tst_options=None):
 
         # Adding this to make backward compatible with existing tests. Just in
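
Two details worth noting in this hunk. First, the empty tearDown can simply be deleted: pytest runs setup_method before each test on its own, and cleanup, if it were needed, would go in the matching teardown_method hook. Second, making iters keyword-only is likely deliberate: pytest may pass the test method as a positional argument to setup_method, and a keyword-only parameter can never capture it. A minimal sketch of the hook pair (hypothetical class, not from the commit):

    class TestExample:

        def setup_method(self) -> None:
            # Called before every test method, like unittest's setUp.
            self.records = []

        def teardown_method(self) -> None:
            # Called after every test method, like unittest's tearDown.
            self.records.clear()

        def test_append(self):
            self.records.append(1)
            assert self.records == [1]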
@@ -88,8 +86,8 @@ class TestFusedOptimizer(unittest.TestCase):
 
 class TestFusedAdam(TestFusedOptimizer):
-    def setUp(self):
-        super().setUp()
+    def setup_method(self) -> None:
+        super().setup_method()
         self.options = {
             "lr": 5e-4,
             "betas": (0.9, 0.999),
@@ -111,7 +109,7 @@ class TestFusedAdam(TestFusedOptimizer):
     def test_bfloat16(self):
         self.gen_single_type_test(param_type=torch.bfloat16, skip_assert=True)
 
-    @unittest.skipIf(torch.cuda.device_count() < 2, "more than 1 GPU required")
+    @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="more than 1 GPU required")
     def test_multi_device(self):
         devices = ("cuda:0", "cuda:1")
         for current_dev, tensor_dev in product(devices, devices):
@@ -176,7 +174,7 @@ class TestFusedAdam(TestFusedOptimizer):
             torch.testing.assert_close(ref_param, tst_param)
 
-    @unittest.skipIf(not is_bf16_compatible(), "bf16 if not supported")
+    @pytest.mark.skipif(not is_bf16_compatible(), reason="bf16 if not supported")
     def test_bf16_model_weight_cast(self):
         dtype = torch.bfloat16
         model = MultiheadAttention(
@@ -214,7 +212,7 @@ class TestFusedAdam(TestFusedOptimizer):
             ref_params, model_params_to_fp32, rtol=1e-3, atol=1e-3, equal_nan=True
         )
 
-    @unittest.skipIf(not fp8_available, reason=reason_for_no_fp8)
+    @pytest.mark.skipif(not fp8_available, reason=reason_for_no_fp8)
     def test_fp8_model_weight_cast(self):
         dtype = torch.bfloat16
         with fp8_model_init(enabled=True):
@@ -255,8 +253,9 @@
 
 
 class TestFusedSGD(TestFusedOptimizer):
-    def __init__(self, *args, **kwargs):
-        super(TestFusedSGD, self).__init__(*args, **kwargs)
+
+    def setup_method(self) -> None:
+        super().setup_method()
         self.options = {"lr": 0.25, "momentum": 0.125}
         self.ref_optim = torch.optim.SGD
         self.fused_optim = te.optimizers.FusedSGD
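
The SGD suite needs a different fix: pytest refuses to collect test classes that define __init__ (it emits a PytestCollectionWarning and skips the whole class), so the option setup moves from the constructor into setup_method. An illustration (hypothetical classes, not from the commit):

    class TestWithConstructor:
        def __init__(self):  # PytestCollectionWarning: the whole class is skipped
            self.options = {"lr": 0.25, "momentum": 0.125}

        def test_never_runs(self):
            assert self.options["lr"] == 0.25

    class TestWithHook:
        def setup_method(self) -> None:  # collected and run normally
            self.options = {"lr": 0.25, "momentum": 0.125}

        def test_options(self):
            assert self.options["momentum"] == 0.125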
@@ -267,7 +266,7 @@ class TestFusedSGD(TestFusedOptimizer):
     def test_half(self):
         self.gen_single_type_test(param_type=torch.float16)
 
-    @unittest.skipIf(torch.cuda.device_count() < 2, "more than 1 GPU required")
+    @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="more than 1 GPU required")
     def test_multi_device(self):
         devices = ("cuda:0", "cuda:1")
         for current_dev, tensor_dev in product(devices, devices):
@@ -308,9 +307,9 @@ class Model(torch.nn.Module):
         return y
 
 
-class AdamTest(unittest.TestCase):
-    def setUp(self, seed=0):
-        super().setUp()
+class AdamTest:
+
+    def setup_method(self, *, seed: int = 0) -> None:
         torch.manual_seed(seed)
         self.model = Model().cuda()
@@ -321,7 +320,7 @@ class AdamTest(unittest.TestCase):
         params = [p for p in self.model.parameters() if p.requires_grad]
         self.optimizer = torch.optim.Adam(params, lr=self.lr)
 
-    def testGradScaler(self):
+    def test_grad_scaler(self):
         params_ = [p for p in self.model_.parameters() if p.requires_grad]
         optimizer_ = te.optimizers.FusedAdam(params_, lr=self.lr, capturable=False)
         scaler = torch.cuda.amp.GradScaler(enabled=True)
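
Only the name changes here (pytest collects test_* methods, and the camelCase names were unittest leftovers), but for context the body follows the standard torch.cuda.amp.GradScaler recipe, comparing FusedAdam against a torch.optim.Adam reference. A minimal sketch of that loop (hypothetical model, loss_fn, and loader; the real test asserts parameter parity between the two optimizers):

    scaler = torch.cuda.amp.GradScaler(enabled=True)
    optimizer = te.optimizers.FusedAdam(model.parameters(), lr=1e-3, capturable=False)

    for inputs, targets in loader:
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            loss = loss_fn(model(inputs), targets)
        scaler.scale(loss).backward()  # backprop through the scaled loss
        scaler.step(optimizer)         # unscales grads; skips the step on inf/nan
        scaler.update()                # adjusts the loss scale for the next step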
@@ -372,7 +371,7 @@ class AdamTest(unittest.TestCase):
         self.model_.load_state_dict(copy.deepcopy(self.model.state_dict()))
 
-    def testGradScalerCapturable(self):
+    def test_grad_scaler_capturable(self):
         params_ = [p for p in self.model_.parameters() if p.requires_grad]
         optimizer_ = te.optimizers.FusedAdam(params_, lr=self.lr, capturable=True)
         scaler = torch.cuda.amp.GradScaler(enabled=True)
@@ -423,7 +422,7 @@ class AdamTest(unittest.TestCase):
         self.model_.load_state_dict(copy.deepcopy(self.model.state_dict()))
 
-    def testGradScalerCapturableMaster(self):
+    def test_grad_scaler_capturable_master(self):
         # Cast conv layers to FP16
         for m in self.model_.modules():
             if m.__class__ in [torch.nn.Conv2d]:
@@ -485,7 +484,7 @@ class AdamTest(unittest.TestCase):
         self.model_.load_state_dict(copy.deepcopy(self.model.state_dict()))
 
-    def testNative(self):
+    def test_native(self):
         params_ = [p for p in self.model_.parameters() if p.requires_grad]
         optimizer_ = te.optimizers.FusedAdam(params_, lr=self.lr, capturable=False)
@@ -531,7 +530,7 @@ class AdamTest(unittest.TestCase):
         self.model_.load_state_dict(copy.deepcopy(self.model.state_dict()))
 
     @largeTensorTest("60GB", "cuda")
-    def testLargeTensor(self):
+    def test_large_tensor(self):
         t = torch.zeros(2359332864, dtype=torch.half, device="cuda")
         t2 = torch.zeros(2359332864, dtype=torch.half, device="cuda")
         grad = torch.randn_like(t)
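
For scale: each tensor in test_large_tensor holds 2,359,332,864 half-precision elements, and @largeTensorTest("60GB", "cuda") skips the test on devices that cannot supply that much memory. A back-of-the-envelope check of the reservation (illustrative only; exact optimizer-state sizes depend on FusedAdam's configuration):

    n = 2_359_332_864                 # elements per test tensor
    gib_per_fp16 = 2 * n / 2**30      # ~4.4 GiB for one fp16 tensor
    # t, t2, and one gradient each: ~17.6 GiB before any Adam state
    # (exp_avg/exp_avg_sq per parameter), so 60GB leaves comfortable headroom.
    print(f"{gib_per_fp16:.1f} GiB per tensor, {4 * gib_per_fp16:.1f} GiB params+grads")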
...