Unverified Commit f566c6e3 authored by Sylvain Gugger's avatar Sylvain Gugger Committed by GitHub
Browse files

Fix failing GPU trainer tests (#14903)

* Fix failing GPU trainer tests

* Remove print statements
parent fe4197ab
...@@ -130,6 +130,7 @@ class TestTrainerExt(TestCasePlus): ...@@ -130,6 +130,7 @@ class TestTrainerExt(TestCasePlus):
self.run_seq2seq_quick(distributed=True, extra_args_str="--sharded_ddp simple") self.run_seq2seq_quick(distributed=True, extra_args_str="--sharded_ddp simple")
# test --sharded_ddp w/ --fp16 # test --sharded_ddp w/ --fp16
@unittest.skip("Requires an update of the env running those tests")
@require_torch_multi_gpu @require_torch_multi_gpu
@require_fairscale @require_fairscale
def test_run_seq2seq_sharded_ddp_fp16(self): def test_run_seq2seq_sharded_ddp_fp16(self):
...@@ -142,6 +143,7 @@ class TestTrainerExt(TestCasePlus): ...@@ -142,6 +143,7 @@ class TestTrainerExt(TestCasePlus):
self.run_seq2seq_quick(distributed=True, extra_args_str="--sharded_ddp zero_dp_2", predict_with_generate=False) self.run_seq2seq_quick(distributed=True, extra_args_str="--sharded_ddp zero_dp_2", predict_with_generate=False)
# test --sharded_ddp zero_dp_2 w/ --fp16 # test --sharded_ddp zero_dp_2 w/ --fp16
@unittest.skip("Requires an update of the env running those tests")
@require_torch_multi_gpu @require_torch_multi_gpu
@require_fairscale @require_fairscale
def test_run_seq2seq_fully_sharded_ddp_fp16(self): def test_run_seq2seq_fully_sharded_ddp_fp16(self):
......
...@@ -1093,17 +1093,13 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon): ...@@ -1093,17 +1093,13 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
self.assertIsInstance(loader.sampler, torch.utils.data.dataloader._InfiniteConstantSampler) self.assertIsInstance(loader.sampler, torch.utils.data.dataloader._InfiniteConstantSampler)
def test_training_finite_iterable_dataset(self): def test_training_finite_iterable_dataset(self):
num_gpus = max(1, get_gpu_count())
if num_gpus > 2:
return
config = RegressionModelConfig() config = RegressionModelConfig()
model = RegressionPreTrainedModel(config) model = RegressionPreTrainedModel(config)
batch_size = 1 batch_size = 1
num_samples = 10 num_samples = 10
available_steps = num_samples // (batch_size * num_gpus) available_steps = num_samples // batch_size
data = FiniteIterableDataset(length=num_samples) data = FiniteIterableDataset(length=num_samples)
train_args = TrainingArguments( train_args = TrainingArguments(
...@@ -1510,7 +1506,6 @@ class TrainerIntegrationWithHubTester(unittest.TestCase): ...@@ -1510,7 +1506,6 @@ class TrainerIntegrationWithHubTester(unittest.TestCase):
expected_commits = [f"Training in progress, epoch {i}" for i in range(3, 0, -1)] expected_commits = [f"Training in progress, epoch {i}" for i in range(3, 0, -1)]
expected_commits.append("initial commit") expected_commits.append("initial commit")
self.assertListEqual(commits, expected_commits) self.assertListEqual(commits, expected_commits)
print(commits, len(commits))
def test_push_to_hub_with_saves_each_n_steps(self): def test_push_to_hub_with_saves_each_n_steps(self):
num_gpus = max(1, get_gpu_count()) num_gpus = max(1, get_gpu_count())
...@@ -1534,7 +1529,6 @@ class TrainerIntegrationWithHubTester(unittest.TestCase): ...@@ -1534,7 +1529,6 @@ class TrainerIntegrationWithHubTester(unittest.TestCase):
expected_commits = [f"Training in progress, step {i}" for i in range(total_steps, 0, -5)] expected_commits = [f"Training in progress, step {i}" for i in range(total_steps, 0, -5)]
expected_commits.append("initial commit") expected_commits.append("initial commit")
self.assertListEqual(commits, expected_commits) self.assertListEqual(commits, expected_commits)
print(commits, len(commits))
@require_torch @require_torch
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment