[cuda ext tests] fixing tests (#11619)

* fixing tests
* cleanup

[cuda ext tests] fixing tests (#11619)
* fixing tests
* cleanup
619200cc · Stas Bekman · GitHub · 44c5621d · 619200cc · 619200cc
Unverified commit 619200cc, authored May 06, 2021 by Stas Bekman; committed by GitHub on May 06, 2021.
3 changed files
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -261,6 +261,7 @@ jobs:

      - name: Install dependencies
        run: |
+          apt -y update && apt install -y libaio-dev
          pip install --upgrade pip
          pip install .[testing,deepspeed]

@@ -301,6 +302,7 @@ jobs:

      - name: Install dependencies
        run: |
+          apt -y update && apt install -y libaio-dev
          pip install --upgrade pip
          pip install .[testing,deepspeed,fairscale]


--- a/tests/deepspeed/test_deepspeed.py
+++ b/tests/deepspeed/test_deepspeed.py
@@ -318,9 +318,10 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
            yes_grad_accum_b = yes_grad_accum_trainer.model.b.item()
            self.assertNotEqual(yes_grad_accum_a, a)

-        # training with half the batch size but accumulation steps as 2 should give the same weights
-        self.assertEqual(no_grad_accum_a, yes_grad_accum_a)
-        self.assertEqual(no_grad_accum_b, yes_grad_accum_b)
+        # training with half the batch size but accumulation steps as 2 should give the same
+        # weights, but sometimes get a slight difference still of 1e-6
+        self.assertAlmostEqual(no_grad_accum_a, yes_grad_accum_a, places=5)
+        self.assertAlmostEqual(no_grad_accum_b, yes_grad_accum_b, places=5)

        # see the note above how to get identical loss on a small bs
        self.assertAlmostEqual(no_grad_accum_loss, yes_grad_accum_loss, places=5)

--- a/tests/extended/test_trainer_ext.py
+++ b/tests/extended/test_trainer_ext.py
@@ -167,8 +167,8 @@ class TestTrainerExt(TestCasePlus):
        # test if do_predict saves generations and metrics
        contents = os.listdir(output_dir)
        contents = {os.path.basename(p) for p in contents}
-        assert "test_generations.txt" in contents
-        assert "test_results.json" in contents
+        assert "generated_predictions.txt" in contents
+        assert "predict_results.json" in contents

    def run_trainer(
        self,