Unverified Commit f4b0b26f authored by Sayak Paul, committed by GitHub

[Tests] Speed up example tests (#6319)

* remove validation args from textual inversion tests

* reduce number of train steps in textual inversion tests

* fix: directories.

* debug

* fix: directories.

* remove validation tests from textual inversion

* try reducing the time of test_text_to_image_checkpointing_use_ema

* fix: directories

* speed up test_text_to_image_checkpointing

* speed up test_text_to_image_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints

* fix

* speed up test_instruct_pix2pix_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints

* set checkpoints_total_limit to 2.

* test_text_to_image_lora_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints speed up

* speed up test_unconditional_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints

* debug

* fix: directories.

* speed up test_instruct_pix2pix_checkpointing_checkpoints_total_limit

* speed up: test_controlnet_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints

* speed up test_controlnet_sdxl

* speed up dreambooth tests

* speed up test_dreambooth_lora_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints

* speed up test_custom_diffusion_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints

* speed up test_text_to_image_lora_sdxl_text_encoder_checkpointing_checkpoints_total_limit

* speed up the "checkpoint-2 should have been deleted" assertions

* speed up examples/text_to_image/test_text_to_image.py::TextToImage::test_text_to_image_checkpointing_checkpoints_total_limit

* additional speed ups

* style
parent 89459a5d
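
Most of the changes below follow one pattern: with `--checkpointing_steps=N` the example scripts save a `checkpoint-{step}` directory at every multiple of N up to `--max_train_steps`, and `--checkpoints_total_limit=K` prunes the oldest directories before each save so that at most K remain. The tests get faster by shrinking those numbers while still exercising the pruning path. Here is a minimal sketch of the arithmetic the assertions rely on — a simplified model that assumes resume checkpoints sit on checkpointing boundaries, not the training scripts' actual code:

```python
# Simplified model of checkpoint retention, for reasoning about the
# assertions below; the real logic lives in the example training scripts.
def expected_checkpoints(max_train_steps, checkpointing_steps, checkpoints_total_limit=None, existing=()):
    """Names of the checkpoint dirs left on disk after a (possibly resumed) run."""
    kept = sorted(existing)  # steps of checkpoints already on disk
    resume_step = kept[-1] if kept else 0  # assume we resume from the newest checkpoint
    for step in range(resume_step + checkpointing_steps, max_train_steps + 1, checkpointing_steps):
        if checkpoints_total_limit is not None and len(kept) >= checkpoints_total_limit:
            # prune the oldest dirs so at most `checkpoints_total_limit` remain after saving
            kept = kept[len(kept) - checkpoints_total_limit + 1 :]
        kept.append(step)
    return {f"checkpoint-{s}" for s in kept}

# Initial run: checkpoints at steps 2 and 4.
assert expected_checkpoints(4, 2) == {"checkpoint-2", "checkpoint-4"}
# Resume from checkpoint-4 for 8 total steps with a limit of 2:
# saves at 6 and 8, pruning checkpoint-2 and then checkpoint-4.
assert expected_checkpoints(8, 2, checkpoints_total_limit=2, existing=(2, 4)) == {"checkpoint-6", "checkpoint-8"}
```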
@@ -65,7 +65,7 @@ class ControlNet(ExamplesTestsAccelerate):
--train_batch_size=1
--gradient_accumulation_steps=1
--controlnet_model_name_or_path=hf-internal-testing/tiny-controlnet
- --max_train_steps=9
+ --max_train_steps=6
--checkpointing_steps=2
""".split()
@@ -73,7 +73,7 @@ class ControlNet(ExamplesTestsAccelerate):
self.assertEqual(
{x for x in os.listdir(tmpdir) if "checkpoint" in x},
{"checkpoint-2", "checkpoint-4", "checkpoint-6", "checkpoint-8"},
{"checkpoint-2", "checkpoint-4", "checkpoint-6"},
)
resume_run_args = f"""
@@ -85,18 +85,15 @@ class ControlNet(ExamplesTestsAccelerate):
--train_batch_size=1
--gradient_accumulation_steps=1
--controlnet_model_name_or_path=hf-internal-testing/tiny-controlnet
- --max_train_steps=11
+ --max_train_steps=8
--checkpointing_steps=2
- --resume_from_checkpoint=checkpoint-8
- --checkpoints_total_limit=3
+ --resume_from_checkpoint=checkpoint-6
+ --checkpoints_total_limit=2
""".split()
run_command(self._launch_args + resume_run_args)
- self.assertEqual(
- {x for x in os.listdir(tmpdir) if "checkpoint" in x},
- {"checkpoint-8", "checkpoint-10", "checkpoint-12"},
- )
+ self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-6", "checkpoint-8"})
class ControlNetSDXL(ExamplesTestsAccelerate):
@@ -111,7 +108,7 @@ class ControlNetSDXL(ExamplesTestsAccelerate):
--train_batch_size=1
--gradient_accumulation_steps=1
--controlnet_model_name_or_path=hf-internal-testing/tiny-controlnet-sdxl
- --max_train_steps=9
+ --max_train_steps=4
--checkpointing_steps=2
""".split()
@@ -76,10 +76,7 @@ class CustomDiffusion(ExamplesTestsAccelerate):
run_command(self._launch_args + test_args)
- self.assertEqual(
- {x for x in os.listdir(tmpdir) if "checkpoint" in x},
- {"checkpoint-4", "checkpoint-6"},
- )
+ self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-4", "checkpoint-6"})
def test_custom_diffusion_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
with tempfile.TemporaryDirectory() as tmpdir:
@@ -93,7 +90,7 @@ class CustomDiffusion(ExamplesTestsAccelerate):
--train_batch_size=1
--modifier_token=<new1>
--dataloader_num_workers=0
- --max_train_steps=9
+ --max_train_steps=4
--checkpointing_steps=2
--no_safe_serialization
""".split()
@@ -102,7 +99,7 @@ class CustomDiffusion(ExamplesTestsAccelerate):
self.assertEqual(
{x for x in os.listdir(tmpdir) if "checkpoint" in x},
{"checkpoint-2", "checkpoint-4", "checkpoint-6", "checkpoint-8"},
{"checkpoint-2", "checkpoint-4"},
)
resume_run_args = f"""
@@ -115,16 +112,13 @@ class CustomDiffusion(ExamplesTestsAccelerate):
--train_batch_size=1
--modifier_token=<new1>
--dataloader_num_workers=0
- --max_train_steps=11
+ --max_train_steps=8
--checkpointing_steps=2
- --resume_from_checkpoint=checkpoint-8
- --checkpoints_total_limit=3
+ --resume_from_checkpoint=checkpoint-4
+ --checkpoints_total_limit=2
--no_safe_serialization
""".split()
run_command(self._launch_args + resume_run_args)
- self.assertEqual(
- {x for x in os.listdir(tmpdir) if "checkpoint" in x},
- {"checkpoint-6", "checkpoint-8", "checkpoint-10"},
- )
+ self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-6", "checkpoint-8"})
@@ -89,7 +89,7 @@ class DreamBooth(ExamplesTestsAccelerate):
with tempfile.TemporaryDirectory() as tmpdir:
# Run training script with checkpointing
- # max_train_steps == 5, checkpointing_steps == 2
+ # max_train_steps == 4, checkpointing_steps == 2
# Should create checkpoints at steps 2, 4
initial_run_args = f"""
@@ -100,7 +100,7 @@ class DreamBooth(ExamplesTestsAccelerate):
--resolution 64
--train_batch_size 1
--gradient_accumulation_steps 1
- --max_train_steps 5
+ --max_train_steps 4
--learning_rate 5.0e-04
--scale_lr
--lr_scheduler constant
@@ -114,7 +114,7 @@ class DreamBooth(ExamplesTestsAccelerate):
# check can run the original fully trained output pipeline
pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None)
- pipe(instance_prompt, num_inference_steps=2)
+ pipe(instance_prompt, num_inference_steps=1)
# check checkpoint directories exist
self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-2")))
@@ -123,7 +123,7 @@ class DreamBooth(ExamplesTestsAccelerate):
# check can run an intermediate checkpoint
unet = UNet2DConditionModel.from_pretrained(tmpdir, subfolder="checkpoint-2/unet")
pipe = DiffusionPipeline.from_pretrained(pretrained_model_name_or_path, unet=unet, safety_checker=None)
- pipe(instance_prompt, num_inference_steps=2)
+ pipe(instance_prompt, num_inference_steps=1)
# Remove checkpoint 2 so that we can check only later checkpoints exist after resuming
shutil.rmtree(os.path.join(tmpdir, "checkpoint-2"))
@@ -138,7 +138,7 @@ class DreamBooth(ExamplesTestsAccelerate):
--resolution 64
--train_batch_size 1
--gradient_accumulation_steps 1
- --max_train_steps 7
+ --max_train_steps 6
--learning_rate 5.0e-04
--scale_lr
--lr_scheduler constant
@@ -153,7 +153,7 @@ class DreamBooth(ExamplesTestsAccelerate):
# check can run new fully trained pipeline
pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None)
- pipe(instance_prompt, num_inference_steps=2)
+ pipe(instance_prompt, num_inference_steps=1)
# check old checkpoints do not exist
self.assertFalse(os.path.isdir(os.path.join(tmpdir, "checkpoint-2")))
@@ -196,7 +196,7 @@ class DreamBooth(ExamplesTestsAccelerate):
--resolution=64
--train_batch_size=1
--gradient_accumulation_steps=1
- --max_train_steps=9
+ --max_train_steps=4
--checkpointing_steps=2
""".split()
@@ -204,7 +204,7 @@ class DreamBooth(ExamplesTestsAccelerate):
self.assertEqual(
{x for x in os.listdir(tmpdir) if "checkpoint" in x},
{"checkpoint-2", "checkpoint-4", "checkpoint-6", "checkpoint-8"},
{"checkpoint-2", "checkpoint-4"},
)
resume_run_args = f"""
@@ -216,15 +216,12 @@ class DreamBooth(ExamplesTestsAccelerate):
--resolution=64
--train_batch_size=1
--gradient_accumulation_steps=1
- --max_train_steps=11
+ --max_train_steps=8
--checkpointing_steps=2
- --resume_from_checkpoint=checkpoint-8
- --checkpoints_total_limit=3
+ --resume_from_checkpoint=checkpoint-4
+ --checkpoints_total_limit=2
""".split()
run_command(self._launch_args + resume_run_args)
- self.assertEqual(
- {x for x in os.listdir(tmpdir) if "checkpoint" in x},
- {"checkpoint-6", "checkpoint-8", "checkpoint-10"},
- )
+ self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-6", "checkpoint-8"})
@@ -135,16 +135,13 @@ class DreamBoothLoRA(ExamplesTestsAccelerate):
--resolution=64
--train_batch_size=1
--gradient_accumulation_steps=1
- --max_train_steps=9
+ --max_train_steps=4
--checkpointing_steps=2
""".split()
run_command(self._launch_args + test_args)
- self.assertEqual(
- {x for x in os.listdir(tmpdir) if "checkpoint" in x},
- {"checkpoint-2", "checkpoint-4", "checkpoint-6", "checkpoint-8"},
- )
+ self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-2", "checkpoint-4"})
resume_run_args = f"""
examples/dreambooth/train_dreambooth_lora.py
@@ -155,18 +152,15 @@ class DreamBoothLoRA(ExamplesTestsAccelerate):
--resolution=64
--train_batch_size=1
--gradient_accumulation_steps=1
- --max_train_steps=11
+ --max_train_steps=8
--checkpointing_steps=2
- --resume_from_checkpoint=checkpoint-8
- --checkpoints_total_limit=3
+ --resume_from_checkpoint=checkpoint-4
+ --checkpoints_total_limit=2
""".split()
run_command(self._launch_args + resume_run_args)
- self.assertEqual(
- {x for x in os.listdir(tmpdir) if "checkpoint" in x},
- {"checkpoint-6", "checkpoint-8", "checkpoint-10"},
- )
+ self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-6", "checkpoint-8"})
def test_dreambooth_lora_if_model(self):
with tempfile.TemporaryDirectory() as tmpdir:
@@ -328,7 +322,7 @@ class DreamBoothLoRASDXL(ExamplesTestsAccelerate):
--resolution 64
--train_batch_size 1
--gradient_accumulation_steps 1
- --max_train_steps 7
+ --max_train_steps 6
--checkpointing_steps=2
--checkpoints_total_limit=2
--learning_rate 5.0e-04
@@ -342,14 +336,11 @@ class DreamBoothLoRASDXL(ExamplesTestsAccelerate):
pipe = DiffusionPipeline.from_pretrained(pipeline_path)
pipe.load_lora_weights(tmpdir)
pipe("a prompt", num_inference_steps=2)
pipe("a prompt", num_inference_steps=1)
# check checkpoint directories exist
- self.assertEqual(
- {x for x in os.listdir(tmpdir) if "checkpoint" in x},
- # checkpoint-2 should have been deleted
- {"checkpoint-4", "checkpoint-6"},
- )
+ self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-4", "checkpoint-6"})
def test_dreambooth_lora_sdxl_text_encoder_checkpointing_checkpoints_total_limit(self):
pipeline_path = "hf-internal-testing/tiny-stable-diffusion-xl-pipe"
@@ -40,7 +40,7 @@ class InstructPix2Pix(ExamplesTestsAccelerate):
--resolution=64
--random_flip
--train_batch_size=1
- --max_train_steps=7
+ --max_train_steps=6
--checkpointing_steps=2
--checkpoints_total_limit=2
--output_dir {tmpdir}
@@ -63,7 +63,7 @@ class InstructPix2Pix(ExamplesTestsAccelerate):
--resolution=64
--random_flip
--train_batch_size=1
- --max_train_steps=9
+ --max_train_steps=4
--checkpointing_steps=2
--output_dir {tmpdir}
--seed=0
@@ -74,7 +74,7 @@ class InstructPix2Pix(ExamplesTestsAccelerate):
# check checkpoint directories exist
self.assertEqual(
{x for x in os.listdir(tmpdir) if "checkpoint" in x},
{"checkpoint-2", "checkpoint-4", "checkpoint-6", "checkpoint-8"},
{"checkpoint-2", "checkpoint-4"},
)
resume_run_args = f"""
@@ -84,12 +84,12 @@ class InstructPix2Pix(ExamplesTestsAccelerate):
--resolution=64
--random_flip
--train_batch_size=1
- --max_train_steps=11
+ --max_train_steps=8
--checkpointing_steps=2
--output_dir {tmpdir}
--seed=0
- --resume_from_checkpoint=checkpoint-8
- --checkpoints_total_limit=3
+ --resume_from_checkpoint=checkpoint-4
+ --checkpoints_total_limit=2
""".split()
run_command(self._launch_args + resume_run_args)
@@ -97,5 +97,5 @@ class InstructPix2Pix(ExamplesTestsAccelerate):
# check checkpoint directories exist
self.assertEqual(
{x for x in os.listdir(tmpdir) if "checkpoint" in x},
{"checkpoint-6", "checkpoint-8", "checkpoint-10"},
{"checkpoint-6", "checkpoint-8"},
)
@@ -64,7 +64,7 @@ class TextToImage(ExamplesTestsAccelerate):
with tempfile.TemporaryDirectory() as tmpdir:
# Run training script with checkpointing
- # max_train_steps == 5, checkpointing_steps == 2
+ # max_train_steps == 4, checkpointing_steps == 2
# Should create checkpoints at steps 2, 4
initial_run_args = f"""
@@ -76,7 +76,7 @@ class TextToImage(ExamplesTestsAccelerate):
--random_flip
--train_batch_size 1
--gradient_accumulation_steps 1
- --max_train_steps 5
+ --max_train_steps 4
--learning_rate 5.0e-04
--scale_lr
--lr_scheduler constant
@@ -89,7 +89,7 @@ class TextToImage(ExamplesTestsAccelerate):
run_command(self._launch_args + initial_run_args)
pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None)
- pipe(prompt, num_inference_steps=2)
+ pipe(prompt, num_inference_steps=1)
# check checkpoint directories exist
self.assertEqual(
@@ -100,12 +100,12 @@ class TextToImage(ExamplesTestsAccelerate):
# check can run an intermediate checkpoint
unet = UNet2DConditionModel.from_pretrained(tmpdir, subfolder="checkpoint-2/unet")
pipe = DiffusionPipeline.from_pretrained(pretrained_model_name_or_path, unet=unet, safety_checker=None)
- pipe(prompt, num_inference_steps=2)
+ pipe(prompt, num_inference_steps=1)
# Remove checkpoint 2 so that we can check only later checkpoints exist after resuming
shutil.rmtree(os.path.join(tmpdir, "checkpoint-2"))
- # Run training script for 7 total steps resuming from checkpoint 4
+ # Run training script for 2 total steps resuming from checkpoint 4
resume_run_args = f"""
examples/text_to_image/train_text_to_image.py
@@ -116,13 +116,13 @@ class TextToImage(ExamplesTestsAccelerate):
--random_flip
--train_batch_size 1
--gradient_accumulation_steps 1
- --max_train_steps 7
+ --max_train_steps 2
--learning_rate 5.0e-04
--scale_lr
--lr_scheduler constant
--lr_warmup_steps 0
--output_dir {tmpdir}
- --checkpointing_steps=2
+ --checkpointing_steps=1
--resume_from_checkpoint=checkpoint-4
--seed=0
""".split()
@@ -131,16 +131,13 @@ class TextToImage(ExamplesTestsAccelerate):
# check can run new fully trained pipeline
pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None)
- pipe(prompt, num_inference_steps=2)
+ pipe(prompt, num_inference_steps=1)
- self.assertEqual(
- {x for x in os.listdir(tmpdir) if "checkpoint" in x},
- {
- # no checkpoint-2 -> check old checkpoints do not exist
- # check new checkpoints exist
- "checkpoint-4",
- "checkpoint-6",
- },
+ self.assertEqual(
+ {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+ {"checkpoint-4", "checkpoint-5"},
)
def test_text_to_image_checkpointing_use_ema(self):
@@ -149,7 +146,7 @@ class TextToImage(ExamplesTestsAccelerate):
with tempfile.TemporaryDirectory() as tmpdir:
# Run training script with checkpointing
- # max_train_steps == 5, checkpointing_steps == 2
+ # max_train_steps == 4, checkpointing_steps == 2
# Should create checkpoints at steps 2, 4
initial_run_args = f"""
@@ -161,7 +158,7 @@ class TextToImage(ExamplesTestsAccelerate):
--random_flip
--train_batch_size 1
--gradient_accumulation_steps 1
- --max_train_steps 5
+ --max_train_steps 4
--learning_rate 5.0e-04
--scale_lr
--lr_scheduler constant
@@ -186,12 +183,12 @@ class TextToImage(ExamplesTestsAccelerate):
# check can run an intermediate checkpoint
unet = UNet2DConditionModel.from_pretrained(tmpdir, subfolder="checkpoint-2/unet")
pipe = DiffusionPipeline.from_pretrained(pretrained_model_name_or_path, unet=unet, safety_checker=None)
- pipe(prompt, num_inference_steps=2)
+ pipe(prompt, num_inference_steps=1)
# Remove checkpoint 2 so that we can check only later checkpoints exist after resuming
shutil.rmtree(os.path.join(tmpdir, "checkpoint-2"))
- # Run training script for 7 total steps resuming from checkpoint 4
+ # Run training script for 2 total steps resuming from checkpoint 4
resume_run_args = f"""
examples/text_to_image/train_text_to_image.py
@@ -202,13 +199,13 @@ class TextToImage(ExamplesTestsAccelerate):
--random_flip
--train_batch_size 1
--gradient_accumulation_steps 1
- --max_train_steps 7
+ --max_train_steps 2
--learning_rate 5.0e-04
--scale_lr
--lr_scheduler constant
--lr_warmup_steps 0
--output_dir {tmpdir}
- --checkpointing_steps=2
+ --checkpointing_steps=1
--resume_from_checkpoint=checkpoint-4
--use_ema
--seed=0
@@ -218,16 +215,13 @@ class TextToImage(ExamplesTestsAccelerate):
# check can run new fully trained pipeline
pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None)
- pipe(prompt, num_inference_steps=2)
+ pipe(prompt, num_inference_steps=1)
- self.assertEqual(
- {x for x in os.listdir(tmpdir) if "checkpoint" in x},
- {
- # no checkpoint-2 -> check old checkpoints do not exist
- # check new checkpoints exist
- "checkpoint-4",
- "checkpoint-6",
- },
+ self.assertEqual(
+ {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+ {"checkpoint-4", "checkpoint-5"},
)
def test_text_to_image_checkpointing_checkpoints_total_limit(self):
@@ -236,7 +230,7 @@ class TextToImage(ExamplesTestsAccelerate):
with tempfile.TemporaryDirectory() as tmpdir:
# Run training script with checkpointing
- # max_train_steps == 7, checkpointing_steps == 2, checkpoints_total_limit == 2
+ # max_train_steps == 6, checkpointing_steps == 2, checkpoints_total_limit == 2
# Should create checkpoints at steps 2, 4, 6
# with checkpoint at step 2 deleted
@@ -249,7 +243,7 @@ class TextToImage(ExamplesTestsAccelerate):
--random_flip
--train_batch_size 1
--gradient_accumulation_steps 1
- --max_train_steps 7
+ --max_train_steps 6
--learning_rate 5.0e-04
--scale_lr
--lr_scheduler constant
@@ -263,14 +257,11 @@ class TextToImage(ExamplesTestsAccelerate):
run_command(self._launch_args + initial_run_args)
pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None)
- pipe(prompt, num_inference_steps=2)
+ pipe(prompt, num_inference_steps=1)
# check checkpoint directories exist
- self.assertEqual(
- {x for x in os.listdir(tmpdir) if "checkpoint" in x},
- # checkpoint-2 should have been deleted
- {"checkpoint-4", "checkpoint-6"},
- )
+ self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-4", "checkpoint-6"})
def test_text_to_image_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
pretrained_model_name_or_path = "hf-internal-testing/tiny-stable-diffusion-pipe"
@@ -278,8 +269,8 @@ class TextToImage(ExamplesTestsAccelerate):
with tempfile.TemporaryDirectory() as tmpdir:
# Run training script with checkpointing
- # max_train_steps == 9, checkpointing_steps == 2
- # Should create checkpoints at steps 2, 4, 6, 8
+ # max_train_steps == 4, checkpointing_steps == 2
+ # Should create checkpoints at steps 2, 4
initial_run_args = f"""
examples/text_to_image/train_text_to_image.py
@@ -290,7 +281,7 @@ class TextToImage(ExamplesTestsAccelerate):
--random_flip
--train_batch_size 1
--gradient_accumulation_steps 1
- --max_train_steps 9
+ --max_train_steps 4
--learning_rate 5.0e-04
--scale_lr
--lr_scheduler constant
@@ -303,15 +294,15 @@ class TextToImage(ExamplesTestsAccelerate):
run_command(self._launch_args + initial_run_args)
pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None)
- pipe(prompt, num_inference_steps=2)
+ pipe(prompt, num_inference_steps=1)
# check checkpoint directories exist
self.assertEqual(
{x for x in os.listdir(tmpdir) if "checkpoint" in x},
{"checkpoint-2", "checkpoint-4", "checkpoint-6", "checkpoint-8"},
{"checkpoint-2", "checkpoint-4"},
)
- # resume and we should try to checkpoint at 10, where we'll have to remove
+ # resume and we should try to checkpoint at 6, where we'll have to remove
# checkpoint-2 and checkpoint-4 instead of just a single previous checkpoint
resume_run_args = f"""
@@ -323,27 +314,27 @@ class TextToImage(ExamplesTestsAccelerate):
--random_flip
--train_batch_size 1
--gradient_accumulation_steps 1
- --max_train_steps 11
+ --max_train_steps 8
--learning_rate 5.0e-04
--scale_lr
--lr_scheduler constant
--lr_warmup_steps 0
--output_dir {tmpdir}
--checkpointing_steps=2
- --resume_from_checkpoint=checkpoint-8
- --checkpoints_total_limit=3
+ --resume_from_checkpoint=checkpoint-4
+ --checkpoints_total_limit=2
--seed=0
""".split()
run_command(self._launch_args + resume_run_args)
pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None)
- pipe(prompt, num_inference_steps=2)
+ pipe(prompt, num_inference_steps=1)
# check checkpoint directories exist
self.assertEqual(
{x for x in os.listdir(tmpdir) if "checkpoint" in x},
{"checkpoint-6", "checkpoint-8", "checkpoint-10"},
{"checkpoint-6", "checkpoint-8"},
)
@@ -41,7 +41,7 @@ class TextToImageLoRA(ExamplesTestsAccelerate):
with tempfile.TemporaryDirectory() as tmpdir:
# Run training script with checkpointing
- # max_train_steps == 7, checkpointing_steps == 2, checkpoints_total_limit == 2
+ # max_train_steps == 6, checkpointing_steps == 2, checkpoints_total_limit == 2
# Should create checkpoints at steps 2, 4, 6
# with checkpoint at step 2 deleted
@@ -52,7 +52,7 @@ class TextToImageLoRA(ExamplesTestsAccelerate):
--resolution 64
--train_batch_size 1
--gradient_accumulation_steps 1
- --max_train_steps 7
+ --max_train_steps 6
--learning_rate 5.0e-04
--scale_lr
--lr_scheduler constant
@@ -66,14 +66,11 @@ class TextToImageLoRA(ExamplesTestsAccelerate):
pipe = DiffusionPipeline.from_pretrained(pipeline_path)
pipe.load_lora_weights(tmpdir)
- pipe(prompt, num_inference_steps=2)
+ pipe(prompt, num_inference_steps=1)
# check checkpoint directories exist
- self.assertEqual(
- {x for x in os.listdir(tmpdir) if "checkpoint" in x},
- # checkpoint-2 should have been deleted
- {"checkpoint-4", "checkpoint-6"},
- )
+ self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-4", "checkpoint-6"})
def test_text_to_image_lora_checkpointing_checkpoints_total_limit(self):
pretrained_model_name_or_path = "hf-internal-testing/tiny-stable-diffusion-pipe"
@@ -81,7 +78,7 @@ class TextToImageLoRA(ExamplesTestsAccelerate):
with tempfile.TemporaryDirectory() as tmpdir:
# Run training script with checkpointing
- # max_train_steps == 7, checkpointing_steps == 2, checkpoints_total_limit == 2
+ # max_train_steps == 6, checkpointing_steps == 2, checkpoints_total_limit == 2
# Should create checkpoints at steps 2, 4, 6
# with checkpoint at step 2 deleted
@@ -94,7 +91,7 @@ class TextToImageLoRA(ExamplesTestsAccelerate):
--random_flip
--train_batch_size 1
--gradient_accumulation_steps 1
- --max_train_steps 7
+ --max_train_steps 6
--learning_rate 5.0e-04
--scale_lr
--lr_scheduler constant
@@ -112,14 +109,11 @@ class TextToImageLoRA(ExamplesTestsAccelerate):
"hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None
)
pipe.load_lora_weights(tmpdir)
- pipe(prompt, num_inference_steps=2)
+ pipe(prompt, num_inference_steps=1)
# check checkpoint directories exist
- self.assertEqual(
- {x for x in os.listdir(tmpdir) if "checkpoint" in x},
- # checkpoint-2 should have been deleted
- {"checkpoint-4", "checkpoint-6"},
- )
+ self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-4", "checkpoint-6"})
def test_text_to_image_lora_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
pretrained_model_name_or_path = "hf-internal-testing/tiny-stable-diffusion-pipe"
@@ -127,8 +121,8 @@ class TextToImageLoRA(ExamplesTestsAccelerate):
with tempfile.TemporaryDirectory() as tmpdir:
# Run training script with checkpointing
- # max_train_steps == 9, checkpointing_steps == 2
- # Should create checkpoints at steps 2, 4, 6, 8
+ # max_train_steps == 4, checkpointing_steps == 2
+ # Should create checkpoints at steps 2, 4
initial_run_args = f"""
examples/text_to_image/train_text_to_image_lora.py
@@ -139,7 +133,7 @@ class TextToImageLoRA(ExamplesTestsAccelerate):
--random_flip
--train_batch_size 1
--gradient_accumulation_steps 1
- --max_train_steps 9
+ --max_train_steps 4
--learning_rate 5.0e-04
--scale_lr
--lr_scheduler constant
@@ -156,15 +150,15 @@ class TextToImageLoRA(ExamplesTestsAccelerate):
"hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None
)
pipe.load_lora_weights(tmpdir)
- pipe(prompt, num_inference_steps=2)
+ pipe(prompt, num_inference_steps=1)
# check checkpoint directories exist
self.assertEqual(
{x for x in os.listdir(tmpdir) if "checkpoint" in x},
{"checkpoint-2", "checkpoint-4", "checkpoint-6", "checkpoint-8"},
{"checkpoint-2", "checkpoint-4"},
)
- # resume and we should try to checkpoint at 10, where we'll have to remove
+ # resume and we should try to checkpoint at 6, where we'll have to remove
# checkpoint-2 and checkpoint-4 instead of just a single previous checkpoint
resume_run_args = f"""
@@ -176,15 +170,15 @@ class TextToImageLoRA(ExamplesTestsAccelerate):
--random_flip
--train_batch_size 1
--gradient_accumulation_steps 1
- --max_train_steps 11
+ --max_train_steps 8
--learning_rate 5.0e-04
--scale_lr
--lr_scheduler constant
--lr_warmup_steps 0
--output_dir {tmpdir}
--checkpointing_steps=2
- --resume_from_checkpoint=checkpoint-8
- --checkpoints_total_limit=3
+ --resume_from_checkpoint=checkpoint-4
+ --checkpoints_total_limit=2
--seed=0
--num_validation_images=0
""".split()
@@ -195,12 +189,12 @@ class TextToImageLoRA(ExamplesTestsAccelerate):
"hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None
)
pipe.load_lora_weights(tmpdir)
- pipe(prompt, num_inference_steps=2)
+ pipe(prompt, num_inference_steps=1)
# check checkpoint directories exist
self.assertEqual(
{x for x in os.listdir(tmpdir) if "checkpoint" in x},
{"checkpoint-6", "checkpoint-8", "checkpoint-10"},
{"checkpoint-6", "checkpoint-8"},
)
@@ -272,7 +266,7 @@ class TextToImageLoRASDXL(ExamplesTestsAccelerate):
with tempfile.TemporaryDirectory() as tmpdir:
# Run training script with checkpointing
- # max_train_steps == 7, checkpointing_steps == 2, checkpoints_total_limit == 2
+ # max_train_steps == 6, checkpointing_steps == 2, checkpoints_total_limit == 2
# Should create checkpoints at steps 2, 4, 6
# with checkpoint at step 2 deleted
@@ -283,7 +277,7 @@ class TextToImageLoRASDXL(ExamplesTestsAccelerate):
--resolution 64
--train_batch_size 1
--gradient_accumulation_steps 1
- --max_train_steps 7
+ --max_train_steps 6
--learning_rate 5.0e-04
--scale_lr
--lr_scheduler constant
@@ -298,11 +292,8 @@ class TextToImageLoRASDXL(ExamplesTestsAccelerate):
pipe = DiffusionPipeline.from_pretrained(pipeline_path)
pipe.load_lora_weights(tmpdir)
- pipe(prompt, num_inference_steps=2)
+ pipe(prompt, num_inference_steps=1)
# check checkpoint directories exist
- self.assertEqual(
- {x for x in os.listdir(tmpdir) if "checkpoint" in x},
- # checkpoint-2 should have been deleted
- {"checkpoint-4", "checkpoint-6"},
- )
+ self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-4", "checkpoint-6"})
@@ -40,8 +40,6 @@ class TextualInversion(ExamplesTestsAccelerate):
--learnable_property object
--placeholder_token <cat-toy>
--initializer_token a
- --validation_prompt <cat-toy>
- --validation_steps 1
--save_steps 1
--num_vectors 2
--resolution 64
@@ -68,8 +66,6 @@ class TextualInversion(ExamplesTestsAccelerate):
--learnable_property object
--placeholder_token <cat-toy>
--initializer_token a
- --validation_prompt <cat-toy>
- --validation_steps 1
--save_steps 1
--num_vectors 2
--resolution 64
@@ -102,14 +98,12 @@ class TextualInversion(ExamplesTestsAccelerate):
--learnable_property object
--placeholder_token <cat-toy>
--initializer_token a
- --validation_prompt <cat-toy>
- --validation_steps 1
--save_steps 1
--num_vectors 2
--resolution 64
--train_batch_size 1
--gradient_accumulation_steps 1
- --max_train_steps 3
+ --max_train_steps 2
--learning_rate 5.0e-04
--scale_lr
--lr_scheduler constant
@@ -123,7 +117,7 @@ class TextualInversion(ExamplesTestsAccelerate):
# check checkpoint directories exist
self.assertEqual(
{x for x in os.listdir(tmpdir) if "checkpoint" in x},
{"checkpoint-1", "checkpoint-2", "checkpoint-3"},
{"checkpoint-1", "checkpoint-2"},
)
resume_run_args = f"""
@@ -133,21 +127,19 @@ class TextualInversion(ExamplesTestsAccelerate):
--learnable_property object
--placeholder_token <cat-toy>
--initializer_token a
- --validation_prompt <cat-toy>
- --validation_steps 1
--save_steps 1
--num_vectors 2
--resolution 64
--train_batch_size 1
--gradient_accumulation_steps 1
- --max_train_steps 4
+ --max_train_steps 2
--learning_rate 5.0e-04
--scale_lr
--lr_scheduler constant
--lr_warmup_steps 0
--output_dir {tmpdir}
--checkpointing_steps=1
- --resume_from_checkpoint=checkpoint-3
+ --resume_from_checkpoint=checkpoint-2
--checkpoints_total_limit=2
""".split()
@@ -156,5 +148,5 @@ class TextualInversion(ExamplesTestsAccelerate):
# check checkpoint directories exist
self.assertEqual(
{x for x in os.listdir(tmpdir) if "checkpoint" in x},
{"checkpoint-3", "checkpoint-4"},
{"checkpoint-2", "checkpoint-3"},
)
@@ -90,10 +90,10 @@ class Unconditional(ExamplesTestsAccelerate):
--train_batch_size 1
--num_epochs 1
--gradient_accumulation_steps 1
- --ddpm_num_inference_steps 2
+ --ddpm_num_inference_steps 1
--learning_rate 1e-3
--lr_warmup_steps 5
- --checkpointing_steps=1
+ --checkpointing_steps=2
""".split()
run_command(self._launch_args + initial_run_args)
@@ -101,7 +101,7 @@ class Unconditional(ExamplesTestsAccelerate):
# check checkpoint directories exist
self.assertEqual(
{x for x in os.listdir(tmpdir) if "checkpoint" in x},
{"checkpoint-1", "checkpoint-2", "checkpoint-3", "checkpoint-4", "checkpoint-5", "checkpoint-6"},
{"checkpoint-2", "checkpoint-4", "checkpoint-6"},
)
resume_run_args = f"""
@@ -113,12 +113,12 @@ class Unconditional(ExamplesTestsAccelerate):
--train_batch_size 1
--num_epochs 2
--gradient_accumulation_steps 1
- --ddpm_num_inference_steps 2
+ --ddpm_num_inference_steps 1
--learning_rate 1e-3
--lr_warmup_steps 5
--resume_from_checkpoint=checkpoint-6
--checkpointing_steps=2
- --checkpoints_total_limit=3
+ --checkpoints_total_limit=2
""".split()
run_command(self._launch_args + resume_run_args)
@@ -126,5 +126,5 @@ class Unconditional(ExamplesTestsAccelerate):
# check checkpoint directories exist
self.assertEqual(
{x for x in os.listdir(tmpdir) if "checkpoint" in x},
{"checkpoint-8", "checkpoint-10", "checkpoint-12"},
{"checkpoint-10", "checkpoint-12"},
)
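
The unconditional-generation assertions fit the same rule once epochs are converted to optimizer steps; the asserted directory names imply six steps per epoch in this setup (an inference from the names above, not a documented constant). Reusing the hypothetical `expected_checkpoints()` helper sketched near the top:

```python
# One epoch (~6 steps here, inferred) with checkpointing_steps=2 and no limit:
assert expected_checkpoints(6, 2) == {"checkpoint-2", "checkpoint-4", "checkpoint-6"}
# Resumed for a second epoch (12 steps total) with checkpoints_total_limit=2:
# saves at 8, 10 and 12, pruning down to the two newest each time.
assert expected_checkpoints(12, 2, checkpoints_total_limit=2, existing=(2, 4, 6)) == {"checkpoint-10", "checkpoint-12"}
```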