Unverified Commit f4b0b26f authored by Sayak Paul, committed by GitHub

[Tests] Speed up example tests (#6319)

* remove validation args from textual inversion tests

* reduce number of train steps in textual inversion tests

* fix: directories.

* debug

* fix: directories.

* remove validation tests from textual inversion

* try reducing the time of test_text_to_image_checkpointing_use_ema

* fix: directories

* speed up test_text_to_image_checkpointing

* speed up test_text_to_image_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints

* fix

* speed up test_instruct_pix2pix_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints

* set checkpoints_total_limit to 2.

* test_text_to_image_lora_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints speed up

* speed up test_unconditional_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints

* debug

* fix: directories.

* speed up test_instruct_pix2pix_checkpointing_checkpoints_total_limit

* speed up: test_controlnet_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints

* speed up test_controlnet_sdxl

* speed up dreambooth tests

* speed up test_dreambooth_lora_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints

* speed up test_custom_diffusion_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints

* speed up test_text_to_image_lora_sdxl_text_encoder_checkpointing_checkpoints_total_limit

* speed up # checkpoint-2 should have been deleted

* speed up examples/text_to_image/test_text_to_image.py::TextToImage::test_text_to_image_checkpointing_checkpoints_total_limit

* additional speed ups

* style
parent 89459a5d
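Most of the changes below pull the same three levers: lower `max_train_steps`, cut `num_inference_steps` from 2 to 1 in the smoke-test pipeline calls, and tighten `checkpoints_total_limit` from 3 to 2, so the asserted `checkpoint-*` sets shrink accordingly. As a sanity check on those expectations, here is a minimal sketch of the bookkeeping the tests assert on; the `expected_checkpoints` helper is hypothetical (not part of the repo) and assumes a checkpoint is written every `checkpointing_steps` steps, with oldest-first pruning once the limit is exceeded:

```python
# Hypothetical helper: models which checkpoint-* directories should survive a
# training run, assuming oldest-first pruning under checkpoints_total_limit.
def expected_checkpoints(max_train_steps, checkpointing_steps, total_limit=None, start_step=0, existing=()):
    kept = list(existing)
    for step in range(start_step + 1, max_train_steps + 1):
        if step % checkpointing_steps == 0:
            kept.append(step)
            if total_limit is not None:
                kept = kept[-total_limit:]  # prune the oldest checkpoints
    return {f"checkpoint-{s}" for s in kept}

# Initial ControlNet run below: 6 steps, checkpoint every 2 steps, no limit.
assert expected_checkpoints(6, 2) == {"checkpoint-2", "checkpoint-4", "checkpoint-6"}

# Resume run: from checkpoint-6 to step 8 with --checkpoints_total_limit=2.
assert expected_checkpoints(8, 2, total_limit=2, start_step=6, existing=[2, 4, 6]) == {"checkpoint-6", "checkpoint-8"}
```

The same rule reproduces the expected sets in the remaining hunks.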
@@ -65,7 +65,7 @@ class ControlNet(ExamplesTestsAccelerate):
  --train_batch_size=1
  --gradient_accumulation_steps=1
  --controlnet_model_name_or_path=hf-internal-testing/tiny-controlnet
- --max_train_steps=9
+ --max_train_steps=6
  --checkpointing_steps=2
  """.split()
@@ -73,7 +73,7 @@ class ControlNet(ExamplesTestsAccelerate):
  self.assertEqual(
      {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-     {"checkpoint-2", "checkpoint-4", "checkpoint-6", "checkpoint-8"},
+     {"checkpoint-2", "checkpoint-4", "checkpoint-6"},
  )
  resume_run_args = f"""
@@ -85,18 +85,15 @@ class ControlNet(ExamplesTestsAccelerate):
  --train_batch_size=1
  --gradient_accumulation_steps=1
  --controlnet_model_name_or_path=hf-internal-testing/tiny-controlnet
- --max_train_steps=11
+ --max_train_steps=8
  --checkpointing_steps=2
- --resume_from_checkpoint=checkpoint-8
- --checkpoints_total_limit=3
+ --resume_from_checkpoint=checkpoint-6
+ --checkpoints_total_limit=2
  """.split()
  run_command(self._launch_args + resume_run_args)
- self.assertEqual(
-     {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-     {"checkpoint-8", "checkpoint-10", "checkpoint-12"},
- )
+ self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-6", "checkpoint-8"})
  class ControlNetSDXL(ExamplesTestsAccelerate):
@@ -111,7 +108,7 @@ class ControlNetSDXL(ExamplesTestsAccelerate):
  --train_batch_size=1
  --gradient_accumulation_steps=1
  --controlnet_model_name_or_path=hf-internal-testing/tiny-controlnet-sdxl
- --max_train_steps=9
+ --max_train_steps=4
  --checkpointing_steps=2
  """.split()
...
@@ -76,10 +76,7 @@ class CustomDiffusion(ExamplesTestsAccelerate):
  run_command(self._launch_args + test_args)
- self.assertEqual(
-     {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-     {"checkpoint-4", "checkpoint-6"},
- )
+ self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-4", "checkpoint-6"})
  def test_custom_diffusion_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
  with tempfile.TemporaryDirectory() as tmpdir:
@@ -93,7 +90,7 @@ class CustomDiffusion(ExamplesTestsAccelerate):
  --train_batch_size=1
  --modifier_token=<new1>
  --dataloader_num_workers=0
- --max_train_steps=9
+ --max_train_steps=4
  --checkpointing_steps=2
  --no_safe_serialization
  """.split()
@@ -102,7 +99,7 @@ class CustomDiffusion(ExamplesTestsAccelerate):
  self.assertEqual(
      {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-     {"checkpoint-2", "checkpoint-4", "checkpoint-6", "checkpoint-8"},
+     {"checkpoint-2", "checkpoint-4"},
  )
  resume_run_args = f"""
@@ -115,16 +112,13 @@ class CustomDiffusion(ExamplesTestsAccelerate):
  --train_batch_size=1
  --modifier_token=<new1>
  --dataloader_num_workers=0
- --max_train_steps=11
+ --max_train_steps=8
  --checkpointing_steps=2
- --resume_from_checkpoint=checkpoint-8
- --checkpoints_total_limit=3
+ --resume_from_checkpoint=checkpoint-4
+ --checkpoints_total_limit=2
  --no_safe_serialization
  """.split()
  run_command(self._launch_args + resume_run_args)
- self.assertEqual(
-     {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-     {"checkpoint-6", "checkpoint-8", "checkpoint-10"},
- )
+ self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-6", "checkpoint-8"})
@@ -89,7 +89,7 @@ class DreamBooth(ExamplesTestsAccelerate):
  with tempfile.TemporaryDirectory() as tmpdir:
  # Run training script with checkpointing
- # max_train_steps == 5, checkpointing_steps == 2
+ # max_train_steps == 4, checkpointing_steps == 2
  # Should create checkpoints at steps 2, 4
  initial_run_args = f"""
@@ -100,7 +100,7 @@ class DreamBooth(ExamplesTestsAccelerate):
  --resolution 64
  --train_batch_size 1
  --gradient_accumulation_steps 1
- --max_train_steps 5
+ --max_train_steps 4
  --learning_rate 5.0e-04
  --scale_lr
  --lr_scheduler constant
@@ -114,7 +114,7 @@ class DreamBooth(ExamplesTestsAccelerate):
  # check can run the original fully trained output pipeline
  pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None)
- pipe(instance_prompt, num_inference_steps=2)
+ pipe(instance_prompt, num_inference_steps=1)
  # check checkpoint directories exist
  self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-2")))
@@ -123,7 +123,7 @@ class DreamBooth(ExamplesTestsAccelerate):
  # check can run an intermediate checkpoint
  unet = UNet2DConditionModel.from_pretrained(tmpdir, subfolder="checkpoint-2/unet")
  pipe = DiffusionPipeline.from_pretrained(pretrained_model_name_or_path, unet=unet, safety_checker=None)
- pipe(instance_prompt, num_inference_steps=2)
+ pipe(instance_prompt, num_inference_steps=1)
  # Remove checkpoint 2 so that we can check only later checkpoints exist after resuming
  shutil.rmtree(os.path.join(tmpdir, "checkpoint-2"))
@@ -138,7 +138,7 @@ class DreamBooth(ExamplesTestsAccelerate):
  --resolution 64
  --train_batch_size 1
  --gradient_accumulation_steps 1
- --max_train_steps 7
+ --max_train_steps 6
  --learning_rate 5.0e-04
  --scale_lr
  --lr_scheduler constant
@@ -153,7 +153,7 @@ class DreamBooth(ExamplesTestsAccelerate):
  # check can run new fully trained pipeline
  pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None)
- pipe(instance_prompt, num_inference_steps=2)
+ pipe(instance_prompt, num_inference_steps=1)
  # check old checkpoints do not exist
  self.assertFalse(os.path.isdir(os.path.join(tmpdir, "checkpoint-2")))
@@ -196,7 +196,7 @@ class DreamBooth(ExamplesTestsAccelerate):
  --resolution=64
  --train_batch_size=1
  --gradient_accumulation_steps=1
- --max_train_steps=9
+ --max_train_steps=4
  --checkpointing_steps=2
  """.split()
@@ -204,7 +204,7 @@ class DreamBooth(ExamplesTestsAccelerate):
  self.assertEqual(
      {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-     {"checkpoint-2", "checkpoint-4", "checkpoint-6", "checkpoint-8"},
+     {"checkpoint-2", "checkpoint-4"},
  )
  resume_run_args = f"""
@@ -216,15 +216,12 @@ class DreamBooth(ExamplesTestsAccelerate):
  --resolution=64
  --train_batch_size=1
  --gradient_accumulation_steps=1
- --max_train_steps=11
+ --max_train_steps=8
  --checkpointing_steps=2
- --resume_from_checkpoint=checkpoint-8
- --checkpoints_total_limit=3
+ --resume_from_checkpoint=checkpoint-4
+ --checkpoints_total_limit=2
  """.split()
  run_command(self._launch_args + resume_run_args)
- self.assertEqual(
-     {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-     {"checkpoint-6", "checkpoint-8", "checkpoint-10"},
- )
+ self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-6", "checkpoint-8"})
@@ -135,16 +135,13 @@ class DreamBoothLoRA(ExamplesTestsAccelerate):
  --resolution=64
  --train_batch_size=1
  --gradient_accumulation_steps=1
- --max_train_steps=9
+ --max_train_steps=4
  --checkpointing_steps=2
  """.split()
  run_command(self._launch_args + test_args)
- self.assertEqual(
-     {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-     {"checkpoint-2", "checkpoint-4", "checkpoint-6", "checkpoint-8"},
- )
+ self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-2", "checkpoint-4"})
  resume_run_args = f"""
  examples/dreambooth/train_dreambooth_lora.py
@@ -155,18 +152,15 @@ class DreamBoothLoRA(ExamplesTestsAccelerate):
  --resolution=64
  --train_batch_size=1
  --gradient_accumulation_steps=1
- --max_train_steps=11
+ --max_train_steps=8
  --checkpointing_steps=2
- --resume_from_checkpoint=checkpoint-8
- --checkpoints_total_limit=3
+ --resume_from_checkpoint=checkpoint-4
+ --checkpoints_total_limit=2
  """.split()
  run_command(self._launch_args + resume_run_args)
- self.assertEqual(
-     {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-     {"checkpoint-6", "checkpoint-8", "checkpoint-10"},
- )
+ self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-6", "checkpoint-8"})
  def test_dreambooth_lora_if_model(self):
  with tempfile.TemporaryDirectory() as tmpdir:
@@ -328,7 +322,7 @@ class DreamBoothLoRASDXL(ExamplesTestsAccelerate):
  --resolution 64
  --train_batch_size 1
  --gradient_accumulation_steps 1
- --max_train_steps 7
+ --max_train_steps 6
  --checkpointing_steps=2
  --checkpoints_total_limit=2
  --learning_rate 5.0e-04
@@ -342,14 +336,11 @@ class DreamBoothLoRASDXL(ExamplesTestsAccelerate):
  pipe = DiffusionPipeline.from_pretrained(pipeline_path)
  pipe.load_lora_weights(tmpdir)
- pipe("a prompt", num_inference_steps=2)
+ pipe("a prompt", num_inference_steps=1)
  # check checkpoint directories exist
- self.assertEqual(
-     {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-     # checkpoint-2 should have been deleted
-     {"checkpoint-4", "checkpoint-6"},
- )
+ # checkpoint-2 should have been deleted
+ self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-4", "checkpoint-6"})
  def test_dreambooth_lora_sdxl_text_encoder_checkpointing_checkpoints_total_limit(self):
  pipeline_path = "hf-internal-testing/tiny-stable-diffusion-xl-pipe"
...
@@ -40,7 +40,7 @@ class InstructPix2Pix(ExamplesTestsAccelerate):
  --resolution=64
  --random_flip
  --train_batch_size=1
- --max_train_steps=7
+ --max_train_steps=6
  --checkpointing_steps=2
  --checkpoints_total_limit=2
  --output_dir {tmpdir}
@@ -63,7 +63,7 @@ class InstructPix2Pix(ExamplesTestsAccelerate):
  --resolution=64
  --random_flip
  --train_batch_size=1
- --max_train_steps=9
+ --max_train_steps=4
  --checkpointing_steps=2
  --output_dir {tmpdir}
  --seed=0
@@ -74,7 +74,7 @@ class InstructPix2Pix(ExamplesTestsAccelerate):
  # check checkpoint directories exist
  self.assertEqual(
      {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-     {"checkpoint-2", "checkpoint-4", "checkpoint-6", "checkpoint-8"},
+     {"checkpoint-2", "checkpoint-4"},
  )
  resume_run_args = f"""
@@ -84,12 +84,12 @@ class InstructPix2Pix(ExamplesTestsAccelerate):
  --resolution=64
  --random_flip
  --train_batch_size=1
- --max_train_steps=11
+ --max_train_steps=8
  --checkpointing_steps=2
  --output_dir {tmpdir}
  --seed=0
- --resume_from_checkpoint=checkpoint-8
- --checkpoints_total_limit=3
+ --resume_from_checkpoint=checkpoint-4
+ --checkpoints_total_limit=2
  """.split()
  run_command(self._launch_args + resume_run_args)
@@ -97,5 +97,5 @@ class InstructPix2Pix(ExamplesTestsAccelerate):
  # check checkpoint directories exist
  self.assertEqual(
      {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-     {"checkpoint-6", "checkpoint-8", "checkpoint-10"},
+     {"checkpoint-6", "checkpoint-8"},
  )
@@ -64,7 +64,7 @@ class TextToImage(ExamplesTestsAccelerate):
  with tempfile.TemporaryDirectory() as tmpdir:
  # Run training script with checkpointing
- # max_train_steps == 5, checkpointing_steps == 2
+ # max_train_steps == 4, checkpointing_steps == 2
  # Should create checkpoints at steps 2, 4
  initial_run_args = f"""
@@ -76,7 +76,7 @@ class TextToImage(ExamplesTestsAccelerate):
  --random_flip
  --train_batch_size 1
  --gradient_accumulation_steps 1
- --max_train_steps 5
+ --max_train_steps 4
  --learning_rate 5.0e-04
  --scale_lr
  --lr_scheduler constant
@@ -89,7 +89,7 @@ class TextToImage(ExamplesTestsAccelerate):
  run_command(self._launch_args + initial_run_args)
  pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None)
- pipe(prompt, num_inference_steps=2)
+ pipe(prompt, num_inference_steps=1)
  # check checkpoint directories exist
  self.assertEqual(
@@ -100,12 +100,12 @@ class TextToImage(ExamplesTestsAccelerate):
  # check can run an intermediate checkpoint
  unet = UNet2DConditionModel.from_pretrained(tmpdir, subfolder="checkpoint-2/unet")
  pipe = DiffusionPipeline.from_pretrained(pretrained_model_name_or_path, unet=unet, safety_checker=None)
- pipe(prompt, num_inference_steps=2)
+ pipe(prompt, num_inference_steps=1)
  # Remove checkpoint 2 so that we can check only later checkpoints exist after resuming
  shutil.rmtree(os.path.join(tmpdir, "checkpoint-2"))
- # Run training script for 7 total steps resuming from checkpoint 4
+ # Run training script for 2 total steps resuming from checkpoint 4
  resume_run_args = f"""
  examples/text_to_image/train_text_to_image.py
@@ -116,13 +116,13 @@ class TextToImage(ExamplesTestsAccelerate):
  --random_flip
  --train_batch_size 1
  --gradient_accumulation_steps 1
- --max_train_steps 7
+ --max_train_steps 2
  --learning_rate 5.0e-04
  --scale_lr
  --lr_scheduler constant
  --lr_warmup_steps 0
  --output_dir {tmpdir}
- --checkpointing_steps=2
+ --checkpointing_steps=1
  --resume_from_checkpoint=checkpoint-4
  --seed=0
  """.split()
@@ -131,16 +131,13 @@ class TextToImage(ExamplesTestsAccelerate):
  # check can run new fully trained pipeline
  pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None)
- pipe(prompt, num_inference_steps=2)
+ pipe(prompt, num_inference_steps=1)
+ # no checkpoint-2 -> check old checkpoints do not exist
+ # check new checkpoints exist
  self.assertEqual(
      {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-     {
-         # no checkpoint-2 -> check old checkpoints do not exist
-         # check new checkpoints exist
-         "checkpoint-4",
-         "checkpoint-6",
-     },
+     {"checkpoint-4", "checkpoint-5"},
  )
  def test_text_to_image_checkpointing_use_ema(self):
@@ -149,7 +146,7 @@ class TextToImage(ExamplesTestsAccelerate):
  with tempfile.TemporaryDirectory() as tmpdir:
  # Run training script with checkpointing
- # max_train_steps == 5, checkpointing_steps == 2
+ # max_train_steps == 4, checkpointing_steps == 2
  # Should create checkpoints at steps 2, 4
  initial_run_args = f"""
@@ -161,7 +158,7 @@ class TextToImage(ExamplesTestsAccelerate):
  --random_flip
  --train_batch_size 1
  --gradient_accumulation_steps 1
- --max_train_steps 5
+ --max_train_steps 4
  --learning_rate 5.0e-04
  --scale_lr
  --lr_scheduler constant
@@ -186,12 +183,12 @@ class TextToImage(ExamplesTestsAccelerate):
  # check can run an intermediate checkpoint
  unet = UNet2DConditionModel.from_pretrained(tmpdir, subfolder="checkpoint-2/unet")
  pipe = DiffusionPipeline.from_pretrained(pretrained_model_name_or_path, unet=unet, safety_checker=None)
- pipe(prompt, num_inference_steps=2)
+ pipe(prompt, num_inference_steps=1)
  # Remove checkpoint 2 so that we can check only later checkpoints exist after resuming
  shutil.rmtree(os.path.join(tmpdir, "checkpoint-2"))
- # Run training script for 7 total steps resuming from checkpoint 4
+ # Run training script for 2 total steps resuming from checkpoint 4
  resume_run_args = f"""
  examples/text_to_image/train_text_to_image.py
@@ -202,13 +199,13 @@ class TextToImage(ExamplesTestsAccelerate):
  --random_flip
  --train_batch_size 1
  --gradient_accumulation_steps 1
- --max_train_steps 7
+ --max_train_steps 2
  --learning_rate 5.0e-04
  --scale_lr
  --lr_scheduler constant
  --lr_warmup_steps 0
  --output_dir {tmpdir}
- --checkpointing_steps=2
+ --checkpointing_steps=1
  --resume_from_checkpoint=checkpoint-4
  --use_ema
  --seed=0
@@ -218,16 +215,13 @@ class TextToImage(ExamplesTestsAccelerate):
  # check can run new fully trained pipeline
  pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None)
- pipe(prompt, num_inference_steps=2)
+ pipe(prompt, num_inference_steps=1)
+ # no checkpoint-2 -> check old checkpoints do not exist
+ # check new checkpoints exist
  self.assertEqual(
      {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-     {
-         # no checkpoint-2 -> check old checkpoints do not exist
-         # check new checkpoints exist
-         "checkpoint-4",
-         "checkpoint-6",
-     },
+     {"checkpoint-4", "checkpoint-5"},
  )
  def test_text_to_image_checkpointing_checkpoints_total_limit(self):
@@ -236,7 +230,7 @@ class TextToImage(ExamplesTestsAccelerate):
  with tempfile.TemporaryDirectory() as tmpdir:
  # Run training script with checkpointing
- # max_train_steps == 7, checkpointing_steps == 2, checkpoints_total_limit == 2
+ # max_train_steps == 6, checkpointing_steps == 2, checkpoints_total_limit == 2
  # Should create checkpoints at steps 2, 4, 6
  # with checkpoint at step 2 deleted
@@ -249,7 +243,7 @@ class TextToImage(ExamplesTestsAccelerate):
  --random_flip
  --train_batch_size 1
  --gradient_accumulation_steps 1
- --max_train_steps 7
+ --max_train_steps 6
  --learning_rate 5.0e-04
  --scale_lr
  --lr_scheduler constant
@@ -263,14 +257,11 @@ class TextToImage(ExamplesTestsAccelerate):
  run_command(self._launch_args + initial_run_args)
  pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None)
- pipe(prompt, num_inference_steps=2)
+ pipe(prompt, num_inference_steps=1)
  # check checkpoint directories exist
- self.assertEqual(
-     {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-     # checkpoint-2 should have been deleted
-     {"checkpoint-4", "checkpoint-6"},
- )
+ # checkpoint-2 should have been deleted
+ self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-4", "checkpoint-6"})
  def test_text_to_image_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
  pretrained_model_name_or_path = "hf-internal-testing/tiny-stable-diffusion-pipe"
@@ -278,8 +269,8 @@ class TextToImage(ExamplesTestsAccelerate):
  with tempfile.TemporaryDirectory() as tmpdir:
  # Run training script with checkpointing
- # max_train_steps == 9, checkpointing_steps == 2
- # Should create checkpoints at steps 2, 4, 6, 8
+ # max_train_steps == 4, checkpointing_steps == 2
+ # Should create checkpoints at steps 2, 4
  initial_run_args = f"""
  examples/text_to_image/train_text_to_image.py
@@ -290,7 +281,7 @@ class TextToImage(ExamplesTestsAccelerate):
  --random_flip
  --train_batch_size 1
  --gradient_accumulation_steps 1
- --max_train_steps 9
+ --max_train_steps 4
  --learning_rate 5.0e-04
  --scale_lr
  --lr_scheduler constant
@@ -303,15 +294,15 @@ class TextToImage(ExamplesTestsAccelerate):
  run_command(self._launch_args + initial_run_args)
  pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None)
- pipe(prompt, num_inference_steps=2)
+ pipe(prompt, num_inference_steps=1)
  # check checkpoint directories exist
  self.assertEqual(
      {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-     {"checkpoint-2", "checkpoint-4", "checkpoint-6", "checkpoint-8"},
+     {"checkpoint-2", "checkpoint-4"},
  )
- # resume and we should try to checkpoint at 10, where we'll have to remove
+ # resume and we should try to checkpoint at 6, where we'll have to remove
  # checkpoint-2 and checkpoint-4 instead of just a single previous checkpoint
  resume_run_args = f"""
@@ -323,27 +314,27 @@ class TextToImage(ExamplesTestsAccelerate):
  --random_flip
  --train_batch_size 1
  --gradient_accumulation_steps 1
- --max_train_steps 11
+ --max_train_steps 8
  --learning_rate 5.0e-04
  --scale_lr
  --lr_scheduler constant
  --lr_warmup_steps 0
  --output_dir {tmpdir}
  --checkpointing_steps=2
- --resume_from_checkpoint=checkpoint-8
- --checkpoints_total_limit=3
+ --resume_from_checkpoint=checkpoint-4
+ --checkpoints_total_limit=2
  --seed=0
  """.split()
  run_command(self._launch_args + resume_run_args)
  pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None)
- pipe(prompt, num_inference_steps=2)
+ pipe(prompt, num_inference_steps=1)
  # check checkpoint directories exist
  self.assertEqual(
      {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-     {"checkpoint-6", "checkpoint-8", "checkpoint-10"},
+     {"checkpoint-6", "checkpoint-8"},
  )
...
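The "removes multiple checkpoints" comment above is the scenario these numbers exercise: the initial 4-step run saves checkpoints 2 and 4 with no limit, and the resume run must then prune both of them while training to step 8 under `--checkpoints_total_limit=2`. A standalone check under the same oldest-first pruning assumption (whether both old checkpoints are removed at step 6 or one per save, the surviving set is identical):

```python
# Worked check: resume from checkpoint-4, train to step 8, checkpoint every
# 2 steps, keep at most 2 checkpoints (oldest pruned first).
kept = [2, 4]            # left over from the initial 4-step run (no limit set)
for step in (6, 8):      # checkpointing steps during the resume run
    kept.append(step)
    kept = kept[-2:]     # enforce --checkpoints_total_limit=2
assert {f"checkpoint-{s}" for s in kept} == {"checkpoint-6", "checkpoint-8"}
```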
@@ -41,7 +41,7 @@ class TextToImageLoRA(ExamplesTestsAccelerate):
  with tempfile.TemporaryDirectory() as tmpdir:
  # Run training script with checkpointing
- # max_train_steps == 7, checkpointing_steps == 2, checkpoints_total_limit == 2
+ # max_train_steps == 6, checkpointing_steps == 2, checkpoints_total_limit == 2
  # Should create checkpoints at steps 2, 4, 6
  # with checkpoint at step 2 deleted
@@ -52,7 +52,7 @@ class TextToImageLoRA(ExamplesTestsAccelerate):
  --resolution 64
  --train_batch_size 1
  --gradient_accumulation_steps 1
- --max_train_steps 7
+ --max_train_steps 6
  --learning_rate 5.0e-04
  --scale_lr
  --lr_scheduler constant
@@ -66,14 +66,11 @@ class TextToImageLoRA(ExamplesTestsAccelerate):
  pipe = DiffusionPipeline.from_pretrained(pipeline_path)
  pipe.load_lora_weights(tmpdir)
- pipe(prompt, num_inference_steps=2)
+ pipe(prompt, num_inference_steps=1)
  # check checkpoint directories exist
- self.assertEqual(
-     {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-     # checkpoint-2 should have been deleted
-     {"checkpoint-4", "checkpoint-6"},
- )
+ # checkpoint-2 should have been deleted
+ self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-4", "checkpoint-6"})
  def test_text_to_image_lora_checkpointing_checkpoints_total_limit(self):
  pretrained_model_name_or_path = "hf-internal-testing/tiny-stable-diffusion-pipe"
@@ -81,7 +78,7 @@ class TextToImageLoRA(ExamplesTestsAccelerate):
  with tempfile.TemporaryDirectory() as tmpdir:
  # Run training script with checkpointing
- # max_train_steps == 7, checkpointing_steps == 2, checkpoints_total_limit == 2
+ # max_train_steps == 6, checkpointing_steps == 2, checkpoints_total_limit == 2
  # Should create checkpoints at steps 2, 4, 6
  # with checkpoint at step 2 deleted
@@ -94,7 +91,7 @@ class TextToImageLoRA(ExamplesTestsAccelerate):
  --random_flip
  --train_batch_size 1
  --gradient_accumulation_steps 1
- --max_train_steps 7
+ --max_train_steps 6
  --learning_rate 5.0e-04
  --scale_lr
  --lr_scheduler constant
@@ -112,14 +109,11 @@ class TextToImageLoRA(ExamplesTestsAccelerate):
  "hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None
  )
  pipe.load_lora_weights(tmpdir)
- pipe(prompt, num_inference_steps=2)
+ pipe(prompt, num_inference_steps=1)
  # check checkpoint directories exist
- self.assertEqual(
-     {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-     # checkpoint-2 should have been deleted
-     {"checkpoint-4", "checkpoint-6"},
- )
+ # checkpoint-2 should have been deleted
+ self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-4", "checkpoint-6"})
  def test_text_to_image_lora_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
  pretrained_model_name_or_path = "hf-internal-testing/tiny-stable-diffusion-pipe"
@@ -127,8 +121,8 @@ class TextToImageLoRA(ExamplesTestsAccelerate):
  with tempfile.TemporaryDirectory() as tmpdir:
  # Run training script with checkpointing
- # max_train_steps == 9, checkpointing_steps == 2
- # Should create checkpoints at steps 2, 4, 6, 8
+ # max_train_steps == 4, checkpointing_steps == 2
+ # Should create checkpoints at steps 2, 4
  initial_run_args = f"""
  examples/text_to_image/train_text_to_image_lora.py
@@ -139,7 +133,7 @@ class TextToImageLoRA(ExamplesTestsAccelerate):
  --random_flip
  --train_batch_size 1
  --gradient_accumulation_steps 1
- --max_train_steps 9
+ --max_train_steps 4
  --learning_rate 5.0e-04
  --scale_lr
  --lr_scheduler constant
@@ -156,15 +150,15 @@ class TextToImageLoRA(ExamplesTestsAccelerate):
  "hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None
  )
  pipe.load_lora_weights(tmpdir)
- pipe(prompt, num_inference_steps=2)
+ pipe(prompt, num_inference_steps=1)
  # check checkpoint directories exist
  self.assertEqual(
      {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-     {"checkpoint-2", "checkpoint-4", "checkpoint-6", "checkpoint-8"},
+     {"checkpoint-2", "checkpoint-4"},
  )
- # resume and we should try to checkpoint at 10, where we'll have to remove
+ # resume and we should try to checkpoint at 6, where we'll have to remove
  # checkpoint-2 and checkpoint-4 instead of just a single previous checkpoint
  resume_run_args = f"""
@@ -176,15 +170,15 @@ class TextToImageLoRA(ExamplesTestsAccelerate):
  --random_flip
  --train_batch_size 1
  --gradient_accumulation_steps 1
- --max_train_steps 11
+ --max_train_steps 8
  --learning_rate 5.0e-04
  --scale_lr
  --lr_scheduler constant
  --lr_warmup_steps 0
  --output_dir {tmpdir}
  --checkpointing_steps=2
- --resume_from_checkpoint=checkpoint-8
- --checkpoints_total_limit=3
+ --resume_from_checkpoint=checkpoint-4
+ --checkpoints_total_limit=2
  --seed=0
  --num_validation_images=0
  """.split()
@@ -195,12 +189,12 @@ class TextToImageLoRA(ExamplesTestsAccelerate):
  "hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None
  )
  pipe.load_lora_weights(tmpdir)
- pipe(prompt, num_inference_steps=2)
+ pipe(prompt, num_inference_steps=1)
  # check checkpoint directories exist
  self.assertEqual(
      {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-     {"checkpoint-6", "checkpoint-8", "checkpoint-10"},
+     {"checkpoint-6", "checkpoint-8"},
  )
@@ -272,7 +266,7 @@ class TextToImageLoRASDXL(ExamplesTestsAccelerate):
  with tempfile.TemporaryDirectory() as tmpdir:
  # Run training script with checkpointing
- # max_train_steps == 7, checkpointing_steps == 2, checkpoints_total_limit == 2
+ # max_train_steps == 6, checkpointing_steps == 2, checkpoints_total_limit == 2
  # Should create checkpoints at steps 2, 4, 6
  # with checkpoint at step 2 deleted
@@ -283,7 +277,7 @@ class TextToImageLoRASDXL(ExamplesTestsAccelerate):
  --resolution 64
  --train_batch_size 1
  --gradient_accumulation_steps 1
- --max_train_steps 7
+ --max_train_steps 6
  --learning_rate 5.0e-04
  --scale_lr
  --lr_scheduler constant
@@ -298,11 +292,8 @@ class TextToImageLoRASDXL(ExamplesTestsAccelerate):
  pipe = DiffusionPipeline.from_pretrained(pipeline_path)
  pipe.load_lora_weights(tmpdir)
- pipe(prompt, num_inference_steps=2)
+ pipe(prompt, num_inference_steps=1)
  # check checkpoint directories exist
- self.assertEqual(
-     {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-     # checkpoint-2 should have been deleted
-     {"checkpoint-4", "checkpoint-6"},
- )
+ # checkpoint-2 should have been deleted
+ self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-4", "checkpoint-6"})
@@ -40,8 +40,6 @@ class TextualInversion(ExamplesTestsAccelerate):
  --learnable_property object
  --placeholder_token <cat-toy>
  --initializer_token a
- --validation_prompt <cat-toy>
- --validation_steps 1
  --save_steps 1
  --num_vectors 2
  --resolution 64
@@ -68,8 +66,6 @@ class TextualInversion(ExamplesTestsAccelerate):
  --learnable_property object
  --placeholder_token <cat-toy>
  --initializer_token a
- --validation_prompt <cat-toy>
- --validation_steps 1
  --save_steps 1
  --num_vectors 2
  --resolution 64
@@ -102,14 +98,12 @@ class TextualInversion(ExamplesTestsAccelerate):
  --learnable_property object
  --placeholder_token <cat-toy>
  --initializer_token a
- --validation_prompt <cat-toy>
- --validation_steps 1
  --save_steps 1
  --num_vectors 2
  --resolution 64
  --train_batch_size 1
  --gradient_accumulation_steps 1
- --max_train_steps 3
+ --max_train_steps 2
  --learning_rate 5.0e-04
  --scale_lr
  --lr_scheduler constant
@@ -123,7 +117,7 @@ class TextualInversion(ExamplesTestsAccelerate):
  # check checkpoint directories exist
  self.assertEqual(
      {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-     {"checkpoint-1", "checkpoint-2", "checkpoint-3"},
+     {"checkpoint-1", "checkpoint-2"},
  )
  resume_run_args = f"""
@@ -133,21 +127,19 @@ class TextualInversion(ExamplesTestsAccelerate):
  --learnable_property object
  --placeholder_token <cat-toy>
  --initializer_token a
- --validation_prompt <cat-toy>
- --validation_steps 1
  --save_steps 1
  --num_vectors 2
  --resolution 64
  --train_batch_size 1
  --gradient_accumulation_steps 1
- --max_train_steps 4
+ --max_train_steps 2
  --learning_rate 5.0e-04
  --scale_lr
  --lr_scheduler constant
  --lr_warmup_steps 0
  --output_dir {tmpdir}
  --checkpointing_steps=1
- --resume_from_checkpoint=checkpoint-3
+ --resume_from_checkpoint=checkpoint-2
  --checkpoints_total_limit=2
  """.split()
@@ -156,5 +148,5 @@ class TextualInversion(ExamplesTestsAccelerate):
  # check checkpoint directories exist
  self.assertEqual(
      {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-     {"checkpoint-3", "checkpoint-4"},
+     {"checkpoint-2", "checkpoint-3"},
  )
@@ -90,10 +90,10 @@ class Unconditional(ExamplesTestsAccelerate):
  --train_batch_size 1
  --num_epochs 1
  --gradient_accumulation_steps 1
- --ddpm_num_inference_steps 2
+ --ddpm_num_inference_steps 1
  --learning_rate 1e-3
  --lr_warmup_steps 5
- --checkpointing_steps=1
+ --checkpointing_steps=2
  """.split()
  run_command(self._launch_args + initial_run_args)
@@ -101,7 +101,7 @@ class Unconditional(ExamplesTestsAccelerate):
  # check checkpoint directories exist
  self.assertEqual(
      {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-     {"checkpoint-1", "checkpoint-2", "checkpoint-3", "checkpoint-4", "checkpoint-5", "checkpoint-6"},
+     {"checkpoint-2", "checkpoint-4", "checkpoint-6"},
  )
  resume_run_args = f"""
@@ -113,12 +113,12 @@ class Unconditional(ExamplesTestsAccelerate):
  --train_batch_size 1
  --num_epochs 2
  --gradient_accumulation_steps 1
- --ddpm_num_inference_steps 2
+ --ddpm_num_inference_steps 1
  --learning_rate 1e-3
  --lr_warmup_steps 5
  --resume_from_checkpoint=checkpoint-6
  --checkpointing_steps=2
- --checkpoints_total_limit=3
+ --checkpoints_total_limit=2
  """.split()
  run_command(self._launch_args + resume_run_args)
@@ -126,5 +126,5 @@ class Unconditional(ExamplesTestsAccelerate):
  # check checkpoint directories exist
  self.assertEqual(
      {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-     {"checkpoint-8", "checkpoint-10", "checkpoint-12"},
+     {"checkpoint-10", "checkpoint-12"},
  )
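The unconditional test derives its step count from epochs rather than `--max_train_steps`: judging by the asserted sets, the tiny dataset yields six optimizer steps per epoch, so the one-epoch initial run with `--checkpointing_steps=2` leaves {checkpoint-2, checkpoint-4, checkpoint-6}. A standalone check of the resume expectation, with the six-steps-per-epoch figure inferred from the assertions rather than measured:

```python
# Resume from checkpoint-6 and train to 2 epochs (12 steps, assuming 6 steps
# per epoch), checkpointing every 2 steps with a total limit of 2.
kept = [2, 4, 6]          # from the one-epoch initial run
for step in range(7, 13):
    if step % 2 == 0:     # --checkpointing_steps=2
        kept.append(step)
        kept = kept[-2:]  # --checkpoints_total_limit=2
assert {f"checkpoint-{s}" for s in kept} == {"checkpoint-10", "checkpoint-12"}
```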