Unverified commit 8b451eb6, authored by Patrick von Platen, committed by GitHub

Fix config prints and save, load of pipelines (#2849)

* [Config] Fix config prints and save, load

* Only use potential nn.Modules for dtype and device

* Correct vae image processor

* make sure in_channels is not accessed directly

* make sure in channels is only accessed via config

* Make sure schedulers only access config attributes

* Make sure to access config in SAG

* Fix vae processor and make style

* add tests

* uP

* make style

* Fix more naming issues

* Final fix with vae config

* change more
Parent commit: 83691967
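Every hunk below applies the same rule: model and scheduler hyperparameters must be read from the frozen `config` object rather than directly off the module. A minimal sketch of the two access styles (assumes Hub access and the public `google/ddpm-cat-256` checkpoint; any `UNet2DModel` checkpoint behaves the same):

```python
from diffusers import UNet2DModel

# Load the UNet from a pipeline repo (subfolder="unet" because the
# checkpoint is stored as a full DDPM pipeline).
unet = UNet2DModel.from_pretrained("google/ddpm-cat-256", subfolder="unet")

# Deprecated pattern this commit removes: reading hyperparameters
# directly off the nn.Module.
# in_channels = unet.in_channels

# Corrected pattern: hyperparameters live on the config.
in_channels = unet.config.in_channels
sample_size = unet.config.sample_size
print(in_channels, sample_size)
```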
@@ -344,7 +344,7 @@ Now you can wrap all these components together in a training loop with 🤗 Acce
 ...     # Sample a random timestep for each image
 ...     timesteps = torch.randint(
-...         0, noise_scheduler.num_train_timesteps, (bs,), device=clean_images.device
+...         0, noise_scheduler.config.num_train_timesteps, (bs,), device=clean_images.device
 ...     ).long()
 ...     # Add noise to the clean images according to the noise magnitude at each timestep
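The same rule covers schedulers: `num_train_timesteps` is a config attribute, not a module attribute. A minimal sketch of the corrected sampling call (the batch size `bs = 4` is a placeholder):

```python
import torch
from diffusers import DDPMScheduler

noise_scheduler = DDPMScheduler(num_train_timesteps=1000)
bs = 4  # placeholder batch size

# Read the training horizon from the scheduler config, as in the hunk above.
timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bs,)).long()
```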
@@ -62,7 +62,7 @@ class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
     def __call__(self):
         image = torch.randn(
-            (1, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size),
+            (1, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size),
         )
         timestep = 1

@@ -108,7 +108,7 @@ class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
     def __call__(self):
         image = torch.randn(
-            (1, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size),
+            (1, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size),
         )
         timestep = 1
@@ -89,7 +89,9 @@ class MyPipeline(DiffusionPipeline):
     @torch.no_grad()
     def __call__(self, batch_size: int = 1, num_inference_steps: int = 50):
         # Sample gaussian noise to begin loop
-        image = torch.randn((batch_size, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size))
+        image = torch.randn(
+            (batch_size, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size)
+        )
         image = image.to(self.device)
@@ -238,7 +238,7 @@ class BitDiffusion(DiffusionPipeline):
         **kwargs,
     ) -> Union[Tuple, ImagePipelineOutput]:
         latents = torch.randn(
-            (batch_size, self.unet.in_channels, height, width),
+            (batch_size, self.unet.config.in_channels, height, width),
             generator=generator,
         )
         latents = decimal_to_bits(latents) * self.bit_scale
@@ -254,7 +254,7 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline):
         # Unlike in other pipelines, latents need to be generated in the target device
         # for 1-to-1 results reproducibility with the CompVis implementation.
         # However this currently doesn't work in `mps`.
-        latents_shape = (batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8)
+        latents_shape = (batch_size * num_images_per_prompt, self.unet.config.in_channels, height // 8, width // 8)
         latents_dtype = text_embeddings.dtype
         if latents is None:
             if self.device.type == "mps":
@@ -414,7 +414,7 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline):
         # Unlike in other pipelines, latents need to be generated in the target device
         # for 1-to-1 results reproducibility with the CompVis implementation.
         # However this currently doesn't work in `mps`.
-        latents_shape = (batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8)
+        latents_shape = (batch_size * num_images_per_prompt, self.unet.config.in_channels, height // 8, width // 8)
         latents_dtype = text_embeddings.dtype
         if latents is None:
             if self.device.type == "mps":
@@ -513,7 +513,7 @@ class ComposableStableDiffusionPipeline(DiffusionPipeline):
         timesteps = self.scheduler.timesteps

         # 5. Prepare latent variables
-        num_channels_latents = self.unet.in_channels
+        num_channels_latents = self.unet.config.in_channels
         latents = self.prepare_latents(
             batch_size * num_images_per_prompt,
             num_channels_latents,
@@ -424,7 +424,7 @@ class ImagicStableDiffusionPipeline(DiffusionPipeline):
         # Unlike in other pipelines, latents need to be generated in the target device
         # for 1-to-1 results reproducibility with the CompVis implementation.
         # However this currently doesn't work in `mps`.
-        latents_shape = (1, self.unet.in_channels, height // 8, width // 8)
+        latents_shape = (1, self.unet.config.in_channels, height // 8, width // 8)
         latents_dtype = text_embeddings.dtype
         if self.device.type == "mps":
             # randn does not exist on mps
@@ -320,7 +320,7 @@ class StableDiffusionWalkPipeline(DiffusionPipeline):
         # Unlike in other pipelines, latents need to be generated in the target device
         # for 1-to-1 results reproducibility with the CompVis implementation.
         # However this currently doesn't work in `mps`.
-        latents_shape = (batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8)
+        latents_shape = (batch_size * num_images_per_prompt, self.unet.config.in_channels, height // 8, width // 8)
         latents_dtype = text_embeddings.dtype
         if latents is None:
             if self.device.type == "mps":

@@ -416,7 +416,7 @@ class StableDiffusionWalkPipeline(DiffusionPipeline):
     def get_noise(self, seed, dtype=torch.float32, height=512, width=512):
         """Takes in random seed and returns corresponding noise vector"""
         return torch.randn(
-            (1, self.unet.in_channels, height // 8, width // 8),
+            (1, self.unet.config.in_channels, height // 8, width // 8),
             generator=torch.Generator(device=self.device).manual_seed(seed),
             device=self.device,
             dtype=dtype,
@@ -627,7 +627,7 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
         if image is None:
             shape = (
                 batch_size,
-                self.unet.in_channels,
+                self.unet.config.in_channels,
                 height // self.vae_scale_factor,
                 width // self.vae_scale_factor,
             )
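The `vae_scale_factor` used above is itself derived from a config attribute. A sketch of the derivation as diffusers pipelines compute it (the `stabilityai/sd-vae-ft-mse` checkpoint is an assumption; any `AutoencoderKL` works):

```python
from diffusers import AutoencoderKL

# Load a standalone VAE checkpoint (Hub access assumed).
vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse")

# Each extra entry in block_out_channels adds one 2x downsampling stage,
# so the spatial scale factor is 2 ** (num_blocks - 1), typically 8.
vae_scale_factor = 2 ** (len(vae.config.block_out_channels) - 1)
print(vae_scale_factor)
```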
@@ -486,7 +486,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(OnnxStableDiffusionPipeline
         self.__init__additional__()

     def __init__additional__(self):
-        self.unet_in_channels = 4
+        self.unet.config.in_channels = 4
         self.vae_scale_factor = 8

     def _encode_prompt(

@@ -621,7 +621,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(OnnxStableDiffusionPipeline
         if image is None:
             shape = (
                 batch_size,
-                self.unet_in_channels,
+                self.unet.config.in_channels,
                 height // self.vae_scale_factor,
                 width // self.vae_scale_factor,
             )
@@ -93,7 +93,7 @@ class MagicMixPipeline(DiffusionPipeline):
         torch.manual_seed(seed)
         noise = torch.randn(
-            (1, self.unet.in_channels, height // 8, width // 8),
+            (1, self.unet.config.in_channels, height // 8, width // 8),
         ).to(self.device)

         latents = self.scheduler.add_noise(
@@ -355,7 +355,7 @@ class MultilingualStableDiffusion(DiffusionPipeline):
         # Unlike in other pipelines, latents need to be generated in the target device
         # for 1-to-1 results reproducibility with the CompVis implementation.
         # However this currently doesn't work in `mps`.
-        latents_shape = (batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8)
+        latents_shape = (batch_size * num_images_per_prompt, self.unet.config.in_channels, height // 8, width // 8)
         latents_dtype = text_embeddings.dtype
         if latents is None:
             if self.device.type == "mps":
@@ -433,7 +433,7 @@ class StableDiffusionPipeline(DiffusionPipeline):
             sigmas = sigmas.to(text_embeddings.dtype)

         # 5. Prepare latent variables
-        num_channels_latents = self.unet.in_channels
+        num_channels_latents = self.unet.config.in_channels
         latents = self.prepare_latents(
             batch_size * num_images_per_prompt,
             num_channels_latents,
@@ -262,8 +262,8 @@ class SeedResizeStableDiffusionPipeline(DiffusionPipeline):
         # Unlike in other pipelines, latents need to be generated in the target device
         # for 1-to-1 results reproducibility with the CompVis implementation.
         # However this currently doesn't work in `mps`.
-        latents_shape = (batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8)
-        latents_shape_reference = (batch_size * num_images_per_prompt, self.unet.in_channels, 64, 64)
+        latents_shape = (batch_size * num_images_per_prompt, self.unet.config.in_channels, height // 8, width // 8)
+        latents_shape_reference = (batch_size * num_images_per_prompt, self.unet.config.in_channels, 64, 64)
         latents_dtype = text_embeddings.dtype
         if latents is None:
             if self.device.type == "mps":
@@ -190,7 +190,7 @@ class SpeechToImagePipeline(DiffusionPipeline):
         # Unlike in other pipelines, latents need to be generated in the target device
         # for 1-to-1 results reproducibility with the CompVis implementation.
         # However this currently doesn't work in `mps`.
-        latents_shape = (batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8)
+        latents_shape = (batch_size * num_images_per_prompt, self.unet.config.in_channels, height // 8, width // 8)
         latents_dtype = text_embeddings.dtype
         if latents is None:
             if self.device.type == "mps":
@@ -337,7 +337,7 @@ class WildcardStableDiffusionPipeline(DiffusionPipeline):
         # Unlike in other pipelines, latents need to be generated in the target device
         # for 1-to-1 results reproducibility with the CompVis implementation.
         # However this currently doesn't work in `mps`.
-        latents_shape = (batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8)
+        latents_shape = (batch_size * num_images_per_prompt, self.unet.config.in_channels, height // 8, width // 8)
         latents_dtype = text_embeddings.dtype
         if latents is None:
             if self.device.type == "mps":
@@ -794,7 +794,7 @@ def main():
                 noise = torch.randn_like(latents)
                 bsz = latents.shape[0]
                 # Sample a random timestep for each image
-                timesteps = torch.randint(0, noise_scheduler.num_train_timesteps, (bsz,), device=latents.device)
+                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
                 timesteps = timesteps.long()

                 # Add noise to the latents according to the noise magnitude at each timestep
@@ -794,7 +794,7 @@ def main():
                 noise = torch.randn_like(latents)
                 bsz = latents.shape[0]
                 # Sample a random timestep for each image
-                timesteps = torch.randint(0, noise_scheduler.num_train_timesteps, (bsz,), device=latents.device)
+                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
                 timesteps = timesteps.long()

                 # Add noise to the latents according to the noise magnitude at each timestep
@@ -641,7 +641,7 @@ def main():
                 noise = torch.randn_like(latents)
                 bsz = latents.shape[0]
                 # Sample a random timestep for each image
-                timesteps = torch.randint(0, noise_scheduler.num_train_timesteps, (bsz,), device=latents.device)
+                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
                 timesteps = timesteps.long()

                 # Add noise to the latents according to the noise magnitude at each timestep