Unverified commit 8b451eb6, authored by Patrick von Platen, committed by GitHub

Fix config prints and save, load of pipelines (#2849)

* [Config] Fix config prints and save, load

* Only use potential nn.Modules for dtype and device

* Correct vae image processor

* make sure in_channels is not accessed directly

* make sure in channels is only accessed via config

* Make sure schedulers only access config attributes

* Make sure to access config in SAG

* Fix vae processor and make style

* add tests

* uP

* make style

* Fix more naming issues

* Final fix with vae config

* change more
Parent commit: 83691967
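Every hunk below applies the same rule: model and scheduler hyperparameters must be read from the frozen `config` object rather than directly off the module. A minimal sketch of the two access styles (assumes Hub access and the public `google/ddpm-cat-256` checkpoint; any `UNet2DModel` checkpoint behaves the same):

```python
from diffusers import UNet2DModel

# Load the UNet from a pipeline repo (subfolder="unet" because the
# checkpoint is stored as a full DDPM pipeline).
unet = UNet2DModel.from_pretrained("google/ddpm-cat-256", subfolder="unet")

# Deprecated pattern this commit removes: reading hyperparameters
# directly off the nn.Module.
# in_channels = unet.in_channels

# Corrected pattern: hyperparameters live on the config.
in_channels = unet.config.in_channels
sample_size = unet.config.sample_size
print(in_channels, sample_size)
```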
@@ -344,7 +344,7 @@ Now you can wrap all these components together in a training loop with 🤗 Acce
 ...     # Sample a random timestep for each image
 ...     timesteps = torch.randint(
-...         0, noise_scheduler.num_train_timesteps, (bs,), device=clean_images.device
+...         0, noise_scheduler.config.num_train_timesteps, (bs,), device=clean_images.device
 ...     ).long()
 ...     # Add noise to the clean images according to the noise magnitude at each timestep
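The same rule covers schedulers: `num_train_timesteps` is a config attribute, not a module attribute. A minimal sketch of the corrected sampling call (the batch size `bs = 4` is a placeholder):

```python
import torch
from diffusers import DDPMScheduler

noise_scheduler = DDPMScheduler(num_train_timesteps=1000)
bs = 4  # placeholder batch size

# Read the training horizon from the scheduler config, as in the hunk above.
timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bs,)).long()
```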
@@ -62,7 +62,7 @@ class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
     def __call__(self):
         image = torch.randn(
-            (1, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size),
+            (1, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size),
         )
         timestep = 1

@@ -108,7 +108,7 @@ class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
     def __call__(self):
         image = torch.randn(
-            (1, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size),
+            (1, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size),
         )
         timestep = 1
@@ -89,7 +89,9 @@ class MyPipeline(DiffusionPipeline):
     @torch.no_grad()
     def __call__(self, batch_size: int = 1, num_inference_steps: int = 50):
         # Sample gaussian noise to begin loop
-        image = torch.randn((batch_size, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size))
+        image = torch.randn(
+            (batch_size, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size)
+        )
         image = image.to(self.device)
@@ -238,7 +238,7 @@ class BitDiffusion(DiffusionPipeline):
         **kwargs,
     ) -> Union[Tuple, ImagePipelineOutput]:
         latents = torch.randn(
-            (batch_size, self.unet.in_channels, height, width),
+            (batch_size, self.unet.config.in_channels, height, width),
             generator=generator,
         )
         latents = decimal_to_bits(latents) * self.bit_scale
@@ -254,7 +254,7 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline):
         # Unlike in other pipelines, latents need to be generated in the target device
         # for 1-to-1 results reproducibility with the CompVis implementation.
         # However this currently doesn't work in `mps`.
-        latents_shape = (batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8)
+        latents_shape = (batch_size * num_images_per_prompt, self.unet.config.in_channels, height // 8, width // 8)
         latents_dtype = text_embeddings.dtype
         if latents is None:
             if self.device.type == "mps":
@@ -414,7 +414,7 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline):
         # Unlike in other pipelines, latents need to be generated in the target device
         # for 1-to-1 results reproducibility with the CompVis implementation.
         # However this currently doesn't work in `mps`.
-        latents_shape = (batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8)
+        latents_shape = (batch_size * num_images_per_prompt, self.unet.config.in_channels, height // 8, width // 8)
         latents_dtype = text_embeddings.dtype
         if latents is None:
             if self.device.type == "mps":
@@ -513,7 +513,7 @@ class ComposableStableDiffusionPipeline(DiffusionPipeline):
         timesteps = self.scheduler.timesteps

         # 5. Prepare latent variables
-        num_channels_latents = self.unet.in_channels
+        num_channels_latents = self.unet.config.in_channels
         latents = self.prepare_latents(
             batch_size * num_images_per_prompt,
             num_channels_latents,
@@ -424,7 +424,7 @@ class ImagicStableDiffusionPipeline(DiffusionPipeline):
         # Unlike in other pipelines, latents need to be generated in the target device
         # for 1-to-1 results reproducibility with the CompVis implementation.
         # However this currently doesn't work in `mps`.
-        latents_shape = (1, self.unet.in_channels, height // 8, width // 8)
+        latents_shape = (1, self.unet.config.in_channels, height // 8, width // 8)
         latents_dtype = text_embeddings.dtype
         if self.device.type == "mps":
             # randn does not exist on mps
@@ -320,7 +320,7 @@ class StableDiffusionWalkPipeline(DiffusionPipeline):
         # Unlike in other pipelines, latents need to be generated in the target device
         # for 1-to-1 results reproducibility with the CompVis implementation.
         # However this currently doesn't work in `mps`.
-        latents_shape = (batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8)
+        latents_shape = (batch_size * num_images_per_prompt, self.unet.config.in_channels, height // 8, width // 8)
         latents_dtype = text_embeddings.dtype
         if latents is None:
             if self.device.type == "mps":

@@ -416,7 +416,7 @@ class StableDiffusionWalkPipeline(DiffusionPipeline):
     def get_noise(self, seed, dtype=torch.float32, height=512, width=512):
         """Takes in random seed and returns corresponding noise vector"""
         return torch.randn(
-            (1, self.unet.in_channels, height // 8, width // 8),
+            (1, self.unet.config.in_channels, height // 8, width // 8),
             generator=torch.Generator(device=self.device).manual_seed(seed),
             device=self.device,
             dtype=dtype,
@@ -627,7 +627,7 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
         if image is None:
             shape = (
                 batch_size,
-                self.unet.in_channels,
+                self.unet.config.in_channels,
                 height // self.vae_scale_factor,
                 width // self.vae_scale_factor,
             )
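The `vae_scale_factor` used above is itself derived from a config attribute. A sketch of the derivation as diffusers pipelines compute it (the `stabilityai/sd-vae-ft-mse` checkpoint is an assumption; any `AutoencoderKL` works):

```python
from diffusers import AutoencoderKL

# Load a standalone VAE checkpoint (Hub access assumed).
vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse")

# Each extra entry in block_out_channels adds one 2x downsampling stage,
# so the spatial scale factor is 2 ** (num_blocks - 1), typically 8.
vae_scale_factor = 2 ** (len(vae.config.block_out_channels) - 1)
print(vae_scale_factor)
```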
@@ -486,7 +486,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(OnnxStableDiffusionPipeline
         self.__init__additional__()

     def __init__additional__(self):
-        self.unet_in_channels = 4
+        self.unet.config.in_channels = 4
         self.vae_scale_factor = 8

     def _encode_prompt(

@@ -621,7 +621,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(OnnxStableDiffusionPipeline
         if image is None:
             shape = (
                 batch_size,
-                self.unet_in_channels,
+                self.unet.config.in_channels,
                 height // self.vae_scale_factor,
                 width // self.vae_scale_factor,
             )
@@ -93,7 +93,7 @@ class MagicMixPipeline(DiffusionPipeline):
         torch.manual_seed(seed)
         noise = torch.randn(
-            (1, self.unet.in_channels, height // 8, width // 8),
+            (1, self.unet.config.in_channels, height // 8, width // 8),
         ).to(self.device)

         latents = self.scheduler.add_noise(
@@ -355,7 +355,7 @@ class MultilingualStableDiffusion(DiffusionPipeline):
         # Unlike in other pipelines, latents need to be generated in the target device
         # for 1-to-1 results reproducibility with the CompVis implementation.
         # However this currently doesn't work in `mps`.
-        latents_shape = (batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8)
+        latents_shape = (batch_size * num_images_per_prompt, self.unet.config.in_channels, height // 8, width // 8)
         latents_dtype = text_embeddings.dtype
         if latents is None:
             if self.device.type == "mps":
@@ -433,7 +433,7 @@ class StableDiffusionPipeline(DiffusionPipeline):
             sigmas = sigmas.to(text_embeddings.dtype)

         # 5. Prepare latent variables
-        num_channels_latents = self.unet.in_channels
+        num_channels_latents = self.unet.config.in_channels
         latents = self.prepare_latents(
             batch_size * num_images_per_prompt,
             num_channels_latents,
@@ -262,8 +262,8 @@ class SeedResizeStableDiffusionPipeline(DiffusionPipeline):
         # Unlike in other pipelines, latents need to be generated in the target device
         # for 1-to-1 results reproducibility with the CompVis implementation.
         # However this currently doesn't work in `mps`.
-        latents_shape = (batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8)
-        latents_shape_reference = (batch_size * num_images_per_prompt, self.unet.in_channels, 64, 64)
+        latents_shape = (batch_size * num_images_per_prompt, self.unet.config.in_channels, height // 8, width // 8)
+        latents_shape_reference = (batch_size * num_images_per_prompt, self.unet.config.in_channels, 64, 64)
         latents_dtype = text_embeddings.dtype
         if latents is None:
             if self.device.type == "mps":
@@ -190,7 +190,7 @@ class SpeechToImagePipeline(DiffusionPipeline):
         # Unlike in other pipelines, latents need to be generated in the target device
         # for 1-to-1 results reproducibility with the CompVis implementation.
         # However this currently doesn't work in `mps`.
-        latents_shape = (batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8)
+        latents_shape = (batch_size * num_images_per_prompt, self.unet.config.in_channels, height // 8, width // 8)
         latents_dtype = text_embeddings.dtype
         if latents is None:
             if self.device.type == "mps":
@@ -337,7 +337,7 @@ class WildcardStableDiffusionPipeline(DiffusionPipeline):
         # Unlike in other pipelines, latents need to be generated in the target device
         # for 1-to-1 results reproducibility with the CompVis implementation.
         # However this currently doesn't work in `mps`.
-        latents_shape = (batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8)
+        latents_shape = (batch_size * num_images_per_prompt, self.unet.config.in_channels, height // 8, width // 8)
         latents_dtype = text_embeddings.dtype
         if latents is None:
             if self.device.type == "mps":
@@ -794,7 +794,7 @@ def main():
                 noise = torch.randn_like(latents)
                 bsz = latents.shape[0]
                 # Sample a random timestep for each image
-                timesteps = torch.randint(0, noise_scheduler.num_train_timesteps, (bsz,), device=latents.device)
+                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
                 timesteps = timesteps.long()

                 # Add noise to the latents according to the noise magnitude at each timestep
@@ -794,7 +794,7 @@ def main():
                 noise = torch.randn_like(latents)
                 bsz = latents.shape[0]
                 # Sample a random timestep for each image
-                timesteps = torch.randint(0, noise_scheduler.num_train_timesteps, (bsz,), device=latents.device)
+                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
                 timesteps = timesteps.long()

                 # Add noise to the latents according to the noise magnitude at each timestep
@@ -641,7 +641,7 @@ def main():
                 noise = torch.randn_like(latents)
                 bsz = latents.shape[0]
                 # Sample a random timestep for each image
-                timesteps = torch.randint(0, noise_scheduler.num_train_timesteps, (bsz,), device=latents.device)
+                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
                 timesteps = timesteps.long()

                 # Add noise to the latents according to the noise magnitude at each timestep