Accept latents as optional input in Latent Diffusion pipeline (#1723)

* Latent Diffusion pipeline accept latents * make style * check for mps randn does not work reproducibly on mps

Accept latents as optional input in Latent Diffusion pipeline (#1723)
* Latent Diffusion pipeline accept latents * make style * check for mps randn does not work reproducibly on mps
727434c2 · Partho · GitHub · 21e61eb3 · 727434c2
Unverified Commit 727434c2 authored Dec 16, 2022 by Partho Committed by GitHub Dec 16, 2022
Show whitespace changes
Inline Side-by-side

Showing with 21 additions and 5 deletions

src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py ...s/pipelines/latent_diffusion/pipeline_latent_diffusion.py +21 -5

No files found.
--- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py
+++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py
@@ -72,6 +72,7 @@ class LDMTextToImagePipeline(DiffusionPipeline):
        guidance_scale: Optional[float] = 1.0,
        eta: Optional[float] = 0.0,
        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        **kwargs,
@@ -96,6 +97,10 @@ class LDMTextToImagePipeline(DiffusionPipeline):
            generator (`torch.Generator`, *optional*):
                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
                deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will be generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -130,10 +135,21 @@ class LDMTextToImagePipeline(DiffusionPipeline):
        text_input = self.tokenizer(prompt, padding="max_length", max_length=77, return_tensors="pt")
        text_embeddings = self.bert(text_input.input_ids.to(self.device))[0]
+        # get the initial random noise unless the user supplied it
+        latents_shape = (batch_size, self.unet.in_channels, height // 8, width // 8)
+        if latents is None:
+            if self.device.type == "mps":
+                # randn does not work reproducibly on mps
+                latents = torch.randn(latents_shape, generator=generator, device="cpu").to(self.device)
+            else:
                latents = torch.randn(
-            (batch_size, self.unet.in_channels, height // 8, width // 8),
+                    latents_shape,
                    generator=generator,
+                    device=self.device,
                )
+        else:
+            if latents.shape != latents_shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
            latents = latents.to(self.device)
        self.scheduler.set_timesteps(num_inference_steps)