fix for #7365, prevent pipelines from overriding provided prompt embeds (#7926)

* fix for #7365, prevent pipelines from overriding provided prompt embeds * fix-copies * fix implementation * update --------- Co-authored-by: bghira <bghira@users.github.com> Co-authored-by: Aryan <aryan@huggingface.co> Co-authored-by: sayakpaul <spsayakpaul@gmail.com>

fix for #7365, prevent pipelines from overriding provided prompt embeds (#7926)
* fix for #7365, prevent pipelines from overriding provided prompt embeds * fix-copies * fix implementation * update --------- Co-authored-by: bghira <bghira@users.github.com> Co-authored-by: Aryan <aryan@huggingface.co> Co-authored-by: sayakpaul <spsayakpaul@gmail.com>
a0acbdc9 · Bagheera · GitHub · 5655b22e · a0acbdc9 · a0acbdc9
Unverified Commit a0acbdc9 authored Jan 08, 2025 by Bagheera Committed by GitHub Jan 08, 2025
7 changed files
--- a/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py
@@ -321,7 +321,9 @@ class StableDiffusionXLKDiffusionPipeline(
                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
                    pooled_prompt_embeds = prompt_embeds[0]
                if clip_skip is None:
                    prompt_embeds = prompt_embeds.hidden_states[-2]
                else:
@@ -380,7 +382,9 @@ class StableDiffusionXLKDiffusionPipeline(
                    uncond_input.input_ids.to(device),
                    output_hidden_states=True,
                )
                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
                    negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]

--- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
+++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
@@ -406,7 +406,9 @@ class StableDiffusionXLPipeline(
                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
                    pooled_prompt_embeds = prompt_embeds[0]
                if clip_skip is None:
                    prompt_embeds = prompt_embeds.hidden_states[-2]
                else:
@@ -465,7 +467,9 @@ class StableDiffusionXLPipeline(
                    uncond_input.input_ids.to(device),
                    output_hidden_states=True,
                )
                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
                    negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]

--- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
@@ -427,7 +427,9 @@ class StableDiffusionXLImg2ImgPipeline(
                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
                    pooled_prompt_embeds = prompt_embeds[0]
                if clip_skip is None:
                    prompt_embeds = prompt_embeds.hidden_states[-2]
                else:
@@ -486,7 +488,9 @@ class StableDiffusionXLImg2ImgPipeline(
                    uncond_input.input_ids.to(device),
                    output_hidden_states=True,
                )
                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
                    negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]

--- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py
@@ -531,7 +531,9 @@ class StableDiffusionXLInpaintPipeline(
                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
                    pooled_prompt_embeds = prompt_embeds[0]
                if clip_skip is None:
                    prompt_embeds = prompt_embeds.hidden_states[-2]
                else:
@@ -590,7 +592,9 @@ class StableDiffusionXLInpaintPipeline(
                    uncond_input.input_ids.to(device),
                    output_hidden_states=True,
                )
                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
                    negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]

--- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py
+++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py
@@ -333,7 +333,9 @@ class StableDiffusionXLInstructPix2PixPipeline(
                )
                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
                    pooled_prompt_embeds = prompt_embeds[0]
                prompt_embeds = prompt_embeds.hidden_states[-2]
                prompt_embeds_list.append(prompt_embeds)
@@ -385,6 +387,7 @@ class StableDiffusionXLInstructPix2PixPipeline(
                    output_hidden_states=True,
                )
                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
                    negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]

--- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py
+++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py
@@ -423,7 +423,9 @@ class StableDiffusionXLAdapterPipeline(
                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
                    pooled_prompt_embeds = prompt_embeds[0]
                if clip_skip is None:
                    prompt_embeds = prompt_embeds.hidden_states[-2]
                else:
@@ -482,7 +484,9 @@ class StableDiffusionXLAdapterPipeline(
                    uncond_input.input_ids.to(device),
                    output_hidden_states=True,
                )
                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
                    negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]

--- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py
+++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py
@@ -705,7 +705,9 @@ class TextToVideoZeroSDXLPipeline(
                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
                    pooled_prompt_embeds = prompt_embeds[0]
                if clip_skip is None:
                    prompt_embeds = prompt_embeds.hidden_states[-2]
                else:
@@ -764,7 +766,9 @@ class TextToVideoZeroSDXLPipeline(
                    uncond_input.input_ids.to(device),
                    output_hidden_states=True,
                )
                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
                    negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]