fix for #7365, prevent pipelines from overriding provided prompt embeds (#7926)

* fix for #7365, prevent pipelines from overriding provided prompt embeds * fix-copies * fix implementation * update --------- Co-authored-by: bghira <bghira@users.github.com> Co-authored-by: Aryan <aryan@huggingface.co> Co-authored-by: sayakpaul <spsayakpaul@gmail.com>

fix for #7365, prevent pipelines from overriding provided prompt embeds (#7926)
* fix for #7365, prevent pipelines from overriding provided prompt embeds * fix-copies * fix implementation * update --------- Co-authored-by: bghira <bghira@users.github.com> Co-authored-by: Aryan <aryan@huggingface.co> Co-authored-by: sayakpaul <spsayakpaul@gmail.com>
a0acbdc9 · Bagheera · GitHub · 5655b22e · a0acbdc9 · a0acbdc9
Unverified Commit a0acbdc9 authored Jan 08, 2025 by Bagheera Committed by GitHub Jan 08, 2025
20 changed files
--- a/examples/community/lpw_stable_diffusion_xl.py
+++ b/examples/community/lpw_stable_diffusion_xl.py
@@ -827,7 +827,9 @@ class SDXLLongPromptWeightingPipeline(
                )

                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
                    pooled_prompt_embeds = prompt_embeds[0]
+
                prompt_embeds = prompt_embeds.hidden_states[-2]

                prompt_embeds_list.append(prompt_embeds)
@@ -879,6 +881,7 @@ class SDXLLongPromptWeightingPipeline(
                    output_hidden_states=True,
                )
                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
                    negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]


--- a/examples/community/pipeline_demofusion_sdxl.py
+++ b/examples/community/pipeline_demofusion_sdxl.py
@@ -290,7 +290,9 @@ class DemoFusionSDXLPipeline(
                )

                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
                    pooled_prompt_embeds = prompt_embeds[0]
+
                prompt_embeds = prompt_embeds.hidden_states[-2]

                prompt_embeds_list.append(prompt_embeds)
@@ -342,6 +344,7 @@ class DemoFusionSDXLPipeline(
                    output_hidden_states=True,
                )
                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
                    negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]


--- a/examples/community/pipeline_sdxl_style_aligned.py
+++ b/examples/community/pipeline_sdxl_style_aligned.py
@@ -628,7 +628,9 @@ class StyleAlignedSDXLPipeline(
                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)

                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
                    pooled_prompt_embeds = prompt_embeds[0]
+
                if clip_skip is None:
                    prompt_embeds = prompt_embeds.hidden_states[-2]
                else:
@@ -688,6 +690,7 @@ class StyleAlignedSDXLPipeline(
                    output_hidden_states=True,
                )
                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
                    negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]


--- a/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter.py
+++ b/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter.py
@@ -359,7 +359,9 @@ class StableDiffusionXLControlNetAdapterPipeline(
                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)

                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
                    pooled_prompt_embeds = prompt_embeds[0]
+
                if clip_skip is None:
                    prompt_embeds = prompt_embeds.hidden_states[-2]
                else:
@@ -419,6 +421,7 @@ class StableDiffusionXLControlNetAdapterPipeline(
                    output_hidden_states=True,
                )
                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
                    negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]


--- a/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py
+++ b/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py
@@ -507,7 +507,9 @@ class StableDiffusionXLControlNetAdapterInpaintPipeline(
                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)

                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
                    pooled_prompt_embeds = prompt_embeds[0]
+
                if clip_skip is None:
                    prompt_embeds = prompt_embeds.hidden_states[-2]
                else:
@@ -567,6 +569,7 @@ class StableDiffusionXLControlNetAdapterInpaintPipeline(
                    output_hidden_states=True,
                )
                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
                    negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]


--- a/examples/community/pipeline_stable_diffusion_xl_differential_img2img.py
+++ b/examples/community/pipeline_stable_diffusion_xl_differential_img2img.py
@@ -394,7 +394,9 @@ class StableDiffusionXLDifferentialImg2ImgPipeline(
                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)

                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
                    pooled_prompt_embeds = prompt_embeds[0]
+
                if clip_skip is None:
                    prompt_embeds = prompt_embeds.hidden_states[-2]
                else:
@@ -454,6 +456,7 @@ class StableDiffusionXLDifferentialImg2ImgPipeline(
                    output_hidden_states=True,
                )
                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
                    negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]


--- a/examples/community/pipeline_stable_diffusion_xl_ipex.py
+++ b/examples/community/pipeline_stable_diffusion_xl_ipex.py
@@ -390,7 +390,9 @@ class StableDiffusionXLPipelineIpex(
                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)

                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
                    pooled_prompt_embeds = prompt_embeds[0]
+
                if clip_skip is None:
                    prompt_embeds = prompt_embeds.hidden_states[-2]
                else:
@@ -450,6 +452,7 @@ class StableDiffusionXLPipelineIpex(
                    output_hidden_states=True,
                )
                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
                    negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]


--- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py
+++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py
@@ -438,7 +438,9 @@ class AnimateDiffSDXLPipeline(
                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)

                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
                    pooled_prompt_embeds = prompt_embeds[0]
+
                if clip_skip is None:
                    prompt_embeds = prompt_embeds.hidden_states[-2]
                else:
@@ -497,7 +499,9 @@ class AnimateDiffSDXLPipeline(
                    uncond_input.input_ids.to(device),
                    output_hidden_states=True,
                )
+
                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
                    negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]


--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py
@@ -406,7 +406,9 @@ class StableDiffusionXLControlNetInpaintPipeline(
                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)

                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
                    pooled_prompt_embeds = prompt_embeds[0]
+
                if clip_skip is None:
                    prompt_embeds = prompt_embeds.hidden_states[-2]
                else:
@@ -465,7 +467,9 @@ class StableDiffusionXLControlNetInpaintPipeline(
                    uncond_input.input_ids.to(device),
                    output_hidden_states=True,
                )
+
                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
                    negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]


--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py
@@ -415,7 +415,9 @@ class StableDiffusionXLControlNetPipeline(
                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)

                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
                    pooled_prompt_embeds = prompt_embeds[0]
+
                if clip_skip is None:
                    prompt_embeds = prompt_embeds.hidden_states[-2]
                else:
@@ -474,7 +476,9 @@ class StableDiffusionXLControlNetPipeline(
                    uncond_input.input_ids.to(device),
                    output_hidden_states=True,
                )
+
                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
                    negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]


--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py
@@ -408,7 +408,9 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)

                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
                    pooled_prompt_embeds = prompt_embeds[0]
+
                if clip_skip is None:
                    prompt_embeds = prompt_embeds.hidden_states[-2]
                else:
@@ -467,7 +469,9 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
                    uncond_input.input_ids.to(device),
                    output_hidden_states=True,
                )
+
                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
                    negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]


--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_union_inpaint_sd_xl.py
@@ -388,7 +388,9 @@ class StableDiffusionXLControlNetUnionInpaintPipeline(
                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)

                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
                    pooled_prompt_embeds = prompt_embeds[0]
+
                if clip_skip is None:
                    prompt_embeds = prompt_embeds.hidden_states[-2]
                else:
@@ -447,7 +449,9 @@ class StableDiffusionXLControlNetUnionInpaintPipeline(
                    uncond_input.input_ids.to(device),
                    output_hidden_states=True,
                )
+
                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
                    negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]


--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl.py
@@ -397,7 +397,9 @@ class StableDiffusionXLControlNetUnionPipeline(
                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)

                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
                    pooled_prompt_embeds = prompt_embeds[0]
+
                if clip_skip is None:
                    prompt_embeds = prompt_embeds.hidden_states[-2]
                else:
@@ -456,7 +458,9 @@ class StableDiffusionXLControlNetUnionPipeline(
                    uncond_input.input_ids.to(device),
                    output_hidden_states=True,
                )
+
                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
                    negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]


--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_union_sd_xl_img2img.py
@@ -422,7 +422,9 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)

                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
                    pooled_prompt_embeds = prompt_embeds[0]
+
                if clip_skip is None:
                    prompt_embeds = prompt_embeds.hidden_states[-2]
                else:
@@ -481,7 +483,9 @@ class StableDiffusionXLControlNetUnionImg2ImgPipeline(
                    uncond_input.input_ids.to(device),
                    output_hidden_states=True,
                )
+
                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
                    negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]


--- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py
+++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py
@@ -336,7 +336,9 @@ class StableDiffusionXLControlNetXSPipeline(
                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)

                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
                    pooled_prompt_embeds = prompt_embeds[0]
+
                if clip_skip is None:
                    prompt_embeds = prompt_embeds.hidden_states[-2]
                else:
@@ -395,7 +397,9 @@ class StableDiffusionXLControlNetXSPipeline(
                    uncond_input.input_ids.to(device),
                    output_hidden_states=True,
                )
+
                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
                    negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]


--- a/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py
+++ b/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl.py
@@ -421,7 +421,9 @@ class StableDiffusionXLControlNetPAGPipeline(
                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)

                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
                    pooled_prompt_embeds = prompt_embeds[0]
+
                if clip_skip is None:
                    prompt_embeds = prompt_embeds.hidden_states[-2]
                else:
@@ -480,7 +482,9 @@ class StableDiffusionXLControlNetPAGPipeline(
                    uncond_input.input_ids.to(device),
                    output_hidden_states=True,
                )
+
                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
                    negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]


--- a/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py
+++ b/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py
@@ -413,7 +413,9 @@ class StableDiffusionXLControlNetPAGImg2ImgPipeline(
                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)

                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
                    pooled_prompt_embeds = prompt_embeds[0]
+
                if clip_skip is None:
                    prompt_embeds = prompt_embeds.hidden_states[-2]
                else:
@@ -472,7 +474,9 @@ class StableDiffusionXLControlNetPAGImg2ImgPipeline(
                    uncond_input.input_ids.to(device),
                    output_hidden_states=True,
                )
+
                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
                    negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]


--- a/src/diffusers/pipelines/pag/pipeline_pag_sd_xl.py
+++ b/src/diffusers/pipelines/pag/pipeline_pag_sd_xl.py
@@ -415,7 +415,9 @@ class StableDiffusionXLPAGPipeline(
                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)

                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
                    pooled_prompt_embeds = prompt_embeds[0]
+
                if clip_skip is None:
                    prompt_embeds = prompt_embeds.hidden_states[-2]
                else:
@@ -474,7 +476,9 @@ class StableDiffusionXLPAGPipeline(
                    uncond_input.input_ids.to(device),
                    output_hidden_states=True,
                )
+
                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
                    negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]


--- a/src/diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py
+++ b/src/diffusers/pipelines/pag/pipeline_pag_sd_xl_img2img.py
@@ -436,7 +436,9 @@ class StableDiffusionXLPAGImg2ImgPipeline(
                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)

                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
                    pooled_prompt_embeds = prompt_embeds[0]
+
                if clip_skip is None:
                    prompt_embeds = prompt_embeds.hidden_states[-2]
                else:
@@ -495,7 +497,9 @@ class StableDiffusionXLPAGImg2ImgPipeline(
                    uncond_input.input_ids.to(device),
                    output_hidden_states=True,
                )
+
                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
                    negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]


--- a/src/diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py
+++ b/src/diffusers/pipelines/pag/pipeline_pag_sd_xl_inpaint.py
@@ -526,7 +526,9 @@ class StableDiffusionXLPAGInpaintPipeline(
                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)

                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
                    pooled_prompt_embeds = prompt_embeds[0]
+
                if clip_skip is None:
                    prompt_embeds = prompt_embeds.hidden_states[-2]
                else:
@@ -585,7 +587,9 @@ class StableDiffusionXLPAGInpaintPipeline(
                    uncond_input.input_ids.to(device),
                    output_hidden_states=True,
                )
+
                # We are only ALWAYS interested in the pooled output of the final text encoder
+                if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
                    negative_pooled_prompt_embeds = negative_prompt_embeds[0]
                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]