"git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "2e31a759b5bd8ca2b288b5c61709636a96c4bae9"
Unverified Commit 04d696d6, authored by Sayak Paul, committed by GitHub
Browse files

[Core] Add support for CLIP-skip (#4901)



* add support for clip skip

* fix condition

* fix

* add clip_output_layer_to_default

* expose

* remove the previous functions.

* correct condition.

* apply final layer norm

* address feedback

* Apply suggestions from code review
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* refactor clip_skip.

* port to the other pipelines.

* fix copies one more time

---------
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
parent ed507680
...@@ -231,6 +231,7 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL ...@@ -231,6 +231,7 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
**kwargs,
): ):
deprecation_message = ( deprecation_message = (
"`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()`" "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()`"
...@@ -247,6 +248,7 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL ...@@ -247,6 +248,7 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
lora_scale=lora_scale, lora_scale=lora_scale,
**kwargs,
) )
# concatenate for backwards comp # concatenate for backwards comp
...@@ -264,6 +266,7 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL ...@@ -264,6 +266,7 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
clip_skip: Optional[int] = None,
): ):
r""" r"""
Encodes the prompt into text encoder hidden states. Encodes the prompt into text encoder hidden states.
...@@ -289,7 +292,10 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL ...@@ -289,7 +292,10 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument. argument.
lora_scale (`float`, *optional*): lora_scale (`float`, *optional*):
A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
""" """
# set lora scale so that monkey patched LoRA # set lora scale so that monkey patched LoRA
# function of text encoder can correctly access it # function of text encoder can correctly access it
...@@ -337,11 +343,22 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL ...@@ -337,11 +343,22 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL
else: else:
attention_mask = None attention_mask = None
prompt_embeds = self.text_encoder( if clip_skip is None:
text_input_ids.to(device), prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
attention_mask=attention_mask, prompt_embeds = prompt_embeds[0]
) else:
prompt_embeds = prompt_embeds[0] prompt_embeds = self.text_encoder(
text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
)
# Access the `hidden_states` first, that contains a tuple of
# all the hidden states from the encoder layers. Then index into
# the tuple to access the hidden states from the desired layer.
prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
# We also need to apply the final LayerNorm here to not mess with the
# representations. The `last_hidden_states` that we typically use for
# obtaining the final prompt representations passes through the LayerNorm
# layer.
prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
if self.text_encoder is not None: if self.text_encoder is not None:
prompt_embeds_dtype = self.text_encoder.dtype prompt_embeds_dtype = self.text_encoder.dtype
...@@ -544,6 +561,7 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL ...@@ -544,6 +561,7 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL
callback_steps: int = 1, callback_steps: int = 1,
cross_attention_kwargs: Optional[Dict[str, Any]] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None,
guidance_rescale: float = 0.0, guidance_rescale: float = 0.0,
clip_skip: Optional[int] = None,
): ):
r""" r"""
The call function to the pipeline for generation. The call function to the pipeline for generation.
...@@ -600,6 +618,9 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL ...@@ -600,6 +618,9 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL
Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
using zero terminal SNR. using zero terminal SNR.
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
Examples: Examples:
...@@ -646,6 +667,7 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL ...@@ -646,6 +667,7 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
lora_scale=text_encoder_lora_scale, lora_scale=text_encoder_lora_scale,
clip_skip=clip_skip,
) )
# For classifier free guidance, we need to do two forward passes. # For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch # Here we concatenate the unconditional and text embeddings into a single batch
......
...@@ -229,6 +229,7 @@ class AltDiffusionImg2ImgPipeline( ...@@ -229,6 +229,7 @@ class AltDiffusionImg2ImgPipeline(
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
**kwargs,
): ):
deprecation_message = ( deprecation_message = (
"`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()`" "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()`"
...@@ -245,6 +246,7 @@ class AltDiffusionImg2ImgPipeline( ...@@ -245,6 +246,7 @@ class AltDiffusionImg2ImgPipeline(
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
lora_scale=lora_scale, lora_scale=lora_scale,
**kwargs,
) )
# concatenate for backwards comp # concatenate for backwards comp
...@@ -262,6 +264,7 @@ class AltDiffusionImg2ImgPipeline( ...@@ -262,6 +264,7 @@ class AltDiffusionImg2ImgPipeline(
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
clip_skip: Optional[int] = None,
): ):
r""" r"""
Encodes the prompt into text encoder hidden states. Encodes the prompt into text encoder hidden states.
...@@ -287,7 +290,10 @@ class AltDiffusionImg2ImgPipeline( ...@@ -287,7 +290,10 @@ class AltDiffusionImg2ImgPipeline(
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument. argument.
lora_scale (`float`, *optional*): lora_scale (`float`, *optional*):
A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
""" """
# set lora scale so that monkey patched LoRA # set lora scale so that monkey patched LoRA
# function of text encoder can correctly access it # function of text encoder can correctly access it
...@@ -335,11 +341,22 @@ class AltDiffusionImg2ImgPipeline( ...@@ -335,11 +341,22 @@ class AltDiffusionImg2ImgPipeline(
else: else:
attention_mask = None attention_mask = None
prompt_embeds = self.text_encoder( if clip_skip is None:
text_input_ids.to(device), prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
attention_mask=attention_mask, prompt_embeds = prompt_embeds[0]
) else:
prompt_embeds = prompt_embeds[0] prompt_embeds = self.text_encoder(
text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
)
# Access the `hidden_states` first, that contains a tuple of
# all the hidden states from the encoder layers. Then index into
# the tuple to access the hidden states from the desired layer.
prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
# We also need to apply the final LayerNorm here to not mess with the
# representations. The `last_hidden_states` that we typically use for
# obtaining the final prompt representations passes through the LayerNorm
# layer.
prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
if self.text_encoder is not None: if self.text_encoder is not None:
prompt_embeds_dtype = self.text_encoder.dtype prompt_embeds_dtype = self.text_encoder.dtype
...@@ -582,6 +599,7 @@ class AltDiffusionImg2ImgPipeline( ...@@ -582,6 +599,7 @@ class AltDiffusionImg2ImgPipeline(
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
callback_steps: int = 1, callback_steps: int = 1,
cross_attention_kwargs: Optional[Dict[str, Any]] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None,
clip_skip: int = None,
): ):
r""" r"""
The call function to the pipeline for generation. The call function to the pipeline for generation.
...@@ -638,7 +656,9 @@ class AltDiffusionImg2ImgPipeline( ...@@ -638,7 +656,9 @@ class AltDiffusionImg2ImgPipeline(
cross_attention_kwargs (`dict`, *optional*): cross_attention_kwargs (`dict`, *optional*):
A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
[`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
Examples: Examples:
Returns: Returns:
...@@ -677,6 +697,7 @@ class AltDiffusionImg2ImgPipeline( ...@@ -677,6 +697,7 @@ class AltDiffusionImg2ImgPipeline(
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
lora_scale=text_encoder_lora_scale, lora_scale=text_encoder_lora_scale,
clip_skip=clip_skip,
) )
# For classifier free guidance, we need to do two forward passes. # For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch # Here we concatenate the unconditional and text embeddings into a single batch
......
...@@ -221,6 +221,7 @@ class StableDiffusionControlNetPipeline( ...@@ -221,6 +221,7 @@ class StableDiffusionControlNetPipeline(
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
**kwargs,
): ):
deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
...@@ -234,6 +235,7 @@ class StableDiffusionControlNetPipeline( ...@@ -234,6 +235,7 @@ class StableDiffusionControlNetPipeline(
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
lora_scale=lora_scale, lora_scale=lora_scale,
**kwargs,
) )
# concatenate for backwards comp # concatenate for backwards comp
...@@ -252,6 +254,7 @@ class StableDiffusionControlNetPipeline( ...@@ -252,6 +254,7 @@ class StableDiffusionControlNetPipeline(
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
clip_skip: Optional[int] = None,
): ):
r""" r"""
Encodes the prompt into text encoder hidden states. Encodes the prompt into text encoder hidden states.
...@@ -277,7 +280,10 @@ class StableDiffusionControlNetPipeline( ...@@ -277,7 +280,10 @@ class StableDiffusionControlNetPipeline(
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument. argument.
lora_scale (`float`, *optional*): lora_scale (`float`, *optional*):
A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
""" """
# set lora scale so that monkey patched LoRA # set lora scale so that monkey patched LoRA
# function of text encoder can correctly access it # function of text encoder can correctly access it
...@@ -325,11 +331,22 @@ class StableDiffusionControlNetPipeline( ...@@ -325,11 +331,22 @@ class StableDiffusionControlNetPipeline(
else: else:
attention_mask = None attention_mask = None
prompt_embeds = self.text_encoder( if clip_skip is None:
text_input_ids.to(device), prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
attention_mask=attention_mask, prompt_embeds = prompt_embeds[0]
) else:
prompt_embeds = prompt_embeds[0] prompt_embeds = self.text_encoder(
text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
)
# Access the `hidden_states` first, that contains a tuple of
# all the hidden states from the encoder layers. Then index into
# the tuple to access the hidden states from the desired layer.
prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
# We also need to apply the final LayerNorm here to not mess with the
# representations. The `last_hidden_states` that we typically use for
# obtaining the final prompt representations passes through the LayerNorm
# layer.
prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
if self.text_encoder is not None: if self.text_encoder is not None:
prompt_embeds_dtype = self.text_encoder.dtype prompt_embeds_dtype = self.text_encoder.dtype
...@@ -697,6 +714,7 @@ class StableDiffusionControlNetPipeline( ...@@ -697,6 +714,7 @@ class StableDiffusionControlNetPipeline(
guess_mode: bool = False, guess_mode: bool = False,
control_guidance_start: Union[float, List[float]] = 0.0, control_guidance_start: Union[float, List[float]] = 0.0,
control_guidance_end: Union[float, List[float]] = 1.0, control_guidance_end: Union[float, List[float]] = 1.0,
clip_skip: Optional[int] = None,
): ):
r""" r"""
The call function to the pipeline for generation. The call function to the pipeline for generation.
...@@ -768,6 +786,9 @@ class StableDiffusionControlNetPipeline( ...@@ -768,6 +786,9 @@ class StableDiffusionControlNetPipeline(
The percentage of total steps at which the ControlNet starts applying. The percentage of total steps at which the ControlNet starts applying.
control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0): control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
The percentage of total steps at which the ControlNet stops applying. The percentage of total steps at which the ControlNet stops applying.
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
Examples: Examples:
...@@ -841,6 +862,7 @@ class StableDiffusionControlNetPipeline( ...@@ -841,6 +862,7 @@ class StableDiffusionControlNetPipeline(
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
lora_scale=text_encoder_lora_scale, lora_scale=text_encoder_lora_scale,
clip_skip=clip_skip,
) )
# For classifier free guidance, we need to do two forward passes. # For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch # Here we concatenate the unconditional and text embeddings into a single batch
......
...@@ -245,6 +245,7 @@ class StableDiffusionControlNetImg2ImgPipeline( ...@@ -245,6 +245,7 @@ class StableDiffusionControlNetImg2ImgPipeline(
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
**kwargs,
): ):
deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
...@@ -258,6 +259,7 @@ class StableDiffusionControlNetImg2ImgPipeline( ...@@ -258,6 +259,7 @@ class StableDiffusionControlNetImg2ImgPipeline(
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
lora_scale=lora_scale, lora_scale=lora_scale,
**kwargs,
) )
# concatenate for backwards comp # concatenate for backwards comp
...@@ -276,6 +278,7 @@ class StableDiffusionControlNetImg2ImgPipeline( ...@@ -276,6 +278,7 @@ class StableDiffusionControlNetImg2ImgPipeline(
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
clip_skip: Optional[int] = None,
): ):
r""" r"""
Encodes the prompt into text encoder hidden states. Encodes the prompt into text encoder hidden states.
...@@ -301,7 +304,10 @@ class StableDiffusionControlNetImg2ImgPipeline( ...@@ -301,7 +304,10 @@ class StableDiffusionControlNetImg2ImgPipeline(
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument. argument.
lora_scale (`float`, *optional*): lora_scale (`float`, *optional*):
A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
""" """
# set lora scale so that monkey patched LoRA # set lora scale so that monkey patched LoRA
# function of text encoder can correctly access it # function of text encoder can correctly access it
...@@ -349,11 +355,22 @@ class StableDiffusionControlNetImg2ImgPipeline( ...@@ -349,11 +355,22 @@ class StableDiffusionControlNetImg2ImgPipeline(
else: else:
attention_mask = None attention_mask = None
prompt_embeds = self.text_encoder( if clip_skip is None:
text_input_ids.to(device), prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
attention_mask=attention_mask, prompt_embeds = prompt_embeds[0]
) else:
prompt_embeds = prompt_embeds[0] prompt_embeds = self.text_encoder(
text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
)
# Access the `hidden_states` first, that contains a tuple of
# all the hidden states from the encoder layers. Then index into
# the tuple to access the hidden states from the desired layer.
prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
# We also need to apply the final LayerNorm here to not mess with the
# representations. The `last_hidden_states` that we typically use for
# obtaining the final prompt representations passes through the LayerNorm
# layer.
prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
if self.text_encoder is not None: if self.text_encoder is not None:
prompt_embeds_dtype = self.text_encoder.dtype prompt_embeds_dtype = self.text_encoder.dtype
...@@ -769,6 +786,7 @@ class StableDiffusionControlNetImg2ImgPipeline( ...@@ -769,6 +786,7 @@ class StableDiffusionControlNetImg2ImgPipeline(
guess_mode: bool = False, guess_mode: bool = False,
control_guidance_start: Union[float, List[float]] = 0.0, control_guidance_start: Union[float, List[float]] = 0.0,
control_guidance_end: Union[float, List[float]] = 1.0, control_guidance_end: Union[float, List[float]] = 1.0,
clip_skip: Optional[int] = None,
): ):
r""" r"""
The call function to the pipeline for generation. The call function to the pipeline for generation.
...@@ -844,6 +862,9 @@ class StableDiffusionControlNetImg2ImgPipeline( ...@@ -844,6 +862,9 @@ class StableDiffusionControlNetImg2ImgPipeline(
The percentage of total steps at which the ControlNet starts applying. The percentage of total steps at which the ControlNet starts applying.
control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0): control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
The percentage of total steps at which the ControlNet stops applying. The percentage of total steps at which the ControlNet stops applying.
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
Examples: Examples:
...@@ -917,6 +938,7 @@ class StableDiffusionControlNetImg2ImgPipeline( ...@@ -917,6 +938,7 @@ class StableDiffusionControlNetImg2ImgPipeline(
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
lora_scale=text_encoder_lora_scale, lora_scale=text_encoder_lora_scale,
clip_skip=clip_skip,
) )
# For classifier free guidance, we need to do two forward passes. # For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch # Here we concatenate the unconditional and text embeddings into a single batch
......
...@@ -372,6 +372,7 @@ class StableDiffusionControlNetInpaintPipeline( ...@@ -372,6 +372,7 @@ class StableDiffusionControlNetInpaintPipeline(
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
**kwargs,
): ):
deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
...@@ -385,6 +386,7 @@ class StableDiffusionControlNetInpaintPipeline( ...@@ -385,6 +386,7 @@ class StableDiffusionControlNetInpaintPipeline(
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
lora_scale=lora_scale, lora_scale=lora_scale,
**kwargs,
) )
# concatenate for backwards comp # concatenate for backwards comp
...@@ -403,6 +405,7 @@ class StableDiffusionControlNetInpaintPipeline( ...@@ -403,6 +405,7 @@ class StableDiffusionControlNetInpaintPipeline(
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
clip_skip: Optional[int] = None,
): ):
r""" r"""
Encodes the prompt into text encoder hidden states. Encodes the prompt into text encoder hidden states.
...@@ -428,7 +431,10 @@ class StableDiffusionControlNetInpaintPipeline( ...@@ -428,7 +431,10 @@ class StableDiffusionControlNetInpaintPipeline(
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument. argument.
lora_scale (`float`, *optional*): lora_scale (`float`, *optional*):
A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
""" """
# set lora scale so that monkey patched LoRA # set lora scale so that monkey patched LoRA
# function of text encoder can correctly access it # function of text encoder can correctly access it
...@@ -476,11 +482,22 @@ class StableDiffusionControlNetInpaintPipeline( ...@@ -476,11 +482,22 @@ class StableDiffusionControlNetInpaintPipeline(
else: else:
attention_mask = None attention_mask = None
prompt_embeds = self.text_encoder( if clip_skip is None:
text_input_ids.to(device), prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
attention_mask=attention_mask, prompt_embeds = prompt_embeds[0]
) else:
prompt_embeds = prompt_embeds[0] prompt_embeds = self.text_encoder(
text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
)
# Access the `hidden_states` first, that contains a tuple of
# all the hidden states from the encoder layers. Then index into
# the tuple to access the hidden states from the desired layer.
prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
# We also need to apply the final LayerNorm here to not mess with the
# representations. The `last_hidden_states` that we typically use for
# obtaining the final prompt representations passes through the LayerNorm
# layer.
prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
if self.text_encoder is not None: if self.text_encoder is not None:
prompt_embeds_dtype = self.text_encoder.dtype prompt_embeds_dtype = self.text_encoder.dtype
...@@ -964,6 +981,7 @@ class StableDiffusionControlNetInpaintPipeline( ...@@ -964,6 +981,7 @@ class StableDiffusionControlNetInpaintPipeline(
guess_mode: bool = False, guess_mode: bool = False,
control_guidance_start: Union[float, List[float]] = 0.0, control_guidance_start: Union[float, List[float]] = 0.0,
control_guidance_end: Union[float, List[float]] = 1.0, control_guidance_end: Union[float, List[float]] = 1.0,
clip_skip: Optional[int] = None,
): ):
r""" r"""
The call function to the pipeline for generation. The call function to the pipeline for generation.
...@@ -1056,6 +1074,9 @@ class StableDiffusionControlNetInpaintPipeline( ...@@ -1056,6 +1074,9 @@ class StableDiffusionControlNetInpaintPipeline(
The percentage of total steps at which the ControlNet starts applying. The percentage of total steps at which the ControlNet starts applying.
control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0): control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
The percentage of total steps at which the ControlNet stops applying. The percentage of total steps at which the ControlNet stops applying.
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
Examples: Examples:
...@@ -1131,6 +1152,7 @@ class StableDiffusionControlNetInpaintPipeline( ...@@ -1131,6 +1152,7 @@ class StableDiffusionControlNetInpaintPipeline(
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
lora_scale=text_encoder_lora_scale, lora_scale=text_encoder_lora_scale,
clip_skip=clip_skip,
) )
# For classifier free guidance, we need to do two forward passes. # For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch # Here we concatenate the unconditional and text embeddings into a single batch
......
...@@ -238,6 +238,7 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor ...@@ -238,6 +238,7 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
**kwargs,
): ):
deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
...@@ -251,6 +252,7 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor ...@@ -251,6 +252,7 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
lora_scale=lora_scale, lora_scale=lora_scale,
**kwargs,
) )
# concatenate for backwards comp # concatenate for backwards comp
...@@ -269,6 +271,7 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor ...@@ -269,6 +271,7 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
clip_skip: Optional[int] = None,
): ):
r""" r"""
Encodes the prompt into text encoder hidden states. Encodes the prompt into text encoder hidden states.
...@@ -294,7 +297,10 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor ...@@ -294,7 +297,10 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument. argument.
lora_scale (`float`, *optional*): lora_scale (`float`, *optional*):
A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
""" """
# set lora scale so that monkey patched LoRA # set lora scale so that monkey patched LoRA
# function of text encoder can correctly access it # function of text encoder can correctly access it
...@@ -342,11 +348,22 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor ...@@ -342,11 +348,22 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor
else: else:
attention_mask = None attention_mask = None
prompt_embeds = self.text_encoder( if clip_skip is None:
text_input_ids.to(device), prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
attention_mask=attention_mask, prompt_embeds = prompt_embeds[0]
) else:
prompt_embeds = prompt_embeds[0] prompt_embeds = self.text_encoder(
text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
)
# Access the `hidden_states` first, that contains a tuple of
# all the hidden states from the encoder layers. Then index into
# the tuple to access the hidden states from the desired layer.
prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
# We also need to apply the final LayerNorm here so that the representations
# stay consistent with the default path: the `last_hidden_state` that we
# typically use for obtaining the final prompt representations has already
# passed through this LayerNorm layer.
prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
if self.text_encoder is not None: if self.text_encoder is not None:
prompt_embeds_dtype = self.text_encoder.dtype prompt_embeds_dtype = self.text_encoder.dtype
...@@ -587,6 +604,7 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor ...@@ -587,6 +604,7 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
callback_steps: int = 1, callback_steps: int = 1,
cross_attention_kwargs: Optional[Dict[str, Any]] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None,
clip_skip: Optional[int] = None,
): ):
r""" r"""
The call function to the pipeline for generation. The call function to the pipeline for generation.
...@@ -640,7 +658,9 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor ...@@ -640,7 +658,9 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor
cross_attention_kwargs (`dict`, *optional*): cross_attention_kwargs (`dict`, *optional*):
A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
[`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
Example: Example:
```py ```py
...@@ -740,9 +760,10 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor ...@@ -740,9 +760,10 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor
do_classifier_free_guidance, do_classifier_free_guidance,
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
lora_scale=text_encoder_lora_scale, lora_scale=text_encoder_lora_scale,
clip_skip=clip_skip,
) )
source_prompt_embeds_tuple = self.encode_prompt( source_prompt_embeds_tuple = self.encode_prompt(
source_prompt, device, num_images_per_prompt, do_classifier_free_guidance, None source_prompt, device, num_images_per_prompt, do_classifier_free_guidance, None, clip_skip=clip_skip
) )
if prompt_embeds_tuple[1] is not None: if prompt_embeds_tuple[1] is not None:
prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
......
...@@ -232,6 +232,7 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo ...@@ -232,6 +232,7 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
**kwargs,
): ):
deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
...@@ -245,6 +246,7 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo ...@@ -245,6 +246,7 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
lora_scale=lora_scale, lora_scale=lora_scale,
**kwargs,
) )
# concatenate for backwards comp # concatenate for backwards comp
...@@ -262,6 +264,7 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo ...@@ -262,6 +264,7 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
clip_skip: Optional[int] = None,
): ):
r""" r"""
Encodes the prompt into text encoder hidden states. Encodes the prompt into text encoder hidden states.
...@@ -287,7 +290,10 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo ...@@ -287,7 +290,10 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument. argument.
lora_scale (`float`, *optional*): lora_scale (`float`, *optional*):
A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
""" """
# set lora scale so that monkey patched LoRA # set lora scale so that monkey patched LoRA
# function of text encoder can correctly access it # function of text encoder can correctly access it
...@@ -335,11 +341,22 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo ...@@ -335,11 +341,22 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo
else: else:
attention_mask = None attention_mask = None
prompt_embeds = self.text_encoder( if clip_skip is None:
text_input_ids.to(device), prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
attention_mask=attention_mask, prompt_embeds = prompt_embeds[0]
) else:
prompt_embeds = prompt_embeds[0] prompt_embeds = self.text_encoder(
text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
)
# Access the `hidden_states` first, that contains a tuple of
# all the hidden states from the encoder layers. Then index into
# the tuple to access the hidden states from the desired layer.
prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
# We also need to apply the final LayerNorm here so that the representations
# stay consistent with the default path: the `last_hidden_state` that we
# typically use for obtaining the final prompt representations has already
# passed through this LayerNorm layer.
prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
if self.text_encoder is not None: if self.text_encoder is not None:
prompt_embeds_dtype = self.text_encoder.dtype prompt_embeds_dtype = self.text_encoder.dtype
...@@ -539,6 +556,7 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo ...@@ -539,6 +556,7 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo
callback_steps: int = 1, callback_steps: int = 1,
cross_attention_kwargs: Optional[Dict[str, Any]] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None,
guidance_rescale: float = 0.0, guidance_rescale: float = 0.0,
clip_skip: Optional[int] = None,
): ):
r""" r"""
The call function to the pipeline for generation. The call function to the pipeline for generation.
...@@ -595,6 +613,9 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo ...@@ -595,6 +613,9 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo
Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
using zero terminal SNR. using zero terminal SNR.
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
Examples: Examples:
...@@ -641,6 +662,7 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo ...@@ -641,6 +662,7 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
lora_scale=text_encoder_lora_scale, lora_scale=text_encoder_lora_scale,
clip_skip=clip_skip,
) )
# For classifier free guidance, we need to do two forward passes. # For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch # Here we concatenate the unconditional and text embeddings into a single batch
......
...@@ -262,6 +262,7 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, TextualInversion ...@@ -262,6 +262,7 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, TextualInversion
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
**kwargs,
): ):
deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
...@@ -275,6 +276,7 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, TextualInversion ...@@ -275,6 +276,7 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, TextualInversion
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
lora_scale=lora_scale, lora_scale=lora_scale,
**kwargs,
) )
# concatenate for backwards comp # concatenate for backwards comp
...@@ -293,6 +295,7 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, TextualInversion ...@@ -293,6 +295,7 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, TextualInversion
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
clip_skip: Optional[int] = None,
): ):
r""" r"""
Encodes the prompt into text encoder hidden states. Encodes the prompt into text encoder hidden states.
...@@ -318,7 +321,10 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, TextualInversion ...@@ -318,7 +321,10 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, TextualInversion
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument. argument.
lora_scale (`float`, *optional*): lora_scale (`float`, *optional*):
A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
""" """
# set lora scale so that monkey patched LoRA # set lora scale so that monkey patched LoRA
# function of text encoder can correctly access it # function of text encoder can correctly access it
...@@ -366,11 +372,22 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, TextualInversion ...@@ -366,11 +372,22 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, TextualInversion
else: else:
attention_mask = None attention_mask = None
prompt_embeds = self.text_encoder( if clip_skip is None:
text_input_ids.to(device), prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
attention_mask=attention_mask, prompt_embeds = prompt_embeds[0]
) else:
prompt_embeds = prompt_embeds[0] prompt_embeds = self.text_encoder(
text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
)
# Access the `hidden_states` first, that contains a tuple of
# all the hidden states from the encoder layers. Then index into
# the tuple to access the hidden states from the desired layer.
prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
# We also need to apply the final LayerNorm here so that the representations
# stay consistent with the default path: the `last_hidden_state` that we
# typically use for obtaining the final prompt representations has already
# passed through this LayerNorm layer.
prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
if self.text_encoder is not None: if self.text_encoder is not None:
prompt_embeds_dtype = self.text_encoder.dtype prompt_embeds_dtype = self.text_encoder.dtype
...@@ -734,6 +751,7 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, TextualInversion ...@@ -734,6 +751,7 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, TextualInversion
thresholds: dict = {0: 0.05, 10: 0.5, 20: 0.8}, thresholds: dict = {0: 0.05, 10: 0.5, 20: 0.8},
scale_factor: int = 20, scale_factor: int = 20,
attn_res: Optional[Tuple[int]] = (16, 16), attn_res: Optional[Tuple[int]] = (16, 16),
clip_skip: Optional[int] = None,
): ):
r""" r"""
The call function to the pipeline for generation. The call function to the pipeline for generation.
...@@ -798,6 +816,9 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, TextualInversion ...@@ -798,6 +816,9 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, TextualInversion
Scale factor to control the step size of each attend-and-excite update. Scale factor to control the step size of each attend-and-excite update.
attn_res (`tuple`, *optional*, default computed from width and height): attn_res (`tuple`, *optional*, default computed from width and height):
The 2D resolution of the semantic attention map. The 2D resolution of the semantic attention map.
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
Examples: Examples:
...@@ -848,6 +869,7 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, TextualInversion ...@@ -848,6 +869,7 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, TextualInversion
negative_prompt, negative_prompt,
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
clip_skip=clip_skip,
) )
# For classifier free guidance, we need to do two forward passes. # For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch # Here we concatenate the unconditional and text embeddings into a single batch
......
...@@ -143,6 +143,7 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoader ...@@ -143,6 +143,7 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoader
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
**kwargs,
): ):
deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
...@@ -156,6 +157,7 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoader ...@@ -156,6 +157,7 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoader
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
lora_scale=lora_scale, lora_scale=lora_scale,
**kwargs,
) )
# concatenate for backwards comp # concatenate for backwards comp
...@@ -174,6 +176,7 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoader ...@@ -174,6 +176,7 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoader
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
clip_skip: Optional[int] = None,
): ):
r""" r"""
Encodes the prompt into text encoder hidden states. Encodes the prompt into text encoder hidden states.
...@@ -199,7 +202,10 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoader ...@@ -199,7 +202,10 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoader
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument. argument.
lora_scale (`float`, *optional*): lora_scale (`float`, *optional*):
A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
""" """
# set lora scale so that monkey patched LoRA # set lora scale so that monkey patched LoRA
# function of text encoder can correctly access it # function of text encoder can correctly access it
...@@ -247,11 +253,22 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoader ...@@ -247,11 +253,22 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoader
else: else:
attention_mask = None attention_mask = None
prompt_embeds = self.text_encoder( if clip_skip is None:
text_input_ids.to(device), prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
attention_mask=attention_mask, prompt_embeds = prompt_embeds[0]
) else:
prompt_embeds = prompt_embeds[0] prompt_embeds = self.text_encoder(
text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
)
# Access the `hidden_states` first, that contains a tuple of
# all the hidden states from the encoder layers. Then index into
# the tuple to access the hidden states from the desired layer.
prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
# We also need to apply the final LayerNorm here so that the representations
# stay consistent with the default path: the `last_hidden_state` that we
# typically use for obtaining the final prompt representations has already
# passed through this LayerNorm layer.
prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
if self.text_encoder is not None: if self.text_encoder is not None:
prompt_embeds_dtype = self.text_encoder.dtype prompt_embeds_dtype = self.text_encoder.dtype
...@@ -541,6 +558,7 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoader ...@@ -541,6 +558,7 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoader
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
callback_steps: int = 1, callback_steps: int = 1,
cross_attention_kwargs: Optional[Dict[str, Any]] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None,
clip_skip: Optional[int] = None,
): ):
r""" r"""
The call function to the pipeline for generation. The call function to the pipeline for generation.
...@@ -597,7 +615,9 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoader ...@@ -597,7 +615,9 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoader
cross_attention_kwargs (`dict`, *optional*): cross_attention_kwargs (`dict`, *optional*):
A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
[`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
Examples: Examples:
```py ```py
...@@ -666,6 +686,7 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoader ...@@ -666,6 +686,7 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoader
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
lora_scale=text_encoder_lora_scale, lora_scale=text_encoder_lora_scale,
clip_skip=clip_skip,
) )
# For classifier free guidance, we need to do two forward passes. # For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch # Here we concatenate the unconditional and text embeddings into a single batch
......
...@@ -411,6 +411,7 @@ class StableDiffusionDiffEditPipeline(DiffusionPipeline, TextualInversionLoaderM ...@@ -411,6 +411,7 @@ class StableDiffusionDiffEditPipeline(DiffusionPipeline, TextualInversionLoaderM
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
**kwargs,
): ):
deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
...@@ -424,6 +425,7 @@ class StableDiffusionDiffEditPipeline(DiffusionPipeline, TextualInversionLoaderM ...@@ -424,6 +425,7 @@ class StableDiffusionDiffEditPipeline(DiffusionPipeline, TextualInversionLoaderM
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
lora_scale=lora_scale, lora_scale=lora_scale,
**kwargs,
) )
# concatenate for backwards comp # concatenate for backwards comp
...@@ -442,6 +444,7 @@ class StableDiffusionDiffEditPipeline(DiffusionPipeline, TextualInversionLoaderM ...@@ -442,6 +444,7 @@ class StableDiffusionDiffEditPipeline(DiffusionPipeline, TextualInversionLoaderM
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
clip_skip: Optional[int] = None,
): ):
r""" r"""
Encodes the prompt into text encoder hidden states. Encodes the prompt into text encoder hidden states.
...@@ -467,7 +470,10 @@ class StableDiffusionDiffEditPipeline(DiffusionPipeline, TextualInversionLoaderM ...@@ -467,7 +470,10 @@ class StableDiffusionDiffEditPipeline(DiffusionPipeline, TextualInversionLoaderM
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument. argument.
lora_scale (`float`, *optional*): lora_scale (`float`, *optional*):
A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
""" """
# set lora scale so that monkey patched LoRA # set lora scale so that monkey patched LoRA
# function of text encoder can correctly access it # function of text encoder can correctly access it
...@@ -515,11 +521,22 @@ class StableDiffusionDiffEditPipeline(DiffusionPipeline, TextualInversionLoaderM ...@@ -515,11 +521,22 @@ class StableDiffusionDiffEditPipeline(DiffusionPipeline, TextualInversionLoaderM
else: else:
attention_mask = None attention_mask = None
prompt_embeds = self.text_encoder( if clip_skip is None:
text_input_ids.to(device), prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
attention_mask=attention_mask, prompt_embeds = prompt_embeds[0]
) else:
prompt_embeds = prompt_embeds[0] prompt_embeds = self.text_encoder(
text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
)
# Access the `hidden_states` first, that contains a tuple of
# all the hidden states from the encoder layers. Then index into
# the tuple to access the hidden states from the desired layer.
prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
# We also need to apply the final LayerNorm here to not mess with the
# representations. The `last_hidden_states` that we typically use for
# obtaining the final prompt representations passes through the LayerNorm
# layer.
prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
if self.text_encoder is not None: if self.text_encoder is not None:
prompt_embeds_dtype = self.text_encoder.dtype prompt_embeds_dtype = self.text_encoder.dtype
...@@ -1305,6 +1322,7 @@ class StableDiffusionDiffEditPipeline(DiffusionPipeline, TextualInversionLoaderM ...@@ -1305,6 +1322,7 @@ class StableDiffusionDiffEditPipeline(DiffusionPipeline, TextualInversionLoaderM
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
callback_steps: int = 1, callback_steps: int = 1,
cross_attention_kwargs: Optional[Dict[str, Any]] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None,
clip_ckip: int = None,
): ):
r""" r"""
The call function to the pipeline for generation. The call function to the pipeline for generation.
...@@ -1365,7 +1383,9 @@ class StableDiffusionDiffEditPipeline(DiffusionPipeline, TextualInversionLoaderM ...@@ -1365,7 +1383,9 @@ class StableDiffusionDiffEditPipeline(DiffusionPipeline, TextualInversionLoaderM
cross_attention_kwargs (`dict`, *optional*): cross_attention_kwargs (`dict`, *optional*):
A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
[`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
Examples: Examples:
Returns: Returns:
...@@ -1424,6 +1444,7 @@ class StableDiffusionDiffEditPipeline(DiffusionPipeline, TextualInversionLoaderM ...@@ -1424,6 +1444,7 @@ class StableDiffusionDiffEditPipeline(DiffusionPipeline, TextualInversionLoaderM
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
lora_scale=text_encoder_lora_scale, lora_scale=text_encoder_lora_scale,
clip_skip=clip_ckip,
) )
# For classifier free guidance, we need to do two forward passes. # For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch # Here we concatenate the unconditional and text embeddings into a single batch
......
...@@ -208,6 +208,7 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline): ...@@ -208,6 +208,7 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline):
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
**kwargs,
): ):
deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
...@@ -221,6 +222,7 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline): ...@@ -221,6 +222,7 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline):
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
lora_scale=lora_scale, lora_scale=lora_scale,
**kwargs,
) )
# concatenate for backwards comp # concatenate for backwards comp
...@@ -239,6 +241,7 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline): ...@@ -239,6 +241,7 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline):
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
clip_skip: Optional[int] = None,
): ):
r""" r"""
Encodes the prompt into text encoder hidden states. Encodes the prompt into text encoder hidden states.
...@@ -264,7 +267,10 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline): ...@@ -264,7 +267,10 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline):
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument. argument.
lora_scale (`float`, *optional*): lora_scale (`float`, *optional*):
A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
""" """
# set lora scale so that monkey patched LoRA # set lora scale so that monkey patched LoRA
# function of text encoder can correctly access it # function of text encoder can correctly access it
...@@ -312,11 +318,22 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline): ...@@ -312,11 +318,22 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline):
else: else:
attention_mask = None attention_mask = None
prompt_embeds = self.text_encoder( if clip_skip is None:
text_input_ids.to(device), prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
attention_mask=attention_mask, prompt_embeds = prompt_embeds[0]
) else:
prompt_embeds = prompt_embeds[0] prompt_embeds = self.text_encoder(
text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
)
# Access the `hidden_states` first, that contains a tuple of
# all the hidden states from the encoder layers. Then index into
# the tuple to access the hidden states from the desired layer.
prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
# We also need to apply the final LayerNorm here to not mess with the
# representations. The `last_hidden_states` that we typically use for
# obtaining the final prompt representations passes through the LayerNorm
# layer.
prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
if self.text_encoder is not None: if self.text_encoder is not None:
prompt_embeds_dtype = self.text_encoder.dtype prompt_embeds_dtype = self.text_encoder.dtype
...@@ -545,6 +562,7 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline): ...@@ -545,6 +562,7 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline):
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
callback_steps: int = 1, callback_steps: int = 1,
cross_attention_kwargs: Optional[Dict[str, Any]] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None,
clip_skip: Optional[int] = None,
): ):
r""" r"""
The call function to the pipeline for generation. The call function to the pipeline for generation.
...@@ -615,7 +633,9 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline): ...@@ -615,7 +633,9 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline):
Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
using zero terminal SNR. using zero terminal SNR.
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
Examples: Examples:
Returns: Returns:
...@@ -665,6 +685,7 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline): ...@@ -665,6 +685,7 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline):
negative_prompt, negative_prompt,
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
clip_skip=clip_skip,
) )
# For classifier free guidance, we need to do two forward passes. # For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch # Here we concatenate the unconditional and text embeddings into a single batch
......
...@@ -272,6 +272,7 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline): ...@@ -272,6 +272,7 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline):
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
clip_skip: Optional[int] = None,
): ):
r""" r"""
Encodes the prompt into text encoder hidden states. Encodes the prompt into text encoder hidden states.
...@@ -297,7 +298,10 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline): ...@@ -297,7 +298,10 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline):
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument. argument.
lora_scale (`float`, *optional*): lora_scale (`float`, *optional*):
A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
""" """
# set lora scale so that monkey patched LoRA # set lora scale so that monkey patched LoRA
# function of text encoder can correctly access it # function of text encoder can correctly access it
...@@ -345,11 +349,22 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline): ...@@ -345,11 +349,22 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline):
else: else:
attention_mask = None attention_mask = None
prompt_embeds = self.text_encoder( if clip_skip is None:
text_input_ids.to(device), prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
attention_mask=attention_mask, prompt_embeds = prompt_embeds[0]
) else:
prompt_embeds = prompt_embeds[0] prompt_embeds = self.text_encoder(
text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
)
# Access the `hidden_states` first, that contains a tuple of
# all the hidden states from the encoder layers. Then index into
# the tuple to access the hidden states from the desired layer.
prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
# We also need to apply the final LayerNorm here to not mess with the
# representations. The `last_hidden_states` that we typically use for
# obtaining the final prompt representations passes through the LayerNorm
# layer.
prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
if self.text_encoder is not None: if self.text_encoder is not None:
prompt_embeds_dtype = self.text_encoder.dtype prompt_embeds_dtype = self.text_encoder.dtype
...@@ -713,6 +728,7 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline): ...@@ -713,6 +728,7 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline):
callback_steps: int = 1, callback_steps: int = 1,
cross_attention_kwargs: Optional[Dict[str, Any]] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None,
gligen_normalize_constant: float = 28.7, gligen_normalize_constant: float = 28.7,
clip_skip: int = None,
): ):
r""" r"""
The call function to the pipeline for generation. The call function to the pipeline for generation.
...@@ -788,6 +804,9 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline): ...@@ -788,6 +804,9 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline):
[`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
gligen_normalize_constant (`float`, *optional*, defaults to 28.7): gligen_normalize_constant (`float`, *optional*, defaults to 28.7):
The normalize value of the image embedding. The normalize value of the image embedding.
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
Examples: Examples:
...@@ -836,6 +855,7 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline): ...@@ -836,6 +855,7 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline):
negative_prompt, negative_prompt,
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
clip_skip=clip_skip,
) )
if do_classifier_free_guidance: if do_classifier_free_guidance:
......
...@@ -232,6 +232,7 @@ class StableDiffusionImg2ImgPipeline( ...@@ -232,6 +232,7 @@ class StableDiffusionImg2ImgPipeline(
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
**kwargs,
): ):
deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
...@@ -245,6 +246,7 @@ class StableDiffusionImg2ImgPipeline( ...@@ -245,6 +246,7 @@ class StableDiffusionImg2ImgPipeline(
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
lora_scale=lora_scale, lora_scale=lora_scale,
**kwargs,
) )
# concatenate for backwards comp # concatenate for backwards comp
...@@ -263,6 +265,7 @@ class StableDiffusionImg2ImgPipeline( ...@@ -263,6 +265,7 @@ class StableDiffusionImg2ImgPipeline(
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
clip_skip: Optional[int] = None,
): ):
r""" r"""
Encodes the prompt into text encoder hidden states. Encodes the prompt into text encoder hidden states.
...@@ -288,7 +291,10 @@ class StableDiffusionImg2ImgPipeline( ...@@ -288,7 +291,10 @@ class StableDiffusionImg2ImgPipeline(
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument. argument.
lora_scale (`float`, *optional*): lora_scale (`float`, *optional*):
A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
""" """
# set lora scale so that monkey patched LoRA # set lora scale so that monkey patched LoRA
# function of text encoder can correctly access it # function of text encoder can correctly access it
...@@ -336,11 +342,22 @@ class StableDiffusionImg2ImgPipeline( ...@@ -336,11 +342,22 @@ class StableDiffusionImg2ImgPipeline(
else: else:
attention_mask = None attention_mask = None
prompt_embeds = self.text_encoder( if clip_skip is None:
text_input_ids.to(device), prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
attention_mask=attention_mask, prompt_embeds = prompt_embeds[0]
) else:
prompt_embeds = prompt_embeds[0] prompt_embeds = self.text_encoder(
text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
)
# Access the `hidden_states` first, that contains a tuple of
# all the hidden states from the encoder layers. Then index into
# the tuple to access the hidden states from the desired layer.
prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
# We also need to apply the final LayerNorm here to not mess with the
# representations. The `last_hidden_states` that we typically use for
# obtaining the final prompt representations passes through the LayerNorm
# layer.
prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
if self.text_encoder is not None: if self.text_encoder is not None:
prompt_embeds_dtype = self.text_encoder.dtype prompt_embeds_dtype = self.text_encoder.dtype
...@@ -583,6 +600,7 @@ class StableDiffusionImg2ImgPipeline( ...@@ -583,6 +600,7 @@ class StableDiffusionImg2ImgPipeline(
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
callback_steps: int = 1, callback_steps: int = 1,
cross_attention_kwargs: Optional[Dict[str, Any]] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None,
clip_skip: int = None,
): ):
r""" r"""
The call function to the pipeline for generation. The call function to the pipeline for generation.
...@@ -639,7 +657,9 @@ class StableDiffusionImg2ImgPipeline( ...@@ -639,7 +657,9 @@ class StableDiffusionImg2ImgPipeline(
cross_attention_kwargs (`dict`, *optional*): cross_attention_kwargs (`dict`, *optional*):
A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
[`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
Examples: Examples:
Returns: Returns:
...@@ -678,6 +698,7 @@ class StableDiffusionImg2ImgPipeline( ...@@ -678,6 +698,7 @@ class StableDiffusionImg2ImgPipeline(
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
lora_scale=text_encoder_lora_scale, lora_scale=text_encoder_lora_scale,
clip_skip=clip_skip,
) )
# For classifier free guidance, we need to do two forward passes. # For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch # Here we concatenate the unconditional and text embeddings into a single batch
......
...@@ -305,6 +305,7 @@ class StableDiffusionInpaintPipeline( ...@@ -305,6 +305,7 @@ class StableDiffusionInpaintPipeline(
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
**kwargs,
): ):
deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
...@@ -318,6 +319,7 @@ class StableDiffusionInpaintPipeline( ...@@ -318,6 +319,7 @@ class StableDiffusionInpaintPipeline(
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
lora_scale=lora_scale, lora_scale=lora_scale,
**kwargs,
) )
# concatenate for backwards comp # concatenate for backwards comp
...@@ -336,6 +338,7 @@ class StableDiffusionInpaintPipeline( ...@@ -336,6 +338,7 @@ class StableDiffusionInpaintPipeline(
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
clip_skip: Optional[int] = None,
): ):
r""" r"""
Encodes the prompt into text encoder hidden states. Encodes the prompt into text encoder hidden states.
...@@ -361,7 +364,10 @@ class StableDiffusionInpaintPipeline( ...@@ -361,7 +364,10 @@ class StableDiffusionInpaintPipeline(
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument. argument.
lora_scale (`float`, *optional*): lora_scale (`float`, *optional*):
A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
""" """
# set lora scale so that monkey patched LoRA # set lora scale so that monkey patched LoRA
# function of text encoder can correctly access it # function of text encoder can correctly access it
...@@ -409,11 +415,22 @@ class StableDiffusionInpaintPipeline( ...@@ -409,11 +415,22 @@ class StableDiffusionInpaintPipeline(
else: else:
attention_mask = None attention_mask = None
prompt_embeds = self.text_encoder( if clip_skip is None:
text_input_ids.to(device), prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
attention_mask=attention_mask, prompt_embeds = prompt_embeds[0]
) else:
prompt_embeds = prompt_embeds[0] prompt_embeds = self.text_encoder(
text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
)
# Access the `hidden_states` first, that contains a tuple of
# all the hidden states from the encoder layers. Then index into
# the tuple to access the hidden states from the desired layer.
prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
# We also need to apply the final LayerNorm here to not mess with the
# representations. The `last_hidden_states` that we typically use for
# obtaining the final prompt representations passes through the LayerNorm
# layer.
prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
if self.text_encoder is not None: if self.text_encoder is not None:
prompt_embeds_dtype = self.text_encoder.dtype prompt_embeds_dtype = self.text_encoder.dtype
...@@ -720,6 +737,7 @@ class StableDiffusionInpaintPipeline( ...@@ -720,6 +737,7 @@ class StableDiffusionInpaintPipeline(
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
callback_steps: int = 1, callback_steps: int = 1,
cross_attention_kwargs: Optional[Dict[str, Any]] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None,
clip_skip: int = None,
): ):
r""" r"""
The call function to the pipeline for generation. The call function to the pipeline for generation.
...@@ -792,7 +810,9 @@ class StableDiffusionInpaintPipeline( ...@@ -792,7 +810,9 @@ class StableDiffusionInpaintPipeline(
cross_attention_kwargs (`dict`, *optional*): cross_attention_kwargs (`dict`, *optional*):
A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
[`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
Examples: Examples:
```py ```py
...@@ -874,6 +894,7 @@ class StableDiffusionInpaintPipeline( ...@@ -874,6 +894,7 @@ class StableDiffusionInpaintPipeline(
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
lora_scale=text_encoder_lora_scale, lora_scale=text_encoder_lora_scale,
clip_skip=clip_skip,
) )
# For classifier free guidance, we need to do two forward passes. # For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch # Here we concatenate the unconditional and text embeddings into a single batch
......
...@@ -227,6 +227,7 @@ class StableDiffusionInpaintPipelineLegacy( ...@@ -227,6 +227,7 @@ class StableDiffusionInpaintPipelineLegacy(
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
**kwargs,
): ):
deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
...@@ -240,6 +241,7 @@ class StableDiffusionInpaintPipelineLegacy( ...@@ -240,6 +241,7 @@ class StableDiffusionInpaintPipelineLegacy(
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
lora_scale=lora_scale, lora_scale=lora_scale,
**kwargs,
) )
# concatenate for backwards comp # concatenate for backwards comp
...@@ -258,6 +260,7 @@ class StableDiffusionInpaintPipelineLegacy( ...@@ -258,6 +260,7 @@ class StableDiffusionInpaintPipelineLegacy(
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
clip_skip: Optional[int] = None,
): ):
r""" r"""
Encodes the prompt into text encoder hidden states. Encodes the prompt into text encoder hidden states.
...@@ -283,7 +286,10 @@ class StableDiffusionInpaintPipelineLegacy( ...@@ -283,7 +286,10 @@ class StableDiffusionInpaintPipelineLegacy(
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument. argument.
lora_scale (`float`, *optional*): lora_scale (`float`, *optional*):
A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
""" """
# set lora scale so that monkey patched LoRA # set lora scale so that monkey patched LoRA
# function of text encoder can correctly access it # function of text encoder can correctly access it
...@@ -331,11 +337,22 @@ class StableDiffusionInpaintPipelineLegacy( ...@@ -331,11 +337,22 @@ class StableDiffusionInpaintPipelineLegacy(
else: else:
attention_mask = None attention_mask = None
prompt_embeds = self.text_encoder( if clip_skip is None:
text_input_ids.to(device), prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
attention_mask=attention_mask, prompt_embeds = prompt_embeds[0]
) else:
prompt_embeds = prompt_embeds[0] prompt_embeds = self.text_encoder(
text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
)
# Access the `hidden_states` first, that contains a tuple of
# all the hidden states from the encoder layers. Then index into
# the tuple to access the hidden states from the desired layer.
prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
# We also need to apply the final LayerNorm here to not mess with the
# representations. The `last_hidden_states` that we typically use for
# obtaining the final prompt representations passes through the LayerNorm
# layer.
prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
if self.text_encoder is not None: if self.text_encoder is not None:
prompt_embeds_dtype = self.text_encoder.dtype prompt_embeds_dtype = self.text_encoder.dtype
...@@ -540,6 +557,7 @@ class StableDiffusionInpaintPipelineLegacy( ...@@ -540,6 +557,7 @@ class StableDiffusionInpaintPipelineLegacy(
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
callback_steps: int = 1, callback_steps: int = 1,
cross_attention_kwargs: Optional[Dict[str, Any]] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None,
clip_skip: Optional[int] = None,
): ):
r""" r"""
Function invoked when calling the pipeline for generation. Function invoked when calling the pipeline for generation.
...@@ -608,6 +626,9 @@ class StableDiffusionInpaintPipelineLegacy( ...@@ -608,6 +626,9 @@ class StableDiffusionInpaintPipelineLegacy(
A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
`self.processor` in `self.processor` in
[diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
Returns: Returns:
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
...@@ -646,6 +667,7 @@ class StableDiffusionInpaintPipelineLegacy( ...@@ -646,6 +667,7 @@ class StableDiffusionInpaintPipelineLegacy(
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
lora_scale=text_encoder_lora_scale, lora_scale=text_encoder_lora_scale,
clip_skip=clip_skip,
) )
# For classifier free guidance, we need to do two forward passes. # For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch # Here we concatenate the unconditional and text embeddings into a single batch
......
...@@ -141,6 +141,7 @@ class StableDiffusionKDiffusionPipeline(DiffusionPipeline, TextualInversionLoade ...@@ -141,6 +141,7 @@ class StableDiffusionKDiffusionPipeline(DiffusionPipeline, TextualInversionLoade
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
**kwargs,
): ):
deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
...@@ -154,6 +155,7 @@ class StableDiffusionKDiffusionPipeline(DiffusionPipeline, TextualInversionLoade ...@@ -154,6 +155,7 @@ class StableDiffusionKDiffusionPipeline(DiffusionPipeline, TextualInversionLoade
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
lora_scale=lora_scale, lora_scale=lora_scale,
**kwargs,
) )
# concatenate for backwards comp # concatenate for backwards comp
...@@ -172,6 +174,7 @@ class StableDiffusionKDiffusionPipeline(DiffusionPipeline, TextualInversionLoade ...@@ -172,6 +174,7 @@ class StableDiffusionKDiffusionPipeline(DiffusionPipeline, TextualInversionLoade
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
clip_skip: Optional[int] = None,
): ):
r""" r"""
Encodes the prompt into text encoder hidden states. Encodes the prompt into text encoder hidden states.
...@@ -197,7 +200,10 @@ class StableDiffusionKDiffusionPipeline(DiffusionPipeline, TextualInversionLoade ...@@ -197,7 +200,10 @@ class StableDiffusionKDiffusionPipeline(DiffusionPipeline, TextualInversionLoade
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument. argument.
lora_scale (`float`, *optional*): lora_scale (`float`, *optional*):
A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
""" """
# set lora scale so that monkey patched LoRA # set lora scale so that monkey patched LoRA
# function of text encoder can correctly access it # function of text encoder can correctly access it
...@@ -245,11 +251,22 @@ class StableDiffusionKDiffusionPipeline(DiffusionPipeline, TextualInversionLoade ...@@ -245,11 +251,22 @@ class StableDiffusionKDiffusionPipeline(DiffusionPipeline, TextualInversionLoade
else: else:
attention_mask = None attention_mask = None
prompt_embeds = self.text_encoder( if clip_skip is None:
text_input_ids.to(device), prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
attention_mask=attention_mask, prompt_embeds = prompt_embeds[0]
) else:
prompt_embeds = prompt_embeds[0] prompt_embeds = self.text_encoder(
text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
)
# Access the `hidden_states` first, that contains a tuple of
# all the hidden states from the encoder layers. Then index into
# the tuple to access the hidden states from the desired layer.
prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
# We also need to apply the final LayerNorm here to not mess with the
# representations. The `last_hidden_states` that we typically use for
# obtaining the final prompt representations passes through the LayerNorm
# layer.
prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
if self.text_encoder is not None: if self.text_encoder is not None:
prompt_embeds_dtype = self.text_encoder.dtype prompt_embeds_dtype = self.text_encoder.dtype
...@@ -429,6 +446,7 @@ class StableDiffusionKDiffusionPipeline(DiffusionPipeline, TextualInversionLoade ...@@ -429,6 +446,7 @@ class StableDiffusionKDiffusionPipeline(DiffusionPipeline, TextualInversionLoade
callback_steps: int = 1, callback_steps: int = 1,
use_karras_sigmas: Optional[bool] = False, use_karras_sigmas: Optional[bool] = False,
noise_sampler_seed: Optional[int] = None, noise_sampler_seed: Optional[int] = None,
clip_skip: int = None,
): ):
r""" r"""
Function invoked when calling the pipeline for generation. Function invoked when calling the pipeline for generation.
...@@ -491,6 +509,9 @@ class StableDiffusionKDiffusionPipeline(DiffusionPipeline, TextualInversionLoade ...@@ -491,6 +509,9 @@ class StableDiffusionKDiffusionPipeline(DiffusionPipeline, TextualInversionLoade
Karras`. Karras`.
noise_sampler_seed (`int`, *optional*, defaults to `None`): noise_sampler_seed (`int`, *optional*, defaults to `None`):
The random seed to use for the noise sampler. If `None`, a random seed will be generated. The random seed to use for the noise sampler. If `None`, a random seed will be generated.
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
Returns: Returns:
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`. [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
...@@ -532,6 +553,7 @@ class StableDiffusionKDiffusionPipeline(DiffusionPipeline, TextualInversionLoade ...@@ -532,6 +553,7 @@ class StableDiffusionKDiffusionPipeline(DiffusionPipeline, TextualInversionLoade
negative_prompt, negative_prompt,
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
clip_skip=clip_skip,
) )
# For classifier free guidance, we need to do two forward passes. # For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch # Here we concatenate the unconditional and text embeddings into a single batch
......
...@@ -202,6 +202,7 @@ class StableDiffusionLDM3DPipeline( ...@@ -202,6 +202,7 @@ class StableDiffusionLDM3DPipeline(
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
**kwargs,
): ):
deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
...@@ -215,6 +216,7 @@ class StableDiffusionLDM3DPipeline( ...@@ -215,6 +216,7 @@ class StableDiffusionLDM3DPipeline(
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
lora_scale=lora_scale, lora_scale=lora_scale,
**kwargs,
) )
# concatenate for backwards comp # concatenate for backwards comp
...@@ -233,6 +235,7 @@ class StableDiffusionLDM3DPipeline( ...@@ -233,6 +235,7 @@ class StableDiffusionLDM3DPipeline(
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
clip_skip: Optional[int] = None,
): ):
r""" r"""
Encodes the prompt into text encoder hidden states. Encodes the prompt into text encoder hidden states.
...@@ -258,7 +261,10 @@ class StableDiffusionLDM3DPipeline( ...@@ -258,7 +261,10 @@ class StableDiffusionLDM3DPipeline(
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument. argument.
lora_scale (`float`, *optional*): lora_scale (`float`, *optional*):
A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
""" """
# set lora scale so that monkey patched LoRA # set lora scale so that monkey patched LoRA
# function of text encoder can correctly access it # function of text encoder can correctly access it
...@@ -306,11 +312,22 @@ class StableDiffusionLDM3DPipeline( ...@@ -306,11 +312,22 @@ class StableDiffusionLDM3DPipeline(
else: else:
attention_mask = None attention_mask = None
prompt_embeds = self.text_encoder( if clip_skip is None:
text_input_ids.to(device), prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
attention_mask=attention_mask, prompt_embeds = prompt_embeds[0]
) else:
prompt_embeds = prompt_embeds[0] prompt_embeds = self.text_encoder(
text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
)
# Access the `hidden_states` first, that contains a tuple of
# all the hidden states from the encoder layers. Then index into
# the tuple to access the hidden states from the desired layer.
prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
# We also need to apply the final LayerNorm here to not mess with the
# representations. The `last_hidden_states` that we typically use for
# obtaining the final prompt representations passes through the LayerNorm
# layer.
prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
if self.text_encoder is not None: if self.text_encoder is not None:
prompt_embeds_dtype = self.text_encoder.dtype prompt_embeds_dtype = self.text_encoder.dtype
...@@ -501,6 +518,7 @@ class StableDiffusionLDM3DPipeline( ...@@ -501,6 +518,7 @@ class StableDiffusionLDM3DPipeline(
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
callback_steps: int = 1, callback_steps: int = 1,
cross_attention_kwargs: Optional[Dict[str, Any]] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None,
clip_skip: Optional[int] = None,
): ):
r""" r"""
The call function to the pipeline for generation. The call function to the pipeline for generation.
...@@ -553,7 +571,9 @@ class StableDiffusionLDM3DPipeline( ...@@ -553,7 +571,9 @@ class StableDiffusionLDM3DPipeline(
cross_attention_kwargs (`dict`, *optional*): cross_attention_kwargs (`dict`, *optional*):
A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
[`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
Examples: Examples:
Returns: Returns:
...@@ -595,6 +615,7 @@ class StableDiffusionLDM3DPipeline( ...@@ -595,6 +615,7 @@ class StableDiffusionLDM3DPipeline(
negative_prompt, negative_prompt,
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
clip_skip=clip_skip,
) )
# For classifier free guidance, we need to do two forward passes. # For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch # Here we concatenate the unconditional and text embeddings into a single batch
......
...@@ -174,6 +174,7 @@ class StableDiffusionModelEditingPipeline(DiffusionPipeline, TextualInversionLoa ...@@ -174,6 +174,7 @@ class StableDiffusionModelEditingPipeline(DiffusionPipeline, TextualInversionLoa
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
**kwargs,
): ):
deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
...@@ -187,6 +188,7 @@ class StableDiffusionModelEditingPipeline(DiffusionPipeline, TextualInversionLoa ...@@ -187,6 +188,7 @@ class StableDiffusionModelEditingPipeline(DiffusionPipeline, TextualInversionLoa
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
lora_scale=lora_scale, lora_scale=lora_scale,
**kwargs,
) )
# concatenate for backwards comp # concatenate for backwards comp
...@@ -205,6 +207,7 @@ class StableDiffusionModelEditingPipeline(DiffusionPipeline, TextualInversionLoa ...@@ -205,6 +207,7 @@ class StableDiffusionModelEditingPipeline(DiffusionPipeline, TextualInversionLoa
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
clip_skip: Optional[int] = None,
): ):
r""" r"""
Encodes the prompt into text encoder hidden states. Encodes the prompt into text encoder hidden states.
...@@ -230,7 +233,10 @@ class StableDiffusionModelEditingPipeline(DiffusionPipeline, TextualInversionLoa ...@@ -230,7 +233,10 @@ class StableDiffusionModelEditingPipeline(DiffusionPipeline, TextualInversionLoa
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument. argument.
lora_scale (`float`, *optional*): lora_scale (`float`, *optional*):
A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
""" """
# set lora scale so that monkey patched LoRA # set lora scale so that monkey patched LoRA
# function of text encoder can correctly access it # function of text encoder can correctly access it
...@@ -278,11 +284,22 @@ class StableDiffusionModelEditingPipeline(DiffusionPipeline, TextualInversionLoa ...@@ -278,11 +284,22 @@ class StableDiffusionModelEditingPipeline(DiffusionPipeline, TextualInversionLoa
else: else:
attention_mask = None attention_mask = None
prompt_embeds = self.text_encoder( if clip_skip is None:
text_input_ids.to(device), prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
attention_mask=attention_mask, prompt_embeds = prompt_embeds[0]
) else:
prompt_embeds = prompt_embeds[0] prompt_embeds = self.text_encoder(
text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
)
# Access the `hidden_states` first, that contains a tuple of
# all the hidden states from the encoder layers. Then index into
# the tuple to access the hidden states from the desired layer.
prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
# We also need to apply the final LayerNorm here to not mess with the
# representations. The `last_hidden_states` that we typically use for
# obtaining the final prompt representations passes through the LayerNorm
# layer.
prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
if self.text_encoder is not None: if self.text_encoder is not None:
prompt_embeds_dtype = self.text_encoder.dtype prompt_embeds_dtype = self.text_encoder.dtype
...@@ -603,6 +620,7 @@ class StableDiffusionModelEditingPipeline(DiffusionPipeline, TextualInversionLoa ...@@ -603,6 +620,7 @@ class StableDiffusionModelEditingPipeline(DiffusionPipeline, TextualInversionLoa
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
callback_steps: int = 1, callback_steps: int = 1,
cross_attention_kwargs: Optional[Dict[str, Any]] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None,
clip_skip: Optional[int] = None,
): ):
r""" r"""
The call function to the pipeline for generation. The call function to the pipeline for generation.
...@@ -655,6 +673,9 @@ class StableDiffusionModelEditingPipeline(DiffusionPipeline, TextualInversionLoa ...@@ -655,6 +673,9 @@ class StableDiffusionModelEditingPipeline(DiffusionPipeline, TextualInversionLoa
cross_attention_kwargs (`dict`, *optional*): cross_attention_kwargs (`dict`, *optional*):
A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
[`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
Examples: Examples:
...@@ -718,6 +739,7 @@ class StableDiffusionModelEditingPipeline(DiffusionPipeline, TextualInversionLoa ...@@ -718,6 +739,7 @@ class StableDiffusionModelEditingPipeline(DiffusionPipeline, TextualInversionLoa
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
lora_scale=text_encoder_lora_scale, lora_scale=text_encoder_lora_scale,
clip_skip=clip_skip,
) )
# For classifier free guidance, we need to do two forward passes. # For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch # Here we concatenate the unconditional and text embeddings into a single batch
......
...@@ -151,6 +151,7 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM ...@@ -151,6 +151,7 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
**kwargs,
): ):
deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
...@@ -164,6 +165,7 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM ...@@ -164,6 +165,7 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
lora_scale=lora_scale, lora_scale=lora_scale,
**kwargs,
) )
# concatenate for backwards comp # concatenate for backwards comp
...@@ -182,6 +184,7 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM ...@@ -182,6 +184,7 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
clip_skip: Optional[int] = None,
): ):
r""" r"""
Encodes the prompt into text encoder hidden states. Encodes the prompt into text encoder hidden states.
...@@ -207,7 +210,10 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM ...@@ -207,7 +210,10 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument. argument.
lora_scale (`float`, *optional*): lora_scale (`float`, *optional*):
A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
""" """
# set lora scale so that monkey patched LoRA # set lora scale so that monkey patched LoRA
# function of text encoder can correctly access it # function of text encoder can correctly access it
...@@ -255,11 +261,22 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM ...@@ -255,11 +261,22 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM
else: else:
attention_mask = None attention_mask = None
prompt_embeds = self.text_encoder( if clip_skip is None:
text_input_ids.to(device), prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
attention_mask=attention_mask, prompt_embeds = prompt_embeds[0]
) else:
prompt_embeds = prompt_embeds[0] prompt_embeds = self.text_encoder(
text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
)
# Access the `hidden_states` first, that contains a tuple of
# all the hidden states from the encoder layers. Then index into
# the tuple to access the hidden states from the desired layer.
prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
# We also need to apply the final LayerNorm here to not mess with the
# representations. The `last_hidden_states` that we typically use for
# obtaining the final prompt representations passes through the LayerNorm
# layer.
prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
if self.text_encoder is not None: if self.text_encoder is not None:
prompt_embeds_dtype = self.text_encoder.dtype prompt_embeds_dtype = self.text_encoder.dtype
...@@ -498,6 +515,7 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM ...@@ -498,6 +515,7 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM
callback_steps: Optional[int] = 1, callback_steps: Optional[int] = 1,
cross_attention_kwargs: Optional[Dict[str, Any]] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None,
circular_padding: bool = False, circular_padding: bool = False,
clip_skip: Optional[int] = None,
): ):
r""" r"""
The call function to the pipeline for generation. The call function to the pipeline for generation.
...@@ -559,7 +577,9 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM ...@@ -559,7 +577,9 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM
If set to `True`, circular padding is applied to ensure there are no stitching artifacts. Circular If set to `True`, circular padding is applied to ensure there are no stitching artifacts. Circular
padding allows the model to seamlessly generate a transition from the rightmost part of the image to padding allows the model to seamlessly generate a transition from the rightmost part of the image to
the leftmost part, maintaining consistency in a 360-degree sense. the leftmost part, maintaining consistency in a 360-degree sense.
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
Examples: Examples:
Returns: Returns:
...@@ -605,6 +625,7 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM ...@@ -605,6 +625,7 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
lora_scale=text_encoder_lora_scale, lora_scale=text_encoder_lora_scale,
clip_skip=clip_skip,
) )
# For classifier free guidance, we need to do two forward passes. # For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch # Here we concatenate the unconditional and text embeddings into a single batch
......
...@@ -186,6 +186,7 @@ class StableDiffusionParadigmsPipeline( ...@@ -186,6 +186,7 @@ class StableDiffusionParadigmsPipeline(
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
**kwargs,
): ):
deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
...@@ -199,6 +200,7 @@ class StableDiffusionParadigmsPipeline( ...@@ -199,6 +200,7 @@ class StableDiffusionParadigmsPipeline(
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
lora_scale=lora_scale, lora_scale=lora_scale,
**kwargs,
) )
# concatenate for backwards comp # concatenate for backwards comp
...@@ -217,6 +219,7 @@ class StableDiffusionParadigmsPipeline( ...@@ -217,6 +219,7 @@ class StableDiffusionParadigmsPipeline(
prompt_embeds: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None,
lora_scale: Optional[float] = None, lora_scale: Optional[float] = None,
clip_skip: Optional[int] = None,
): ):
r""" r"""
Encodes the prompt into text encoder hidden states. Encodes the prompt into text encoder hidden states.
...@@ -242,7 +245,10 @@ class StableDiffusionParadigmsPipeline( ...@@ -242,7 +245,10 @@ class StableDiffusionParadigmsPipeline(
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument. argument.
lora_scale (`float`, *optional*): lora_scale (`float`, *optional*):
A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
""" """
# set lora scale so that monkey patched LoRA # set lora scale so that monkey patched LoRA
# function of text encoder can correctly access it # function of text encoder can correctly access it
...@@ -290,11 +296,22 @@ class StableDiffusionParadigmsPipeline( ...@@ -290,11 +296,22 @@ class StableDiffusionParadigmsPipeline(
else: else:
attention_mask = None attention_mask = None
prompt_embeds = self.text_encoder( if clip_skip is None:
text_input_ids.to(device), prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
attention_mask=attention_mask, prompt_embeds = prompt_embeds[0]
) else:
prompt_embeds = prompt_embeds[0] prompt_embeds = self.text_encoder(
text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
)
# Access the `hidden_states` first, that contains a tuple of
# all the hidden states from the encoder layers. Then index into
# the tuple to access the hidden states from the desired layer.
prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
# We also need to apply the final LayerNorm here to not mess with the
# representations. The `last_hidden_states` that we typically use for
# obtaining the final prompt representations passes through the LayerNorm
# layer.
prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
if self.text_encoder is not None: if self.text_encoder is not None:
prompt_embeds_dtype = self.text_encoder.dtype prompt_embeds_dtype = self.text_encoder.dtype
...@@ -497,6 +514,7 @@ class StableDiffusionParadigmsPipeline( ...@@ -497,6 +514,7 @@ class StableDiffusionParadigmsPipeline(
callback_steps: int = 1, callback_steps: int = 1,
cross_attention_kwargs: Optional[Dict[str, Any]] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None,
debug: bool = False, debug: bool = False,
clip_skip: int = None,
): ):
r""" r"""
The call function to the pipeline for generation. The call function to the pipeline for generation.
...@@ -558,7 +576,9 @@ class StableDiffusionParadigmsPipeline( ...@@ -558,7 +576,9 @@ class StableDiffusionParadigmsPipeline(
[`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
debug (`bool`, *optional*, defaults to `False`): debug (`bool`, *optional*, defaults to `False`):
Whether or not to run in debug mode. In debug mode, `torch.cumsum` is evaluated using the CPU. Whether or not to run in debug mode. In debug mode, `torch.cumsum` is evaluated using the CPU.
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
Examples: Examples:
Returns: Returns:
...@@ -600,6 +620,7 @@ class StableDiffusionParadigmsPipeline( ...@@ -600,6 +620,7 @@ class StableDiffusionParadigmsPipeline(
negative_prompt, negative_prompt,
prompt_embeds=prompt_embeds, prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
clip_skip=clip_skip,
) )
# For classifier free guidance, we need to do two forward passes. # For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch # Here we concatenate the unconditional and text embeddings into a single batch
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment