Unverified commit edcbb6f4, authored by Sayak Paul and committed by GitHub

[WIP] core: add support for clip skip to SDXL (#5057)

* core: add support for clip skip to SDXL

* add clip_skip support to the rest of the pipeline.

* Empty-Commit
parent 5a287d3f
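For orientation, a minimal usage sketch (assumed, not part of this commit; the checkpoint id and prompt are illustrative): once `clip_skip` is exposed on the SDXL pipelines below, it can be passed directly at call time.

import torch
from diffusers import StableDiffusionXLPipeline

# Load any SDXL checkpoint; fp16 on GPU is the usual setup.
pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")

# clip_skip=1 skips one more CLIP layer than the SDXL default
# (see the hidden_states indexing in the hunks below).
image = pipe("an astronaut riding a horse on the moon", clip_skip=1).images[0]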
@@ -263,6 +263,7 @@ class StableDiffusionXLControlNetInpaintPipeline(
         pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
     ):
         r"""
         Encodes the prompt into text encoder hidden states.
@@ -302,6 +303,9 @@ class StableDiffusionXLControlNetInpaintPipeline(
                 input argument.
             lora_scale (`float`, *optional*):
                 A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
         """
         device = device or self._execution_device
@@ -358,14 +362,15 @@ class StableDiffusionXLControlNetInpaintPipeline(
                     f" {tokenizer.model_max_length} tokens: {removed_text}"
                 )
 
-                prompt_embeds = text_encoder(
-                    text_input_ids.to(device),
-                    output_hidden_states=True,
-                )
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
 
                 # We are only ALWAYS interested in the pooled output of the final text encoder
                 pooled_prompt_embeds = prompt_embeds[0]
-                prompt_embeds = prompt_embeds.hidden_states[-2]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
 
                 prompt_embeds_list.append(prompt_embeds)
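To make the new indexing concrete, a standalone sketch (not part of the diff; the 12-layer count is an assumption for illustration): `hidden_states` holds the token embeddings followed by one entry per encoder layer, so the SDXL default `[-2]` is the penultimate layer, and each increment of `clip_skip` steps one layer further back from there.

# Illustration of the selection branch added above.
hidden_states = ["embeddings"] + [f"layer_{i}" for i in range(1, 13)]  # 12 encoder layers

def select(clip_skip=None):
    # Mirrors the logic introduced in this commit.
    return hidden_states[-2] if clip_skip is None else hidden_states[-(clip_skip + 2)]

print(select())   # layer_11: penultimate layer, the SDXL default
print(select(1))  # layer_10: one additional layer skipped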
@@ -971,6 +976,7 @@ class StableDiffusionXLControlNetInpaintPipeline(
         target_size: Tuple[int, int] = None,
         aesthetic_score: float = 6.0,
         negative_aesthetic_score: float = 2.5,
+        clip_skip: Optional[int] = None,
     ):
         r"""
         Function invoked when calling the pipeline for generation.
@@ -1097,6 +1103,9 @@ class StableDiffusionXLControlNetInpaintPipeline(
                 Part of SDXL's micro-conditioning as explained in section 2.2 of
                 [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to
                 simulate an aesthetic score of the generated image by influencing the negative text condition.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
 
         Examples:
@@ -1192,6 +1201,7 @@ class StableDiffusionXLControlNetInpaintPipeline(
             pooled_prompt_embeds=pooled_prompt_embeds,
             negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
             lora_scale=text_encoder_lora_scale,
+            clip_skip=clip_skip,
         )
 
         # 4. set timesteps
...
@@ -236,6 +236,7 @@ class StableDiffusionXLControlNetPipeline(
         pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
     ):
         r"""
         Encodes the prompt into text encoder hidden states.
@@ -275,6 +276,9 @@ class StableDiffusionXLControlNetPipeline(
                 input argument.
             lora_scale (`float`, *optional*):
                 A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
         """
         device = device or self._execution_device
@@ -331,14 +335,15 @@ class StableDiffusionXLControlNetPipeline(
                     f" {tokenizer.model_max_length} tokens: {removed_text}"
                 )
 
-                prompt_embeds = text_encoder(
-                    text_input_ids.to(device),
-                    output_hidden_states=True,
-                )
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
 
                 # We are only ALWAYS interested in the pooled output of the final text encoder
                 pooled_prompt_embeds = prompt_embeds[0]
-                prompt_embeds = prompt_embeds.hidden_states[-2]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
 
                 prompt_embeds_list.append(prompt_embeds)
@@ -767,6 +772,7 @@ class StableDiffusionXLControlNetPipeline(
         negative_original_size: Optional[Tuple[int, int]] = None,
         negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
         negative_target_size: Optional[Tuple[int, int]] = None,
+        clip_skip: Optional[int] = None,
     ):
         r"""
         The call function to the pipeline for generation.
@@ -884,6 +890,9 @@ class StableDiffusionXLControlNetPipeline(
                 as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
                 [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
                 information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
 
         Examples:
@@ -968,6 +977,7 @@ class StableDiffusionXLControlNetPipeline(
             pooled_prompt_embeds=pooled_prompt_embeds,
             negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
             lora_scale=text_encoder_lora_scale,
+            clip_skip=clip_skip,
         )
 
         # 4. Prepare image
...
@@ -274,6 +274,7 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
         pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
     ):
         r"""
         Encodes the prompt into text encoder hidden states.
@@ -313,6 +314,9 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
                 input argument.
             lora_scale (`float`, *optional*):
                 A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
         """
         device = device or self._execution_device
@@ -369,14 +373,15 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
                     f" {tokenizer.model_max_length} tokens: {removed_text}"
                 )
 
-                prompt_embeds = text_encoder(
-                    text_input_ids.to(device),
-                    output_hidden_states=True,
-                )
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
 
                 # We are only ALWAYS interested in the pooled output of the final text encoder
                 pooled_prompt_embeds = prompt_embeds[0]
-                prompt_embeds = prompt_embeds.hidden_states[-2]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
 
                 prompt_embeds_list.append(prompt_embeds)
@@ -914,6 +919,7 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
         negative_target_size: Optional[Tuple[int, int]] = None,
         aesthetic_score: float = 6.0,
         negative_aesthetic_score: float = 2.5,
+        clip_skip: Optional[int] = None,
     ):
         r"""
         Function invoked when calling the pipeline for generation.
@@ -1057,6 +1063,9 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
                 Part of SDXL's micro-conditioning as explained in section 2.2 of
                 [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to
                 simulate an aesthetic score of the generated image by influencing the negative text condition.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
 
         Examples:
@@ -1143,6 +1152,7 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
             pooled_prompt_embeds=pooled_prompt_embeds,
             negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
             lora_scale=text_encoder_lora_scale,
+            clip_skip=clip_skip,
         )
 
         # 4. Prepare image and controlnet_conditioning_image
...
@@ -212,6 +212,7 @@ class StableDiffusionXLPipeline(
         pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
     ):
         r"""
         Encodes the prompt into text encoder hidden states.
@@ -251,6 +252,9 @@ class StableDiffusionXLPipeline(
                 input argument.
             lora_scale (`float`, *optional*):
                 A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
         """
         device = device or self._execution_device
@@ -307,14 +311,15 @@ class StableDiffusionXLPipeline(
                     f" {tokenizer.model_max_length} tokens: {removed_text}"
                 )
 
-                prompt_embeds = text_encoder(
-                    text_input_ids.to(device),
-                    output_hidden_states=True,
-                )
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
 
                 # We are only ALWAYS interested in the pooled output of the final text encoder
                 pooled_prompt_embeds = prompt_embeds[0]
-                prompt_embeds = prompt_embeds.hidden_states[-2]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
 
                 prompt_embeds_list.append(prompt_embeds)
@@ -577,6 +582,7 @@ class StableDiffusionXLPipeline(
         negative_original_size: Optional[Tuple[int, int]] = None,
         negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
         negative_target_size: Optional[Tuple[int, int]] = None,
+        clip_skip: Optional[int] = None,
     ):
         r"""
         Function invoked when calling the pipeline for generation.
@@ -764,6 +770,7 @@ class StableDiffusionXLPipeline(
             pooled_prompt_embeds=pooled_prompt_embeds,
             negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
             lora_scale=text_encoder_lora_scale,
+            clip_skip=clip_skip,
         )
 
         # 4. Prepare timesteps
...
@@ -219,6 +219,7 @@ class StableDiffusionXLImg2ImgPipeline(
         pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
     ):
         r"""
         Encodes the prompt into text encoder hidden states.
@@ -258,6 +259,9 @@ class StableDiffusionXLImg2ImgPipeline(
                 input argument.
             lora_scale (`float`, *optional*):
                 A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
         """
         device = device or self._execution_device
@@ -314,14 +318,15 @@ class StableDiffusionXLImg2ImgPipeline(
                     f" {tokenizer.model_max_length} tokens: {removed_text}"
                 )
 
-                prompt_embeds = text_encoder(
-                    text_input_ids.to(device),
-                    output_hidden_states=True,
-                )
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
 
                 # We are only ALWAYS interested in the pooled output of the final text encoder
                 pooled_prompt_embeds = prompt_embeds[0]
-                prompt_embeds = prompt_embeds.hidden_states[-2]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
 
                 prompt_embeds_list.append(prompt_embeds)
@@ -688,6 +693,7 @@ class StableDiffusionXLImg2ImgPipeline(
         negative_target_size: Optional[Tuple[int, int]] = None,
         aesthetic_score: float = 6.0,
         negative_aesthetic_score: float = 2.5,
+        clip_skip: Optional[int] = None,
     ):
         r"""
         Function invoked when calling the pipeline for generation.
@@ -823,6 +829,9 @@ class StableDiffusionXLImg2ImgPipeline(
                 Part of SDXL's micro-conditioning as explained in section 2.2 of
                 [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to
                 simulate an aesthetic score of the generated image by influencing the negative text condition.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
 
         Examples:
@@ -881,6 +890,7 @@ class StableDiffusionXLImg2ImgPipeline(
             pooled_prompt_embeds=pooled_prompt_embeds,
             negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
             lora_scale=text_encoder_lora_scale,
+            clip_skip=clip_skip,
         )
 
         # 4. Preprocess image
...
@@ -368,6 +368,7 @@ class StableDiffusionXLInpaintPipeline(
         pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
     ):
         r"""
         Encodes the prompt into text encoder hidden states.
@@ -407,6 +408,9 @@ class StableDiffusionXLInpaintPipeline(
                 input argument.
             lora_scale (`float`, *optional*):
                 A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
         """
         device = device or self._execution_device
@@ -463,14 +467,15 @@ class StableDiffusionXLInpaintPipeline(
                     f" {tokenizer.model_max_length} tokens: {removed_text}"
                 )
 
-                prompt_embeds = text_encoder(
-                    text_input_ids.to(device),
-                    output_hidden_states=True,
-                )
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
 
                 # We are only ALWAYS interested in the pooled output of the final text encoder
                 pooled_prompt_embeds = prompt_embeds[0]
-                prompt_embeds = prompt_embeds.hidden_states[-2]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
 
                 prompt_embeds_list.append(prompt_embeds)
@@ -910,6 +915,7 @@ class StableDiffusionXLInpaintPipeline(
         negative_target_size: Optional[Tuple[int, int]] = None,
         aesthetic_score: float = 6.0,
         negative_aesthetic_score: float = 2.5,
+        clip_skip: Optional[int] = None,
     ):
         r"""
         Function invoked when calling the pipeline for generation.
@@ -1057,6 +1063,9 @@ class StableDiffusionXLInpaintPipeline(
                 Part of SDXL's micro-conditioning as explained in section 2.2 of
                 [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to
                 simulate an aesthetic score of the generated image by influencing the negative text condition.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
 
         Examples:
@@ -1120,6 +1129,7 @@ class StableDiffusionXLInpaintPipeline(
             pooled_prompt_embeds=pooled_prompt_embeds,
             negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
             lora_scale=text_encoder_lora_scale,
+            clip_skip=clip_skip,
         )
 
         # 4. set timesteps
...
@@ -236,6 +236,7 @@ class StableDiffusionXLAdapterPipeline(
         pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
     ):
         r"""
         Encodes the prompt into text encoder hidden states.
@@ -275,6 +276,9 @@ class StableDiffusionXLAdapterPipeline(
                 input argument.
             lora_scale (`float`, *optional*):
                 A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
         """
         device = device or self._execution_device
@@ -331,14 +335,15 @@ class StableDiffusionXLAdapterPipeline(
                     f" {tokenizer.model_max_length} tokens: {removed_text}"
                 )
 
-                prompt_embeds = text_encoder(
-                    text_input_ids.to(device),
-                    output_hidden_states=True,
-                )
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
 
                 # We are only ALWAYS interested in the pooled output of the final text encoder
                 pooled_prompt_embeds = prompt_embeds[0]
-                prompt_embeds = prompt_embeds.hidden_states[-2]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
 
                 prompt_embeds_list.append(prompt_embeds)
@@ -634,6 +639,7 @@ class StableDiffusionXLAdapterPipeline(
         negative_target_size: Optional[Tuple[int, int]] = None,
         adapter_conditioning_scale: Union[float, List[float]] = 1.0,
         adapter_conditioning_factor: float = 1.0,
+        clip_skip: Optional[int] = None,
     ):
         r"""
         Function invoked when calling the pipeline for generation.
@@ -765,6 +771,10 @@ class StableDiffusionXLAdapterPipeline(
                 The fraction of timesteps for which adapter should be applied. If `adapter_conditioning_factor` is
                 `0.0`, adapter is not applied at all. If `adapter_conditioning_factor` is `1.0`, adapter is applied for
                 all timesteps. If `adapter_conditioning_factor` is `0.5`, adapter is applied for half of the timesteps.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+
         Examples:
 
         Returns:
@@ -830,6 +840,7 @@ class StableDiffusionXLAdapterPipeline(
             negative_prompt_embeds=negative_prompt_embeds,
             pooled_prompt_embeds=pooled_prompt_embeds,
             negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            clip_skip=clip_skip,
         )
 
         # 4. Prepare timesteps
...