[WIP] core: add support for clip skip to SDXL (#5057)

* core: add support for clip skip to SDXL * add clip_skip support to the rest of the pipeline. * Empty-Commit

[WIP] core: add support for clip skip to SDXL (#5057)
* core: add support for clip skip to SDXL * add clip_skip support to the rest of the pipeline. * Empty-Commit
edcbb6f4 · Sayak Paul · GitHub · 5a287d3f · edcbb6f4 · edcbb6f4
Unverified Commit edcbb6f4 authored Sep 19, 2023 by Sayak Paul Committed by GitHub Sep 19, 2023
7 changed files
--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py
@@ -263,6 +263,7 @@ class StableDiffusionXLControlNetInpaintPipeline(
        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
    ):
        r"""
        Encodes the prompt into text encoder hidden states.
@@ -302,6 +303,9 @@ class StableDiffusionXLControlNetInpaintPipeline(
                input argument.
            lora_scale (`float`, *optional*):
                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
        """
        device = device or self._execution_device
@@ -358,14 +362,15 @@ class StableDiffusionXLControlNetInpaintPipeline(
                        f" {tokenizer.model_max_length} tokens: {removed_text}"
                    )
-                prompt_embeds = text_encoder(
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
-                    text_input_ids.to(device),
-                    output_hidden_states=True,
-                )
                # We are only ALWAYS interested in the pooled output of the final text encoder
                pooled_prompt_embeds = prompt_embeds[0]
-                prompt_embeds = prompt_embeds.hidden_states[-2]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
                prompt_embeds_list.append(prompt_embeds)
@@ -971,6 +976,7 @@ class StableDiffusionXLControlNetInpaintPipeline(
        target_size: Tuple[int, int] = None,
        aesthetic_score: float = 6.0,
        negative_aesthetic_score: float = 2.5,
+        clip_skip: Optional[int] = None,
    ):
        r"""
        Function invoked when calling the pipeline for generation.
@@ -1097,6 +1103,9 @@ class StableDiffusionXLControlNetInpaintPipeline(
                Part of SDXL's micro-conditioning as explained in section 2.2 of
                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to
                simulate an aesthetic score of the generated image by influencing the negative text condition.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
        Examples:
@@ -1192,6 +1201,7 @@ class StableDiffusionXLControlNetInpaintPipeline(
            pooled_prompt_embeds=pooled_prompt_embeds,
            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
            lora_scale=text_encoder_lora_scale,
+            clip_skip=clip_skip,
        )
        # 4. set timesteps

--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py
@@ -236,6 +236,7 @@ class StableDiffusionXLControlNetPipeline(
        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
    ):
        r"""
        Encodes the prompt into text encoder hidden states.
@@ -275,6 +276,9 @@ class StableDiffusionXLControlNetPipeline(
                input argument.
            lora_scale (`float`, *optional*):
                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
        """
        device = device or self._execution_device
@@ -331,14 +335,15 @@ class StableDiffusionXLControlNetPipeline(
                        f" {tokenizer.model_max_length} tokens: {removed_text}"
                    )
-                prompt_embeds = text_encoder(
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
-                    text_input_ids.to(device),
-                    output_hidden_states=True,
-                )
                # We are only ALWAYS interested in the pooled output of the final text encoder
                pooled_prompt_embeds = prompt_embeds[0]
-                prompt_embeds = prompt_embeds.hidden_states[-2]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
                prompt_embeds_list.append(prompt_embeds)
@@ -767,6 +772,7 @@ class StableDiffusionXLControlNetPipeline(
        negative_original_size: Optional[Tuple[int, int]] = None,
        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
        negative_target_size: Optional[Tuple[int, int]] = None,
+        clip_skip: Optional[int] = None,
    ):
        r"""
        The call function to the pipeline for generation.
@@ -884,6 +890,9 @@ class StableDiffusionXLControlNetPipeline(
                as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
        Examples:
@@ -968,6 +977,7 @@ class StableDiffusionXLControlNetPipeline(
            pooled_prompt_embeds=pooled_prompt_embeds,
            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
            lora_scale=text_encoder_lora_scale,
+            clip_skip=clip_skip,
        )
        # 4. Prepare image

--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py
@@ -274,6 +274,7 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
    ):
        r"""
        Encodes the prompt into text encoder hidden states.
@@ -313,6 +314,9 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
                input argument.
            lora_scale (`float`, *optional*):
                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
        """
        device = device or self._execution_device
@@ -369,14 +373,15 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
                        f" {tokenizer.model_max_length} tokens: {removed_text}"
                    )
-                prompt_embeds = text_encoder(
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
-                    text_input_ids.to(device),
-                    output_hidden_states=True,
-                )
                # We are only ALWAYS interested in the pooled output of the final text encoder
                pooled_prompt_embeds = prompt_embeds[0]
-                prompt_embeds = prompt_embeds.hidden_states[-2]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
                prompt_embeds_list.append(prompt_embeds)
@@ -914,6 +919,7 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
        negative_target_size: Optional[Tuple[int, int]] = None,
        aesthetic_score: float = 6.0,
        negative_aesthetic_score: float = 2.5,
+        clip_skip: Optional[int] = None,
    ):
        r"""
        Function invoked when calling the pipeline for generation.
@@ -1057,6 +1063,9 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
                Part of SDXL's micro-conditioning as explained in section 2.2 of
                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to
                simulate an aesthetic score of the generated image by influencing the negative text condition.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
        Examples:
@@ -1143,6 +1152,7 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
            pooled_prompt_embeds=pooled_prompt_embeds,
            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
            lora_scale=text_encoder_lora_scale,
+            clip_skip=clip_skip,
        )
        # 4. Prepare image and controlnet_conditioning_image

--- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
+++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
@@ -212,6 +212,7 @@ class StableDiffusionXLPipeline(
        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
    ):
        r"""
        Encodes the prompt into text encoder hidden states.
@@ -251,6 +252,9 @@ class StableDiffusionXLPipeline(
                input argument.
            lora_scale (`float`, *optional*):
                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
        """
        device = device or self._execution_device
@@ -307,14 +311,15 @@ class StableDiffusionXLPipeline(
                        f" {tokenizer.model_max_length} tokens: {removed_text}"
                    )
-                prompt_embeds = text_encoder(
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
-                    text_input_ids.to(device),
-                    output_hidden_states=True,
-                )
                # We are only ALWAYS interested in the pooled output of the final text encoder
                pooled_prompt_embeds = prompt_embeds[0]
-                prompt_embeds = prompt_embeds.hidden_states[-2]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
                prompt_embeds_list.append(prompt_embeds)
@@ -577,6 +582,7 @@ class StableDiffusionXLPipeline(
        negative_original_size: Optional[Tuple[int, int]] = None,
        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
        negative_target_size: Optional[Tuple[int, int]] = None,
+        clip_skip: Optional[int] = None,
    ):
        r"""
        Function invoked when calling the pipeline for generation.
@@ -764,6 +770,7 @@ class StableDiffusionXLPipeline(
            pooled_prompt_embeds=pooled_prompt_embeds,
            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
            lora_scale=text_encoder_lora_scale,
+            clip_skip=clip_skip,
        )
        # 4. Prepare timesteps

--- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
@@ -219,6 +219,7 @@ class StableDiffusionXLImg2ImgPipeline(
        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
    ):
        r"""
        Encodes the prompt into text encoder hidden states.
@@ -258,6 +259,9 @@ class StableDiffusionXLImg2ImgPipeline(
                input argument.
            lora_scale (`float`, *optional*):
                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
        """
        device = device or self._execution_device
@@ -314,14 +318,15 @@ class StableDiffusionXLImg2ImgPipeline(
                        f" {tokenizer.model_max_length} tokens: {removed_text}"
                    )
-                prompt_embeds = text_encoder(
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
-                    text_input_ids.to(device),
-                    output_hidden_states=True,
-                )
                # We are only ALWAYS interested in the pooled output of the final text encoder
                pooled_prompt_embeds = prompt_embeds[0]
-                prompt_embeds = prompt_embeds.hidden_states[-2]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
                prompt_embeds_list.append(prompt_embeds)
@@ -688,6 +693,7 @@ class StableDiffusionXLImg2ImgPipeline(
        negative_target_size: Optional[Tuple[int, int]] = None,
        aesthetic_score: float = 6.0,
        negative_aesthetic_score: float = 2.5,
+        clip_skip: Optional[int] = None,
    ):
        r"""
        Function invoked when calling the pipeline for generation.
@@ -823,6 +829,9 @@ class StableDiffusionXLImg2ImgPipeline(
                Part of SDXL's micro-conditioning as explained in section 2.2 of
                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to
                simulate an aesthetic score of the generated image by influencing the negative text condition.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
        Examples:
@@ -881,6 +890,7 @@ class StableDiffusionXLImg2ImgPipeline(
            pooled_prompt_embeds=pooled_prompt_embeds,
            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
            lora_scale=text_encoder_lora_scale,
+            clip_skip=clip_skip,
        )
        # 4. Preprocess image

--- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py
@@ -368,6 +368,7 @@ class StableDiffusionXLInpaintPipeline(
        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
    ):
        r"""
        Encodes the prompt into text encoder hidden states.
@@ -407,6 +408,9 @@ class StableDiffusionXLInpaintPipeline(
                input argument.
            lora_scale (`float`, *optional*):
                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
        """
        device = device or self._execution_device
@@ -463,14 +467,15 @@ class StableDiffusionXLInpaintPipeline(
                        f" {tokenizer.model_max_length} tokens: {removed_text}"
                    )
-                prompt_embeds = text_encoder(
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
-                    text_input_ids.to(device),
-                    output_hidden_states=True,
-                )
                # We are only ALWAYS interested in the pooled output of the final text encoder
                pooled_prompt_embeds = prompt_embeds[0]
-                prompt_embeds = prompt_embeds.hidden_states[-2]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
                prompt_embeds_list.append(prompt_embeds)
@@ -910,6 +915,7 @@ class StableDiffusionXLInpaintPipeline(
        negative_target_size: Optional[Tuple[int, int]] = None,
        aesthetic_score: float = 6.0,
        negative_aesthetic_score: float = 2.5,
+        clip_skip: Optional[int] = None,
    ):
        r"""
        Function invoked when calling the pipeline for generation.
@@ -1057,6 +1063,9 @@ class StableDiffusionXLInpaintPipeline(
                Part of SDXL's micro-conditioning as explained in section 2.2 of
                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to
                simulate an aesthetic score of the generated image by influencing the negative text condition.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
        Examples:
@@ -1120,6 +1129,7 @@ class StableDiffusionXLInpaintPipeline(
            pooled_prompt_embeds=pooled_prompt_embeds,
            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
            lora_scale=text_encoder_lora_scale,
+            clip_skip=clip_skip,
        )
        # 4. set timesteps

--- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py
+++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py
@@ -236,6 +236,7 @@ class StableDiffusionXLAdapterPipeline(
        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        lora_scale: Optional[float] = None,
+        clip_skip: Optional[int] = None,
    ):
        r"""
        Encodes the prompt into text encoder hidden states.
@@ -275,6 +276,9 @@ class StableDiffusionXLAdapterPipeline(
                input argument.
            lora_scale (`float`, *optional*):
                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
        """
        device = device or self._execution_device
@@ -331,14 +335,15 @@ class StableDiffusionXLAdapterPipeline(
                        f" {tokenizer.model_max_length} tokens: {removed_text}"
                    )
-                prompt_embeds = text_encoder(
+                prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
-                    text_input_ids.to(device),
-                    output_hidden_states=True,
-                )
                # We are only ALWAYS interested in the pooled output of the final text encoder
                pooled_prompt_embeds = prompt_embeds[0]
-                prompt_embeds = prompt_embeds.hidden_states[-2]
+                if clip_skip is None:
+                    prompt_embeds = prompt_embeds.hidden_states[-2]
+                else:
+                    # "2" because SDXL always indexes from the penultimate layer.
+                    prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
                prompt_embeds_list.append(prompt_embeds)
@@ -634,6 +639,7 @@ class StableDiffusionXLAdapterPipeline(
        negative_target_size: Optional[Tuple[int, int]] = None,
        adapter_conditioning_scale: Union[float, List[float]] = 1.0,
        adapter_conditioning_factor: float = 1.0,
+        clip_skip: Optional[int] = None,
    ):
        r"""
        Function invoked when calling the pipeline for generation.
@@ -765,6 +771,10 @@ class StableDiffusionXLAdapterPipeline(
                The fraction of timesteps for which adapter should be applied. If `adapter_conditioning_factor` is
                `0.0`, adapter is not applied at all. If `adapter_conditioning_factor` is `1.0`, adapter is applied for
                all timesteps. If `adapter_conditioning_factor` is `0.5`, adapter is applied for half of the timesteps.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
        Examples:
        Returns:
@@ -830,6 +840,7 @@ class StableDiffusionXLAdapterPipeline(
            negative_prompt_embeds=negative_prompt_embeds,
            pooled_prompt_embeds=pooled_prompt_embeds,
            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            clip_skip=clip_skip,
        )
        # 4. Prepare timesteps