Unverified Commit f33b89ba authored by YiYi Xu, committed by GitHub

The Modular Diffusers (#9672)



Adding Modular Diffusers as an experimental feature.
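A rough usage sketch (illustrative only; the feature is experimental, so the
import path below is an assumption and the composition API may still change).
Each block declares the components, inputs and outputs it needs, and larger
steps are composed from smaller blocks:

    # hypothetical import path for the SDXL blocks added in this PR
    from diffusers.modular_pipelines.stable_diffusion_xl import StableDiffusionXLDenoiseStep

    denoise = StableDiffusionXLDenoiseStep()
    # the denoise step is a LoopSequentialPipelineBlocks: on every timestep it runs its
    # sub_blocks in order (before_denoiser -> denoiser -> after_denoiser)
    print(denoise.description)
    print(denoise.sub_blocks)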

---------
Co-authored-by: hlky <hlky@hlky.ac>
Co-authored-by: Álvaro Somoza <asomoza@users.noreply.github.com>
Co-authored-by: Aryan <aryan@huggingface.co>
Co-authored-by: Dhruv Nair <dhruv.nair@gmail.com>
Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>
parent 48a6d295
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, List, Tuple, Union
import numpy as np
import PIL
import torch
from ...configuration_utils import FrozenDict
from ...image_processor import VaeImageProcessor
from ...models import AutoencoderKL
from ...models.attention_processor import AttnProcessor2_0, XFormersAttnProcessor
from ...utils import logging
from ..modular_pipeline import (
PipelineBlock,
PipelineState,
)
from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
class StableDiffusionXLDecodeStep(PipelineBlock):
model_name = "stable-diffusion-xl"
@property
def expected_components(self) -> List[ComponentSpec]:
return [
ComponentSpec("vae", AutoencoderKL),
ComponentSpec(
"image_processor",
VaeImageProcessor,
config=FrozenDict({"vae_scale_factor": 8}),
default_creation_method="from_config",
),
]
@property
def description(self) -> str:
return "Step that decodes the denoised latents into images"
@property
def inputs(self) -> List[Tuple[str, Any]]:
return [
InputParam("output_type", default="pil"),
]
@property
def intermediate_inputs(self) -> List[str]:
return [
InputParam(
"latents",
required=True,
type_hint=torch.Tensor,
description="The denoised latents from the denoising step",
)
]
@property
def intermediate_outputs(self) -> List[str]:
return [
OutputParam(
"images",
type_hint=Union[List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray]],
description="The generated images; can be a list of PIL.Image.Image, torch.Tensor, or numpy arrays",
)
]
@staticmethod
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae with self->components
def upcast_vae(components):
dtype = components.vae.dtype
components.vae.to(dtype=torch.float32)
use_torch_2_0_or_xformers = isinstance(
components.vae.decoder.mid_block.attentions[0].processor,
(
AttnProcessor2_0,
XFormersAttnProcessor,
),
)
# if xformers or torch_2_0 is used attention block does not need
# to be in float32 which can save lots of memory
if use_torch_2_0_or_xformers:
components.vae.post_quant_conv.to(dtype)
components.vae.decoder.conv_in.to(dtype)
components.vae.decoder.mid_block.to(dtype)
@torch.no_grad()
def __call__(self, components, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
if not block_state.output_type == "latent":
latents = block_state.latents
# make sure the VAE is in float32 mode, as it overflows in float16
block_state.needs_upcasting = components.vae.dtype == torch.float16 and components.vae.config.force_upcast
if block_state.needs_upcasting:
self.upcast_vae(components)
latents = latents.to(next(iter(components.vae.post_quant_conv.parameters())).dtype)
elif latents.dtype != components.vae.dtype:
if torch.backends.mps.is_available():
# some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
components.vae = components.vae.to(latents.dtype)
# unscale/denormalize the latents
# denormalize with the mean and std if available and not None
block_state.has_latents_mean = (
hasattr(components.vae.config, "latents_mean") and components.vae.config.latents_mean is not None
)
block_state.has_latents_std = (
hasattr(components.vae.config, "latents_std") and components.vae.config.latents_std is not None
)
if block_state.has_latents_mean and block_state.has_latents_std:
block_state.latents_mean = (
torch.tensor(components.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype)
)
block_state.latents_std = (
torch.tensor(components.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype)
)
latents = (
latents * block_state.latents_std / components.vae.config.scaling_factor + block_state.latents_mean
)
else:
latents = latents / components.vae.config.scaling_factor
block_state.images = components.vae.decode(latents, return_dict=False)[0]
# cast back to fp16 if needed
if block_state.needs_upcasting:
components.vae.to(dtype=torch.float16)
else:
block_state.images = block_state.latents
# apply watermark if available
if hasattr(components, "watermark") and components.watermark is not None:
block_state.images = components.watermark.apply_watermark(block_state.images)
block_state.images = components.image_processor.postprocess(
block_state.images, output_type=block_state.output_type
)
self.set_block_state(state, block_state)
return components, state
class StableDiffusionXLInpaintOverlayMaskStep(PipelineBlock):
model_name = "stable-diffusion-xl"
@property
def description(self) -> str:
return (
"A post-processing step that overlays the mask on the image (inpainting task only).\n"
+ "Only needed when you are using the `padding_mask_crop` option when pre-processing the image and mask."
)
@property
def expected_components(self) -> List[ComponentSpec]:
return [
ComponentSpec(
"image_processor",
VaeImageProcessor,
config=FrozenDict({"vae_scale_factor": 8}),
default_creation_method="from_config",
),
]
@property
def inputs(self) -> List[Tuple[str, Any]]:
return [
InputParam("image"),
InputParam("mask_image"),
InputParam("padding_mask_crop"),
]
@property
def intermediate_inputs(self) -> List[str]:
return [
InputParam(
"images",
type_hint=Union[List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray]],
description="The generated images from the decode step",
),
InputParam(
"crops_coords",
type_hint=Tuple[int, int],
description="The crop coordinates to use for preprocessing/postprocessing the image and mask, for inpainting task only. Can be generated in vae_encode step.",
),
]
@torch.no_grad()
def __call__(self, components, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
if block_state.padding_mask_crop is not None and block_state.crops_coords is not None:
block_state.images = [
components.image_processor.apply_overlay(
block_state.mask_image, block_state.image, i, block_state.crops_coords
)
for i in block_state.images
]
self.set_block_state(state, block_state)
return components, state
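# Note on intended ordering (inferred from the block descriptions above, not enforced here):
# StableDiffusionXLDecodeStep consumes the denoised `latents` and writes `images`;
# StableDiffusionXLInpaintOverlayMaskStep then consumes those `images` together with the
# `crops_coords` produced by the vae_encode step, and only acts when `padding_mask_crop`
# was used during preprocessing.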
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
from typing import Any, List, Optional, Tuple
import torch
from ...configuration_utils import FrozenDict
from ...guiders import ClassifierFreeGuidance
from ...models import ControlNetModel, UNet2DConditionModel
from ...schedulers import EulerDiscreteScheduler
from ...utils import logging
from ..modular_pipeline import (
BlockState,
LoopSequentialPipelineBlocks,
PipelineBlock,
PipelineState,
)
from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
from .modular_pipeline import StableDiffusionXLModularPipeline
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
# YiYi experimenting with a composable denoise loop
# loop step (1): prepare latent input for denoiser
class StableDiffusionXLLoopBeforeDenoiser(PipelineBlock):
model_name = "stable-diffusion-xl"
@property
def expected_components(self) -> List[ComponentSpec]:
return [
ComponentSpec("scheduler", EulerDiscreteScheduler),
]
@property
def description(self) -> str:
return (
"Step within the denoising loop that prepares the latent input for the denoiser. "
"This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
"object (e.g. `StableDiffusionXLDenoiseLoopWrapper`)"
)
@property
def intermediate_inputs(self) -> List[str]:
return [
InputParam(
"latents",
required=True,
type_hint=torch.Tensor,
description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.",
),
]
@torch.no_grad()
def __call__(self, components: StableDiffusionXLModularPipeline, block_state: BlockState, i: int, t: int):
block_state.scaled_latents = components.scheduler.scale_model_input(block_state.latents, t)
return components, block_state
# loop step (1): prepare latent input for denoiser (with inpainting)
class StableDiffusionXLInpaintLoopBeforeDenoiser(PipelineBlock):
model_name = "stable-diffusion-xl"
@property
def expected_components(self) -> List[ComponentSpec]:
return [
ComponentSpec("scheduler", EulerDiscreteScheduler),
ComponentSpec("unet", UNet2DConditionModel),
]
@property
def description(self) -> str:
return (
"Step within the denoising loop that prepares the latent input for the denoiser (for inpainting workflow only). "
"This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` object"
)
@property
def intermediate_inputs(self) -> List[str]:
return [
InputParam(
"latents",
required=True,
type_hint=torch.Tensor,
description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.",
),
InputParam(
"mask",
type_hint=Optional[torch.Tensor],
description="The mask to use for the denoising process, for inpainting task only. Can be generated in vae_encode or prepare_latent step.",
),
InputParam(
"masked_image_latents",
type_hint=Optional[torch.Tensor],
description="The masked image latents to use for the denoising process, for inpainting task only. Can be generated in vae_encode or prepare_latent step.",
),
]
@staticmethod
def check_inputs(components, block_state):
num_channels_unet = components.num_channels_unet
if num_channels_unet == 9:
# default case for runwayml/stable-diffusion-inpainting
if block_state.mask is None or block_state.masked_image_latents is None:
raise ValueError("mask and masked_image_latents must be provided for inpainting-specific Unet")
num_channels_latents = block_state.latents.shape[1]
num_channels_mask = block_state.mask.shape[1]
num_channels_masked_image = block_state.masked_image_latents.shape[1]
if num_channels_latents + num_channels_mask + num_channels_masked_image != num_channels_unet:
raise ValueError(
f"Incorrect configuration settings! The config of `components.unet`: {components.unet.config} expects"
f" {components.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
f" = {num_channels_latents + num_channels_masked_image + num_channels_mask}. Please verify the config of"
" `components.unet` or your `mask_image` or `image` input."
)
@torch.no_grad()
def __call__(self, components: StableDiffusionXLModularPipeline, block_state: BlockState, i: int, t: int):
self.check_inputs(components, block_state)
block_state.scaled_latents = components.scheduler.scale_model_input(block_state.latents, t)
if components.num_channels_unet == 9:
block_state.scaled_latents = torch.cat(
[block_state.scaled_latents, block_state.mask, block_state.masked_image_latents], dim=1
)
return components, block_state
# loop step (2): denoise the latents with guidance
class StableDiffusionXLLoopDenoiser(PipelineBlock):
model_name = "stable-diffusion-xl"
@property
def expected_components(self) -> List[ComponentSpec]:
return [
ComponentSpec(
"guider",
ClassifierFreeGuidance,
config=FrozenDict({"guidance_scale": 7.5}),
default_creation_method="from_config",
),
ComponentSpec("unet", UNet2DConditionModel),
]
@property
def description(self) -> str:
return (
"Step within the denoising loop that denoises the latents with guidance. "
"This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
"object (e.g. `StableDiffusionXLDenoiseLoopWrapper`)"
)
@property
def inputs(self) -> List[Tuple[str, Any]]:
return [
InputParam("cross_attention_kwargs"),
]
@property
def intermediate_inputs(self) -> List[str]:
return [
InputParam(
"num_inference_steps",
required=True,
type_hint=int,
description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
),
InputParam(
"timestep_cond",
type_hint=Optional[torch.Tensor],
description="The guidance scale embedding to use for Latent Consistency Models (LCMs). Can be generated in prepare_additional_conditioning step.",
),
InputParam(
kwargs_type="guider_input_fields",
description=(
"All conditional model inputs that need to be prepared with the guider. "
"It should contain prompt_embeds/negative_prompt_embeds, "
"add_time_ids/negative_add_time_ids, "
"pooled_prompt_embeds/negative_pooled_prompt_embeds, "
"and ip_adapter_embeds/negative_ip_adapter_embeds (optional). "
"Please add `kwargs_type=guider_input_fields` to their parameter spec (`OutputParam`) when they are created and added to the pipeline state."
),
),
]
@torch.no_grad()
def __call__(
self, components: StableDiffusionXLModularPipeline, block_state: BlockState, i: int, t: int
) -> PipelineState:
# Map the keys we'll see on each `guider_state_batch` (e.g. guider_state_batch.prompt_embeds)
# to the corresponding (cond, uncond) fields on block_state. (e.g. block_state.prompt_embeds, block_state.negative_prompt_embeds)
guider_input_fields = {
"prompt_embeds": ("prompt_embeds", "negative_prompt_embeds"),
"time_ids": ("add_time_ids", "negative_add_time_ids"),
"text_embeds": ("pooled_prompt_embeds", "negative_pooled_prompt_embeds"),
"image_embeds": ("ip_adapter_embeds", "negative_ip_adapter_embeds"),
}
components.guider.set_state(step=i, num_inference_steps=block_state.num_inference_steps, timestep=t)
# Prepare mini-batches according to guidance method and `guider_input_fields`
# Each guider_state_batch will have .prompt_embeds, .time_ids, .text_embeds, .image_embeds.
# e.g. for CFG, we prepare two batches: one for cond, one for uncond
# for the first batch, guider_state_batch.prompt_embeds corresponds to block_state.prompt_embeds
# for the second batch, guider_state_batch.prompt_embeds corresponds to block_state.negative_prompt_embeds
guider_state = components.guider.prepare_inputs(block_state, guider_input_fields)
# run the denoiser for each guidance batch
for guider_state_batch in guider_state:
components.guider.prepare_models(components.unet)
cond_kwargs = guider_state_batch.as_dict()
cond_kwargs = {k: v for k, v in cond_kwargs.items() if k in guider_input_fields}
prompt_embeds = cond_kwargs.pop("prompt_embeds")
# Predict the noise residual
# store the noise_pred in guider_state_batch so that we can apply guidance across all batches
guider_state_batch.noise_pred = components.unet(
block_state.scaled_latents,
t,
encoder_hidden_states=prompt_embeds,
timestep_cond=block_state.timestep_cond,
cross_attention_kwargs=block_state.cross_attention_kwargs,
added_cond_kwargs=cond_kwargs,
return_dict=False,
)[0]
components.guider.cleanup_models(components.unet)
# Perform guidance
block_state.noise_pred, block_state.scheduler_step_kwargs = components.guider(guider_state)
return components, block_state
# loop step (2): denoise the latents with guidance (with controlnet)
class StableDiffusionXLControlNetLoopDenoiser(PipelineBlock):
model_name = "stable-diffusion-xl"
@property
def expected_components(self) -> List[ComponentSpec]:
return [
ComponentSpec(
"guider",
ClassifierFreeGuidance,
config=FrozenDict({"guidance_scale": 7.5}),
default_creation_method="from_config",
),
ComponentSpec("unet", UNet2DConditionModel),
ComponentSpec("controlnet", ControlNetModel),
]
@property
def description(self) -> str:
return (
"Step within the denoising loop that denoises the latents with guidance (with controlnet). "
"This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
"object (e.g. `StableDiffusionXLDenoiseLoopWrapper`)"
)
@property
def inputs(self) -> List[Tuple[str, Any]]:
return [
InputParam("cross_attention_kwargs"),
]
@property
def intermediate_inputs(self) -> List[str]:
return [
InputParam(
"controlnet_cond",
required=True,
type_hint=torch.Tensor,
description="The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step.",
),
InputParam(
"conditioning_scale",
type_hint=float,
description="The controlnet conditioning scale value to use for the denoising process. Can be generated in prepare_controlnet_inputs step.",
),
InputParam(
"guess_mode",
required=True,
type_hint=bool,
description="The guess mode value to use for the denoising process. Can be generated in prepare_controlnet_inputs step.",
),
InputParam(
"controlnet_keep",
required=True,
type_hint=List[float],
description="The controlnet keep values to use for the denoising process. Can be generated in prepare_controlnet_inputs step.",
),
InputParam(
"timestep_cond",
type_hint=Optional[torch.Tensor],
description="The guidance scale embedding to use for Latent Consistency Models (LCMs). Can be generated in prepare_additional_conditioning step.",
),
InputParam(
"num_inference_steps",
required=True,
type_hint=int,
description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
),
InputParam(
kwargs_type="guider_input_fields",
description=(
"All conditional model inputs that need to be prepared with the guider. "
"It should contain prompt_embeds/negative_prompt_embeds, "
"add_time_ids/negative_add_time_ids, "
"pooled_prompt_embeds/negative_pooled_prompt_embeds, "
"and ip_adapter_embeds/negative_ip_adapter_embeds (optional). "
"Please add `kwargs_type=guider_input_fields` to their parameter spec (`OutputParam`) when they are created and added to the pipeline state."
),
),
InputParam(
kwargs_type="controlnet_kwargs",
description=(
"additional kwargs for controlnet (e.g. control_type_idx and control_type from the controlnet union input step). "
"Please add `kwargs_type=controlnet_kwargs` to their parameter spec (`OutputParam`) when they are created and added to the pipeline state."
),
),
]
@staticmethod
def prepare_extra_kwargs(func, exclude_kwargs=[], **kwargs):
accepted_kwargs = set(inspect.signature(func).parameters.keys())
extra_kwargs = {}
for key, value in kwargs.items():
if key in accepted_kwargs and key not in exclude_kwargs:
extra_kwargs[key] = value
return extra_kwargs
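# e.g. prepare_extra_kwargs(components.controlnet.forward, **block_state.controlnet_kwargs)
# keeps only the entries of `controlnet_kwargs` that this controlnet's forward() signature
# accepts, so union-specific kwargs such as `control_type_idx` are dropped when a plain
# `ControlNetModel` is used.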
@torch.no_grad()
def __call__(self, components: StableDiffusionXLModularPipeline, block_state: BlockState, i: int, t: int):
extra_controlnet_kwargs = self.prepare_extra_kwargs(
components.controlnet.forward, **block_state.controlnet_kwargs
)
# Map the keys we'll see on each `guider_state_batch` (e.g. guider_state_batch.prompt_embeds)
# to the corresponding (cond, uncond) fields on block_state. (e.g. block_state.prompt_embeds, block_state.negative_prompt_embeds)
guider_input_fields = {
"prompt_embeds": ("prompt_embeds", "negative_prompt_embeds"),
"time_ids": ("add_time_ids", "negative_add_time_ids"),
"text_embeds": ("pooled_prompt_embeds", "negative_pooled_prompt_embeds"),
"image_embeds": ("ip_adapter_embeds", "negative_ip_adapter_embeds"),
}
# cond_scale for the timestep (controlnet input)
if isinstance(block_state.controlnet_keep[i], list):
block_state.cond_scale = [
c * s for c, s in zip(block_state.conditioning_scale, block_state.controlnet_keep[i])
]
else:
controlnet_cond_scale = block_state.conditioning_scale
if isinstance(controlnet_cond_scale, list):
controlnet_cond_scale = controlnet_cond_scale[0]
block_state.cond_scale = controlnet_cond_scale * block_state.controlnet_keep[i]
# default controlnet output/unet input for guess mode + conditional path
block_state.down_block_res_samples_zeros = None
block_state.mid_block_res_sample_zeros = None
# guided denoiser step
components.guider.set_state(step=i, num_inference_steps=block_state.num_inference_steps, timestep=t)
# Prepare mini-batches according to guidance method and `guider_input_fields`
# Each guider_state_batch will have .prompt_embeds, .time_ids, .text_embeds, .image_embeds.
# e.g. for CFG, we prepare two batches: one for cond, one for uncond
# for the first batch, guider_state_batch.prompt_embeds corresponds to block_state.prompt_embeds
# for the second batch, guider_state_batch.prompt_embeds corresponds to block_state.negative_prompt_embeds
guider_state = components.guider.prepare_inputs(block_state, guider_input_fields)
# run the denoiser for each guidance batch
for guider_state_batch in guider_state:
components.guider.prepare_models(components.unet)
# Prepare additional conditionings
added_cond_kwargs = {
"text_embeds": guider_state_batch.text_embeds,
"time_ids": guider_state_batch.time_ids,
}
if hasattr(guider_state_batch, "image_embeds") and guider_state_batch.image_embeds is not None:
added_cond_kwargs["image_embeds"] = guider_state_batch.image_embeds
# Prepare controlnet additional conditionings
controlnet_added_cond_kwargs = {
"text_embeds": guider_state_batch.text_embeds,
"time_ids": guider_state_batch.time_ids,
}
# run controlnet for the guidance batch
if block_state.guess_mode and not components.guider.is_conditional:
# the guider runs the conditional batch first, so these zero tensors have already been set
down_block_res_samples = block_state.down_block_res_samples_zeros
mid_block_res_sample = block_state.mid_block_res_sample_zeros
else:
down_block_res_samples, mid_block_res_sample = components.controlnet(
block_state.scaled_latents,
t,
encoder_hidden_states=guider_state_batch.prompt_embeds,
controlnet_cond=block_state.controlnet_cond,
conditioning_scale=block_state.cond_scale,
guess_mode=block_state.guess_mode,
added_cond_kwargs=controlnet_added_cond_kwargs,
return_dict=False,
**extra_controlnet_kwargs,
)
# assign it to block_state so it will be available for the uncond guidance batch
if block_state.down_block_res_samples_zeros is None:
block_state.down_block_res_samples_zeros = [torch.zeros_like(d) for d in down_block_res_samples]
if block_state.mid_block_res_sample_zeros is None:
block_state.mid_block_res_sample_zeros = torch.zeros_like(mid_block_res_sample)
# Predict the noise
# store the noise_pred in guider_state_batch so we can apply guidance across all batches
guider_state_batch.noise_pred = components.unet(
block_state.scaled_latents,
t,
encoder_hidden_states=guider_state_batch.prompt_embeds,
timestep_cond=block_state.timestep_cond,
cross_attention_kwargs=block_state.cross_attention_kwargs,
added_cond_kwargs=added_cond_kwargs,
down_block_additional_residuals=down_block_res_samples,
mid_block_additional_residual=mid_block_res_sample,
return_dict=False,
)[0]
components.guider.cleanup_models(components.unet)
# Perform guidance
block_state.noise_pred, block_state.scheduler_step_kwargs = components.guider(guider_state)
return components, block_state
# loop step (3): scheduler step to update latents
class StableDiffusionXLLoopAfterDenoiser(PipelineBlock):
model_name = "stable-diffusion-xl"
@property
def expected_components(self) -> List[ComponentSpec]:
return [
ComponentSpec("scheduler", EulerDiscreteScheduler),
]
@property
def description(self) -> str:
return (
"Step within the denoising loop that updates the latents. "
"This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
"object (e.g. `StableDiffusionXLDenoiseLoopWrapper`)"
)
@property
def inputs(self) -> List[Tuple[str, Any]]:
return [
InputParam("eta", default=0.0),
]
@property
def intermediate_inputs(self) -> List[str]:
return [
InputParam("generator"),
]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [OutputParam("latents", type_hint=torch.Tensor, description="The denoised latents")]
# YiYi TODO: move this out of here
@staticmethod
def prepare_extra_kwargs(func, exclude_kwargs=[], **kwargs):
accepted_kwargs = set(inspect.signature(func).parameters.keys())
extra_kwargs = {}
for key, value in kwargs.items():
if key in accepted_kwargs and key not in exclude_kwargs:
extra_kwargs[key] = value
return extra_kwargs
@torch.no_grad()
def __call__(self, components: StableDiffusionXLModularPipeline, block_state: BlockState, i: int, t: int):
# Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
block_state.extra_step_kwargs = self.prepare_extra_kwargs(
components.scheduler.step, generator=block_state.generator, eta=block_state.eta
)
# Perform scheduler step using the predicted output
block_state.latents_dtype = block_state.latents.dtype
block_state.latents = components.scheduler.step(
block_state.noise_pred,
t,
block_state.latents,
**block_state.extra_step_kwargs,
**block_state.scheduler_step_kwargs,
return_dict=False,
)[0]
if block_state.latents.dtype != block_state.latents_dtype:
if torch.backends.mps.is_available():
# some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
block_state.latents = block_state.latents.to(block_state.latents_dtype)
return components, block_state
# loop step (3): scheduler step to update latents (with inpainting)
class StableDiffusionXLInpaintLoopAfterDenoiser(PipelineBlock):
model_name = "stable-diffusion-xl"
@property
def expected_components(self) -> List[ComponentSpec]:
return [
ComponentSpec("scheduler", EulerDiscreteScheduler),
ComponentSpec("unet", UNet2DConditionModel),
]
@property
def description(self) -> str:
return (
"Step within the denoising loop that updates the latents (for inpainting workflow only). "
"This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
"object (e.g. `StableDiffusionXLDenoiseLoopWrapper`)"
)
@property
def inputs(self) -> List[Tuple[str, Any]]:
return [
InputParam("eta", default=0.0),
]
@property
def intermediate_inputs(self) -> List[str]:
return [
InputParam("generator"),
InputParam(
"timesteps",
required=True,
type_hint=torch.Tensor,
description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",
),
InputParam(
"mask",
type_hint=Optional[torch.Tensor],
description="The mask to use for the denoising process, for inpainting task only. Can be generated in vae_encode or prepare_latent step.",
),
InputParam(
"noise",
type_hint=Optional[torch.Tensor],
description="The noise added to the image latents, for inpainting task only. Can be generated in prepare_latent step.",
),
InputParam(
"image_latents",
type_hint=Optional[torch.Tensor],
description="The image latents to use for the denoising process, for inpainting/image-to-image task only. Can be generated in vae_encode or prepare_latent step.",
),
]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [OutputParam("latents", type_hint=torch.Tensor, description="The denoised latents")]
@staticmethod
def prepare_extra_kwargs(func, exclude_kwargs=[], **kwargs):
accepted_kwargs = set(inspect.signature(func).parameters.keys())
extra_kwargs = {}
for key, value in kwargs.items():
if key in accepted_kwargs and key not in exclude_kwargs:
extra_kwargs[key] = value
return extra_kwargs
def check_inputs(self, components, block_state):
if components.num_channels_unet == 4:
if block_state.image_latents is None:
raise ValueError(f"image_latents is required for this step {self.__class__.__name__}")
if block_state.mask is None:
raise ValueError(f"mask is required for this step {self.__class__.__name__}")
if block_state.noise is None:
raise ValueError(f"noise is required for this step {self.__class__.__name__}")
@torch.no_grad()
def __call__(self, components: StableDiffusionXLModularPipeline, block_state: BlockState, i: int, t: int):
self.check_inputs(components, block_state)
# Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
block_state.extra_step_kwargs = self.prepare_extra_kwargs(
components.scheduler.step, generator=block_state.generator, eta=block_state.eta
)
# Perform scheduler step using the predicted output
block_state.latents_dtype = block_state.latents.dtype
block_state.latents = components.scheduler.step(
block_state.noise_pred,
t,
block_state.latents,
**block_state.extra_step_kwargs,
**block_state.scheduler_step_kwargs,
return_dict=False,
)[0]
if block_state.latents.dtype != block_state.latents_dtype:
if torch.backends.mps.is_available():
# some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
block_state.latents = block_state.latents.to(block_state.latents_dtype)
# adjust latent for inpainting
if components.num_channels_unet == 4:
block_state.init_latents_proper = block_state.image_latents
if i < len(block_state.timesteps) - 1:
block_state.noise_timestep = block_state.timesteps[i + 1]
block_state.init_latents_proper = components.scheduler.add_noise(
block_state.init_latents_proper, block_state.noise, torch.tensor([block_state.noise_timestep])
)
block_state.latents = (
1 - block_state.mask
) * block_state.init_latents_proper + block_state.mask * block_state.latents
return components, block_state
# the loop wrapper that iterates over the timesteps
class StableDiffusionXLDenoiseLoopWrapper(LoopSequentialPipelineBlocks):
model_name = "stable-diffusion-xl"
@property
def description(self) -> str:
return (
"Pipeline block that iteratively denoises the latents over `timesteps`. "
"The specific steps within each iteration can be customized with the `sub_blocks` attribute"
)
@property
def loop_expected_components(self) -> List[ComponentSpec]:
return [
ComponentSpec(
"guider",
ClassifierFreeGuidance,
config=FrozenDict({"guidance_scale": 7.5}),
default_creation_method="from_config",
),
ComponentSpec("scheduler", EulerDiscreteScheduler),
ComponentSpec("unet", UNet2DConditionModel),
]
@property
def loop_intermediate_inputs(self) -> List[InputParam]:
return [
InputParam(
"timesteps",
required=True,
type_hint=torch.Tensor,
description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",
),
InputParam(
"num_inference_steps",
required=True,
type_hint=int,
description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
),
]
@torch.no_grad()
def __call__(self, components: StableDiffusionXLModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
block_state.disable_guidance = True if components.unet.config.time_cond_proj_dim is not None else False
if block_state.disable_guidance:
components.guider.disable()
else:
components.guider.enable()
block_state.num_warmup_steps = max(
len(block_state.timesteps) - block_state.num_inference_steps * components.scheduler.order, 0
)
with self.progress_bar(total=block_state.num_inference_steps) as progress_bar:
for i, t in enumerate(block_state.timesteps):
components, block_state = self.loop_step(components, block_state, i=i, t=t)
if i == len(block_state.timesteps) - 1 or (
(i + 1) > block_state.num_warmup_steps and (i + 1) % components.scheduler.order == 0
):
progress_bar.update()
self.set_block_state(state, block_state)
return components, state
# composing the denoising loops
class StableDiffusionXLDenoiseStep(StableDiffusionXLDenoiseLoopWrapper):
block_classes = [
StableDiffusionXLLoopBeforeDenoiser,
StableDiffusionXLLoopDenoiser,
StableDiffusionXLLoopAfterDenoiser,
]
block_names = ["before_denoiser", "denoiser", "after_denoiser"]
@property
def description(self) -> str:
return (
"Denoise step that iteratively denoises the latents. \n"
"Its loop logic is defined in the `StableDiffusionXLDenoiseLoopWrapper.__call__` method. \n"
"At each iteration, it runs blocks defined in `sub_blocks` sequentially:\n"
" - `StableDiffusionXLLoopBeforeDenoiser`\n"
" - `StableDiffusionXLLoopDenoiser`\n"
" - `StableDiffusionXLLoopAfterDenoiser`\n"
"This block supports both text2img and img2img tasks."
)
# control_cond
class StableDiffusionXLControlNetDenoiseStep(StableDiffusionXLDenoiseLoopWrapper):
block_classes = [
StableDiffusionXLLoopBeforeDenoiser,
StableDiffusionXLControlNetLoopDenoiser,
StableDiffusionXLLoopAfterDenoiser,
]
block_names = ["before_denoiser", "denoiser", "after_denoiser"]
@property
def description(self) -> str:
return (
"Denoise step that iteratively denoises the latents with controlnet. \n"
"Its loop logic is defined in the `StableDiffusionXLDenoiseLoopWrapper.__call__` method. \n"
"At each iteration, it runs blocks defined in `sub_blocks` sequentially:\n"
" - `StableDiffusionXLLoopBeforeDenoiser`\n"
" - `StableDiffusionXLControlNetLoopDenoiser`\n"
" - `StableDiffusionXLLoopAfterDenoiser`\n"
"This block supports using controlnet for both text2img and img2img tasks."
)
# mask
class StableDiffusionXLInpaintDenoiseStep(StableDiffusionXLDenoiseLoopWrapper):
block_classes = [
StableDiffusionXLInpaintLoopBeforeDenoiser,
StableDiffusionXLLoopDenoiser,
StableDiffusionXLInpaintLoopAfterDenoiser,
]
block_names = ["before_denoiser", "denoiser", "after_denoiser"]
@property
def description(self) -> str:
return (
"Denoise step that iteratively denoises the latents (for inpainting task only). \n"
"Its loop logic is defined in the `StableDiffusionXLDenoiseLoopWrapper.__call__` method. \n"
"At each iteration, it runs blocks defined in `sub_blocks` sequentially:\n"
" - `StableDiffusionXLInpaintLoopBeforeDenoiser`\n"
" - `StableDiffusionXLLoopDenoiser`\n"
" - `StableDiffusionXLInpaintLoopAfterDenoiser`\n"
"This block only supports inpainting tasks."
)
# control_cond + mask
class StableDiffusionXLInpaintControlNetDenoiseStep(StableDiffusionXLDenoiseLoopWrapper):
block_classes = [
StableDiffusionXLInpaintLoopBeforeDenoiser,
StableDiffusionXLControlNetLoopDenoiser,
StableDiffusionXLInpaintLoopAfterDenoiser,
]
block_names = ["before_denoiser", "denoiser", "after_denoiser"]
@property
def description(self) -> str:
return (
"Denoise step that iteratively denoises the latents (for inpainting task only) with controlnet. \n"
"Its loop logic is defined in the `StableDiffusionXLDenoiseLoopWrapper.__call__` method. \n"
"At each iteration, it runs blocks defined in `sub_blocks` sequentially:\n"
" - `StableDiffusionXLInpaintLoopBeforeDenoiser`\n"
" - `StableDiffusionXLControlNetLoopDenoiser`\n"
" - `StableDiffusionXLInpaintLoopAfterDenoiser`\n"
"This block only supports using controlnet for inpainting tasks."
)
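# A minimal sketch of the composition pattern used by the step classes above (these class
# names are defined in this file; whether end users subclass the wrapper directly like this
# is an experimental detail that may change):
#
#     class MyControlNetDenoiseStep(StableDiffusionXLDenoiseLoopWrapper):
#         # swap only the middle sub-block to add controlnet conditioning
#         block_classes = [
#             StableDiffusionXLLoopBeforeDenoiser,
#             StableDiffusionXLControlNetLoopDenoiser,
#             StableDiffusionXLLoopAfterDenoiser,
#         ]
#         block_names = ["before_denoiser", "denoiser", "after_denoiser"]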
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Optional, Tuple
import torch
from transformers import (
CLIPImageProcessor,
CLIPTextModel,
CLIPTextModelWithProjection,
CLIPTokenizer,
CLIPVisionModelWithProjection,
)
from ...configuration_utils import FrozenDict
from ...guiders import ClassifierFreeGuidance
from ...image_processor import PipelineImageInput, VaeImageProcessor
from ...loaders import StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
from ...models.lora import adjust_lora_scale_text_encoder
from ...utils import (
USE_PEFT_BACKEND,
logging,
scale_lora_layers,
unscale_lora_layers,
)
from ..modular_pipeline import PipelineBlock, PipelineState
from ..modular_pipeline_utils import ComponentSpec, ConfigSpec, InputParam, OutputParam
from .modular_pipeline import StableDiffusionXLModularPipeline
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
def retrieve_latents(
encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
):
if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
return encoder_output.latent_dist.sample(generator)
elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
return encoder_output.latent_dist.mode()
elif hasattr(encoder_output, "latents"):
return encoder_output.latents
else:
raise AttributeError("Could not access latents of provided encoder_output")
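# e.g. retrieve_latents(components.vae.encode(image), generator=generator) samples from the
# posterior returned by AutoencoderKL.encode; sample_mode="argmax" takes the distribution's
# mode instead, and encoder outputs that already expose `.latents` are returned as-is.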
class StableDiffusionXLIPAdapterStep(PipelineBlock):
model_name = "stable-diffusion-xl"
@property
def description(self) -> str:
return (
"IP Adapter step that prepares ip adapter image embeddings.\n"
"Note that this step only prepares the embeddings - in order for it to work correctly, "
"you need to load ip adapter weights into unet via ModularPipeline.load_ip_adapter() and pipeline.set_ip_adapter_scale().\n"
"See [ModularIPAdapterMixin](https://huggingface.co/docs/diffusers/api/loaders/ip_adapter#diffusers.loaders.ModularIPAdapterMixin)"
" for more details"
)
@property
def expected_components(self) -> List[ComponentSpec]:
return [
ComponentSpec("image_encoder", CLIPVisionModelWithProjection),
ComponentSpec(
"feature_extractor",
CLIPImageProcessor,
config=FrozenDict({"size": 224, "crop_size": 224}),
default_creation_method="from_config",
),
ComponentSpec("unet", UNet2DConditionModel),
ComponentSpec(
"guider",
ClassifierFreeGuidance,
config=FrozenDict({"guidance_scale": 7.5}),
default_creation_method="from_config",
),
]
@property
def inputs(self) -> List[InputParam]:
return [
InputParam(
"ip_adapter_image",
PipelineImageInput,
required=True,
description="The image(s) to be used as ip adapter",
)
]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam("ip_adapter_embeds", type_hint=torch.Tensor, description="IP adapter image embeddings"),
OutputParam(
"negative_ip_adapter_embeds",
type_hint=torch.Tensor,
description="Negative IP adapter image embeddings",
),
]
@staticmethod
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image with self->components
def encode_image(components, image, device, num_images_per_prompt, output_hidden_states=None):
dtype = next(components.image_encoder.parameters()).dtype
if not isinstance(image, torch.Tensor):
image = components.feature_extractor(image, return_tensors="pt").pixel_values
image = image.to(device=device, dtype=dtype)
if output_hidden_states:
image_enc_hidden_states = components.image_encoder(image, output_hidden_states=True).hidden_states[-2]
image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
uncond_image_enc_hidden_states = components.image_encoder(
torch.zeros_like(image), output_hidden_states=True
).hidden_states[-2]
uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
num_images_per_prompt, dim=0
)
return image_enc_hidden_states, uncond_image_enc_hidden_states
else:
image_embeds = components.image_encoder(image).image_embeds
image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
uncond_image_embeds = torch.zeros_like(image_embeds)
return image_embeds, uncond_image_embeds
# modified from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
def prepare_ip_adapter_image_embeds(
self,
components,
ip_adapter_image,
ip_adapter_image_embeds,
device,
num_images_per_prompt,
prepare_unconditional_embeds,
):
image_embeds = []
if prepare_unconditional_embeds:
negative_image_embeds = []
if ip_adapter_image_embeds is None:
if not isinstance(ip_adapter_image, list):
ip_adapter_image = [ip_adapter_image]
if len(ip_adapter_image) != len(components.unet.encoder_hid_proj.image_projection_layers):
raise ValueError(
f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(components.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
)
for single_ip_adapter_image, image_proj_layer in zip(
ip_adapter_image, components.unet.encoder_hid_proj.image_projection_layers
):
output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
single_image_embeds, single_negative_image_embeds = self.encode_image(
components, single_ip_adapter_image, device, 1, output_hidden_state
)
image_embeds.append(single_image_embeds[None, :])
if prepare_unconditional_embeds:
negative_image_embeds.append(single_negative_image_embeds[None, :])
else:
for single_image_embeds in ip_adapter_image_embeds:
if prepare_unconditional_embeds:
single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
negative_image_embeds.append(single_negative_image_embeds)
image_embeds.append(single_image_embeds)
ip_adapter_image_embeds = []
for i, single_image_embeds in enumerate(image_embeds):
single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
if prepare_unconditional_embeds:
single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
single_image_embeds = single_image_embeds.to(device=device)
ip_adapter_image_embeds.append(single_image_embeds)
return ip_adapter_image_embeds
@torch.no_grad()
def __call__(self, components: StableDiffusionXLModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
block_state.prepare_unconditional_embeds = components.guider.num_conditions > 1
block_state.device = components._execution_device
block_state.ip_adapter_embeds = self.prepare_ip_adapter_image_embeds(
components,
ip_adapter_image=block_state.ip_adapter_image,
ip_adapter_image_embeds=None,
device=block_state.device,
num_images_per_prompt=1,
prepare_unconditional_embeds=block_state.prepare_unconditional_embeds,
)
if block_state.prepare_unconditional_embeds:
block_state.negative_ip_adapter_embeds = []
for i, image_embeds in enumerate(block_state.ip_adapter_embeds):
negative_image_embeds, image_embeds = image_embeds.chunk(2)
block_state.negative_ip_adapter_embeds.append(negative_image_embeds)
block_state.ip_adapter_embeds[i] = image_embeds
self.set_block_state(state, block_state)
return components, state
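# Illustrative usage note (experimental API; the repo id and weight name below are the
# standard SDXL IP-Adapter checkpoint, shown only as an example; this block does not
# load weights itself):
#     pipe.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
#     pipe.set_ip_adapter_scale(0.6)
# As stated in the description above, this block only produces ip_adapter_embeds /
# negative_ip_adapter_embeds; the adapter weights must already be loaded into the unet.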
class StableDiffusionXLTextEncoderStep(PipelineBlock):
model_name = "stable-diffusion-xl"
@property
def description(self) -> str:
return "Text Encoder step that generates text embeddings to guide the image generation"
@property
def expected_components(self) -> List[ComponentSpec]:
return [
ComponentSpec("text_encoder", CLIPTextModel),
ComponentSpec("text_encoder_2", CLIPTextModelWithProjection),
ComponentSpec("tokenizer", CLIPTokenizer),
ComponentSpec("tokenizer_2", CLIPTokenizer),
ComponentSpec(
"guider",
ClassifierFreeGuidance,
config=FrozenDict({"guidance_scale": 7.5}),
default_creation_method="from_config",
),
]
@property
def expected_configs(self) -> List[ConfigSpec]:
return [ConfigSpec("force_zeros_for_empty_prompt", True)]
@property
def inputs(self) -> List[InputParam]:
return [
InputParam("prompt"),
InputParam("prompt_2"),
InputParam("negative_prompt"),
InputParam("negative_prompt_2"),
InputParam("cross_attention_kwargs"),
InputParam("clip_skip"),
]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam(
"prompt_embeds",
type_hint=torch.Tensor,
kwargs_type="guider_input_fields",
description="text embeddings used to guide the image generation",
),
OutputParam(
"negative_prompt_embeds",
type_hint=torch.Tensor,
kwargs_type="guider_input_fields",
description="negative text embeddings used to guide the image generation",
),
OutputParam(
"pooled_prompt_embeds",
type_hint=torch.Tensor,
kwargs_type="guider_input_fields",
description="pooled text embeddings used to guide the image generation",
),
OutputParam(
"negative_pooled_prompt_embeds",
type_hint=torch.Tensor,
kwargs_type="guider_input_fields",
description="negative pooled text embeddings used to guide the image generation",
),
]
@staticmethod
def check_inputs(block_state):
if block_state.prompt is not None and (
not isinstance(block_state.prompt, str) and not isinstance(block_state.prompt, list)
):
raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(block_state.prompt)}")
elif block_state.prompt_2 is not None and (
not isinstance(block_state.prompt_2, str) and not isinstance(block_state.prompt_2, list)
):
raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(block_state.prompt_2)}")
@staticmethod
def encode_prompt(
components,
prompt: str,
prompt_2: Optional[str] = None,
device: Optional[torch.device] = None,
num_images_per_prompt: int = 1,
prepare_unconditional_embeds: bool = True,
negative_prompt: Optional[str] = None,
negative_prompt_2: Optional[str] = None,
prompt_embeds: Optional[torch.Tensor] = None,
negative_prompt_embeds: Optional[torch.Tensor] = None,
pooled_prompt_embeds: Optional[torch.Tensor] = None,
negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
lora_scale: Optional[float] = None,
clip_skip: Optional[int] = None,
):
r"""
Encodes the prompt into text encoder hidden states.
Args:
prompt (`str` or `List[str]`, *optional*):
prompt to be encoded
prompt_2 (`str` or `List[str]`, *optional*):
The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
used in both text-encoders
device: (`torch.device`):
torch device
num_images_per_prompt (`int`):
number of images that should be generated per prompt
prepare_unconditional_embeds (`bool`):
whether to prepare unconditional embeddings or not
negative_prompt (`str` or `List[str]`, *optional*):
The prompt or prompts not to guide the image generation. If not defined, one has to pass
`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
less than `1`).
negative_prompt_2 (`str` or `List[str]`, *optional*):
The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
`text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
provided, text embeddings will be generated from `prompt` input argument.
negative_prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument.
pooled_prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
If not provided, pooled text embeddings will be generated from `prompt` input argument.
negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
input argument.
lora_scale (`float`, *optional*):
A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
"""
device = device or components._execution_device
# set lora scale so that monkey patched LoRA
# function of text encoder can correctly access it
if lora_scale is not None and isinstance(components, StableDiffusionXLLoraLoaderMixin):
components._lora_scale = lora_scale
# dynamically adjust the LoRA scale
if components.text_encoder is not None:
if not USE_PEFT_BACKEND:
adjust_lora_scale_text_encoder(components.text_encoder, lora_scale)
else:
scale_lora_layers(components.text_encoder, lora_scale)
if components.text_encoder_2 is not None:
if not USE_PEFT_BACKEND:
adjust_lora_scale_text_encoder(components.text_encoder_2, lora_scale)
else:
scale_lora_layers(components.text_encoder_2, lora_scale)
prompt = [prompt] if isinstance(prompt, str) else prompt
if prompt is not None:
batch_size = len(prompt)
else:
batch_size = prompt_embeds.shape[0]
# Define tokenizers and text encoders
tokenizers = (
[components.tokenizer, components.tokenizer_2]
if components.tokenizer is not None
else [components.tokenizer_2]
)
text_encoders = (
[components.text_encoder, components.text_encoder_2]
if components.text_encoder is not None
else [components.text_encoder_2]
)
if prompt_embeds is None:
prompt_2 = prompt_2 or prompt
prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
# textual inversion: process multi-vector tokens if necessary
prompt_embeds_list = []
prompts = [prompt, prompt_2]
for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders):
if isinstance(components, TextualInversionLoaderMixin):
prompt = components.maybe_convert_prompt(prompt, tokenizer)
text_inputs = tokenizer(
prompt,
padding="max_length",
max_length=tokenizer.model_max_length,
truncation=True,
return_tensors="pt",
)
text_input_ids = text_inputs.input_ids
untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
text_input_ids, untruncated_ids
):
removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
f" {tokenizer.model_max_length} tokens: {removed_text}"
)
prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
# We are always interested only in the pooled output of the final text encoder
pooled_prompt_embeds = prompt_embeds[0]
if clip_skip is None:
prompt_embeds = prompt_embeds.hidden_states[-2]
else:
# "2" because SDXL always indexes from the penultimate layer.
prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
prompt_embeds_list.append(prompt_embeds)
prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
# get unconditional embeddings for classifier free guidance
zero_out_negative_prompt = negative_prompt is None and components.config.force_zeros_for_empty_prompt
if prepare_unconditional_embeds and negative_prompt_embeds is None and zero_out_negative_prompt:
negative_prompt_embeds = torch.zeros_like(prompt_embeds)
negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
elif prepare_unconditional_embeds and negative_prompt_embeds is None:
negative_prompt = negative_prompt or ""
negative_prompt_2 = negative_prompt_2 or negative_prompt
# normalize str to list
negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
negative_prompt_2 = (
batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
)
uncond_tokens: List[str]
if prompt is not None and type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type as `prompt`, but got {type(negative_prompt)} !="
f" {type(prompt)}."
)
elif batch_size != len(negative_prompt):
raise ValueError(
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
" the batch size of `prompt`."
)
else:
uncond_tokens = [negative_prompt, negative_prompt_2]
negative_prompt_embeds_list = []
for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders):
if isinstance(components, TextualInversionLoaderMixin):
negative_prompt = components.maybe_convert_prompt(negative_prompt, tokenizer)
max_length = prompt_embeds.shape[1]
uncond_input = tokenizer(
negative_prompt,
padding="max_length",
max_length=max_length,
truncation=True,
return_tensors="pt",
)
negative_prompt_embeds = text_encoder(
uncond_input.input_ids.to(device),
output_hidden_states=True,
)
# We are always interested only in the pooled output of the final text encoder
negative_pooled_prompt_embeds = negative_prompt_embeds[0]
negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
negative_prompt_embeds_list.append(negative_prompt_embeds)
negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)
if components.text_encoder_2 is not None:
prompt_embeds = prompt_embeds.to(dtype=components.text_encoder_2.dtype, device=device)
else:
prompt_embeds = prompt_embeds.to(dtype=components.unet.dtype, device=device)
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
if prepare_unconditional_embeds:
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len = negative_prompt_embeds.shape[1]
if components.text_encoder_2 is not None:
negative_prompt_embeds = negative_prompt_embeds.to(
dtype=components.text_encoder_2.dtype, device=device
)
else:
negative_prompt_embeds = negative_prompt_embeds.to(dtype=components.unet.dtype, device=device)
negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
bs_embed * num_images_per_prompt, -1
)
if prepare_unconditional_embeds:
negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
bs_embed * num_images_per_prompt, -1
)
if components.text_encoder is not None:
if isinstance(components, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
# Retrieve the original scale by scaling back the LoRA layers
unscale_lora_layers(components.text_encoder, lora_scale)
if components.text_encoder_2 is not None:
if isinstance(components, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
# Retrieve the original scale by scaling back the LoRA layers
unscale_lora_layers(components.text_encoder_2, lora_scale)
return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
@torch.no_grad()
def __call__(self, components: StableDiffusionXLModularPipeline, state: PipelineState) -> PipelineState:
# Get inputs and intermediates
block_state = self.get_block_state(state)
self.check_inputs(block_state)
block_state.prepare_unconditional_embeds = components.guider.num_conditions > 1
block_state.device = components._execution_device
# Encode input prompt
block_state.text_encoder_lora_scale = (
block_state.cross_attention_kwargs.get("scale", None)
if block_state.cross_attention_kwargs is not None
else None
)
(
block_state.prompt_embeds,
block_state.negative_prompt_embeds,
block_state.pooled_prompt_embeds,
block_state.negative_pooled_prompt_embeds,
) = self.encode_prompt(
components,
block_state.prompt,
block_state.prompt_2,
block_state.device,
1,
block_state.prepare_unconditional_embeds,
block_state.negative_prompt,
block_state.negative_prompt_2,
prompt_embeds=None,
negative_prompt_embeds=None,
pooled_prompt_embeds=None,
negative_pooled_prompt_embeds=None,
lora_scale=block_state.text_encoder_lora_scale,
clip_skip=block_state.clip_skip,
)
# Add outputs
self.set_block_state(state, block_state)
return components, state
class StableDiffusionXLVaeEncoderStep(PipelineBlock):
model_name = "stable-diffusion-xl"
@property
def description(self) -> str:
return "VAE Encoder step that encodes the input image into a latent representation"
@property
def expected_components(self) -> List[ComponentSpec]:
return [
ComponentSpec("vae", AutoencoderKL),
ComponentSpec(
"image_processor",
VaeImageProcessor,
config=FrozenDict({"vae_scale_factor": 8}),
default_creation_method="from_config",
),
]
@property
def inputs(self) -> List[InputParam]:
return [
InputParam("image", required=True),
InputParam("height"),
InputParam("width"),
]
@property
def intermediate_inputs(self) -> List[InputParam]:
return [
InputParam("generator"),
InputParam("dtype", type_hint=torch.dtype, description="Data type of model tensor inputs"),
InputParam(
"preprocess_kwargs",
type_hint=Optional[dict],
description="A kwargs dictionary that if specified is passed along to the `ImageProcessor` as defined under `self.image_processor` in [diffusers.image_processor.VaeImageProcessor]",
),
]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam(
"image_latents",
type_hint=torch.Tensor,
description="The latents representing the reference image for image-to-image/inpainting generation",
)
]
# Modified from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_inpaint.StableDiffusionXLInpaintPipeline._encode_vae_image with self -> components
    # YiYi TODO: update the _encode_vae_image so that we can use # Copied from
def _encode_vae_image(self, components, image: torch.Tensor, generator: torch.Generator):
latents_mean = latents_std = None
if hasattr(components.vae.config, "latents_mean") and components.vae.config.latents_mean is not None:
latents_mean = torch.tensor(components.vae.config.latents_mean).view(1, 4, 1, 1)
if hasattr(components.vae.config, "latents_std") and components.vae.config.latents_std is not None:
latents_std = torch.tensor(components.vae.config.latents_std).view(1, 4, 1, 1)
dtype = image.dtype
if components.vae.config.force_upcast:
image = image.float()
components.vae.to(dtype=torch.float32)
if isinstance(generator, list):
image_latents = [
retrieve_latents(components.vae.encode(image[i : i + 1]), generator=generator[i])
for i in range(image.shape[0])
]
image_latents = torch.cat(image_latents, dim=0)
else:
image_latents = retrieve_latents(components.vae.encode(image), generator=generator)
if components.vae.config.force_upcast:
components.vae.to(dtype)
image_latents = image_latents.to(dtype)
if latents_mean is not None and latents_std is not None:
latents_mean = latents_mean.to(device=image_latents.device, dtype=dtype)
latents_std = latents_std.to(device=image_latents.device, dtype=dtype)
image_latents = (image_latents - latents_mean) * components.vae.config.scaling_factor / latents_std
else:
image_latents = components.vae.config.scaling_factor * image_latents
return image_latents
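    # Note on the method above: when the VAE config provides `latents_mean`/`latents_std`
    # (e.g. VAEs trained with a shifted latent distribution), the encoded latents are
    # normalized as z = (z - mean) * scaling_factor / std; otherwise only the usual
    # `scaling_factor` multiplication is applied.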
@torch.no_grad()
def __call__(self, components: StableDiffusionXLModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
block_state.preprocess_kwargs = block_state.preprocess_kwargs or {}
block_state.device = components._execution_device
block_state.dtype = block_state.dtype if block_state.dtype is not None else components.vae.dtype
block_state.image = components.image_processor.preprocess(
block_state.image, height=block_state.height, width=block_state.width, **block_state.preprocess_kwargs
)
block_state.image = block_state.image.to(device=block_state.device, dtype=block_state.dtype)
block_state.batch_size = block_state.image.shape[0]
# if generator is a list, make sure the length of it matches the length of images (both should be batch_size)
if isinstance(block_state.generator, list) and len(block_state.generator) != block_state.batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(block_state.generator)}, but requested an effective batch"
f" size of {block_state.batch_size}. Make sure the batch size matches the length of the generators."
)
block_state.image_latents = self._encode_vae_image(
components, image=block_state.image, generator=block_state.generator
)
self.set_block_state(state, block_state)
return components, state
class StableDiffusionXLInpaintVaeEncoderStep(PipelineBlock):
model_name = "stable-diffusion-xl"
@property
def expected_components(self) -> List[ComponentSpec]:
return [
ComponentSpec("vae", AutoencoderKL),
ComponentSpec(
"image_processor",
VaeImageProcessor,
config=FrozenDict({"vae_scale_factor": 8}),
default_creation_method="from_config",
),
ComponentSpec(
"mask_processor",
VaeImageProcessor,
config=FrozenDict(
{"do_normalize": False, "vae_scale_factor": 8, "do_binarize": True, "do_convert_grayscale": True}
),
default_creation_method="from_config",
),
]
@property
def description(self) -> str:
return "Vae encoder step that prepares the image and mask for the inpainting process"
@property
def inputs(self) -> List[InputParam]:
return [
InputParam("height"),
InputParam("width"),
InputParam("image", required=True),
InputParam("mask_image", required=True),
InputParam("padding_mask_crop"),
]
@property
def intermediate_inputs(self) -> List[InputParam]:
return [
InputParam("dtype", type_hint=torch.dtype, description="The dtype of the model inputs"),
InputParam("generator"),
]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam(
"image_latents", type_hint=torch.Tensor, description="The latents representation of the input image"
),
OutputParam("mask", type_hint=torch.Tensor, description="The mask to use for the inpainting process"),
OutputParam(
"masked_image_latents",
type_hint=torch.Tensor,
description="The masked image latents to use for the inpainting process (only for inpainting-specifid unet)",
),
OutputParam(
"crops_coords",
type_hint=Optional[Tuple[int, int]],
description="The crop coordinates to use for the preprocess/postprocess of the image and mask",
),
]
# Modified from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_inpaint.StableDiffusionXLInpaintPipeline._encode_vae_image with self -> components
    # YiYi TODO: update the _encode_vae_image so that we can use # Copied from
def _encode_vae_image(self, components, image: torch.Tensor, generator: torch.Generator):
latents_mean = latents_std = None
if hasattr(components.vae.config, "latents_mean") and components.vae.config.latents_mean is not None:
latents_mean = torch.tensor(components.vae.config.latents_mean).view(1, 4, 1, 1)
if hasattr(components.vae.config, "latents_std") and components.vae.config.latents_std is not None:
latents_std = torch.tensor(components.vae.config.latents_std).view(1, 4, 1, 1)
dtype = image.dtype
if components.vae.config.force_upcast:
image = image.float()
components.vae.to(dtype=torch.float32)
if isinstance(generator, list):
image_latents = [
retrieve_latents(components.vae.encode(image[i : i + 1]), generator=generator[i])
for i in range(image.shape[0])
]
image_latents = torch.cat(image_latents, dim=0)
else:
image_latents = retrieve_latents(components.vae.encode(image), generator=generator)
if components.vae.config.force_upcast:
components.vae.to(dtype)
image_latents = image_latents.to(dtype)
if latents_mean is not None and latents_std is not None:
latents_mean = latents_mean.to(device=image_latents.device, dtype=dtype)
latents_std = latents_std.to(device=image_latents.device, dtype=dtype)
            image_latents = (image_latents - latents_mean) * components.vae.config.scaling_factor / latents_std
else:
image_latents = components.vae.config.scaling_factor * image_latents
return image_latents
# modified from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_inpaint.StableDiffusionXLInpaintPipeline.prepare_mask_latents
# do not accept do_classifier_free_guidance
def prepare_mask_latents(
self, components, mask, masked_image, batch_size, height, width, dtype, device, generator
):
# resize the mask to latents shape as we concatenate the mask to the latents
# we do that before converting to dtype to avoid breaking in case we're using cpu_offload
# and half precision
mask = torch.nn.functional.interpolate(
mask, size=(height // components.vae_scale_factor, width // components.vae_scale_factor)
)
mask = mask.to(device=device, dtype=dtype)
# duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
if mask.shape[0] < batch_size:
if not batch_size % mask.shape[0] == 0:
raise ValueError(
"The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
" of masks that you pass is divisible by the total requested batch size."
)
mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)
if masked_image is not None and masked_image.shape[1] == 4:
masked_image_latents = masked_image
else:
masked_image_latents = None
if masked_image is not None:
if masked_image_latents is None:
masked_image = masked_image.to(device=device, dtype=dtype)
masked_image_latents = self._encode_vae_image(components, masked_image, generator=generator)
if masked_image_latents.shape[0] < batch_size:
if not batch_size % masked_image_latents.shape[0] == 0:
raise ValueError(
"The passed images and the required batch size don't match. Images are supposed to be duplicated"
f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
" Make sure the number of images that you pass is divisible by the total requested batch size."
)
masked_image_latents = masked_image_latents.repeat(
batch_size // masked_image_latents.shape[0], 1, 1, 1
)
# aligning device to prevent device errors when concating it with the latent model input
masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
return mask, masked_image_latents
@torch.no_grad()
def __call__(self, components: StableDiffusionXLModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
block_state.dtype = block_state.dtype if block_state.dtype is not None else components.vae.dtype
block_state.device = components._execution_device
if block_state.height is None:
block_state.height = components.default_height
if block_state.width is None:
block_state.width = components.default_width
if block_state.padding_mask_crop is not None:
block_state.crops_coords = components.mask_processor.get_crop_region(
block_state.mask_image, block_state.width, block_state.height, pad=block_state.padding_mask_crop
)
block_state.resize_mode = "fill"
else:
block_state.crops_coords = None
block_state.resize_mode = "default"
block_state.image = components.image_processor.preprocess(
block_state.image,
height=block_state.height,
width=block_state.width,
crops_coords=block_state.crops_coords,
resize_mode=block_state.resize_mode,
)
block_state.image = block_state.image.to(dtype=torch.float32)
block_state.mask = components.mask_processor.preprocess(
block_state.mask_image,
height=block_state.height,
width=block_state.width,
resize_mode=block_state.resize_mode,
crops_coords=block_state.crops_coords,
)
block_state.masked_image = block_state.image * (block_state.mask < 0.5)
block_state.batch_size = block_state.image.shape[0]
block_state.image = block_state.image.to(device=block_state.device, dtype=block_state.dtype)
block_state.image_latents = self._encode_vae_image(
components, image=block_state.image, generator=block_state.generator
)
# 7. Prepare mask latent variables
block_state.mask, block_state.masked_image_latents = self.prepare_mask_latents(
components,
block_state.mask,
block_state.masked_image,
block_state.batch_size,
block_state.height,
block_state.width,
block_state.dtype,
block_state.device,
block_state.generator,
)
self.set_block_state(state, block_state)
return components, state
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ...utils import logging
from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks
from ..modular_pipeline_utils import InsertableDict
from .before_denoise import (
StableDiffusionXLControlNetInputStep,
StableDiffusionXLControlNetUnionInputStep,
StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep,
StableDiffusionXLImg2ImgPrepareLatentsStep,
StableDiffusionXLImg2ImgSetTimestepsStep,
StableDiffusionXLInpaintPrepareLatentsStep,
StableDiffusionXLInputStep,
StableDiffusionXLPrepareAdditionalConditioningStep,
StableDiffusionXLPrepareLatentsStep,
StableDiffusionXLSetTimestepsStep,
)
from .decoders import (
StableDiffusionXLDecodeStep,
StableDiffusionXLInpaintOverlayMaskStep,
)
from .denoise import (
StableDiffusionXLControlNetDenoiseStep,
StableDiffusionXLDenoiseStep,
StableDiffusionXLInpaintControlNetDenoiseStep,
StableDiffusionXLInpaintDenoiseStep,
)
from .encoders import (
StableDiffusionXLInpaintVaeEncoderStep,
StableDiffusionXLIPAdapterStep,
StableDiffusionXLTextEncoderStep,
StableDiffusionXLVaeEncoderStep,
)
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
# auto blocks & sequential blocks & mappings
# vae encoder (run before before_denoise)
class StableDiffusionXLAutoVaeEncoderStep(AutoPipelineBlocks):
block_classes = [StableDiffusionXLInpaintVaeEncoderStep, StableDiffusionXLVaeEncoderStep]
block_names = ["inpaint", "img2img"]
block_trigger_inputs = ["mask_image", "image"]
@property
def description(self):
return (
"Vae encoder step that encode the image inputs into their latent representations.\n"
+ "This is an auto pipeline block that works for both inpainting and img2img tasks.\n"
+ " - `StableDiffusionXLInpaintVaeEncoderStep` (inpaint) is used when `mask_image` is provided.\n"
+ " - `StableDiffusionXLVaeEncoderStep` (img2img) is used when only `image` is provided."
+ " - if neither `mask_image` nor `image` is provided, step will be skipped."
)
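# The selection behavior of `AutoPipelineBlocks` subclasses like the one above can be
# pictured with the following sketch (an illustration of the assumed contract, not the
# actual implementation): blocks are checked in the order of `block_classes`, and the
# first block whose trigger input is present is run; a trigger of `None` acts as the
# default, and if nothing matches the auto step is skipped.
#
#     def _select_block(block_classes, block_trigger_inputs, provided_inputs):
#         for block_cls, trigger in zip(block_classes, block_trigger_inputs):
#             if trigger is None or provided_inputs.get(trigger) is not None:
#                 return block_cls
#         return None  # nothing matched -> skip this auto step
#
# For `StableDiffusionXLAutoVaeEncoderStep`, passing `mask_image` therefore selects the
# inpaint encoder even when `image` is also given, while `image` alone selects img2img.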
# optional ip-adapter (run before input step)
class StableDiffusionXLAutoIPAdapterStep(AutoPipelineBlocks):
block_classes = [StableDiffusionXLIPAdapterStep]
block_names = ["ip_adapter"]
block_trigger_inputs = ["ip_adapter_image"]
@property
def description(self):
return "Run IP Adapter step if `ip_adapter_image` is provided. This step should be placed before the 'input' step.\n"
# before_denoise: text2img
class StableDiffusionXLBeforeDenoiseStep(SequentialPipelineBlocks):
block_classes = [
StableDiffusionXLInputStep,
StableDiffusionXLSetTimestepsStep,
StableDiffusionXLPrepareLatentsStep,
StableDiffusionXLPrepareAdditionalConditioningStep,
]
block_names = ["input", "set_timesteps", "prepare_latents", "prepare_add_cond"]
@property
def description(self):
return (
"Before denoise step that prepare the inputs for the denoise step.\n"
+ "This is a sequential pipeline blocks:\n"
+ " - `StableDiffusionXLInputStep` is used to adjust the batch size of the model inputs\n"
+ " - `StableDiffusionXLSetTimestepsStep` is used to set the timesteps\n"
+ " - `StableDiffusionXLPrepareLatentsStep` is used to prepare the latents\n"
+ " - `StableDiffusionXLPrepareAdditionalConditioningStep` is used to prepare the additional conditioning\n"
)
# before_denoise: img2img
class StableDiffusionXLImg2ImgBeforeDenoiseStep(SequentialPipelineBlocks):
block_classes = [
StableDiffusionXLInputStep,
StableDiffusionXLImg2ImgSetTimestepsStep,
StableDiffusionXLImg2ImgPrepareLatentsStep,
StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep,
]
block_names = ["input", "set_timesteps", "prepare_latents", "prepare_add_cond"]
@property
def description(self):
return (
"Before denoise step that prepare the inputs for the denoise step for img2img task.\n"
+ "This is a sequential pipeline blocks:\n"
+ " - `StableDiffusionXLInputStep` is used to adjust the batch size of the model inputs\n"
+ " - `StableDiffusionXLImg2ImgSetTimestepsStep` is used to set the timesteps\n"
+ " - `StableDiffusionXLImg2ImgPrepareLatentsStep` is used to prepare the latents\n"
+ " - `StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep` is used to prepare the additional conditioning\n"
)
# before_denoise: inpainting
class StableDiffusionXLInpaintBeforeDenoiseStep(SequentialPipelineBlocks):
block_classes = [
StableDiffusionXLInputStep,
StableDiffusionXLImg2ImgSetTimestepsStep,
StableDiffusionXLInpaintPrepareLatentsStep,
StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep,
]
block_names = ["input", "set_timesteps", "prepare_latents", "prepare_add_cond"]
@property
def description(self):
return (
"Before denoise step that prepare the inputs for the denoise step for inpainting task.\n"
+ "This is a sequential pipeline blocks:\n"
+ " - `StableDiffusionXLInputStep` is used to adjust the batch size of the model inputs\n"
+ " - `StableDiffusionXLImg2ImgSetTimestepsStep` is used to set the timesteps\n"
+ " - `StableDiffusionXLInpaintPrepareLatentsStep` is used to prepare the latents\n"
+ " - `StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep` is used to prepare the additional conditioning\n"
)
# before_denoise: all task (text2img, img2img, inpainting)
class StableDiffusionXLAutoBeforeDenoiseStep(AutoPipelineBlocks):
block_classes = [
StableDiffusionXLInpaintBeforeDenoiseStep,
StableDiffusionXLImg2ImgBeforeDenoiseStep,
StableDiffusionXLBeforeDenoiseStep,
]
block_names = ["inpaint", "img2img", "text2img"]
block_trigger_inputs = ["mask", "image_latents", None]
@property
def description(self):
return (
"Before denoise step that prepare the inputs for the denoise step.\n"
+ "This is an auto pipeline block that works for text2img, img2img and inpainting tasks as well as controlnet, controlnet_union.\n"
+ " - `StableDiffusionXLInpaintBeforeDenoiseStep` (inpaint) is used when both `mask` and `image_latents` are provided.\n"
+ " - `StableDiffusionXLImg2ImgBeforeDenoiseStep` (img2img) is used when only `image_latents` is provided.\n"
+ " - `StableDiffusionXLBeforeDenoiseStep` (text2img) is used when both `image_latents` and `mask` are not provided.\n"
)
# optional controlnet input step (after before_denoise, before denoise)
# works for both controlnet and controlnet_union
class StableDiffusionXLAutoControlNetInputStep(AutoPipelineBlocks):
block_classes = [StableDiffusionXLControlNetUnionInputStep, StableDiffusionXLControlNetInputStep]
block_names = ["controlnet_union", "controlnet"]
block_trigger_inputs = ["control_mode", "control_image"]
@property
def description(self):
return (
"Controlnet Input step that prepare the controlnet input.\n"
+ "This is an auto pipeline block that works for both controlnet and controlnet_union.\n"
+ " (it should be called right before the denoise step)"
+ " - `StableDiffusionXLControlNetUnionInputStep` is called to prepare the controlnet input when `control_mode` and `control_image` are provided.\n"
+ " - `StableDiffusionXLControlNetInputStep` is called to prepare the controlnet input when `control_image` is provided."
+ " - if neither `control_mode` nor `control_image` is provided, step will be skipped."
)
# denoise: controlnet (text2img, img2img, inpainting)
class StableDiffusionXLAutoControlNetDenoiseStep(AutoPipelineBlocks):
block_classes = [StableDiffusionXLInpaintControlNetDenoiseStep, StableDiffusionXLControlNetDenoiseStep]
block_names = ["inpaint_controlnet_denoise", "controlnet_denoise"]
block_trigger_inputs = ["mask", "controlnet_cond"]
@property
def description(self) -> str:
return (
"Denoise step that iteratively denoise the latents with controlnet. "
"This is a auto pipeline block that using controlnet for text2img, img2img and inpainting tasks."
"This block should not be used without a controlnet_cond input"
" - `StableDiffusionXLInpaintControlNetDenoiseStep` (inpaint_controlnet_denoise) is used when mask is provided."
" - `StableDiffusionXLControlNetDenoiseStep` (controlnet_denoise) is used when mask is not provided but controlnet_cond is provided."
" - If neither mask nor controlnet_cond are provided, step will be skipped."
)
# denoise: all task with or without controlnet (text2img, img2img, inpainting)
class StableDiffusionXLAutoDenoiseStep(AutoPipelineBlocks):
block_classes = [
StableDiffusionXLAutoControlNetDenoiseStep,
StableDiffusionXLInpaintDenoiseStep,
StableDiffusionXLDenoiseStep,
]
block_names = ["controlnet_denoise", "inpaint_denoise", "denoise"]
block_trigger_inputs = ["controlnet_cond", "mask", None]
@property
def description(self) -> str:
return (
"Denoise step that iteratively denoise the latents. "
"This is a auto pipeline block that works for text2img, img2img and inpainting tasks. And can be used with or without controlnet."
" - `StableDiffusionXLAutoControlNetDenoiseStep` (controlnet_denoise) is used when controlnet_cond is provided (support controlnet withtext2img, img2img and inpainting tasks)."
" - `StableDiffusionXLInpaintDenoiseStep` (inpaint_denoise) is used when mask is provided (support inpainting tasks)."
" - `StableDiffusionXLDenoiseStep` (denoise) is used when neither mask nor controlnet_cond are provided (support text2img and img2img tasks)."
)
# decode: inpaint
class StableDiffusionXLInpaintDecodeStep(SequentialPipelineBlocks):
block_classes = [StableDiffusionXLDecodeStep, StableDiffusionXLInpaintOverlayMaskStep]
block_names = ["decode", "mask_overlay"]
@property
def description(self):
return (
"Inpaint decode step that decode the denoised latents into images outputs.\n"
+ "This is a sequential pipeline blocks:\n"
+ " - `StableDiffusionXLDecodeStep` is used to decode the denoised latents into images\n"
+ " - `StableDiffusionXLInpaintOverlayMaskStep` is used to overlay the mask on the image"
)
# decode: all task (text2img, img2img, inpainting)
class StableDiffusionXLAutoDecodeStep(AutoPipelineBlocks):
block_classes = [StableDiffusionXLInpaintDecodeStep, StableDiffusionXLDecodeStep]
block_names = ["inpaint", "non-inpaint"]
block_trigger_inputs = ["padding_mask_crop", None]
@property
def description(self):
return (
"Decode step that decode the denoised latents into images outputs.\n"
+ "This is an auto pipeline block that works for inpainting and non-inpainting tasks.\n"
+ " - `StableDiffusionXLInpaintDecodeStep` (inpaint) is used when `padding_mask_crop` is provided.\n"
+ " - `StableDiffusionXLDecodeStep` (non-inpaint) is used when `padding_mask_crop` is not provided."
)
# ip-adapter, controlnet, text2img, img2img, inpainting
class StableDiffusionXLAutoBlocks(SequentialPipelineBlocks):
block_classes = [
StableDiffusionXLTextEncoderStep,
StableDiffusionXLAutoIPAdapterStep,
StableDiffusionXLAutoVaeEncoderStep,
StableDiffusionXLAutoBeforeDenoiseStep,
StableDiffusionXLAutoControlNetInputStep,
StableDiffusionXLAutoDenoiseStep,
StableDiffusionXLAutoDecodeStep,
]
block_names = [
"text_encoder",
"ip_adapter",
"image_encoder",
"before_denoise",
"controlnet_input",
"denoise",
"decoder",
]
@property
def description(self):
return (
"Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using Stable Diffusion XL.\n"
+ "- for image-to-image generation, you need to provide either `image` or `image_latents`\n"
+ "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` \n"
+ "- to run the controlnet workflow, you need to provide `control_image`\n"
+ "- to run the controlnet_union workflow, you need to provide `control_image` and `control_mode`\n"
+ "- to run the ip_adapter workflow, you need to provide `ip_adapter_image`\n"
+ "- for text-to-image generation, all you need to provide is `prompt`"
)
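# A rough usage sketch of the auto workflow described above (illustrative only; the
# loading helpers `init_pipeline` and `load_default_components` are assumptions about
# the ModularPipeline API, not guaranteed names, and `init_image` is a PIL image loaded
# by the caller):
#
#     auto_blocks = StableDiffusionXLAutoBlocks()
#     pipe = auto_blocks.init_pipeline("stabilityai/stable-diffusion-xl-base-1.0")
#     pipe.load_default_components(torch_dtype=torch.float16)
#
#     # text2img: only `prompt`
#     out = pipe(prompt="an astronaut riding a horse", output_type="pil")
#     # img2img: additionally pass `image`; inpainting: pass `image` + `mask_image`
#     out = pipe(prompt="a cyberpunk city", image=init_image, strength=0.6)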
# controlnet (input + denoise step)
class StableDiffusionXLAutoControlnetStep(SequentialPipelineBlocks):
block_classes = [
StableDiffusionXLAutoControlNetInputStep,
StableDiffusionXLAutoControlNetDenoiseStep,
]
block_names = ["controlnet_input", "controlnet_denoise"]
@property
def description(self):
return (
"Controlnet auto step that prepare the controlnet input and denoise the latents. "
+ "It works for both controlnet and controlnet_union and supports text2img, img2img and inpainting tasks."
+ " (it should be replace at 'denoise' step)"
)
TEXT2IMAGE_BLOCKS = InsertableDict(
[
("text_encoder", StableDiffusionXLTextEncoderStep),
("input", StableDiffusionXLInputStep),
("set_timesteps", StableDiffusionXLSetTimestepsStep),
("prepare_latents", StableDiffusionXLPrepareLatentsStep),
("prepare_add_cond", StableDiffusionXLPrepareAdditionalConditioningStep),
("denoise", StableDiffusionXLDenoiseStep),
("decode", StableDiffusionXLDecodeStep),
]
)
IMAGE2IMAGE_BLOCKS = InsertableDict(
[
("text_encoder", StableDiffusionXLTextEncoderStep),
("image_encoder", StableDiffusionXLVaeEncoderStep),
("input", StableDiffusionXLInputStep),
("set_timesteps", StableDiffusionXLImg2ImgSetTimestepsStep),
("prepare_latents", StableDiffusionXLImg2ImgPrepareLatentsStep),
("prepare_add_cond", StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep),
("denoise", StableDiffusionXLDenoiseStep),
("decode", StableDiffusionXLDecodeStep),
]
)
INPAINT_BLOCKS = InsertableDict(
[
("text_encoder", StableDiffusionXLTextEncoderStep),
("image_encoder", StableDiffusionXLInpaintVaeEncoderStep),
("input", StableDiffusionXLInputStep),
("set_timesteps", StableDiffusionXLImg2ImgSetTimestepsStep),
("prepare_latents", StableDiffusionXLInpaintPrepareLatentsStep),
("prepare_add_cond", StableDiffusionXLImg2ImgPrepareAdditionalConditioningStep),
("denoise", StableDiffusionXLInpaintDenoiseStep),
("decode", StableDiffusionXLInpaintDecodeStep),
]
)
CONTROLNET_BLOCKS = InsertableDict(
[
("denoise", StableDiffusionXLAutoControlnetStep),
]
)
IP_ADAPTER_BLOCKS = InsertableDict(
[
("ip_adapter", StableDiffusionXLAutoIPAdapterStep),
]
)
AUTO_BLOCKS = InsertableDict(
[
("text_encoder", StableDiffusionXLTextEncoderStep),
("ip_adapter", StableDiffusionXLAutoIPAdapterStep),
("image_encoder", StableDiffusionXLAutoVaeEncoderStep),
("before_denoise", StableDiffusionXLAutoBeforeDenoiseStep),
("controlnet_input", StableDiffusionXLAutoControlNetInputStep),
("denoise", StableDiffusionXLAutoDenoiseStep),
("decode", StableDiffusionXLAutoDecodeStep),
]
)
ALL_BLOCKS = {
"text2img": TEXT2IMAGE_BLOCKS,
"img2img": IMAGE2IMAGE_BLOCKS,
"inpaint": INPAINT_BLOCKS,
"controlnet": CONTROLNET_BLOCKS,
"ip_adapter": IP_ADAPTER_BLOCKS,
"auto": AUTO_BLOCKS,
}
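# Illustrative sketch of how these presets are meant to be combined (the assembly helper
# `SequentialPipelineBlocks.from_blocks_dict` is an assumption based on the
# modular_pipeline module and may differ from the final API):
#
#     # start from the plain text2img preset and swap in the controlnet denoise step
#     blocks_dict = InsertableDict(TEXT2IMAGE_BLOCKS.items())
#     blocks_dict["denoise"] = StableDiffusionXLAutoControlnetStep
#     blocks = SequentialPipelineBlocks.from_blocks_dict(blocks_dict)
#
# The CONTROLNET_BLOCKS / IP_ADAPTER_BLOCKS presets above only carry the steps that
# differ from a base workflow, which is why each contains a single entry.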
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Optional, Tuple, Union
import numpy as np
import PIL
import torch
from ...image_processor import PipelineImageInput
from ...loaders import ModularIPAdapterMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin
from ...pipelines.pipeline_utils import StableDiffusionMixin
from ...pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
from ...utils import logging
from ..modular_pipeline import ModularPipeline
from ..modular_pipeline_utils import InputParam, OutputParam
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
# YiYi TODO: move to a different file? stable_diffusion_xl_module should have its own folder?
# YiYi Notes: model specific components:
## (1) it should inherit from ModularPipeline
## (2) acts like a container that holds components and configs
## (3) define default config (related to components), e.g. default_sample_size, vae_scale_factor, num_channels_unet, num_channels_latents
## (4) inherit from model-specific loader class (e.g. StableDiffusionXLLoraLoaderMixin)
## (5) how to use together with Components_manager?
class StableDiffusionXLModularPipeline(
ModularPipeline,
StableDiffusionMixin,
TextualInversionLoaderMixin,
StableDiffusionXLLoraLoaderMixin,
ModularIPAdapterMixin,
):
"""
A ModularPipeline for Stable Diffusion XL.
<Tip warning={true}>
This is an experimental feature and is likely to change in the future.
</Tip>
"""
@property
def default_height(self):
return self.default_sample_size * self.vae_scale_factor
@property
def default_width(self):
return self.default_sample_size * self.vae_scale_factor
@property
def default_sample_size(self):
default_sample_size = 128
if hasattr(self, "unet") and self.unet is not None:
default_sample_size = self.unet.config.sample_size
return default_sample_size
@property
def vae_scale_factor(self):
vae_scale_factor = 8
if hasattr(self, "vae") and self.vae is not None:
vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
return vae_scale_factor
@property
def num_channels_unet(self):
num_channels_unet = 4
if hasattr(self, "unet") and self.unet is not None:
num_channels_unet = self.unet.config.in_channels
return num_channels_unet
@property
def num_channels_latents(self):
num_channels_latents = 4
if hasattr(self, "vae") and self.vae is not None:
num_channels_latents = self.vae.config.latent_channels
return num_channels_latents
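    # Worked example of the defaults above for the standard SDXL checkpoints: the UNet is
    # configured with sample_size=128 and the VAE has 4 entries in block_out_channels, so
    # vae_scale_factor = 2 ** (4 - 1) = 8 and default_height = default_width = 128 * 8 = 1024.
    # When no components are loaded yet, the hard-coded fallbacks give the same values.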
# YiYi/Sayak TODO: not used yet, maintain a list of schema that can be used across all pipeline blocks
# auto_docstring
SDXL_INPUTS_SCHEMA = {
"prompt": InputParam(
"prompt", type_hint=Union[str, List[str]], description="The prompt or prompts to guide the image generation"
),
"prompt_2": InputParam(
"prompt_2",
type_hint=Union[str, List[str]],
description="The prompt or prompts to be sent to the tokenizer_2 and text_encoder_2",
),
"negative_prompt": InputParam(
"negative_prompt",
type_hint=Union[str, List[str]],
description="The prompt or prompts not to guide the image generation",
),
"negative_prompt_2": InputParam(
"negative_prompt_2",
type_hint=Union[str, List[str]],
description="The negative prompt or prompts for text_encoder_2",
),
"cross_attention_kwargs": InputParam(
"cross_attention_kwargs",
type_hint=Optional[dict],
description="Kwargs dictionary passed to the AttentionProcessor",
),
"clip_skip": InputParam(
"clip_skip", type_hint=Optional[int], description="Number of layers to skip in CLIP text encoder"
),
"image": InputParam(
"image",
type_hint=PipelineImageInput,
required=True,
description="The image(s) to modify for img2img or inpainting",
),
"mask_image": InputParam(
"mask_image",
type_hint=PipelineImageInput,
required=True,
description="Mask image for inpainting, white pixels will be repainted",
),
"generator": InputParam(
"generator",
type_hint=Optional[Union[torch.Generator, List[torch.Generator]]],
description="Generator(s) for deterministic generation",
),
"height": InputParam("height", type_hint=Optional[int], description="Height in pixels of the generated image"),
"width": InputParam("width", type_hint=Optional[int], description="Width in pixels of the generated image"),
"num_images_per_prompt": InputParam(
"num_images_per_prompt", type_hint=int, default=1, description="Number of images to generate per prompt"
),
"num_inference_steps": InputParam(
"num_inference_steps", type_hint=int, default=50, description="Number of denoising steps"
),
"timesteps": InputParam(
"timesteps", type_hint=Optional[torch.Tensor], description="Custom timesteps for the denoising process"
),
"sigmas": InputParam(
"sigmas", type_hint=Optional[torch.Tensor], description="Custom sigmas for the denoising process"
),
"denoising_end": InputParam(
"denoising_end",
type_hint=Optional[float],
description="Fraction of denoising process to complete before termination",
),
# YiYi Notes: img2img defaults to 0.3, inpainting defaults to 0.9999
"strength": InputParam(
"strength", type_hint=float, default=0.3, description="How much to transform the reference image"
),
"denoising_start": InputParam(
"denoising_start", type_hint=Optional[float], description="Starting point of the denoising process"
),
"latents": InputParam(
"latents", type_hint=Optional[torch.Tensor], description="Pre-generated noisy latents for image generation"
),
"padding_mask_crop": InputParam(
"padding_mask_crop",
type_hint=Optional[Tuple[int, int]],
description="Size of margin in crop for image and mask",
),
"original_size": InputParam(
"original_size",
type_hint=Optional[Tuple[int, int]],
description="Original size of the image for SDXL's micro-conditioning",
),
"target_size": InputParam(
"target_size", type_hint=Optional[Tuple[int, int]], description="Target size for SDXL's micro-conditioning"
),
"negative_original_size": InputParam(
"negative_original_size",
type_hint=Optional[Tuple[int, int]],
description="Negative conditioning based on image resolution",
),
"negative_target_size": InputParam(
"negative_target_size",
type_hint=Optional[Tuple[int, int]],
description="Negative conditioning based on target resolution",
),
"crops_coords_top_left": InputParam(
"crops_coords_top_left",
type_hint=Tuple[int, int],
default=(0, 0),
description="Top-left coordinates for SDXL's micro-conditioning",
),
"negative_crops_coords_top_left": InputParam(
"negative_crops_coords_top_left",
type_hint=Tuple[int, int],
default=(0, 0),
description="Negative conditioning crop coordinates",
),
"aesthetic_score": InputParam(
"aesthetic_score", type_hint=float, default=6.0, description="Simulates aesthetic score of generated image"
),
"negative_aesthetic_score": InputParam(
"negative_aesthetic_score", type_hint=float, default=2.0, description="Simulates negative aesthetic score"
),
"eta": InputParam("eta", type_hint=float, default=0.0, description="Parameter η in the DDIM paper"),
"output_type": InputParam(
"output_type", type_hint=str, default="pil", description="Output format (pil/tensor/np.array)"
),
"ip_adapter_image": InputParam(
"ip_adapter_image",
type_hint=PipelineImageInput,
required=True,
description="Image(s) to be used as IP adapter",
),
"control_image": InputParam(
"control_image", type_hint=PipelineImageInput, required=True, description="ControlNet input condition"
),
"control_guidance_start": InputParam(
"control_guidance_start",
type_hint=Union[float, List[float]],
default=0.0,
description="When ControlNet starts applying",
),
"control_guidance_end": InputParam(
"control_guidance_end",
type_hint=Union[float, List[float]],
default=1.0,
description="When ControlNet stops applying",
),
"controlnet_conditioning_scale": InputParam(
"controlnet_conditioning_scale",
type_hint=Union[float, List[float]],
default=1.0,
description="Scale factor for ControlNet outputs",
),
"guess_mode": InputParam(
"guess_mode",
type_hint=bool,
default=False,
description="Enables ControlNet encoder to recognize input without prompts",
),
"control_mode": InputParam(
"control_mode", type_hint=List[int], required=True, description="Control mode for union controlnet"
),
}
SDXL_INTERMEDIATE_INPUTS_SCHEMA = {
"prompt_embeds": InputParam(
"prompt_embeds",
type_hint=torch.Tensor,
required=True,
description="Text embeddings used to guide image generation",
),
"negative_prompt_embeds": InputParam(
"negative_prompt_embeds", type_hint=torch.Tensor, description="Negative text embeddings"
),
"pooled_prompt_embeds": InputParam(
"pooled_prompt_embeds", type_hint=torch.Tensor, required=True, description="Pooled text embeddings"
),
"negative_pooled_prompt_embeds": InputParam(
"negative_pooled_prompt_embeds", type_hint=torch.Tensor, description="Negative pooled text embeddings"
),
"batch_size": InputParam("batch_size", type_hint=int, required=True, description="Number of prompts"),
"dtype": InputParam("dtype", type_hint=torch.dtype, description="Data type of model tensor inputs"),
"preprocess_kwargs": InputParam(
"preprocess_kwargs", type_hint=Optional[dict], description="Kwargs for ImageProcessor"
),
"latents": InputParam(
"latents", type_hint=torch.Tensor, required=True, description="Initial latents for denoising process"
),
"timesteps": InputParam("timesteps", type_hint=torch.Tensor, required=True, description="Timesteps for inference"),
"num_inference_steps": InputParam(
"num_inference_steps", type_hint=int, required=True, description="Number of denoising steps"
),
"latent_timestep": InputParam(
"latent_timestep", type_hint=torch.Tensor, required=True, description="Initial noise level timestep"
),
"image_latents": InputParam(
"image_latents", type_hint=torch.Tensor, required=True, description="Latents representing reference image"
),
"mask": InputParam("mask", type_hint=torch.Tensor, required=True, description="Mask for inpainting"),
"masked_image_latents": InputParam(
"masked_image_latents", type_hint=torch.Tensor, description="Masked image latents for inpainting"
),
"add_time_ids": InputParam(
"add_time_ids", type_hint=torch.Tensor, required=True, description="Time ids for conditioning"
),
"negative_add_time_ids": InputParam(
"negative_add_time_ids", type_hint=torch.Tensor, description="Negative time ids"
),
"timestep_cond": InputParam("timestep_cond", type_hint=torch.Tensor, description="Timestep conditioning for LCM"),
"noise": InputParam("noise", type_hint=torch.Tensor, description="Noise added to image latents"),
"crops_coords": InputParam("crops_coords", type_hint=Optional[Tuple[int]], description="Crop coordinates"),
"ip_adapter_embeds": InputParam(
"ip_adapter_embeds", type_hint=List[torch.Tensor], description="Image embeddings for IP-Adapter"
),
"negative_ip_adapter_embeds": InputParam(
"negative_ip_adapter_embeds",
type_hint=List[torch.Tensor],
description="Negative image embeddings for IP-Adapter",
),
"images": InputParam(
"images",
type_hint=Union[List[PIL.Image.Image], List[torch.Tensor], List[np.array]],
required=True,
description="Generated images",
),
}
SDXL_INTERMEDIATE_OUTPUTS_SCHEMA = {
"prompt_embeds": OutputParam(
"prompt_embeds", type_hint=torch.Tensor, description="Text embeddings used to guide image generation"
),
"negative_prompt_embeds": OutputParam(
"negative_prompt_embeds", type_hint=torch.Tensor, description="Negative text embeddings"
),
"pooled_prompt_embeds": OutputParam(
"pooled_prompt_embeds", type_hint=torch.Tensor, description="Pooled text embeddings"
),
"negative_pooled_prompt_embeds": OutputParam(
"negative_pooled_prompt_embeds", type_hint=torch.Tensor, description="Negative pooled text embeddings"
),
"batch_size": OutputParam("batch_size", type_hint=int, description="Number of prompts"),
"dtype": OutputParam("dtype", type_hint=torch.dtype, description="Data type of model tensor inputs"),
"image_latents": OutputParam(
"image_latents", type_hint=torch.Tensor, description="Latents representing reference image"
),
"mask": OutputParam("mask", type_hint=torch.Tensor, description="Mask for inpainting"),
"masked_image_latents": OutputParam(
"masked_image_latents", type_hint=torch.Tensor, description="Masked image latents for inpainting"
),
"crops_coords": OutputParam("crops_coords", type_hint=Optional[Tuple[int]], description="Crop coordinates"),
"timesteps": OutputParam("timesteps", type_hint=torch.Tensor, description="Timesteps for inference"),
"num_inference_steps": OutputParam("num_inference_steps", type_hint=int, description="Number of denoising steps"),
"latent_timestep": OutputParam(
"latent_timestep", type_hint=torch.Tensor, description="Initial noise level timestep"
),
"add_time_ids": OutputParam("add_time_ids", type_hint=torch.Tensor, description="Time ids for conditioning"),
"negative_add_time_ids": OutputParam(
"negative_add_time_ids", type_hint=torch.Tensor, description="Negative time ids"
),
"timestep_cond": OutputParam("timestep_cond", type_hint=torch.Tensor, description="Timestep conditioning for LCM"),
"latents": OutputParam("latents", type_hint=torch.Tensor, description="Denoised latents"),
"noise": OutputParam("noise", type_hint=torch.Tensor, description="Noise added to image latents"),
"ip_adapter_embeds": OutputParam(
"ip_adapter_embeds", type_hint=List[torch.Tensor], description="Image embeddings for IP-Adapter"
),
"negative_ip_adapter_embeds": OutputParam(
"negative_ip_adapter_embeds",
type_hint=List[torch.Tensor],
description="Negative image embeddings for IP-Adapter",
),
"images": OutputParam(
"images",
type_hint=Union[List[PIL.Image.Image], List[torch.Tensor], List[np.array]],
description="Generated images",
),
}
SDXL_OUTPUTS_SCHEMA = {
"images": OutputParam(
"images",
type_hint=Union[
Tuple[Union[List[PIL.Image.Image], List[torch.Tensor], List[np.array]]], StableDiffusionXLPipelineOutput
],
description="The final generated images",
)
}
...@@ -248,14 +248,15 @@ def _get_connected_pipeline(pipeline_cls):
        return _get_task_class(AUTO_INPAINT_PIPELINES_MAPPING, pipeline_cls.__name__, throw_error_if_not_exist=False)


def _get_model(pipeline_class_name):
    for task_mapping in SUPPORTED_TASKS_MAPPINGS:
        for model_name, pipeline in task_mapping.items():
            if pipeline.__name__ == pipeline_class_name:
                return model_name


def _get_task_class(mapping, pipeline_class_name, throw_error_if_not_exist: bool = True):
    model_name = _get_model(pipeline_class_name)

    if model_name is not None:
        task_class = mapping.get(model_name, None)
...
...@@ -371,6 +371,22 @@ def maybe_raise_or_warn(
        )
# a simpler version of get_class_obj_and_candidates, it won't work with custom code
def simple_get_class_obj(library_name, class_name):
from diffusers import pipelines
is_pipeline_module = hasattr(pipelines, library_name)
if is_pipeline_module:
pipeline_module = getattr(pipelines, library_name)
class_obj = getattr(pipeline_module, class_name)
else:
library = importlib.import_module(library_name)
class_obj = getattr(library, class_name)
return class_obj
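# Example of the two resolution paths above (illustrative):
#   simple_get_class_obj("stable_diffusion_xl", "StableDiffusionXLPipeline")
#       -> fetched from the diffusers.pipelines.stable_diffusion_xl submodule
#   simple_get_class_obj("transformers", "CLIPTextModel")
#       -> resolved by importing the external `transformers` library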
def get_class_obj_and_candidates(
    library_name, class_name, importable_classes, pipelines, is_pipeline_module, component_name=None, cache_dir=None
):
...@@ -452,7 +468,7 @@ def _get_pipeline_class(
            revision=revision,
        )

        if class_obj.__name__ != "DiffusionPipeline" and class_obj.__name__ != "ModularPipeline":
            return class_obj

        diffusers_module = importlib.import_module(class_obj.__module__.split(".")[0])
...@@ -892,7 +908,10 @@ def _fetch_class_library_tuple(module):
    library = not_compiled_module.__module__

    # retrieve class_name
    if isinstance(not_compiled_module, type):
        class_name = not_compiled_module.__name__
    else:
        class_name = not_compiled_module.__class__.__name__

    return (library, class_name)
...
...@@ -1986,11 +1986,13 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
            f"{'' if k.startswith('_') else '_'}{k}": v for k, v in original_config.items() if k not in pipeline_kwargs
        }

        optional_components = (
            pipeline._optional_components
            if hasattr(pipeline, "_optional_components") and pipeline._optional_components
            else []
        )
        missing_modules = (
            set(expected_modules) - set(optional_components) - set(pipeline_kwargs.keys()) - set(true_optional_modules)
        )

        if len(missing_modules) > 0:
...
...@@ -2,6 +2,126 @@
from ..utils import DummyObject, requires_backends
class AdaptiveProjectedGuidance(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
@classmethod
def from_config(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
class AutoGuidance(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
@classmethod
def from_config(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
class ClassifierFreeGuidance(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
@classmethod
def from_config(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
class ClassifierFreeZeroStarGuidance(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
@classmethod
def from_config(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
class PerturbedAttentionGuidance(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
@classmethod
def from_config(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
class SkipLayerGuidance(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
@classmethod
def from_config(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
class SmoothedEnergyGuidance(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
@classmethod
def from_config(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
class TangentialClassifierFreeGuidance(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
@classmethod
def from_config(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
class FasterCacheConfig(metaclass=DummyObject):
    _backends = ["torch"]
...@@ -47,6 +167,21 @@ class HookRegistry(metaclass=DummyObject):
        requires_backends(cls, ["torch"])
class LayerSkipConfig(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
@classmethod
def from_config(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
class PyramidAttentionBroadcastConfig(metaclass=DummyObject):
    _backends = ["torch"]
...@@ -62,6 +197,21 @@ class PyramidAttentionBroadcastConfig(metaclass=DummyObject):
        requires_backends(cls, ["torch"])
class SmoothedEnergyGuidanceConfig(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
@classmethod
def from_config(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
def apply_faster_cache(*args, **kwargs):
    requires_backends(apply_faster_cache, ["torch"])
...@@ -70,6 +220,10 @@ def apply_first_block_cache(*args, **kwargs):
    requires_backends(apply_first_block_cache, ["torch"])
def apply_layer_skip(*args, **kwargs):
requires_backends(apply_layer_skip, ["torch"])
def apply_pyramid_attention_broadcast(*args, **kwargs):
    requires_backends(apply_pyramid_attention_broadcast, ["torch"])
...@@ -1199,6 +1353,66 @@ class WanVACETransformer3DModel(metaclass=DummyObject):
        requires_backends(cls, ["torch"])
class ComponentsManager(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
@classmethod
def from_config(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
class ComponentSpec(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
@classmethod
def from_config(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
class ModularPipeline(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
@classmethod
def from_config(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
class ModularPipelineBlocks(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
@classmethod
def from_config(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
def get_constant_schedule(*args, **kwargs):
    requires_backends(get_constant_schedule, ["torch"])
...
...@@ -2,6 +2,36 @@
from ..utils import DummyObject, requires_backends
class StableDiffusionXLAutoBlocks(metaclass=DummyObject):
_backends = ["torch", "transformers"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch", "transformers"])
@classmethod
def from_config(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
class StableDiffusionXLModularPipeline(metaclass=DummyObject):
_backends = ["torch", "transformers"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch", "transformers"])
@classmethod
def from_config(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
class AllegroPipeline(metaclass=DummyObject):
    _backends = ["torch", "transformers"]
...
...@@ -20,8 +20,11 @@ import json
import os
import re
import shutil
import signal
import sys
import threading
from pathlib import Path
from types import ModuleType
from typing import Dict, Optional, Union
from urllib import request
...@@ -37,6 +40,8 @@ logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
# See https://huggingface.co/datasets/diffusers/community-pipelines-mirror
COMMUNITY_PIPELINES_MIRROR_ID = "diffusers/community-pipelines-mirror"
TIME_OUT_REMOTE_CODE = int(os.getenv("DIFFUSERS_TIMEOUT_REMOTE_CODE", 15))
_HF_REMOTE_CODE_LOCK = threading.Lock()


def get_diffusers_versions():
...@@ -154,33 +159,87 @@ def check_imports(filename):
    return get_relative_imports(filename)
def _raise_timeout_error(signum, frame):
raise ValueError(
"Loading this model requires you to execute custom code contained in the model repository on your local "
"machine. Please set the option `trust_remote_code=True` to permit loading of this model."
)
def resolve_trust_remote_code(trust_remote_code, model_name, has_remote_code):
if trust_remote_code is None:
if has_remote_code and TIME_OUT_REMOTE_CODE > 0:
prev_sig_handler = None
try:
prev_sig_handler = signal.signal(signal.SIGALRM, _raise_timeout_error)
signal.alarm(TIME_OUT_REMOTE_CODE)
while trust_remote_code is None:
answer = input(
f"The repository for {model_name} contains custom code which must be executed to correctly "
f"load the model. You can inspect the repository content at https://hf.co/{model_name}.\n"
f"You can avoid this prompt in future by passing the argument `trust_remote_code=True`.\n\n"
f"Do you wish to run the custom code? [y/N] "
)
if answer.lower() in ["yes", "y", "1"]:
trust_remote_code = True
elif answer.lower() in ["no", "n", "0", ""]:
trust_remote_code = False
signal.alarm(0)
except Exception:
# OS which does not support signal.SIGALRM
raise ValueError(
f"The repository for {model_name} contains custom code which must be executed to correctly "
f"load the model. You can inspect the repository content at https://hf.co/{model_name}.\n"
f"Please pass the argument `trust_remote_code=True` to allow custom code to be run."
)
finally:
if prev_sig_handler is not None:
signal.signal(signal.SIGALRM, prev_sig_handler)
signal.alarm(0)
elif has_remote_code:
# For the CI which puts the timeout at 0
_raise_timeout_error(None, None)
if has_remote_code and not trust_remote_code:
raise ValueError(
f"Loading {model_name} requires you to execute the configuration file in that"
" repo on your local machine. Make sure you have read the code there to avoid malicious use, then"
" set the option `trust_remote_code=True` to remove this error."
)
return trust_remote_code
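# Behavior summary for `resolve_trust_remote_code` (derived from the logic above):
#   trust_remote_code=True  -> returned unchanged, remote code allowed
#   trust_remote_code=False + remote code present -> ValueError
#   trust_remote_code=None  + remote code present -> interactive [y/N] prompt, guarded by
#                              a SIGALRM timeout (or an immediate error where unsupported)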
def get_class_in_module(class_name, module_path, force_reload=False):
    """
    Import a module on the cache directory for modules and extract a class from it.
    """
    name = os.path.normpath(module_path)
    if name.endswith(".py"):
        name = name[:-3]
    name = name.replace(os.path.sep, ".")
    module_file: Path = Path(HF_MODULES_CACHE) / module_path
    with _HF_REMOTE_CODE_LOCK:
        if force_reload:
            sys.modules.pop(name, None)
            importlib.invalidate_caches()
        cached_module: Optional[ModuleType] = sys.modules.get(name)
        module_spec = importlib.util.spec_from_file_location(name, location=module_file)

        module: ModuleType
        if cached_module is None:
            module = importlib.util.module_from_spec(module_spec)
            # insert it into sys.modules before any loading begins
            sys.modules[name] = module
        else:
            module = cached_module

        module_spec.loader.exec_module(module)

    if class_name is None:
        return find_pipeline_class(module)
    return getattr(module, class_name)
@@ -472,4 +531,4 @@ def get_class_from_dynamic_module(
        revision=revision,
        local_files_only=local_files_only,
    )
    return get_class_in_module(class_name, final_module)
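For orientation, a small sketch of how the rewritten helper resolves a class from a cached module file; the module path and class name below are made-up examples, with the path taken to be relative to HF_MODULES_CACHE and keeping its .py suffix, as the new call site above suggests:

# Hypothetical usage sketch: resolve a custom pipeline class from a module that has already
# been downloaded into HF_MODULES_CACHE (e.g. by get_class_from_dynamic_module).
pipeline_cls = get_class_in_module(
    class_name="MyCustomPipeline",  # hypothetical class; None would auto-detect via find_pipeline_class
    module_path="local/my_custom_pipeline/pipeline.py",  # hypothetical path relative to HF_MODULES_CACHE
)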
@@ -467,6 +467,7 @@ class PushToHubMixin:
        token: Optional[str] = None,
        commit_message: Optional[str] = None,
        create_pr: bool = False,
        subfolder: Optional[str] = None,
    ):
        """
        Uploads all files in `working_dir` to `repo_id`.
@@ -481,7 +482,12 @@ class PushToHubMixin:
        logger.info(f"Uploading the files of {working_dir} to {repo_id}.")

        return upload_folder(
            repo_id=repo_id,
            folder_path=working_dir,
            token=token,
            commit_message=commit_message,
            create_pr=create_pr,
            path_in_repo=subfolder,
        )

    def push_to_hub(
@@ -493,6 +499,7 @@ class PushToHubMixin:
        create_pr: bool = False,
        safe_serialization: bool = True,
        variant: Optional[str] = None,
        subfolder: Optional[str] = None,
    ) -> str:
        """
        Upload model, scheduler, or pipeline files to the 🤗 Hugging Face Hub.
@@ -534,8 +541,9 @@ class PushToHubMixin:
        repo_id = create_repo(repo_id, private=private, token=token, exist_ok=True).repo_id

        # Create a new empty model card and eventually tag it
        if not subfolder:
            model_card = load_or_create_model_card(repo_id, token=token)
            model_card = populate_model_card(model_card)

        # Save all files.
        save_kwargs = {"safe_serialization": safe_serialization}
@@ -546,7 +554,8 @@ class PushToHubMixin:
            self.save_pretrained(tmpdir, **save_kwargs)

            # Update model card if needed:
            if not subfolder:
                model_card.save(os.path.join(tmpdir, "README.md"))

            return self._upload_folder(
                tmpdir,
@@ -554,4 +563,5 @@
                token=token,
                commit_message=commit_message,
                create_pr=create_pr,
                subfolder=subfolder,
            )
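A short usage sketch of the new `subfolder` argument (the repo id and attribute below are hypothetical): files are uploaded under that path via `upload_folder(path_in_repo=...)`, and the model card creation and save are skipped so a nested upload does not overwrite the repo-level README.

# Hypothetical usage sketch: push one component into a subfolder of an existing repo.
pipeline.transformer.push_to_hub(
    "some-user/my-modular-pipeline",  # hypothetical repo id
    subfolder="transformer",  # uploaded under transformer/ in the repo; README.md is left untouched
)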