Unverified Commit f50b18ee authored by YiYi Xu, committed by GitHub

[Modular] Qwen (#12220)

* add qwen modular
parent fc337d58
......@@ -20,6 +20,12 @@ All pipelines with [`VaeImageProcessor`] accept PIL Image, PyTorch tensor, or Nu
[[autodoc]] image_processor.VaeImageProcessor
## InpaintProcessor
The [`InpaintProcessor`] accepts `mask` and `image` inputs and processes them together. Optionally, it can accept `padding_mask_crop` and apply a mask overlay.
[[autodoc]] image_processor.InpaintProcessor
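A minimal usage sketch based on the `preprocess`/`postprocess` signatures added in this PR (hedged: the processor is new and may change; `init_image`, `mask_image`, and `decoded` are placeholders for your own inputs and decoded output):

```py
from diffusers.image_processor import InpaintProcessor

processor = InpaintProcessor(vae_scale_factor=8)

# preprocess returns the processed image, the processed mask, and kwargs for the overlay step
image_pt, mask_pt, overlay_kwargs = processor.preprocess(
    init_image, mask=mask_image, height=1024, width=1024, padding_mask_crop=32
)

# ... run the inpainting pipeline with image_pt / mask_pt and decode the result into `decoded` ...

# postprocess converts to PIL and, because padding_mask_crop was used, pastes the result back onto the original image
images = processor.postprocess(decoded, output_type="pil", **overlay_kwargs)
```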
## VaeImageProcessorLDM3D
The [`VaeImageProcessorLDM3D`] accepts RGB and depth inputs and returns RGB and depth outputs.
......
......@@ -385,6 +385,10 @@ else:
[
"FluxAutoBlocks",
"FluxModularPipeline",
"QwenImageAutoBlocks",
"QwenImageEditAutoBlocks",
"QwenImageEditModularPipeline",
"QwenImageModularPipeline",
"StableDiffusionXLAutoBlocks",
"StableDiffusionXLModularPipeline",
"WanAutoBlocks",
......@@ -1038,6 +1042,10 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
from .modular_pipelines import (
FluxAutoBlocks,
FluxModularPipeline,
QwenImageAutoBlocks,
QwenImageEditAutoBlocks,
QwenImageEditModularPipeline,
QwenImageModularPipeline,
StableDiffusionXLAutoBlocks,
StableDiffusionXLModularPipeline,
WanAutoBlocks,
......
......@@ -108,6 +108,7 @@ def _register_attention_processors_metadata():
from ..models.attention_processor import AttnProcessor2_0
from ..models.transformers.transformer_cogview4 import CogView4AttnProcessor
from ..models.transformers.transformer_flux import FluxAttnProcessor
from ..models.transformers.transformer_qwenimage import QwenDoubleStreamAttnProcessor2_0
from ..models.transformers.transformer_wan import WanAttnProcessor2_0
# AttnProcessor2_0
......@@ -140,6 +141,14 @@ def _register_attention_processors_metadata():
metadata=AttentionProcessorMetadata(skip_processor_output_fn=_skip_proc_output_fn_Attention_FluxAttnProcessor),
)
# QwenDoubleStreamAttnProcessor2_0
AttentionProcessorRegistry.register(
model_class=QwenDoubleStreamAttnProcessor2_0,
metadata=AttentionProcessorMetadata(
skip_processor_output_fn=_skip_proc_output_fn_Attention_QwenDoubleStreamAttnProcessor2_0
),
)
def _register_transformer_blocks_metadata():
from ..models.attention import BasicTransformerBlock
......@@ -298,4 +307,5 @@ _skip_proc_output_fn_Attention_CogView4AttnProcessor = _skip_attention___ret___h
_skip_proc_output_fn_Attention_WanAttnProcessor2_0 = _skip_attention___ret___hidden_states
# not sure what this is yet.
_skip_proc_output_fn_Attention_FluxAttnProcessor = _skip_attention___ret___hidden_states
_skip_proc_output_fn_Attention_QwenDoubleStreamAttnProcessor2_0 = _skip_attention___ret___hidden_states
# fmt: on
......@@ -523,6 +523,7 @@ class VaeImageProcessor(ConfigMixin):
size=(height, width),
)
image = self.pt_to_numpy(image)
return image
def binarize(self, image: PIL.Image.Image) -> PIL.Image.Image:
......@@ -838,6 +839,137 @@ class VaeImageProcessor(ConfigMixin):
return image
class InpaintProcessor(ConfigMixin):
"""
Image processor for inpainting image and mask.
"""
config_name = CONFIG_NAME
@register_to_config
def __init__(
self,
do_resize: bool = True,
vae_scale_factor: int = 8,
vae_latent_channels: int = 4,
resample: str = "lanczos",
reducing_gap: int = None,
do_normalize: bool = True,
do_binarize: bool = False,
do_convert_grayscale: bool = False,
mask_do_normalize: bool = False,
mask_do_binarize: bool = True,
mask_do_convert_grayscale: bool = True,
):
super().__init__()
self._image_processor = VaeImageProcessor(
do_resize=do_resize,
vae_scale_factor=vae_scale_factor,
vae_latent_channels=vae_latent_channels,
resample=resample,
reducing_gap=reducing_gap,
do_normalize=do_normalize,
do_binarize=do_binarize,
do_convert_grayscale=do_convert_grayscale,
)
self._mask_processor = VaeImageProcessor(
do_resize=do_resize,
vae_scale_factor=vae_scale_factor,
vae_latent_channels=vae_latent_channels,
resample=resample,
reducing_gap=reducing_gap,
do_normalize=mask_do_normalize,
do_binarize=mask_do_binarize,
do_convert_grayscale=mask_do_convert_grayscale,
)
def preprocess(
self,
image: PIL.Image.Image,
mask: PIL.Image.Image = None,
height: int = None,
width: int = None,
padding_mask_crop: Optional[int] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Preprocess the image and mask.
"""
if mask is None and padding_mask_crop is not None:
raise ValueError("mask must be provided if padding_mask_crop is provided")
# if mask is None, same behavior as regular image processor
if mask is None:
return self._image_processor.preprocess(image, height=height, width=width)
if padding_mask_crop is not None:
crops_coords = self._image_processor.get_crop_region(mask, width, height, pad=padding_mask_crop)
resize_mode = "fill"
else:
crops_coords = None
resize_mode = "default"
processed_image = self._image_processor.preprocess(
image,
height=height,
width=width,
crops_coords=crops_coords,
resize_mode=resize_mode,
)
processed_mask = self._mask_processor.preprocess(
mask,
height=height,
width=width,
resize_mode=resize_mode,
crops_coords=crops_coords,
)
if crops_coords is not None:
postprocessing_kwargs = {
"crops_coords": crops_coords,
"original_image": image,
"original_mask": mask,
}
else:
postprocessing_kwargs = {
"crops_coords": None,
"original_image": None,
"original_mask": None,
}
return processed_image, processed_mask, postprocessing_kwargs
def postprocess(
self,
image: torch.Tensor,
output_type: str = "pil",
original_image: Optional[PIL.Image.Image] = None,
original_mask: Optional[PIL.Image.Image] = None,
crops_coords: Optional[Tuple[int, int, int, int]] = None,
) -> Tuple[PIL.Image.Image, PIL.Image.Image]:
"""
Postprocess the image and optionally apply the mask overlay.
"""
image = self._image_processor.postprocess(
image,
output_type=output_type,
)
# optionally apply the mask overlay
if crops_coords is not None and (original_image is None or original_mask is None):
raise ValueError("original_image and original_mask must be provided if crops_coords is provided")
elif crops_coords is not None and output_type != "pil":
raise ValueError("output_type must be 'pil' if crops_coords is provided")
elif crops_coords is not None:
image = [
self._image_processor.apply_overlay(original_mask, original_image, i, crops_coords) for i in image
]
return image
class VaeImageProcessorLDM3D(VaeImageProcessor):
"""
Image processor for VAE LDM3D.
......
......@@ -47,6 +47,12 @@ else:
_import_structure["stable_diffusion_xl"] = ["StableDiffusionXLAutoBlocks", "StableDiffusionXLModularPipeline"]
_import_structure["wan"] = ["WanAutoBlocks", "WanModularPipeline"]
_import_structure["flux"] = ["FluxAutoBlocks", "FluxModularPipeline"]
_import_structure["qwenimage"] = [
"QwenImageAutoBlocks",
"QwenImageModularPipeline",
"QwenImageEditModularPipeline",
"QwenImageEditAutoBlocks",
]
_import_structure["components_manager"] = ["ComponentsManager"]
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
......@@ -68,6 +74,12 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
SequentialPipelineBlocks,
)
from .modular_pipeline_utils import ComponentSpec, ConfigSpec, InputParam, InsertableDict, OutputParam
from .qwenimage import (
QwenImageAutoBlocks,
QwenImageEditAutoBlocks,
QwenImageEditModularPipeline,
QwenImageModularPipeline,
)
from .stable_diffusion_xl import StableDiffusionXLAutoBlocks, StableDiffusionXLModularPipeline
from .wan import WanAutoBlocks, WanModularPipeline
else:
......
......@@ -56,6 +56,8 @@ MODULAR_PIPELINE_MAPPING = OrderedDict(
("stable-diffusion-xl", "StableDiffusionXLModularPipeline"),
("wan", "WanModularPipeline"),
("flux", "FluxModularPipeline"),
("qwenimage", "QwenImageModularPipeline"),
("qwenimage-edit", "QwenImageEditModularPipeline"),
]
)
......@@ -64,6 +66,8 @@ MODULAR_PIPELINE_BLOCKS_MAPPING = OrderedDict(
("StableDiffusionXLModularPipeline", "StableDiffusionXLAutoBlocks"),
("WanModularPipeline", "WanAutoBlocks"),
("FluxModularPipeline", "FluxAutoBlocks"),
("QwenImageModularPipeline", "QwenImageAutoBlocks"),
("QwenImageEditModularPipeline", "QwenImageEditAutoBlocks"),
]
)
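# Illustration (not part of the PR): these mappings wire "qwenimage" to the new modular classes.
# Assuming the same flow already documented for the SDXL modular pipeline (blocks ->
# init_pipeline -> load_components), usage would look roughly like:
#
#   from diffusers.modular_pipelines import QwenImageAutoBlocks
#   blocks = QwenImageAutoBlocks()
#   pipe = blocks.init_pipeline("Qwen/Qwen-Image")          # repo id is an assumption
#   pipe.load_components(torch_dtype=torch.bfloat16)
#   image = pipe(prompt="a coffee shop sign", output="images")[0]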
......@@ -133,8 +137,8 @@ class PipelineState:
Allow attribute access to intermediate values. If an attribute is not found in the object, look for it in the
intermediates dict.
"""
if name in self.intermediates:
return self.intermediates[name]
if name in self.values:
return self.values[name]
raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")
def __repr__(self):
......@@ -548,8 +552,11 @@ class AutoPipelineBlocks(ModularPipelineBlocks):
def __init__(self):
sub_blocks = InsertableDict()
for block_name, block_cls in zip(self.block_names, self.block_classes):
sub_blocks[block_name] = block_cls()
for block_name, block in zip(self.block_names, self.block_classes):
if inspect.isclass(block):
sub_blocks[block_name] = block()
else:
sub_blocks[block_name] = block
self.sub_blocks = sub_blocks
if not (len(self.block_classes) == len(self.block_names) == len(self.block_trigger_inputs)):
raise ValueError(
......@@ -830,7 +837,9 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
return expected_configs
@classmethod
def from_blocks_dict(cls, blocks_dict: Dict[str, Any]) -> "SequentialPipelineBlocks":
def from_blocks_dict(
cls, blocks_dict: Dict[str, Any], description: Optional[str] = None
) -> "SequentialPipelineBlocks":
"""Creates a SequentialPipelineBlocks instance from a dictionary of blocks.
Args:
......@@ -852,12 +861,19 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
instance.block_classes = [block.__class__ for block in sub_blocks.values()]
instance.block_names = list(sub_blocks.keys())
instance.sub_blocks = sub_blocks
if description is not None:
instance.description = description
return instance
def __init__(self):
sub_blocks = InsertableDict()
for block_name, block_cls in zip(self.block_names, self.block_classes):
sub_blocks[block_name] = block_cls()
for block_name, block in zip(self.block_names, self.block_classes):
if inspect.isclass(block):
sub_blocks[block_name] = block()
else:
sub_blocks[block_name] = block
self.sub_blocks = sub_blocks
def _get_inputs(self):
......@@ -1280,8 +1296,11 @@ class LoopSequentialPipelineBlocks(ModularPipelineBlocks):
def __init__(self):
sub_blocks = InsertableDict()
for block_name, block_cls in zip(self.block_names, self.block_classes):
sub_blocks[block_name] = block_cls()
for block_name, block in zip(self.block_names, self.block_classes):
if inspect.isclass(block):
sub_blocks[block_name] = block()
else:
sub_blocks[block_name] = block
self.sub_blocks = sub_blocks
@classmethod
......
from typing import TYPE_CHECKING
from ...utils import (
DIFFUSERS_SLOW_IMPORT,
OptionalDependencyNotAvailable,
_LazyModule,
get_objects_from_module,
is_torch_available,
is_transformers_available,
)
_dummy_objects = {}
_import_structure = {}
try:
if not (is_transformers_available() and is_torch_available()):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
from ...utils import dummy_torch_and_transformers_objects # noqa F403
_dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
else:
_import_structure["encoders"] = ["QwenImageTextEncoderStep"]
_import_structure["modular_blocks"] = [
"ALL_BLOCKS",
"AUTO_BLOCKS",
"CONTROLNET_BLOCKS",
"EDIT_AUTO_BLOCKS",
"EDIT_BLOCKS",
"EDIT_INPAINT_BLOCKS",
"IMAGE2IMAGE_BLOCKS",
"INPAINT_BLOCKS",
"TEXT2IMAGE_BLOCKS",
"QwenImageAutoBlocks",
"QwenImageEditAutoBlocks",
]
_import_structure["modular_pipeline"] = ["QwenImageEditModularPipeline", "QwenImageModularPipeline"]
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
try:
if not (is_transformers_available() and is_torch_available()):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
from ...utils.dummy_torch_and_transformers_objects import * # noqa F403
else:
from .encoders import (
QwenImageTextEncoderStep,
)
from .modular_blocks import (
ALL_BLOCKS,
AUTO_BLOCKS,
CONTROLNET_BLOCKS,
EDIT_AUTO_BLOCKS,
EDIT_BLOCKS,
EDIT_INPAINT_BLOCKS,
IMAGE2IMAGE_BLOCKS,
INPAINT_BLOCKS,
TEXT2IMAGE_BLOCKS,
QwenImageAutoBlocks,
QwenImageEditAutoBlocks,
)
from .modular_pipeline import QwenImageEditModularPipeline, QwenImageModularPipeline
else:
import sys
sys.modules[__name__] = _LazyModule(
__name__,
globals()["__file__"],
_import_structure,
module_spec=__spec__,
)
for name, value in _dummy_objects.items():
setattr(sys.modules[__name__], name, value)
# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Union
import numpy as np
import PIL
import torch
from ...configuration_utils import FrozenDict
from ...image_processor import InpaintProcessor, VaeImageProcessor
from ...models import AutoencoderKLQwenImage
from ...utils import logging
from ..modular_pipeline import ModularPipelineBlocks, PipelineState
from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
from .modular_pipeline import QwenImageModularPipeline, QwenImagePachifier
logger = logging.get_logger(__name__)
class QwenImageDecoderStep(ModularPipelineBlocks):
model_name = "qwenimage"
@property
def description(self) -> str:
return "Step that decodes the latents to images"
@property
def expected_components(self) -> List[ComponentSpec]:
components = [
ComponentSpec("vae", AutoencoderKLQwenImage),
ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config"),
]
return components
@property
def inputs(self) -> List[InputParam]:
return [
InputParam(name="height", required=True),
InputParam(name="width", required=True),
InputParam(
name="latents",
required=True,
type_hint=torch.Tensor,
description="The latents to decode, can be generated in the denoise step",
),
]
@property
def intermediate_outputs(self) -> List[str]:
return [
OutputParam(
"images",
type_hint=Union[List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray]],
description="The generated images; can be a PIL.Image.Image, torch.Tensor, or numpy array",
)
]
@torch.no_grad()
def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
# YiYi Notes: remove support for output_type="latents", we can just skip the decode/encode step in modular
block_state.latents = components.pachifier.unpack_latents(
block_state.latents, block_state.height, block_state.width
)
block_state.latents = block_state.latents.to(components.vae.dtype)
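# undo the latent normalization applied at encode time ((x - mean) / std) before passing to the VAE decoder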
latents_mean = (
torch.tensor(components.vae.config.latents_mean)
.view(1, components.vae.config.z_dim, 1, 1, 1)
.to(block_state.latents.device, block_state.latents.dtype)
)
latents_std = 1.0 / torch.tensor(components.vae.config.latents_std).view(
1, components.vae.config.z_dim, 1, 1, 1
).to(block_state.latents.device, block_state.latents.dtype)
block_state.latents = block_state.latents / latents_std + latents_mean
block_state.images = components.vae.decode(block_state.latents, return_dict=False)[0][:, :, 0]
self.set_block_state(state, block_state)
return components, state
class QwenImageProcessImagesOutputStep(ModularPipelineBlocks):
model_name = "qwenimage"
@property
def description(self) -> str:
return "postprocess the generated image"
@property
def expected_components(self) -> List[ComponentSpec]:
return [
ComponentSpec(
"image_processor",
VaeImageProcessor,
config=FrozenDict({"vae_scale_factor": 16}),
default_creation_method="from_config",
),
]
@property
def inputs(self) -> List[InputParam]:
return [
InputParam("images", required=True, description="the generated image from decoders step"),
InputParam(
name="output_type",
default="pil",
type_hint=str,
description="The type of the output images, can be 'pil', 'np', 'pt'",
),
]
@staticmethod
def check_inputs(output_type):
if output_type not in ["pil", "np", "pt"]:
raise ValueError(f"Invalid output_type: {output_type}")
@torch.no_grad()
def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
block_state = self.get_block_state(state)
self.check_inputs(block_state.output_type)
block_state.images = components.image_processor.postprocess(
image=block_state.images,
output_type=block_state.output_type,
)
self.set_block_state(state, block_state)
return components, state
class QwenImageInpaintProcessImagesOutputStep(ModularPipelineBlocks):
model_name = "qwenimage"
@property
def description(self) -> str:
return "postprocess the generated image, optional apply the mask overally to the original image.."
@property
def expected_components(self) -> List[ComponentSpec]:
return [
ComponentSpec(
"image_mask_processor",
InpaintProcessor,
config=FrozenDict({"vae_scale_factor": 16}),
default_creation_method="from_config",
),
]
@property
def inputs(self) -> List[InputParam]:
return [
InputParam("images", required=True, description="the generated image from decoders step"),
InputParam(
name="output_type",
default="pil",
type_hint=str,
description="The type of the output images, can be 'pil', 'np', 'pt'",
),
InputParam("mask_overlay_kwargs"),
]
@staticmethod
def check_inputs(output_type, mask_overlay_kwargs):
if output_type not in ["pil", "np", "pt"]:
raise ValueError(f"Invalid output_type: {output_type}")
if mask_overlay_kwargs and output_type != "pil":
raise ValueError("only support output_type 'pil' for mask overlay")
@torch.no_grad()
def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
block_state = self.get_block_state(state)
self.check_inputs(block_state.output_type, block_state.mask_overlay_kwargs)
if block_state.mask_overlay_kwargs is None:
mask_overlay_kwargs = {}
else:
mask_overlay_kwargs = block_state.mask_overlay_kwargs
block_state.images = components.image_mask_processor.postprocess(
image=block_state.images,
**mask_overlay_kwargs,
)
self.set_block_state(state, block_state)
return components, state
# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Tuple
import torch
from ...models import QwenImageMultiControlNetModel
from ..modular_pipeline import ModularPipelineBlocks, PipelineState
from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
from .modular_pipeline import QwenImageModularPipeline, QwenImagePachifier
def repeat_tensor_to_batch_size(
input_name: str,
input_tensor: torch.Tensor,
batch_size: int,
num_images_per_prompt: int = 1,
) -> torch.Tensor:
"""Repeat tensor elements to match the final batch size.
This function expands a tensor's batch dimension to match the final batch size (batch_size * num_images_per_prompt)
by repeating each element along dimension 0.
The input tensor must have batch size 1 or batch_size. The function will:
- If batch size is 1: repeat each element (batch_size * num_images_per_prompt) times
- If batch size equals batch_size: repeat each element num_images_per_prompt times
Args:
input_name (str): Name of the input tensor (used for error messages)
input_tensor (torch.Tensor): The tensor to repeat. Must have batch size 1 or batch_size.
batch_size (int): The base batch size (number of prompts)
num_images_per_prompt (int, optional): Number of images to generate per prompt. Defaults to 1.
Returns:
torch.Tensor: The repeated tensor with final batch size (batch_size * num_images_per_prompt)
Raises:
ValueError: If input_tensor is not a torch.Tensor or has invalid batch size
Examples:
tensor = torch.tensor([[1, 2, 3]])  # shape: [1, 3]
repeated = repeat_tensor_to_batch_size("image", tensor, batch_size=2, num_images_per_prompt=2)
repeated  # tensor([[1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3]]) - shape: [4, 3]

tensor = torch.tensor([[1, 2, 3], [4, 5, 6]])  # shape: [2, 3]
repeated = repeat_tensor_to_batch_size("image", tensor, batch_size=2, num_images_per_prompt=2)
repeated  # tensor([[1, 2, 3], [1, 2, 3], [4, 5, 6], [4, 5, 6]]) - shape: [4, 3]
"""
# make sure input is a tensor
if not isinstance(input_tensor, torch.Tensor):
raise ValueError(f"`{input_name}` must be a tensor")
# make sure input tensor e.g. image_latents has batch size 1 or batch_size same as prompts
if input_tensor.shape[0] == 1:
repeat_by = batch_size * num_images_per_prompt
elif input_tensor.shape[0] == batch_size:
repeat_by = num_images_per_prompt
else:
raise ValueError(
f"`{input_name}` must have have batch size 1 or {batch_size}, but got {input_tensor.shape[0]}"
)
# expand the tensor to match the batch_size * num_images_per_prompt
input_tensor = input_tensor.repeat_interleave(repeat_by, dim=0)
return input_tensor
def calculate_dimension_from_latents(latents: torch.Tensor, vae_scale_factor: int) -> Tuple[int, int]:
"""Calculate image dimensions from latent tensor dimensions.
This function converts latent space dimensions to image space dimensions by multiplying the latent height and width
by the VAE scale factor.
Args:
latents (torch.Tensor): The latent tensor. Must have 4 or 5 dimensions.
Expected shapes: [batch, channels, height, width] or [batch, channels, frames, height, width]
vae_scale_factor (int): The scale factor used by the VAE to compress images.
Typically 8 for most VAEs (image is 8x larger than latents in each dimension)
Returns:
Tuple[int, int]: The calculated image dimensions as (height, width)
Raises:
ValueError: If latents tensor doesn't have 4 or 5 dimensions
"""
# make sure the latents are not packed
if latents.ndim != 4 and latents.ndim != 5:
raise ValueError(f"unpacked latents must have 4 or 5 dimensions, but got {latents.ndim}")
latent_height, latent_width = latents.shape[-2:]
height = latent_height * vae_scale_factor
width = latent_width * vae_scale_factor
return height, width
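# Illustration (not part of the PR): a [1, 16, 1, 64, 64] latent with the default 8x VAE
# scale factor maps back to a 512x512 image:
#   calculate_dimension_from_latents(torch.zeros(1, 16, 1, 64, 64), vae_scale_factor=8)  # (512, 512)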
class QwenImageTextInputsStep(ModularPipelineBlocks):
model_name = "qwenimage"
@property
def description(self) -> str:
summary_section = (
"Text input processing step that standardizes text embeddings for the pipeline.\n"
"This step:\n"
" 1. Determines `batch_size` and `dtype` based on `prompt_embeds`\n"
" 2. Ensures all text embeddings have consistent batch sizes (batch_size * num_images_per_prompt)"
)
# Placement guidance
placement_section = "\n\nThis block should be placed after all encoder steps to process the text embeddings before they are used in subsequent pipeline steps."
return summary_section + placement_section
@property
def inputs(self) -> List[InputParam]:
return [
InputParam(name="num_images_per_prompt", default=1),
InputParam(name="prompt_embeds", required=True, kwargs_type="denoiser_input_fields"),
InputParam(name="prompt_embeds_mask", required=True, kwargs_type="denoiser_input_fields"),
InputParam(name="negative_prompt_embeds", kwargs_type="denoiser_input_fields"),
InputParam(name="negative_prompt_embeds_mask", kwargs_type="denoiser_input_fields"),
]
@property
def intermediate_outputs(self) -> List[str]:
return [
OutputParam(
"batch_size",
type_hint=int,
description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt",
),
OutputParam(
"dtype",
type_hint=torch.dtype,
description="Data type of model tensor inputs (determined by `prompt_embeds`)",
),
]
@staticmethod
def check_inputs(
prompt_embeds,
prompt_embeds_mask,
negative_prompt_embeds,
negative_prompt_embeds_mask,
):
if negative_prompt_embeds is not None and negative_prompt_embeds_mask is None:
raise ValueError("`negative_prompt_embeds_mask` is required when `negative_prompt_embeds` is not None")
if negative_prompt_embeds is None and negative_prompt_embeds_mask is not None:
raise ValueError("cannot pass `negative_prompt_embeds_mask` without `negative_prompt_embeds`")
if prompt_embeds_mask.shape[0] != prompt_embeds.shape[0]:
raise ValueError("`prompt_embeds_mask` must have the same batch size as `prompt_embeds`")
elif negative_prompt_embeds is not None and negative_prompt_embeds.shape[0] != prompt_embeds.shape[0]:
raise ValueError("`negative_prompt_embeds` must have the same batch size as `prompt_embeds`")
elif (
negative_prompt_embeds_mask is not None and negative_prompt_embeds_mask.shape[0] != prompt_embeds.shape[0]
):
raise ValueError("`negative_prompt_embeds_mask` must have the same batch size as `prompt_embeds`")
def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
self.check_inputs(
prompt_embeds=block_state.prompt_embeds,
prompt_embeds_mask=block_state.prompt_embeds_mask,
negative_prompt_embeds=block_state.negative_prompt_embeds,
negative_prompt_embeds_mask=block_state.negative_prompt_embeds_mask,
)
block_state.batch_size = block_state.prompt_embeds.shape[0]
block_state.dtype = block_state.prompt_embeds.dtype
_, seq_len, _ = block_state.prompt_embeds.shape
block_state.prompt_embeds = block_state.prompt_embeds.repeat(1, block_state.num_images_per_prompt, 1)
block_state.prompt_embeds = block_state.prompt_embeds.view(
block_state.batch_size * block_state.num_images_per_prompt, seq_len, -1
)
block_state.prompt_embeds_mask = block_state.prompt_embeds_mask.repeat(1, block_state.num_images_per_prompt, 1)
block_state.prompt_embeds_mask = block_state.prompt_embeds_mask.view(
block_state.batch_size * block_state.num_images_per_prompt, seq_len
)
if block_state.negative_prompt_embeds is not None:
_, seq_len, _ = block_state.negative_prompt_embeds.shape
block_state.negative_prompt_embeds = block_state.negative_prompt_embeds.repeat(
1, block_state.num_images_per_prompt, 1
)
block_state.negative_prompt_embeds = block_state.negative_prompt_embeds.view(
block_state.batch_size * block_state.num_images_per_prompt, seq_len, -1
)
block_state.negative_prompt_embeds_mask = block_state.negative_prompt_embeds_mask.repeat(
1, block_state.num_images_per_prompt, 1
)
block_state.negative_prompt_embeds_mask = block_state.negative_prompt_embeds_mask.view(
block_state.batch_size * block_state.num_images_per_prompt, seq_len
)
self.set_block_state(state, block_state)
return components, state
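# Illustration (not part of the PR) of the repeat/view pattern used above: with batch_size=2
# prompts and num_images_per_prompt=2, each prompt's embeddings are duplicated consecutively,
# giving a final batch of 4:
#
#   embeds = torch.arange(2 * 3 * 4, dtype=torch.float32).reshape(2, 3, 4)  # [batch, seq, dim]
#   out = embeds.repeat(1, 2, 1).view(2 * 2, 3, -1)                         # [4, 3, 4]
#   torch.equal(out[0], out[1]) and torch.equal(out[2], out[3])             # True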
class QwenImageInputsDynamicStep(ModularPipelineBlocks):
model_name = "qwenimage"
def __init__(
self,
image_latent_inputs: List[str] = ["image_latents"],
additional_batch_inputs: List[str] = [],
):
"""Initialize a configurable step that standardizes the inputs for the denoising step. It:\n"
This step handles multiple common tasks to prepare inputs for the denoising step:
1. For encoded image latents, use it update height/width if None, patchifies, and expands batch size
2. For additional_batch_inputs: Only expands batch dimensions to match final batch size
This is a dynamic block that allows you to configure which inputs to process.
Args:
image_latent_inputs (List[str], optional): Names of image latent tensors to process.
These will be used to determine height/width, patchified, and batch-expanded. Can be a single string or
list of strings. Defaults to ["image_latents"]. Examples: ["image_latents"], ["control_image_latents"]
additional_batch_inputs (List[str], optional):
Names of additional conditional input tensors to expand batch size. These tensors will only have their
batch dimensions adjusted to match the final batch size. Can be a single string or list of strings.
Defaults to []. Examples: ["processed_mask_image"]
Examples:
# Configure to process image_latents (default behavior)
QwenImageInputsDynamicStep()

# Configure to process multiple image latent inputs
QwenImageInputsDynamicStep(image_latent_inputs=["image_latents", "control_image_latents"])

# Configure to process image latents and additional batch inputs
QwenImageInputsDynamicStep(
    image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"]
)
"""
if not isinstance(image_latent_inputs, list):
image_latent_inputs = [image_latent_inputs]
if not isinstance(additional_batch_inputs, list):
additional_batch_inputs = [additional_batch_inputs]
self._image_latent_inputs = image_latent_inputs
self._additional_batch_inputs = additional_batch_inputs
super().__init__()
@property
def description(self) -> str:
# Functionality section
summary_section = (
"Input processing step that:\n"
" 1. For image latent inputs: Updates height/width if None, patchifies latents, and expands batch size\n"
" 2. For additional batch inputs: Expands batch dimensions to match final batch size"
)
# Inputs info
inputs_info = ""
if self._image_latent_inputs or self._additional_batch_inputs:
inputs_info = "\n\nConfigured inputs:"
if self._image_latent_inputs:
inputs_info += f"\n - Image latent inputs: {self._image_latent_inputs}"
if self._additional_batch_inputs:
inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}"
# Placement guidance
placement_section = "\n\nThis block should be placed after the encoder steps and the text input step."
return summary_section + inputs_info + placement_section
@property
def inputs(self) -> List[InputParam]:
inputs = [
InputParam(name="num_images_per_prompt", default=1),
InputParam(name="batch_size", required=True),
InputParam(name="height"),
InputParam(name="width"),
]
# Add image latent inputs
for image_latent_input_name in self._image_latent_inputs:
inputs.append(InputParam(name=image_latent_input_name))
# Add additional batch inputs
for input_name in self._additional_batch_inputs:
inputs.append(InputParam(name=input_name))
return inputs
@property
def expected_components(self) -> List[ComponentSpec]:
return [
ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config"),
]
def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
# Process image latent inputs (height/width calculation, patchify, and batch expansion)
for image_latent_input_name in self._image_latent_inputs:
image_latent_tensor = getattr(block_state, image_latent_input_name)
if image_latent_tensor is None:
continue
# 1. Calculate height/width from latents
height, width = calculate_dimension_from_latents(image_latent_tensor, components.vae_scale_factor)
block_state.height = block_state.height or height
block_state.width = block_state.width or width
# 2. Patchify the image latent tensor
image_latent_tensor = components.pachifier.pack_latents(image_latent_tensor)
# 3. Expand batch size
image_latent_tensor = repeat_tensor_to_batch_size(
input_name=image_latent_input_name,
input_tensor=image_latent_tensor,
num_images_per_prompt=block_state.num_images_per_prompt,
batch_size=block_state.batch_size,
)
setattr(block_state, image_latent_input_name, image_latent_tensor)
# Process additional batch inputs (only batch expansion)
for input_name in self._additional_batch_inputs:
input_tensor = getattr(block_state, input_name)
if input_tensor is None:
continue
# Only expand batch size
input_tensor = repeat_tensor_to_batch_size(
input_name=input_name,
input_tensor=input_tensor,
num_images_per_prompt=block_state.num_images_per_prompt,
batch_size=block_state.batch_size,
)
setattr(block_state, input_name, input_tensor)
self.set_block_state(state, block_state)
return components, state
class QwenImageControlNetInputsStep(ModularPipelineBlocks):
model_name = "qwenimage"
@property
def description(self) -> str:
return "prepare the `control_image_latents` for controlnet. Insert after all the other inputs steps."
@property
def inputs(self) -> List[InputParam]:
return [
InputParam(name="control_image_latents", required=True),
InputParam(name="batch_size", required=True),
InputParam(name="num_images_per_prompt", default=1),
InputParam(name="height"),
InputParam(name="width"),
]
@torch.no_grad()
def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
if isinstance(components.controlnet, QwenImageMultiControlNetModel):
control_image_latents = []
# loop through each control_image_latents
for i, control_image_latents_ in enumerate(block_state.control_image_latents):
# 1. update height/width if not provided
height, width = calculate_dimension_from_latents(control_image_latents_, components.vae_scale_factor)
block_state.height = block_state.height or height
block_state.width = block_state.width or width
# 2. pack
control_image_latents_ = components.pachifier.pack_latents(control_image_latents_)
# 3. repeat to match the batch size
control_image_latents_ = repeat_tensor_to_batch_size(
input_name=f"control_image_latents[{i}]",
input_tensor=control_image_latents_,
num_images_per_prompt=block_state.num_images_per_prompt,
batch_size=block_state.batch_size,
)
control_image_latents.append(control_image_latents_)
block_state.control_image_latents = control_image_latents
else:
# 1. update height/width if not provided
height, width = calculate_dimension_from_latents(
block_state.control_image_latents, components.vae_scale_factor
)
block_state.height = block_state.height or height
block_state.width = block_state.width or width
# 2. pack
block_state.control_image_latents = components.pachifier.pack_latents(block_state.control_image_latents)
# 3. repeat to match the batch size
block_state.control_image_latents = repeat_tensor_to_batch_size(
input_name="control_image_latents",
input_tensor=block_state.control_image_latents,
num_images_per_prompt=block_state.num_images_per_prompt,
batch_size=block_state.batch_size,
)
self.set_block_state(state, block_state)
return components, state
# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import QwenImageLoraLoaderMixin
from ..modular_pipeline import ModularPipeline
class QwenImagePachifier(ConfigMixin):
"""
A class to pack and unpack latents for QwenImage.
"""
config_name = "config.json"
@register_to_config
def __init__(
self,
patch_size: int = 2,
):
super().__init__()
def pack_latents(self, latents):
if latents.ndim != 4 and latents.ndim != 5:
raise ValueError(f"Latents must have 4 or 5 dimensions, but got {latents.ndim}")
if latents.ndim == 4:
latents = latents.unsqueeze(2)
batch_size, num_channels_latents, num_latent_frames, latent_height, latent_width = latents.shape
patch_size = self.config.patch_size
if latent_height % patch_size != 0 or latent_width % patch_size != 0:
raise ValueError(
f"Latent height and width must be divisible by {patch_size}, but got {latent_height} and {latent_width}"
)
latents = latents.view(
batch_size,
num_channels_latents,
latent_height // patch_size,
patch_size,
latent_width // patch_size,
patch_size,
)
latents = latents.permute(
0, 2, 4, 1, 3, 5
) # Batch_size, num_patches_height, num_patches_width, num_channels_latents, patch_size, patch_size
latents = latents.reshape(
batch_size,
(latent_height // patch_size) * (latent_width // patch_size),
num_channels_latents * patch_size * patch_size,
)
return latents
def unpack_latents(self, latents, height, width, vae_scale_factor=8):
if latents.ndim != 3:
raise ValueError(f"Latents must have 3 dimensions, but got {latents.ndim}")
batch_size, num_patches, channels = latents.shape
patch_size = self.config.patch_size
# VAE applies 8x compression on images but we must also account for packing which requires
# latent height and width to be divisible by 2.
height = patch_size * (int(height) // (vae_scale_factor * patch_size))
width = patch_size * (int(width) // (vae_scale_factor * patch_size))
latents = latents.view(
batch_size,
height // patch_size,
width // patch_size,
channels // (patch_size * patch_size),
patch_size,
patch_size,
)
latents = latents.permute(0, 3, 1, 4, 2, 5)
latents = latents.reshape(batch_size, channels // (patch_size * patch_size), 1, height, width)
return latents
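# Illustration (not part of the PR): pack_latents turns [B, C, 1, H/8, W/8] latents into a
# [B, (H/16) * (W/16), C * 4] token sequence, and unpack_latents inverts it given the image
# height/width. For a 512x512 image with 16 latent channels:
#
#   pachifier = QwenImagePachifier()                        # patch_size=2
#   latents = torch.randn(1, 16, 1, 64, 64)
#   packed = pachifier.pack_latents(latents)                # [1, 1024, 64]
#   restored = pachifier.unpack_latents(packed, 512, 512)   # [1, 16, 1, 64, 64]
#   torch.equal(latents, restored)                          # True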
class QwenImageModularPipeline(ModularPipeline, QwenImageLoraLoaderMixin):
"""
A ModularPipeline for QwenImage.
<Tip warning={true}>
This is an experimental feature and is likely to change in the future.
</Tip>
"""
@property
def default_height(self):
return self.default_sample_size * self.vae_scale_factor
@property
def default_width(self):
return self.default_sample_size * self.vae_scale_factor
@property
def default_sample_size(self):
return 128
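# With the default sample size (128) and the fallback VAE scale factor (8), default_height and default_width are 1024.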
@property
def vae_scale_factor(self):
vae_scale_factor = 8
if hasattr(self, "vae") and self.vae is not None:
vae_scale_factor = 2 ** len(self.vae.temperal_downsample)
return vae_scale_factor
@property
def num_channels_latents(self):
num_channels_latents = 16
if hasattr(self, "transformer") and self.transformer is not None:
num_channels_latents = self.transformer.config.in_channels // 4
return num_channels_latents
@property
def is_guidance_distilled(self):
is_guidance_distilled = False
if hasattr(self, "transformer") and self.transformer is not None:
is_guidance_distilled = self.transformer.config.guidance_embeds
return is_guidance_distilled
@property
def requires_unconditional_embeds(self):
requires_unconditional_embeds = False
if hasattr(self, "guider") and self.guider is not None:
requires_unconditional_embeds = self.guider._enabled and self.guider.num_conditions > 1
return requires_unconditional_embeds
class QwenImageEditModularPipeline(ModularPipeline, QwenImageLoraLoaderMixin):
"""
A ModularPipeline for QwenImage-Edit.
<Tip warning={true}>
This is an experimental feature and is likely to change in the future.
</Tip>
"""
# YiYi TODO: qwen edit should not provide default height/width, should be derived from the resized input image (after adjustment) produced by the resize step.
@property
def default_height(self):
return self.default_sample_size * self.vae_scale_factor
@property
def default_width(self):
return self.default_sample_size * self.vae_scale_factor
@property
def default_sample_size(self):
return 128
@property
def vae_scale_factor(self):
vae_scale_factor = 8
if hasattr(self, "vae") and self.vae is not None:
vae_scale_factor = 2 ** len(self.vae.temperal_downsample)
return vae_scale_factor
@property
def num_channels_latents(self):
num_channels_latents = 16
if hasattr(self, "transformer") and self.transformer is not None:
num_channels_latents = self.transformer.config.in_channels // 4
return num_channels_latents
@property
def is_guidance_distilled(self):
is_guidance_distilled = False
if hasattr(self, "transformer") and self.transformer is not None:
is_guidance_distilled = self.transformer.config.guidance_embeds
return is_guidance_distilled
@property
def requires_unconditional_embeds(self):
requires_unconditional_embeds = False
if hasattr(self, "guider") and self.guider is not None:
requires_unconditional_embeds = self.guider._enabled and self.guider.num_conditions > 1
return requires_unconditional_embeds
......@@ -76,6 +76,7 @@ class StableDiffusionXLModularPipeline(
vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
return vae_scale_factor
# YiYi TODO: change to num_channels_latents
@property
def num_channels_unet(self):
num_channels_unet = 4
......
......@@ -91,6 +91,14 @@ from .pag import (
StableDiffusionXLPAGPipeline,
)
from .pixart_alpha import PixArtAlphaPipeline, PixArtSigmaPipeline
from .qwenimage import (
QwenImageControlNetPipeline,
QwenImageEditInpaintPipeline,
QwenImageEditPipeline,
QwenImageImg2ImgPipeline,
QwenImageInpaintPipeline,
QwenImagePipeline,
)
from .sana import SanaPipeline
from .stable_cascade import StableCascadeCombinedPipeline, StableCascadeDecoderPipeline
from .stable_diffusion import (
......@@ -150,6 +158,8 @@ AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
("cogview3", CogView3PlusPipeline),
("cogview4", CogView4Pipeline),
("cogview4-control", CogView4ControlPipeline),
("qwenimage", QwenImagePipeline),
("qwenimage-controlnet", QwenImageControlNetPipeline),
]
)
......@@ -174,6 +184,8 @@ AUTO_IMAGE2IMAGE_PIPELINES_MAPPING = OrderedDict(
("flux-controlnet", FluxControlNetImg2ImgPipeline),
("flux-control", FluxControlImg2ImgPipeline),
("flux-kontext", FluxKontextPipeline),
("qwenimage", QwenImageImg2ImgPipeline),
("qwenimage-edit", QwenImageEditPipeline),
]
)
......@@ -195,6 +207,8 @@ AUTO_INPAINT_PIPELINES_MAPPING = OrderedDict(
("flux-controlnet", FluxControlNetInpaintPipeline),
("flux-control", FluxControlInpaintPipeline),
("stable-diffusion-pag", StableDiffusionPAGInpaintPipeline),
("qwenimage", QwenImageInpaintPipeline),
("qwenimage-edit", QwenImageEditInpaintPipeline),
]
)
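# Illustration (not part of the PR): with these registrations, the AutoPipeline API resolves
# Qwen-Image checkpoints to the new classes (the repo id below is an assumption):
#
#   from diffusers import AutoPipelineForText2Image
#   pipe = AutoPipelineForText2Image.from_pretrained("Qwen/Qwen-Image", torch_dtype=torch.bfloat16)
#   image = pipe("a coffee shop sign").images[0]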
......
......@@ -32,6 +32,66 @@ class FluxModularPipeline(metaclass=DummyObject):
requires_backends(cls, ["torch", "transformers"])
class QwenImageAutoBlocks(metaclass=DummyObject):
_backends = ["torch", "transformers"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch", "transformers"])
@classmethod
def from_config(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
class QwenImageEditAutoBlocks(metaclass=DummyObject):
_backends = ["torch", "transformers"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch", "transformers"])
@classmethod
def from_config(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
class QwenImageEditModularPipeline(metaclass=DummyObject):
_backends = ["torch", "transformers"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch", "transformers"])
@classmethod
def from_config(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
class QwenImageModularPipeline(metaclass=DummyObject):
_backends = ["torch", "transformers"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch", "transformers"])
@classmethod
def from_config(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
class StableDiffusionXLAutoBlocks(metaclass=DummyObject):
_backends = ["torch", "transformers"]
......