Unverified Commit 4088e8a8 authored by Jerry Wu, committed by GitHub

Add Support for Z-Image Series (#12703)



* Add Support for Z-Image.

* Reformatting with make style, black & isort.

* Remove init, Modify import utils, Merge forward in transformers block, Remove once func in pipeline.

* modified main model forward, freqs_cis left

* refactored to add B dim

* fixed stack issue

* fixed modulation bug

* fixed modulation bug

* fix bug

* remove value_from_time_aware_config

* styling

* Fix neg embed and divide (/) bug; Reuse pad zero tensor; Turn cat -> repeat; Add hint for attn processor.

* Replace padding with pad_sequence; Add gradient checkpointing.

* Fix flash_attn3 in the attention dispatch backend via _flash_attn_forward, replacing its original implementation; add a docstring in the pipeline for that.

* Fix Docstring and Make Style.

* Revert "Fix flash_attn3 in dispatch attn backend by _flash_attn_forward, replace its origin implement; Add DocString in pipeline for that."

This reverts commit fbf26b7ed11d55146103c97740bad4a5f91744e0.

* update z-image docstring

* Revert attention dispatcher

* update z-image docstring

* styling

* Restore attention_dispatch.py to its original implementation; FA3 compatibility will come in a separate, dedicated commit.

* Fix the previous bug, and support passing pre-encoded prompt_embeds in the call args as a list of torch Tensors.

* Remove einops dependency.

* remove redundant imports & make fix-copies

* fix import

---------
Co-authored-by: liudongyang <liudongyang0114@gmail.com>
parent d33d9f67
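For context, here is a hedged usage sketch of the new pipeline. Only `ZImagePipeline` and the `.images` output field are confirmed by this diff; the checkpoint id, dtype, and call arguments below are illustrative assumptions following the usual diffusers text-to-image conventions.

```python
import torch
from diffusers import ZImagePipeline

# "your-org/z-image-checkpoint" is a hypothetical repo id, not one named in this PR.
pipe = ZImagePipeline.from_pretrained(
    "your-org/z-image-checkpoint",
    torch_dtype=torch.bfloat16,  # dtype choice is an assumption
)
pipe.to("cuda")

out = pipe(
    prompt="a watercolor painting of a lighthouse at dusk",
    num_inference_steps=28,  # assumed value; Z-Image defaults may differ
    guidance_scale=4.0,      # assumed value
)
out.images[0].save("z_image_sample.png")
```

Per the commit notes above, pre-encoded `prompt_embeds` can also be passed directly as a list of torch Tensors instead of a prompt string.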
@@ -271,6 +271,7 @@ else:
"WanAnimateTransformer3DModel",
"WanTransformer3DModel",
"WanVACETransformer3DModel",
"ZImageTransformer2DModel",
"attention_backend",
]
)
@@ -647,6 +648,7 @@ else:
"WuerstchenCombinedPipeline",
"WuerstchenDecoderPipeline",
"WuerstchenPriorPipeline",
"ZImagePipeline",
]
)
@@ -1329,6 +1331,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
WuerstchenCombinedPipeline,
WuerstchenDecoderPipeline,
WuerstchenPriorPipeline,
ZImagePipeline,
)
try:
......
@@ -111,6 +111,7 @@ def _register_attention_processors_metadata():
from ..models.transformers.transformer_hunyuanimage import HunyuanImageAttnProcessor
from ..models.transformers.transformer_qwenimage import QwenDoubleStreamAttnProcessor2_0
from ..models.transformers.transformer_wan import WanAttnProcessor2_0
from ..models.transformers.transformer_z_image import ZSingleStreamAttnProcessor
# AttnProcessor2_0
AttentionProcessorRegistry.register(
@@ -158,6 +159,14 @@ def _register_attention_processors_metadata():
),
)
# ZSingleStreamAttnProcessor
AttentionProcessorRegistry.register(
model_class=ZSingleStreamAttnProcessor,
metadata=AttentionProcessorMetadata(
skip_processor_output_fn=_skip_proc_output_fn_Attention_ZSingleStreamAttnProcessor,
),
)
def _register_transformer_blocks_metadata():
from ..models.attention import BasicTransformerBlock
@@ -179,6 +188,7 @@ def _register_transformer_blocks_metadata():
from ..models.transformers.transformer_mochi import MochiTransformerBlock
from ..models.transformers.transformer_qwenimage import QwenImageTransformerBlock
from ..models.transformers.transformer_wan import WanTransformerBlock
from ..models.transformers.transformer_z_image import ZImageTransformerBlock
# BasicTransformerBlock
TransformerBlockRegistry.register(
@@ -312,6 +322,15 @@ def _register_transformer_blocks_metadata():
),
)
# ZImage
TransformerBlockRegistry.register(
model_class=ZImageTransformerBlock,
metadata=TransformerBlockMetadata(
return_hidden_states_index=0,
return_encoder_hidden_states_index=None,
),
)
# fmt: off
def _skip_attention___ret___hidden_states(self, *args, **kwargs):
@@ -338,4 +357,5 @@ _skip_proc_output_fn_Attention_WanAttnProcessor2_0 = _skip_attention___ret___hid
_skip_proc_output_fn_Attention_FluxAttnProcessor = _skip_attention___ret___hidden_states
_skip_proc_output_fn_Attention_QwenDoubleStreamAttnProcessor2_0 = _skip_attention___ret___hidden_states
_skip_proc_output_fn_Attention_HunyuanImageAttnProcessor = _skip_attention___ret___hidden_states
_skip_proc_output_fn_Attention_ZSingleStreamAttnProcessor = _skip_attention___ret___hidden_states
# fmt: on
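The two registrations above only attach metadata so that shared diffusers machinery (attention skipping, hook utilities) knows how to handle the new Z-Image classes. A minimal, self-contained sketch of that registry pattern follows; the classes are simplified stand-ins, not the real diffusers internals, and only the registered names come from this PR.

```python
from dataclasses import dataclass
from typing import Callable, Dict, Optional


@dataclass
class ProcessorMetadata:
    # Called instead of the real processor when attention is skipped; for
    # ZSingleStreamAttnProcessor this just returns hidden_states unchanged.
    skip_processor_output_fn: Callable


@dataclass
class BlockMetadata:
    # Index of hidden_states in the block's return value; None for the encoder
    # index means the block does not return encoder_hidden_states.
    return_hidden_states_index: int
    return_encoder_hidden_states_index: Optional[int]


class Registry:
    _store: Dict[type, object] = {}

    @classmethod
    def register(cls, model_class: type, metadata: object) -> None:
        cls._store[model_class] = metadata

    @classmethod
    def get(cls, model_class: type) -> object:
        return cls._store[model_class]


def skip_attention_return_hidden_states(attn, hidden_states, *args, **kwargs):
    # Mirrors _skip_attention___ret___hidden_states: pass hidden_states through.
    return hidden_states


class ZSingleStreamAttnProcessor:  # stand-in for the processor added here
    pass


class ZImageTransformerBlock:  # stand-in for the block added here
    pass


Registry.register(
    ZSingleStreamAttnProcessor,
    ProcessorMetadata(skip_processor_output_fn=skip_attention_return_hidden_states),
)
Registry.register(
    ZImageTransformerBlock,
    BlockMetadata(return_hidden_states_index=0, return_encoder_hidden_states_index=None),
)
```

`return_hidden_states_index=0` simply records that the block's hidden states sit at position 0 of its output, which is all downstream hooks need to know.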
@@ -110,6 +110,7 @@ if is_torch_available():
_import_structure["transformers.transformer_wan"] = ["WanTransformer3DModel"]
_import_structure["transformers.transformer_wan_animate"] = ["WanAnimateTransformer3DModel"]
_import_structure["transformers.transformer_wan_vace"] = ["WanVACETransformer3DModel"]
_import_structure["transformers.transformer_z_image"] = ["ZImageTransformer2DModel"]
_import_structure["unets.unet_1d"] = ["UNet1DModel"]
_import_structure["unets.unet_2d"] = ["UNet2DModel"]
_import_structure["unets.unet_2d_condition"] = ["UNet2DConditionModel"]
@@ -218,6 +219,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
WanAnimateTransformer3DModel,
WanTransformer3DModel,
WanVACETransformer3DModel,
ZImageTransformer2DModel,
)
from .unets import (
I2VGenXLUNet,
......
@@ -44,3 +44,4 @@ if is_torch_available():
from .transformer_wan import WanTransformer3DModel
from .transformer_wan_animate import WanAnimateTransformer3DModel
from .transformer_wan_vace import WanVACETransformer3DModel
from .transformer_z_image import ZImageTransformer2DModel
This diff is collapsed.
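The collapsed diff above is presumably the new transformer implementation itself. Assuming `ZImageTransformer2DModel` follows the usual `ModelMixin` layout, it could also be loaded standalone roughly like this; the repo id and `subfolder` name are assumptions, only the class name comes from this PR.

```python
import torch
from diffusers import ZImageTransformer2DModel

transformer = ZImageTransformer2DModel.from_pretrained(
    "your-org/z-image-checkpoint",  # hypothetical repo id
    subfolder="transformer",        # assumed standard diffusers repo layout
    torch_dtype=torch.bfloat16,
)
print(f"parameters: {sum(p.numel() for p in transformer.parameters()):,}")
```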
@@ -395,6 +395,7 @@ else:
"WanVACEPipeline",
"WanAnimatePipeline",
]
_import_structure["z_image"] = ["ZImagePipeline"]
_import_structure["kandinsky5"] = ["Kandinsky5T2VPipeline"]
_import_structure["skyreels_v2"] = [
"SkyReelsV2DiffusionForcingPipeline",
@@ -824,6 +825,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
WuerstchenDecoderPipeline,
WuerstchenPriorPipeline,
)
from .z_image import ZImagePipeline
try:
if not is_onnx_available():
......
from typing import TYPE_CHECKING
from ...utils import (
DIFFUSERS_SLOW_IMPORT,
OptionalDependencyNotAvailable,
_LazyModule,
get_objects_from_module,
is_torch_available,
is_transformers_available,
)
_dummy_objects = {}
_import_structure = {}
try:
if not (is_transformers_available() and is_torch_available()):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
from ...utils import dummy_torch_and_transformers_objects # noqa: F403
_dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
else:
_import_structure["pipeline_output"] = ["ZImagePipelineOutput"]
_import_structure["pipeline_z_image"] = ["ZImagePipeline"]
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
try:
if not (is_transformers_available() and is_torch_available()):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
from ...utils.dummy_torch_and_transformers_objects import *
else:
from .pipeline_output import ZImagePipelineOutput
from .pipeline_z_image import ZImagePipeline
else:
import sys
sys.modules[__name__] = _LazyModule(
__name__,
globals()["__file__"],
_import_structure,
module_spec=__spec__,
)
for name, value in _dummy_objects.items():
setattr(sys.modules[__name__], name, value)
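This `__init__.py` follows the standard diffusers lazy-import layout: nothing heavy is imported until an attribute is accessed, and if `torch`/`transformers` are missing the dummy objects are exposed instead. As a sketch of the expected behavior (not new API), both import paths registered by this PR should resolve to the same class:

```python
# Requires torch and transformers to be installed, otherwise the dummy stubs load.
from diffusers import ZImagePipeline as TopLevel
from diffusers.pipelines.z_image import ZImagePipeline as FromSubpackage

assert TopLevel is FromSubpackage  # expected: the same lazily resolved class
```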
# Copyright 2025 Alibaba Z-Image Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass
from typing import List, Union
import numpy as np
import PIL.Image
from diffusers.utils import BaseOutput
@dataclass
class ZImagePipelineOutput(BaseOutput):
"""
Output class for Z-Image pipelines.
Args:
images (`List[PIL.Image.Image]` or `np.ndarray`)
List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
num_channels)`. The PIL images or NumPy array represent the denoised images produced by the diffusion pipeline.
"""
images: Union[List[PIL.Image.Image], np.ndarray]
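Because `ZImagePipelineOutput` subclasses `BaseOutput`, callers read the result by attribute; tuple-style output via `return_dict=False` is the common diffusers convention and is assumed here rather than confirmed by this hunk. A short sketch, reusing the hypothetical checkpoint id from the usage example above:

```python
import torch
from diffusers import ZImagePipeline

pipe = ZImagePipeline.from_pretrained(
    "your-org/z-image-checkpoint", torch_dtype=torch.bfloat16  # hypothetical id
).to("cuda")

out = pipe("a photo of a red fox in the snow")
image = out.images[0]  # ZImagePipelineOutput.images, as documented above

# Assumed convention: return a plain tuple when return_dict=False is passed.
(images,) = pipe("a photo of a red fox in the snow", return_dict=False)
```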
This diff is collapsed.
@@ -3645,3 +3645,18 @@ class WuerstchenPriorPipeline(metaclass=DummyObject):
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
class ZImagePipeline(metaclass=DummyObject):
_backends = ["torch", "transformers"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch", "transformers"])
@classmethod
def from_config(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
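These stubs exist so that, in an environment without `torch`/`transformers`, importing `ZImagePipeline` still works and the failure is deferred to first use with a clear message. A sketch of the expected behavior; the exact error type and text come from `requires_backends` and are assumed here.

```python
from diffusers import ZImagePipeline  # resolves to the DummyObject stub here

try:
    ZImagePipeline.from_pretrained("your-org/z-image-checkpoint")  # hypothetical id
except ImportError as err:
    print("Z-Image needs missing backends:", err)
```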