Unverified Commit 2246d2c7 authored by CalamitousFelicitousness, committed by GitHub

Add ZImageImg2ImgPipeline (#12751)



* Add ZImageImg2ImgPipeline

Updated the pipeline structure to include ZImageImg2ImgPipeline
    alongside ZImagePipeline.
Implemented the ZImageImg2ImgPipeline class for image-to-image
    transformations, including necessary methods for
    encoding prompts, preparing latents, and denoising.
Enhanced the auto_pipeline to map the new ZImageImg2ImgPipeline
    for image-to-image tasks.
Added unit tests for ZImageImg2ImgPipeline to ensure
    functionality and performance.
Updated dummy objects to include ZImageImg2ImgPipeline for
    testing purposes.

* Address review comments for ZImageImg2ImgPipeline

- Add `# Copied from` annotations to encode_prompt and _encode_prompt
- Add ZImagePipeline to auto_pipeline.py for AutoPipeline support

* Add ZImage pipeline documentation

---------
Co-authored-by: YiYi Xu <yixu310@gmail.com>
Co-authored-by: Álvaro Somoza <asomoza@users.noreply.github.com>
parent 671149e0
@@ -26,8 +26,41 @@ specific language governing permissions and limitations under the License.
Z-Image-Turbo is a distilled version of Z-Image that matches or exceeds leading competitors with only 8 NFEs (Number of Function Evaluations). It offers sub-second inference latency on enterprise-grade H800 GPUs and fits comfortably within 16G VRAM consumer devices. It excels in photorealistic image generation, bilingual text rendering (English & Chinese), and robust instruction adherence.
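Text-to-image generation with [`ZImagePipeline`] follows the usual Diffusers pattern. Below is a minimal sketch; the checkpoint, step count, and guidance settings mirror the image-to-image example further down and may need adjusting for your setup.

```python
import torch
from diffusers import ZImagePipeline

pipe = ZImagePipeline.from_pretrained("Tongyi-MAI/Z-Image-Turbo", torch_dtype=torch.bfloat16)
pipe.to("cuda")

prompt = "A photorealistic city street at night with neon signs in English and Chinese"
image = pipe(
    prompt,
    num_inference_steps=9,
    guidance_scale=0.0,
    generator=torch.Generator("cuda").manual_seed(42),
).images[0]
image.save("zimage_t2i.png")
```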
## Image-to-image
Use [`ZImageImg2ImgPipeline`] to transform an existing image based on a text prompt.
```python
import torch
from diffusers import ZImageImg2ImgPipeline
from diffusers.utils import load_image
pipe = ZImageImg2ImgPipeline.from_pretrained("Tongyi-MAI/Z-Image-Turbo", torch_dtype=torch.bfloat16)
pipe.to("cuda")

url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
init_image = load_image(url).resize((1024, 1024))

prompt = "A fantasy landscape with mountains and a river, detailed, vibrant colors"
image = pipe(
    prompt,
    image=init_image,
    strength=0.6,
    num_inference_steps=9,
    guidance_scale=0.0,
    generator=torch.Generator("cuda").manual_seed(42),
).images[0]
image.save("zimage_img2img.png")
```
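As with other image-to-image pipelines in Diffusers, `strength` controls how far the input image is pushed toward the prompt: values close to 0 keep the input largely intact, values close to 1 behave almost like pure text-to-image generation, and only roughly `strength * num_inference_steps` denoising steps are actually run.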
## ZImagePipeline
[[autodoc]] ZImagePipeline
- all
- __call__
## ZImageImg2ImgPipeline
[[autodoc]] ZImageImg2ImgPipeline
- all
- __call__
@@ -662,6 +662,7 @@ else:
"WuerstchenCombinedPipeline",
"WuerstchenDecoderPipeline",
"WuerstchenPriorPipeline",
"ZImageImg2ImgPipeline",
"ZImagePipeline", "ZImagePipeline",
] ]
) )
@@ -1360,6 +1361,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
WuerstchenCombinedPipeline,
WuerstchenDecoderPipeline,
WuerstchenPriorPipeline,
ZImageImg2ImgPipeline,
ZImagePipeline,
)
...
@@ -404,7 +404,7 @@ else:
"Kandinsky5T2IPipeline",
"Kandinsky5I2IPipeline",
]
_import_structure["z_image"] = ["ZImagePipeline"] _import_structure["z_image"] = ["ZImageImg2ImgPipeline", "ZImagePipeline"]
_import_structure["skyreels_v2"] = [ _import_structure["skyreels_v2"] = [
"SkyReelsV2DiffusionForcingPipeline", "SkyReelsV2DiffusionForcingPipeline",
"SkyReelsV2DiffusionForcingImageToVideoPipeline", "SkyReelsV2DiffusionForcingImageToVideoPipeline",
...@@ -841,7 +841,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: ...@@ -841,7 +841,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
WuerstchenDecoderPipeline, WuerstchenDecoderPipeline,
WuerstchenPriorPipeline, WuerstchenPriorPipeline,
) )
from .z_image import ZImageImg2ImgPipeline, ZImagePipeline
try:
if not is_onnx_available():
...
@@ -119,6 +119,7 @@ from .stable_diffusion_xl import (
)
from .wan import WanImageToVideoPipeline, WanPipeline, WanVideoToVideoPipeline
from .wuerstchen import WuerstchenCombinedPipeline, WuerstchenDecoderPipeline
from .z_image import ZImageImg2ImgPipeline, ZImagePipeline
AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
@@ -162,6 +163,7 @@ AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
("cogview4-control", CogView4ControlPipeline),
("qwenimage", QwenImagePipeline),
("qwenimage-controlnet", QwenImageControlNetPipeline),
("z-image", ZImagePipeline),
]
)
@@ -189,6 +191,7 @@ AUTO_IMAGE2IMAGE_PIPELINES_MAPPING = OrderedDict(
("qwenimage", QwenImageImg2ImgPipeline),
("qwenimage-edit", QwenImageEditPipeline),
("qwenimage-edit-plus", QwenImageEditPlusPipeline),
("z-image", ZImageImg2ImgPipeline),
]
)
...
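Because the commit registers `"z-image"` keys in both auto pipeline mappings, the checkpoint should also be loadable through the task-specific auto classes. A hedged sketch, assuming the checkpoint's `model_index.json` resolves to the Z-Image pipeline classes:

```python
import torch
from diffusers import AutoPipelineForImage2Image
from diffusers.utils import load_image

# Resolves to ZImageImg2ImgPipeline via the new ("z-image", ZImageImg2ImgPipeline) mapping entry
pipe = AutoPipelineForImage2Image.from_pretrained("Tongyi-MAI/Z-Image-Turbo", torch_dtype=torch.bfloat16)
pipe.to("cuda")

init_image = load_image(
    "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
).resize((1024, 1024))
image = pipe(
    "A fantasy landscape with mountains and a river",
    image=init_image,
    strength=0.6,
    num_inference_steps=9,
    guidance_scale=0.0,
).images[0]
```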
@@ -23,6 +23,7 @@ except OptionalDependencyNotAvailable:
else:
_import_structure["pipeline_output"] = ["ZImagePipelineOutput"]
_import_structure["pipeline_z_image"] = ["ZImagePipeline"]
_import_structure["pipeline_z_image_img2img"] = ["ZImageImg2ImgPipeline"]
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
@@ -35,6 +36,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
else:
from .pipeline_output import ZImagePipelineOutput
from .pipeline_z_image import ZImagePipeline
from .pipeline_z_image_img2img import ZImageImg2ImgPipeline
else:
import sys
...
This diff is collapsed.
@@ -3752,6 +3752,21 @@ class WuerstchenPriorPipeline(metaclass=DummyObject):
requires_backends(cls, ["torch", "transformers"])
class ZImageImg2ImgPipeline(metaclass=DummyObject):
    _backends = ["torch", "transformers"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch", "transformers"])

    @classmethod
    def from_config(cls, *args, **kwargs):
        requires_backends(cls, ["torch", "transformers"])

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_backends(cls, ["torch", "transformers"])
class ZImagePipeline(metaclass=DummyObject):
    _backends = ["torch", "transformers"]
...
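Like the other placeholders in this file, the dummy class only exists so that `ZImageImg2ImgPipeline` remains importable from `diffusers` when `torch` or `transformers` is not installed; `requires_backends` then raises an error naming the missing backends as soon as the class is instantiated or loaded.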
# Copyright 2025 Alibaba Z-Image Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import os
import unittest

import numpy as np
import torch
from transformers import Qwen2Tokenizer, Qwen3Config, Qwen3Model

from diffusers import (
    AutoencoderKL,
    FlowMatchEulerDiscreteScheduler,
    ZImageImg2ImgPipeline,
    ZImageTransformer2DModel,
)
from diffusers.utils.testing_utils import floats_tensor

from ...testing_utils import torch_device
from ..pipeline_params import (
    IMAGE_TO_IMAGE_IMAGE_PARAMS,
    TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
    TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
)
from ..test_pipelines_common import PipelineTesterMixin, to_np
# Z-Image requires torch.use_deterministic_algorithms(False) due to complex64 RoPE operations
# Cannot use enable_full_determinism() which sets it to True
# Note: Z-Image does not support FP16 inference due to complex64 RoPE embeddings
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
torch.use_deterministic_algorithms(False)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
if hasattr(torch.backends, "cuda"):
    torch.backends.cuda.matmul.allow_tf32 = False
class ZImageImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = ZImageImg2ImgPipeline
    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"cross_attention_kwargs"}
    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
    image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
    image_latents_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
    required_optional_params = frozenset(
        [
            "num_inference_steps",
            "strength",
            "generator",
            "latents",
            "return_dict",
            "callback_on_step_end",
            "callback_on_step_end_tensor_inputs",
        ]
    )
    supports_dduf = False
    test_xformers_attention = False
    test_layerwise_casting = True
    test_group_offloading = True
    def setUp(self):
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.synchronize()
        torch.manual_seed(0)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(0)

    def tearDown(self):
        super().tearDown()
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.synchronize()
        torch.manual_seed(0)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(0)
    def get_dummy_components(self):
        torch.manual_seed(0)
        transformer = ZImageTransformer2DModel(
            all_patch_size=(2,),
            all_f_patch_size=(1,),
            in_channels=16,
            dim=32,
            n_layers=2,
            n_refiner_layers=1,
            n_heads=2,
            n_kv_heads=2,
            norm_eps=1e-5,
            qk_norm=True,
            cap_feat_dim=16,
            rope_theta=256.0,
            t_scale=1000.0,
            axes_dims=[8, 4, 4],
            axes_lens=[256, 32, 32],
        )

        torch.manual_seed(0)
        vae = AutoencoderKL(
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            block_out_channels=[32, 64],
            layers_per_block=1,
            latent_channels=16,
            norm_num_groups=32,
            sample_size=32,
            scaling_factor=0.3611,
            shift_factor=0.1159,
        )

        torch.manual_seed(0)
        scheduler = FlowMatchEulerDiscreteScheduler()

        torch.manual_seed(0)
        config = Qwen3Config(
            hidden_size=16,
            intermediate_size=16,
            num_hidden_layers=2,
            num_attention_heads=2,
            num_key_value_heads=2,
            vocab_size=151936,
            max_position_embeddings=512,
        )
        text_encoder = Qwen3Model(config)
        tokenizer = Qwen2Tokenizer.from_pretrained("hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration")

        components = {
            "transformer": transformer,
            "vae": vae,
            "scheduler": scheduler,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
        }
        return components
    def get_dummy_inputs(self, device, seed=0):
        import random

        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)

        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)

        inputs = {
            "prompt": "dance monkey",
            "negative_prompt": "bad quality",
            "image": image,
            "strength": 0.6,
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 3.0,
            "cfg_normalization": False,
            "cfg_truncation": 1.0,
            "height": 32,
            "width": 32,
            "max_sequence_length": 16,
            "output_type": "np",
        }
        return inputs
    def test_inference(self):
        device = "cpu"

        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        image = pipe(**inputs).images
        generated_image = image[0]

        self.assertEqual(generated_image.shape, (32, 32, 3))
    def test_inference_batch_single_identical(self):
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.synchronize()
        torch.manual_seed(0)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(0)
        self._test_inference_batch_single_identical(batch_size=3, expected_max_diff=1e-1)
    def test_num_images_per_prompt(self):
        import inspect

        sig = inspect.signature(self.pipeline_class.__call__)
        if "num_images_per_prompt" not in sig.parameters:
            return

        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        batch_sizes = [1, 2]
        num_images_per_prompts = [1, 2]

        for batch_size in batch_sizes:
            for num_images_per_prompt in num_images_per_prompts:
                inputs = self.get_dummy_inputs(torch_device)

                for key in inputs.keys():
                    if key in self.batch_params:
                        inputs[key] = batch_size * [inputs[key]]

                images = pipe(**inputs, num_images_per_prompt=num_images_per_prompt)[0]

                assert images.shape[0] == batch_size * num_images_per_prompt

        del pipe
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.synchronize()
    def test_attention_slicing_forward_pass(
        self, test_max_difference=True, test_mean_pixel_difference=True, expected_max_diff=1e-3
    ):
        if not self.test_attention_slicing:
            return

        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        for component in pipe.components.values():
            if hasattr(component, "set_default_attn_processor"):
                component.set_default_attn_processor()
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        generator_device = "cpu"
        inputs = self.get_dummy_inputs(generator_device)
        output_without_slicing = pipe(**inputs)[0]

        pipe.enable_attention_slicing(slice_size=1)
        inputs = self.get_dummy_inputs(generator_device)
        output_with_slicing1 = pipe(**inputs)[0]

        pipe.enable_attention_slicing(slice_size=2)
        inputs = self.get_dummy_inputs(generator_device)
        output_with_slicing2 = pipe(**inputs)[0]

        if test_max_difference:
            max_diff1 = np.abs(to_np(output_with_slicing1) - to_np(output_without_slicing)).max()
            max_diff2 = np.abs(to_np(output_with_slicing2) - to_np(output_without_slicing)).max()
            self.assertLess(
                max(max_diff1, max_diff2),
                expected_max_diff,
                "Attention slicing should not affect the inference results",
            )
    def test_vae_tiling(self, expected_diff_max: float = 0.3):
        import random

        generator_device = "cpu"
        components = self.get_dummy_components()

        pipe = self.pipeline_class(**components)
        pipe.to("cpu")
        pipe.set_progress_bar_config(disable=None)

        # Without tiling
        inputs = self.get_dummy_inputs(generator_device)
        inputs["height"] = inputs["width"] = 128
        # Generate a larger image for the input
        inputs["image"] = floats_tensor((1, 3, 128, 128), rng=random.Random(0)).to("cpu")
        output_without_tiling = pipe(**inputs)[0]

        # With tiling (standard AutoencoderKL doesn't accept parameters)
        pipe.vae.enable_tiling()
        inputs = self.get_dummy_inputs(generator_device)
        inputs["height"] = inputs["width"] = 128
        inputs["image"] = floats_tensor((1, 3, 128, 128), rng=random.Random(0)).to("cpu")
        output_with_tiling = pipe(**inputs)[0]

        self.assertLess(
            (to_np(output_without_tiling) - to_np(output_with_tiling)).max(),
            expected_diff_max,
            "VAE tiling should not affect the inference results",
        )
    def test_pipeline_with_accelerator_device_map(self, expected_max_difference=5e-4):
        # Z-Image RoPE embeddings (complex64) have slightly higher numerical tolerance
        super().test_pipeline_with_accelerator_device_map(expected_max_difference=expected_max_difference)

    def test_group_offloading_inference(self):
        # Block-level offloading conflicts with RoPE cache. Pipeline-level offloading (tested separately) works fine.
        self.skipTest("Using test_pipeline_level_group_offloading_inference instead")

    def test_save_load_float16(self, expected_max_diff=1e-2):
        # Z-Image does not support FP16 due to complex64 RoPE embeddings
        self.skipTest("Z-Image does not support FP16 inference")

    def test_float16_inference(self, expected_max_diff=5e-2):
        # Z-Image does not support FP16 due to complex64 RoPE embeddings
        self.skipTest("Z-Image does not support FP16 inference")
    def test_strength_parameter(self):
        """Test that strength parameter affects the output correctly."""
        device = "cpu"
        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        # Test with different strength values
        inputs_low_strength = self.get_dummy_inputs(device)
        inputs_low_strength["strength"] = 0.2

        inputs_high_strength = self.get_dummy_inputs(device)
        inputs_high_strength["strength"] = 0.8

        # Both should complete without errors
        output_low = pipe(**inputs_low_strength).images[0]
        output_high = pipe(**inputs_high_strength).images[0]

        # Outputs should be different (different amount of transformation)
        self.assertFalse(np.allclose(output_low, output_high, atol=1e-3))
    def test_invalid_strength(self):
        """Test that invalid strength values raise appropriate errors."""
        device = "cpu"
        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe.to(device)

        inputs = self.get_dummy_inputs(device)

        # Test strength < 0
        inputs["strength"] = -0.1
        with self.assertRaises(ValueError):
            pipe(**inputs)

        # Test strength > 1
        inputs["strength"] = 1.5
        with self.assertRaises(ValueError):
            pipe(**inputs)