Unverified commit b3e5cd6b authored by Patrick von Platen, committed by GitHub

[Kandinsky] Add combined pipelines / Fix cpu model offload / Fix inpainting (#4207)



* Add combined pipeline

* Download readme

* Upload

* up

* up

* fix final

* Add enable model cpu offload kandinsky

* finish

* finish

* Fix

* fix more

* make style

* fix kandinsky mask

* fix inpainting test

* add callbacks

* add tests

* fix tests

* Apply suggestions from code review

Co-authored-by: YiYi Xu <yixu310@gmail.com>

* docs

* docs

* correct docs

* fix tests

* add warning

* correct docs

---------
Co-authored-by: YiYi Xu <yixu310@gmail.com>
parent b37dc3b3
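For context when reviewing, a minimal usage sketch of the new API added here. The checkpoint id, dtype, and prompt are illustrative assumptions, not taken from this diff:

import torch
from diffusers import KandinskyV22CombinedPipeline

# One combined pipeline runs the prior (text -> image embedding) and the
# decoder (image embedding -> image) in a single __call__.
pipe = KandinskyV22CombinedPipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()  # new: whole sub-models occupy the GPU only while they run

image = pipe(prompt="A photo of a cat wearing a hat", num_inference_steps=25).images[0]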
@@ -7,6 +7,8 @@ from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer
from ...models import PriorTransformer
from ...schedulers import UnCLIPScheduler
from ...utils import (
    is_accelerate_available,
    is_accelerate_version,
    logging,
    randn_tensor,
    replace_example_docstring,
@@ -162,7 +164,7 @@ class KandinskyV22PriorEmb2EmbPipeline(DiffusionPipeline):
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        negative_prior_prompt: Optional[str] = None,
-        negative_prompt: Union[str] = "",
+        negative_prompt: str = "",
        guidance_scale: float = 4.0,
        device=None,
    ):
@@ -392,6 +394,35 @@ class KandinskyV22PriorEmb2EmbPipeline(DiffusionPipeline):
        return prompt_embeds, text_encoder_hidden_states, text_mask
    def enable_model_cpu_offload(self, gpu_id=0):
        r"""
        Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
        method is called, and the model remains on the GPU until the next model runs. Memory savings are lower than with
        `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
        """
        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
            from accelerate import cpu_offload_with_hook
        else:
            raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")

        device = torch.device(f"cuda:{gpu_id}")

        if self.device.type != "cpu":
            self.to("cpu", silence_dtype_warnings=True)
            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)

        hook = None
        for cpu_offloaded_model in [self.text_encoder, self.prior]:
            _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)

        # We'll offload the last model manually.
        self.prior_hook = hook

        _, hook = cpu_offload_with_hook(self.image_encoder, device, prev_module_hook=self.prior_hook)

        self.final_offload_hook = hook
    @torch.no_grad()
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
@@ -549,8 +580,12 @@ class KandinskyV22PriorEmb2EmbPipeline(DiffusionPipeline):
        # if negative prompt has been defined, we split the image embedding into two
        if negative_prompt is None:
            zero_embeds = self.get_zero_embed(latents.shape[0], device=latents.device)

            if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
                self.final_offload_hook.offload()
        else:
            image_embeddings, zero_embeds = image_embeddings.chunk(2)

            if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
                self.prior_hook.offload()

        if output_type not in ["pt", "np"]:
            raise ValueError(f"Only the output types `pt` and `np` are supported not output_type={output_type}")
......
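The hook chaining in `enable_model_cpu_offload` above follows accelerate's model-offload pattern. A minimal standalone sketch of the idea (the two `nn.Linear` modules are stand-ins for the real `text_encoder` / `prior`):

import torch
from accelerate import cpu_offload_with_hook

device = torch.device("cuda:0")
model_a, model_b = torch.nn.Linear(8, 8), torch.nn.Linear(8, 8)

# Each wrapped module is moved to `device` right before its forward pass;
# passing the previous hook lets the earlier module be offloaded back to
# CPU as soon as the next one starts running.
hook = None
for module in (model_a, model_b):
    _, hook = cpu_offload_with_hook(module, device, prev_module_hook=hook)

out = model_b(model_a(torch.randn(1, 8)))
hook.offload()  # manually return the last module to CPU, as the pipeline does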
@@ -28,7 +28,7 @@ from typing import Any, Callable, Dict, List, Optional, Union
import numpy as np
import PIL
import torch
-from huggingface_hub import hf_hub_download, model_info, snapshot_download
+from huggingface_hub import ModelCard, hf_hub_download, model_info, snapshot_download
from packaging import version
from requests.exceptions import HTTPError
from tqdm.auto import tqdm
@@ -78,6 +78,7 @@ INDEX_FILE = "diffusion_pytorch_model.bin"
CUSTOM_PIPELINE_FILE_NAME = "pipeline.py"
DUMMY_MODULES_FOLDER = "diffusers.utils"
TRANSFORMERS_DUMMY_MODULES_FOLDER = "transformers.utils"
CONNECTED_PIPES_KEYS = ["prior"]

logger = logging.get_logger(__name__)
@@ -322,7 +323,9 @@ def get_class_obj_and_candidates(library_name, class_name, importable_classes, pipelines):
    return class_obj, class_candidates
-def _get_pipeline_class(class_obj, config, custom_pipeline=None, cache_dir=None, revision=None):
+def _get_pipeline_class(
+    class_obj, config, load_connected_pipeline=False, custom_pipeline=None, cache_dir=None, revision=None
+):
    if custom_pipeline is not None:
        if custom_pipeline.endswith(".py"):
            path = Path(custom_pipeline)
@@ -340,7 +343,22 @@ def _get_pipeline_class(class_obj, config, custom_pipeline=None, cache_dir=None, revision=None):
            return class_obj

    diffusers_module = importlib.import_module(class_obj.__module__.split(".")[0])
-    return getattr(diffusers_module, config["_class_name"])
+    pipeline_cls = getattr(diffusers_module, config["_class_name"])

    if load_connected_pipeline:
        from .auto_pipeline import _get_connected_pipeline

        connected_pipeline_cls = _get_connected_pipeline(pipeline_cls)
        if connected_pipeline_cls is not None:
            logger.info(
                f"Loading connected pipeline {connected_pipeline_cls.__name__} instead of {pipeline_cls.__name__} as specified via `load_connected_pipeline=True`"
            )
        else:
            logger.info(f"{pipeline_cls.__name__} has no connected pipeline class. Loading {pipeline_cls.__name__}.")

        pipeline_cls = connected_pipeline_cls or pipeline_cls

    return pipeline_cls
def load_sub_model(
@@ -475,6 +493,7 @@ class DiffusionPipeline(ConfigMixin):
    config_name = "model_index.json"
    _optional_components = []
    _exclude_from_cpu_offload = []
    _load_connected_pipes = False

    def register_modules(self, **kwargs):
        # import it here to avoid circular import
@@ -875,6 +894,7 @@
        low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT)
        variant = kwargs.pop("variant", None)
        use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False)
        load_connected_pipeline = kwargs.pop("load_connected_pipeline", False)

        # 1. Download the checkpoints and configs
        # use snapshot download here to get it working from from_pretrained
@@ -893,6 +913,7 @@
                custom_pipeline=custom_pipeline,
                custom_revision=custom_revision,
                variant=variant,
                load_connected_pipeline=load_connected_pipeline,
                **kwargs,
            )
        else:
@@ -920,7 +941,12 @@
        # 3. Load the pipeline class, if using custom module then load it from the hub
        # if we load from explicit class, let's use it
        pipeline_class = _get_pipeline_class(
-            cls, config_dict, custom_pipeline=custom_pipeline, cache_dir=cache_dir, revision=custom_revision
+            cls,
+            config_dict,
+            load_connected_pipeline=load_connected_pipeline,
+            custom_pipeline=custom_pipeline,
+            cache_dir=cache_dir,
+            revision=custom_revision,
        )

        # DEPRECATED: To be removed in 1.0.0
@@ -1061,6 +1087,42 @@
            init_kwargs[name] = loaded_sub_model  # UNet(...), # DiffusionSchedule(...)
        if pipeline_class._load_connected_pipes and os.path.isfile(os.path.join(cached_folder, "README.md")):
            modelcard = ModelCard.load(os.path.join(cached_folder, "README.md"))
            connected_pipes = {prefix: getattr(modelcard.data, prefix, [None])[0] for prefix in CONNECTED_PIPES_KEYS}
            load_kwargs = {
                "cache_dir": cache_dir,
                "resume_download": resume_download,
                "force_download": force_download,
                "proxies": proxies,
                "local_files_only": local_files_only,
                "use_auth_token": use_auth_token,
                "revision": revision,
                "torch_dtype": torch_dtype,
                "custom_pipeline": custom_pipeline,
                "custom_revision": custom_revision,
                "provider": provider,
                "sess_options": sess_options,
                "device_map": device_map,
                "max_memory": max_memory,
                "offload_folder": offload_folder,
                "offload_state_dict": offload_state_dict,
                "low_cpu_mem_usage": low_cpu_mem_usage,
                "variant": variant,
                "use_safetensors": use_safetensors,
            }
            connected_pipes = {
                prefix: DiffusionPipeline.from_pretrained(repo_id, **load_kwargs.copy())
                for prefix, repo_id in connected_pipes.items()
                if repo_id is not None
            }

            for prefix, connected_pipe in connected_pipes.items():
                # add connected pipes to `init_kwargs` with <prefix>_<component_name>, e.g. "prior_text_encoder"
                init_kwargs.update(
                    {"_".join([prefix, name]): component for name, component in connected_pipe.components.items()}
                )
        # 7. Potentially add passed objects if expected
        missing_modules = set(expected_modules) - set(init_kwargs.keys())
        passed_modules = list(passed_class_obj.keys())
@@ -1231,6 +1293,7 @@
        custom_revision = kwargs.pop("custom_revision", None)
        variant = kwargs.pop("variant", None)
        use_safetensors = kwargs.pop("use_safetensors", None)
        load_connected_pipeline = kwargs.pop("load_connected_pipeline", False)

        if use_safetensors and not is_safetensors_available():
            raise ValueError(
@@ -1242,7 +1305,6 @@
            use_safetensors = is_safetensors_available()
            allow_pickle = True

-        pipeline_is_cached = False
        allow_patterns = None
        ignore_patterns = None
@@ -1322,7 +1384,12 @@
        # retrieve passed components that should not be downloaded
        pipeline_class = _get_pipeline_class(
-            cls, config_dict, custom_pipeline=custom_pipeline, cache_dir=cache_dir, revision=custom_revision
+            cls,
+            config_dict,
+            load_connected_pipeline=load_connected_pipeline,
+            custom_pipeline=custom_pipeline,
+            cache_dir=cache_dir,
+            revision=custom_revision,
        )
        expected_components, _ = cls._get_signature_keys(pipeline_class)
        passed_components = [k for k in expected_components if k in kwargs]
@@ -1367,6 +1434,10 @@
            allow_patterns = [
                p for p in allow_patterns if not (len(p.split("/")) == 2 and p.split("/")[0] in passed_components)
            ]

            if pipeline_class._load_connected_pipes:
                allow_patterns.append("README.md")

            # Don't download index files of forbidden patterns either
            ignore_patterns = ignore_patterns + [f"{i}.index.*json" for i in ignore_patterns]
@@ -1390,7 +1461,7 @@
        # download all allow_patterns - ignore_patterns
        try:
-            return snapshot_download(
+            cached_folder = snapshot_download(
                pretrained_model_name,
                cache_dir=cache_dir,
                resume_download=resume_download,
@@ -1402,6 +1473,15 @@
                ignore_patterns=ignore_patterns,
                user_agent=user_agent,
            )

            if pipeline_class._load_connected_pipes:
                modelcard = ModelCard.load(os.path.join(cached_folder, "README.md"))
                connected_pipes = sum([getattr(modelcard.data, k, []) for k in CONNECTED_PIPES_KEYS], [])
                for connected_pipe_repo_id in connected_pipes:
                    DiffusionPipeline.download(connected_pipe_repo_id)

            return cached_folder

        except FileNotFoundError:
            # Means we tried to load pipeline with `local_files_only=True` but the files have not been found in local cache.
            # This can happen in two cases:
......
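In practice the new flag ties the pieces above together; a sketch (the repo id is an assumption, and the decoder checkpoint's model card must list a `prior` entry for anything to be attached):

from diffusers import DiffusionPipeline

# With load_connected_pipeline=True, _get_pipeline_class swaps in the
# connected combined class, and the prior's components are registered
# under a "prior_" prefix (e.g. pipe.prior_text_encoder).
pipe = DiffusionPipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-2-decoder", load_connected_pipeline=True
)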
@@ -167,6 +167,36 @@ class ImageTextPipelineOutput(metaclass=DummyObject):
        requires_backends(cls, ["torch", "transformers"])
class KandinskyCombinedPipeline(metaclass=DummyObject):
    _backends = ["torch", "transformers"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch", "transformers"])

    @classmethod
    def from_config(cls, *args, **kwargs):
        requires_backends(cls, ["torch", "transformers"])

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_backends(cls, ["torch", "transformers"])


class KandinskyImg2ImgCombinedPipeline(metaclass=DummyObject):
    _backends = ["torch", "transformers"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch", "transformers"])

    @classmethod
    def from_config(cls, *args, **kwargs):
        requires_backends(cls, ["torch", "transformers"])

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_backends(cls, ["torch", "transformers"])


class KandinskyImg2ImgPipeline(metaclass=DummyObject):
    _backends = ["torch", "transformers"]
@@ -182,6 +212,21 @@ class KandinskyImg2ImgPipeline(metaclass=DummyObject):
        requires_backends(cls, ["torch", "transformers"])


class KandinskyInpaintCombinedPipeline(metaclass=DummyObject):
    _backends = ["torch", "transformers"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch", "transformers"])

    @classmethod
    def from_config(cls, *args, **kwargs):
        requires_backends(cls, ["torch", "transformers"])

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_backends(cls, ["torch", "transformers"])


class KandinskyInpaintPipeline(metaclass=DummyObject):
    _backends = ["torch", "transformers"]
@@ -227,6 +272,21 @@ class KandinskyPriorPipeline(metaclass=DummyObject):
        requires_backends(cls, ["torch", "transformers"])


class KandinskyV22CombinedPipeline(metaclass=DummyObject):
    _backends = ["torch", "transformers"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch", "transformers"])

    @classmethod
    def from_config(cls, *args, **kwargs):
        requires_backends(cls, ["torch", "transformers"])

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_backends(cls, ["torch", "transformers"])


class KandinskyV22ControlnetImg2ImgPipeline(metaclass=DummyObject):
    _backends = ["torch", "transformers"]
@@ -257,6 +317,21 @@ class KandinskyV22ControlnetPipeline(metaclass=DummyObject):
        requires_backends(cls, ["torch", "transformers"])


class KandinskyV22Img2ImgCombinedPipeline(metaclass=DummyObject):
    _backends = ["torch", "transformers"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch", "transformers"])

    @classmethod
    def from_config(cls, *args, **kwargs):
        requires_backends(cls, ["torch", "transformers"])

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_backends(cls, ["torch", "transformers"])


class KandinskyV22Img2ImgPipeline(metaclass=DummyObject):
    _backends = ["torch", "transformers"]
@@ -272,6 +347,21 @@ class KandinskyV22Img2ImgPipeline(metaclass=DummyObject):
        requires_backends(cls, ["torch", "transformers"])


class KandinskyV22InpaintCombinedPipeline(metaclass=DummyObject):
    _backends = ["torch", "transformers"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch", "transformers"])

    @classmethod
    def from_config(cls, *args, **kwargs):
        requires_backends(cls, ["torch", "transformers"])

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_backends(cls, ["torch", "transformers"])


class KandinskyV22InpaintPipeline(metaclass=DummyObject):
    _backends = ["torch", "transformers"]
......
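All of the classes above follow the same dummy-object pattern; a simplified sketch of how such a placeholder works (not the exact library implementation):

class DummyObject(type):
    # Metaclass for placeholder classes: any attribute access on the class
    # raises an ImportError naming the missing backends, instead of failing
    # with an unrelated AttributeError somewhere deeper in user code.
    def __getattr__(cls, name):
        raise ImportError(f"{cls.__name__} requires the backends {cls._backends}")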
@@ -64,6 +64,10 @@ def randn_tensor(
    elif gen_device_type != device.type and gen_device_type == "cuda":
        raise ValueError(f"Cannot generate a {device} tensor from a generator of type {gen_device_type}.")

    # make sure generator list of length 1 is treated like a non-list
    if isinstance(generator, list) and len(generator) == 1:
        generator = generator[0]

    if isinstance(generator, list):
        shape = (1,) + shape[1:]
        latents = [
......
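The effect of the new guard in `randn_tensor`, as a quick check (import path assumed; `randn_tensor` lived in `diffusers.utils` at the time):

import torch
from diffusers.utils import randn_tensor

a = randn_tensor((2, 4), generator=[torch.Generator().manual_seed(0)])
b = randn_tensor((2, 4), generator=torch.Generator().manual_seed(0))
# Before this change, a length-1 generator list went down the per-batch-item
# branch (my reading: indexing generator[1] for the second item would fail);
# now the list is unwrapped first and both calls produce identical latents.
assert torch.equal(a, b)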
@@ -32,30 +32,7 @@ from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
enable_full_determinism()

-class KandinskyPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = KandinskyPipeline
-    params = [
-        "prompt",
-        "image_embeds",
-        "negative_image_embeds",
-    ]
-    batch_params = ["prompt", "negative_prompt", "image_embeds", "negative_image_embeds"]
-    required_optional_params = [
-        "generator",
-        "height",
-        "width",
-        "latents",
-        "guidance_scale",
-        "negative_prompt",
-        "num_inference_steps",
-        "return_dict",
-        "guidance_scale",
-        "num_images_per_prompt",
-        "output_type",
-        "return_dict",
-    ]
-    test_xformers_attention = False
+class Dummies:
    @property
    def text_embedder_hidden_size(self):
        return 32
@@ -74,7 +51,7 @@ class KandinskyPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    @property
    def cross_attention_dim(self):
-        return 100
+        return 32

    @property
    def dummy_tokenizer(self):
@@ -196,6 +173,39 @@ class KandinskyPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        }
        return inputs
class KandinskyPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = KandinskyPipeline
    params = [
        "prompt",
        "image_embeds",
        "negative_image_embeds",
    ]
    batch_params = ["prompt", "negative_prompt", "image_embeds", "negative_image_embeds"]
    required_optional_params = [
        "generator",
        "height",
        "width",
        "latents",
        "guidance_scale",
        "negative_prompt",
        "num_inference_steps",
        "return_dict",
        "guidance_scale",
        "num_images_per_prompt",
        "output_type",
        "return_dict",
    ]
    test_xformers_attention = False

    def get_dummy_components(self):
        dummy = Dummies()
        return dummy.get_dummy_components()

    def get_dummy_inputs(self, device, seed=0):
        dummy = Dummies()
        return dummy.get_dummy_inputs(device=device, seed=seed)
    def test_kandinsky(self):
        device = "cpu"
@@ -219,9 +229,7 @@ class KandinskyPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        assert image.shape == (1, 64, 64, 3)

-        expected_slice = np.array(
-            [0.328663, 1.0, 0.23216873, 1.0, 0.92717564, 0.4639046, 0.96894777, 0.31713378, 0.6293953]
-        )
+        expected_slice = np.array([1.0000, 1.0000, 0.2766, 1.0000, 0.5447, 0.1737, 1.0000, 0.4316, 0.9024])
        assert (
            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
......
# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from diffusers import KandinskyCombinedPipeline, KandinskyImg2ImgCombinedPipeline, KandinskyInpaintCombinedPipeline
from diffusers.utils import torch_device
from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu
from ..test_pipelines_common import PipelineTesterMixin
from .test_kandinsky import Dummies
from .test_kandinsky_img2img import Dummies as Img2ImgDummies
from .test_kandinsky_inpaint import Dummies as InpaintDummies
from .test_kandinsky_prior import Dummies as PriorDummies
enable_full_determinism()
class KandinskyPipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = KandinskyCombinedPipeline
    params = [
        "prompt",
    ]
    batch_params = ["prompt", "negative_prompt"]
    required_optional_params = [
        "generator",
        "height",
        "width",
        "latents",
        "guidance_scale",
        "negative_prompt",
        "num_inference_steps",
        "return_dict",
        "guidance_scale",
        "num_images_per_prompt",
        "output_type",
        "return_dict",
    ]
    test_xformers_attention = False

    def get_dummy_components(self):
        dummy = Dummies()
        prior_dummy = PriorDummies()
        components = dummy.get_dummy_components()

        components.update({f"prior_{k}": v for k, v in prior_dummy.get_dummy_components().items()})
        return components

    def get_dummy_inputs(self, device, seed=0):
        prior_dummy = PriorDummies()
        inputs = prior_dummy.get_dummy_inputs(device=device, seed=seed)
        inputs.update(
            {
                "height": 64,
                "width": 64,
            }
        )
        return inputs

    def test_kandinsky(self):
        device = "cpu"

        components = self.get_dummy_components()

        pipe = self.pipeline_class(**components)
        pipe = pipe.to(device)

        pipe.set_progress_bar_config(disable=None)

        output = pipe(**self.get_dummy_inputs(device))
        image = output.images

        image_from_tuple = pipe(
            **self.get_dummy_inputs(device),
            return_dict=False,
        )[0]

        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)

        expected_slice = np.array([0.0000, 0.0000, 0.6777, 0.1363, 0.3624, 0.7868, 0.3869, 0.3395, 0.5068])

        assert (
            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
        assert (
            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"

    @require_torch_gpu
    def test_offloads(self):
        pipes = []
        components = self.get_dummy_components()
        sd_pipe = self.pipeline_class(**components).to(torch_device)
        pipes.append(sd_pipe)

        components = self.get_dummy_components()
        sd_pipe = self.pipeline_class(**components)
        sd_pipe.enable_model_cpu_offload()
        pipes.append(sd_pipe)

        components = self.get_dummy_components()
        sd_pipe = self.pipeline_class(**components)
        sd_pipe.enable_sequential_cpu_offload()
        pipes.append(sd_pipe)

        image_slices = []
        for pipe in pipes:
            inputs = self.get_dummy_inputs(torch_device)
            image = pipe(**inputs).images
            image_slices.append(image[0, -3:, -3:, -1].flatten())

        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
        assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3

    def test_inference_batch_single_identical(self):
        super().test_inference_batch_single_identical(expected_max_diff=1e-2)


class KandinskyPipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = KandinskyImg2ImgCombinedPipeline
    params = ["prompt", "image"]
    batch_params = ["prompt", "negative_prompt", "image"]
    required_optional_params = [
        "generator",
        "height",
        "width",
        "latents",
        "guidance_scale",
        "negative_prompt",
        "num_inference_steps",
        "return_dict",
        "guidance_scale",
        "num_images_per_prompt",
        "output_type",
        "return_dict",
    ]
    test_xformers_attention = False

    def get_dummy_components(self):
        dummy = Img2ImgDummies()
        prior_dummy = PriorDummies()
        components = dummy.get_dummy_components()

        components.update({f"prior_{k}": v for k, v in prior_dummy.get_dummy_components().items()})
        return components

    def get_dummy_inputs(self, device, seed=0):
        prior_dummy = PriorDummies()
        dummy = Img2ImgDummies()
        inputs = prior_dummy.get_dummy_inputs(device=device, seed=seed)
        inputs.update(dummy.get_dummy_inputs(device=device, seed=seed))
        inputs.pop("image_embeds")
        inputs.pop("negative_image_embeds")
        return inputs

    def test_kandinsky(self):
        device = "cpu"

        components = self.get_dummy_components()

        pipe = self.pipeline_class(**components)
        pipe = pipe.to(device)

        pipe.set_progress_bar_config(disable=None)

        output = pipe(**self.get_dummy_inputs(device))
        image = output.images

        image_from_tuple = pipe(
            **self.get_dummy_inputs(device),
            return_dict=False,
        )[0]

        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)

        expected_slice = np.array([0.4260, 0.3596, 0.4571, 0.3890, 0.4087, 0.5137, 0.4819, 0.4116, 0.5053])

        assert (
            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
        assert (
            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"

    @require_torch_gpu
    def test_offloads(self):
        pipes = []
        components = self.get_dummy_components()
        sd_pipe = self.pipeline_class(**components).to(torch_device)
        pipes.append(sd_pipe)

        components = self.get_dummy_components()
        sd_pipe = self.pipeline_class(**components)
        sd_pipe.enable_model_cpu_offload()
        pipes.append(sd_pipe)

        components = self.get_dummy_components()
        sd_pipe = self.pipeline_class(**components)
        sd_pipe.enable_sequential_cpu_offload()
        pipes.append(sd_pipe)

        image_slices = []
        for pipe in pipes:
            inputs = self.get_dummy_inputs(torch_device)
            image = pipe(**inputs).images
            image_slices.append(image[0, -3:, -3:, -1].flatten())

        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
        assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3

    def test_inference_batch_single_identical(self):
        super().test_inference_batch_single_identical(expected_max_diff=1e-2)


class KandinskyPipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = KandinskyInpaintCombinedPipeline
    params = ["prompt", "image", "mask_image"]
    batch_params = ["prompt", "negative_prompt", "image", "mask_image"]
    required_optional_params = [
        "generator",
        "height",
        "width",
        "latents",
        "guidance_scale",
        "negative_prompt",
        "num_inference_steps",
        "return_dict",
        "guidance_scale",
        "num_images_per_prompt",
        "output_type",
        "return_dict",
    ]
    test_xformers_attention = False

    def get_dummy_components(self):
        dummy = InpaintDummies()
        prior_dummy = PriorDummies()
        components = dummy.get_dummy_components()

        components.update({f"prior_{k}": v for k, v in prior_dummy.get_dummy_components().items()})
        return components

    def get_dummy_inputs(self, device, seed=0):
        prior_dummy = PriorDummies()
        dummy = InpaintDummies()
        inputs = prior_dummy.get_dummy_inputs(device=device, seed=seed)
        inputs.update(dummy.get_dummy_inputs(device=device, seed=seed))
        inputs.pop("image_embeds")
        inputs.pop("negative_image_embeds")
        return inputs

    def test_kandinsky(self):
        device = "cpu"

        components = self.get_dummy_components()

        pipe = self.pipeline_class(**components)
        pipe = pipe.to(device)

        pipe.set_progress_bar_config(disable=None)

        output = pipe(**self.get_dummy_inputs(device))
        image = output.images

        image_from_tuple = pipe(
            **self.get_dummy_inputs(device),
            return_dict=False,
        )[0]

        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)

        expected_slice = np.array([0.0477, 0.0808, 0.2972, 0.2705, 0.3620, 0.6247, 0.4464, 0.2870, 0.3530])

        assert (
            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
        assert (
            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"

    @require_torch_gpu
    def test_offloads(self):
        pipes = []
        components = self.get_dummy_components()
        sd_pipe = self.pipeline_class(**components).to(torch_device)
        pipes.append(sd_pipe)

        components = self.get_dummy_components()
        sd_pipe = self.pipeline_class(**components)
        sd_pipe.enable_model_cpu_offload()
        pipes.append(sd_pipe)

        components = self.get_dummy_components()
        sd_pipe = self.pipeline_class(**components)
        sd_pipe.enable_sequential_cpu_offload()
        pipes.append(sd_pipe)

        image_slices = []
        for pipe in pipes:
            inputs = self.get_dummy_inputs(torch_device)
            image = pipe(**inputs).images
            image_slices.append(image[0, -3:, -3:, -1].flatten())

        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
        assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3

    def test_inference_batch_single_identical(self):
        super().test_inference_batch_single_identical(expected_max_diff=1e-2)
@@ -40,32 +40,7 @@ from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
enable_full_determinism()

-class KandinskyImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = KandinskyImg2ImgPipeline
-    params = ["prompt", "image_embeds", "negative_image_embeds", "image"]
-    batch_params = [
-        "prompt",
-        "negative_prompt",
-        "image_embeds",
-        "negative_image_embeds",
-        "image",
-    ]
-    required_optional_params = [
-        "generator",
-        "height",
-        "width",
-        "strength",
-        "guidance_scale",
-        "negative_prompt",
-        "num_inference_steps",
-        "return_dict",
-        "guidance_scale",
-        "num_images_per_prompt",
-        "output_type",
-        "return_dict",
-    ]
-    test_xformers_attention = False
+class Dummies:
    @property
    def text_embedder_hidden_size(self):
        return 32
@@ -84,7 +59,7 @@ class KandinskyImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    @property
    def cross_attention_dim(self):
-        return 100
+        return 32

    @property
    def dummy_tokenizer(self):
@@ -216,6 +191,41 @@ class KandinskyImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        }
        return inputs
class KandinskyImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = KandinskyImg2ImgPipeline
    params = ["prompt", "image_embeds", "negative_image_embeds", "image"]
    batch_params = [
        "prompt",
        "negative_prompt",
        "image_embeds",
        "negative_image_embeds",
        "image",
    ]
    required_optional_params = [
        "generator",
        "height",
        "width",
        "strength",
        "guidance_scale",
        "negative_prompt",
        "num_inference_steps",
        "return_dict",
        "guidance_scale",
        "num_images_per_prompt",
        "output_type",
        "return_dict",
    ]
    test_xformers_attention = False

    def get_dummy_components(self):
        dummies = Dummies()
        return dummies.get_dummy_components()

    def get_dummy_inputs(self, device, seed=0):
        dummies = Dummies()
        return dummies.get_dummy_inputs(device=device, seed=seed)
    def test_kandinsky_img2img(self):
        device = "cpu"
@@ -239,9 +249,7 @@ class KandinskyImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        assert image.shape == (1, 64, 64, 3)

-        expected_slice = np.array(
-            [0.61474943, 0.6073539, 0.43308544, 0.5928269, 0.47493595, 0.46755973, 0.4613838, 0.45368797, 0.50119233]
-        )
+        expected_slice = np.array([0.5816, 0.5872, 0.4634, 0.5982, 0.4767, 0.4710, 0.4669, 0.4717, 0.4966])
        assert (
            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
......
@@ -33,33 +33,7 @@ from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
enable_full_determinism()

-class KandinskyInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = KandinskyInpaintPipeline
-    params = ["prompt", "image_embeds", "negative_image_embeds", "image", "mask_image"]
-    batch_params = [
-        "prompt",
-        "negative_prompt",
-        "image_embeds",
-        "negative_image_embeds",
-        "image",
-        "mask_image",
-    ]
-    required_optional_params = [
-        "generator",
-        "height",
-        "width",
-        "latents",
-        "guidance_scale",
-        "negative_prompt",
-        "num_inference_steps",
-        "return_dict",
-        "guidance_scale",
-        "num_images_per_prompt",
-        "output_type",
-        "return_dict",
-    ]
-    test_xformers_attention = False
+class Dummies:
    @property
    def text_embedder_hidden_size(self):
        return 32
@@ -78,7 +52,7 @@ class KandinskyInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    @property
    def cross_attention_dim(self):
-        return 100
+        return 32

    @property
    def dummy_tokenizer(self):
@@ -189,8 +163,8 @@ class KandinskyInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        image = image.cpu().permute(0, 2, 3, 1)[0]
        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((256, 256))
        # create mask
-        mask = np.ones((64, 64), dtype=np.float32)
-        mask[:32, :32] = 0
+        mask = np.zeros((64, 64), dtype=np.float32)
+        mask[:32, :32] = 1

        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
@@ -211,6 +185,42 @@ class KandinskyInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        }
        return inputs
class KandinskyInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = KandinskyInpaintPipeline
    params = ["prompt", "image_embeds", "negative_image_embeds", "image", "mask_image"]
    batch_params = [
        "prompt",
        "negative_prompt",
        "image_embeds",
        "negative_image_embeds",
        "image",
        "mask_image",
    ]
    required_optional_params = [
        "generator",
        "height",
        "width",
        "latents",
        "guidance_scale",
        "negative_prompt",
        "num_inference_steps",
        "return_dict",
        "guidance_scale",
        "num_images_per_prompt",
        "output_type",
        "return_dict",
    ]
    test_xformers_attention = False

    def get_dummy_components(self):
        dummies = Dummies()
        return dummies.get_dummy_components()

    def get_dummy_inputs(self, device, seed=0):
        dummies = Dummies()
        return dummies.get_dummy_inputs(device=device, seed=seed)
    def test_kandinsky_inpaint(self):
        device = "cpu"
@@ -232,13 +242,9 @@ class KandinskyInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

-        print(f"image.shape {image.shape}")
        assert image.shape == (1, 64, 64, 3)

-        expected_slice = np.array(
-            [0.8326919, 0.73790467, 0.20918581, 0.9309612, 0.5511791, 0.43713328, 0.5513321, 0.49922934, 0.59497786]
-        )
+        expected_slice = np.array([0.8222, 0.8896, 0.4373, 0.8088, 0.4905, 0.2609, 0.6816, 0.4291, 0.5129])
        assert (
            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
@@ -296,8 +302,8 @@ class KandinskyInpaintPipelineIntegrationTests(unittest.TestCase):
        init_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png"
        )
-        mask = np.ones((768, 768), dtype=np.float32)
-        mask[:250, 250:-250] = 0
+        mask = np.zeros((768, 768), dtype=np.float32)
+        mask[:250, 250:-250] = 1

        prompt = "a hat"
......
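The flipped mask initialization in the tests above encodes the new inpainting mask convention (my reading of this PR: 1 now marks pixels to repaint and 0 pixels to keep, matching the other diffusers inpaint pipelines; the warning added by this PR points users at the change):

import numpy as np

mask = np.zeros((768, 768), dtype=np.float32)  # keep everything by default
mask[:250, 250:-250] = 1  # repaint only the top-center strip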
@@ -37,22 +37,7 @@ from ..test_pipelines_common import PipelineTesterMixin
enable_full_determinism()

-class KandinskyPriorPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = KandinskyPriorPipeline
-    params = ["prompt"]
-    batch_params = ["prompt", "negative_prompt"]
-    required_optional_params = [
-        "num_images_per_prompt",
-        "generator",
-        "num_inference_steps",
-        "latents",
-        "negative_prompt",
-        "guidance_scale",
-        "output_type",
-        "return_dict",
-    ]
-    test_xformers_attention = False
+class Dummies:
    @property
    def text_embedder_hidden_size(self):
        return 32
@@ -183,6 +168,31 @@ class KandinskyPriorPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        }
        return inputs
class KandinskyPriorPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = KandinskyPriorPipeline
    params = ["prompt"]
    batch_params = ["prompt", "negative_prompt"]
    required_optional_params = [
        "num_images_per_prompt",
        "generator",
        "num_inference_steps",
        "latents",
        "negative_prompt",
        "guidance_scale",
        "output_type",
        "return_dict",
    ]
    test_xformers_attention = False

    def get_dummy_components(self):
        dummy = Dummies()
        return dummy.get_dummy_components()

    def get_dummy_inputs(self, device, seed=0):
        dummy = Dummies()
        return dummy.get_dummy_inputs(device=device, seed=seed)
    def test_kandinsky_prior(self):
        device = "cpu"
......
@@ -30,28 +30,7 @@ from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
enable_full_determinism()

-class KandinskyV22PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = KandinskyV22Pipeline
-    params = [
-        "image_embeds",
-        "negative_image_embeds",
-    ]
-    batch_params = ["image_embeds", "negative_image_embeds"]
-    required_optional_params = [
-        "generator",
-        "height",
-        "width",
-        "latents",
-        "guidance_scale",
-        "num_inference_steps",
-        "return_dict",
-        "guidance_scale",
-        "num_images_per_prompt",
-        "output_type",
-        "return_dict",
-    ]
-    test_xformers_attention = False
+class Dummies:
    @property
    def text_embedder_hidden_size(self):
        return 32
@@ -70,7 +49,7 @@ class KandinskyV22PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    @property
    def cross_attention_dim(self):
-        return 100
+        return 32

    @property
    def dummy_unet(self):
@@ -166,6 +145,37 @@ class KandinskyV22PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        }
        return inputs
class KandinskyV22PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = KandinskyV22Pipeline
    params = [
        "image_embeds",
        "negative_image_embeds",
    ]
    batch_params = ["image_embeds", "negative_image_embeds"]
    required_optional_params = [
        "generator",
        "height",
        "width",
        "latents",
        "guidance_scale",
        "num_inference_steps",
        "return_dict",
        "guidance_scale",
        "num_images_per_prompt",
        "output_type",
        "return_dict",
    ]
    test_xformers_attention = False

    def get_dummy_inputs(self, device, seed=0):
        dummies = Dummies()
        return dummies.get_dummy_inputs(device=device, seed=seed)

    def get_dummy_components(self):
        dummies = Dummies()
        return dummies.get_dummy_components()
    def test_kandinsky(self):
        device = "cpu"
@@ -189,9 +199,7 @@ class KandinskyV22PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        assert image.shape == (1, 64, 64, 3)

-        expected_slice = np.array(
-            [0.6237976, 1.0, 0.36441332, 1.0, 0.70639634, 0.29877186, 0.85652125, 0.5216843, 0.54454046]
-        )
+        expected_slice = np.array([0.3420, 0.9505, 0.3919, 1.0000, 0.5188, 0.3109, 0.6139, 0.5624, 0.6811])
        assert (
            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
......
# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from diffusers import (
    KandinskyV22CombinedPipeline,
    KandinskyV22Img2ImgCombinedPipeline,
    KandinskyV22InpaintCombinedPipeline,
)
from diffusers.utils import torch_device
from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu
from ..test_pipelines_common import PipelineTesterMixin
from .test_kandinsky import Dummies
from .test_kandinsky_img2img import Dummies as Img2ImgDummies
from .test_kandinsky_inpaint import Dummies as InpaintDummies
from .test_kandinsky_prior import Dummies as PriorDummies
enable_full_determinism()
class KandinskyV22PipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = KandinskyV22CombinedPipeline
    params = [
        "prompt",
    ]
    batch_params = ["prompt", "negative_prompt"]
    required_optional_params = [
        "generator",
        "height",
        "width",
        "latents",
        "guidance_scale",
        "negative_prompt",
        "num_inference_steps",
        "return_dict",
        "guidance_scale",
        "num_images_per_prompt",
        "output_type",
        "return_dict",
    ]
    test_xformers_attention = False

    def get_dummy_components(self):
        dummy = Dummies()
        prior_dummy = PriorDummies()
        components = dummy.get_dummy_components()

        components.update({f"prior_{k}": v for k, v in prior_dummy.get_dummy_components().items()})
        return components

    def get_dummy_inputs(self, device, seed=0):
        prior_dummy = PriorDummies()
        inputs = prior_dummy.get_dummy_inputs(device=device, seed=seed)
        inputs.update(
            {
                "height": 64,
                "width": 64,
            }
        )
        return inputs

    def test_kandinsky(self):
        device = "cpu"

        components = self.get_dummy_components()

        pipe = self.pipeline_class(**components)
        pipe = pipe.to(device)

        pipe.set_progress_bar_config(disable=None)

        output = pipe(**self.get_dummy_inputs(device))
        image = output.images

        image_from_tuple = pipe(
            **self.get_dummy_inputs(device),
            return_dict=False,
        )[0]

        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)

        expected_slice = np.array([0.3013, 0.0471, 0.5176, 0.1817, 0.2566, 0.7076, 0.6712, 0.4421, 0.7503])

        assert (
            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
        assert (
            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"

    @require_torch_gpu
    def test_offloads(self):
        pipes = []
        components = self.get_dummy_components()
        sd_pipe = self.pipeline_class(**components).to(torch_device)
        pipes.append(sd_pipe)

        components = self.get_dummy_components()
        sd_pipe = self.pipeline_class(**components)
        sd_pipe.enable_model_cpu_offload()
        pipes.append(sd_pipe)

        components = self.get_dummy_components()
        sd_pipe = self.pipeline_class(**components)
        sd_pipe.enable_sequential_cpu_offload()
        pipes.append(sd_pipe)

        image_slices = []
        for pipe in pipes:
            inputs = self.get_dummy_inputs(torch_device)
            image = pipe(**inputs).images
            image_slices.append(image[0, -3:, -3:, -1].flatten())

        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
        assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3

    def test_inference_batch_single_identical(self):
        super().test_inference_batch_single_identical(expected_max_diff=1e-2)


class KandinskyV22PipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = KandinskyV22Img2ImgCombinedPipeline
    params = ["prompt", "image"]
    batch_params = ["prompt", "negative_prompt", "image"]
    required_optional_params = [
        "generator",
        "height",
        "width",
        "latents",
        "guidance_scale",
        "negative_prompt",
        "num_inference_steps",
        "return_dict",
        "guidance_scale",
        "num_images_per_prompt",
        "output_type",
        "return_dict",
    ]
    test_xformers_attention = False

    def get_dummy_components(self):
        dummy = Img2ImgDummies()
        prior_dummy = PriorDummies()
        components = dummy.get_dummy_components()

        components.update({f"prior_{k}": v for k, v in prior_dummy.get_dummy_components().items()})
        return components

    def get_dummy_inputs(self, device, seed=0):
        prior_dummy = PriorDummies()
        dummy = Img2ImgDummies()
        inputs = prior_dummy.get_dummy_inputs(device=device, seed=seed)
        inputs.update(dummy.get_dummy_inputs(device=device, seed=seed))
        inputs.pop("image_embeds")
        inputs.pop("negative_image_embeds")
        return inputs

    def test_kandinsky(self):
        device = "cpu"

        components = self.get_dummy_components()

        pipe = self.pipeline_class(**components)
        pipe = pipe.to(device)

        pipe.set_progress_bar_config(disable=None)

        output = pipe(**self.get_dummy_inputs(device))
        image = output.images

        image_from_tuple = pipe(
            **self.get_dummy_inputs(device),
            return_dict=False,
        )[0]

        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)

        expected_slice = np.array([0.4353, 0.4710, 0.5128, 0.4806, 0.5054, 0.5348, 0.5224, 0.4603, 0.5025])

        assert (
            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
        assert (
            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"

    @require_torch_gpu
    def test_offloads(self):
        pipes = []
        components = self.get_dummy_components()
        sd_pipe = self.pipeline_class(**components).to(torch_device)
        pipes.append(sd_pipe)

        components = self.get_dummy_components()
        sd_pipe = self.pipeline_class(**components)
        sd_pipe.enable_model_cpu_offload()
        pipes.append(sd_pipe)

        components = self.get_dummy_components()
        sd_pipe = self.pipeline_class(**components)
        sd_pipe.enable_sequential_cpu_offload()
        pipes.append(sd_pipe)

        image_slices = []
        for pipe in pipes:
            inputs = self.get_dummy_inputs(torch_device)
            image = pipe(**inputs).images
            image_slices.append(image[0, -3:, -3:, -1].flatten())

        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
        assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3

    def test_inference_batch_single_identical(self):
        super().test_inference_batch_single_identical(expected_max_diff=1e-2)


class KandinskyV22PipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = KandinskyV22InpaintCombinedPipeline
    params = ["prompt", "image", "mask_image"]
    batch_params = ["prompt", "negative_prompt", "image", "mask_image"]
    required_optional_params = [
        "generator",
        "height",
        "width",
        "latents",
        "guidance_scale",
        "negative_prompt",
        "num_inference_steps",
        "return_dict",
        "guidance_scale",
        "num_images_per_prompt",
        "output_type",
        "return_dict",
    ]
    test_xformers_attention = False

    def get_dummy_components(self):
        dummy = InpaintDummies()
        prior_dummy = PriorDummies()
        components = dummy.get_dummy_components()

        components.update({f"prior_{k}": v for k, v in prior_dummy.get_dummy_components().items()})
        return components

    def get_dummy_inputs(self, device, seed=0):
        prior_dummy = PriorDummies()
        dummy = InpaintDummies()
        inputs = prior_dummy.get_dummy_inputs(device=device, seed=seed)
        inputs.update(dummy.get_dummy_inputs(device=device, seed=seed))
        inputs.pop("image_embeds")
        inputs.pop("negative_image_embeds")
        return inputs

    def test_kandinsky(self):
        device = "cpu"

        components = self.get_dummy_components()

        pipe = self.pipeline_class(**components)
        pipe = pipe.to(device)

        pipe.set_progress_bar_config(disable=None)

        output = pipe(**self.get_dummy_inputs(device))
        image = output.images

        image_from_tuple = pipe(
            **self.get_dummy_inputs(device),
            return_dict=False,
        )[0]

        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)

        expected_slice = np.array([0.5039, 0.4926, 0.4898, 0.4978, 0.4838, 0.4942, 0.4738, 0.4702, 0.4816])

        assert (
            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
        ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
        assert (
            np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
        ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"

    @require_torch_gpu
    def test_offloads(self):
        pipes = []
        components = self.get_dummy_components()
        sd_pipe = self.pipeline_class(**components).to(torch_device)
        pipes.append(sd_pipe)

        components = self.get_dummy_components()
        sd_pipe = self.pipeline_class(**components)
        sd_pipe.enable_model_cpu_offload()
        pipes.append(sd_pipe)

        components = self.get_dummy_components()
        sd_pipe = self.pipeline_class(**components)
        sd_pipe.enable_sequential_cpu_offload()
        pipes.append(sd_pipe)

        image_slices = []
        for pipe in pipes:
            inputs = self.get_dummy_inputs(torch_device)
            image = pipe(**inputs).images
            image_slices.append(image[0, -3:, -3:, -1].flatten())

        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
        assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3

    def test_inference_batch_single_identical(self):
        super().test_inference_batch_single_identical(expected_max_diff=1e-2)
@@ -37,29 +37,7 @@ from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
enable_full_determinism()

-class KandinskyV22Img2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = KandinskyV22Img2ImgPipeline
-    params = ["image_embeds", "negative_image_embeds", "image"]
-    batch_params = [
-        "image_embeds",
-        "negative_image_embeds",
-        "image",
-    ]
-    required_optional_params = [
-        "generator",
-        "height",
-        "width",
-        "strength",
-        "guidance_scale",
-        "num_inference_steps",
-        "return_dict",
-        "guidance_scale",
-        "num_images_per_prompt",
-        "output_type",
-        "return_dict",
-    ]
-    test_xformers_attention = False
+class Dummies:
    @property
    def text_embedder_hidden_size(self):
        return 32
@@ -78,7 +56,7 @@ class KandinskyV22Img2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    @property
    def cross_attention_dim(self):
-        return 100
+        return 32

    @property
    def dummy_unet(self):
@@ -184,6 +162,38 @@ class KandinskyV22Img2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        }
        return inputs
class KandinskyV22Img2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = KandinskyV22Img2ImgPipeline
params = ["image_embeds", "negative_image_embeds", "image"]
batch_params = [
"image_embeds",
"negative_image_embeds",
"image",
]
required_optional_params = [
"generator",
"height",
"width",
"strength",
"guidance_scale",
"num_inference_steps",
"return_dict",
"guidance_scale",
"num_images_per_prompt",
"output_type",
"return_dict",
]
test_xformers_attention = False
def get_dummy_components(self):
dummies = Dummies()
return dummies.get_dummy_components()
def get_dummy_inputs(self, device, seed=0):
dummies = Dummies()
return dummies.get_dummy_inputs(device=device, seed=seed)
def test_kandinsky_img2img(self): def test_kandinsky_img2img(self):
device = "cpu" device = "cpu"
...@@ -207,9 +217,7 @@ class KandinskyV22Img2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCas ...@@ -207,9 +217,7 @@ class KandinskyV22Img2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCas
assert image.shape == (1, 64, 64, 3) assert image.shape == (1, 64, 64, 3)
expected_slice = np.array( expected_slice = np.array([0.5712, 0.5443, 0.4725, 0.6195, 0.5184, 0.4651, 0.4473, 0.4590, 0.5016])
[0.6199778, 0.63984406, 0.46145785, 0.62944984, 0.5622215, 0.47306132, 0.47441456, 0.4607606, 0.48719263]
)
assert ( assert (
np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
), f" expected_slice {expected_slice}, but got {image_slice.flatten()}" ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
......
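The hunk above is purely structural: the fixtures move out of the test class into a module-level `Dummies` helper so the new combined-pipeline tests can compose prior and decoder fixtures, and `cross_attention_dim` drops from 100 to 32 so the dummy decoder's attention width matches the 32-dim embeddings the dummy prior emits. A hedged sketch of the composition pattern, where `PriorDummies` and `Img2ImgDummies` stand in for the per-file `Dummies` classes:

# Hedged sketch of the fixture composition used by the combined tests.
def get_combined_dummy_components(prior_dummies, decoder_dummies):
    components = decoder_dummies.get_dummy_components()
    # Prior components are namespaced with "prior_" (e.g. prior_text_encoder),
    # mirroring the combined pipeline's __init__ arguments.
    components.update({f"prior_{k}": v for k, v in prior_dummies.get_dummy_components().items()})
    return components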
@@ -37,30 +37,7 @@ from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference

 enable_full_determinism()


-class KandinskyV22InpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = KandinskyV22InpaintPipeline
-    params = ["image_embeds", "negative_image_embeds", "image", "mask_image"]
-    batch_params = [
-        "image_embeds",
-        "negative_image_embeds",
-        "image",
-        "mask_image",
-    ]
-    required_optional_params = [
-        "generator",
-        "height",
-        "width",
-        "latents",
-        "guidance_scale",
-        "num_inference_steps",
-        "return_dict",
-        "guidance_scale",
-        "num_images_per_prompt",
-        "output_type",
-        "return_dict",
-    ]
-    test_xformers_attention = False
-
+class Dummies:
     @property
     def text_embedder_hidden_size(self):
         return 32
@@ -79,7 +56,7 @@ class KandinskyV22InpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
     @property
     def cross_attention_dim(self):
-        return 100
+        return 32

     @property
     def dummy_unet(self):
@@ -165,8 +142,8 @@ class KandinskyV22InpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
         image = image.cpu().permute(0, 2, 3, 1)[0]
         init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((256, 256))
         # create mask
-        mask = np.ones((64, 64), dtype=np.float32)
-        mask[:32, :32] = 0
+        mask = np.zeros((64, 64), dtype=np.float32)
+        mask[:32, :32] = 1

         if str(device).startswith("mps"):
             generator = torch.manual_seed(seed)
@@ -186,6 +163,39 @@ class KandinskyV22InpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
         }
         return inputs


+class KandinskyV22InpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = KandinskyV22InpaintPipeline
+    params = ["image_embeds", "negative_image_embeds", "image", "mask_image"]
+    batch_params = [
+        "image_embeds",
+        "negative_image_embeds",
+        "image",
+        "mask_image",
+    ]
+    required_optional_params = [
+        "generator",
+        "height",
+        "width",
+        "latents",
+        "guidance_scale",
+        "num_inference_steps",
+        "return_dict",
+        "guidance_scale",
+        "num_images_per_prompt",
+        "output_type",
+        "return_dict",
+    ]
+    test_xformers_attention = False
+
+    def get_dummy_components(self):
+        dummies = Dummies()
+        return dummies.get_dummy_components()
+
+    def get_dummy_inputs(self, device, seed=0):
+        dummies = Dummies()
+        return dummies.get_dummy_inputs(device=device, seed=seed)
+
     def test_kandinsky_inpaint(self):
         device = "cpu"
@@ -207,8 +217,6 @@ class KandinskyV22InpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
         image_slice = image[0, -3:, -3:, -1]
         image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

-        print(f"image.shape {image.shape}")
-
         assert image.shape == (1, 64, 64, 3)

         expected_slice = np.array(
@@ -244,8 +252,8 @@ class KandinskyV22InpaintPipelineIntegrationTests(unittest.TestCase):
         init_image = load_image(
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png"
         )
-        mask = np.ones((768, 768), dtype=np.float32)
-        mask[:250, 250:-250] = 0
+        mask = np.zeros((768, 768), dtype=np.float32)
+        mask[:250, 250:-250] = 1

         prompt = "a hat"
...
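The mask flip in the two hunks above is the user-facing part of the "fix kandinsky mask" change: Kandinsky inpainting now expects 1 (white) to mark the region to repaint and 0 to keep the original pixels, matching the other diffusers inpaint pipelines, where previously the convention was inverted. A minimal sketch of building a mask under the new convention; the shapes and the PIL conversion are illustrative assumptions:

# Hedged sketch: an inpaint mask under the new Kandinsky convention,
# where 1 (white) marks pixels to repaint and 0 keeps the original image.
import numpy as np
from PIL import Image

mask = np.zeros((768, 768), dtype=np.float32)
mask[:250, 250:-250] = 1  # repaint a strip at the top, e.g. to add "a hat"

# Callers migrating from the old convention can simply invert their masks:
old_style_mask = 1 - mask

# A float array in [0, 1] or an equivalent PIL image can be passed as mask_image.
mask_image = Image.fromarray((mask * 255).astype(np.uint8))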
@@ -37,22 +37,7 @@ from ..test_pipelines_common import PipelineTesterMixin

 enable_full_determinism()


-class KandinskyV22PriorPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = KandinskyV22PriorPipeline
-    params = ["prompt"]
-    batch_params = ["prompt", "negative_prompt"]
-    required_optional_params = [
-        "num_images_per_prompt",
-        "generator",
-        "num_inference_steps",
-        "latents",
-        "negative_prompt",
-        "guidance_scale",
-        "output_type",
-        "return_dict",
-    ]
-    test_xformers_attention = False
-
+class Dummies:
     @property
     def text_embedder_hidden_size(self):
         return 32
@@ -183,6 +168,31 @@ class KandinskyV22PriorPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
         }
         return inputs


+class KandinskyV22PriorPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = KandinskyV22PriorPipeline
+    params = ["prompt"]
+    batch_params = ["prompt", "negative_prompt"]
+    required_optional_params = [
+        "num_images_per_prompt",
+        "generator",
+        "num_inference_steps",
+        "latents",
+        "negative_prompt",
+        "guidance_scale",
+        "output_type",
+        "return_dict",
+    ]
+    test_xformers_attention = False
+
+    def get_dummy_components(self):
+        dummies = Dummies()
+        return dummies.get_dummy_components()
+
+    def get_dummy_inputs(self, device, seed=0):
+        dummies = Dummies()
+        return dummies.get_dummy_inputs(device=device, seed=seed)
+
     def test_kandinsky_prior(self):
         device = "cpu"
...
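The prior tests cover the first half of the usual two-stage Kandinsky flow, which the combined pipelines added in this PR collapse into a single call. A hedged sketch of both flows; the checkpoint ids and the ability to load the combined pipeline directly from the decoder repo are assumptions, not taken from this diff:

# Hedged sketch: manual two-stage flow vs. the combined pipeline from this PR.
import torch
from diffusers import (
    KandinskyV22CombinedPipeline,
    KandinskyV22Pipeline,
    KandinskyV22PriorPipeline,
)

prompt = "a portrait of a cat, 4k photo"

# Two-stage flow: the prior turns the prompt into image embeddings,
# then the decoder renders those embeddings into pixels.
prior = KandinskyV22PriorPipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
).to("cuda")
decoder = KandinskyV22Pipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
).to("cuda")
image_embeds, negative_image_embeds = prior(prompt).to_tuple()
image = decoder(image_embeds=image_embeds, negative_image_embeds=negative_image_embeds).images[0]

# Combined flow: one object owns both stages and accepts the prompt directly.
pipe = KandinskyV22CombinedPipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
).to("cuda")
image = pipe(prompt=prompt).images[0]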