"vscode:/vscode.git/clone" did not exist on "fe8a163986672bbbec1a922231be229cc79dafe6"
Unverified Commit fee93c81 authored by Dhruv Nair's avatar Dhruv Nair Committed by GitHub
Browse files

[Refactor] Update from single file (#6428)

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* up

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* up

* update

* update

* update

* update

* update'

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* clean

* update

* update

* clean up

* clean up

* update

* clean

* clean

* update

* update

* clean up

* fix docs

* update

* update

* Revert "update"

This reverts commit dbfb8f1ea9c61a2b4e02f926245be2b3d387e577.

* update

* update

* update

* update

* fix controlnet

* fix scheduler

* fix controlnet tests
parent 5308cce9
...@@ -30,8 +30,8 @@ To learn more about how to load single file weights, see the [Load different Sta ...@@ -30,8 +30,8 @@ To learn more about how to load single file weights, see the [Load different Sta
## FromOriginalVAEMixin ## FromOriginalVAEMixin
[[autodoc]] loaders.single_file.FromOriginalVAEMixin [[autodoc]] loaders.autoencoder.FromOriginalVAEMixin
## FromOriginalControlnetMixin ## FromOriginalControlnetMixin
[[autodoc]] loaders.single_file.FromOriginalControlnetMixin [[autodoc]] loaders.controlnet.FromOriginalControlNetMixin
\ No newline at end of file \ No newline at end of file
...@@ -54,12 +54,13 @@ if is_transformers_available(): ...@@ -54,12 +54,13 @@ if is_transformers_available():
_import_structure = {} _import_structure = {}
if is_torch_available(): if is_torch_available():
_import_structure["single_file"] = ["FromOriginalControlnetMixin", "FromOriginalVAEMixin"] _import_structure["autoencoder"] = ["FromOriginalVAEMixin"]
_import_structure["controlnet"] = ["FromOriginalControlNetMixin"]
_import_structure["unet"] = ["UNet2DConditionLoadersMixin"] _import_structure["unet"] = ["UNet2DConditionLoadersMixin"]
_import_structure["utils"] = ["AttnProcsLayers"] _import_structure["utils"] = ["AttnProcsLayers"]
if is_transformers_available(): if is_transformers_available():
_import_structure["single_file"].extend(["FromSingleFileMixin"]) _import_structure["single_file"] = ["FromSingleFileMixin"]
_import_structure["lora"] = ["LoraLoaderMixin", "StableDiffusionXLLoraLoaderMixin"] _import_structure["lora"] = ["LoraLoaderMixin", "StableDiffusionXLLoraLoaderMixin"]
_import_structure["textual_inversion"] = ["TextualInversionLoaderMixin"] _import_structure["textual_inversion"] = ["TextualInversionLoaderMixin"]
_import_structure["ip_adapter"] = ["IPAdapterMixin"] _import_structure["ip_adapter"] = ["IPAdapterMixin"]
...@@ -69,7 +70,8 @@ _import_structure["peft"] = ["PeftAdapterMixin"] ...@@ -69,7 +70,8 @@ _import_structure["peft"] = ["PeftAdapterMixin"]
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
if is_torch_available(): if is_torch_available():
from .single_file import FromOriginalControlnetMixin, FromOriginalVAEMixin from .autoencoder import FromOriginalVAEMixin
from .controlnet import FromOriginalControlNetMixin
from .unet import UNet2DConditionLoadersMixin from .unet import UNet2DConditionLoadersMixin
from .utils import AttnProcsLayers from .utils import AttnProcsLayers
......
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from huggingface_hub.utils import validate_hf_hub_args
from .single_file_utils import (
create_diffusers_vae_model_from_ldm,
fetch_ldm_config_and_checkpoint,
)
class FromOriginalVAEMixin:
    """
    Load pretrained AutoencoderKL weights saved in the `.ckpt` or `.safetensors` format into an [`AutoencoderKL`].
    """

    @classmethod
    @validate_hf_hub_args
    def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
        r"""
        Instantiate an [`AutoencoderKL`] from pretrained AutoencoderKL weights saved in the original `.ckpt` or
        `.safetensors` format. The model is set in evaluation mode (`model.eval()`) by default.

        Parameters:
            pretrained_model_link_or_path (`str` or `os.PathLike`, *optional*):
                Can be either:
                    - A link to the `.ckpt` file (for example
                      `"https://huggingface.co/<repo_id>/blob/main/<path_to_file>.ckpt"`) on the Hub.
                    - A path to a *file* containing all pipeline weights.
            torch_dtype (`str` or `torch.dtype`, *optional*):
                Override the default `torch.dtype` and load the model with another dtype. If `"auto"` is passed, the
                dtype is automatically derived from the model's weights.
            force_download (`bool`, *optional*, defaults to `False`):
                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
                cached versions if they exist.
            cache_dir (`Union[str, os.PathLike]`, *optional*):
                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
                is not used.
            resume_download (`bool`, *optional*, defaults to `False`):
                Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
                incompletely downloaded files are deleted.
            proxies (`Dict[str, str]`, *optional*):
                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
            local_files_only (`bool`, *optional*):
                Whether to only load local model weights and configuration files or not. If set to `True`, the model
                won't be downloaded from the Hub.
            token (`str` or *bool*, *optional*):
                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
                `diffusers-cli login` (stored in `~/.huggingface`) is used.
            revision (`str`, *optional*, defaults to `"main"`):
                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
                allowed by Git.
            image_size (`int`, *optional*, defaults to 512):
                The image size the model was trained on. Use 512 for all Stable Diffusion v1 models and the Stable
                Diffusion v2 base model. Use 768 for Stable Diffusion v2.
            use_safetensors (`bool`, *optional*, defaults to `True`):
                If set to `True` (the default), the model is loaded from safetensors weights when they are available.
                If set to `False`, safetensors weights are not loaded.
            kwargs (remaining dictionary of keyword arguments, *optional*):
                Can be used to overwrite load and saveable variables (for example the pipeline components of the
                specific pipeline class). The overwritten components are directly passed to the pipelines `__init__`
                method. See example below for more information.

        <Tip warning={true}>

        Make sure to pass both `image_size` and `scaling_factor` to `from_single_file()` if you're loading
        a VAE from SDXL or a Stable Diffusion v2 model or higher.

        </Tip>

        Examples:

        ```py
        from diffusers import AutoencoderKL

        url = "https://huggingface.co/stabilityai/sd-vae-ft-mse-original/blob/main/vae-ft-mse-840000-ema-pruned.safetensors"  # can also be local file
        model = AutoencoderKL.from_single_file(url)
        ```
        """
        # Options consumed by the Hub download / checkpoint-fetch step.
        original_config_file = kwargs.pop("original_config_file", None)
        resume_download = kwargs.pop("resume_download", False)
        force_download = kwargs.pop("force_download", False)
        proxies = kwargs.pop("proxies", None)
        token = kwargs.pop("token", None)
        cache_dir = kwargs.pop("cache_dir", None)
        local_files_only = kwargs.pop("local_files_only", None)
        revision = kwargs.pop("revision", None)
        torch_dtype = kwargs.pop("torch_dtype", None)
        use_safetensors = kwargs.pop("use_safetensors", True)

        class_name = cls.__name__
        original_config, checkpoint = fetch_ldm_config_and_checkpoint(
            pretrained_model_link_or_path=pretrained_model_link_or_path,
            class_name=class_name,
            original_config_file=original_config_file,
            resume_download=resume_download,
            force_download=force_download,
            proxies=proxies,
            token=token,
            revision=revision,
            local_files_only=local_files_only,
            use_safetensors=use_safetensors,
            cache_dir=cache_dir,
        )

        image_size = kwargs.pop("image_size", None)
        # NOTE(review): the Tip above mentions `scaling_factor`, but it is never popped or
        # forwarded here — confirm whether `create_diffusers_vae_model_from_ldm` derives it
        # from `original_config`, or whether it should be passed through explicitly.
        component = create_diffusers_vae_model_from_ldm(class_name, original_config, checkpoint, image_size=image_size)
        vae = component["vae"]
        if torch_dtype is not None:
            vae = vae.to(torch_dtype)

        return vae
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from huggingface_hub.utils import validate_hf_hub_args
from .single_file_utils import (
create_diffusers_controlnet_model_from_ldm,
fetch_ldm_config_and_checkpoint,
)
class FromOriginalControlNetMixin:
    """
    Load pretrained ControlNet weights saved in the `.ckpt` or `.safetensors` format into a [`ControlNetModel`].
    """

    @classmethod
    @validate_hf_hub_args
    def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
        r"""
        Instantiate a [`ControlNetModel`] from pretrained ControlNet weights saved in the original `.ckpt` or
        `.safetensors` format. The pipeline is set in evaluation mode (`model.eval()`) by default.

        Parameters:
            pretrained_model_link_or_path (`str` or `os.PathLike`, *optional*):
                Can be either:
                    - A link to the `.ckpt` file (for example
                      `"https://huggingface.co/<repo_id>/blob/main/<path_to_file>.ckpt"`) on the Hub.
                    - A path to a *file* containing all pipeline weights.
            torch_dtype (`str` or `torch.dtype`, *optional*):
                Override the default `torch.dtype` and load the model with another dtype. If `"auto"` is passed, the
                dtype is automatically derived from the model's weights.
            force_download (`bool`, *optional*, defaults to `False`):
                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
                cached versions if they exist.
            cache_dir (`Union[str, os.PathLike]`, *optional*):
                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
                is not used.
            resume_download (`bool`, *optional*, defaults to `False`):
                Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
                incompletely downloaded files are deleted.
            proxies (`Dict[str, str]`, *optional*):
                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
            local_files_only (`bool`, *optional*):
                Whether to only load local model weights and configuration files or not. If set to `True`, the model
                won't be downloaded from the Hub.
            token (`str` or *bool*, *optional*):
                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
                `diffusers-cli login` (stored in `~/.huggingface`) is used.
            revision (`str`, *optional*, defaults to `"main"`):
                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
                allowed by Git.
            use_safetensors (`bool`, *optional*, defaults to `True`):
                If set to `True` (the default), the model is loaded from safetensors weights when they are available.
                If set to `False`, safetensors weights are not loaded.
            image_size (`int`, *optional*, defaults to 512):
                The image size the model was trained on. Use 512 for all Stable Diffusion v1 models and the Stable
                Diffusion v2 base model. Use 768 for Stable Diffusion v2.
            upcast_attention (`bool`, *optional*, defaults to `None`):
                Whether the attention computation should always be upcasted.
            kwargs (remaining dictionary of keyword arguments, *optional*):
                Can be used to overwrite load and saveable variables (for example the pipeline components of the
                specific pipeline class). The overwritten components are directly passed to the pipelines `__init__`
                method. See example below for more information.

        Examples:

        ```py
        from diffusers import StableDiffusionControlNetPipeline, ControlNetModel

        url = "https://huggingface.co/lllyasviel/ControlNet-v1-1/blob/main/control_v11p_sd15_canny.pth"  # can also be a local path
        model = ControlNetModel.from_single_file(url)

        url = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned.safetensors"  # can also be a local path
        pipe = StableDiffusionControlNetPipeline.from_single_file(url, controlnet=controlnet)
        ```
        """
        # Pull every option this loader understands out of `kwargs` up front;
        # popping order has no observable effect since each key is independent.
        config_file = kwargs.pop("original_config_file", None)
        dl_resume = kwargs.pop("resume_download", False)
        dl_force = kwargs.pop("force_download", False)
        dl_proxies = kwargs.pop("proxies", None)
        hub_token = kwargs.pop("token", None)
        hub_cache_dir = kwargs.pop("cache_dir", None)
        offline_only = kwargs.pop("local_files_only", None)
        hub_revision = kwargs.pop("revision", None)
        dtype = kwargs.pop("torch_dtype", None)
        prefer_safetensors = kwargs.pop("use_safetensors", True)
        upcast_attn = kwargs.pop("upcast_attention", False)
        target_image_size = kwargs.pop("image_size", None)

        # Resolve the original LDM config and raw checkpoint (downloading if needed).
        original_config, checkpoint = fetch_ldm_config_and_checkpoint(
            pretrained_model_link_or_path=pretrained_model_link_or_path,
            class_name=cls.__name__,
            original_config_file=config_file,
            resume_download=dl_resume,
            force_download=dl_force,
            proxies=dl_proxies,
            token=hub_token,
            revision=hub_revision,
            local_files_only=offline_only,
            use_safetensors=prefer_safetensors,
            cache_dir=hub_cache_dir,
        )

        # Convert the LDM-format weights into a diffusers ControlNetModel.
        model = create_diffusers_controlnet_model_from_ldm(
            cls.__name__,
            original_config,
            checkpoint,
            upcast_attention=upcast_attn,
            image_size=target_image_size,
        )["controlnet"]

        if dtype is not None:
            model = model.to(dtype)

        return model
This diff is collapsed.
This diff is collapsed.
...@@ -17,7 +17,6 @@ import torch ...@@ -17,7 +17,6 @@ import torch
import torch.nn as nn import torch.nn as nn
from ...configuration_utils import ConfigMixin, register_to_config from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import FromOriginalVAEMixin
from ...utils import is_torch_version from ...utils import is_torch_version
from ...utils.accelerate_utils import apply_forward_hook from ...utils.accelerate_utils import apply_forward_hook
from ..attention_processor import CROSS_ATTENTION_PROCESSORS, AttentionProcessor, AttnProcessor from ..attention_processor import CROSS_ATTENTION_PROCESSORS, AttentionProcessor, AttnProcessor
...@@ -162,7 +161,7 @@ class TemporalDecoder(nn.Module): ...@@ -162,7 +161,7 @@ class TemporalDecoder(nn.Module):
return sample return sample
class AutoencoderKLTemporalDecoder(ModelMixin, ConfigMixin, FromOriginalVAEMixin): class AutoencoderKLTemporalDecoder(ModelMixin, ConfigMixin):
r""" r"""
A VAE model with KL loss for encoding images into latents and decoding latent representations into images. A VAE model with KL loss for encoding images into latents and decoding latent representations into images.
......
...@@ -19,7 +19,7 @@ from torch import nn ...@@ -19,7 +19,7 @@ from torch import nn
from torch.nn import functional as F from torch.nn import functional as F
from ..configuration_utils import ConfigMixin, register_to_config from ..configuration_utils import ConfigMixin, register_to_config
from ..loaders import FromOriginalControlnetMixin from ..loaders import FromOriginalControlNetMixin
from ..utils import BaseOutput, logging from ..utils import BaseOutput, logging
from .attention_processor import ( from .attention_processor import (
ADDED_KV_ATTENTION_PROCESSORS, ADDED_KV_ATTENTION_PROCESSORS,
...@@ -108,7 +108,7 @@ class ControlNetConditioningEmbedding(nn.Module): ...@@ -108,7 +108,7 @@ class ControlNetConditioningEmbedding(nn.Module):
return embedding return embedding
class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin): class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlNetMixin):
""" """
A ControlNet model. A ControlNet model.
......
...@@ -32,6 +32,7 @@ from .. import __version__ ...@@ -32,6 +32,7 @@ from .. import __version__
from ..utils import ( from ..utils import (
CONFIG_NAME, CONFIG_NAME,
FLAX_WEIGHTS_NAME, FLAX_WEIGHTS_NAME,
SAFETENSORS_FILE_EXTENSION,
SAFETENSORS_WEIGHTS_NAME, SAFETENSORS_WEIGHTS_NAME,
WEIGHTS_NAME, WEIGHTS_NAME,
_add_variant, _add_variant,
...@@ -102,10 +103,11 @@ def load_state_dict(checkpoint_file: Union[str, os.PathLike], variant: Optional[ ...@@ -102,10 +103,11 @@ def load_state_dict(checkpoint_file: Union[str, os.PathLike], variant: Optional[
Reads a checkpoint file, returning properly formatted errors if they arise. Reads a checkpoint file, returning properly formatted errors if they arise.
""" """
try: try:
if os.path.basename(checkpoint_file) == _add_variant(WEIGHTS_NAME, variant): file_extension = os.path.basename(checkpoint_file).split(".")[-1]
return torch.load(checkpoint_file, map_location="cpu") if file_extension == SAFETENSORS_FILE_EXTENSION:
else:
return safetensors.torch.load_file(checkpoint_file, device="cpu") return safetensors.torch.load_file(checkpoint_file, device="cpu")
else:
return torch.load(checkpoint_file, map_location="cpu")
except Exception as e: except Exception as e:
try: try:
with open(checkpoint_file) as f: with open(checkpoint_file) as f:
......
...@@ -351,7 +351,7 @@ def get_class_obj_and_candidates( ...@@ -351,7 +351,7 @@ def get_class_obj_and_candidates(
def _get_pipeline_class( def _get_pipeline_class(
class_obj, class_obj,
config, config=None,
load_connected_pipeline=False, load_connected_pipeline=False,
custom_pipeline=None, custom_pipeline=None,
repo_id=None, repo_id=None,
...@@ -389,7 +389,12 @@ def _get_pipeline_class( ...@@ -389,7 +389,12 @@ def _get_pipeline_class(
return class_obj return class_obj
diffusers_module = importlib.import_module(class_obj.__module__.split(".")[0]) diffusers_module = importlib.import_module(class_obj.__module__.split(".")[0])
class_name = config["_class_name"] class_name = class_name or config["_class_name"]
if not class_name:
raise ValueError(
"The class name could not be found in the configuration file. Please make sure to pass the correct `class_name`."
)
class_name = class_name[4:] if class_name.startswith("Flax") else class_name class_name = class_name[4:] if class_name.startswith("Flax") else class_name
pipeline_cls = getattr(diffusers_module, class_name) pipeline_cls = getattr(diffusers_module, class_name)
......
...@@ -28,6 +28,7 @@ from .constants import ( ...@@ -28,6 +28,7 @@ from .constants import (
MIN_PEFT_VERSION, MIN_PEFT_VERSION,
ONNX_EXTERNAL_WEIGHTS_NAME, ONNX_EXTERNAL_WEIGHTS_NAME,
ONNX_WEIGHTS_NAME, ONNX_WEIGHTS_NAME,
SAFETENSORS_FILE_EXTENSION,
SAFETENSORS_WEIGHTS_NAME, SAFETENSORS_WEIGHTS_NAME,
USE_PEFT_BACKEND, USE_PEFT_BACKEND,
WEIGHTS_NAME, WEIGHTS_NAME,
......
...@@ -31,6 +31,7 @@ WEIGHTS_NAME = "diffusion_pytorch_model.bin" ...@@ -31,6 +31,7 @@ WEIGHTS_NAME = "diffusion_pytorch_model.bin"
FLAX_WEIGHTS_NAME = "diffusion_flax_model.msgpack" FLAX_WEIGHTS_NAME = "diffusion_flax_model.msgpack"
ONNX_WEIGHTS_NAME = "model.onnx" ONNX_WEIGHTS_NAME = "model.onnx"
SAFETENSORS_WEIGHTS_NAME = "diffusion_pytorch_model.safetensors" SAFETENSORS_WEIGHTS_NAME = "diffusion_pytorch_model.safetensors"
SAFETENSORS_FILE_EXTENSION = "safetensors"
ONNX_EXTERNAL_WEIGHTS_NAME = "weights.pb" ONNX_EXTERNAL_WEIGHTS_NAME = "weights.pb"
HUGGINGFACE_CO_RESOLVE_ENDPOINT = os.environ.get("HF_ENDPOINT", "https://huggingface.co") HUGGINGFACE_CO_RESOLVE_ENDPOINT = os.environ.get("HF_ENDPOINT", "https://huggingface.co")
DIFFUSERS_DYNAMIC_MODULE_NAME = "diffusers_modules" DIFFUSERS_DYNAMIC_MODULE_NAME = "diffusers_modules"
......
...@@ -244,15 +244,15 @@ def _get_model_file( ...@@ -244,15 +244,15 @@ def _get_model_file(
pretrained_model_name_or_path: Union[str, Path], pretrained_model_name_or_path: Union[str, Path],
*, *,
weights_name: str, weights_name: str,
subfolder: Optional[str], subfolder: Optional[str] = None,
cache_dir: Optional[str], cache_dir: Optional[str] = None,
force_download: bool, force_download: bool = False,
proxies: Optional[Dict], proxies: Optional[Dict] = None,
resume_download: bool, resume_download: bool = False,
local_files_only: bool, local_files_only: bool = False,
token: Optional[str], token: Optional[str] = None,
user_agent: Union[Dict, str, None], user_agent: Optional[Union[Dict, str]] = None,
revision: Optional[str], revision: Optional[str] = None,
commit_hash: Optional[str] = None, commit_hash: Optional[str] = None,
): ):
pretrained_model_name_or_path = str(pretrained_model_name_or_path) pretrained_model_name_or_path = str(pretrained_model_name_or_path)
......
...@@ -37,6 +37,7 @@ from diffusers.utils.testing_utils import ( ...@@ -37,6 +37,7 @@ from diffusers.utils.testing_utils import (
enable_full_determinism, enable_full_determinism,
load_image, load_image,
load_numpy, load_numpy,
numpy_cosine_similarity_distance,
require_python39_or_higher, require_python39_or_higher,
require_torch_2, require_torch_2,
require_torch_gpu, require_torch_gpu,
...@@ -1022,39 +1023,49 @@ class ControlNetPipelineSlowTests(unittest.TestCase): ...@@ -1022,39 +1023,49 @@ class ControlNetPipelineSlowTests(unittest.TestCase):
def test_load_local(self): def test_load_local(self):
controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny") controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny")
pipe_1 = StableDiffusionControlNetPipeline.from_pretrained( pipe = StableDiffusionControlNetPipeline.from_pretrained(
"runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
) )
pipe.unet.set_default_attn_processor()
pipe.enable_model_cpu_offload()
controlnet = ControlNetModel.from_single_file( controlnet = ControlNetModel.from_single_file(
"https://huggingface.co/lllyasviel/ControlNet-v1-1/blob/main/control_v11p_sd15_canny.pth" "https://huggingface.co/lllyasviel/ControlNet-v1-1/blob/main/control_v11p_sd15_canny.pth"
) )
pipe_2 = StableDiffusionControlNetPipeline.from_single_file( pipe_sf = StableDiffusionControlNetPipeline.from_single_file(
"https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors", "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors",
safety_checker=None, safety_checker=None,
controlnet=controlnet, controlnet=controlnet,
scheduler_type="pndm",
) )
pipes = [pipe_1, pipe_2] pipe_sf.unet.set_default_attn_processor()
images = [] pipe_sf.enable_model_cpu_offload()
for pipe in pipes: control_image = load_image(
pipe.enable_model_cpu_offload()
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)
prompt = "bird"
image = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
) ).resize((512, 512))
prompt = "bird"
output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3) generator = torch.Generator(device="cpu").manual_seed(0)
images.append(output.images[0]) output = pipe(
prompt,
image=control_image,
generator=generator,
output_type="np",
num_inference_steps=3,
).images[0]
del pipe generator = torch.Generator(device="cpu").manual_seed(0)
gc.collect() output_sf = pipe_sf(
torch.cuda.empty_cache() prompt,
image=control_image,
generator=generator,
output_type="np",
num_inference_steps=3,
).images[0]
assert np.abs(images[0] - images[1]).max() < 1e-3 max_diff = numpy_cosine_similarity_distance(output_sf.flatten(), output.flatten())
assert max_diff < 1e-3
@slow @slow
......
...@@ -39,6 +39,7 @@ from diffusers.utils.testing_utils import ( ...@@ -39,6 +39,7 @@ from diffusers.utils.testing_utils import (
enable_full_determinism, enable_full_determinism,
floats_tensor, floats_tensor,
load_numpy, load_numpy,
numpy_cosine_similarity_distance,
require_torch_gpu, require_torch_gpu,
slow, slow,
torch_device, torch_device,
...@@ -421,33 +422,33 @@ class ControlNetImg2ImgPipelineSlowTests(unittest.TestCase): ...@@ -421,33 +422,33 @@ class ControlNetImg2ImgPipelineSlowTests(unittest.TestCase):
def test_load_local(self): def test_load_local(self):
controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny") controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny")
pipe_1 = StableDiffusionControlNetImg2ImgPipeline.from_pretrained( pipe = StableDiffusionControlNetImg2ImgPipeline.from_pretrained(
"runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
) )
pipe.unet.set_default_attn_processor()
pipe.enable_model_cpu_offload()
controlnet = ControlNetModel.from_single_file( controlnet = ControlNetModel.from_single_file(
"https://huggingface.co/lllyasviel/ControlNet-v1-1/blob/main/control_v11p_sd15_canny.pth" "https://huggingface.co/lllyasviel/ControlNet-v1-1/blob/main/control_v11p_sd15_canny.pth"
) )
pipe_2 = StableDiffusionControlNetImg2ImgPipeline.from_single_file( pipe_sf = StableDiffusionControlNetImg2ImgPipeline.from_single_file(
"https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors", "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors",
safety_checker=None, safety_checker=None,
controlnet=controlnet, controlnet=controlnet,
scheduler_type="pndm",
) )
pipe_sf.unet.set_default_attn_processor()
pipe_sf.enable_model_cpu_offload()
control_image = load_image( control_image = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
).resize((512, 512)) ).resize((512, 512))
image = load_image( image = load_image(
"https://huggingface.co/lllyasviel/sd-controlnet-canny/resolve/main/images/bird.png" "https://huggingface.co/lllyasviel/sd-controlnet-canny/resolve/main/images/bird.png"
).resize((512, 512)) ).resize((512, 512))
prompt = "bird"
pipes = [pipe_1, pipe_2]
images = []
for pipe in pipes:
pipe.enable_model_cpu_offload()
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0) generator = torch.Generator(device="cpu").manual_seed(0)
prompt = "bird"
output = pipe( output = pipe(
prompt, prompt,
image=image, image=image,
...@@ -456,11 +457,18 @@ class ControlNetImg2ImgPipelineSlowTests(unittest.TestCase): ...@@ -456,11 +457,18 @@ class ControlNetImg2ImgPipelineSlowTests(unittest.TestCase):
generator=generator, generator=generator,
output_type="np", output_type="np",
num_inference_steps=3, num_inference_steps=3,
) ).images[0]
images.append(output.images[0])
del pipe generator = torch.Generator(device="cpu").manual_seed(0)
gc.collect() output_sf = pipe_sf(
torch.cuda.empty_cache() prompt,
image=image,
control_image=control_image,
strength=0.9,
generator=generator,
output_type="np",
num_inference_steps=3,
).images[0]
assert np.abs(images[0] - images[1]).max() < 1e-3 max_diff = numpy_cosine_similarity_distance(output_sf.flatten(), output.flatten())
assert max_diff < 1e-3
...@@ -569,6 +569,7 @@ class ControlNetInpaintPipelineSlowTests(unittest.TestCase): ...@@ -569,6 +569,7 @@ class ControlNetInpaintPipelineSlowTests(unittest.TestCase):
"https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors", "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors",
safety_checker=None, safety_checker=None,
controlnet=controlnet, controlnet=controlnet,
scheduler_type="pndm",
) )
control_image = load_image( control_image = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
...@@ -605,4 +606,5 @@ class ControlNetInpaintPipelineSlowTests(unittest.TestCase): ...@@ -605,4 +606,5 @@ class ControlNetInpaintPipelineSlowTests(unittest.TestCase):
gc.collect() gc.collect()
torch.cuda.empty_cache() torch.cuda.empty_cache()
assert np.abs(images[0] - images[1]).max() < 1e-3 max_diff = numpy_cosine_similarity_distance(images[0].flatten(), images[1].flatten())
assert max_diff < 1e-3
...@@ -31,7 +31,14 @@ from diffusers import ( ...@@ -31,7 +31,14 @@ from diffusers import (
from diffusers.models.unets.unet_2d_blocks import UNetMidBlock2D from diffusers.models.unets.unet_2d_blocks import UNetMidBlock2D
from diffusers.pipelines.controlnet.pipeline_controlnet import MultiControlNetModel from diffusers.pipelines.controlnet.pipeline_controlnet import MultiControlNetModel
from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.testing_utils import enable_full_determinism, load_image, require_torch_gpu, slow, torch_device from diffusers.utils.testing_utils import (
enable_full_determinism,
load_image,
numpy_cosine_similarity_distance,
require_torch_gpu,
slow,
torch_device,
)
from diffusers.utils.torch_utils import randn_tensor from diffusers.utils.torch_utils import randn_tensor
from ..pipeline_params import ( from ..pipeline_params import (
...@@ -819,6 +826,41 @@ class ControlNetSDXLPipelineSlowTests(unittest.TestCase): ...@@ -819,6 +826,41 @@ class ControlNetSDXLPipelineSlowTests(unittest.TestCase):
expected_image = np.array([0.4399, 0.5112, 0.5478, 0.4314, 0.472, 0.4823, 0.4647, 0.4957, 0.4853]) expected_image = np.array([0.4399, 0.5112, 0.5478, 0.4314, 0.472, 0.4823, 0.4647, 0.4957, 0.4853])
assert np.allclose(original_image, expected_image, atol=1e-04) assert np.allclose(original_image, expected_image, atol=1e-04)
def test_download_ckpt_diff_format_is_same(self):
controlnet = ControlNetModel.from_pretrained("diffusers/controlnet-depth-sdxl-1.0", torch_dtype=torch.float16)
single_file_url = (
"https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors"
)
pipe_single_file = StableDiffusionXLControlNetPipeline.from_single_file(
single_file_url, controlnet=controlnet, torch_dtype=torch.float16
)
pipe_single_file.unet.set_default_attn_processor()
pipe_single_file.enable_model_cpu_offload()
pipe_single_file.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)
prompt = "Stormtrooper's lecture"
image = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth.png"
)
single_file_images = pipe_single_file(
prompt, image=image, generator=generator, output_type="np", num_inference_steps=2
).images
generator = torch.Generator(device="cpu").manual_seed(0)
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
"stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16
)
pipe.unet.set_default_attn_processor()
pipe.enable_model_cpu_offload()
images = pipe(prompt, image=image, generator=generator, output_type="np", num_inference_steps=2).images
assert images[0].shape == (512, 512, 3)
assert single_file_images[0].shape == (512, 512, 3)
max_diff = numpy_cosine_similarity_distance(images[0].flatten(), single_file_images[0].flatten())
assert max_diff < 5e-2
class StableDiffusionSSD1BControlNetPipelineFastTests(StableDiffusionXLControlNetPipelineFastTests): class StableDiffusionSSD1BControlNetPipelineFastTests(StableDiffusionXLControlNetPipelineFastTests):
def test_controlnet_sdxl_guess(self): def test_controlnet_sdxl_guess(self):
......
...@@ -1262,13 +1262,13 @@ class StableDiffusionPipelineCkptTests(unittest.TestCase): ...@@ -1262,13 +1262,13 @@ class StableDiffusionPipelineCkptTests(unittest.TestCase):
def test_download_ckpt_diff_format_is_same(self): def test_download_ckpt_diff_format_is_same(self):
ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt" ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt"
pipe = StableDiffusionPipeline.from_single_file(ckpt_path) sf_pipe = StableDiffusionPipeline.from_single_file(ckpt_path)
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) sf_pipe.scheduler = DDIMScheduler.from_config(sf_pipe.scheduler.config)
pipe.unet.set_attn_processor(AttnProcessor()) sf_pipe.unet.set_attn_processor(AttnProcessor())
pipe.to("cuda") sf_pipe.to("cuda")
generator = torch.Generator(device="cpu").manual_seed(0) generator = torch.Generator(device="cpu").manual_seed(0)
image_ckpt = pipe("a turtle", num_inference_steps=2, generator=generator, output_type="np").images[0] image_single_file = sf_pipe("a turtle", num_inference_steps=2, generator=generator, output_type="np").images[0]
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
...@@ -1278,7 +1278,7 @@ class StableDiffusionPipelineCkptTests(unittest.TestCase): ...@@ -1278,7 +1278,7 @@ class StableDiffusionPipelineCkptTests(unittest.TestCase):
generator = torch.Generator(device="cpu").manual_seed(0) generator = torch.Generator(device="cpu").manual_seed(0)
image = pipe("a turtle", num_inference_steps=2, generator=generator, output_type="np").images[0] image = pipe("a turtle", num_inference_steps=2, generator=generator, output_type="np").images[0]
max_diff = numpy_cosine_similarity_distance(image.flatten(), image_ckpt.flatten()) max_diff = numpy_cosine_similarity_distance(image.flatten(), image_single_file.flatten())
assert max_diff < 1e-3 assert max_diff < 1e-3
......
...@@ -43,6 +43,7 @@ from diffusers.utils.testing_utils import ( ...@@ -43,6 +43,7 @@ from diffusers.utils.testing_utils import (
load_image, load_image,
load_numpy, load_numpy,
nightly, nightly,
numpy_cosine_similarity_distance,
require_python39_or_higher, require_python39_or_higher,
require_torch_2, require_torch_2,
require_torch_gpu, require_torch_gpu,
...@@ -771,7 +772,9 @@ class StableDiffusionInpaintPipelineSlowTests(unittest.TestCase): ...@@ -771,7 +772,9 @@ class StableDiffusionInpaintPipelineSlowTests(unittest.TestCase):
inputs["num_inference_steps"] = 5 inputs["num_inference_steps"] = 5
image = pipe(**inputs).images[0] image = pipe(**inputs).images[0]
assert np.max(np.abs(image - image_ckpt)) < 5e-4 max_diff = numpy_cosine_similarity_distance(image.flatten(), image_ckpt.flatten())
assert max_diff < 1e-4
@slow @slow
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
# limitations under the License. # limitations under the License.
import copy import copy
import gc
import tempfile import tempfile
import unittest import unittest
...@@ -1024,6 +1025,11 @@ class StableDiffusionXLPipelineFastTests( ...@@ -1024,6 +1025,11 @@ class StableDiffusionXLPipelineFastTests(
@slow @slow
class StableDiffusionXLPipelineIntegrationTests(unittest.TestCase): class StableDiffusionXLPipelineIntegrationTests(unittest.TestCase):
def tearDown(self):
    # Integration tests load full SDXL pipelines onto the GPU; after each test
    # run the Python GC and release cached CUDA allocations so subsequent
    # tests do not fail with out-of-memory errors.
    super().tearDown()
    gc.collect()
    torch.cuda.empty_cache()
def test_stable_diffusion_lcm(self): def test_stable_diffusion_lcm(self):
torch.manual_seed(0) torch.manual_seed(0)
unet = UNet2DConditionModel.from_pretrained( unet = UNet2DConditionModel.from_pretrained(
...@@ -1049,3 +1055,30 @@ class StableDiffusionXLPipelineIntegrationTests(unittest.TestCase): ...@@ -1049,3 +1055,30 @@ class StableDiffusionXLPipelineIntegrationTests(unittest.TestCase):
max_diff = numpy_cosine_similarity_distance(image.flatten(), expected_image.flatten()) max_diff = numpy_cosine_similarity_distance(image.flatten(), expected_image.flatten())
assert max_diff < 1e-2 assert max_diff < 1e-2
def test_download_ckpt_diff_format_is_same(self):
    """An SDXL pipeline built from the single-file checkpoint must generate
    (near-)identical images to the same model loaded from the diffusers-format repo.
    """
    checkpoint_url = (
        "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors"
    )

    # Image from the pipeline restored out of the original .safetensors file.
    single_file_pipe = StableDiffusionXLPipeline.from_single_file(checkpoint_url, torch_dtype=torch.float16)
    single_file_pipe.scheduler = DDIMScheduler.from_config(single_file_pipe.scheduler.config)
    single_file_pipe.unet.set_default_attn_processor()
    single_file_pipe.enable_model_cpu_offload()
    image_ckpt = single_file_pipe(
        "a turtle",
        num_inference_steps=2,
        generator=torch.Generator(device="cpu").manual_seed(0),
        output_type="np",
    ).images[0]

    # Image from the equivalent diffusers-format hub pipeline, same seed.
    hub_pipe = StableDiffusionXLPipeline.from_pretrained(
        "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
    )
    hub_pipe.scheduler = DDIMScheduler.from_config(hub_pipe.scheduler.config)
    hub_pipe.unet.set_default_attn_processor()
    hub_pipe.enable_model_cpu_offload()
    image = hub_pipe(
        "a turtle",
        num_inference_steps=2,
        generator=torch.Generator(device="cpu").manual_seed(0),
        output_type="np",
    ).images[0]

    max_diff = numpy_cosine_similarity_distance(image.flatten(), image_ckpt.flatten())
    assert max_diff < 6e-3
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment