Unverified commit 93579650 authored by Patrick von Platen, committed by GitHub

Refactor model offload (#4514)



* [Draft] Refactor model offload

* [Draft] Refactor model offload

* Apply suggestions from code review

* cpu offload updates

* remove model cpu offload from individual pipelines

* add hook to offload models to cpu

* clean up

* model offload

* add model cpu offload string

* make style

* clean up

* fixes for offload issues

* fix tests issues

* resolve merge conflicts

* update src/diffusers/pipelines/pipeline_utils.py
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* make style

* Update src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py

---------
Co-authored-by: Dhruv Nair <dhruv.nair@gmail.com>
parent 16a056a7
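The diff below removes the near-identical `enable_model_cpu_offload` method from each pipeline and replaces it with a per-pipeline `model_cpu_offload_seq` class attribute consumed by a shared implementation in `DiffusionPipeline`, plus a `maybe_free_model_hooks()` call at the end of `__call__`. A minimal sketch of that pattern follows for orientation only; it is not the actual `pipeline_utils.py` code, and the `_all_hooks` attribute name is an assumption.

import torch
from accelerate import cpu_offload_with_hook

class ModelOffloadSketch:
    # Each pipeline now only declares the order in which its models run.
    model_cpu_offload_seq = "text_encoder->unet->vae"

    def enable_model_cpu_offload(self, gpu_id=0):
        device = torch.device(f"cuda:{gpu_id}")
        hook = None
        self._all_hooks = []
        # Chain accelerate hooks: each model is moved to the GPU when its
        # forward runs and offloaded again when the next model in the
        # sequence is called.
        for name in self.model_cpu_offload_seq.split("->"):
            model = getattr(self, name, None)
            if model is not None:
                _, hook = cpu_offload_with_hook(model, device, prev_module_hook=hook)
                self._all_hooks.append(hook)

    def maybe_free_model_hooks(self):
        # Called at the end of __call__ to return every model to the CPU.
        for hook in getattr(self, "_all_hooks", []):
            hook.offload()
        torch.cuda.empty_cache()

User-facing behavior is unchanged: `pipe.enable_model_cpu_offload()` is still called once on the pipeline, and each `__call__` now ends with `self.maybe_free_model_hooks()` instead of checking `final_offload_hook` manually.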
@@ -19,8 +19,6 @@ import torch
from packaging import version
from transformers import CLIPImageProcessor, XLMRobertaTokenizer
from diffusers.utils import is_accelerate_available, is_accelerate_version
from ...configuration_utils import FrozenDict
from ...image_processor import VaeImageProcessor
from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
@@ -100,6 +98,7 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL
feature_extractor ([`~transformers.CLIPImageProcessor`]):
A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
"""
model_cpu_offload_seq = "text_encoder->unet->vae"
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
@@ -221,34 +220,6 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL
"""
self.vae.disable_tiling()
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a
time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs.
Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the
iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
hook = None
for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
if self.safety_checker is not None:
_, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
def _encode_prompt(
self,
prompt,
@@ -750,9 +721,8 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL
image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (image, has_nsfw_concept)
...
@@ -21,8 +21,6 @@ import torch
from packaging import version
from transformers import CLIPImageProcessor, XLMRobertaTokenizer
from diffusers.utils import is_accelerate_available, is_accelerate_version
from ...configuration_utils import FrozenDict
from ...image_processor import PipelineImageInput, VaeImageProcessor
from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
@@ -127,6 +125,7 @@ class AltDiffusionImg2ImgPipeline(
feature_extractor ([`~transformers.CLIPImageProcessor`]):
A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
"""
model_cpu_offload_seq = "text_encoder->unet->vae"
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
@@ -219,34 +218,6 @@ class AltDiffusionImg2ImgPipeline(
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.register_to_config(requires_safety_checker=requires_safety_checker)
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a
time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs.
Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the
iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
hook = None
for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
if self.safety_checker is not None:
_, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
def _encode_prompt(
self,
prompt,
@@ -773,9 +744,8 @@ class AltDiffusionImg2ImgPipeline(
image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (image, has_nsfw_concept)
...
@@ -72,6 +72,7 @@ class AudioLDMPipeline(DiffusionPipeline):
vocoder ([`~transformers.SpeechT5HifiGan`]):
Vocoder of class `SpeechT5HifiGan`.
"""
model_cpu_offload_seq = "text_encoder->unet->vae"
def __init__(
self,
...
@@ -947,6 +947,8 @@ class AudioLDM2Pipeline(DiffusionPipeline):
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
self.maybe_free_model_hooks()
# 8. Post-processing
if not output_type == "latent":
latents = 1 / self.vae.config.scaling_factor * latents
...
@@ -5,8 +5,6 @@ import torch
from ...models import UNet2DModel
from ...schedulers import CMStochasticIterativeScheduler
from ...utils import (
is_accelerate_available,
is_accelerate_version,
logging,
replace_example_docstring,
)
@@ -62,6 +60,7 @@ class ConsistencyModelPipeline(DiffusionPipeline):
A scheduler to be used in combination with `unet` to denoise the encoded image latents. Currently only
compatible with [`CMStochasticIterativeScheduler`].
"""
model_cpu_offload_seq = "unet"
def __init__(self, unet: UNet2DModel, scheduler: CMStochasticIterativeScheduler) -> None:
super().__init__()
@@ -73,34 +72,6 @@ class ConsistencyModelPipeline(DiffusionPipeline):
self.safety_checker = None
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a
time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs.
Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the
iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
hook = None
for cpu_offloaded_model in [self.unet]:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
if self.safety_checker is not None:
_, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
def prepare_latents(self, batch_size, num_channels, height, width, dtype, device, generator, latents=None):
shape = (batch_size, num_channels, height, width)
if isinstance(generator, list) and len(generator) != batch_size:
@@ -280,9 +251,8 @@ class ConsistencyModelPipeline(DiffusionPipeline):
# 6. Post-process image sample
image = self.postprocess_image(sample, output_type=output_type)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (image,)
...
@@ -29,8 +29,6 @@ from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import (
deprecate,
is_accelerate_available,
is_accelerate_version,
logging,
replace_example_docstring,
)
@@ -125,6 +123,7 @@ class StableDiffusionControlNetPipeline(
feature_extractor ([`~transformers.CLIPImageProcessor`]):
A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
"""
model_cpu_offload_seq = "text_encoder->unet->vae"
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
@@ -210,34 +209,6 @@ class StableDiffusionControlNetPipeline(
"""
self.vae.disable_tiling()
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
hook = None
for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
if self.safety_checker is not None:
# the safety checker can offload the vae again
_, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
# control net hook has be manually offloaded as it alternates with unet
cpu_offload_with_hook(self.controlnet, device)
# We'll offload the last model manually.
self.final_offload_hook = hook
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
self,
@@ -1031,9 +1002,8 @@ class StableDiffusionControlNetPipeline(
image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (image, has_nsfw_concept)
...
@@ -28,8 +28,6 @@ from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import (
deprecate,
is_accelerate_available,
is_accelerate_version,
logging,
replace_example_docstring,
)
@@ -149,6 +147,7 @@ class StableDiffusionControlNetImg2ImgPipeline(
feature_extractor ([`~transformers.CLIPImageProcessor`]):
A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
"""
model_cpu_offload_seq = "text_encoder->unet->vae"
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
@@ -234,34 +233,6 @@ class StableDiffusionControlNetImg2ImgPipeline(
"""
self.vae.disable_tiling()
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
hook = None
for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
if self.safety_checker is not None:
# the safety checker can offload the vae again
_, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
# control net hook has be manually offloaded as it alternates with unet
cpu_offload_with_hook(self.controlnet, device)
# We'll offload the last model manually.
self.final_offload_hook = hook
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
self,
@@ -1107,9 +1078,8 @@ class StableDiffusionControlNetImg2ImgPipeline(
image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (image, has_nsfw_concept)
...
@@ -30,8 +30,6 @@ from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import (
deprecate,
is_accelerate_available,
is_accelerate_version,
logging,
replace_example_docstring,
)
@@ -273,6 +271,7 @@ class StableDiffusionControlNetInpaintPipeline(
feature_extractor ([`~transformers.CLIPImageProcessor`]):
A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
"""
model_cpu_offload_seq = "text_encoder->unet->vae"
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
@@ -361,34 +360,6 @@ class StableDiffusionControlNetInpaintPipeline(
"""
self.vae.disable_tiling()
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
hook = None
for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
if self.safety_checker is not None:
# the safety checker can offload the vae again
_, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
# control net hook has be manually offloaded as it alternates with unet
cpu_offload_with_hook(self.controlnet, device)
# We'll offload the last model manually.
self.final_offload_hook = hook
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
self,
@@ -1373,9 +1344,8 @@ class StableDiffusionControlNetInpaintPipeline(
image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (image, has_nsfw_concept)
...
@@ -166,6 +166,7 @@ class StableDiffusionXLControlNetInpaintPipeline(DiffusionPipeline, LoraLoaderMi
A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
[`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
"""
model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
_optional_components = ["tokenizer", "text_encoder"]
def __init__(
@@ -248,38 +249,6 @@ class StableDiffusionXLControlNetInpaintPipeline(DiffusionPipeline, LoraLoaderMi
"""
self.vae.disable_tiling()
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
model_sequence = (
[self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
)
model_sequence.extend([self.unet, self.vae])
hook = None
for cpu_offloaded_model in model_sequence:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
cpu_offload_with_hook(self.controlnet, device)
# We'll offload the last model manually.
self.final_offload_hook = hook
# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
def encode_prompt(
self,
...
@@ -145,6 +145,9 @@ class StableDiffusionXLControlNetPipeline(
watermark output images. If not defined, it defaults to `True` if the package is installed; otherwise no
watermarker is used.
"""
model_cpu_offload_seq = (
"text_encoder->text_encoder_2->unet->vae" # leave controlnet out on purpose because it iterates with unet
)
def __init__(
self,
@@ -221,38 +224,6 @@ class StableDiffusionXLControlNetPipeline(
"""
self.vae.disable_tiling()
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
model_sequence = (
[self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
)
model_sequence.extend([self.unet, self.vae])
hook = None
for cpu_offloaded_model in model_sequence:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
cpu_offload_with_hook(self.controlnet, device)
# We'll offload the last model manually.
self.final_offload_hook = hook
# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
def encode_prompt(
self,
@@ -1170,12 +1141,10 @@ class StableDiffusionXLControlNetPipeline(
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
# If we do sequential model offloading, let's offload unet and controlnet
# manually for max memory savings
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.unet.to("cpu")
self.controlnet.to("cpu")
torch.cuda.empty_cache()
if self.vae.dtype == torch.float16 and self.vae.config.force_upcast:
self.upcast_vae()
latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
if not output_type == "latent":
# make sure the VAE is in float32 mode, as it overflows in float16
@@ -1192,17 +1161,16 @@ class StableDiffusionXLControlNetPipeline(
self.vae.to(dtype=torch.float16)
else:
image = latents
return StableDiffusionXLPipelineOutput(images=image)
# apply watermark if available
if self.watermark is not None:
image = self.watermark.apply_watermark(image)
if not output_type == "latent":
# apply watermark if available
if self.watermark is not None:
image = self.watermark.apply_watermark(image)
image = self.image_processor.postprocess(image, output_type=output_type)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (image,)
...
@@ -36,8 +36,6 @@ from ...models.attention_processor import (
from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import (
is_accelerate_available,
is_accelerate_version,
logging,
replace_example_docstring,
)
@@ -179,6 +177,7 @@ class StableDiffusionXLControlNetImg2ImgPipeline(DiffusionPipeline, TextualInver
watermark output images. If not defined, it will default to True if the package is installed, otherwise no
watermarker will be used.
"""
model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
_optional_components = ["tokenizer", "text_encoder"]
def __init__(
@@ -258,38 +257,6 @@ class StableDiffusionXLControlNetImg2ImgPipeline(DiffusionPipeline, TextualInver
"""
self.vae.disable_tiling()
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
model_sequence = (
[self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
)
model_sequence.extend([self.unet, self.vae])
hook = None
for cpu_offloaded_model in model_sequence:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
cpu_offload_with_hook(self.controlnet, device)
# We'll offload the last model manually.
self.final_offload_hook = hook
# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
def encode_prompt(
self,
...
@@ -39,6 +39,7 @@ class DanceDiffusionPipeline(DiffusionPipeline):
A scheduler to be used in combination with `unet` to denoise the encoded audio latents. Can be one of
[`IPNDMScheduler`].
"""
model_cpu_offload_seq = "unet"
def __init__(self, unet, scheduler):
super().__init__()
...
@@ -35,6 +35,7 @@ class DDIMPipeline(DiffusionPipeline):
A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of
[`DDPMScheduler`], or [`DDIMScheduler`].
"""
model_cpu_offload_seq = "unet"
def __init__(self, unet, scheduler):
super().__init__()
...
@@ -35,6 +35,7 @@ class DDPMPipeline(DiffusionPipeline):
A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of
[`DDPMScheduler`], or [`DDIMScheduler`].
"""
model_cpu_offload_seq = "unet"
def __init__(self, unet, scheduler):
super().__init__()
...
@@ -13,7 +13,6 @@ from ...schedulers import DDPMScheduler
from ...utils import (
BACKENDS_MAPPING,
is_accelerate_available,
is_accelerate_version,
is_bs4_available,
is_ftfy_available,
logging,
@@ -103,6 +102,7 @@ class IFPipeline(DiffusionPipeline, LoraLoaderMixin):
) # noqa
_optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"]
model_cpu_offload_seq = "text_encoder->unet"
def __init__(
self,
@@ -144,47 +144,6 @@ class IFPipeline(DiffusionPipeline, LoraLoaderMixin):
)
self.register_to_config(requires_safety_checker=requires_safety_checker)
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
hook = None
if self.text_encoder is not None:
_, hook = cpu_offload_with_hook(self.text_encoder, device, prev_module_hook=hook)
# Accelerate will move the next model to the device _before_ calling the offload hook of the
# previous model. This will cause both models to be present on the device at the same time.
# IF uses T5 for its text encoder which is really large. We can manually call the offload
# hook for the text encoder to ensure it's moved to the cpu before the unet is moved to
# the GPU.
self.text_encoder_offload_hook = hook
_, hook = cpu_offload_with_hook(self.unet, device, prev_module_hook=hook)
# if the safety checker isn't called, `unet_offload_hook` will have to be called to manually offload the unet
self.unet_offload_hook = hook
if self.safety_checker is not None:
_, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
def remove_all_hooks(self):
if is_accelerate_available():
from accelerate.hooks import remove_hook_from_module
@@ -806,9 +765,8 @@ class IFPipeline(DiffusionPipeline, LoraLoaderMixin):
# 9. Run safety checker
image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (image, nsfw_detected, watermark_detected)
...
@@ -16,7 +16,6 @@ from ...utils import (
BACKENDS_MAPPING,
PIL_INTERPOLATION,
is_accelerate_available,
is_accelerate_version,
is_bs4_available,
is_ftfy_available,
logging,
@@ -127,6 +126,7 @@ class IFImg2ImgPipeline(DiffusionPipeline, LoraLoaderMixin):
) # noqa
_optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"]
model_cpu_offload_seq = "text_encoder->unet"
def __init__(
self,
@@ -168,48 +168,6 @@ class IFImg2ImgPipeline(DiffusionPipeline, LoraLoaderMixin):
)
self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_model_cpu_offload
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
hook = None
if self.text_encoder is not None:
_, hook = cpu_offload_with_hook(self.text_encoder, device, prev_module_hook=hook)
# Accelerate will move the next model to the device _before_ calling the offload hook of the
# previous model. This will cause both models to be present on the device at the same time.
# IF uses T5 for its text encoder which is really large. We can manually call the offload
# hook for the text encoder to ensure it's moved to the cpu before the unet is moved to
# the GPU.
self.text_encoder_offload_hook = hook
_, hook = cpu_offload_with_hook(self.unet, device, prev_module_hook=hook)
# if the safety checker isn't called, `unet_offload_hook` will have to be called to manually offload the unet
self.unet_offload_hook = hook
if self.safety_checker is not None:
_, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.remove_all_hooks
def remove_all_hooks(self):
if is_accelerate_available():
@@ -930,9 +888,8 @@ class IFImg2ImgPipeline(DiffusionPipeline, LoraLoaderMixin):
# 9. Run safety checker
image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (image, nsfw_detected, watermark_detected)
...
@@ -17,7 +17,6 @@ from ...utils import (
BACKENDS_MAPPING,
PIL_INTERPOLATION,
is_accelerate_available,
is_accelerate_version,
is_bs4_available,
is_ftfy_available,
logging,
@@ -131,6 +130,7 @@ class IFImg2ImgSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin):
) # noqa
_optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor"]
model_cpu_offload_seq = "text_encoder->unet"
def __init__(
self,
@@ -179,48 +179,6 @@ class IFImg2ImgSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin):
)
self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_model_cpu_offload
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
hook = None
if self.text_encoder is not None:
_, hook = cpu_offload_with_hook(self.text_encoder, device, prev_module_hook=hook)
# Accelerate will move the next model to the device _before_ calling the offload hook of the
# previous model. This will cause both models to be present on the device at the same time.
# IF uses T5 for its text encoder which is really large. We can manually call the offload
# hook for the text encoder to ensure it's moved to the cpu before the unet is moved to
# the GPU.
self.text_encoder_offload_hook = hook
_, hook = cpu_offload_with_hook(self.unet, device, prev_module_hook=hook)
# if the safety checker isn't called, `unet_offload_hook` will have to be called to manually offload the unet
self.unet_offload_hook = hook
if self.safety_checker is not None:
_, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.remove_all_hooks
def remove_all_hooks(self):
if is_accelerate_available():
@@ -1048,9 +1006,8 @@ class IFImg2ImgSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin):
# 11. Run safety checker
image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (image, nsfw_detected, watermark_detected)
...
@@ -16,7 +16,6 @@ from ...utils import (
BACKENDS_MAPPING,
PIL_INTERPOLATION,
is_accelerate_available,
is_accelerate_version,
is_bs4_available,
is_ftfy_available,
logging,
@@ -130,6 +129,7 @@ class IFInpaintingPipeline(DiffusionPipeline, LoraLoaderMixin):
) # noqa
_optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"]
model_cpu_offload_seq = "text_encoder->unet"
def __init__(
self,
@@ -171,48 +171,6 @@ class IFInpaintingPipeline(DiffusionPipeline, LoraLoaderMixin):
)
self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_model_cpu_offload
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
hook = None
if self.text_encoder is not None:
_, hook = cpu_offload_with_hook(self.text_encoder, device, prev_module_hook=hook)
# Accelerate will move the next model to the device _before_ calling the offload hook of the
# previous model. This will cause both models to be present on the device at the same time.
# IF uses T5 for its text encoder which is really large. We can manually call the offload
# hook for the text encoder to ensure it's moved to the cpu before the unet is moved to
# the GPU.
self.text_encoder_offload_hook = hook
_, hook = cpu_offload_with_hook(self.unet, device, prev_module_hook=hook)
# if the safety checker isn't called, `unet_offload_hook` will have to be called to manually offload the unet
self.unet_offload_hook = hook
if self.safety_checker is not None:
_, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.remove_all_hooks
def remove_all_hooks(self):
if is_accelerate_available():
@@ -1049,9 +1007,8 @@ class IFInpaintingPipeline(DiffusionPipeline, LoraLoaderMixin):
# 9. Run safety checker
image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (image, nsfw_detected, watermark_detected)
...
@@ -17,7 +17,6 @@ from ...utils import (
BACKENDS_MAPPING,
PIL_INTERPOLATION,
is_accelerate_available,
is_accelerate_version,
is_bs4_available,
is_ftfy_available,
logging,
@@ -132,6 +131,7 @@ class IFInpaintingSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin):
r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}"
) # noqa
model_cpu_offload_seq = "text_encoder->unet"
_optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"]
def __init__(
@@ -181,48 +181,6 @@ class IFInpaintingSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin):
)
self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_model_cpu_offload
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
hook = None
if self.text_encoder is not None:
_, hook = cpu_offload_with_hook(self.text_encoder, device, prev_module_hook=hook)
# Accelerate will move the next model to the device _before_ calling the offload hook of the
# previous model. This will cause both models to be present on the device at the same time.
# IF uses T5 for its text encoder which is really large. We can manually call the offload
# hook for the text encoder to ensure it's moved to the cpu before the unet is moved to
# the GPU.
self.text_encoder_offload_hook = hook
_, hook = cpu_offload_with_hook(self.unet, device, prev_module_hook=hook)
# if the safety checker isn't called, `unet_offload_hook` will have to be called to manually offload the unet
self.unet_offload_hook = hook
if self.safety_checker is not None:
_, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.remove_all_hooks
def remove_all_hooks(self):
if is_accelerate_available():
...
@@ -16,7 +16,6 @@ from ...schedulers import DDPMScheduler
from ...utils import (
BACKENDS_MAPPING,
is_accelerate_available,
is_accelerate_version,
is_bs4_available,
is_ftfy_available,
logging,
@@ -89,6 +88,7 @@ class IFSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin):
) # noqa
_optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"]
model_cpu_offload_seq = "text_encoder->unet"
def __init__(
self,
@@ -137,48 +137,6 @@ class IFSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin):
)
self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_model_cpu_offload
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
hook = None
if self.text_encoder is not None:
_, hook = cpu_offload_with_hook(self.text_encoder, device, prev_module_hook=hook)
# Accelerate will move the next model to the device _before_ calling the offload hook of the
# previous model. This will cause both models to be present on the device at the same time.
# IF uses T5 for its text encoder which is really large. We can manually call the offload
# hook for the text encoder to ensure it's moved to the cpu before the unet is moved to
# the GPU.
self.text_encoder_offload_hook = hook
_, hook = cpu_offload_with_hook(self.unet, device, prev_module_hook=hook)
# if the safety checker isn't called, `unet_offload_hook` will have to be called to manually offload the unet
self.unet_offload_hook = hook
if self.safety_checker is not None:
_, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.remove_all_hooks
def remove_all_hooks(self):
if is_accelerate_available():
@@ -904,9 +862,8 @@ class IFSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin):
# 10. Run safety checker
image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (image, nsfw_detected, watermark_detected)
...