Unverified Commit 93579650 authored by Patrick von Platen, committed by GitHub

Refactor model offload (#4514)



* [Draft] Refactor model offload

* [Draft] Refactor model offload

* Apply suggestions from code review

* cpu offload updates

* remove model cpu offload from individual pipelines

* add hook to offload models to cpu

* clean up

* model offload

* add model cpu offload string

* make style

* clean up

* fixes for offload issues

* fix tests issues

* resolve merge conflicts

* update src/diffusers/pipelines/pipeline_utils.py
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* make style

* Update src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py

---------
Co-authored-by: Dhruv Nair <dhruv.nair@gmail.com>
parent 16a056a7
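
In short, this commit replaces the per-pipeline `enable_model_cpu_offload` implementations removed in the diff below with a single class attribute, `model_cpu_offload_seq`, consumed by a shared helper on `DiffusionPipeline` (in `src/diffusers/pipelines/pipeline_utils.py`, not shown in this excerpt), and replaces the manual `final_offload_hook.offload()` calls with `maybe_free_model_hooks()`. The following is a minimal sketch of that pattern, not the actual implementation: `model_cpu_offload_seq` and `maybe_free_model_hooks` come from the diff, while the class name, the `_all_hooks` attribute, and the simplified loop are assumptions; it relies on `accelerate`'s `cpu_offload_with_hook`, as the removed code did.

```python
import torch
from accelerate import cpu_offload_with_hook


class DiffusionPipelineSketch:
    """Illustrative stand-in for DiffusionPipeline; not the real implementation."""

    # Each concrete pipeline declares its forward order once,
    # e.g. "text_encoder->unet->vae", instead of re-implementing the hook loop.
    model_cpu_offload_seq: str = ""

    def enable_model_cpu_offload(self, gpu_id: int = 0):
        device = torch.device(f"cuda:{gpu_id}")
        hook = None
        self._all_hooks = []
        for name in self.model_cpu_offload_seq.split("->"):
            model = getattr(self, name, None)
            if model is None:
                continue
            # Chain the hooks so each model is moved to the GPU only while
            # its forward pass runs, then offloaded when the next model starts.
            _, hook = cpu_offload_with_hook(model, device, prev_module_hook=hook)
            self._all_hooks.append(hook)

    def maybe_free_model_hooks(self):
        # Called at the end of __call__: move every hooked model back to CPU.
        for hook in getattr(self, "_all_hooks", []):
            hook.offload()
```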
......@@ -39,8 +39,6 @@ from ...utils import (
PIL_INTERPOLATION,
BaseOutput,
deprecate,
is_accelerate_available,
is_accelerate_version,
logging,
replace_example_docstring,
)
......@@ -309,6 +307,7 @@ class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline):
Whether the pipeline requires a safety checker. We recommend setting it to True if you're using the
pipeline publicly.
"""
model_cpu_offload_seq = "text_encoder->unet->vae"
_optional_components = [
"safety_checker",
"feature_extractor",
......@@ -365,30 +364,6 @@ class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline):
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.register_to_config(requires_safety_checker=requires_safety_checker)
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
hook = None
for cpu_offloaded_model in [self.vae, self.text_encoder, self.unet, self.vae]:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
if self.safety_checker is not None:
_, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
self,
......@@ -1081,9 +1056,8 @@ class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline):
image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (image, has_nsfw_concept)
......@@ -1286,9 +1260,8 @@ class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline):
image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
image = self.image_processor.postprocess(image, output_type=output_type)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (inverted_latents, image)
......
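
User-facing behaviour is unchanged by this refactor: pipelines are still offloaded with `enable_model_cpu_offload()`, which is now driven by the class-level `model_cpu_offload_seq`, and models are returned to the CPU via `maybe_free_model_hooks()` at the end of `__call__`. A minimal usage sketch (the checkpoint id is only illustrative):

```python
import torch
from diffusers import StableDiffusionPipeline

# Illustrative checkpoint; any Stable Diffusion checkpoint works the same way.
pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
)

# Backed by the class-level model_cpu_offload_seq after this refactor:
# each model is moved to the GPU only while its forward pass runs.
pipe.enable_model_cpu_offload()

image = pipe("an astronaut riding a horse").images[0]
# At the end of __call__ the pipeline calls maybe_free_model_hooks(),
# returning all models to the CPU.
```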
......@@ -117,6 +117,7 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin)
feature_extractor ([`~transformers.CLIPImageProcessor`]):
A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
"""
model_cpu_offload_seq = "text_encoder->unet->vae"
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
......
......@@ -32,7 +32,7 @@ from ...models.attention_processor import (
)
from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import DDPMScheduler, KarrasDiffusionSchedulers
from ...utils import deprecate, is_accelerate_available, is_accelerate_version, logging
from ...utils import deprecate, logging
from ...utils.torch_utils import randn_tensor
from ..pipeline_utils import DiffusionPipeline
from . import StableDiffusionPipelineOutput
......@@ -90,6 +90,7 @@ class StableDiffusionUpscalePipeline(DiffusionPipeline, TextualInversionLoaderMi
A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
[`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
"""
model_cpu_offload_seq = "text_encoder->unet->vae"
_optional_components = ["watermarker", "safety_checker", "feature_extractor"]
def __init__(
......@@ -140,32 +141,6 @@ class StableDiffusionUpscalePipeline(DiffusionPipeline, TextualInversionLoaderMi
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, resample="bicubic")
self.register_to_config(max_noise_level=max_noise_level)
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a
time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs.
Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the
iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
hook = None
for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
if cpu_offloaded_model is not None:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
def run_safety_checker(self, image, device, dtype):
if self.safety_checker is not None:
feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
......
......@@ -27,8 +27,6 @@ from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import (
deprecate,
is_accelerate_available,
is_accelerate_version,
logging,
replace_example_docstring,
)
......@@ -92,6 +90,7 @@ class StableUnCLIPPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL
"""
_exclude_from_cpu_offload = ["prior", "image_normalizer"]
model_cpu_offload_seq = "text_encoder->prior_text_encoder->unet->vae"
# prior components
prior_tokenizer: CLIPTokenizer
......@@ -164,31 +163,6 @@ class StableUnCLIPPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL
"""
self.vae.disable_slicing()
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a
time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs.
Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the
iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
hook = None
for cpu_offloaded_model in [self.text_encoder, self.prior_text_encoder, self.unet, self.vae]:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
# Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline._encode_prompt with _encode_prompt->_encode_prior_prompt, tokenizer->prior_tokenizer, text_encoder->prior_text_encoder
def _encode_prior_prompt(
self,
......
......@@ -19,15 +19,13 @@ import PIL
import torch
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
from diffusers.utils.import_utils import is_accelerate_available
from ...image_processor import VaeImageProcessor
from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
from ...models.embeddings import get_timestep_embedding
from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import deprecate, is_accelerate_version, logging, replace_example_docstring
from ...utils import deprecate, logging, replace_example_docstring
from ...utils.torch_utils import randn_tensor
from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
......@@ -94,6 +92,7 @@ class StableUnCLIPImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin
Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
"""
model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
_exclude_from_cpu_offload = ["image_normalizer"]
# image encoding components
......@@ -161,31 +160,6 @@ class StableUnCLIPImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin
"""
self.vae.disable_slicing()
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a
time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs.
Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the
iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
hook = None
for cpu_offloaded_model in [self.text_encoder, self.image_encoder, self.unet, self.vae]:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
self,
......
......@@ -47,6 +47,7 @@ class StableDiffusionPipelineSafe(DiffusionPipeline):
A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
"""
model_cpu_offload_seq = "text_encoder->unet->vae"
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
......
......@@ -129,6 +129,7 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin, LoraLoad
watermark output images. If not defined, it will default to True if the package is installed, otherwise no
watermarker will be used.
"""
model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
def __init__(
self,
......@@ -198,36 +199,6 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin, LoraLoad
"""
self.vae.disable_tiling()
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
model_sequence = (
[self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
)
model_sequence.extend([self.unet, self.vae])
hook = None
for cpu_offloaded_model in model_sequence:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
def encode_prompt(
self,
prompt: str,
......@@ -900,17 +871,16 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin, LoraLoad
self.vae.to(dtype=torch.float16)
else:
image = latents
return StableDiffusionXLPipelineOutput(images=image)
# apply watermark if available
if self.watermark is not None:
image = self.watermark.apply_watermark(image)
if not output_type == "latent":
# apply watermark if available
if self.watermark is not None:
image = self.watermark.apply_watermark(image)
image = self.image_processor.postprocess(image, output_type=output_type)
image = self.image_processor.postprocess(image, output_type=output_type)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (image,)
......
......@@ -134,6 +134,8 @@ class StableDiffusionXLImg2ImgPipeline(
watermark output images. If not defined, it will default to True if the package is installed, otherwise no
watermarker will be used.
"""
model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
_optional_components = ["tokenizer", "text_encoder"]
def __init__(
......@@ -205,36 +207,6 @@ class StableDiffusionXLImg2ImgPipeline(
"""
self.vae.disable_tiling()
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
model_sequence = (
[self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
)
model_sequence.extend([self.unet, self.vae])
hook = None
for cpu_offloaded_model in model_sequence:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
def encode_prompt(
self,
......@@ -1057,9 +1029,8 @@ class StableDiffusionXLImg2ImgPipeline(
image = self.image_processor.postprocess(image, output_type=output_type)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (image,)
......
......@@ -280,6 +280,8 @@ class StableDiffusionXLInpaintPipeline(
watermark output images. If not defined, it will default to True if the package is installed, otherwise no
watermarker will be used.
"""
model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
_optional_components = ["tokenizer", "text_encoder"]
def __init__(
......@@ -354,37 +356,6 @@ class StableDiffusionXLInpaintPipeline(
"""
self.vae.disable_tiling()
# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline.enable_model_cpu_offload
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
model_sequence = (
[self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
)
model_sequence.extend([self.unet, self.vae])
hook = None
for cpu_offloaded_model in model_sequence:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
def encode_prompt(
self,
......@@ -1377,9 +1348,8 @@ class StableDiffusionXLInpaintPipeline(
image = self.image_processor.postprocess(image, output_type=output_type)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (image,)
......
......@@ -32,8 +32,6 @@ from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import (
deprecate,
is_accelerate_available,
is_accelerate_version,
is_invisible_watermark_available,
logging,
replace_example_docstring,
......@@ -143,6 +141,7 @@ class StableDiffusionXLInstructPix2PixPipeline(
watermark output images. If not defined, it will default to True if the package is installed, otherwise no
watermarker will be used.
"""
model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
def __init__(
self,
......@@ -211,38 +210,6 @@ class StableDiffusionXLInstructPix2PixPipeline(
"""
self.vae.disable_tiling()
# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.enable_model_cpu_offload
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
model_sequence = (
[self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
)
model_sequence.extend([self.unet, self.vae])
hook = None
for cpu_offloaded_model in model_sequence:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
def encode_prompt(
self,
prompt: str,
......@@ -965,9 +932,8 @@ class StableDiffusionXLInstructPix2PixPipeline(
image = self.image_processor.postprocess(image, output_type=output_type)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (image,)
......
......@@ -30,8 +30,6 @@ from ...utils import (
PIL_INTERPOLATION,
BaseOutput,
deprecate,
is_accelerate_available,
is_accelerate_version,
logging,
replace_example_docstring,
)
......@@ -151,6 +149,7 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline):
feature_extractor ([`CLIPFeatureExtractor`]):
Model that extracts features from generated images to be used as inputs for the `safety_checker`.
"""
model_cpu_offload_seq = "text_encoder->adapter->unet->vae"
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
......@@ -217,34 +216,6 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline):
"""
self.vae.disable_slicing()
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
hook = None
for cpu_offloaded_model in [self.text_encoder, self.adapter, self.unet, self.vae]:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
if self.safety_checker is not None:
_, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
self,
......@@ -815,9 +786,8 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline):
# 9. Run safety checker
image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (image, has_nsfw_concept)
......
......@@ -35,8 +35,6 @@ from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import (
PIL_INTERPOLATION,
is_accelerate_available,
is_accelerate_version,
logging,
replace_example_docstring,
)
......@@ -159,6 +157,7 @@ class StableDiffusionXLAdapterPipeline(
feature_extractor ([`CLIPFeatureExtractor`]):
Model that extracts features from generated images to be used as inputs for the `safety_checker`.
"""
model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
def __init__(
self,
......@@ -222,37 +221,6 @@ class StableDiffusionXLAdapterPipeline(
"""
self.vae.disable_tiling()
# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.enable_model_cpu_offload
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
model_sequence = (
[self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
)
model_sequence.extend([self.unet, self.vae])
hook = None
for cpu_offloaded_model in model_sequence:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
def encode_prompt(
self,
......
......@@ -25,8 +25,6 @@ from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import (
deprecate,
is_accelerate_available,
is_accelerate_version,
logging,
replace_example_docstring,
)
......@@ -95,6 +93,7 @@ class TextToVideoSDPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lora
A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
[`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
"""
model_cpu_offload_seq = "text_encoder->unet->vae"
def __init__(
self,
......@@ -148,31 +147,6 @@ class TextToVideoSDPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lora
"""
self.vae.disable_tiling()
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a
time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs.
Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the
iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
hook = None
for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
self,
......@@ -678,9 +652,8 @@ class TextToVideoSDPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lora
else:
video = tensor2vid(video_tensor)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (video,)
......
......@@ -26,8 +26,6 @@ from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import (
deprecate,
is_accelerate_available,
is_accelerate_version,
logging,
replace_example_docstring,
)
......@@ -157,6 +155,7 @@ class VideoToVideoSDPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor
A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
[`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
"""
model_cpu_offload_seq = "text_encoder->unet->vae"
def __init__(
self,
......@@ -210,31 +209,6 @@ class VideoToVideoSDPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor
"""
self.vae.disable_tiling()
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a
time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs.
Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the
iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
hook = None
for cpu_offloaded_model in [self.text_encoder, self.vae, self.unet]:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
self,
......@@ -753,9 +727,8 @@ class VideoToVideoSDPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor
else:
video = tensor2vid(video_tensor)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (video,)
......
......@@ -635,9 +635,8 @@ class TextToVideoZeroPipeline(StableDiffusionPipeline):
# Run safety checker
image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (image, has_nsfw_concept)
......
......@@ -76,6 +76,8 @@ class UnCLIPPipeline(DiffusionPipeline):
decoder_scheduler: UnCLIPScheduler
super_res_scheduler: UnCLIPScheduler
model_cpu_offload_seq = "text_encoder->text_proj->decoder->super_res_first->super_res_last"
def __init__(
self,
prior: PriorTransformer,
......
......@@ -77,6 +77,7 @@ class UnCLIPImageVariationPipeline(DiffusionPipeline):
decoder_scheduler: UnCLIPScheduler
super_res_scheduler: UnCLIPScheduler
model_cpu_offload_seq = "text_encoder->image_encoder->text_proj->decoder->super_res_first->super_res_last"
def __init__(
self,
......
......@@ -103,6 +103,9 @@ class UniDiffuserPipeline(DiffusionPipeline):
original UniDiffuser paper uses the [`DPMSolverMultistepScheduler`] scheduler.
"""
# TODO: support for moving submodules for components with enable_model_cpu_offload
model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae->text_decoder"
def __init__(
self,
vae: AutoencoderKL,
......@@ -173,7 +176,15 @@ class UniDiffuserPipeline(DiffusionPipeline):
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
hook = None
for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae, self.image_encoder, self.text_decoder]:
for cpu_offloaded_model in [
self.text_encoder.text_model,
self.image_encoder,
self.unet,
self.vae,
self.text_decoder.encode_prefix,
self.text_decoder.decode_prefix,
self.text_decoder,
]:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
if self.safety_checker is not None:
......@@ -1344,6 +1355,8 @@ class UniDiffuserPipeline(DiffusionPipeline):
for output, length in zip(output_list, seq_lengths)
]
self.maybe_free_model_hooks()
# 10. Convert to PIL
if output_type == "pil" and gen_image is not None:
gen_image = self.numpy_to_pil(gen_image)
......
......@@ -58,6 +58,8 @@ class VersatileDiffusionDualGuidedPipeline(DiffusionPipeline):
A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
[`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
"""
model_cpu_offload_seq = "bert->unet->vqvae"
tokenizer: CLIPTokenizer
image_feature_extractor: CLIPImageProcessor
text_encoder: CLIPTextModelWithProjection
......
......@@ -52,6 +52,8 @@ class VersatileDiffusionImageVariationPipeline(DiffusionPipeline):
A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
[`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
"""
model_cpu_offload_seq = "bert->unet->vqvae"
image_feature_extractor: CLIPImageProcessor
image_encoder: CLIPVisionModelWithProjection
image_unet: UNet2DConditionModel
......