Unverified commit 93579650 authored by Patrick von Platen, committed by GitHub

Refactor model offload (#4514)



* [Draft] Refactor model offload

* [Draft] Refactor model offload

* Apply suggestions from code review

* cpu offload updates

* remove model cpu offload from individual pipelines

* add hook to offload models to cpu

* clean up

* model offload

* add model cpu offload string

* make style

* clean up

* fixes for offload issues

* fix tests issues

* resolve merge conflicts

* update src/diffusers/pipelines/pipeline_utils.py
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* make style

* Update src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py

---------
Co-authored-by: Dhruv Nair <dhruv.nair@gmail.com>
parent 16a056a7
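The same three-part change repeats across every file in this diff: the copy-pasted per-pipeline `enable_model_cpu_offload` override is deleted, a class-level `model_cpu_offload_seq` string (plus, for Shap-E, `_exclude_from_cpu_offload`) is added, and the manual `final_offload_hook.offload()` calls at each exit point become a single `maybe_free_model_hooks()` call. As a rough sketch of the generic logic this lets `pipeline_utils.py` provide once for all pipelines — not the actual implementation; the sequence traversal and hook bookkeeping below are simplified assumptions reconstructed from the removed per-pipeline methods:

```python
import torch
from accelerate import cpu_offload_with_hook  # requires accelerate >= 0.17.0


def enable_model_cpu_offload(pipe, gpu_id=0):
    # Sketch of the shared offload routine driven by model_cpu_offload_seq.
    device = torch.device(f"cuda:{gpu_id}")

    if pipe.device.type != "cpu":
        pipe.to("cpu", silence_dtype_warnings=True)
        torch.cuda.empty_cache()  # otherwise the memory savings are not visible

    hook = None
    # model_cpu_offload_seq declares the execution order, e.g. "text_encoder->unet->vae";
    # components listed in _exclude_from_cpu_offload (e.g. Shap-E's renderer) are skipped.
    for name in pipe.model_cpu_offload_seq.split("->"):
        if name in getattr(pipe, "_exclude_from_cpu_offload", []):
            continue
        _, hook = cpu_offload_with_hook(getattr(pipe, name), device, prev_module_hook=hook)

    # Kept around so the last model can still be offloaded manually if needed.
    pipe.final_offload_hook = hook
```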
@@ -25,8 +25,6 @@ from ...models import PriorTransformer
 from ...schedulers import HeunDiscreteScheduler
 from ...utils import (
     BaseOutput,
-    is_accelerate_available,
-    is_accelerate_version,
     logging,
     replace_example_docstring,
 )
@@ -99,6 +97,9 @@ class ShapEPipeline(DiffusionPipeline):
             rendering method.
     """
+    model_cpu_offload_seq = "text_encoder->prior"
+    _exclude_from_cpu_offload = ["shap_e_renderer"]
+
     def __init__(
         self,
         prior: PriorTransformer,
@@ -129,34 +130,6 @@ class ShapEPipeline(DiffusionPipeline):
         latents = latents * scheduler.init_noise_sigma
         return latents

-    def enable_model_cpu_offload(self, gpu_id=0):
-        r"""
-        Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a
-        time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs.
-        Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the
-        iterative execution of the `unet`.
-        """
-        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
-            from accelerate import cpu_offload_with_hook
-        else:
-            raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
-
-        device = torch.device(f"cuda:{gpu_id}")
-
-        if self.device.type != "cpu":
-            self.to("cpu", silence_dtype_warnings=True)
-            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
-
-        hook = None
-        for cpu_offloaded_model in [self.text_encoder, self.prior, self.shap_e_renderer]:
-            _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
-
-        if self.safety_checker is not None:
-            _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
-
-        # We'll offload the last model manually.
-        self.final_offload_hook = hook
-
     def _encode_prompt(
         self,
         prompt,
@@ -318,6 +291,9 @@ class ShapEPipeline(DiffusionPipeline):
                 sample=latents,
             ).prev_sample

+        # Offload all models
+        self.maybe_free_model_hooks()
+
         if output_type not in ["np", "pil", "latent", "mesh"]:
             raise ValueError(
                 f"Only the output types `pil`, `np`, `latent` and `mesh` are supported not output_type={output_type}"
@@ -352,10 +328,6 @@ class ShapEPipeline(DiffusionPipeline):
         if output_type == "pil":
             images = [self.numpy_to_pil(image) for image in images]

-        # Offload last model to CPU
-        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
-            self.final_offload_hook.offload()
-
         if not return_dict:
             return (images,)
...
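Note how the exit path that previously offloaded only the last model via `final_offload_hook` now calls `maybe_free_model_hooks()` instead. A plausible sketch of what such a helper does — the `_all_hooks` attribute is a hypothetical name, not confirmed diffusers internals; the observable contract from this diff is only that the call is a no-op when model offload was never enabled and otherwise frees GPU memory after the pipeline call:

```python
import torch


def maybe_free_model_hooks(pipe):
    # Hypothetical sketch; `_all_hooks` is an assumed attribute name.
    if not getattr(pipe, "_all_hooks", None):
        return  # enable_model_cpu_offload() was never called; nothing to free
    for hook in pipe._all_hooks:
        hook.offload()  # move each hooked model back to the CPU
    torch.cuda.empty_cache()
```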
@@ -98,6 +98,9 @@ class ShapEImg2ImgPipeline(DiffusionPipeline):
             rendering method.
     """
+    model_cpu_offload_seq = "image_encoder->prior"
+    _exclude_from_cpu_offload = ["shap_e_renderer"]
+
     def __init__(
         self,
         prior: PriorTransformer,
@@ -309,9 +312,8 @@ class ShapEImg2ImgPipeline(DiffusionPipeline):
         if output_type == "pil":
             images = [self.numpy_to_pil(image) for image in images]

-        # Offload last model to CPU
-        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
-            self.final_offload_hook.offload()
+        # Offload all models
+        self.maybe_free_model_hooks()

         if not return_dict:
             return (images,)
...
@@ -21,8 +21,6 @@ import torch
 from packaging import version
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer

-from diffusers.utils import is_accelerate_available, is_accelerate_version
-
 from ...configuration_utils import FrozenDict
 from ...image_processor import PipelineImageInput, VaeImageProcessor
 from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
@@ -150,6 +148,7 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor
         feature_extractor ([`~transformers.CLIPImageProcessor`]):
             A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
     """
+    model_cpu_offload_seq = "text_encoder->unet->vae"
     _optional_components = ["safety_checker", "feature_extractor"]

     def __init__(
@@ -228,35 +227,6 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lor
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
         self.register_to_config(requires_safety_checker=requires_safety_checker)

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload
-    def enable_model_cpu_offload(self, gpu_id=0):
-        r"""
-        Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a
-        time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs.
-        Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the
-        iterative execution of the `unet`.
-        """
-        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
-            from accelerate import cpu_offload_with_hook
-        else:
-            raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
-
-        device = torch.device(f"cuda:{gpu_id}")
-
-        if self.device.type != "cpu":
-            self.to("cpu", silence_dtype_warnings=True)
-            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
-
-        hook = None
-        for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
-            _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
-
-        if self.safety_checker is not None:
-            _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
-
-        # We'll offload the last model manually.
-        self.final_offload_hook = hook
-
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
     def _encode_prompt(
         self,
...
@@ -27,8 +27,6 @@ from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
     deprecate,
-    is_accelerate_available,
-    is_accelerate_version,
     logging,
     replace_example_docstring,
 )
@@ -101,6 +99,7 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo
         feature_extractor ([`~transformers.CLIPImageProcessor`]):
             A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
     """
+    model_cpu_offload_seq = "text_encoder->unet->vae"
     _optional_components = ["safety_checker", "feature_extractor"]

     def __init__(
@@ -222,34 +221,6 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo
         """
         self.vae.disable_tiling()

-    def enable_model_cpu_offload(self, gpu_id=0):
-        r"""
-        Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a
-        time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs.
-        Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the
-        iterative execution of the `unet`.
-        """
-        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
-            from accelerate import cpu_offload_with_hook
-        else:
-            raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
-
-        device = torch.device(f"cuda:{gpu_id}")
-
-        if self.device.type != "cpu":
-            self.to("cpu", silence_dtype_warnings=True)
-            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
-
-        hook = None
-        for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
-            _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
-
-        if self.safety_checker is not None:
-            _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
-
-        # We'll offload the last model manually.
-        self.final_offload_hook = hook
-
     def _encode_prompt(
         self,
         prompt,
@@ -745,9 +716,8 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo
         image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)

-        # Offload last model to CPU
-        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
-            self.final_offload_hook.offload()
+        # Offload all models
+        self.maybe_free_model_hooks()

         if not return_dict:
             return (image, has_nsfw_concept)
...
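After this change every pipeline shares one offload entry point, so user code is identical across pipelines. An illustrative usage sketch — the model id and prompt are placeholders, not part of this diff:

```python
import torch
from diffusers import StableDiffusionPipeline

# Illustrative usage; model id and prompt are placeholders.
pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
)
# Uses the class-level model_cpu_offload_seq ("text_encoder->unet->vae")
# instead of the per-pipeline override removed above.
pipe.enable_model_cpu_offload()

image = pipe("an astronaut riding a horse").images[0]
```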
@@ -189,6 +189,7 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, TextualInversion
         feature_extractor ([`~transformers.CLIPImageProcessor`]):
             A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
     """
+    model_cpu_offload_seq = "text_encoder->unet->vae"
     _optional_components = ["safety_checker", "feature_extractor"]

     def __init__(
...
@@ -85,6 +85,7 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoader
             A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
             [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
     """
+    model_cpu_offload_seq = "text_encoder->unet->vae"

     def __init__(
         self,
...
@@ -32,8 +32,6 @@ from ...utils import (
     PIL_INTERPOLATION,
     BaseOutput,
     deprecate,
-    is_accelerate_available,
-    is_accelerate_version,
     logging,
     replace_example_docstring,
 )
@@ -272,6 +270,7 @@ class StableDiffusionDiffEditPipeline(DiffusionPipeline, TextualInversionLoaderM
         feature_extractor ([`~transformers.CLIPImageProcessor`]):
             A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
     """
+    model_cpu_offload_seq = "text_encoder->unet->vae"
     _optional_components = ["safety_checker", "feature_extractor", "inverse_scheduler"]

     def __init__(
@@ -400,35 +399,6 @@ class StableDiffusionDiffEditPipeline(DiffusionPipeline, TextualInversionLoaderM
         """
         self.vae.disable_tiling()

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload
-    def enable_model_cpu_offload(self, gpu_id=0):
-        r"""
-        Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a
-        time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs.
-        Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the
-        iterative execution of the `unet`.
-        """
-        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
-            from accelerate import cpu_offload_with_hook
-        else:
-            raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
-
-        device = torch.device(f"cuda:{gpu_id}")
-
-        if self.device.type != "cpu":
-            self.to("cpu", silence_dtype_warnings=True)
-            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
-
-        hook = None
-        for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
-            _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
-
-        if self.safety_checker is not None:
-            _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
-
-        # We'll offload the last model manually.
-        self.final_offload_hook = hook
-
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
     def _encode_prompt(
         self,
@@ -1070,9 +1040,8 @@ class StableDiffusionDiffEditPipeline(DiffusionPipeline, TextualInversionLoaderM
         if output_type == "pil":
             mask_image = self.image_processor.numpy_to_pil(mask_image)

-        # Offload last model to CPU
-        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
-            self.final_offload_hook.offload()
+        # Offload all models
+        self.maybe_free_model_hooks()

         return mask_image
@@ -1305,9 +1274,8 @@ class StableDiffusionDiffEditPipeline(DiffusionPipeline, TextualInversionLoaderM
         if decode_latents and output_type == "pil":
             image = self.image_processor.numpy_to_pil(image)

-        # Offload last model to CPU
-        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
-            self.final_offload_hook.offload()
+        # Offload all models
+        self.maybe_free_model_hooks()

         if not return_dict:
             return (latents, image)
@@ -1548,9 +1516,8 @@ class StableDiffusionDiffEditPipeline(DiffusionPipeline, TextualInversionLoaderM
         image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)

-        # Offload last model to CPU
-        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
-            self.final_offload_hook.offload()
+        # Offload all models
+        self.maybe_free_model_hooks()

         if not return_dict:
             return (image, has_nsfw_concept)
...
@@ -28,8 +28,6 @@ from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
     deprecate,
-    is_accelerate_available,
-    is_accelerate_version,
     logging,
     replace_example_docstring,
 )
@@ -125,6 +123,7 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline):
             A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
     """
     _optional_components = ["safety_checker", "feature_extractor"]
+    model_cpu_offload_seq = "text_encoder->unet->vae"

     def __init__(
         self,
@@ -197,34 +196,6 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline):
         """
         self.vae.disable_tiling()

-    def enable_model_cpu_offload(self, gpu_id=0):
-        r"""
-        Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a
-        time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs.
-        Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the
-        iterative execution of the `unet`.
-        """
-        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
-            from accelerate import cpu_offload_with_hook
-        else:
-            raise ImportError("`enable_model_offload` requires `accelerate v0.17.0` or higher.")
-
-        device = torch.device(f"cuda:{gpu_id}")
-
-        if self.device.type != "cpu":
-            self.to("cpu", silence_dtype_warnings=True)
-            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
-
-        hook = None
-        for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
-            _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
-
-        if self.safety_checker is not None:
-            _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
-
-        # We'll offload the last model manually.
-        self.final_offload_hook = hook
-
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
     def _encode_prompt(
         self,
...
@@ -33,8 +33,6 @@ from ...models.attention import GatedSelfAttentionDense
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
-    is_accelerate_available,
-    is_accelerate_version,
     logging,
     replace_example_docstring,
 )
@@ -182,6 +180,7 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline):
         feature_extractor ([`~transformers.CLIPImageProcessor`]):
             A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
     """
+    model_cpu_offload_seq = "text_encoder->unet->vae"
     _optional_components = ["safety_checker", "feature_extractor"]

     def __init__(
@@ -261,34 +260,6 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline):
         """
         self.vae.disable_tiling()

-    def enable_model_cpu_offload(self, gpu_id=0):
-        r"""
-        Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a
-        time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs.
-        Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the
-        iterative execution of the `unet`.
-        """
-        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
-            from accelerate import cpu_offload_with_hook
-        else:
-            raise ImportError("`enable_model_offload` requires `accelerate v0.17.0` or higher.")
-
-        device = torch.device(f"cuda:{gpu_id}")
-
-        if self.device.type != "cpu":
-            self.to("cpu", silence_dtype_warnings=True)
-            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
-
-        hook = None
-        for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
-            _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
-
-        if self.safety_checker is not None:
-            _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
-
-        # We'll offload the last model manually.
-        self.final_offload_hook = hook
-
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
     def encode_prompt(
         self,
...
@@ -65,6 +65,7 @@ class StableDiffusionImageVariationPipeline(DiffusionPipeline):
     # TODO: feature_extractor is required to encode images (if they are in PIL format),
     # we should give a descriptive message if the pipeline doesn't have one.
     _optional_components = ["safety_checker"]
+    model_cpu_offload_seq = "image_encoder->unet->vae"

     def __init__(
         self,
@@ -392,6 +393,8 @@ class StableDiffusionImageVariationPipeline(DiffusionPipeline):
                 if callback is not None and i % callback_steps == 0:
                     callback(i, t, latents)

+        self.maybe_free_model_hooks()
+
         if not output_type == "latent":
             image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
             image, has_nsfw_concept = self.run_safety_checker(image, device, image_embeddings.dtype)
...
@@ -30,8 +30,6 @@ from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
     PIL_INTERPOLATION,
     deprecate,
-    is_accelerate_available,
-    is_accelerate_version,
     logging,
     replace_example_docstring,
 )
@@ -129,6 +127,7 @@ class StableDiffusionImg2ImgPipeline(
         feature_extractor ([`~transformers.CLIPImageProcessor`]):
             A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
     """
+    model_cpu_offload_seq = "text_encoder->unet->vae"
     _optional_components = ["safety_checker", "feature_extractor"]

     def __init__(
@@ -221,35 +220,6 @@ class StableDiffusionImg2ImgPipeline(
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
         self.register_to_config(requires_safety_checker=requires_safety_checker)

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload
-    def enable_model_cpu_offload(self, gpu_id=0):
-        r"""
-        Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a
-        time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs.
-        Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the
-        iterative execution of the `unet`.
-        """
-        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
-            from accelerate import cpu_offload_with_hook
-        else:
-            raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
-
-        device = torch.device(f"cuda:{gpu_id}")
-
-        if self.device.type != "cpu":
-            self.to("cpu", silence_dtype_warnings=True)
-            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
-
-        hook = None
-        for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
-            _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
-
-        if self.safety_checker is not None:
-            _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
-
-        # We'll offload the last model manually.
-        self.final_offload_hook = hook
-
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
     def _encode_prompt(
         self,
@@ -775,9 +745,8 @@ class StableDiffusionImg2ImgPipeline(
         image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)

-        # Offload last model to CPU
-        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
-            self.final_offload_hook.offload()
+        # Offload all models
+        self.maybe_free_model_hooks()

         if not return_dict:
             return (image, has_nsfw_concept)
...
@@ -27,7 +27,7 @@ from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoa
 from ...models import AsymmetricAutoencoderKL, AutoencoderKL, UNet2DConditionModel
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
-from ...utils import deprecate, is_accelerate_available, is_accelerate_version, logging
+from ...utils import deprecate, logging
 from ...utils.torch_utils import randn_tensor
 from ..pipeline_utils import DiffusionPipeline
 from . import StableDiffusionPipelineOutput
@@ -192,6 +192,7 @@ class StableDiffusionInpaintPipeline(
         feature_extractor ([`~transformers.CLIPImageProcessor`]):
             A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
     """
+    model_cpu_offload_seq = "text_encoder->unet->vae"
     _optional_components = ["safety_checker", "feature_extractor"]

     def __init__(
@@ -292,35 +293,6 @@ class StableDiffusionInpaintPipeline(
         )
         self.register_to_config(requires_safety_checker=requires_safety_checker)

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload
-    def enable_model_cpu_offload(self, gpu_id=0):
-        r"""
-        Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a
-        time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs.
-        Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the
-        iterative execution of the `unet`.
-        """
-        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
-            from accelerate import cpu_offload_with_hook
-        else:
-            raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
-
-        device = torch.device(f"cuda:{gpu_id}")
-
-        if self.device.type != "cpu":
-            self.to("cpu", silence_dtype_warnings=True)
-            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
-
-        hook = None
-        for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
-            _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
-
-        if self.safety_checker is not None:
-            _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
-
-        # We'll offload the last model manually.
-        self.final_offload_hook = hook
-
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
     def _encode_prompt(
         self,
@@ -1064,9 +1036,8 @@ class StableDiffusionInpaintPipeline(
         image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)

-        # Offload last model to CPU
-        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
-            self.final_offload_hook.offload()
+        # Offload all models
+        self.maybe_free_model_hooks()

         if not return_dict:
             return (image, has_nsfw_concept)
...
@@ -27,7 +27,7 @@ from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoa
 from ...models import AutoencoderKL, UNet2DConditionModel
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
-from ...utils import PIL_INTERPOLATION, deprecate, is_accelerate_available, is_accelerate_version, logging
+from ...utils import PIL_INTERPOLATION, deprecate, logging
 from ...utils.torch_utils import randn_tensor
 from ..pipeline_utils import DiffusionPipeline
 from . import StableDiffusionPipelineOutput
@@ -115,6 +115,7 @@ class StableDiffusionInpaintPipelineLegacy(
         feature_extractor ([`CLIPImageProcessor`]):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
     """
+    model_cpu_offload_seq = "text_encoder->unet->vae"
     _optional_components = ["feature_extractor"]

     def __init__(
@@ -214,35 +215,6 @@ class StableDiffusionInpaintPipelineLegacy(
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
         self.register_to_config(requires_safety_checker=requires_safety_checker)

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload
-    def enable_model_cpu_offload(self, gpu_id=0):
-        r"""
-        Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a
-        time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs.
-        Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the
-        iterative execution of the `unet`.
-        """
-        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
-            from accelerate import cpu_offload_with_hook
-        else:
-            raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
-
-        device = torch.device(f"cuda:{gpu_id}")
-
-        if self.device.type != "cpu":
-            self.to("cpu", silence_dtype_warnings=True)
-            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
-
-        hook = None
-        for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
-            _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
-
-        if self.safety_checker is not None:
-            _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
-
-        # We'll offload the last model manually.
-        self.final_offload_hook = hook
-
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
     def _encode_prompt(
         self,
@@ -761,9 +733,8 @@ class StableDiffusionInpaintPipelineLegacy(
         image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)

-        # Offload last model to CPU
-        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
-            self.final_offload_hook.offload()
+        # Offload all models
+        self.maybe_free_model_hooks()

         if not return_dict:
             return (image, has_nsfw_concept)
...
@@ -24,7 +24,7 @@ from ...image_processor import PipelineImageInput, VaeImageProcessor
 from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, UNet2DConditionModel
 from ...schedulers import KarrasDiffusionSchedulers
-from ...utils import PIL_INTERPOLATION, deprecate, is_accelerate_available, is_accelerate_version, logging
+from ...utils import PIL_INTERPOLATION, deprecate, logging
 from ...utils.torch_utils import randn_tensor
 from ..pipeline_utils import DiffusionPipeline
 from . import StableDiffusionPipelineOutput
@@ -89,6 +89,7 @@ class StableDiffusionInstructPix2PixPipeline(DiffusionPipeline, TextualInversion
         feature_extractor ([`~transformers.CLIPImageProcessor`]):
             A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
     """
+    model_cpu_offload_seq = "text_encoder->unet->vae"
     _optional_components = ["safety_checker", "feature_extractor"]

     def __init__(
@@ -392,44 +393,14 @@ class StableDiffusionInstructPix2PixPipeline(DiffusionPipeline, TextualInversion
         image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)

-        # Offload last model to CPU
-        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
-            self.final_offload_hook.offload()
+        # Offload all models
+        self.maybe_free_model_hooks()

         if not return_dict:
             return (image, has_nsfw_concept)

         return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload
-    def enable_model_cpu_offload(self, gpu_id=0):
-        r"""
-        Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a
-        time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs.
-        Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the
-        iterative execution of the `unet`.
-        """
-        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
-            from accelerate import cpu_offload_with_hook
-        else:
-            raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
-
-        device = torch.device(f"cuda:{gpu_id}")
-
-        if self.device.type != "cpu":
-            self.to("cpu", silence_dtype_warnings=True)
-            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
-
-        hook = None
-        for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
-            _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
-
-        if self.safety_checker is not None:
-            _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
-
-        # We'll offload the last model manually.
-        self.final_offload_hook = hook
-
     def _encode_prompt(
         self,
         prompt,
...
@@ -24,7 +24,7 @@ from ...image_processor import VaeImageProcessor
 from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import LMSDiscreteScheduler
-from ...utils import deprecate, is_accelerate_available, is_accelerate_version, logging
+from ...utils import deprecate, logging
 from ...utils.torch_utils import randn_tensor
 from ..pipeline_utils import DiffusionPipeline
 from . import StableDiffusionPipelineOutput
@@ -80,6 +80,7 @@ class StableDiffusionKDiffusionPipeline(DiffusionPipeline, TextualInversionLoade
         feature_extractor ([`CLIPImageProcessor`]):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
     """
+    model_cpu_offload_seq = "text_encoder->unet->vae"
     _optional_components = ["safety_checker", "feature_extractor"]

     def __init__(
@@ -128,35 +129,6 @@ class StableDiffusionKDiffusionPipeline(DiffusionPipeline, TextualInversionLoade
         sampling = getattr(library, "sampling")
         self.sampler = getattr(sampling, scheduler_type)

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload
-    def enable_model_cpu_offload(self, gpu_id=0):
-        r"""
-        Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a
-        time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs.
-        Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the
-        iterative execution of the `unet`.
-        """
-        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
-            from accelerate import cpu_offload_with_hook
-        else:
-            raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
-
-        device = torch.device(f"cuda:{gpu_id}")
-
-        if self.device.type != "cpu":
-            self.to("cpu", silence_dtype_warnings=True)
-            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
-
-        hook = None
-        for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
-            _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
-
-        if self.safety_checker is not None:
-            _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
-
-        # We'll offload the last model manually.
-        self.final_offload_hook = hook
-
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
     def _encode_prompt(
         self,
@@ -630,9 +602,8 @@ class StableDiffusionKDiffusionPipeline(DiffusionPipeline, TextualInversionLoade
         image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)

-        # Offload last model to CPU
-        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
-            self.final_offload_hook.offload()
+        # Offload all models
+        self.maybe_free_model_hooks()

         if not return_dict:
             return (image, has_nsfw_concept)
...
@@ -78,6 +78,7 @@ class StableDiffusionLatentUpscalePipeline(DiffusionPipeline):
         scheduler ([`SchedulerMixin`]):
             A [`EulerDiscreteScheduler`] to be used in combination with `unet` to denoise the encoded image latents.
     """
+    model_cpu_offload_seq = "text_encoder->unet->vae"

     def __init__(
         self,
...
@@ -29,8 +29,6 @@ from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
     BaseOutput,
     deprecate,
-    is_accelerate_available,
-    is_accelerate_version,
     logging,
     replace_example_docstring,
 )
@@ -114,6 +112,7 @@ class StableDiffusionLDM3DPipeline(
         feature_extractor ([`~transformers.CLIPImageProcessor`]):
             A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
     """
+    model_cpu_offload_seq = "text_encoder->unet->vae"
     _optional_components = ["safety_checker", "feature_extractor"]

     def __init__(
@@ -191,35 +190,6 @@ class StableDiffusionLDM3DPipeline(
         """
         self.vae.disable_tiling()

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload
-    def enable_model_cpu_offload(self, gpu_id=0):
-        r"""
-        Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a
-        time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs.
-        Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the
-        iterative execution of the `unet`.
-        """
-        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
-            from accelerate import cpu_offload_with_hook
-        else:
-            raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
-
-        device = torch.device(f"cuda:{gpu_id}")
-
-        if self.device.type != "cpu":
-            self.to("cpu", silence_dtype_warnings=True)
-            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
-
-        hook = None
-        for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
-            _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
-
-        if self.safety_checker is not None:
-            _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
-
-        # We'll offload the last model manually.
-        self.final_offload_hook = hook
-
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
     def _encode_prompt(
         self,
@@ -696,9 +666,8 @@ class StableDiffusionLDM3DPipeline(
         rgb, depth = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)

-        # Offload last model to CPU
-        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
-            self.final_offload_hook.offload()
+        # Offload all models
+        self.maybe_free_model_hooks()

         if not return_dict:
             return ((rgb, depth), has_nsfw_concept)
...
@@ -66,6 +66,7 @@ class StableDiffusionModelEditingPipeline(DiffusionPipeline, TextualInversionLoa
         with_augs ([`list`]):
             Textual augmentations to apply while editing the text-to-image model. Set to `[]` for no augmentations.
     """
+    model_cpu_offload_seq = "text_encoder->unet->vae"
     _optional_components = ["safety_checker", "feature_extractor"]

     def __init__(
@@ -787,9 +788,8 @@ class StableDiffusionModelEditingPipeline(DiffusionPipeline, TextualInversionLoa
         image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)

-        # Offload last model to CPU
-        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
-            self.final_offload_hook.offload()
+        # Offload all models
+        self.maybe_free_model_hooks()

         if not return_dict:
             return (image, has_nsfw_concept)
...
@@ -78,6 +78,7 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM
         feature_extractor ([`~transformers.CLIPImageProcessor`]):
             A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
     """
+    model_cpu_offload_seq = "text_encoder->unet->vae"
     _optional_components = ["safety_checker", "feature_extractor"]

     def __init__(
...
@@ -25,8 +25,6 @@ from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
     deprecate,
-    is_accelerate_available,
-    is_accelerate_version,
     logging,
     replace_example_docstring,
 )
@@ -95,6 +93,7 @@ class StableDiffusionParadigmsPipeline(
         feature_extractor ([`~transformers.CLIPImageProcessor`]):
             A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
     """
+    model_cpu_offload_seq = "text_encoder->unet->vae"
     _optional_components = ["safety_checker", "feature_extractor"]

     def __init__(
@@ -175,35 +174,6 @@ class StableDiffusionParadigmsPipeline(
         """
         self.vae.disable_tiling()

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload
-    def enable_model_cpu_offload(self, gpu_id=0):
-        r"""
-        Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a
-        time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs.
-        Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the
-        iterative execution of the `unet`.
-        """
-        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
-            from accelerate import cpu_offload_with_hook
-        else:
-            raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
-
-        device = torch.device(f"cuda:{gpu_id}")
-
-        if self.device.type != "cpu":
-            self.to("cpu", silence_dtype_warnings=True)
-            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
-
-        hook = None
-        for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
-            _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
-
-        if self.safety_checker is not None:
-            _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
-
-        # We'll offload the last model manually.
-        self.final_offload_hook = hook
-
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
     def _encode_prompt(
         self,
@@ -803,9 +773,8 @@ class StableDiffusionParadigmsPipeline(
         image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)

-        # Offload last model to CPU
-        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
-            self.final_offload_hook.offload()
+        # Offload all models
+        self.maybe_free_model_hooks()

         if not return_dict:
             return (image, has_nsfw_concept)
...
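For reference, the removed docstrings contrast this mode with `enable_sequential_cpu_offload`: model offload keeps one whole sub-model on the GPU at a time, giving smaller memory savings but much better performance because the unet stays resident across the iterative denoising loop. A usage sketch of the two alternatives (model id is a placeholder):

```python
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
)

# Option 1: model offload -- one whole sub-model on the GPU at a time.
# Lower savings than sequential offload, but much faster, because the unet
# stays on the GPU across the iterative denoising loop.
pipe.enable_model_cpu_offload()

# Option 2 (alternative; do not combine with option 1): sequential offload --
# offloads at the torch.nn.Module level for maximal memory savings, at a
# substantial speed cost.
# pipe.enable_sequential_cpu_offload()
```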