Unverified commit 9c856118, authored by Patrick von Platen, committed by GitHub

Add model offload to x4 upscaler (#3187)

* Add model offload to x4 upscaler

* fix
parent 9bce375f
@@ -23,7 +23,7 @@ from transformers import CLIPTextModel, CLIPTokenizer
 from ...loaders import TextualInversionLoaderMixin
 from ...models import AutoencoderKL, UNet2DConditionModel
 from ...schedulers import DDPMScheduler, KarrasDiffusionSchedulers
-from ...utils import deprecate, is_accelerate_available, logging, randn_tensor
+from ...utils import deprecate, is_accelerate_available, is_accelerate_version, logging, randn_tensor
 from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
@@ -129,10 +129,36 @@ class StableDiffusionUpscalePipeline(DiffusionPipeline, TextualInversionLoaderMixin):
         device = torch.device(f"cuda:{gpu_id}")
 
-        for cpu_offloaded_model in [self.unet, self.text_encoder]:
+        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
             if cpu_offloaded_model is not None:
                 cpu_offload(cpu_offloaded_model, device)
 
+    def enable_model_cpu_offload(self, gpu_id=0):
+        r"""
+        Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
+        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
+        method is called, and the model remains on GPU until the next model runs. Memory savings are lower than with
+        `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
+        """
+        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
+            from accelerate import cpu_offload_with_hook
+        else:
+            raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
+
+        device = torch.device(f"cuda:{gpu_id}")
+
+        if self.device.type != "cpu":
+            self.to("cpu", silence_dtype_warnings=True)
+            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
+
+        hook = None
+        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
+            if cpu_offloaded_model is not None:
+                _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
+
+        # We'll offload the last model manually.
+        self.final_offload_hook = hook
+
     @property
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
     def _execution_device(self):
@@ -647,6 +673,10 @@ class StableDiffusionUpscalePipeline(DiffusionPipeline, TextualInversionLoaderMixin):
         self.vae.to(dtype=torch.float32)
         image = self.decode_latents(latents.float())
 
+        # Offload last model to CPU
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.final_offload_hook.offload()
+
         # 11. Convert to PIL
         if output_type == "pil":
             image = self.numpy_to_pil(image)
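
A minimal usage sketch of the new `enable_model_cpu_offload` entry point on the x4 upscaler (not part of the diff; the checkpoint id and the input image path are illustrative assumptions):

import torch
from PIL import Image
from diffusers import StableDiffusionUpscalePipeline

# Assumed checkpoint id for the x4 upscaler; adjust as needed.
pipe = StableDiffusionUpscalePipeline.from_pretrained(
    "stabilityai/stable-diffusion-x4-upscaler", torch_dtype=torch.float16
)

# Instead of pipe.to("cuda"): keep the weights on the CPU and move each
# sub-model (text_encoder -> unet -> vae) to the GPU only while it runs.
pipe.enable_model_cpu_offload()

low_res_img = Image.open("low_res.png").convert("RGB").resize((128, 128))
upscaled = pipe(prompt="a white cat", image=low_res_img).images[0]
upscaled.save("upscaled.png")

Compared to `enable_sequential_cpu_offload`, only one whole sub-model sits on the GPU at a time, so peak memory drops while the denoising loop still runs the UNet without per-layer transfers.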
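The mechanism behind the new method is accelerate's `cpu_offload_with_hook` (v0.17.0 or newer): each returned hook is handed to the next model as `prev_module_hook`, so calling a model's `forward` moves it onto the GPU and pushes the previous model back to the CPU. A toy sketch with two stand-in modules (illustrative only, assuming a CUDA device is available):

import torch
from accelerate import cpu_offload_with_hook

first = torch.nn.Linear(8, 8)
second = torch.nn.Linear(8, 8)
device = torch.device("cuda:0")

# Chain the hooks: the hook returned for `first` is handed to `second` via
# `prev_module_hook`, so running `second` offloads `first` back to the CPU.
_, hook = cpu_offload_with_hook(first, device)
_, hook = cpu_offload_with_hook(second, device, prev_module_hook=hook)

x = torch.randn(1, 8).to(device)
x = first(x)   # `first` is moved to the GPU on demand by its hook
x = second(x)  # `first` is offloaded to the CPU, `second` moves to the GPU

# No later model triggers the last hook, so it has to be released manually;
# this is what `self.final_offload_hook.offload()` does at the end of __call__.
hook.offload()

Because nothing runs after the last model in the chain, its hook never fires on its own, which is why the pipeline stores it as `final_offload_hook` and offloads it explicitly after decoding.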