Unverified commit 93579650 authored by Patrick von Platen, committed by GitHub

Refactor model offload (#4514)



* [Draft] Refactor model offload

* [Draft] Refactor model offload

* Apply suggestions from code review

* cpu offload updates

* remove model cpu offload from individual pipelines

* add hook to offload models to cpu

* clean up

* model offload

* add model cpu offload string

* make style

* clean up

* fixes for offload issues

* fix tests issues

* resolve merge conflicts

* update src/diffusers/pipelines/pipeline_utils.py
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* make style

* Update src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py

---------
Co-authored-by: Dhruv Nair <dhruv.nair@gmail.com>
parent 16a056a7
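The diff below removes the near-identical `enable_model_cpu_offload` method from each pipeline and replaces it with a per-pipeline `model_cpu_offload_seq` class attribute consumed by a shared implementation in `DiffusionPipeline`, plus a `maybe_free_model_hooks()` call at the end of `__call__`. A minimal sketch of that pattern follows for orientation only; it is not the actual `pipeline_utils.py` code, and the `_all_hooks` attribute name is an assumption.

import torch
from accelerate import cpu_offload_with_hook

class ModelOffloadSketch:
    # Each pipeline now only declares the order in which its models run.
    model_cpu_offload_seq = "text_encoder->unet->vae"

    def enable_model_cpu_offload(self, gpu_id=0):
        device = torch.device(f"cuda:{gpu_id}")
        hook = None
        self._all_hooks = []
        # Chain accelerate hooks: each model is moved to the GPU when its
        # forward runs and offloaded again when the next model in the
        # sequence is called.
        for name in self.model_cpu_offload_seq.split("->"):
            model = getattr(self, name, None)
            if model is not None:
                _, hook = cpu_offload_with_hook(model, device, prev_module_hook=hook)
                self._all_hooks.append(hook)

    def maybe_free_model_hooks(self):
        # Called at the end of __call__ to return every model to the CPU.
        for hook in getattr(self, "_all_hooks", []):
            hook.offload()
        torch.cuda.empty_cache()

User-facing behavior is unchanged: `pipe.enable_model_cpu_offload()` is still called once on the pipeline, and each `__call__` now ends with `self.maybe_free_model_hooks()` instead of checking `final_offload_hook` manually.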
@@ -19,8 +19,6 @@ import torch
from packaging import version
from transformers import CLIPImageProcessor, XLMRobertaTokenizer
from diffusers.utils import is_accelerate_available, is_accelerate_version
from ...configuration_utils import FrozenDict
from ...image_processor import VaeImageProcessor
from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
@@ -100,6 +98,7 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL
feature_extractor ([`~transformers.CLIPImageProcessor`]):
A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
"""
model_cpu_offload_seq = "text_encoder->unet->vae"
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
@@ -221,34 +220,6 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL
"""
self.vae.disable_tiling()
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a
time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs.
Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the
iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
hook = None
for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
if self.safety_checker is not None:
_, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
def _encode_prompt(
self,
prompt,
@@ -750,9 +721,8 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL
image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (image, has_nsfw_concept)
...
@@ -21,8 +21,6 @@ import torch
from packaging import version
from transformers import CLIPImageProcessor, XLMRobertaTokenizer
from diffusers.utils import is_accelerate_available, is_accelerate_version
from ...configuration_utils import FrozenDict
from ...image_processor import PipelineImageInput, VaeImageProcessor
from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
@@ -127,6 +125,7 @@ class AltDiffusionImg2ImgPipeline(
feature_extractor ([`~transformers.CLIPImageProcessor`]):
A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
"""
model_cpu_offload_seq = "text_encoder->unet->vae"
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
@@ -219,34 +218,6 @@ class AltDiffusionImg2ImgPipeline(
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.register_to_config(requires_safety_checker=requires_safety_checker)
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a
time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs.
Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the
iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
hook = None
for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
if self.safety_checker is not None:
_, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
def _encode_prompt(
self,
prompt,
@@ -773,9 +744,8 @@ class AltDiffusionImg2ImgPipeline(
image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (image, has_nsfw_concept)
...
@@ -72,6 +72,7 @@ class AudioLDMPipeline(DiffusionPipeline):
vocoder ([`~transformers.SpeechT5HifiGan`]):
Vocoder of class `SpeechT5HifiGan`.
"""
model_cpu_offload_seq = "text_encoder->unet->vae"
def __init__(
self,
...
@@ -947,6 +947,8 @@ class AudioLDM2Pipeline(DiffusionPipeline):
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
self.maybe_free_model_hooks()
# 8. Post-processing
if not output_type == "latent":
latents = 1 / self.vae.config.scaling_factor * latents
...
@@ -5,8 +5,6 @@ import torch
from ...models import UNet2DModel
from ...schedulers import CMStochasticIterativeScheduler
from ...utils import (
is_accelerate_available,
is_accelerate_version,
logging,
replace_example_docstring,
)
@@ -62,6 +60,7 @@ class ConsistencyModelPipeline(DiffusionPipeline):
A scheduler to be used in combination with `unet` to denoise the encoded image latents. Currently only
compatible with [`CMStochasticIterativeScheduler`].
"""
model_cpu_offload_seq = "unet"
def __init__(self, unet: UNet2DModel, scheduler: CMStochasticIterativeScheduler) -> None:
super().__init__()
@@ -73,34 +72,6 @@ class ConsistencyModelPipeline(DiffusionPipeline):
self.safety_checker = None
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a
time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs.
Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the
iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
hook = None
for cpu_offloaded_model in [self.unet]:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
if self.safety_checker is not None:
_, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
def prepare_latents(self, batch_size, num_channels, height, width, dtype, device, generator, latents=None):
shape = (batch_size, num_channels, height, width)
if isinstance(generator, list) and len(generator) != batch_size:
@@ -280,9 +251,8 @@ class ConsistencyModelPipeline(DiffusionPipeline):
# 6. Post-process image sample
image = self.postprocess_image(sample, output_type=output_type)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (image,)
...
@@ -29,8 +29,6 @@ from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import (
deprecate,
is_accelerate_available,
is_accelerate_version,
logging,
replace_example_docstring,
)
@@ -125,6 +123,7 @@ class StableDiffusionControlNetPipeline(
feature_extractor ([`~transformers.CLIPImageProcessor`]):
A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
"""
model_cpu_offload_seq = "text_encoder->unet->vae"
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
@@ -210,34 +209,6 @@ class StableDiffusionControlNetPipeline(
"""
self.vae.disable_tiling()
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
hook = None
for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
if self.safety_checker is not None:
# the safety checker can offload the vae again
_, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
# control net hook has be manually offloaded as it alternates with unet
cpu_offload_with_hook(self.controlnet, device)
# We'll offload the last model manually.
self.final_offload_hook = hook
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
self,
@@ -1031,9 +1002,8 @@ class StableDiffusionControlNetPipeline(
image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (image, has_nsfw_concept)
...
@@ -28,8 +28,6 @@ from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import (
deprecate,
is_accelerate_available,
is_accelerate_version,
logging,
replace_example_docstring,
)
@@ -149,6 +147,7 @@ class StableDiffusionControlNetImg2ImgPipeline(
feature_extractor ([`~transformers.CLIPImageProcessor`]):
A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
"""
model_cpu_offload_seq = "text_encoder->unet->vae"
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
@@ -234,34 +233,6 @@ class StableDiffusionControlNetImg2ImgPipeline(
"""
self.vae.disable_tiling()
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
hook = None
for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
if self.safety_checker is not None:
# the safety checker can offload the vae again
_, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
# control net hook has be manually offloaded as it alternates with unet
cpu_offload_with_hook(self.controlnet, device)
# We'll offload the last model manually.
self.final_offload_hook = hook
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
self,
@@ -1107,9 +1078,8 @@ class StableDiffusionControlNetImg2ImgPipeline(
image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (image, has_nsfw_concept)
...
@@ -30,8 +30,6 @@ from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import (
deprecate,
is_accelerate_available,
is_accelerate_version,
logging,
replace_example_docstring,
)
@@ -273,6 +271,7 @@ class StableDiffusionControlNetInpaintPipeline(
feature_extractor ([`~transformers.CLIPImageProcessor`]):
A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
"""
model_cpu_offload_seq = "text_encoder->unet->vae"
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
@@ -361,34 +360,6 @@ class StableDiffusionControlNetInpaintPipeline(
"""
self.vae.disable_tiling()
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
hook = None
for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
if self.safety_checker is not None:
# the safety checker can offload the vae again
_, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
# control net hook has be manually offloaded as it alternates with unet
cpu_offload_with_hook(self.controlnet, device)
# We'll offload the last model manually.
self.final_offload_hook = hook
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
self,
@@ -1373,9 +1344,8 @@ class StableDiffusionControlNetInpaintPipeline(
image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (image, has_nsfw_concept)
...
@@ -166,6 +166,7 @@ class StableDiffusionXLControlNetInpaintPipeline(DiffusionPipeline, LoraLoaderMi
A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
[`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
"""
model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
_optional_components = ["tokenizer", "text_encoder"]
def __init__(
@@ -248,38 +249,6 @@ class StableDiffusionXLControlNetInpaintPipeline(DiffusionPipeline, LoraLoaderMi
"""
self.vae.disable_tiling()
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
model_sequence = (
[self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
)
model_sequence.extend([self.unet, self.vae])
hook = None
for cpu_offloaded_model in model_sequence:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
cpu_offload_with_hook(self.controlnet, device)
# We'll offload the last model manually.
self.final_offload_hook = hook
# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
def encode_prompt(
self,
...
@@ -145,6 +145,9 @@ class StableDiffusionXLControlNetPipeline(
watermark output images. If not defined, it defaults to `True` if the package is installed; otherwise no
watermarker is used.
"""
model_cpu_offload_seq = (
"text_encoder->text_encoder_2->unet->vae" # leave controlnet out on purpose because it iterates with unet
)
def __init__(
self,
@@ -221,38 +224,6 @@ class StableDiffusionXLControlNetPipeline(
"""
self.vae.disable_tiling()
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
model_sequence = (
[self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
)
model_sequence.extend([self.unet, self.vae])
hook = None
for cpu_offloaded_model in model_sequence:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
cpu_offload_with_hook(self.controlnet, device)
# We'll offload the last model manually.
self.final_offload_hook = hook
# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
def encode_prompt(
self,
@@ -1170,12 +1141,10 @@ class StableDiffusionXLControlNetPipeline(
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
# If we do sequential model offloading, let's offload unet and controlnet
# manually for max memory savings
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.unet.to("cpu")
self.controlnet.to("cpu")
torch.cuda.empty_cache()
if self.vae.dtype == torch.float16 and self.vae.config.force_upcast:
self.upcast_vae()
latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
if not output_type == "latent":
# make sure the VAE is in float32 mode, as it overflows in float16
@@ -1192,17 +1161,16 @@ class StableDiffusionXLControlNetPipeline(
self.vae.to(dtype=torch.float16)
else:
image = latents
return StableDiffusionXLPipelineOutput(images=image)
# apply watermark if available
if self.watermark is not None:
image = self.watermark.apply_watermark(image)
if not output_type == "latent":
# apply watermark if available
if self.watermark is not None:
image = self.watermark.apply_watermark(image)
image = self.image_processor.postprocess(image, output_type=output_type)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (image,)
...
@@ -36,8 +36,6 @@ from ...models.attention_processor import (
from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import (
is_accelerate_available,
is_accelerate_version,
logging,
replace_example_docstring,
)
@@ -179,6 +177,7 @@ class StableDiffusionXLControlNetImg2ImgPipeline(DiffusionPipeline, TextualInver
watermark output images. If not defined, it will default to True if the package is installed, otherwise no
watermarker will be used.
"""
model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
_optional_components = ["tokenizer", "text_encoder"]
def __init__(
@@ -258,38 +257,6 @@ class StableDiffusionXLControlNetImg2ImgPipeline(DiffusionPipeline, TextualInver
"""
self.vae.disable_tiling()
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
model_sequence = (
[self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
)
model_sequence.extend([self.unet, self.vae])
hook = None
for cpu_offloaded_model in model_sequence:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
cpu_offload_with_hook(self.controlnet, device)
# We'll offload the last model manually.
self.final_offload_hook = hook
# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
def encode_prompt(
self,
...
@@ -39,6 +39,7 @@ class DanceDiffusionPipeline(DiffusionPipeline):
A scheduler to be used in combination with `unet` to denoise the encoded audio latents. Can be one of
[`IPNDMScheduler`].
"""
model_cpu_offload_seq = "unet"
def __init__(self, unet, scheduler):
super().__init__()
...
@@ -35,6 +35,7 @@ class DDIMPipeline(DiffusionPipeline):
A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of
[`DDPMScheduler`], or [`DDIMScheduler`].
"""
model_cpu_offload_seq = "unet"
def __init__(self, unet, scheduler):
super().__init__()
...
@@ -35,6 +35,7 @@ class DDPMPipeline(DiffusionPipeline):
A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of
[`DDPMScheduler`], or [`DDIMScheduler`].
"""
model_cpu_offload_seq = "unet"
def __init__(self, unet, scheduler):
super().__init__()
...
@@ -13,7 +13,6 @@ from ...schedulers import DDPMScheduler
from ...utils import (
BACKENDS_MAPPING,
is_accelerate_available,
is_accelerate_version,
is_bs4_available,
is_ftfy_available,
logging,
@@ -103,6 +102,7 @@ class IFPipeline(DiffusionPipeline, LoraLoaderMixin):
) # noqa
_optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"]
model_cpu_offload_seq = "text_encoder->unet"
def __init__(
self,
@@ -144,47 +144,6 @@ class IFPipeline(DiffusionPipeline, LoraLoaderMixin):
)
self.register_to_config(requires_safety_checker=requires_safety_checker)
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
hook = None
if self.text_encoder is not None:
_, hook = cpu_offload_with_hook(self.text_encoder, device, prev_module_hook=hook)
# Accelerate will move the next model to the device _before_ calling the offload hook of the
# previous model. This will cause both models to be present on the device at the same time.
# IF uses T5 for its text encoder which is really large. We can manually call the offload
# hook for the text encoder to ensure it's moved to the cpu before the unet is moved to
# the GPU.
self.text_encoder_offload_hook = hook
_, hook = cpu_offload_with_hook(self.unet, device, prev_module_hook=hook)
# if the safety checker isn't called, `unet_offload_hook` will have to be called to manually offload the unet
self.unet_offload_hook = hook
if self.safety_checker is not None:
_, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
def remove_all_hooks(self):
if is_accelerate_available():
from accelerate.hooks import remove_hook_from_module
@@ -806,9 +765,8 @@ class IFPipeline(DiffusionPipeline, LoraLoaderMixin):
# 9. Run safety checker
image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (image, nsfw_detected, watermark_detected)
...
@@ -16,7 +16,6 @@ from ...utils import (
BACKENDS_MAPPING,
PIL_INTERPOLATION,
is_accelerate_available,
is_accelerate_version,
is_bs4_available,
is_ftfy_available,
logging,
@@ -127,6 +126,7 @@ class IFImg2ImgPipeline(DiffusionPipeline, LoraLoaderMixin):
) # noqa
_optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"]
model_cpu_offload_seq = "text_encoder->unet"
def __init__(
self,
@@ -168,48 +168,6 @@ class IFImg2ImgPipeline(DiffusionPipeline, LoraLoaderMixin):
)
self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_model_cpu_offload
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
hook = None
if self.text_encoder is not None:
_, hook = cpu_offload_with_hook(self.text_encoder, device, prev_module_hook=hook)
# Accelerate will move the next model to the device _before_ calling the offload hook of the
# previous model. This will cause both models to be present on the device at the same time.
# IF uses T5 for its text encoder which is really large. We can manually call the offload
# hook for the text encoder to ensure it's moved to the cpu before the unet is moved to
# the GPU.
self.text_encoder_offload_hook = hook
_, hook = cpu_offload_with_hook(self.unet, device, prev_module_hook=hook)
# if the safety checker isn't called, `unet_offload_hook` will have to be called to manually offload the unet
self.unet_offload_hook = hook
if self.safety_checker is not None:
_, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.remove_all_hooks
def remove_all_hooks(self):
if is_accelerate_available():
@@ -930,9 +888,8 @@ class IFImg2ImgPipeline(DiffusionPipeline, LoraLoaderMixin):
# 9. Run safety checker
image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (image, nsfw_detected, watermark_detected)
...
@@ -17,7 +17,6 @@ from ...utils import (
BACKENDS_MAPPING,
PIL_INTERPOLATION,
is_accelerate_available,
is_accelerate_version,
is_bs4_available,
is_ftfy_available,
logging,
@@ -131,6 +130,7 @@ class IFImg2ImgSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin):
) # noqa
_optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor"]
model_cpu_offload_seq = "text_encoder->unet"
def __init__(
self,
@@ -179,48 +179,6 @@ class IFImg2ImgSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin):
)
self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_model_cpu_offload
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
hook = None
if self.text_encoder is not None:
_, hook = cpu_offload_with_hook(self.text_encoder, device, prev_module_hook=hook)
# Accelerate will move the next model to the device _before_ calling the offload hook of the
# previous model. This will cause both models to be present on the device at the same time.
# IF uses T5 for its text encoder which is really large. We can manually call the offload
# hook for the text encoder to ensure it's moved to the cpu before the unet is moved to
# the GPU.
self.text_encoder_offload_hook = hook
_, hook = cpu_offload_with_hook(self.unet, device, prev_module_hook=hook)
# if the safety checker isn't called, `unet_offload_hook` will have to be called to manually offload the unet
self.unet_offload_hook = hook
if self.safety_checker is not None:
_, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.remove_all_hooks
def remove_all_hooks(self):
if is_accelerate_available():
@@ -1048,9 +1006,8 @@ class IFImg2ImgSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin):
# 11. Run safety checker
image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (image, nsfw_detected, watermark_detected)
...
@@ -16,7 +16,6 @@ from ...utils import (
BACKENDS_MAPPING,
PIL_INTERPOLATION,
is_accelerate_available,
is_accelerate_version,
is_bs4_available,
is_ftfy_available,
logging,
@@ -130,6 +129,7 @@ class IFInpaintingPipeline(DiffusionPipeline, LoraLoaderMixin):
) # noqa
_optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"]
model_cpu_offload_seq = "text_encoder->unet"
def __init__(
self,
@@ -171,48 +171,6 @@ class IFInpaintingPipeline(DiffusionPipeline, LoraLoaderMixin):
)
self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_model_cpu_offload
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
hook = None
if self.text_encoder is not None:
_, hook = cpu_offload_with_hook(self.text_encoder, device, prev_module_hook=hook)
# Accelerate will move the next model to the device _before_ calling the offload hook of the
# previous model. This will cause both models to be present on the device at the same time.
# IF uses T5 for its text encoder which is really large. We can manually call the offload
# hook for the text encoder to ensure it's moved to the cpu before the unet is moved to
# the GPU.
self.text_encoder_offload_hook = hook
_, hook = cpu_offload_with_hook(self.unet, device, prev_module_hook=hook)
# if the safety checker isn't called, `unet_offload_hook` will have to be called to manually offload the unet
self.unet_offload_hook = hook
if self.safety_checker is not None:
_, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.remove_all_hooks
def remove_all_hooks(self):
if is_accelerate_available():
@@ -1049,9 +1007,8 @@ class IFInpaintingPipeline(DiffusionPipeline, LoraLoaderMixin):
# 9. Run safety checker
image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (image, nsfw_detected, watermark_detected)
...
@@ -17,7 +17,6 @@ from ...utils import (
BACKENDS_MAPPING,
PIL_INTERPOLATION,
is_accelerate_available,
is_accelerate_version,
is_bs4_available,
is_ftfy_available,
logging,
@@ -132,6 +131,7 @@ class IFInpaintingSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin):
r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}"
) # noqa
model_cpu_offload_seq = "text_encoder->unet"
_optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"]
def __init__(
@@ -181,48 +181,6 @@ class IFInpaintingSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin):
)
self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_model_cpu_offload
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
hook = None
if self.text_encoder is not None:
_, hook = cpu_offload_with_hook(self.text_encoder, device, prev_module_hook=hook)
# Accelerate will move the next model to the device _before_ calling the offload hook of the
# previous model. This will cause both models to be present on the device at the same time.
# IF uses T5 for its text encoder which is really large. We can manually call the offload
# hook for the text encoder to ensure it's moved to the cpu before the unet is moved to
# the GPU.
self.text_encoder_offload_hook = hook
_, hook = cpu_offload_with_hook(self.unet, device, prev_module_hook=hook)
# if the safety checker isn't called, `unet_offload_hook` will have to be called to manually offload the unet
self.unet_offload_hook = hook
if self.safety_checker is not None:
_, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.remove_all_hooks
def remove_all_hooks(self):
if is_accelerate_available():
...
@@ -16,7 +16,6 @@ from ...schedulers import DDPMScheduler
from ...utils import (
BACKENDS_MAPPING,
is_accelerate_available,
is_accelerate_version,
is_bs4_available,
is_ftfy_available,
logging,
@@ -89,6 +88,7 @@ class IFSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin):
) # noqa
_optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"]
model_cpu_offload_seq = "text_encoder->unet"
def __init__(
self,
@@ -137,48 +137,6 @@ class IFSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin):
)
self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_model_cpu_offload
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
hook = None
if self.text_encoder is not None:
_, hook = cpu_offload_with_hook(self.text_encoder, device, prev_module_hook=hook)
# Accelerate will move the next model to the device _before_ calling the offload hook of the
# previous model. This will cause both models to be present on the device at the same time.
# IF uses T5 for its text encoder which is really large. We can manually call the offload
# hook for the text encoder to ensure it's moved to the cpu before the unet is moved to
# the GPU.
self.text_encoder_offload_hook = hook
_, hook = cpu_offload_with_hook(self.unet, device, prev_module_hook=hook)
# if the safety checker isn't called, `unet_offload_hook` will have to be called to manually offload the unet
self.unet_offload_hook = hook
if self.safety_checker is not None:
_, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.remove_all_hooks
def remove_all_hooks(self):
if is_accelerate_available():
@@ -904,9 +862,8 @@ class IFSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin):
# 10. Run safety checker
image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype)
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (image, nsfw_detected, watermark_detected)
...