"tests/nn/vscode:/vscode.git/clone" did not exist on "8a42a8e3ff1c33ccde9ee603426b17a29cfead3c"
Unverified Commit 93579650 authored by Patrick von Platen, committed by GitHub

Refactor model offload (#4514)



* [Draft] Refactor model offload

* [Draft] Refactor model offload

* Apply suggestions from code review

* cpu offload updates

* remove model cpu offload from individual pipelines

* add hook to offload models to cpu

* clean up

* model offload

* add model cpu offload string

* make style

* clean up

* fixes for offload issues

* fix tests issues

* resolve merge conflicts

* update src/diffusers/pipelines/pipeline_utils.py
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* make style

* Update src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py

---------
Co-authored-by: Dhruv Nair <dhruv.nair@gmail.com>
parent 16a056a7
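In short, this PR replaces the near-identical `enable_model_cpu_offload` implementations copied across pipelines with a single base-class version driven by a new `model_cpu_offload_seq` class attribute, and pipelines now finish `__call__` with `self.maybe_free_model_hooks()`. A minimal usage sketch of the refactored API (the checkpoint id is illustrative, and it assumes the loaded pipeline defines `model_cpu_offload_seq`):

import torch
from diffusers import DiffusionPipeline

# Illustrative checkpoint; after this refactor any pipeline that defines
# `model_cpu_offload_seq` inherits the offload logic from DiffusionPipeline.
pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()  # one whole model on the GPU at a time

image = pipe("an astronaut riding a horse").images[0]
# __call__ now ends with self.maybe_free_model_hooks(), which offloads the
# hooked models and re-installs the hooks for the next call.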
......@@ -43,6 +43,7 @@ class DiTPipeline(DiffusionPipeline):
scheduler ([`DDIMScheduler`]):
A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
"""
model_cpu_offload_seq = "transformer->vae"
def __init__(
self,
......
......@@ -22,8 +22,6 @@ from transformers import (
from ...models import UNet2DConditionModel, VQModel
from ...schedulers import DDIMScheduler, DDPMScheduler
from ...utils import (
is_accelerate_available,
is_accelerate_version,
logging,
replace_example_docstring,
)
......@@ -95,6 +93,8 @@ class KandinskyPipeline(DiffusionPipeline):
MoVQ Decoder to generate the image from the latents.
"""
model_cpu_offload_seq = "text_encoder->unet->movq"
def __init__(
self,
text_encoder: MultilingualCLIP,
......@@ -228,31 +228,6 @@ class KandinskyPipeline(DiffusionPipeline):
return prompt_embeds, text_encoder_hidden_states, text_mask
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
hook = None
for cpu_offloaded_model in [self.text_encoder, self.unet, self.movq]:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
......
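The method removed above was duplicated almost verbatim across the pipelines in this PR. Its core is a chain of accelerate hooks: each model is moved to the GPU when its `forward` runs, and passing the previous hook as `prev_module_hook` makes that move offload the predecessor. A self-contained sketch of the pattern (assumes accelerate >= 0.17; the model list stands in for e.g. `[text_encoder, unet, movq]`):

import torch
from accelerate import cpu_offload_with_hook

def chain_offload(models, device=torch.device("cuda:0")):
    # Hedged sketch of the removed per-pipeline logic: each call returns the
    # wrapped model and a hook; linking hooks means loading model N onto the
    # GPU offloads model N-1 back to the CPU.
    hook = None
    for model in models:
        _, hook = cpu_offload_with_hook(model, device, prev_module_hook=hook)
    return hook  # the caller offloads the last model via hook.offload()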
......@@ -142,6 +142,7 @@ class KandinskyCombinedPipeline(DiffusionPipeline):
"""
_load_connected_pipes = True
model_cpu_offload_seq = "text_encoder->unet->movq->prior_prior->prior_image_encoder->prior_text_encoder"
def __init__(
self,
......@@ -191,16 +192,6 @@ class KandinskyCombinedPipeline(DiffusionPipeline):
def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
self.prior_pipe.enable_model_cpu_offload()
self.decoder_pipe.enable_model_cpu_offload()
def enable_sequential_cpu_offload(self, gpu_id=0):
r"""
Offloads all models (`unet`, `text_encoder`, `vae`, and `safety checker` state dicts) to CPU using 🤗
......@@ -365,6 +356,7 @@ class KandinskyImg2ImgCombinedPipeline(DiffusionPipeline):
"""
_load_connected_pipes = True
model_cpu_offload_seq = "prior_text_encoder->prior_image_encoder->prior_prior->" "text_encoder->unet->movq"
def __init__(
self,
......@@ -414,16 +406,6 @@ class KandinskyImg2ImgCombinedPipeline(DiffusionPipeline):
def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
self.prior_pipe.enable_model_cpu_offload()
self.decoder_pipe.enable_model_cpu_offload()
def enable_sequential_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
......@@ -611,6 +593,7 @@ class KandinskyInpaintCombinedPipeline(DiffusionPipeline):
"""
_load_connected_pipes = True
model_cpu_offload_seq = "prior_text_encoder->prior_image_encoder->prior_prior->" "text_encoder->unet->movq"
def __init__(
self,
......@@ -660,16 +643,6 @@ class KandinskyInpaintCombinedPipeline(DiffusionPipeline):
def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
self.prior_pipe.enable_model_cpu_offload()
self.decoder_pipe.enable_model_cpu_offload()
def enable_sequential_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
......
......@@ -25,8 +25,6 @@ from transformers import (
from ...models import UNet2DConditionModel, VQModel
from ...schedulers import DDIMScheduler
from ...utils import (
is_accelerate_available,
is_accelerate_version,
logging,
replace_example_docstring,
)
......@@ -117,6 +115,8 @@ class KandinskyImg2ImgPipeline(DiffusionPipeline):
MoVQ image encoder and decoder
"""
model_cpu_offload_seq = "text_encoder->unet->movq"
def __init__(
self,
text_encoder: MultilingualCLIP,
......@@ -263,31 +263,6 @@ class KandinskyImg2ImgPipeline(DiffusionPipeline):
return prompt_embeds, text_encoder_hidden_states, text_mask
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
hook = None
for cpu_offloaded_model in [self.text_encoder, self.unet, self.movq]:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
# add_noise method to overwrite the one in the scheduler because it uses a different beta schedule for adding noise vs. sampling
def add_noise(
self,
......
......@@ -29,8 +29,6 @@ from ... import __version__
from ...models import UNet2DConditionModel, VQModel
from ...schedulers import DDIMScheduler
from ...utils import (
is_accelerate_available,
is_accelerate_version,
logging,
replace_example_docstring,
)
......@@ -259,6 +257,8 @@ class KandinskyInpaintPipeline(DiffusionPipeline):
MoVQ image encoder and decoder
"""
model_cpu_offload_seq = "text_encoder->unet->movq"
def __init__(
self,
text_encoder: MultilingualCLIP,
......@@ -393,31 +393,6 @@ class KandinskyInpaintPipeline(DiffusionPipeline):
return prompt_embeds, text_encoder_hidden_states, text_mask
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
hook = None
for cpu_offloaded_model in [self.text_encoder, self.unet, self.movq]:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
......
......@@ -24,8 +24,6 @@ from ...models import PriorTransformer
from ...schedulers import UnCLIPScheduler
from ...utils import (
BaseOutput,
is_accelerate_available,
is_accelerate_version,
logging,
replace_example_docstring,
)
......@@ -149,6 +147,7 @@ class KandinskyPriorPipeline(DiffusionPipeline):
"""
_exclude_from_cpu_offload = ["prior"]
model_cpu_offload_seq = "text_encoder->prior"
def __init__(
self,
......@@ -395,35 +394,6 @@ class KandinskyPriorPipeline(DiffusionPipeline):
return prompt_embeds, text_encoder_hidden_states, text_mask
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
hook = None
for cpu_offloaded_model in [self.text_encoder, self.prior]:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.prior_hook = hook
_, hook = cpu_offload_with_hook(self.image_encoder, device, prev_module_hook=self.prior_hook)
self.final_offload_hook = hook
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
......@@ -557,8 +527,7 @@ class KandinskyPriorPipeline(DiffusionPipeline):
if negative_prompt is None:
zero_embeds = self.get_zero_embed(latents.shape[0], device=latents.device)
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
self.maybe_free_model_hooks()
else:
image_embeddings, zero_embeds = image_embeddings.chunk(2)
......
......@@ -19,8 +19,6 @@ import torch
from ...models import UNet2DConditionModel, VQModel
from ...schedulers import DDPMScheduler
from ...utils import (
is_accelerate_available,
is_accelerate_version,
logging,
replace_example_docstring,
)
......@@ -82,6 +80,8 @@ class KandinskyV22Pipeline(DiffusionPipeline):
MoVQ Decoder to generate the image from the latents.
"""
model_cpu_offload_seq = "unet->movq"
def __init__(
self,
unet: UNet2DConditionModel,
......@@ -109,31 +109,6 @@ class KandinskyV22Pipeline(DiffusionPipeline):
latents = latents * scheduler.init_noise_sigma
return latents
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
hook = None
for cpu_offloaded_model in [self.unet, self.movq]:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
......@@ -273,9 +248,7 @@ class KandinskyV22Pipeline(DiffusionPipeline):
# post-processing
image = self.movq.decode(latents, force_not_quantize=True)["sample"]
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
self.maybe_free_model_hooks()
if output_type not in ["pt", "np", "pil"]:
raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}")
......
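Note that the replacement is not strictly one-for-one: the removed lines offloaded only the final model via `final_offload_hook`, whereas `maybe_free_model_hooks()` (added to `DiffusionPipeline` later in this diff) offloads every hooked model, removes the hooks, and re-installs them so the pipeline returns to its pre-call state.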
......@@ -136,6 +136,7 @@ class KandinskyV22CombinedPipeline(DiffusionPipeline):
An image_processor to be used to preprocess images from CLIP.
"""
model_cpu_offload_seq = "prior_text_encoder->prior_image_encoder->unet->movq"
_load_connected_pipes = True
def __init__(
......@@ -180,16 +181,6 @@ class KandinskyV22CombinedPipeline(DiffusionPipeline):
def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
self.prior_pipe.enable_model_cpu_offload()
self.decoder_pipe.enable_model_cpu_offload()
def enable_sequential_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
......@@ -351,6 +342,7 @@ class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline):
An image_processor to be used to preprocess images from CLIP.
"""
model_cpu_offload_seq = "prior_text_encoder->prior_image_encoder->unet->movq"
_load_connected_pipes = True
def __init__(
......@@ -588,6 +580,7 @@ class KandinskyV22InpaintCombinedPipeline(DiffusionPipeline):
An image_processor to be used to preprocess images from CLIP.
"""
model_cpu_offload_seq = "prior_text_encoder->prior_image_encoder->unet->movq"
_load_connected_pipes = True
def __init__(
......@@ -632,16 +625,6 @@ class KandinskyV22InpaintCombinedPipeline(DiffusionPipeline):
def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
self.prior_pipe.enable_model_cpu_offload()
self.decoder_pipe.enable_model_cpu_offload()
def enable_sequential_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
......
......@@ -19,10 +19,7 @@ import torch
from ...models import UNet2DConditionModel, VQModel
from ...schedulers import DDPMScheduler
from ...utils import (
is_accelerate_available,
is_accelerate_version,
logging,
replace_example_docstring,
)
from ...utils.torch_utils import randn_tensor
from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
......@@ -122,6 +119,8 @@ class KandinskyV22ControlnetPipeline(DiffusionPipeline):
MoVQ Decoder to generate the image from the latents.
"""
model_cpu_offload_seq = "unet->movq"
def __init__(
self,
unet: UNet2DConditionModel,
......@@ -149,34 +148,7 @@ class KandinskyV22ControlnetPipeline(DiffusionPipeline):
latents = latents * scheduler.init_noise_sigma
return latents
# Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.KandinskyV22Pipeline.enable_model_cpu_offload
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
hook = None
for cpu_offloaded_model in [self.unet, self.movq]:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
self,
image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]],
......@@ -327,9 +299,8 @@ class KandinskyV22ControlnetPipeline(DiffusionPipeline):
# post-processing
image = self.movq.decode(latents, force_not_quantize=True)["sample"]
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if output_type not in ["pt", "np", "pil"]:
raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}")
......
......@@ -22,10 +22,7 @@ from PIL import Image
from ...models import UNet2DConditionModel, VQModel
from ...schedulers import DDPMScheduler
from ...utils import (
is_accelerate_available,
is_accelerate_version,
logging,
replace_example_docstring,
)
from ...utils.torch_utils import randn_tensor
from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
......@@ -136,6 +133,8 @@ class KandinskyV22ControlnetImg2ImgPipeline(DiffusionPipeline):
MoVQ Decoder to generate the image from the latents.
"""
model_cpu_offload_seq = "unet->movq"
def __init__(
self,
unet: UNet2DConditionModel,
......@@ -204,34 +203,7 @@ class KandinskyV22ControlnetImg2ImgPipeline(DiffusionPipeline):
return latents
# Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.KandinskyV22Pipeline.enable_model_cpu_offload
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
hook = None
for cpu_offloaded_model in [self.unet, self.movq]:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
self,
image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]],
......@@ -388,9 +360,8 @@ class KandinskyV22ControlnetImg2ImgPipeline(DiffusionPipeline):
# post-processing
image = self.movq.decode(latents, force_not_quantize=True)["sample"]
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if output_type not in ["pt", "np", "pil"]:
raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}")
......
......@@ -22,10 +22,7 @@ from PIL import Image
from ...models import UNet2DConditionModel, VQModel
from ...schedulers import DDPMScheduler
from ...utils import (
is_accelerate_available,
is_accelerate_version,
logging,
replace_example_docstring,
)
from ...utils.torch_utils import randn_tensor
from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
......@@ -110,6 +107,8 @@ class KandinskyV22Img2ImgPipeline(DiffusionPipeline):
MoVQ Decoder to generate the image from the latents.
"""
model_cpu_offload_seq = "unet->movq"
def __init__(
self,
unet: UNet2DConditionModel,
......@@ -177,34 +176,7 @@ class KandinskyV22Img2ImgPipeline(DiffusionPipeline):
return latents
# Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.KandinskyV22Pipeline.enable_model_cpu_offload
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
hook = None
for cpu_offloaded_model in [self.unet, self.movq]:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
self,
image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]],
......@@ -352,9 +324,8 @@ class KandinskyV22Img2ImgPipeline(DiffusionPipeline):
# post-processing
image = self.movq.decode(latents, force_not_quantize=True)["sample"]
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if output_type not in ["pt", "np", "pil"]:
raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}")
......
......@@ -26,10 +26,7 @@ from ... import __version__
from ...models import UNet2DConditionModel, VQModel
from ...schedulers import DDPMScheduler
from ...utils import (
is_accelerate_available,
is_accelerate_version,
logging,
replace_example_docstring,
)
from ...utils.torch_utils import randn_tensor
from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
......@@ -253,6 +250,8 @@ class KandinskyV22InpaintPipeline(DiffusionPipeline):
MoVQ Decoder to generate the image from the latents.
"""
model_cpu_offload_seq = "unet->movq"
def __init__(
self,
unet: UNet2DConditionModel,
......@@ -281,34 +280,7 @@ class KandinskyV22InpaintPipeline(DiffusionPipeline):
latents = latents * scheduler.init_noise_sigma
return latents
# Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.KandinskyV22Pipeline.enable_model_cpu_offload
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
hook = None
for cpu_offloaded_model in [self.unet, self.movq]:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
self,
image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]],
......@@ -505,9 +477,8 @@ class KandinskyV22InpaintPipeline(DiffusionPipeline):
latents = mask_image[:1] * image[:1] + (1 - mask_image[:1]) * latents
image = self.movq.decode(latents, force_not_quantize=True)["sample"]
# Offload last model to CPU
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
self.final_offload_hook.offload()
# Offload all models
self.maybe_free_model_hooks()
if output_type not in ["pt", "np", "pil"]:
raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}")
......
......@@ -7,8 +7,6 @@ from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTo
from ...models import PriorTransformer
from ...schedulers import UnCLIPScheduler
from ...utils import (
is_accelerate_available,
is_accelerate_version,
logging,
replace_example_docstring,
)
......@@ -106,6 +104,7 @@ class KandinskyV22PriorPipeline(DiffusionPipeline):
An image_processor to be used to preprocess images from CLIP.
"""
model_cpu_offload_seq = "text_encoder->image_encoder->prior"
_exclude_from_cpu_offload = ["prior"]
def __init__(
......@@ -355,35 +354,6 @@ class KandinskyV22PriorPipeline(DiffusionPipeline):
return prompt_embeds, text_encoder_hidden_states, text_mask
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
hook = None
for cpu_offloaded_model in [self.text_encoder, self.prior]:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.prior_hook = hook
_, hook = cpu_offload_with_hook(self.image_encoder, device, prev_module_hook=self.prior_hook)
self.final_offload_hook = hook
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
......
......@@ -7,8 +7,6 @@ from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTo
from ...models import PriorTransformer
from ...schedulers import UnCLIPScheduler
from ...utils import (
is_accelerate_available,
is_accelerate_version,
logging,
replace_example_docstring,
)
......@@ -122,6 +120,7 @@ class KandinskyV22PriorEmb2EmbPipeline(DiffusionPipeline):
A scheduler to be used in combination with `prior` to generate image embedding.
"""
model_cpu_offload_seq = "text_encoder->image_encoder->prior"
_exclude_from_cpu_offload = ["prior"]
def __init__(
......@@ -394,35 +393,6 @@ class KandinskyV22PriorEmb2EmbPipeline(DiffusionPipeline):
return prompt_embeds, text_encoder_hidden_states, text_mask
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
hook = None
for cpu_offloaded_model in [self.text_encoder, self.prior]:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.prior_hook = hook
_, hook = cpu_offload_with_hook(self.image_encoder, device, prev_module_hook=self.prior_hook)
self.final_offload_hook = hook
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
......
......@@ -49,6 +49,7 @@ class LDMTextToImagePipeline(DiffusionPipeline):
A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
[`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
"""
model_cpu_offload_seq = "bert->unet->vqvae"
def __init__(
self,
......
......@@ -28,7 +28,13 @@ from transformers import (
from ...models import AutoencoderKL, UNet2DConditionModel
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import is_librosa_available, logging, replace_example_docstring
from ...utils import (
is_accelerate_available,
is_accelerate_version,
is_librosa_available,
logging,
replace_example_docstring,
)
from ...utils.torch_utils import randn_tensor
from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline
......@@ -391,6 +397,40 @@ class MusicLDMPipeline(DiffusionPipeline):
latents = latents * self.scheduler.init_noise_sigma
return latents
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains on the GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
model_sequence = [
self.text_encoder.text_model,
self.text_encoder.text_projection,
self.unet,
self.vae,
self.vocoder,
self.text_encoder,
]
hook = None
for cpu_offloaded_model in model_sequence:
_, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
# We'll offload the last model manually.
self.final_offload_hook = hook
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
......@@ -578,6 +618,8 @@ class MusicLDMPipeline(DiffusionPipeline):
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
self.maybe_free_model_hooks()
# 8. Post-processing
if not output_type == "latent":
latents = 1 / self.vae.config.scaling_factor * latents
......
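MusicLDM gains an override here instead of a `model_cpu_offload_seq` string because its offload order interleaves submodules of the CLAP text encoder (`text_encoder.text_model`, `text_encoder.text_projection`) with top-level components, and the sequence string can only name top-level pipeline components.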
......@@ -169,6 +169,9 @@ class PaintByExamplePipeline(DiffusionPipeline):
"""
# TODO: feature_extractor is required to encode initial images (if they are in PIL format),
# we should give a descriptive message if the pipeline doesn't have one.
model_cpu_offload_seq = "unet->vae"
_exclude_from_cpu_offload = ["image_encoder"]
_optional_components = ["safety_checker"]
def __init__(
......@@ -580,6 +583,8 @@ class PaintByExamplePipeline(DiffusionPipeline):
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
self.maybe_free_model_hooks()
if not output_type == "latent":
image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
image, has_nsfw_concept = self.run_safety_checker(image, device, image_embeddings.dtype)
......
......@@ -495,6 +495,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
pipeline to function (should be overridden by subclasses).
"""
config_name = "model_index.json"
model_cpu_offload_seq = None
_optional_components = []
_exclude_from_cpu_offload = []
_load_connected_pipes = False
......@@ -1224,6 +1225,72 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
return torch.device(module._hf_hook.execution_device)
return self.device
def enable_model_cpu_offload(self, gpu_id: int = 0, device: Union[torch.device, str] = "cuda"):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
method is called, and the model remains on the GPU until the next model runs. Memory savings are lower than with
`enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
"""
if self.model_cpu_offload_seq is None:
raise ValueError(
"Model CPU offload cannot be enabled because no `model_cpu_offload_seq` class attribute is set."
)
if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
from accelerate import cpu_offload_with_hook
else:
raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
device_mod = getattr(torch, self.device.type, None)
if hasattr(device_mod, "empty_cache") and device_mod.is_available():
device_mod.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
all_model_components = {k: v for k, v in self.components.items() if isinstance(v, torch.nn.Module)}
self._all_hooks = []
hook = None
for model_str in self.model_cpu_offload_seq.split("->"):
model = all_model_components.pop(model_str, None)
if not isinstance(model, torch.nn.Module):
continue
_, hook = cpu_offload_with_hook(model, device, prev_module_hook=hook)
self._all_hooks.append(hook)
# CPU offload models that are not in the seq chain unless they are explicitly excluded
# these models will stay on CPU until maybe_free_model_hooks is called
# some models cannot be in the seq chain because they are iteratively called, such as controlnet
for name, model in all_model_components.items():
if not isinstance(model, torch.nn.Module):
continue
if name in self._exclude_from_cpu_offload:
model.to(device)
else:
_, hook = cpu_offload_with_hook(model, device)
self._all_hooks.append(hook)
def maybe_free_model_hooks(self):
r"""
Offloads all components, removes the model hooks that were added by `enable_model_cpu_offload`, and then applies them again. If model CPU offload was never enabled, this is a no-op. Call this at the end of a pipeline's `__call__` so that offloading keeps working across repeated calls.
"""
if not hasattr(self, "_all_hooks") or len(self._all_hooks) == 0:
# `enable_model_cpu_offload` has not been called, so silently do nothing
return
for hook in self._all_hooks:
# offload model and remove hook from model
hook.offload()
hook.remove()
# make sure the model is in the same state as before calling it
self.enable_model_cpu_offload()
def enable_sequential_cpu_offload(self, gpu_id: int = 0, device: Union[torch.device, str] = "cuda"):
r"""
Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
......
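The two base-class methods above are the only integration points a pipeline needs. A hedged sketch of a custom pipeline wired into the new mechanism (class and component names are hypothetical):

import torch
from diffusers import DiffusionPipeline

class MyPipeline(DiffusionPipeline):
    # Hypothetical pipeline showing the two hooks added by this PR.
    model_cpu_offload_seq = "text_encoder->unet->vae"  # offload/hook order
    _exclude_from_cpu_offload = []  # components to keep on the device instead

    def __init__(self, text_encoder, unet, vae, scheduler):
        super().__init__()
        self.register_modules(
            text_encoder=text_encoder, unet=unet, vae=vae, scheduler=scheduler
        )

    @torch.no_grad()
    def __call__(self, *args, **kwargs):
        # ... encode prompt, denoise, decode latents (elided) ...
        # End of __call__: offload the hooked models and re-arm the hooks so
        # the next call starts from the same state.
        self.maybe_free_model_hooks()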
......@@ -89,6 +89,7 @@ class RePaintPipeline(DiffusionPipeline):
unet: UNet2DModel
scheduler: RePaintScheduler
model_cpu_offload_seq = "unet"
def __init__(self, unet, scheduler):
super().__init__()
......
......@@ -46,6 +46,7 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline):
A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
"""
model_cpu_offload_seq = "text_encoder->unet->vae"
_optional_components = ["safety_checker", "feature_extractor"]
def __init__(
......