Unverified Commit 45b42d12 authored by Disty0, committed by GitHub

Add device arg to offloading with combined pipelines (#7471)


Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>
parent 5199ee4f
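For readers skimming the diff below, a minimal usage sketch of the new argument (the checkpoint id, prompt, and target device are illustrative assumptions, not part of this commit): the combined wrappers previously did not forward an offload target to their prior and decoder sub-pipelines; they now accept and pass through an explicit `device` alongside the legacy `gpu_id`.

```python
# Illustrative usage only; checkpoint id and device are assumptions for the example.
import torch
from diffusers import AutoPipelineForText2Image

pipe = AutoPipelineForText2Image.from_pretrained(
    "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
)

# The combined pipeline now forwards `device` (and an optional `gpu_id`) to both
# its prior and decoder sub-pipelines instead of assuming the default CUDA device.
pipe.enable_sequential_cpu_offload(device="cuda")

image = pipe("a portrait of a cat", num_inference_steps=25).images[0]
```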
@@ -178,7 +178,7 @@ class KandinskyV22CombinedPipeline(DiffusionPipeline):
     def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
         self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
 
-    def enable_sequential_cpu_offload(self, gpu_id=0):
+    def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
         r"""
         Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
         text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
@@ -186,8 +186,8 @@ class KandinskyV22CombinedPipeline(DiffusionPipeline):
         Note that offloading happens on a submodule basis. Memory savings are higher than with
         `enable_model_cpu_offload`, but performance is lower.
         """
-        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
-        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
+        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
+        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
 
     def progress_bar(self, iterable=None, total=None):
         self.prior_pipe.progress_bar(iterable=iterable, total=total)
@@ -405,17 +405,17 @@ class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline):
     def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
         self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
 
-    def enable_model_cpu_offload(self, gpu_id=0):
+    def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
         r"""
         Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
         to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
         method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
         `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
         """
-        self.prior_pipe.enable_model_cpu_offload()
-        self.decoder_pipe.enable_model_cpu_offload()
+        self.prior_pipe.enable_model_cpu_offload(gpu_id=gpu_id, device=device)
+        self.decoder_pipe.enable_model_cpu_offload(gpu_id=gpu_id, device=device)
 
-    def enable_sequential_cpu_offload(self, gpu_id=0):
+    def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
         r"""
         Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
         text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
@@ -423,8 +423,8 @@ class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline):
         Note that offloading happens on a submodule basis. Memory savings are higher than with
         `enable_model_cpu_offload`, but performance is lower.
         """
-        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
-        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
+        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
+        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
 
     def progress_bar(self, iterable=None, total=None):
         self.prior_pipe.progress_bar(iterable=iterable, total=total)
@@ -653,7 +653,7 @@ class KandinskyV22InpaintCombinedPipeline(DiffusionPipeline):
     def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
         self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
 
-    def enable_sequential_cpu_offload(self, gpu_id=0):
+    def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
         r"""
         Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
         text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
@@ -661,8 +661,8 @@ class KandinskyV22InpaintCombinedPipeline(DiffusionPipeline):
         Note that offloading happens on a submodule basis. Memory savings are higher than with
         `enable_model_cpu_offload`, but performance is lower.
         """
-        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
-        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
+        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
+        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
 
     def progress_bar(self, iterable=None, total=None):
         self.prior_pipe.progress_bar(iterable=iterable, total=total)
@@ -117,25 +117,25 @@ class StableCascadeCombinedPipeline(DiffusionPipeline):
     def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
         self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
 
-    def enable_model_cpu_offload(self, gpu_id=0):
+    def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
         r"""
         Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
         to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
         method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
         `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
         """
-        self.prior_pipe.enable_model_cpu_offload(gpu_id=gpu_id)
-        self.decoder_pipe.enable_model_cpu_offload(gpu_id=gpu_id)
+        self.prior_pipe.enable_model_cpu_offload(gpu_id=gpu_id, device=device)
+        self.decoder_pipe.enable_model_cpu_offload(gpu_id=gpu_id, device=device)
 
-    def enable_sequential_cpu_offload(self, gpu_id=0):
+    def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
         r"""
         Offloads all models (`unet`, `text_encoder`, `vae`, and `safety checker` state dicts) to CPU using 🤗
         Accelerate, significantly reducing memory usage. Models are moved to a `torch.device('meta')` and loaded on a
         GPU only when their specific submodule's `forward` method is called. Offloading happens on a submodule basis.
         Memory savings are higher than using `enable_model_cpu_offload`, but performance is lower.
         """
-        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
-        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
+        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
+        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
 
     def progress_bar(self, iterable=None, total=None):
         self.prior_pipe.progress_bar(iterable=iterable, total=total)
@@ -112,25 +112,25 @@ class WuerstchenCombinedPipeline(DiffusionPipeline):
     def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
         self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
 
-    def enable_model_cpu_offload(self, gpu_id=0):
+    def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
         r"""
         Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
         to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
         method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
         `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
         """
-        self.prior_pipe.enable_model_cpu_offload(gpu_id=gpu_id)
-        self.decoder_pipe.enable_model_cpu_offload(gpu_id=gpu_id)
+        self.prior_pipe.enable_model_cpu_offload(gpu_id=gpu_id, device=device)
+        self.decoder_pipe.enable_model_cpu_offload(gpu_id=gpu_id, device=device)
 
-    def enable_sequential_cpu_offload(self, gpu_id=0):
+    def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
         r"""
         Offloads all models (`unet`, `text_encoder`, `vae`, and `safety checker` state dicts) to CPU using 🤗
         Accelerate, significantly reducing memory usage. Models are moved to a `torch.device('meta')` and loaded on a
         GPU only when their specific submodule's `forward` method is called. Offloading happens on a submodule basis.
         Memory savings are higher than using `enable_model_cpu_offload`, but performance is lower.
         """
-        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
-        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
+        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
+        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
 
     def progress_bar(self, iterable=None, total=None):
         self.prior_pipe.progress_bar(iterable=iterable, total=total)
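A note on the new signature: `gpu_id` is kept for backwards compatibility alongside `device`. The combined pipelines simply forward both values to their sub-pipelines; the sketch below illustrates one plausible way the two arguments can be reconciled into a single `torch.device` and is not the upstream implementation.

```python
from typing import Optional, Union

import torch


def resolve_offload_device(
    gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"
) -> torch.device:
    """Illustrative helper: combine the legacy `gpu_id` with the new `device` argument."""
    torch_device = torch.device(device)
    # Prefer an index embedded in `device` (e.g. "cuda:1"); otherwise fall back to
    # the legacy `gpu_id`, and finally to index 0.
    index = torch_device.index if torch_device.index is not None else (gpu_id if gpu_id is not None else 0)
    return torch.device(f"{torch_device.type}:{index}")


# Constructing a torch.device does not require the accelerator to be present:
assert resolve_offload_device() == torch.device("cuda:0")
assert resolve_offload_device(gpu_id=1) == torch.device("cuda:1")
assert resolve_offload_device(device="cuda:1") == torch.device("cuda:1")
```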