Unverified Commit 93579650 authored by Patrick von Platen, committed by GitHub

Refactor model offload (#4514)



* [Draft] Refactor model offload

* [Draft] Refactor model offload

* Apply suggestions from code review

* cpu offload updates

* remove model cpu offload from individual pipelines

* add hook to offload models to cpu

* clean up

* model offload

* add model cpu offload string

* make style

* clean up

* fixes for offload issues

* fix tests issues

* resolve merge conflicts

* update src/diffusers/pipelines/pipeline_utils.py
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* make style

* Update src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py

---------
Co-authored-by: Dhruv Nair <dhruv.nair@gmail.com>
parent 16a056a7
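In short, the series moves CPU offloading out of the individual pipelines: each pipeline now only declares a model_cpu_offload_seq string, and a shared enable_model_cpu_offload implementation in src/diffusers/pipelines/pipeline_utils.py installs the offload hooks in that order. The sketch below illustrates the idea only; it assumes accelerate >= 0.17.0, and the helper name is made up rather than taken from the PR.

    import torch
    from accelerate import cpu_offload_with_hook


    def install_model_offload_hooks(pipe, device="cuda"):
        # Hypothetical helper (not the PR's code): walk the pipeline's declared
        # offload sequence, e.g. "text_encoder->unet->vae", and chain accelerate
        # hooks so each sub-model sits on the GPU only while it is running.
        hook = None
        for name in pipe.model_cpu_offload_seq.split("->"):
            model = getattr(pipe, name, None)
            if not isinstance(model, torch.nn.Module):
                continue
            # cpu_offload_with_hook returns (wrapped_model, hook); passing the
            # previous hook moves the previous model back to the CPU as soon as
            # the next model in the sequence is called.
            _, hook = cpu_offload_with_hook(model, torch.device(device), prev_module_hook=hook)
        return hook

With the hooks chained this way, roughly one of the listed models occupies GPU memory at a time during a pipeline call, which is cheaper than keeping everything resident but much faster than the per-submodule enable_sequential_cpu_offload.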
@@ -51,6 +51,8 @@ class VersatileDiffusionTextToImagePipeline(DiffusionPipeline):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
    """
+    model_cpu_offload_seq = "bert->unet->vqvae"
    tokenizer: CLIPTokenizer
    image_feature_extractor: CLIPImageProcessor
    text_encoder: CLIPTextModelWithProjection
...
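For context, the attribute added above only records the order in which the pipeline's models are used; the user-facing call does not change. A usage sketch (the checkpoint id is illustrative):

    import torch
    from diffusers import VersatileDiffusionTextToImagePipeline

    # Illustrative checkpoint; any checkpoint compatible with this pipeline works.
    pipe = VersatileDiffusionTextToImagePipeline.from_pretrained(
        "shi-labs/versatile-diffusion", torch_dtype=torch.float16
    )

    # After the refactor this call is implemented once on DiffusionPipeline and
    # follows model_cpu_offload_seq instead of per-pipeline offload code.
    pipe.enable_model_cpu_offload()

    image = pipe("a happy bear reading a newspaper", num_inference_steps=25).images[0]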
@@ -44,7 +44,7 @@ from diffusers import (
    LMSDiscreteScheduler,
    PNDMScheduler,
)
-from diffusers.utils import is_accelerate_available, is_accelerate_version, is_xformers_available
+from diffusers.utils import is_xformers_available
from diffusers.utils.testing_utils import enable_full_determinism, slow, torch_device

from ..pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS, TEXT_TO_AUDIO_PARAMS
@@ -491,26 +491,6 @@ class AudioLDM2PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        model_dtypes = {key: component.dtype for key, component in components.items() if hasattr(component, "dtype")}
        self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes.values()))

-    @unittest.skipIf(
-        torch_device != "cuda" or not is_accelerate_available() or is_accelerate_version("<", "0.17.0"),
-        reason="CPU offload is only available with CUDA and `accelerate v0.17.0` or higher",
-    )
-    def test_model_cpu_offload(self, expected_max_diff=2e-4):
-        components = self.get_dummy_components()
-        audioldm_pipe = AudioLDM2Pipeline(**components)
-        audioldm_pipe = audioldm_pipe.to(torch_device)
-        audioldm_pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(torch_device)
-        output_without_offload = audioldm_pipe(**inputs)[0]
-
-        audioldm_pipe.enable_model_cpu_offload()
-        inputs = self.get_dummy_inputs(torch_device)
-        output_with_offload = audioldm_pipe(**inputs)[0]
-
-        max_diff = np.abs(output_with_offload - output_without_offload).max()
-        self.assertLess(max_diff, expected_max_diff, "CPU offloading should not affect the inference results")

@slow
class AudioLDM2PipelineSlowTests(unittest.TestCase):
...
@@ -163,8 +163,8 @@ class StableDiffusionAttendAndExcitePipelineFastTests(
        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
        self.assertLessEqual(max_diff, 1e-3)

-    def test_cpu_offload_forward_pass(self):
-        super().test_cpu_offload_forward_pass(expected_max_diff=5e-4)
+    def test_sequential_cpu_offload_forward_pass(self):
+        super().test_sequential_cpu_offload_forward_pass(expected_max_diff=5e-4)

    def test_inference_batch_consistent(self):
        # NOTE: Larger batch sizes cause this test to timeout, only test on smaller batches
...
@@ -181,8 +181,8 @@ class StableDiffusionLatentUpscalePipelineFastTests(
    def test_attention_slicing_forward_pass(self):
        super().test_attention_slicing_forward_pass(expected_max_diff=7e-3)

-    def test_cpu_offload_forward_pass(self):
-        super().test_cpu_offload_forward_pass(expected_max_diff=3e-3)
+    def test_sequential_cpu_offload_forward_pass(self):
+        super().test_sequential_cpu_offload_forward_pass(expected_max_diff=3e-3)

    def test_dict_tuple_outputs_equivalent(self):
        super().test_dict_tuple_outputs_equivalent(expected_max_difference=3e-3)
...
@@ -717,7 +717,7 @@ class PipelineTesterMixin:
        torch_device != "cuda" or not is_accelerate_available() or is_accelerate_version("<", "0.14.0"),
        reason="CPU offload is only available with CUDA and `accelerate v0.14.0` or higher",
    )
-    def test_cpu_offload_forward_pass(self, expected_max_diff=1e-4):
+    def test_sequential_cpu_offload_forward_pass(self, expected_max_diff=1e-4):
        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        for component in pipe.components.values():
@@ -726,11 +726,39 @@ class PipelineTesterMixin:
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

-        inputs = self.get_dummy_inputs(torch_device)
+        generator_device = "cpu"
+        inputs = self.get_dummy_inputs(generator_device)
        output_without_offload = pipe(**inputs)[0]

        pipe.enable_sequential_cpu_offload()
-        inputs = self.get_dummy_inputs(torch_device)
+        inputs = self.get_dummy_inputs(generator_device)
+        output_with_offload = pipe(**inputs)[0]
+
+        max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
+        self.assertLess(max_diff, expected_max_diff, "CPU offloading should not affect the inference results")
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_accelerate_available() or is_accelerate_version("<", "0.17.0"),
+        reason="CPU offload is only available with CUDA and `accelerate v0.17.0` or higher",
+    )
+    def test_model_cpu_offload_forward_pass(self, expected_max_diff=2e-4):
+        generator_device = "cpu"
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(generator_device)
+        output_without_offload = pipe(**inputs)[0]
+
+        pipe.enable_model_cpu_offload()
+        inputs = self.get_dummy_inputs(generator_device)
        output_with_offload = pipe(**inputs)[0]

        max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
...
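One detail worth noting in the test changes: the dummy inputs are now built with generator_device = "cpu" rather than torch_device. A generator created on the CPU produces the same random latents no matter where the model weights currently live, so the outputs with and without offloading can be compared against expected_max_diff. A small illustration (shape and seed are arbitrary):

    import torch

    # The same seed on a CPU generator reproduces identical latents whether the
    # model later runs fully on "cuda" or with weights offloaded to the CPU.
    generator = torch.Generator(device="cpu").manual_seed(0)
    latents_a = torch.randn((1, 4, 64, 64), generator=generator)

    generator = torch.Generator(device="cpu").manual_seed(0)
    latents_b = torch.randn((1, 4, 64, 64), generator=generator)

    assert torch.equal(latents_a, latents_b)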