Unverified Commit 27062c36 authored by Patrick von Platen, committed by GitHub

Refactor execution device & cpu offload (#4114)

* create general cpu offload & execution device

* Remove boilerplate

* finish

* kp

* Correct offload more pipelines

* up

* Update src/diffusers/pipelines/pipeline_utils.py

* make style

* up
parent 6427aa99
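
The per-pipeline `enable_sequential_cpu_offload` and `_execution_device` implementations removed in the hunks below are replaced by shared versions on `DiffusionPipeline` in `src/diffusers/pipelines/pipeline_utils.py` (the new code itself is not part of this excerpt). A minimal sketch of what such shared helpers could look like, assuming the pipeline exposes its sub-models through `self.components` and can opt models out via an `_exclude_from_cpu_offload` list:

```python
# A minimal sketch (not the actual code added to pipeline_utils.py by this PR)
# of the shared helpers that replace the per-pipeline copies removed below.
# Assumed: the pipeline exposes its sub-models via `self.components`, provides
# `self.device` / `self.to(...)`, and may define `_exclude_from_cpu_offload`.
import torch
from accelerate import cpu_offload


class SequentialOffloadSketch:
    _exclude_from_cpu_offload = []

    def enable_sequential_cpu_offload(self, gpu_id=0):
        device = torch.device(f"cuda:{gpu_id}")
        if self.device.type != "cpu":
            self.to("cpu", silence_dtype_warnings=True)
            torch.cuda.empty_cache()  # otherwise the memory savings are not visible

        # Wrap every nn.Module component in accelerate's offload hooks,
        # unless the subclass explicitly opts it out.
        for name, model in self.components.items():
            if not isinstance(model, torch.nn.Module) or name in self._exclude_from_cpu_offload:
                continue
            cpu_offload(model, execution_device=device)

    @property
    def _execution_device(self):
        # With sequential offload enabled the pipeline itself reports cpu/meta,
        # so the real execution device is read from accelerate's module hooks.
        for name, model in self.components.items():
            if not isinstance(model, torch.nn.Module) or name in self._exclude_from_cpu_offload:
                continue
            for module in model.modules():
                hook = getattr(module, "_hf_hook", None)
                if hook is not None and getattr(hook, "execution_device", None) is not None:
                    return torch.device(hook.execution_device)
        return self.device
```

With these on the base class, a pipeline only needs `_exclude_from_cpu_offload` when some component (such as the Kandinsky prior below) must stay out of the offload loop.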
@@ -226,31 +226,6 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL
"""
self.vae.disable_tiling()
def enable_sequential_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
`torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
Note that offloading happens on a submodule basis. Memory savings are higher than with
`enable_model_cpu_offload`, but performance is lower.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"):
from accelerate import cpu_offload
else:
raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
cpu_offload(cpu_offloaded_model, device)
if self.safety_checker is not None:
cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
@@ -279,24 +254,6 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL
# We'll offload the last model manually.
self.final_offload_hook = hook
@property
def _execution_device(self):
r"""
Returns the device on which the pipeline's models will be executed. After calling
`pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
hooks.
"""
if not hasattr(self.unet, "_hf_hook"):
return self.device
for module in self.unet.modules():
if (
hasattr(module, "_hf_hook")
and hasattr(module._hf_hook, "execution_device")
and module._hf_hook.execution_device is not None
):
return torch.device(module._hf_hook.execution_device)
return self.device
def _encode_prompt(
self,
prompt,
...
@@ -224,31 +224,6 @@ class AltDiffusionImg2ImgPipeline(
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.register_to_config(requires_safety_checker=requires_safety_checker)
def enable_sequential_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
`torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
Note that offloading happens on a submodule basis. Memory savings are higher than with
`enable_model_cpu_offload`, but performance is lower.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"):
from accelerate import cpu_offload
else:
raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
cpu_offload(cpu_offloaded_model, device)
if self.safety_checker is not None:
cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
@@ -277,24 +252,6 @@ class AltDiffusionImg2ImgPipeline(
# We'll offload the last model manually.
self.final_offload_hook = hook
@property
def _execution_device(self):
r"""
Returns the device on which the pipeline's models will be executed. After calling
`pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
hooks.
"""
if not hasattr(self.unet, "_hf_hook"):
return self.device
for module in self.unet.modules():
if (
hasattr(module, "_hf_hook")
and hasattr(module._hf_hook, "execution_device")
and module._hf_hook.execution_device is not None
):
return torch.device(module._hf_hook.execution_device)
return self.device
def _encode_prompt(
self,
prompt,
...
@@ -22,7 +22,7 @@ from transformers import ClapTextModelWithProjection, RobertaTokenizer, RobertaT
from ...models import AutoencoderKL, UNet2DConditionModel
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import is_accelerate_available, logging, randn_tensor, replace_example_docstring
from ...utils import logging, randn_tensor, replace_example_docstring
from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline
@@ -108,41 +108,6 @@ class AudioLDMPipeline(DiffusionPipeline):
"""
self.vae.disable_slicing()
def enable_sequential_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
text_encoder, vae and vocoder have their state dicts saved to CPU and then are moved to a `torch.device('meta')
and loaded to GPU only when their specific submodule has its `forward` method called.
"""
if is_accelerate_available():
from accelerate import cpu_offload
else:
raise ImportError("Please install accelerate via `pip install accelerate`")
device = torch.device(f"cuda:{gpu_id}")
for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.vocoder]:
cpu_offload(cpu_offloaded_model, device)
@property
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
def _execution_device(self):
r"""
Returns the device on which the pipeline's models will be executed. After calling
`pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
hooks.
"""
if not hasattr(self.unet, "_hf_hook"):
return self.device
for module in self.unet.modules():
if (
hasattr(module, "_hf_hook")
and hasattr(module._hf_hook, "execution_device")
and module._hf_hook.execution_device is not None
):
return torch.device(module._hf_hook.execution_device)
return self.device
def _encode_prompt(
self,
prompt,
...
@@ -76,31 +76,6 @@ class ConsistencyModelPipeline(DiffusionPipeline):
self.safety_checker = None
def enable_sequential_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
`torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
Note that offloading happens on a submodule basis. Memory savings are higher than with
`enable_model_cpu_offload`, but performance is lower.
"""
if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"):
from accelerate import cpu_offload
else:
raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher")
device = torch.device(f"cuda:{gpu_id}")
if self.device.type != "cpu":
self.to("cpu", silence_dtype_warnings=True)
torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
for cpu_offloaded_model in [self.unet]:
cpu_offload(cpu_offloaded_model, device)
if self.safety_checker is not None:
cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
@@ -129,25 +104,6 @@ class ConsistencyModelPipeline(DiffusionPipeline):
# We'll offload the last model manually.
self.final_offload_hook = hook
@property
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
def _execution_device(self):
r"""
Returns the device on which the pipeline's models will be executed. After calling
`pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
hooks.
"""
if not hasattr(self.unet, "_hf_hook"):
return self.device
for module in self.unet.modules():
if (
hasattr(module, "_hf_hook")
and hasattr(module._hf_hook, "execution_device")
and module._hf_hook.execution_device is not None
):
return torch.device(module._hf_hook.execution_device)
return self.device
def prepare_latents(self, batch_size, num_channels, height, width, dtype, device, generator, latents=None):
shape = (batch_size, num_channels, height, width)
if isinstance(generator, list) and len(generator) != batch_size:
...
@@ -212,27 +212,6 @@ class StableDiffusionControlNetPipeline(DiffusionPipeline, TextualInversionLoade
"""
self.vae.disable_tiling()
def enable_sequential_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
text_encoder, vae, controlnet, and safety checker have their state dicts saved to CPU and then are moved to a
`torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
Note that offloading happens on a submodule basis. Memory savings are higher than with
`enable_model_cpu_offload`, but performance is lower.
"""
if is_accelerate_available():
from accelerate import cpu_offload
else:
raise ImportError("Please install accelerate via `pip install accelerate`")
device = torch.device(f"cuda:{gpu_id}")
for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.controlnet]:
cpu_offload(cpu_offloaded_model, device)
if self.safety_checker is not None:
cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
@@ -261,25 +240,6 @@ class StableDiffusionControlNetPipeline(DiffusionPipeline, TextualInversionLoade
# We'll offload the last model manually.
self.final_offload_hook = hook
@property
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
def _execution_device(self):
r"""
Returns the device on which the pipeline's models will be executed. After calling
`pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
hooks.
"""
if not hasattr(self.unet, "_hf_hook"):
return self.device
for module in self.unet.modules():
if (
hasattr(module, "_hf_hook")
and hasattr(module._hf_hook, "execution_device")
and module._hf_hook.execution_device is not None
):
return torch.device(module._hf_hook.execution_device)
return self.device
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
self,
...
@@ -238,27 +238,6 @@ class StableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline, TextualInversi
"""
self.vae.disable_tiling()
def enable_sequential_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
text_encoder, vae, controlnet, and safety checker have their state dicts saved to CPU and then are moved to a
`torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
Note that offloading happens on a submodule basis. Memory savings are higher than with
`enable_model_cpu_offload`, but performance is lower.
"""
if is_accelerate_available():
from accelerate import cpu_offload
else:
raise ImportError("Please install accelerate via `pip install accelerate`")
device = torch.device(f"cuda:{gpu_id}")
for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.controlnet]:
cpu_offload(cpu_offloaded_model, device)
if self.safety_checker is not None:
cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
@@ -287,25 +266,6 @@ class StableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline, TextualInversi
# We'll offload the last model manually.
self.final_offload_hook = hook
@property
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
def _execution_device(self):
r"""
Returns the device on which the pipeline's models will be executed. After calling
`pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
hooks.
"""
if not hasattr(self.unet, "_hf_hook"):
return self.device
for module in self.unet.modules():
if (
hasattr(module, "_hf_hook")
and hasattr(module._hf_hook, "execution_device")
and module._hf_hook.execution_device is not None
):
return torch.device(module._hf_hook.execution_device)
return self.device
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
self,
...
@@ -355,27 +355,6 @@ class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline, TextualInversi
"""
self.vae.disable_tiling()
def enable_sequential_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
text_encoder, vae, controlnet, and safety checker have their state dicts saved to CPU and then are moved to a
`torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
Note that offloading happens on a submodule basis. Memory savings are higher than with
`enable_model_cpu_offload`, but performance is lower.
"""
if is_accelerate_available():
from accelerate import cpu_offload
else:
raise ImportError("Please install accelerate via `pip install accelerate`")
device = torch.device(f"cuda:{gpu_id}")
for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.controlnet]:
cpu_offload(cpu_offloaded_model, device)
if self.safety_checker is not None:
cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
@@ -404,25 +383,6 @@ class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline, TextualInversi
# We'll offload the last model manually.
self.final_offload_hook = hook
@property
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
def _execution_device(self):
r"""
Returns the device on which the pipeline's models will be executed. After calling
`pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
hooks.
"""
if not hasattr(self.unet, "_hf_hook"):
return self.device
for module in self.unet.modules():
if (
hasattr(module, "_hf_hook")
and hasattr(module._hf_hook, "execution_device")
and module._hf_hook.execution_device is not None
):
return torch.device(module._hf_hook.execution_device)
return self.device
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
self,
...
@@ -102,7 +102,7 @@ class DanceDiffusionPipeline(DiffusionPipeline):
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
)
audio = randn_tensor(shape, generator=generator, device=self.device, dtype=dtype)
audio = randn_tensor(shape, generator=generator, device=self._execution_device, dtype=dtype)
# set step values
self.scheduler.set_timesteps(num_inference_steps, device=audio.device)
...
@@ -95,7 +95,7 @@ class DDIMPipeline(DiffusionPipeline):
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
)
image = randn_tensor(image_shape, generator=generator, device=self.device, dtype=self.unet.dtype)
image = randn_tensor(image_shape, generator=generator, device=self._execution_device, dtype=self.unet.dtype)
# set step values
self.scheduler.set_timesteps(num_inference_steps)
...
@@ -144,30 +144,6 @@ class IFPipeline(DiffusionPipeline, LoraLoaderMixin):
)
self.register_to_config(requires_safety_checker=requires_safety_checker)
def enable_sequential_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's
models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only
when their specific submodule has its `forward` method called.
"""
if is_accelerate_available():
from accelerate import cpu_offload
else:
raise ImportError("Please install accelerate via `pip install accelerate`")
device = torch.device(f"cuda:{gpu_id}")
models = [
self.text_encoder,
self.unet,
]
for cpu_offloaded_model in models:
if cpu_offloaded_model is not None:
cpu_offload(cpu_offloaded_model, device)
if self.safety_checker is not None:
cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
@@ -223,25 +199,6 @@ class IFPipeline(DiffusionPipeline, LoraLoaderMixin):
self.text_encoder_offload_hook = None
self.final_offload_hook = None
@property
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
def _execution_device(self):
r"""
Returns the device on which the pipeline's models will be executed. After calling
`pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
hooks.
"""
if not hasattr(self.unet, "_hf_hook"):
return self.device
for module in self.unet.modules():
if (
hasattr(module, "_hf_hook")
and hasattr(module._hf_hook, "execution_device")
and module._hf_hook.execution_device is not None
):
return torch.device(module._hf_hook.execution_device)
return self.device
@torch.no_grad()
def encode_prompt(
self,
...
@@ -168,31 +168,6 @@ class IFImg2ImgPipeline(DiffusionPipeline, LoraLoaderMixin):
)
self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_sequential_cpu_offload
def enable_sequential_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's
models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only
when their specific submodule has its `forward` method called.
"""
if is_accelerate_available():
from accelerate import cpu_offload
else:
raise ImportError("Please install accelerate via `pip install accelerate`")
device = torch.device(f"cuda:{gpu_id}")
models = [
self.text_encoder,
self.unet,
]
for cpu_offloaded_model in models:
if cpu_offloaded_model is not None:
cpu_offload(cpu_offloaded_model, device)
if self.safety_checker is not None:
cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_model_cpu_offload
def enable_model_cpu_offload(self, gpu_id=0):
r"""
@@ -250,25 +225,6 @@ class IFImg2ImgPipeline(DiffusionPipeline, LoraLoaderMixin):
self.text_encoder_offload_hook = None
self.final_offload_hook = None
@property
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
def _execution_device(self):
r"""
Returns the device on which the pipeline's models will be executed. After calling
`pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
hooks.
"""
if not hasattr(self.unet, "_hf_hook"):
return self.device
for module in self.unet.modules():
if (
hasattr(module, "_hf_hook")
and hasattr(module._hf_hook, "execution_device")
and module._hf_hook.execution_device is not None
):
return torch.device(module._hf_hook.execution_device)
return self.device
@torch.no_grad()
# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt
def encode_prompt(
...
@@ -179,31 +179,6 @@ class IFImg2ImgSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin):
)
self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_sequential_cpu_offload
def enable_sequential_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's
models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only
when their specific submodule has its `forward` method called.
"""
if is_accelerate_available():
from accelerate import cpu_offload
else:
raise ImportError("Please install accelerate via `pip install accelerate`")
device = torch.device(f"cuda:{gpu_id}")
models = [
self.text_encoder,
self.unet,
]
for cpu_offloaded_model in models:
if cpu_offloaded_model is not None:
cpu_offload(cpu_offloaded_model, device)
if self.safety_checker is not None:
cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_model_cpu_offload
def enable_model_cpu_offload(self, gpu_id=0):
r"""
@@ -401,25 +376,6 @@ class IFImg2ImgSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin):
return caption.strip()
@property
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
def _execution_device(self):
r"""
Returns the device on which the pipeline's models will be executed. After calling
`pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
hooks.
"""
if not hasattr(self.unet, "_hf_hook"):
return self.device
for module in self.unet.modules():
if (
hasattr(module, "_hf_hook")
and hasattr(module._hf_hook, "execution_device")
and module._hf_hook.execution_device is not None
):
return torch.device(module._hf_hook.execution_device)
return self.device
@torch.no_grad()
# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt
def encode_prompt(
...
@@ -171,31 +171,6 @@ class IFInpaintingPipeline(DiffusionPipeline, LoraLoaderMixin):
)
self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_sequential_cpu_offload
def enable_sequential_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's
models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only
when their specific submodule has its `forward` method called.
"""
if is_accelerate_available():
from accelerate import cpu_offload
else:
raise ImportError("Please install accelerate via `pip install accelerate`")
device = torch.device(f"cuda:{gpu_id}")
models = [
self.text_encoder,
self.unet,
]
for cpu_offloaded_model in models:
if cpu_offloaded_model is not None:
cpu_offload(cpu_offloaded_model, device)
if self.safety_checker is not None:
cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_model_cpu_offload
def enable_model_cpu_offload(self, gpu_id=0):
r"""
@@ -253,25 +228,6 @@ class IFInpaintingPipeline(DiffusionPipeline, LoraLoaderMixin):
self.text_encoder_offload_hook = None
self.final_offload_hook = None
@property
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
def _execution_device(self):
r"""
Returns the device on which the pipeline's models will be executed. After calling
`pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
hooks.
"""
if not hasattr(self.unet, "_hf_hook"):
return self.device
for module in self.unet.modules():
if (
hasattr(module, "_hf_hook")
and hasattr(module._hf_hook, "execution_device")
and module._hf_hook.execution_device is not None
):
return torch.device(module._hf_hook.execution_device)
return self.device
@torch.no_grad()
# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt
def encode_prompt(
...
@@ -181,31 +181,6 @@ class IFInpaintingSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin):
)
self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_sequential_cpu_offload
def enable_sequential_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's
models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only
when their specific submodule has its `forward` method called.
"""
if is_accelerate_available():
from accelerate import cpu_offload
else:
raise ImportError("Please install accelerate via `pip install accelerate`")
device = torch.device(f"cuda:{gpu_id}")
models = [
self.text_encoder,
self.unet,
]
for cpu_offloaded_model in models:
if cpu_offloaded_model is not None:
cpu_offload(cpu_offloaded_model, device)
if self.safety_checker is not None:
cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_model_cpu_offload
def enable_model_cpu_offload(self, gpu_id=0):
r"""
@@ -403,25 +378,6 @@ class IFInpaintingSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin):
return caption.strip()
@property
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
def _execution_device(self):
r"""
Returns the device on which the pipeline's models will be executed. After calling
`pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
hooks.
"""
if not hasattr(self.unet, "_hf_hook"):
return self.device
for module in self.unet.modules():
if (
hasattr(module, "_hf_hook")
and hasattr(module._hf_hook, "execution_device")
and module._hf_hook.execution_device is not None
):
return torch.device(module._hf_hook.execution_device)
return self.device
@torch.no_grad()
# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt
def encode_prompt(
...
@@ -137,31 +137,6 @@ class IFSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin):
)
self.register_to_config(requires_safety_checker=requires_safety_checker)
# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_sequential_cpu_offload
def enable_sequential_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's
models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only
when their specific submodule has its `forward` method called.
"""
if is_accelerate_available():
from accelerate import cpu_offload
else:
raise ImportError("Please install accelerate via `pip install accelerate`")
device = torch.device(f"cuda:{gpu_id}")
models = [
self.text_encoder,
self.unet,
]
for cpu_offloaded_model in models:
if cpu_offloaded_model is not None:
cpu_offload(cpu_offloaded_model, device)
if self.safety_checker is not None:
cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_model_cpu_offload
def enable_model_cpu_offload(self, gpu_id=0):
r"""
@@ -359,25 +334,6 @@ class IFSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin):
return caption.strip()
@property
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
def _execution_device(self):
r"""
Returns the device on which the pipeline's models will be executed. After calling
`pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
hooks.
"""
if not hasattr(self.unet, "_hf_hook"):
return self.device
for module in self.unet.modules():
if (
hasattr(module, "_hf_hook")
and hasattr(module._hf_hook, "execution_device")
and module._hf_hook.execution_device is not None
):
return torch.device(module._hf_hook.execution_device)
return self.device
@torch.no_grad()
# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt
def encode_prompt(
...
@@ -121,13 +121,13 @@ class DiTPipeline(DiffusionPipeline):
latents = randn_tensor(
shape=(batch_size, latent_channels, latent_size, latent_size),
generator=generator,
device=self.device,
device=self._execution_device,
dtype=self.transformer.dtype,
)
latent_model_input = torch.cat([latents] * 2) if guidance_scale > 1 else latents
class_labels = torch.tensor(class_labels, device=self.device).reshape(-1)
class_labels = torch.tensor(class_labels, device=self._execution_device).reshape(-1)
class_null = torch.tensor([1000] * batch_size, device=self.device)
class_null = torch.tensor([1000] * batch_size, device=self._execution_device)
class_labels_input = torch.cat([class_labels, class_null], 0) if guidance_scale > 1 else class_labels
# set step values
...
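
DanceDiffusion, DDIM, and DiT above now draw their initial noise (and, for DiT, the class-label tensors) on `self._execution_device` instead of `self.device`. Under sequential CPU offload the pipeline object reports a CPU or meta device while the models actually run on the GPU, so allocating on `self.device` would place these tensors on the wrong device. A small usage sketch; the checkpoint id and the availability of a CUDA GPU are assumptions, and the attribute names follow the code shown in this diff:

```python
import torch
from diffusers import DDIMPipeline

# Any DDIM-compatible checkpoint works the same way; this id is just an example.
pipe = DDIMPipeline.from_pretrained("google/ddpm-cifar10-32")
pipe.enable_sequential_cpu_offload()  # now provided by the shared base-class helper

print(pipe.device)             # cpu/meta: the pipeline object itself never moves to the GPU
print(pipe._execution_device)  # cuda:0: inferred from accelerate's module hooks

# Initial noise therefore has to be created on the execution device,
# which is exactly what the changed lines above now do.
noise = torch.randn((1, 3, 32, 32), device=pipe._execution_device)
```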
@@ -229,28 +229,6 @@ class KandinskyPipeline(DiffusionPipeline):
return prompt_embeds, text_encoder_hidden_states, text_mask
def enable_sequential_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's
models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only
when their specific submodule has its `forward` method called.
"""
if is_accelerate_available():
from accelerate import cpu_offload
else:
raise ImportError("Please install accelerate via `pip install accelerate`")
device = torch.device(f"cuda:{gpu_id}")
models = [
self.unet,
self.text_encoder,
self.movq,
]
for cpu_offloaded_model in models:
if cpu_offloaded_model is not None:
cpu_offload(cpu_offloaded_model, device)
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
@@ -276,25 +254,6 @@ class KandinskyPipeline(DiffusionPipeline):
# We'll offload the last model manually.
self.final_offload_hook = hook
@property
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
def _execution_device(self):
r"""
Returns the device on which the pipeline's models will be executed. After calling
`pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
hooks.
"""
if not hasattr(self.unet, "_hf_hook"):
return self.device
for module in self.unet.modules():
if (
hasattr(module, "_hf_hook")
and hasattr(module._hf_hook, "execution_device")
and module._hf_hook.execution_device is not None
):
return torch.device(module._hf_hook.execution_device)
return self.device
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
...
@@ -264,28 +264,6 @@ class KandinskyImg2ImgPipeline(DiffusionPipeline):
return prompt_embeds, text_encoder_hidden_states, text_mask
def enable_sequential_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's
models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only
when their specific submodule has its `forward` method called.
"""
if is_accelerate_available():
from accelerate import cpu_offload
else:
raise ImportError("Please install accelerate via `pip install accelerate`")
device = torch.device(f"cuda:{gpu_id}")
models = [
self.unet,
self.text_encoder,
self.movq,
]
for cpu_offloaded_model in models:
if cpu_offloaded_model is not None:
cpu_offload(cpu_offloaded_model, device)
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
@@ -311,25 +289,6 @@ class KandinskyImg2ImgPipeline(DiffusionPipeline):
# We'll offload the last model manually.
self.final_offload_hook = hook
@property
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
def _execution_device(self):
r"""
Returns the device on which the pipeline's models will be executed. After calling
`pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
hooks.
"""
if not hasattr(self.unet, "_hf_hook"):
return self.device
for module in self.unet.modules():
if (
hasattr(module, "_hf_hook")
and hasattr(module._hf_hook, "execution_device")
and module._hf_hook.execution_device is not None
):
return torch.device(module._hf_hook.execution_device)
return self.device
# add_noise method to overwrite the one in schedule because it use a different beta schedule for adding noise vs sampling
def add_noise(
self,
...
@@ -389,28 +389,6 @@ class KandinskyInpaintPipeline(DiffusionPipeline):
return prompt_embeds, text_encoder_hidden_states, text_mask
def enable_sequential_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's
models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only
when their specific submodule has its `forward` method called.
"""
if is_accelerate_available():
from accelerate import cpu_offload
else:
raise ImportError("Please install accelerate via `pip install accelerate`")
device = torch.device(f"cuda:{gpu_id}")
models = [
self.unet,
self.text_encoder,
self.movq,
]
for cpu_offloaded_model in models:
if cpu_offloaded_model is not None:
cpu_offload(cpu_offloaded_model, device)
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
@@ -436,25 +414,6 @@ class KandinskyInpaintPipeline(DiffusionPipeline):
# We'll offload the last model manually.
self.final_offload_hook = hook
@property
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
def _execution_device(self):
r"""
Returns the device on which the pipeline's models will be executed. After calling
`pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
hooks.
"""
if not hasattr(self.unet, "_hf_hook"):
return self.device
for module in self.unet.modules():
if (
hasattr(module, "_hf_hook")
and hasattr(module._hf_hook, "execution_device")
and module._hf_hook.execution_device is not None
):
return torch.device(module._hf_hook.execution_device)
return self.device
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
...
@@ -25,7 +25,6 @@ from ...pipelines import DiffusionPipeline
from ...schedulers import UnCLIPScheduler
from ...utils import (
BaseOutput,
is_accelerate_available,
logging,
randn_tensor,
replace_example_docstring,
@@ -147,6 +146,8 @@ class KandinskyPriorPipeline(DiffusionPipeline):
A scheduler to be used in combination with `prior` to generate image embedding.
"""
_exclude_from_cpu_offload = ["prior"]
def __init__(
self,
prior: PriorTransformer,
@@ -295,45 +296,6 @@ class KandinskyPriorPipeline(DiffusionPipeline):
zero_image_emb = zero_image_emb.repeat(batch_size, 1)
return zero_image_emb
def enable_sequential_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's
models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only
when their specific submodule has its `forward` method called.
"""
if is_accelerate_available():
from accelerate import cpu_offload
else:
raise ImportError("Please install accelerate via `pip install accelerate`")
device = torch.device(f"cuda:{gpu_id}")
models = [
self.image_encoder,
self.text_encoder,
]
for cpu_offloaded_model in models:
if cpu_offloaded_model is not None:
cpu_offload(cpu_offloaded_model, device)
@property
def _execution_device(self):
r"""
Returns the device on which the pipeline's models will be executed. After calling
`pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
hooks.
"""
if self.device != torch.device("meta") or not hasattr(self.text_encoder, "_hf_hook"):
return self.device
for module in self.text_encoder.modules():
if (
hasattr(module, "_hf_hook")
and hasattr(module._hf_hook, "execution_device")
and module._hf_hook.execution_device is not None
):
return torch.device(module._hf_hook.execution_device)
return self.device
def _encode_prompt(
self,
prompt,
...
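
The only addition in this file is the `_exclude_from_cpu_offload = ["prior"]` class attribute. The removed `enable_sequential_cpu_offload` here only ever offloaded the image and text encoders, never the `prior`, so the attribute preserves that behaviour once the shared offload loop takes over. A sketch of how a pipeline would declare such an exception; the class name is hypothetical, the attribute name comes from the hunk above:

```python
from diffusers import DiffusionPipeline


class MyPriorPipelineSketch(DiffusionPipeline):
    # The prior is skipped by the shared enable_sequential_cpu_offload loop;
    # only the remaining nn.Module components get wrapped in offload hooks.
    _exclude_from_cpu_offload = ["prior"]
```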