Unverified Commit 7855ac59 authored by Fanli Lin, committed by GitHub

[tests] make tests device-agnostic (part 4) (#10508)



* initial commit

* fix empty cache

* fix one more

* fix style

* update device functions

* update

* update

* Update src/diffusers/utils/testing_utils.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update src/diffusers/utils/testing_utils.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update src/diffusers/utils/testing_utils.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update tests/pipelines/controlnet/test_controlnet.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update src/diffusers/utils/testing_utils.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update src/diffusers/utils/testing_utils.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update tests/pipelines/controlnet/test_controlnet.py
Co-authored-by: hlky <hlky@hlky.ac>

* with gc.collect

* update

* make style

* check_torch_dependencies

* add mps empty cache

* add changes

* bug fix

* enable on xpu

* update more cases

* revert

* revert back

* Update test_stable_diffusion_xl.py

* Update tests/pipelines/stable_diffusion/test_stable_diffusion.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update tests/pipelines/stable_diffusion/test_stable_diffusion.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
Co-authored-by: hlky <hlky@hlky.ac>

* Update tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
Co-authored-by: hlky <hlky@hlky.ac>

* Apply suggestions from code review
Co-authored-by: hlky <hlky@hlky.ac>

* add test marker

---------
Co-authored-by: hlky <hlky@hlky.ac>
parent 30cef6bf
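
The diffs below swap CUDA-only calls for device-agnostic helpers added to src/diffusers/utils/testing_utils.py. A minimal sketch of the dispatch idea behind the two helpers the first files pull in — the names match the diff, but the bodies are illustrative assumptions rather than the PR's exact implementation:

import unittest

import torch

from diffusers.utils.testing_utils import torch_device  # e.g. "cuda", "xpu", "mps", or "cpu"


def backend_empty_cache(device: str) -> None:
    # Illustrative dispatch: clear the allocator cache of whichever backend
    # the test session selected; a no-op on CPU.
    if device == "cuda":
        torch.cuda.empty_cache()
    elif device == "xpu":
        torch.xpu.empty_cache()
    elif device == "mps":
        torch.mps.empty_cache()


def require_torch_accelerator(test_case):
    # Unlike the old require_torch_gpu (CUDA only), skip only when no
    # accelerator of any kind is selected for the test session.
    return unittest.skipUnless(torch_device != "cpu", "test requires an accelerator")(test_case)

With that in place, each @require_torch_gpu call site below becomes @require_torch_accelerator, so the same suites can run on XPU or MPS runners.
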
@@ -24,11 +24,12 @@ from diffusers import DDPMWuerstchenScheduler, StableCascadeDecoderPipeline
 from diffusers.models import StableCascadeUNet
 from diffusers.pipelines.wuerstchen import PaellaVQModel
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     load_numpy,
     load_pt,
     numpy_cosine_similarity_distance,
-    require_torch_gpu,
+    require_torch_accelerator,
     skip_mps,
     slow,
     torch_device,
@@ -278,25 +279,25 @@ class StableCascadeDecoderPipelineFastTests(PipelineTesterMixin, unittest.TestCa
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableCascadeDecoderPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_stable_cascade_decoder(self):
         pipe = StableCascadeDecoderPipeline.from_pretrained(
             "stabilityai/stable-cascade", variant="bf16", torch_dtype=torch.bfloat16
         )
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         pipe.set_progress_bar_config(disable=None)

         prompt = "A photograph of the inside of a subway train. There are raccoons sitting on the seats. One of them is reading a newspaper. The window shows the city in the background."
...
@@ -24,11 +24,12 @@ from diffusers import DDPMWuerstchenScheduler, StableCascadePriorPipeline
 from diffusers.models import StableCascadeUNet
 from diffusers.utils.import_utils import is_peft_available
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     load_numpy,
     numpy_cosine_similarity_distance,
     require_peft_backend,
-    require_torch_gpu,
+    require_torch_accelerator,
     skip_mps,
     slow,
     torch_device,
@@ -246,25 +247,25 @@ class StableCascadePriorPipelineFastTests(PipelineTesterMixin, unittest.TestCase
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableCascadePriorPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_stable_cascade_prior(self):
         pipe = StableCascadePriorPipeline.from_pretrained(
             "stabilityai/stable-cascade-prior", variant="bf16", torch_dtype=torch.bfloat16
         )
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         pipe.set_progress_bar_config(disable=None)

         prompt = "A photograph of the inside of a subway train. There are raccoons sitting on the seats. One of them is reading a newspaper. The window shows the city in the background."
...
@@ -44,6 +44,10 @@ from diffusers import (
 )
 from diffusers.utils.testing_utils import (
     CaptureLogger,
+    backend_empty_cache,
+    backend_max_memory_allocated,
+    backend_reset_max_memory_allocated,
+    backend_reset_peak_memory_stats,
     enable_full_determinism,
     is_torch_compile,
     load_image,
@@ -52,7 +56,7 @@ from diffusers.utils.testing_utils import (
     numpy_cosine_similarity_distance,
     require_accelerate_version_greater,
     require_torch_2,
-    require_torch_gpu,
+    require_torch_accelerator,
     require_torch_multi_gpu,
     run_test_in_subprocess,
     skip_mps,
@@ -781,11 +785,11 @@ class StableDiffusionPipelineFastTests(
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
         generator = torch.Generator(device=generator_device).manual_seed(seed)
@@ -887,7 +891,7 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
         assert np.abs(image_slice - expected_slice).max() < 3e-3

     def test_stable_diffusion_attention_slicing(self):
-        torch.cuda.reset_peak_memory_stats()
+        backend_reset_peak_memory_stats(torch_device)
         pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
         pipe.unet.set_default_attn_processor()
         pipe = pipe.to(torch_device)
@@ -898,8 +902,8 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
         inputs = self.get_inputs(torch_device, dtype=torch.float16)
         image_sliced = pipe(**inputs).images

-        mem_bytes = torch.cuda.max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        mem_bytes = backend_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
         # make sure that less than 3.75 GB is allocated
         assert mem_bytes < 3.75 * 10**9
@@ -910,13 +914,13 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
         image = pipe(**inputs).images

         # make sure that more than 3.75 GB is allocated
-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
         assert mem_bytes > 3.75 * 10**9

         max_diff = numpy_cosine_similarity_distance(image_sliced.flatten(), image.flatten())
         assert max_diff < 1e-3

     def test_stable_diffusion_vae_slicing(self):
-        torch.cuda.reset_peak_memory_stats()
+        backend_reset_peak_memory_stats(torch_device)
         pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
         pipe = pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
@@ -929,8 +933,8 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
         inputs["latents"] = torch.cat([inputs["latents"]] * 4)
         image_sliced = pipe(**inputs).images

-        mem_bytes = torch.cuda.max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        mem_bytes = backend_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
         # make sure that less than 4 GB is allocated
         assert mem_bytes < 4e9
@@ -942,14 +946,14 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
         image = pipe(**inputs).images

         # make sure that more than 4 GB is allocated
-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
         assert mem_bytes > 4e9
         # There is a small discrepancy at the image borders vs. a fully batched version.
         max_diff = numpy_cosine_similarity_distance(image_sliced.flatten(), image.flatten())
         assert max_diff < 1e-2

     def test_stable_diffusion_vae_tiling(self):
-        torch.cuda.reset_peak_memory_stats()
+        backend_reset_peak_memory_stats(torch_device)
         model_id = "CompVis/stable-diffusion-v1-4"
         pipe = StableDiffusionPipeline.from_pretrained(
             model_id, variant="fp16", torch_dtype=torch.float16, safety_checker=None
@@ -963,7 +967,7 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
         # enable vae tiling
         pipe.enable_vae_tiling()
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         generator = torch.Generator(device="cpu").manual_seed(0)
         output_chunked = pipe(
             [prompt],
@@ -976,7 +980,7 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
         )
         image_chunked = output_chunked.images

-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
         # disable vae tiling
         pipe.disable_vae_tiling()
@@ -1069,26 +1073,25 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
         assert 2 * low_cpu_mem_usage_time < normal_load_time

     def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)

         pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
         pipe.set_progress_bar_config(disable=None)
         pipe.enable_attention_slicing(1)
-        pipe.enable_sequential_cpu_offload()
+        pipe.enable_sequential_cpu_offload(device=torch_device)

         inputs = self.get_inputs(torch_device, dtype=torch.float16)
         _ = pipe(**inputs)

-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
         # make sure that less than 2.8 GB is allocated
         assert mem_bytes < 2.8 * 10**9

     def test_stable_diffusion_pipeline_with_model_offloading(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_peak_memory_stats(torch_device)

         inputs = self.get_inputs(torch_device, dtype=torch.float16)
@@ -1102,7 +1105,7 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
         outputs = pipe(**inputs)
-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)

         # With model offloading
@@ -1113,16 +1116,16 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
         )
         pipe.unet.set_default_attn_processor()

-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)

-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         pipe.set_progress_bar_config(disable=None)
         inputs = self.get_inputs(torch_device, dtype=torch.float16)
         outputs_offloaded = pipe(**inputs)
-        mem_bytes_offloaded = torch.cuda.max_memory_allocated()
+        mem_bytes_offloaded = backend_max_memory_allocated(torch_device)

         images = outputs.images
         offloaded_images = outputs_offloaded.images
@@ -1135,13 +1138,13 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
             assert module.device == torch.device("cpu")

         # With attention slicing
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)

         pipe.enable_attention_slicing()
         _ = pipe(**inputs)
-        mem_bytes_slicing = torch.cuda.max_memory_allocated()
+        mem_bytes_slicing = backend_max_memory_allocated(torch_device)

         assert mem_bytes_slicing < mem_bytes_offloaded
         assert mem_bytes_slicing < 3 * 10**9
@@ -1156,7 +1159,7 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
         )
         pipe.load_textual_inversion(a111_file)
         pipe.load_textual_inversion(a111_file_neg)
-        pipe.to("cuda")
+        pipe.to(torch_device)

         generator = torch.Generator(device="cpu").manual_seed(1)
@@ -1173,7 +1176,7 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
     def test_stable_diffusion_textual_inversion_with_model_cpu_offload(self):
         pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         pipe.load_textual_inversion("sd-concepts-library/low-poly-hd-logos-icons")

         a111_file = hf_hub_download("hf-internal-testing/text_inv_embedding_a1111_format", "winter_style.pt")
@@ -1198,8 +1201,8 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
     def test_stable_diffusion_textual_inversion_with_sequential_cpu_offload(self):
         pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
-        pipe.enable_sequential_cpu_offload()
-        pipe.load_textual_inversion("sd-concepts-library/low-poly-hd-logos-icons")
+        pipe.enable_sequential_cpu_offload(device=torch_device)
+        pipe.load_textual_inversion("sd-concepts-library/low-poly-hd-logos-icons").to(torch_device)

         a111_file = hf_hub_download("hf-internal-testing/text_inv_embedding_a1111_format", "winter_style.pt")
         a111_file_neg = hf_hub_download(
@@ -1257,17 +1260,17 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionPipelineCkptTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_download_from_hub(self):
         ckpt_paths = [
@@ -1278,7 +1281,7 @@ class StableDiffusionPipelineCkptTests(unittest.TestCase):
         for ckpt_path in ckpt_paths:
             pipe = StableDiffusionPipeline.from_single_file(ckpt_path, torch_dtype=torch.float16)
             pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
-            pipe.to("cuda")
+            pipe.to(torch_device)

         image_out = pipe("test", num_inference_steps=1, output_type="np").images[0]
@@ -1294,7 +1297,7 @@ class StableDiffusionPipelineCkptTests(unittest.TestCase):
             ckpt_filename, config_files={"v1": config_filename}, torch_dtype=torch.float16
         )
         pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
-        pipe.to("cuda")
+        pipe.to(torch_device)

         image_out = pipe("test", num_inference_steps=1, output_type="np").images[0]
@@ -1302,17 +1305,17 @@ class StableDiffusionPipelineCkptTests(unittest.TestCase):
 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionPipelineNightlyTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
         generator = torch.Generator(device=generator_device).manual_seed(seed)
@@ -1412,7 +1415,7 @@ class StableDiffusionPipelineDeviceMapTests(unittest.TestCase):
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(self, generator_device="cpu", seed=0):
         generator = torch.Generator(device=generator_device).manual_seed(seed)
...
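
The memory assertions in the file above go through three more helpers from the same import block. A plausible shape for them, continuing the sketch after the commit message (assumption: recent PyTorch XPU builds mirror the torch.cuda memory-stats API; backends without such counters fall back to a harmless default):

import torch


def backend_max_memory_allocated(device: str) -> int:
    # Peak bytes allocated on the selected backend since the last reset.
    if device == "cuda":
        return torch.cuda.max_memory_allocated()
    if device == "xpu":
        return torch.xpu.max_memory_allocated()  # assumption: present in XPU builds
    return 0  # e.g. MPS/CPU: no peak-allocation counter to report


def backend_reset_max_memory_allocated(device: str) -> None:
    if device == "cuda":
        torch.cuda.reset_max_memory_allocated()


def backend_reset_peak_memory_stats(device: str) -> None:
    if device == "cuda":
        torch.cuda.reset_peak_memory_stats()
    elif device == "xpu":
        torch.xpu.reset_peak_memory_stats()  # assumption: present in XPU builds
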
@@ -35,6 +35,10 @@ from diffusers import (
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
+    backend_max_memory_allocated,
+    backend_reset_max_memory_allocated,
+    backend_reset_peak_memory_stats,
     enable_full_determinism,
     floats_tensor,
     is_torch_compile,
@@ -42,7 +46,7 @@ from diffusers.utils.testing_utils import (
     load_numpy,
     nightly,
     require_torch_2,
-    require_torch_gpu,
+    require_torch_accelerator,
     run_test_in_subprocess,
     skip_mps,
     slow,
@@ -400,17 +404,17 @@ class StableDiffusionImg2ImgPipelineFastTests(
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionImg2ImgPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
         generator = torch.Generator(device=generator_device).manual_seed(seed)
@@ -513,28 +517,28 @@ class StableDiffusionImg2ImgPipelineSlowTests(unittest.TestCase):
         assert number_of_steps == 2

     def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)

         pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
             "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16
         )
         pipe.set_progress_bar_config(disable=None)
         pipe.enable_attention_slicing(1)
-        pipe.enable_sequential_cpu_offload()
+        pipe.enable_sequential_cpu_offload(device=torch_device)

         inputs = self.get_inputs(torch_device, dtype=torch.float16)
         _ = pipe(**inputs)

-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
         # make sure that less than 2.2 GB is allocated
         assert mem_bytes < 2.2 * 10**9

     def test_stable_diffusion_pipeline_with_model_offloading(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)

         inputs = self.get_inputs(torch_device, dtype=torch.float16)
@@ -548,7 +552,7 @@ class StableDiffusionImg2ImgPipelineSlowTests(unittest.TestCase):
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
         pipe(**inputs)
-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)

         # With model offloading
@@ -559,14 +563,14 @@ class StableDiffusionImg2ImgPipelineSlowTests(unittest.TestCase):
             torch_dtype=torch.float16,
         )

-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)

-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         pipe.set_progress_bar_config(disable=None)
         _ = pipe(**inputs)
-        mem_bytes_offloaded = torch.cuda.max_memory_allocated()
+        mem_bytes_offloaded = backend_max_memory_allocated(torch_device)

         assert mem_bytes_offloaded < mem_bytes
         for module in pipe.text_encoder, pipe.unet, pipe.vae:
@@ -663,17 +667,17 @@ class StableDiffusionImg2ImgPipelineSlowTests(unittest.TestCase):
 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionImg2ImgPipelineNightlyTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
         generator = torch.Generator(device=generator_device).manual_seed(seed)
...
@@ -37,6 +37,10 @@ from diffusers import (
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
+    backend_max_memory_allocated,
+    backend_reset_max_memory_allocated,
+    backend_reset_peak_memory_stats,
     enable_full_determinism,
     floats_tensor,
     is_torch_compile,
@@ -44,7 +48,7 @@ from diffusers.utils.testing_utils import (
     load_numpy,
     nightly,
     require_torch_2,
-    require_torch_gpu,
+    require_torch_accelerator,
     run_test_in_subprocess,
     slow,
     torch_device,
@@ -602,7 +606,7 @@ class StableDiffusionSimpleInpaintPipelineFastTests(StableDiffusionInpaintPipeli
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionInpaintPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
@@ -610,7 +614,7 @@ class StableDiffusionInpaintPipelineSlowTests(unittest.TestCase):
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
         generator = torch.Generator(device=generator_device).manual_seed(seed)
@@ -704,21 +708,21 @@ class StableDiffusionInpaintPipelineSlowTests(unittest.TestCase):
         assert np.abs(expected_slice - image_slice).max() < 6e-3

     def test_stable_diffusion_inpaint_with_sequential_cpu_offloading(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)

         pipe = StableDiffusionInpaintPipeline.from_pretrained(
             "botp/stable-diffusion-v1-5-inpainting", safety_checker=None, torch_dtype=torch.float16
         )
         pipe.set_progress_bar_config(disable=None)
         pipe.enable_attention_slicing(1)
-        pipe.enable_sequential_cpu_offload()
+        pipe.enable_sequential_cpu_offload(device=torch_device)

         inputs = self.get_inputs(torch_device, dtype=torch.float16)
         _ = pipe(**inputs)

-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
         # make sure that less than 2.2 GB is allocated
         assert mem_bytes < 2.2 * 10**9
@@ -793,7 +797,7 @@ class StableDiffusionInpaintPipelineSlowTests(unittest.TestCase):
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionInpaintPipelineAsymmetricAutoencoderKLSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
@@ -801,7 +805,7 @@ class StableDiffusionInpaintPipelineAsymmetricAutoencoderKLSlowTests(unittest.Te
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
         generator = torch.Generator(device=generator_device).manual_seed(seed)
@@ -907,9 +911,9 @@ class StableDiffusionInpaintPipelineAsymmetricAutoencoderKLSlowTests(unittest.Te
         assert np.abs(expected_slice - image_slice).max() < 6e-3

     def test_stable_diffusion_inpaint_with_sequential_cpu_offloading(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)

         vae = AsymmetricAutoencoderKL.from_pretrained(
             "cross-attention/asymmetric-autoencoder-kl-x-1-5", torch_dtype=torch.float16
@@ -920,12 +924,12 @@ class StableDiffusionInpaintPipelineAsymmetricAutoencoderKLSlowTests(unittest.Te
         pipe.vae = vae
         pipe.set_progress_bar_config(disable=None)
         pipe.enable_attention_slicing(1)
-        pipe.enable_sequential_cpu_offload()
+        pipe.enable_sequential_cpu_offload(device=torch_device)

         inputs = self.get_inputs(torch_device, dtype=torch.float16)
         _ = pipe(**inputs)

-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
         # make sure that less than 2.45 GB is allocated
         assert mem_bytes < 2.45 * 10**9
@@ -1009,7 +1013,7 @@ class StableDiffusionInpaintPipelineAsymmetricAutoencoderKLSlowTests(unittest.Te
         pipe = StableDiffusionInpaintPipeline.from_single_file(filename, torch_dtype=torch.float16)
         pipe.vae = vae
         pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
-        pipe.to("cuda")
+        pipe.to(torch_device)

         inputs = self.get_inputs(torch_device)
         inputs["num_inference_steps"] = 1
@@ -1019,17 +1023,17 @@ class StableDiffusionInpaintPipelineAsymmetricAutoencoderKLSlowTests(unittest.Te
 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionInpaintPipelineNightlyTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
         generator = torch.Generator(device=generator_device).manual_seed(seed)
...
@@ -33,10 +33,14 @@ from diffusers import (
 )
 from diffusers.image_processor import VaeImageProcessor
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
+    backend_max_memory_allocated,
+    backend_reset_max_memory_allocated,
+    backend_reset_peak_memory_stats,
     enable_full_determinism,
     floats_tensor,
     load_image,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -266,17 +270,17 @@ class StableDiffusionInstructPix2PixPipelineFastTests(
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionInstructPix2PixPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(self, seed=0):
         generator = torch.manual_seed(seed)
@@ -384,21 +388,21 @@ class StableDiffusionInstructPix2PixPipelineSlowTests(unittest.TestCase):
         assert number_of_steps == 3

     def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)

         pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
             "timbrooks/instruct-pix2pix", safety_checker=None, torch_dtype=torch.float16
         )
         pipe.set_progress_bar_config(disable=None)
         pipe.enable_attention_slicing(1)
-        pipe.enable_sequential_cpu_offload()
+        pipe.enable_sequential_cpu_offload(device=torch_device)

         inputs = self.get_inputs()
         _ = pipe(**inputs)

-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
         # make sure that less than 2.2 GB is allocated
         assert mem_bytes < 2.2 * 10**9
...
@@ -34,12 +34,13 @@ from diffusers import (
 from diffusers.utils.testing_utils import (
     CaptureLogger,
     backend_empty_cache,
+    backend_max_memory_allocated,
+    backend_reset_peak_memory_stats,
     enable_full_determinism,
     load_numpy,
     nightly,
     numpy_cosine_similarity_distance,
     require_torch_accelerator,
-    require_torch_gpu,
     skip_mps,
     slow,
     torch_device,
@@ -330,9 +331,8 @@ class StableDiffusion2PipelineSlowTests(unittest.TestCase):
         backend_empty_cache(torch_device)

     def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
-        _generator_device = "cpu" if not generator_device.startswith("cuda") else "cuda"
         if not str(device).startswith("mps"):
-            generator = torch.Generator(device=_generator_device).manual_seed(seed)
+            generator = torch.Generator(device=generator_device).manual_seed(seed)
         else:
             generator = torch.manual_seed(seed)
@@ -361,9 +361,9 @@ class StableDiffusion2PipelineSlowTests(unittest.TestCase):
         expected_slice = np.array([0.49493, 0.47896, 0.40798, 0.54214, 0.53212, 0.48202, 0.47656, 0.46329, 0.48506])
         assert np.abs(image_slice - expected_slice).max() < 7e-3

-    @require_torch_gpu
+    @require_torch_accelerator
     def test_stable_diffusion_attention_slicing(self):
-        torch.cuda.reset_peak_memory_stats()
+        backend_reset_peak_memory_stats(torch_device)
         pipe = StableDiffusionPipeline.from_pretrained(
             "stabilityai/stable-diffusion-2-base", torch_dtype=torch.float16
         )
@@ -376,8 +376,8 @@ class StableDiffusion2PipelineSlowTests(unittest.TestCase):
         inputs = self.get_inputs(torch_device, dtype=torch.float16)
         image_sliced = pipe(**inputs).images

-        mem_bytes = torch.cuda.max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        mem_bytes = backend_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)
         # make sure that less than 3.3 GB is allocated
         assert mem_bytes < 3.3 * 10**9
@@ -388,7 +388,7 @@ class StableDiffusion2PipelineSlowTests(unittest.TestCase):
         image = pipe(**inputs).images

         # make sure that more than 3.3 GB is allocated
-        mem_bytes = torch.cuda.max_memory_allocated()
+        mem_bytes = backend_max_memory_allocated(torch_device)
         assert mem_bytes > 3.3 * 10**9
         max_diff = numpy_cosine_similarity_distance(image.flatten(), image_sliced.flatten())
         assert max_diff < 5e-3
...
@@ -37,6 +37,7 @@ from diffusers import (
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
@@ -44,7 +45,7 @@ from diffusers.utils.testing_utils import (
     nightly,
     require_accelerate_version_greater,
     require_accelerator,
-    require_torch_gpu,
+    require_torch_accelerator,
     skip_mps,
     slow,
     torch_device,
@@ -378,17 +379,17 @@ class StableDiffusionDepth2ImgPipelineFastTests(
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionDepth2ImgPipelineSlowTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(self, device="cpu", dtype=torch.float32, seed=0):
         generator = torch.Generator(device=device).manual_seed(seed)
@@ -425,17 +426,17 @@ class StableDiffusionDepth2ImgPipelineSlowTests(unittest.TestCase):
 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionImg2ImgPipelineNightlyTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def get_inputs(self, device="cpu", dtype=torch.float32, seed=0):
         generator = torch.Generator(device=device).manual_seed(seed)
...
@@ -33,12 +33,13 @@ from diffusers import (
     UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
     load_image,
     nightly,
     numpy_cosine_similarity_distance,
-    require_torch_gpu,
+    require_torch_accelerator,
     torch_device,
 )
@@ -299,18 +300,18 @@ class StableDiffusionDiffEditPipelineFastTests(
         return super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict)

-@require_torch_gpu
+@require_torch_accelerator
 @nightly
 class StableDiffusionDiffEditPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     @classmethod
     def setUpClass(cls):
@@ -331,7 +332,7 @@ class StableDiffusionDiffEditPipelineIntegrationTests(unittest.TestCase):
         pipe.scheduler.clip_sample = True

         pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config)
-        pipe.enable_model_cpu_offload()
+        pipe.enable_model_cpu_offload(device=torch_device)
         pipe.set_progress_bar_config(disable=None)

         source_prompt = "a bowl of fruit"
@@ -377,17 +378,17 @@ class StableDiffusionDiffEditPipelineIntegrationTests(unittest.TestCase):
 @nightly
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionDiffEditPipelineNightlyTests(unittest.TestCase):
     def setUp(self):
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     @classmethod
     def setUpClass(cls):
...
@@ -24,11 +24,14 @@ from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
 from diffusers import AutoencoderKL, PNDMScheduler, StableDiffusionInpaintPipeline, UNet2DConditionModel
 from diffusers.utils.testing_utils import (
+    backend_empty_cache,
+    backend_reset_max_memory_allocated,
+    backend_reset_peak_memory_stats,
     enable_full_determinism,
     floats_tensor,
     load_image,
     load_numpy,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -161,19 +164,19 @@ class StableDiffusion2InpaintPipelineFastTests(
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class StableDiffusionInpaintPipelineIntegrationTests(unittest.TestCase):
     def setUp(self):
         # clean up the VRAM before each test
         super().setUp()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)

     def test_stable_diffusion_inpaint_pipeline(self):
         init_image = load_image(
@@ -248,9 +251,9 @@ class StableDiffusionInpaintPipelineIntegrationTests(unittest.TestCase):
         assert np.abs(expected_image - image).max() < 5e-1

     def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
+        backend_empty_cache(torch_device)
+        backend_reset_max_memory_allocated(torch_device)
+        backend_reset_peak_memory_stats(torch_device)

         init_image = load_image(
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
@@ -270,7 +273,7 @@ class StableDiffusionInpaintPipelineIntegrationTests(unittest.TestCase):
         )
         pipe.set_progress_bar_config(disable=None)
         pipe.enable_attention_slicing(1)
-        pipe.enable_sequential_cpu_offload()
+        pipe.enable_sequential_cpu_offload(device=torch_device)

         prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
...
...@@ -31,11 +31,12 @@ from diffusers import ( ...@@ -31,11 +31,12 @@ from diffusers import (
) )
from diffusers.schedulers import KarrasDiffusionSchedulers from diffusers.schedulers import KarrasDiffusionSchedulers
from diffusers.utils.testing_utils import ( from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism, enable_full_determinism,
floats_tensor, floats_tensor,
load_image, load_image,
load_numpy, load_numpy,
require_torch_gpu, require_torch_accelerator,
slow, slow,
torch_device, torch_device,
) )
...@@ -284,29 +285,29 @@ class StableDiffusionLatentUpscalePipelineFastTests( ...@@ -284,29 +285,29 @@ class StableDiffusionLatentUpscalePipelineFastTests(
pass pass
@require_torch_gpu @require_torch_accelerator
@slow @slow
class StableDiffusionLatentUpscalePipelineIntegrationTests(unittest.TestCase): class StableDiffusionLatentUpscalePipelineIntegrationTests(unittest.TestCase):
def setUp(self): def setUp(self):
super().setUp() super().setUp()
gc.collect() gc.collect()
torch.cuda.empty_cache() backend_empty_cache(torch_device)
def tearDown(self): def tearDown(self):
super().tearDown() super().tearDown()
gc.collect() gc.collect()
torch.cuda.empty_cache() backend_empty_cache(torch_device)
def test_latent_upscaler_fp16(self): def test_latent_upscaler_fp16(self):
generator = torch.manual_seed(33) generator = torch.manual_seed(33)
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
pipe.to("cuda") pipe.to(torch_device)
upscaler = StableDiffusionLatentUpscalePipeline.from_pretrained( upscaler = StableDiffusionLatentUpscalePipeline.from_pretrained(
"stabilityai/sd-x2-latent-upscaler", torch_dtype=torch.float16 "stabilityai/sd-x2-latent-upscaler", torch_dtype=torch.float16
) )
upscaler.to("cuda") upscaler.to(torch_device)
prompt = "a photo of an astronaut high resolution, unreal engine, ultra realistic" prompt = "a photo of an astronaut high resolution, unreal engine, ultra realistic"
...@@ -332,7 +333,7 @@ class StableDiffusionLatentUpscalePipelineIntegrationTests(unittest.TestCase): ...@@ -332,7 +333,7 @@ class StableDiffusionLatentUpscalePipelineIntegrationTests(unittest.TestCase):
upscaler = StableDiffusionLatentUpscalePipeline.from_pretrained( upscaler = StableDiffusionLatentUpscalePipeline.from_pretrained(
"stabilityai/sd-x2-latent-upscaler", torch_dtype=torch.float16 "stabilityai/sd-x2-latent-upscaler", torch_dtype=torch.float16
) )
upscaler.to("cuda") upscaler.to(torch_device)
prompt = "the temple of fire by Ross Tran and Gerardo Dottori, oil on canvas" prompt = "the temple of fire by Ross Tran and Gerardo Dottori, oil on canvas"
......
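The `pipe.to("cuda")` → `pipe.to(torch_device)` changes above are what let the same test run on CUDA, XPU, or MPS. As a rough sketch of how such a `torch_device` string could be resolved when none is forced by the test harness (the real `testing_utils` logic also supports an environment override, which is an assumption here):

```python
import torch

def pick_test_device() -> str:
    # Prefer a real accelerator when one is available; fall back to CPU.
    if torch.cuda.is_available():
        return "cuda"
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return "xpu"
    if torch.backends.mps.is_available():
        return "mps"
    return "cpu"
```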
...@@ -25,12 +25,16 @@ from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer ...@@ -25,12 +25,16 @@ from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, DDIMScheduler, DDPMScheduler, StableDiffusionUpscalePipeline, UNet2DConditionModel from diffusers import AutoencoderKL, DDIMScheduler, DDPMScheduler, StableDiffusionUpscalePipeline, UNet2DConditionModel
from diffusers.utils.testing_utils import ( from diffusers.utils.testing_utils import (
backend_empty_cache,
backend_max_memory_allocated,
backend_reset_max_memory_allocated,
backend_reset_peak_memory_stats,
enable_full_determinism, enable_full_determinism,
floats_tensor, floats_tensor,
load_image, load_image,
load_numpy, load_numpy,
require_accelerator, require_accelerator,
require_torch_gpu, require_torch_accelerator,
slow, slow,
torch_device, torch_device,
) )
...@@ -44,13 +48,13 @@ class StableDiffusionUpscalePipelineFastTests(unittest.TestCase): ...@@ -44,13 +48,13 @@ class StableDiffusionUpscalePipelineFastTests(unittest.TestCase):
# clean up the VRAM before each test # clean up the VRAM before each test
super().setUp() super().setUp()
gc.collect() gc.collect()
torch.cuda.empty_cache() backend_empty_cache(torch_device)
def tearDown(self): def tearDown(self):
# clean up the VRAM after each test # clean up the VRAM after each test
super().tearDown() super().tearDown()
gc.collect() gc.collect()
torch.cuda.empty_cache() backend_empty_cache(torch_device)
@property @property
def dummy_image(self): def dummy_image(self):
...@@ -381,19 +385,19 @@ class StableDiffusionUpscalePipelineFastTests(unittest.TestCase): ...@@ -381,19 +385,19 @@ class StableDiffusionUpscalePipelineFastTests(unittest.TestCase):
@slow @slow
@require_torch_gpu @require_torch_accelerator
class StableDiffusionUpscalePipelineIntegrationTests(unittest.TestCase): class StableDiffusionUpscalePipelineIntegrationTests(unittest.TestCase):
def setUp(self): def setUp(self):
# clean up the VRAM before each test # clean up the VRAM before each test
super().setUp() super().setUp()
gc.collect() gc.collect()
torch.cuda.empty_cache() backend_empty_cache(torch_device)
def tearDown(self): def tearDown(self):
# clean up the VRAM after each test # clean up the VRAM after each test
super().tearDown() super().tearDown()
gc.collect() gc.collect()
torch.cuda.empty_cache() backend_empty_cache(torch_device)
def test_stable_diffusion_upscale_pipeline(self): def test_stable_diffusion_upscale_pipeline(self):
image = load_image( image = load_image(
...@@ -459,9 +463,9 @@ class StableDiffusionUpscalePipelineIntegrationTests(unittest.TestCase): ...@@ -459,9 +463,9 @@ class StableDiffusionUpscalePipelineIntegrationTests(unittest.TestCase):
assert np.abs(expected_image - image).max() < 5e-1 assert np.abs(expected_image - image).max() < 5e-1
def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
torch.cuda.empty_cache() backend_empty_cache(torch_device)
torch.cuda.reset_max_memory_allocated() backend_reset_max_memory_allocated(torch_device)
torch.cuda.reset_peak_memory_stats() backend_reset_peak_memory_stats(torch_device)
image = load_image( image = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
...@@ -475,7 +479,7 @@ class StableDiffusionUpscalePipelineIntegrationTests(unittest.TestCase): ...@@ -475,7 +479,7 @@ class StableDiffusionUpscalePipelineIntegrationTests(unittest.TestCase):
) )
pipe.set_progress_bar_config(disable=None) pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing(1) pipe.enable_attention_slicing(1)
pipe.enable_sequential_cpu_offload() pipe.enable_sequential_cpu_offload(device=torch_device)
prompt = "a cat sitting on a park bench" prompt = "a cat sitting on a park bench"
...@@ -488,6 +492,6 @@ class StableDiffusionUpscalePipelineIntegrationTests(unittest.TestCase): ...@@ -488,6 +492,6 @@ class StableDiffusionUpscalePipelineIntegrationTests(unittest.TestCase):
output_type="np", output_type="np",
) )
mem_bytes = torch.cuda.max_memory_allocated() mem_bytes = backend_max_memory_allocated(torch_device)
# make sure that less than 2.9 GB is allocated # make sure that less than 2.9 GB is allocated
assert mem_bytes < 2.9 * 10**9 assert mem_bytes < 2.9 * 10**9
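The upscale test's 2.9 GB budget now reads peak memory through `backend_max_memory_allocated` rather than `torch.cuda.max_memory_allocated`. A hedged sketch of the dispatch, assuming XPU mirrors CUDA's memory-stats API (true on recent torch builds) and treating backends without peak tracking as reporting zero:

```python
import torch

def backend_max_memory_allocated_sketch(device: str) -> int:
    device_type = torch.device(device).type
    if device_type == "cuda":
        return torch.cuda.max_memory_allocated(device)
    if device_type == "xpu":
        return torch.xpu.max_memory_allocated(device)
    # MPS/CPU expose no comparable peak counter; report 0 so upper-bound
    # assertions pass gracefully instead of raising.
    return 0
```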
...@@ -31,11 +31,15 @@ from diffusers import ( ...@@ -31,11 +31,15 @@ from diffusers import (
UNet2DConditionModel, UNet2DConditionModel,
) )
from diffusers.utils.testing_utils import ( from diffusers.utils.testing_utils import (
backend_empty_cache,
backend_max_memory_allocated,
backend_reset_max_memory_allocated,
backend_reset_peak_memory_stats,
enable_full_determinism, enable_full_determinism,
load_numpy, load_numpy,
numpy_cosine_similarity_distance, numpy_cosine_similarity_distance,
require_accelerator, require_accelerator,
require_torch_gpu, require_torch_accelerator,
slow, slow,
torch_device, torch_device,
) )
...@@ -49,13 +53,13 @@ class StableDiffusion2VPredictionPipelineFastTests(unittest.TestCase): ...@@ -49,13 +53,13 @@ class StableDiffusion2VPredictionPipelineFastTests(unittest.TestCase):
# clean up the VRAM before each test # clean up the VRAM before each test
super().setUp() super().setUp()
gc.collect() gc.collect()
torch.cuda.empty_cache() backend_empty_cache(torch_device)
def tearDown(self): def tearDown(self):
# clean up the VRAM after each test # clean up the VRAM after each test
super().tearDown() super().tearDown()
gc.collect() gc.collect()
torch.cuda.empty_cache() backend_empty_cache(torch_device)
@property @property
def dummy_cond_unet(self): def dummy_cond_unet(self):
...@@ -258,19 +262,19 @@ class StableDiffusion2VPredictionPipelineFastTests(unittest.TestCase): ...@@ -258,19 +262,19 @@ class StableDiffusion2VPredictionPipelineFastTests(unittest.TestCase):
@slow @slow
@require_torch_gpu @require_torch_accelerator
class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase): class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase):
def setUp(self): def setUp(self):
# clean up the VRAM before each test # clean up the VRAM before each test
super().setUp() super().setUp()
gc.collect() gc.collect()
torch.cuda.empty_cache() backend_empty_cache(torch_device)
def tearDown(self): def tearDown(self):
# clean up the VRAM after each test # clean up the VRAM after each test
super().tearDown() super().tearDown()
gc.collect() gc.collect()
torch.cuda.empty_cache() backend_empty_cache(torch_device)
def test_stable_diffusion_v_pred_default(self): def test_stable_diffusion_v_pred_default(self):
sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2") sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2")
...@@ -357,7 +361,7 @@ class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase): ...@@ -357,7 +361,7 @@ class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase):
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
def test_stable_diffusion_attention_slicing_v_pred(self): def test_stable_diffusion_attention_slicing_v_pred(self):
torch.cuda.reset_peak_memory_stats() backend_reset_peak_memory_stats(torch_device)
model_id = "stabilityai/stable-diffusion-2" model_id = "stabilityai/stable-diffusion-2"
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16) pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
pipe.to(torch_device) pipe.to(torch_device)
...@@ -373,8 +377,8 @@ class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase): ...@@ -373,8 +377,8 @@ class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase):
) )
image_chunked = output_chunked.images image_chunked = output_chunked.images
mem_bytes = torch.cuda.max_memory_allocated() mem_bytes = backend_max_memory_allocated(torch_device)
torch.cuda.reset_peak_memory_stats() backend_reset_peak_memory_stats(torch_device)
# make sure that less than 5.5 GB is allocated # make sure that less than 5.5 GB is allocated
assert mem_bytes < 5.5 * 10**9 assert mem_bytes < 5.5 * 10**9
...@@ -385,7 +389,7 @@ class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase): ...@@ -385,7 +389,7 @@ class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase):
image = output.images image = output.images
# make sure that more than 3.0 GB is allocated # make sure that more than 3.0 GB is allocated
mem_bytes = torch.cuda.max_memory_allocated() mem_bytes = backend_max_memory_allocated(torch_device)
assert mem_bytes > 3 * 10**9 assert mem_bytes > 3 * 10**9
max_diff = numpy_cosine_similarity_distance(image.flatten(), image_chunked.flatten()) max_diff = numpy_cosine_similarity_distance(image.flatten(), image_chunked.flatten())
assert max_diff < 1e-3 assert max_diff < 1e-3
...@@ -421,7 +425,7 @@ class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase): ...@@ -421,7 +425,7 @@ class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase):
pipe.scheduler = DDIMScheduler.from_config( pipe.scheduler = DDIMScheduler.from_config(
pipe.scheduler.config, timestep_spacing="trailing", rescale_betas_zero_snr=True pipe.scheduler.config, timestep_spacing="trailing", rescale_betas_zero_snr=True
) )
pipe.enable_model_cpu_offload() pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None) pipe.set_progress_bar_config(disable=None)
prompt = "A lion in galaxies, spirals, nebulae, stars, smoke, iridescent, intricate detail, octane render, 8k" prompt = "A lion in galaxies, spirals, nebulae, stars, smoke, iridescent, intricate detail, octane render, 8k"
...@@ -466,7 +470,7 @@ class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase): ...@@ -466,7 +470,7 @@ class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase):
pipe = StableDiffusionPipeline.from_single_file(filename, torch_dtype=torch.float16) pipe = StableDiffusionPipeline.from_single_file(filename, torch_dtype=torch.float16)
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload() pipe.enable_model_cpu_offload(device=torch_device)
image_out = pipe("test", num_inference_steps=1, output_type="np").images[0] image_out = pipe("test", num_inference_steps=1, output_type="np").images[0]
...@@ -530,20 +534,20 @@ class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase): ...@@ -530,20 +534,20 @@ class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase):
assert 2 * low_cpu_mem_usage_time < normal_load_time assert 2 * low_cpu_mem_usage_time < normal_load_time
def test_stable_diffusion_pipeline_with_sequential_cpu_offloading_v_pred(self): def test_stable_diffusion_pipeline_with_sequential_cpu_offloading_v_pred(self):
torch.cuda.empty_cache() backend_empty_cache(torch_device)
torch.cuda.reset_max_memory_allocated() backend_reset_max_memory_allocated(torch_device)
torch.cuda.reset_peak_memory_stats() backend_reset_peak_memory_stats(torch_device)
pipeline_id = "stabilityai/stable-diffusion-2" pipeline_id = "stabilityai/stable-diffusion-2"
prompt = "Andromeda galaxy in a bottle" prompt = "Andromeda galaxy in a bottle"
pipeline = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16) pipeline = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16)
pipeline.enable_attention_slicing(1) pipeline.enable_attention_slicing(1)
pipeline.enable_sequential_cpu_offload() pipeline.enable_sequential_cpu_offload(device=torch_device)
generator = torch.manual_seed(0) generator = torch.manual_seed(0)
_ = pipeline(prompt, generator=generator, num_inference_steps=5) _ = pipeline(prompt, generator=generator, num_inference_steps=5)
mem_bytes = torch.cuda.max_memory_allocated() mem_bytes = backend_max_memory_allocated(torch_device)
# make sure that less than 2.8 GB is allocated # make sure that less than 2.8 GB is allocated
assert mem_bytes < 2.8 * 10**9 assert mem_bytes < 2.8 * 10**9
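The v-prediction offloading test above distills to a reusable pattern: clear and reset the backend's memory counters, enable sequential offload onto the target accelerator, run once, then assert on the backend-agnostic peak. A condensed sketch using the helpers from this diff (the model id and byte budget are taken from the test itself):

```python
import torch
from diffusers import StableDiffusionPipeline
from diffusers.utils.testing_utils import (
    backend_empty_cache,
    backend_max_memory_allocated,
    backend_reset_max_memory_allocated,
    backend_reset_peak_memory_stats,
    torch_device,
)

backend_empty_cache(torch_device)
backend_reset_max_memory_allocated(torch_device)
backend_reset_peak_memory_stats(torch_device)

pipeline = StableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2", torch_dtype=torch.float16
)
pipeline.enable_attention_slicing(1)
pipeline.enable_sequential_cpu_offload(device=torch_device)

_ = pipeline("Andromeda galaxy in a bottle", num_inference_steps=5)

# Same budget regardless of which accelerator backend ran the test.
assert backend_max_memory_allocated(torch_device) < 2.8 * 10**9
```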
...@@ -8,6 +8,7 @@ from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModelWithProject ...@@ -8,6 +8,7 @@ from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModelWithProject
from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, SD3Transformer2DModel, StableDiffusion3Pipeline from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, SD3Transformer2DModel, StableDiffusion3Pipeline
from diffusers.utils.testing_utils import ( from diffusers.utils.testing_utils import (
backend_empty_cache,
numpy_cosine_similarity_distance, numpy_cosine_similarity_distance,
require_big_gpu_with_torch_cuda, require_big_gpu_with_torch_cuda,
slow, slow,
...@@ -240,12 +241,12 @@ class StableDiffusion3PipelineSlowTests(unittest.TestCase): ...@@ -240,12 +241,12 @@ class StableDiffusion3PipelineSlowTests(unittest.TestCase):
def setUp(self): def setUp(self):
super().setUp() super().setUp()
gc.collect() gc.collect()
torch.cuda.empty_cache() backend_empty_cache(torch_device)
def tearDown(self): def tearDown(self):
super().tearDown() super().tearDown()
gc.collect() gc.collect()
torch.cuda.empty_cache() backend_empty_cache(torch_device)
def get_inputs(self, device, seed=0): def get_inputs(self, device, seed=0):
if str(device).startswith("mps"): if str(device).startswith("mps"):
...@@ -263,7 +264,7 @@ class StableDiffusion3PipelineSlowTests(unittest.TestCase): ...@@ -263,7 +264,7 @@ class StableDiffusion3PipelineSlowTests(unittest.TestCase):
def test_sd3_inference(self): def test_sd3_inference(self):
pipe = self.pipeline_class.from_pretrained(self.repo_id, torch_dtype=torch.float16) pipe = self.pipeline_class.from_pretrained(self.repo_id, torch_dtype=torch.float16)
pipe.enable_model_cpu_offload() pipe.enable_model_cpu_offload(device=torch_device)
inputs = self.get_inputs(torch_device) inputs = self.get_inputs(torch_device)
......
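The SD3 `get_inputs` helper special-cases MPS when building its generator (the `str(device).startswith("mps")` branch above). The convention, sketched below, is to seed on CPU for MPS and directly on the device elsewhere, which keeps outputs reproducible across backends:

```python
import torch

def make_generator(device: str, seed: int = 0) -> torch.Generator:
    # MPS generator behavior differs from CUDA/XPU, so tests seed a CPU
    # generator there; other backends seed a generator on the device itself.
    if str(device).startswith("mps"):
        return torch.manual_seed(seed)
    return torch.Generator(device=device).manual_seed(seed)
```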
...@@ -15,6 +15,7 @@ from diffusers import ( ...@@ -15,6 +15,7 @@ from diffusers import (
) )
from diffusers.utils import load_image from diffusers.utils import load_image
from diffusers.utils.testing_utils import ( from diffusers.utils.testing_utils import (
backend_empty_cache,
floats_tensor, floats_tensor,
numpy_cosine_similarity_distance, numpy_cosine_similarity_distance,
require_big_gpu_with_torch_cuda, require_big_gpu_with_torch_cuda,
...@@ -174,12 +175,12 @@ class StableDiffusion3Img2ImgPipelineSlowTests(unittest.TestCase): ...@@ -174,12 +175,12 @@ class StableDiffusion3Img2ImgPipelineSlowTests(unittest.TestCase):
def setUp(self): def setUp(self):
super().setUp() super().setUp()
gc.collect() gc.collect()
torch.cuda.empty_cache() backend_empty_cache(torch_device)
def tearDown(self): def tearDown(self):
super().tearDown() super().tearDown()
gc.collect() gc.collect()
torch.cuda.empty_cache() backend_empty_cache(torch_device)
def get_inputs(self, device, seed=0): def get_inputs(self, device, seed=0):
init_image = load_image( init_image = load_image(
...@@ -202,7 +203,7 @@ class StableDiffusion3Img2ImgPipelineSlowTests(unittest.TestCase): ...@@ -202,7 +203,7 @@ class StableDiffusion3Img2ImgPipelineSlowTests(unittest.TestCase):
def test_sd3_img2img_inference(self): def test_sd3_img2img_inference(self):
pipe = self.pipeline_class.from_pretrained(self.repo_id, torch_dtype=torch.float16) pipe = self.pipeline_class.from_pretrained(self.repo_id, torch_dtype=torch.float16)
pipe.enable_model_cpu_offload() pipe.enable_model_cpu_offload(device=torch_device)
inputs = self.get_inputs(torch_device) inputs = self.get_inputs(torch_device)
......
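Several of these integration tests compare outputs with `numpy_cosine_similarity_distance` instead of an elementwise max-difference, which tolerates the small numeric drift between accelerator backends. A sketch of the semantics, assuming it amounts to one minus the cosine similarity of the flattened arrays (the exact reduction in `testing_utils` may differ):

```python
import numpy as np

def cosine_similarity_distance(a: np.ndarray, b: np.ndarray) -> float:
    a = a.ravel().astype(np.float64)
    b = b.ravel().astype(np.float64)
    # 0.0 means the flattened images point in the same direction;
    # larger values mean increasingly dissimilar outputs.
    cos = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    return float(1.0 - cos)
```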
...@@ -35,12 +35,13 @@ from diffusers import ( ...@@ -35,12 +35,13 @@ from diffusers import (
from diffusers.utils import logging from diffusers.utils import logging
from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.testing_utils import ( from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism, enable_full_determinism,
floats_tensor, floats_tensor,
load_image, load_image,
load_numpy, load_numpy,
numpy_cosine_similarity_distance, numpy_cosine_similarity_distance,
require_torch_gpu, require_torch_accelerator,
slow, slow,
torch_device, torch_device,
) )
...@@ -604,17 +605,17 @@ class StableDiffusionMultiAdapterPipelineFastTests(AdapterTests, PipelineTesterM ...@@ -604,17 +605,17 @@ class StableDiffusionMultiAdapterPipelineFastTests(AdapterTests, PipelineTesterM
@slow @slow
@require_torch_gpu @require_torch_accelerator
class StableDiffusionAdapterPipelineSlowTests(unittest.TestCase): class StableDiffusionAdapterPipelineSlowTests(unittest.TestCase):
def setUp(self): def setUp(self):
super().setUp() super().setUp()
gc.collect() gc.collect()
torch.cuda.empty_cache() backend_empty_cache(torch_device)
def tearDown(self): def tearDown(self):
super().tearDown() super().tearDown()
gc.collect() gc.collect()
torch.cuda.empty_cache() backend_empty_cache(torch_device)
def test_stable_diffusion_adapter_depth_sd_v15(self): def test_stable_diffusion_adapter_depth_sd_v15(self):
adapter_model = "TencentARC/t2iadapter_depth_sd15v2" adapter_model = "TencentARC/t2iadapter_depth_sd15v2"
......
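Replacing `require_torch_gpu` with `require_torch_accelerator` across these files widens the skip condition from "CUDA present" to "any supported accelerator present". A minimal sketch of such a marker, assuming the same availability checks as the device-resolution helper (the real decorator carries additional dependency checks):

```python
import unittest

import torch

def require_torch_accelerator_sketch(test_case):
    # Run the test only when some non-CPU torch backend is usable.
    available = (
        torch.cuda.is_available()
        or (hasattr(torch, "xpu") and torch.xpu.is_available())
        or torch.backends.mps.is_available()
    )
    return unittest.skipUnless(available, "test requires a torch accelerator")(test_case)
```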
...@@ -30,13 +30,17 @@ from diffusers import ( ...@@ -30,13 +30,17 @@ from diffusers import (
UNet2DConditionModel, UNet2DConditionModel,
) )
from diffusers.utils.testing_utils import ( from diffusers.utils.testing_utils import (
backend_empty_cache,
backend_max_memory_allocated,
backend_reset_max_memory_allocated,
backend_reset_peak_memory_stats,
enable_full_determinism, enable_full_determinism,
floats_tensor, floats_tensor,
load_image, load_image,
load_numpy, load_numpy,
nightly, nightly,
numpy_cosine_similarity_distance, numpy_cosine_similarity_distance,
require_torch_gpu, require_torch_accelerator,
slow, slow,
torch_device, torch_device,
) )
...@@ -164,17 +168,17 @@ class StableDiffusionImageVariationPipelineFastTests( ...@@ -164,17 +168,17 @@ class StableDiffusionImageVariationPipelineFastTests(
@slow @slow
@require_torch_gpu @require_torch_accelerator
class StableDiffusionImageVariationPipelineSlowTests(unittest.TestCase): class StableDiffusionImageVariationPipelineSlowTests(unittest.TestCase):
def setUp(self): def setUp(self):
super().setUp() super().setUp()
gc.collect() gc.collect()
torch.cuda.empty_cache() backend_empty_cache(torch_device)
def tearDown(self): def tearDown(self):
super().tearDown() super().tearDown()
gc.collect() gc.collect()
torch.cuda.empty_cache() backend_empty_cache(torch_device)
def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
generator = torch.Generator(device=generator_device).manual_seed(seed) generator = torch.Generator(device=generator_device).manual_seed(seed)
...@@ -258,37 +262,37 @@ class StableDiffusionImageVariationPipelineSlowTests(unittest.TestCase): ...@@ -258,37 +262,37 @@ class StableDiffusionImageVariationPipelineSlowTests(unittest.TestCase):
assert number_of_steps == inputs["num_inference_steps"] assert number_of_steps == inputs["num_inference_steps"]
def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
torch.cuda.empty_cache() backend_empty_cache(torch_device)
torch.cuda.reset_max_memory_allocated() backend_reset_max_memory_allocated(torch_device)
torch.cuda.reset_peak_memory_stats() backend_reset_peak_memory_stats(torch_device)
pipe = StableDiffusionImageVariationPipeline.from_pretrained( pipe = StableDiffusionImageVariationPipeline.from_pretrained(
"lambdalabs/sd-image-variations-diffusers", safety_checker=None, torch_dtype=torch.float16 "lambdalabs/sd-image-variations-diffusers", safety_checker=None, torch_dtype=torch.float16
) )
pipe.set_progress_bar_config(disable=None) pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing(1) pipe.enable_attention_slicing(1)
pipe.enable_sequential_cpu_offload() pipe.enable_sequential_cpu_offload(device=torch_device)
inputs = self.get_inputs(torch_device, dtype=torch.float16) inputs = self.get_inputs(torch_device, dtype=torch.float16)
_ = pipe(**inputs) _ = pipe(**inputs)
mem_bytes = torch.cuda.max_memory_allocated() mem_bytes = backend_max_memory_allocated(torch_device)
# make sure that less than 2.6 GB is allocated # make sure that less than 2.6 GB is allocated
assert mem_bytes < 2.6 * 10**9 assert mem_bytes < 2.6 * 10**9
@nightly @nightly
@require_torch_gpu @require_torch_accelerator
class StableDiffusionImageVariationPipelineNightlyTests(unittest.TestCase): class StableDiffusionImageVariationPipelineNightlyTests(unittest.TestCase):
def setUp(self): def setUp(self):
super().setUp() super().setUp()
gc.collect() gc.collect()
torch.cuda.empty_cache() backend_empty_cache(torch_device)
def tearDown(self): def tearDown(self):
super().tearDown() super().tearDown()
gc.collect() gc.collect()
torch.cuda.empty_cache() backend_empty_cache(torch_device)
def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
generator = torch.Generator(device=generator_device).manual_seed(seed) generator = torch.Generator(device=generator_device).manual_seed(seed)
......
...@@ -38,7 +38,7 @@ from diffusers.utils.testing_utils import ( ...@@ -38,7 +38,7 @@ from diffusers.utils.testing_utils import (
enable_full_determinism, enable_full_determinism,
load_image, load_image,
numpy_cosine_similarity_distance, numpy_cosine_similarity_distance,
require_torch_gpu, require_torch_accelerator,
slow, slow,
torch_device, torch_device,
) )
...@@ -265,7 +265,7 @@ class StableDiffusionXLPipelineFastTests( ...@@ -265,7 +265,7 @@ class StableDiffusionXLPipelineFastTests(
def test_inference_batch_single_identical(self): def test_inference_batch_single_identical(self):
super().test_inference_batch_single_identical(expected_max_diff=3e-3) super().test_inference_batch_single_identical(expected_max_diff=3e-3)
@require_torch_gpu @require_torch_accelerator
def test_stable_diffusion_xl_offloads(self): def test_stable_diffusion_xl_offloads(self):
pipes = [] pipes = []
components = self.get_dummy_components() components = self.get_dummy_components()
...@@ -274,12 +274,12 @@ class StableDiffusionXLPipelineFastTests( ...@@ -274,12 +274,12 @@ class StableDiffusionXLPipelineFastTests(
components = self.get_dummy_components() components = self.get_dummy_components()
sd_pipe = StableDiffusionXLPipeline(**components) sd_pipe = StableDiffusionXLPipeline(**components)
sd_pipe.enable_model_cpu_offload() sd_pipe.enable_model_cpu_offload(device=torch_device)
pipes.append(sd_pipe) pipes.append(sd_pipe)
components = self.get_dummy_components() components = self.get_dummy_components()
sd_pipe = StableDiffusionXLPipeline(**components) sd_pipe = StableDiffusionXLPipeline(**components)
sd_pipe.enable_sequential_cpu_offload() sd_pipe.enable_sequential_cpu_offload(device=torch_device)
pipes.append(sd_pipe) pipes.append(sd_pipe)
image_slices = [] image_slices = []
......
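The `device=torch_device` argument threaded through `enable_model_cpu_offload` and `enable_sequential_cpu_offload` in these hunks is the crux of the change: both helpers otherwise default to CUDA as the execution device. A minimal usage sketch (the checkpoint name is illustrative):

```python
import torch
from diffusers import StableDiffusionXLPipeline
from diffusers.utils.testing_utils import torch_device

pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
)
# Offload weights to CPU but execute on whichever accelerator the test
# harness resolved, instead of the helper's hard-coded CUDA default.
pipe.enable_model_cpu_offload(device=torch_device)
```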
...@@ -42,7 +42,7 @@ from diffusers.utils.testing_utils import ( ...@@ -42,7 +42,7 @@ from diffusers.utils.testing_utils import (
enable_full_determinism, enable_full_determinism,
floats_tensor, floats_tensor,
load_image, load_image,
require_torch_gpu, require_torch_accelerator,
slow, slow,
torch_device, torch_device,
) )
...@@ -293,7 +293,7 @@ class StableDiffusionXLImg2ImgPipelineFastTests( ...@@ -293,7 +293,7 @@ class StableDiffusionXLImg2ImgPipelineFastTests(
assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4) assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4)
@require_torch_gpu @require_torch_accelerator
def test_stable_diffusion_xl_offloads(self): def test_stable_diffusion_xl_offloads(self):
pipes = [] pipes = []
components = self.get_dummy_components() components = self.get_dummy_components()
...@@ -302,12 +302,12 @@ class StableDiffusionXLImg2ImgPipelineFastTests( ...@@ -302,12 +302,12 @@ class StableDiffusionXLImg2ImgPipelineFastTests(
components = self.get_dummy_components() components = self.get_dummy_components()
sd_pipe = StableDiffusionXLImg2ImgPipeline(**components) sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
sd_pipe.enable_model_cpu_offload() sd_pipe.enable_model_cpu_offload(device=torch_device)
pipes.append(sd_pipe) pipes.append(sd_pipe)
components = self.get_dummy_components() components = self.get_dummy_components()
sd_pipe = StableDiffusionXLImg2ImgPipeline(**components) sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
sd_pipe.enable_sequential_cpu_offload() sd_pipe.enable_sequential_cpu_offload(device=torch_device)
pipes.append(sd_pipe) pipes.append(sd_pipe)
image_slices = [] image_slices = []
...@@ -596,7 +596,7 @@ class StableDiffusionXLImg2ImgRefinerOnlyPipelineFastTests( ...@@ -596,7 +596,7 @@ class StableDiffusionXLImg2ImgRefinerOnlyPipelineFastTests(
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
@require_torch_gpu @require_torch_accelerator
def test_stable_diffusion_xl_offloads(self): def test_stable_diffusion_xl_offloads(self):
pipes = [] pipes = []
components = self.get_dummy_components() components = self.get_dummy_components()
...@@ -605,12 +605,12 @@ class StableDiffusionXLImg2ImgRefinerOnlyPipelineFastTests( ...@@ -605,12 +605,12 @@ class StableDiffusionXLImg2ImgRefinerOnlyPipelineFastTests(
components = self.get_dummy_components() components = self.get_dummy_components()
sd_pipe = StableDiffusionXLImg2ImgPipeline(**components) sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
sd_pipe.enable_model_cpu_offload() sd_pipe.enable_model_cpu_offload(device=torch_device)
pipes.append(sd_pipe) pipes.append(sd_pipe)
components = self.get_dummy_components() components = self.get_dummy_components()
sd_pipe = StableDiffusionXLImg2ImgPipeline(**components) sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
sd_pipe.enable_sequential_cpu_offload() sd_pipe.enable_sequential_cpu_offload(device=torch_device)
pipes.append(sd_pipe) pipes.append(sd_pipe)
image_slices = [] image_slices = []
......
...@@ -41,7 +41,13 @@ from diffusers import ( ...@@ -41,7 +41,13 @@ from diffusers import (
UNet2DConditionModel, UNet2DConditionModel,
UniPCMultistepScheduler, UniPCMultistepScheduler,
) )
from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor, require_torch_gpu, slow, torch_device from diffusers.utils.testing_utils import (
enable_full_determinism,
floats_tensor,
require_torch_accelerator,
slow,
torch_device,
)
from ..pipeline_params import ( from ..pipeline_params import (
TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS,
...@@ -305,7 +311,48 @@ class StableDiffusionXLInpaintPipelineFastTests( ...@@ -305,7 +311,48 @@ class StableDiffusionXLInpaintPipelineFastTests(
def test_save_load_optional_components(self): def test_save_load_optional_components(self):
pass pass
@require_torch_gpu @require_torch_accelerator
def test_stable_diffusion_xl_inpaint_negative_prompt_embeds(self):
components = self.get_dummy_components()
sd_pipe = StableDiffusionXLInpaintPipeline(**components)
sd_pipe = sd_pipe.to(torch_device)
sd_pipe.set_progress_bar_config(disable=None)
# forward without prompt embeds
inputs = self.get_dummy_inputs(torch_device)
negative_prompt = 3 * ["this is a negative prompt"]
inputs["negative_prompt"] = negative_prompt
inputs["prompt"] = 3 * [inputs["prompt"]]
output = sd_pipe(**inputs)
image_slice_1 = output.images[0, -3:, -3:, -1]
# forward with prompt embeds
inputs = self.get_dummy_inputs(torch_device)
negative_prompt = 3 * ["this is a negative prompt"]
prompt = 3 * [inputs.pop("prompt")]
(
prompt_embeds,
negative_prompt_embeds,
pooled_prompt_embeds,
negative_pooled_prompt_embeds,
) = sd_pipe.encode_prompt(prompt, negative_prompt=negative_prompt)
output = sd_pipe(
**inputs,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
pooled_prompt_embeds=pooled_prompt_embeds,
negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
)
image_slice_2 = output.images[0, -3:, -3:, -1]
# make sure that it's equal
assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
@require_torch_accelerator
def test_stable_diffusion_xl_offloads(self): def test_stable_diffusion_xl_offloads(self):
pipes = [] pipes = []
components = self.get_dummy_components() components = self.get_dummy_components()
...@@ -314,12 +361,12 @@ class StableDiffusionXLInpaintPipelineFastTests( ...@@ -314,12 +361,12 @@ class StableDiffusionXLInpaintPipelineFastTests(
components = self.get_dummy_components() components = self.get_dummy_components()
sd_pipe = StableDiffusionXLInpaintPipeline(**components) sd_pipe = StableDiffusionXLInpaintPipeline(**components)
sd_pipe.enable_model_cpu_offload() sd_pipe.enable_model_cpu_offload(device=torch_device)
pipes.append(sd_pipe) pipes.append(sd_pipe)
components = self.get_dummy_components() components = self.get_dummy_components()
sd_pipe = StableDiffusionXLInpaintPipeline(**components) sd_pipe = StableDiffusionXLInpaintPipeline(**components)
sd_pipe.enable_sequential_cpu_offload() sd_pipe.enable_sequential_cpu_offload(device=torch_device)
pipes.append(sd_pipe) pipes.append(sd_pipe)
image_slices = [] image_slices = []
......