Unverified Commit 15ad97f7 authored by Fanli Lin, committed by GitHub

[tests] make cuda only tests device-agnostic (#11058)

* enable bnb on xpu

* add 2 more cases

* add missing change

* add missing change

* add one more

* enable cuda only tests on xpu

* enable big gpu cases
parent 9f2d5c9e
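
The hunks below all follow one pattern: CUDA-specific calls and decorators are swapped for device-agnostic helpers from diffusers.utils.testing_utils, so the same tests can run on XPU as well. A minimal sketch of the resulting test shape, assuming only the helpers that appear in the hunks below (backend_empty_cache, require_big_accelerator, torch_device); the class name is a placeholder, not part of the commit:

    import gc
    import unittest

    from diffusers.utils.testing_utils import (
        backend_empty_cache,      # dispatches to the active backend's empty_cache
        require_big_accelerator,
        torch_device,             # resolved device string: "cuda", "xpu", "mps", or "cpu"
    )


    @require_big_accelerator
    class ExampleSlowTests(unittest.TestCase):
        def setUp(self):
            super().setUp()
            gc.collect()
            backend_empty_cache(torch_device)  # replaces torch.cuda.empty_cache()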
......@@ -449,9 +449,9 @@ class TextualInversionLoaderMixin:
# 7.5 Offload the model again
if is_model_cpu_offload:
self.enable_model_cpu_offload()
self.enable_model_cpu_offload(device=device)
elif is_sequential_cpu_offload:
self.enable_sequential_cpu_offload()
self.enable_sequential_cpu_offload(device=device)
# / Unsafe Code >
......
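Passing device= here routes the offload hooks to whichever accelerator the session resolved instead of the hard-coded CUDA default. Usage sketch, assuming an already-constructed pipeline pipe and the torch_device helper:

    from diffusers.utils.testing_utils import torch_device

    # Offload on the detected accelerator ("cuda" or "xpu"), not implicitly CUDA.
    pipe.enable_model_cpu_offload(device=torch_device)
    # Layer-by-layer variant, changed the same way elsewhere in this commit:
    pipe.enable_sequential_cpu_offload(device=torch_device)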
......@@ -320,6 +320,21 @@ def require_torch_multi_gpu(test_case):
return unittest.skipUnless(torch.cuda.device_count() > 1, "test requires multiple GPUs")(test_case)
def require_torch_multi_accelerator(test_case):
"""
Decorator marking a test that requires a multi-accelerator setup (in PyTorch). These tests are skipped on a machine
without multiple hardware accelerators.
"""
if not is_torch_available():
return unittest.skip("test requires PyTorch")(test_case)
import torch
return unittest.skipUnless(
torch.cuda.device_count() > 1 or torch.xpu.device_count() > 1, "test requires multiple hardware accelerators"
)(test_case)
def require_torch_accelerator_with_fp16(test_case):
"""Decorator marking a test that requires an accelerator with support for the FP16 data type."""
return unittest.skipUnless(_is_torch_fp16_available(torch_device), "test requires accelerator with fp16 support")(
......@@ -354,6 +369,31 @@ def require_big_gpu_with_torch_cuda(test_case):
)(test_case)
def require_big_accelerator(test_case):
"""
Decorator marking a test that requires a bigger hardware accelerator (24GB) for execution. Some example pipelines:
Flux, SD3, Cog, etc.
"""
if not is_torch_available():
return unittest.skip("test requires PyTorch")(test_case)
import torch
if not (torch.cuda.is_available() or torch.xpu.is_available()):
return unittest.skip("test requires a hardware accelerator (CUDA or XPU)")(test_case)
if torch.xpu.is_available():
device_properties = torch.xpu.get_device_properties(0)
else:
device_properties = torch.cuda.get_device_properties(0)
total_memory = device_properties.total_memory / (1024**3)
return unittest.skipUnless(
total_memory >= BIG_GPU_MEMORY,
f"test requires a hardware accelerator with at least {BIG_GPU_MEMORY} GB memory",
)(test_case)
def require_torch_accelerator_with_training(test_case):
"""Decorator marking a test that requires an accelerator with support for training."""
return unittest.skipUnless(
......
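The two new decorators gate tests on available hardware rather than on CUDA specifically. A hypothetical test class showing how they would be applied (the test names and bodies are placeholders, not part of the commit):

    class ExampleTests(unittest.TestCase):
        @require_torch_multi_accelerator
        def test_model_parallelism(self):
            # runs only when more than one CUDA or XPU device is visible
            ...

        @require_big_accelerator
        def test_flux_inference(self):
            # runs only when device 0 reports at least BIG_GPU_MEMORY GB of memory
            ...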
......@@ -124,7 +124,7 @@ class AsymmetricAutoencoderKLIntegrationTests(unittest.TestCase):
return model
def get_generator(self, seed=0):
generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda"
generator_device = "cpu" if not torch_device.startswith(torch_device) else torch_device
if torch_device != "mps":
return torch.Generator(device=generator_device).manual_seed(seed)
return torch.manual_seed(seed)
......
......@@ -165,7 +165,7 @@ class AutoencoderKLTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase):
model.eval()
# Keep generator on CPU for non-CUDA devices to compare outputs with CPU result tensors
generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda"
generator_device = "cpu" if not torch_device.startswith(torch_device) else torch_device
if torch_device != "mps":
generator = torch.Generator(device=generator_device).manual_seed(0)
else:
......@@ -263,7 +263,7 @@ class AutoencoderKLIntegrationTests(unittest.TestCase):
return model
def get_generator(self, seed=0):
generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda"
generator_device = "cpu" if not torch_device.startswith(torch_device) else torch_device
if torch_device != "mps":
return torch.Generator(device=generator_device).manual_seed(seed)
return torch.manual_seed(seed)
......
......@@ -183,7 +183,7 @@ class AutoencoderOobleckIntegrationTests(unittest.TestCase):
return model
def get_generator(self, seed=0):
generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda"
generator_device = "cpu" if not torch_device.startswith(torch_device) else torch_device
if torch_device != "mps":
return torch.Generator(device=generator_device).manual_seed(seed)
return torch.manual_seed(seed)
......
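Note that torch_device.startswith(torch_device) is always true, so the rewritten helper always seeds a generator on torch_device. Written out without the vestigial condition, the same behavior reads (a sketch, not part of the commit):

    def get_generator(self, seed=0):
        # MPS cannot host a torch.Generator, so fall back to the global RNG there;
        # every other backend (cpu, cuda, xpu) gets a device-local generator.
        if torch_device != "mps":
            return torch.Generator(device=torch_device).manual_seed(seed)
        return torch.manual_seed(seed)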
......@@ -63,7 +63,7 @@ from diffusers.utils.testing_utils import (
require_torch_accelerator,
require_torch_accelerator_with_training,
require_torch_gpu,
require_torch_multi_gpu,
require_torch_multi_accelerator,
run_test_in_subprocess,
torch_all_close,
torch_device,
......@@ -1227,7 +1227,7 @@ class ModelTesterMixin:
self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
@require_torch_multi_gpu
@require_torch_multi_accelerator
def test_model_parallelism(self):
config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
model = self.model_class(**config).eval()
......
......@@ -31,9 +31,10 @@ from diffusers import (
from diffusers.models import SD3ControlNetModel, SD3MultiControlNetModel
from diffusers.utils import load_image
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
numpy_cosine_similarity_distance,
require_big_gpu_with_torch_cuda,
require_big_accelerator,
slow,
torch_device,
)
......@@ -219,7 +220,7 @@ class StableDiffusion3ControlNetPipelineFastTests(unittest.TestCase, PipelineTes
@slow
@require_big_gpu_with_torch_cuda
@require_big_accelerator
@pytest.mark.big_gpu_with_torch_cuda
class StableDiffusion3ControlNetPipelineSlowTests(unittest.TestCase):
pipeline_class = StableDiffusion3ControlNetPipeline
......@@ -227,12 +228,12 @@ class StableDiffusion3ControlNetPipelineSlowTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_canny(self):
controlnet = SD3ControlNetModel.from_pretrained("InstantX/SD3-Controlnet-Canny", torch_dtype=torch.float16)
......@@ -272,7 +273,7 @@ class StableDiffusion3ControlNetPipelineSlowTests(unittest.TestCase):
pipe = StableDiffusion3ControlNetPipeline.from_pretrained(
"stabilityai/stable-diffusion-3-medium-diffusers", controlnet=controlnet, torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)
......@@ -304,7 +305,7 @@ class StableDiffusion3ControlNetPipelineSlowTests(unittest.TestCase):
pipe = StableDiffusion3ControlNetPipeline.from_pretrained(
"stabilityai/stable-diffusion-3-medium-diffusers", controlnet=controlnet, torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)
......@@ -338,7 +339,7 @@ class StableDiffusion3ControlNetPipelineSlowTests(unittest.TestCase):
pipe = StableDiffusion3ControlNetPipeline.from_pretrained(
"stabilityai/stable-diffusion-3-medium-diffusers", controlnet=controlnet, torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)
......
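backend_empty_cache replaces the direct torch.cuda.empty_cache() calls in setUp/tearDown. A simplified sketch of the dispatch such a helper performs; the real implementation in diffusers.utils.testing_utils covers more backends:

    def empty_cache_sketch(device: str) -> None:
        # Free cached allocator blocks on whichever backend is active.
        if device == "cuda":
            torch.cuda.empty_cache()
        elif device == "xpu":
            torch.xpu.empty_cache()
        # cpu: nothing to flush; other backends omitted from this sketch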
......@@ -12,7 +12,7 @@ from diffusers.utils.testing_utils import (
backend_empty_cache,
nightly,
numpy_cosine_similarity_distance,
require_big_gpu_with_torch_cuda,
require_big_accelerator,
slow,
torch_device,
)
......@@ -204,7 +204,7 @@ class FluxPipelineFastTests(
@nightly
@require_big_gpu_with_torch_cuda
@require_big_accelerator
@pytest.mark.big_gpu_with_torch_cuda
class FluxPipelineSlowTests(unittest.TestCase):
pipeline_class = FluxPipeline
......@@ -292,7 +292,7 @@ class FluxPipelineSlowTests(unittest.TestCase):
@slow
@require_big_gpu_with_torch_cuda
@require_big_accelerator
@pytest.mark.big_gpu_with_torch_cuda
class FluxIPAdapterPipelineSlowTests(unittest.TestCase):
pipeline_class = FluxPipeline
......@@ -304,12 +304,12 @@ class FluxIPAdapterPipelineSlowTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def get_inputs(self, device, seed=0):
if str(device).startswith("mps"):
......
......@@ -8,15 +8,16 @@ import torch
from diffusers import FluxPipeline, FluxPriorReduxPipeline
from diffusers.utils import load_image
from diffusers.utils.testing_utils import (
backend_empty_cache,
numpy_cosine_similarity_distance,
require_big_gpu_with_torch_cuda,
require_big_accelerator,
slow,
torch_device,
)
@slow
@require_big_gpu_with_torch_cuda
@require_big_accelerator
@pytest.mark.big_gpu_with_torch_cuda
class FluxReduxSlowTests(unittest.TestCase):
pipeline_class = FluxPriorReduxPipeline
......@@ -27,12 +28,12 @@ class FluxReduxSlowTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def get_inputs(self, device, seed=0):
init_image = load_image(
......@@ -59,7 +60,7 @@ class FluxReduxSlowTests(unittest.TestCase):
self.base_repo_id, torch_dtype=torch.bfloat16, text_encoder=None, text_encoder_2=None
)
pipe_redux.to(torch_device)
pipe_base.enable_model_cpu_offload()
pipe_base.enable_model_cpu_offload(device=torch_device)
inputs = self.get_inputs(torch_device)
base_pipeline_inputs = self.get_base_pipeline_inputs(torch_device)
......
......@@ -262,7 +262,7 @@ class StableDiffusion3PAGImg2ImgPipelineIntegrationTests(unittest.TestCase):
pipeline = AutoPipelineForImage2Image.from_pretrained(
self.repo_id, enable_pag=True, torch_dtype=torch.float16, pag_applied_layers=["blocks.(4|17)"]
)
pipeline.enable_model_cpu_offload()
pipeline.enable_model_cpu_offload(device=torch_device)
pipeline.set_progress_bar_config(disable=None)
inputs = self.get_inputs(torch_device, guidance_scale=0.0, pag_scale=1.8)
......
......@@ -57,7 +57,7 @@ from diffusers.utils.testing_utils import (
require_accelerate_version_greater,
require_torch_2,
require_torch_accelerator,
require_torch_multi_gpu,
require_torch_multi_accelerator,
run_test_in_subprocess,
skip_mps,
slow,
......@@ -1409,7 +1409,7 @@ class StableDiffusionPipelineNightlyTests(unittest.TestCase):
# (sayakpaul): This test suite was run in the DGX with two GPUs (1, 2).
@slow
@require_torch_multi_gpu
@require_torch_multi_accelerator
@require_accelerate_version_greater("0.27.0")
class StableDiffusionPipelineDeviceMapTests(unittest.TestCase):
def tearDown(self):
......@@ -1497,7 +1497,7 @@ class StableDiffusionPipelineDeviceMapTests(unittest.TestCase):
assert sd_pipe_with_device_map.hf_device_map is None
# Make sure `to()` can be used and the pipeline can be called.
pipe = sd_pipe_with_device_map.to("cuda")
pipe = sd_pipe_with_device_map.to(torch_device)
_ = pipe("hello", num_inference_steps=2)
def test_reset_device_map_enable_model_cpu_offload(self):
......@@ -1509,7 +1509,7 @@ class StableDiffusionPipelineDeviceMapTests(unittest.TestCase):
assert sd_pipe_with_device_map.hf_device_map is None
# Make sure `enable_model_cpu_offload()` can be used and the pipeline can be called.
sd_pipe_with_device_map.enable_model_cpu_offload()
sd_pipe_with_device_map.enable_model_cpu_offload(device=torch_device)
_ = sd_pipe_with_device_map("hello", num_inference_steps=2)
def test_reset_device_map_enable_sequential_cpu_offload(self):
......@@ -1521,5 +1521,5 @@ class StableDiffusionPipelineDeviceMapTests(unittest.TestCase):
assert sd_pipe_with_device_map.hf_device_map is None
# Make sure `enable_sequential_cpu_offload()` can be used and the pipeline can be called.
sd_pipe_with_device_map.enable_sequential_cpu_offload()
sd_pipe_with_device_map.enable_sequential_cpu_offload(device=torch_device)
_ = sd_pipe_with_device_map("hello", num_inference_steps=2)
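After the reset_device_map() step these tests exercise, hf_device_map is None and the pipeline behaves like a plain single-device pipeline again, so it can be moved or offloaded onto whichever accelerator is active. Condensed usage sketch of the three tests above (assuming the reset has already run):

    assert sd_pipe_with_device_map.hf_device_map is None
    pipe = sd_pipe_with_device_map.to(torch_device)   # plain placement works again
    _ = pipe("hello", num_inference_steps=2)
    # ...and so does offloading, in a fresh pipeline:
    # sd_pipe_with_device_map.enable_model_cpu_offload(device=torch_device)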
......@@ -10,7 +10,7 @@ from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, SD3Transfo
from diffusers.utils.testing_utils import (
backend_empty_cache,
numpy_cosine_similarity_distance,
require_big_gpu_with_torch_cuda,
require_big_accelerator,
slow,
torch_device,
)
......@@ -232,7 +232,7 @@ class StableDiffusion3PipelineFastTests(unittest.TestCase, PipelineTesterMixin):
@slow
@require_big_gpu_with_torch_cuda
@require_big_accelerator
@pytest.mark.big_gpu_with_torch_cuda
class StableDiffusion3PipelineSlowTests(unittest.TestCase):
pipeline_class = StableDiffusion3Pipeline
......
......@@ -18,7 +18,7 @@ from diffusers.utils.testing_utils import (
backend_empty_cache,
floats_tensor,
numpy_cosine_similarity_distance,
require_big_gpu_with_torch_cuda,
require_big_accelerator,
slow,
torch_device,
)
......@@ -166,7 +166,7 @@ class StableDiffusion3Img2ImgPipelineFastTests(PipelineLatentTesterMixin, unitte
@slow
@require_big_gpu_with_torch_cuda
@require_big_accelerator
@pytest.mark.big_gpu_with_torch_cuda
class StableDiffusion3Img2ImgPipelineSlowTests(unittest.TestCase):
pipeline_class = StableDiffusion3Img2ImgPipeline
......@@ -202,11 +202,10 @@ class StableDiffusion3Img2ImgPipelineSlowTests(unittest.TestCase):
}
def test_sd3_img2img_inference(self):
torch.manual_seed(0)
pipe = self.pipeline_class.from_pretrained(self.repo_id, torch_dtype=torch.float16)
pipe.enable_model_cpu_offload(device=torch_device)
inputs = self.get_inputs(torch_device)
image = pipe(**inputs).images[0]
image_slice = image[0, :10, :10]
expected_slice = np.array(
......
......@@ -45,6 +45,7 @@ from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.source_code_parsing_utils import ReturnNameVisitor
from diffusers.utils.testing_utils import (
CaptureLogger,
backend_empty_cache,
require_accelerate_version_greater,
require_accelerator,
require_hf_hub_version_greater,
......@@ -1108,13 +1109,13 @@ class PipelineTesterMixin:
# clean up the VRAM before each test
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test in case of CUDA runtime errors
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_save_load_local(self, expected_max_difference=5e-4):
components = self.get_dummy_components()
......@@ -1423,7 +1424,6 @@ class PipelineTesterMixin:
def test_save_load_optional_components(self, expected_max_difference=1e-4):
if not hasattr(self.pipeline_class, "_optional_components"):
return
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
for component in pipe.components.values():
......@@ -1438,6 +1438,7 @@ class PipelineTesterMixin:
generator_device = "cpu"
inputs = self.get_dummy_inputs(generator_device)
torch.manual_seed(0)
output = pipe(**inputs)[0]
with tempfile.TemporaryDirectory() as tmpdir:
......@@ -1456,6 +1457,7 @@ class PipelineTesterMixin:
)
inputs = self.get_dummy_inputs(generator_device)
torch.manual_seed(0)
output_loaded = pipe_loaded(**inputs)[0]
max_diff = np.abs(to_np(output) - to_np(output_loaded)).max()
......@@ -1550,12 +1552,14 @@ class PipelineTesterMixin:
generator_device = "cpu"
inputs = self.get_dummy_inputs(generator_device)
torch.manual_seed(0)
output_without_offload = pipe(**inputs)[0]
pipe.enable_sequential_cpu_offload(device=torch_device)
assert pipe._execution_device.type == torch_device
inputs = self.get_dummy_inputs(generator_device)
torch.manual_seed(0)
output_with_offload = pipe(**inputs)[0]
max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
......@@ -1613,12 +1617,14 @@ class PipelineTesterMixin:
pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(generator_device)
torch.manual_seed(0)
output_without_offload = pipe(**inputs)[0]
pipe.enable_model_cpu_offload(device=torch_device)
assert pipe._execution_device.type == torch_device
inputs = self.get_dummy_inputs(generator_device)
torch.manual_seed(0)
output_with_offload = pipe(**inputs)[0]
max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
......
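The inserted torch.manual_seed(0) calls pin the global RNG immediately before each pipeline invocation, so the outputs with and without offloading draw identical noise and stay comparable. The resulting pattern, condensed (threshold illustrative; the real tests use expected_max_difference):

    torch.manual_seed(0)
    output_without_offload = pipe(**inputs)[0]

    pipe.enable_model_cpu_offload(device=torch_device)
    torch.manual_seed(0)  # reseed so both runs draw identical noise
    output_with_offload = pipe(**inputs)[0]

    max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
    assert max_diff < 1e-4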
......@@ -303,6 +303,7 @@ class UnCLIPPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler()
)
shape = (batch_size, decoder.config.in_channels, decoder.config.sample_size, decoder.config.sample_size)
generator = torch.Generator(device=device).manual_seed(0)
decoder_latents = pipe.prepare_latents(
shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler()
)
......
......@@ -407,6 +407,7 @@ class UnCLIPImageVariationPipelineFastTests(PipelineTesterMixin, unittest.TestCa
pipe.super_res_first.config.sample_size,
pipe.super_res_first.config.sample_size,
)
generator = torch.Generator(device=device).manual_seed(0)
super_res_latents = pipe.prepare_latents(
shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler()
)
......
......@@ -64,7 +64,7 @@ class DPMSolverSDESchedulerTest(SchedulerCommonTest):
if torch_device in ["mps"]:
assert abs(result_sum.item() - 167.47821044921875) < 1e-2
assert abs(result_mean.item() - 0.2178705964565277) < 1e-3
elif torch_device in ["cuda"]:
elif torch_device in ["cuda", "xpu"]:
assert abs(result_sum.item() - 171.59352111816406) < 1e-2
assert abs(result_mean.item() - 0.22342906892299652) < 1e-3
else:
......@@ -96,7 +96,7 @@ class DPMSolverSDESchedulerTest(SchedulerCommonTest):
if torch_device in ["mps"]:
assert abs(result_sum.item() - 124.77149200439453) < 1e-2
assert abs(result_mean.item() - 0.16226289014816284) < 1e-3
elif torch_device in ["cuda"]:
elif torch_device in ["cuda", "xpu"]:
assert abs(result_sum.item() - 128.1663360595703) < 1e-2
assert abs(result_mean.item() - 0.16688326001167297) < 1e-3
else:
......@@ -127,7 +127,7 @@ class DPMSolverSDESchedulerTest(SchedulerCommonTest):
if torch_device in ["mps"]:
assert abs(result_sum.item() - 167.46957397460938) < 1e-2
assert abs(result_mean.item() - 0.21805934607982635) < 1e-3
elif torch_device in ["cuda"]:
elif torch_device in ["cuda", "xpu"]:
assert abs(result_sum.item() - 171.59353637695312) < 1e-2
assert abs(result_mean.item() - 0.22342908382415771) < 1e-3
else:
......@@ -159,7 +159,7 @@ class DPMSolverSDESchedulerTest(SchedulerCommonTest):
if torch_device in ["mps"]:
assert abs(result_sum.item() - 176.66974135742188) < 1e-2
assert abs(result_mean.item() - 0.23003872730981811) < 1e-2
elif torch_device in ["cuda"]:
elif torch_device in ["cuda", "xpu"]:
assert abs(result_sum.item() - 177.63653564453125) < 1e-2
assert abs(result_mean.item() - 0.23003872730981811) < 1e-2
else:
......
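The scheduler assertions now branch on torch_device membership so that XPU shares the CUDA reference values. The pattern, condensed, with the fallback numbers as hypothetical placeholders:

    if torch_device in ["mps"]:
        expected_sum, expected_mean = 167.47821044921875, 0.2178705964565277
    elif torch_device in ["cuda", "xpu"]:  # XPU reproduces the CUDA numerics
        expected_sum, expected_mean = 171.59352111816406, 0.22342906892299652
    else:
        expected_sum, expected_mean = 162.5, 0.211  # hypothetical CPU references
    assert abs(result_sum.item() - expected_sum) < 1e-2
    assert abs(result_mean.item() - expected_mean) < 1e-3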