Unverified Commit 15ad97f7 authored by Fanli Lin, committed by GitHub

[tests] make cuda only tests device-agnostic (#11058)

* enable bnb on xpu

* add 2 more cases

* add missing change

* add missing change

* add one more

* enable cuda only tests on xpu

* enable big gpu cases
parent 9f2d5c9e
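
The hunks below all follow one pattern: CUDA-specific calls and decorators are swapped for device-agnostic helpers from diffusers.utils.testing_utils, so the same tests can run on XPU as well. A minimal sketch of the resulting test shape, assuming only the helpers that appear in the hunks below (backend_empty_cache, require_big_accelerator, torch_device); the class name is a placeholder, not part of the commit:

    import gc
    import unittest

    from diffusers.utils.testing_utils import (
        backend_empty_cache,      # dispatches to the active backend's empty_cache
        require_big_accelerator,
        torch_device,             # resolved device string: "cuda", "xpu", "mps", or "cpu"
    )


    @require_big_accelerator
    class ExampleSlowTests(unittest.TestCase):
        def setUp(self):
            super().setUp()
            gc.collect()
            backend_empty_cache(torch_device)  # replaces torch.cuda.empty_cache()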
......@@ -449,9 +449,9 @@ class TextualInversionLoaderMixin:
# 7.5 Offload the model again
if is_model_cpu_offload:
self.enable_model_cpu_offload()
self.enable_model_cpu_offload(device=device)
elif is_sequential_cpu_offload:
self.enable_sequential_cpu_offload()
self.enable_sequential_cpu_offload(device=device)
# / Unsafe Code >
......
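Passing device= here routes the offload hooks to whichever accelerator the session resolved instead of the hard-coded CUDA default. Usage sketch, assuming an already-constructed pipeline pipe and the torch_device helper:

    from diffusers.utils.testing_utils import torch_device

    # Offload on the detected accelerator ("cuda" or "xpu"), not implicitly CUDA.
    pipe.enable_model_cpu_offload(device=torch_device)
    # Layer-by-layer variant, changed the same way elsewhere in this commit:
    pipe.enable_sequential_cpu_offload(device=torch_device)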
......@@ -320,6 +320,21 @@ def require_torch_multi_gpu(test_case):
return unittest.skipUnless(torch.cuda.device_count() > 1, "test requires multiple GPUs")(test_case)
def require_torch_multi_accelerator(test_case):
"""
Decorator marking a test that requires a multi-accelerator setup (in PyTorch). These tests are skipped on a machine
without multiple hardware accelerators.
"""
if not is_torch_available():
return unittest.skip("test requires PyTorch")(test_case)
import torch
return unittest.skipUnless(
torch.cuda.device_count() > 1 or torch.xpu.device_count() > 1, "test requires multiple hardware accelerators"
)(test_case)
def require_torch_accelerator_with_fp16(test_case):
"""Decorator marking a test that requires an accelerator with support for the FP16 data type."""
return unittest.skipUnless(_is_torch_fp16_available(torch_device), "test requires accelerator with fp16 support")(
......@@ -354,6 +369,31 @@ def require_big_gpu_with_torch_cuda(test_case):
)(test_case)
def require_big_accelerator(test_case):
"""
Decorator marking a test that requires a bigger hardware accelerator (24GB) for execution. Some example pipelines:
Flux, SD3, Cog, etc.
"""
if not is_torch_available():
return unittest.skip("test requires PyTorch")(test_case)
import torch
if not (torch.cuda.is_available() or torch.xpu.is_available()):
return unittest.skip("test requires a hardware accelerator (CUDA or XPU)")(test_case)
if torch.xpu.is_available():
device_properties = torch.xpu.get_device_properties(0)
else:
device_properties = torch.cuda.get_device_properties(0)
total_memory = device_properties.total_memory / (1024**3)
return unittest.skipUnless(
total_memory >= BIG_GPU_MEMORY,
f"test requires a hardware accelerator with at least {BIG_GPU_MEMORY} GB memory",
)(test_case)
def require_torch_accelerator_with_training(test_case):
"""Decorator marking a test that requires an accelerator with support for training."""
return unittest.skipUnless(
......
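The two new decorators gate tests on available hardware rather than on CUDA specifically. A hypothetical test class showing how they would be applied (the test names and bodies are placeholders, not part of the commit):

    class ExampleTests(unittest.TestCase):
        @require_torch_multi_accelerator
        def test_model_parallelism(self):
            # runs only when more than one CUDA or XPU device is visible
            ...

        @require_big_accelerator
        def test_flux_inference(self):
            # runs only when device 0 reports at least BIG_GPU_MEMORY GB of memory
            ...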
......@@ -124,7 +124,7 @@ class AsymmetricAutoencoderKLIntegrationTests(unittest.TestCase):
return model
def get_generator(self, seed=0):
generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda"
generator_device = "cpu" if not torch_device.startswith(torch_device) else torch_device
if torch_device != "mps":
return torch.Generator(device=generator_device).manual_seed(seed)
return torch.manual_seed(seed)
......
......@@ -165,7 +165,7 @@ class AutoencoderKLTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase):
model.eval()
# Keep generator on CPU for non-CUDA devices to compare outputs with CPU result tensors
generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda"
generator_device = "cpu" if not torch_device.startswith(torch_device) else torch_device
if torch_device != "mps":
generator = torch.Generator(device=generator_device).manual_seed(0)
else:
......@@ -263,7 +263,7 @@ class AutoencoderKLIntegrationTests(unittest.TestCase):
return model
def get_generator(self, seed=0):
generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda"
generator_device = "cpu" if not torch_device.startswith(torch_device) else torch_device
if torch_device != "mps":
return torch.Generator(device=generator_device).manual_seed(seed)
return torch.manual_seed(seed)
......
......@@ -183,7 +183,7 @@ class AutoencoderOobleckIntegrationTests(unittest.TestCase):
return model
def get_generator(self, seed=0):
generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda"
generator_device = "cpu" if not torch_device.startswith(torch_device) else torch_device
if torch_device != "mps":
return torch.Generator(device=generator_device).manual_seed(seed)
return torch.manual_seed(seed)
......
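Note that torch_device.startswith(torch_device) is always true, so the rewritten helper always seeds a generator on torch_device. Written out without the vestigial condition, the same behavior reads (a sketch, not part of the commit):

    def get_generator(self, seed=0):
        # MPS cannot host a torch.Generator, so fall back to the global RNG there;
        # every other backend (cpu, cuda, xpu) gets a device-local generator.
        if torch_device != "mps":
            return torch.Generator(device=torch_device).manual_seed(seed)
        return torch.manual_seed(seed)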
......@@ -63,7 +63,7 @@ from diffusers.utils.testing_utils import (
require_torch_accelerator,
require_torch_accelerator_with_training,
require_torch_gpu,
require_torch_multi_gpu,
require_torch_multi_accelerator,
run_test_in_subprocess,
torch_all_close,
torch_device,
......@@ -1227,7 +1227,7 @@ class ModelTesterMixin:
self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
@require_torch_multi_gpu
@require_torch_multi_accelerator
def test_model_parallelism(self):
config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
model = self.model_class(**config).eval()
......
......@@ -31,9 +31,10 @@ from diffusers import (
from diffusers.models import SD3ControlNetModel, SD3MultiControlNetModel
from diffusers.utils import load_image
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
numpy_cosine_similarity_distance,
require_big_gpu_with_torch_cuda,
require_big_accelerator,
slow,
torch_device,
)
......@@ -219,7 +220,7 @@ class StableDiffusion3ControlNetPipelineFastTests(unittest.TestCase, PipelineTes
@slow
@require_big_gpu_with_torch_cuda
@require_big_accelerator
@pytest.mark.big_gpu_with_torch_cuda
class StableDiffusion3ControlNetPipelineSlowTests(unittest.TestCase):
pipeline_class = StableDiffusion3ControlNetPipeline
......@@ -227,12 +228,12 @@ class StableDiffusion3ControlNetPipelineSlowTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_canny(self):
controlnet = SD3ControlNetModel.from_pretrained("InstantX/SD3-Controlnet-Canny", torch_dtype=torch.float16)
......@@ -272,7 +273,7 @@ class StableDiffusion3ControlNetPipelineSlowTests(unittest.TestCase):
pipe = StableDiffusion3ControlNetPipeline.from_pretrained(
"stabilityai/stable-diffusion-3-medium-diffusers", controlnet=controlnet, torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)
......@@ -304,7 +305,7 @@ class StableDiffusion3ControlNetPipelineSlowTests(unittest.TestCase):
pipe = StableDiffusion3ControlNetPipeline.from_pretrained(
"stabilityai/stable-diffusion-3-medium-diffusers", controlnet=controlnet, torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)
......@@ -338,7 +339,7 @@ class StableDiffusion3ControlNetPipelineSlowTests(unittest.TestCase):
pipe = StableDiffusion3ControlNetPipeline.from_pretrained(
"stabilityai/stable-diffusion-3-medium-diffusers", controlnet=controlnet, torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)
......
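backend_empty_cache replaces the direct torch.cuda.empty_cache() calls in setUp/tearDown. A simplified sketch of the dispatch such a helper performs; the real implementation in diffusers.utils.testing_utils covers more backends:

    def empty_cache_sketch(device: str) -> None:
        # Free cached allocator blocks on whichever backend is active.
        if device == "cuda":
            torch.cuda.empty_cache()
        elif device == "xpu":
            torch.xpu.empty_cache()
        # cpu: nothing to flush; other backends omitted from this sketch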
......@@ -12,7 +12,7 @@ from diffusers.utils.testing_utils import (
backend_empty_cache,
nightly,
numpy_cosine_similarity_distance,
require_big_gpu_with_torch_cuda,
require_big_accelerator,
slow,
torch_device,
)
......@@ -204,7 +204,7 @@ class FluxPipelineFastTests(
@nightly
@require_big_gpu_with_torch_cuda
@require_big_accelerator
@pytest.mark.big_gpu_with_torch_cuda
class FluxPipelineSlowTests(unittest.TestCase):
pipeline_class = FluxPipeline
......@@ -292,7 +292,7 @@ class FluxPipelineSlowTests(unittest.TestCase):
@slow
@require_big_gpu_with_torch_cuda
@require_big_accelerator
@pytest.mark.big_gpu_with_torch_cuda
class FluxIPAdapterPipelineSlowTests(unittest.TestCase):
pipeline_class = FluxPipeline
......@@ -304,12 +304,12 @@ class FluxIPAdapterPipelineSlowTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def get_inputs(self, device, seed=0):
if str(device).startswith("mps"):
......
......@@ -8,15 +8,16 @@ import torch
from diffusers import FluxPipeline, FluxPriorReduxPipeline
from diffusers.utils import load_image
from diffusers.utils.testing_utils import (
backend_empty_cache,
numpy_cosine_similarity_distance,
require_big_gpu_with_torch_cuda,
require_big_accelerator,
slow,
torch_device,
)
@slow
@require_big_gpu_with_torch_cuda
@require_big_accelerator
@pytest.mark.big_gpu_with_torch_cuda
class FluxReduxSlowTests(unittest.TestCase):
pipeline_class = FluxPriorReduxPipeline
......@@ -27,12 +28,12 @@ class FluxReduxSlowTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def get_inputs(self, device, seed=0):
init_image = load_image(
......@@ -59,7 +60,7 @@ class FluxReduxSlowTests(unittest.TestCase):
self.base_repo_id, torch_dtype=torch.bfloat16, text_encoder=None, text_encoder_2=None
)
pipe_redux.to(torch_device)
pipe_base.enable_model_cpu_offload()
pipe_base.enable_model_cpu_offload(device=torch_device)
inputs = self.get_inputs(torch_device)
base_pipeline_inputs = self.get_base_pipeline_inputs(torch_device)
......
......@@ -262,7 +262,7 @@ class StableDiffusion3PAGImg2ImgPipelineIntegrationTests(unittest.TestCase):
pipeline = AutoPipelineForImage2Image.from_pretrained(
self.repo_id, enable_pag=True, torch_dtype=torch.float16, pag_applied_layers=["blocks.(4|17)"]
)
pipeline.enable_model_cpu_offload()
pipeline.enable_model_cpu_offload(device=torch_device)
pipeline.set_progress_bar_config(disable=None)
inputs = self.get_inputs(torch_device, guidance_scale=0.0, pag_scale=1.8)
......
......@@ -57,7 +57,7 @@ from diffusers.utils.testing_utils import (
require_accelerate_version_greater,
require_torch_2,
require_torch_accelerator,
require_torch_multi_gpu,
require_torch_multi_accelerator,
run_test_in_subprocess,
skip_mps,
slow,
......@@ -1409,7 +1409,7 @@ class StableDiffusionPipelineNightlyTests(unittest.TestCase):
# (sayakpaul): This test suite was run in the DGX with two GPUs (1, 2).
@slow
@require_torch_multi_gpu
@require_torch_multi_accelerator
@require_accelerate_version_greater("0.27.0")
class StableDiffusionPipelineDeviceMapTests(unittest.TestCase):
def tearDown(self):
......@@ -1497,7 +1497,7 @@ class StableDiffusionPipelineDeviceMapTests(unittest.TestCase):
assert sd_pipe_with_device_map.hf_device_map is None
# Make sure `to()` can be used and the pipeline can be called.
pipe = sd_pipe_with_device_map.to("cuda")
pipe = sd_pipe_with_device_map.to(torch_device)
_ = pipe("hello", num_inference_steps=2)
def test_reset_device_map_enable_model_cpu_offload(self):
......@@ -1509,7 +1509,7 @@ class StableDiffusionPipelineDeviceMapTests(unittest.TestCase):
assert sd_pipe_with_device_map.hf_device_map is None
# Make sure `enable_model_cpu_offload()` can be used and the pipeline can be called.
sd_pipe_with_device_map.enable_model_cpu_offload()
sd_pipe_with_device_map.enable_model_cpu_offload(device=torch_device)
_ = sd_pipe_with_device_map("hello", num_inference_steps=2)
def test_reset_device_map_enable_sequential_cpu_offload(self):
......@@ -1521,5 +1521,5 @@ class StableDiffusionPipelineDeviceMapTests(unittest.TestCase):
assert sd_pipe_with_device_map.hf_device_map is None
# Make sure `enable_sequential_cpu_offload()` can be used and the pipeline can be called.
sd_pipe_with_device_map.enable_sequential_cpu_offload()
sd_pipe_with_device_map.enable_sequential_cpu_offload(device=torch_device)
_ = sd_pipe_with_device_map("hello", num_inference_steps=2)
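After the reset_device_map() step these tests exercise, hf_device_map is None and the pipeline behaves like a plain single-device pipeline again, so it can be moved or offloaded onto whichever accelerator is active. Condensed usage sketch of the three tests above (assuming the reset has already run):

    assert sd_pipe_with_device_map.hf_device_map is None
    pipe = sd_pipe_with_device_map.to(torch_device)   # plain placement works again
    _ = pipe("hello", num_inference_steps=2)
    # ...and so does offloading, in a fresh pipeline:
    # sd_pipe_with_device_map.enable_model_cpu_offload(device=torch_device)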
......@@ -10,7 +10,7 @@ from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, SD3Transfo
from diffusers.utils.testing_utils import (
backend_empty_cache,
numpy_cosine_similarity_distance,
require_big_gpu_with_torch_cuda,
require_big_accelerator,
slow,
torch_device,
)
......@@ -232,7 +232,7 @@ class StableDiffusion3PipelineFastTests(unittest.TestCase, PipelineTesterMixin):
@slow
@require_big_gpu_with_torch_cuda
@require_big_accelerator
@pytest.mark.big_gpu_with_torch_cuda
class StableDiffusion3PipelineSlowTests(unittest.TestCase):
pipeline_class = StableDiffusion3Pipeline
......
......@@ -18,7 +18,7 @@ from diffusers.utils.testing_utils import (
backend_empty_cache,
floats_tensor,
numpy_cosine_similarity_distance,
require_big_gpu_with_torch_cuda,
require_big_accelerator,
slow,
torch_device,
)
......@@ -166,7 +166,7 @@ class StableDiffusion3Img2ImgPipelineFastTests(PipelineLatentTesterMixin, unitte
@slow
@require_big_gpu_with_torch_cuda
@require_big_accelerator
@pytest.mark.big_gpu_with_torch_cuda
class StableDiffusion3Img2ImgPipelineSlowTests(unittest.TestCase):
pipeline_class = StableDiffusion3Img2ImgPipeline
......@@ -202,11 +202,10 @@ class StableDiffusion3Img2ImgPipelineSlowTests(unittest.TestCase):
}
def test_sd3_img2img_inference(self):
torch.manual_seed(0)
pipe = self.pipeline_class.from_pretrained(self.repo_id, torch_dtype=torch.float16)
pipe.enable_model_cpu_offload(device=torch_device)
inputs = self.get_inputs(torch_device)
image = pipe(**inputs).images[0]
image_slice = image[0, :10, :10]
expected_slice = np.array(
......
......@@ -45,6 +45,7 @@ from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.source_code_parsing_utils import ReturnNameVisitor
from diffusers.utils.testing_utils import (
CaptureLogger,
backend_empty_cache,
require_accelerate_version_greater,
require_accelerator,
require_hf_hub_version_greater,
......@@ -1108,13 +1109,13 @@ class PipelineTesterMixin:
# clean up the VRAM before each test
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test in case of CUDA runtime errors
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_save_load_local(self, expected_max_difference=5e-4):
components = self.get_dummy_components()
......@@ -1423,7 +1424,6 @@ class PipelineTesterMixin:
def test_save_load_optional_components(self, expected_max_difference=1e-4):
if not hasattr(self.pipeline_class, "_optional_components"):
return
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
for component in pipe.components.values():
......@@ -1438,6 +1438,7 @@ class PipelineTesterMixin:
generator_device = "cpu"
inputs = self.get_dummy_inputs(generator_device)
torch.manual_seed(0)
output = pipe(**inputs)[0]
with tempfile.TemporaryDirectory() as tmpdir:
......@@ -1456,6 +1457,7 @@ class PipelineTesterMixin:
)
inputs = self.get_dummy_inputs(generator_device)
torch.manual_seed(0)
output_loaded = pipe_loaded(**inputs)[0]
max_diff = np.abs(to_np(output) - to_np(output_loaded)).max()
......@@ -1550,12 +1552,14 @@ class PipelineTesterMixin:
generator_device = "cpu"
inputs = self.get_dummy_inputs(generator_device)
torch.manual_seed(0)
output_without_offload = pipe(**inputs)[0]
pipe.enable_sequential_cpu_offload(device=torch_device)
assert pipe._execution_device.type == torch_device
inputs = self.get_dummy_inputs(generator_device)
torch.manual_seed(0)
output_with_offload = pipe(**inputs)[0]
max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
......@@ -1613,12 +1617,14 @@ class PipelineTesterMixin:
pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(generator_device)
torch.manual_seed(0)
output_without_offload = pipe(**inputs)[0]
pipe.enable_model_cpu_offload(device=torch_device)
assert pipe._execution_device.type == torch_device
inputs = self.get_dummy_inputs(generator_device)
torch.manual_seed(0)
output_with_offload = pipe(**inputs)[0]
max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
......
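The inserted torch.manual_seed(0) calls pin the global RNG immediately before each pipeline invocation, so the outputs with and without offloading draw identical noise and stay comparable. The resulting pattern, condensed (threshold illustrative; the real tests use expected_max_difference):

    torch.manual_seed(0)
    output_without_offload = pipe(**inputs)[0]

    pipe.enable_model_cpu_offload(device=torch_device)
    torch.manual_seed(0)  # reseed so both runs draw identical noise
    output_with_offload = pipe(**inputs)[0]

    max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
    assert max_diff < 1e-4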
......@@ -303,6 +303,7 @@ class UnCLIPPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler()
)
shape = (batch_size, decoder.config.in_channels, decoder.config.sample_size, decoder.config.sample_size)
generator = torch.Generator(device=device).manual_seed(0)
decoder_latents = pipe.prepare_latents(
shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler()
)
......
......@@ -407,6 +407,7 @@ class UnCLIPImageVariationPipelineFastTests(PipelineTesterMixin, unittest.TestCa
pipe.super_res_first.config.sample_size,
pipe.super_res_first.config.sample_size,
)
generator = torch.Generator(device=device).manual_seed(0)
super_res_latents = pipe.prepare_latents(
shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler()
)
......
......@@ -64,7 +64,7 @@ class DPMSolverSDESchedulerTest(SchedulerCommonTest):
if torch_device in ["mps"]:
assert abs(result_sum.item() - 167.47821044921875) < 1e-2
assert abs(result_mean.item() - 0.2178705964565277) < 1e-3
elif torch_device in ["cuda"]:
elif torch_device in ["cuda", "xpu"]:
assert abs(result_sum.item() - 171.59352111816406) < 1e-2
assert abs(result_mean.item() - 0.22342906892299652) < 1e-3
else:
......@@ -96,7 +96,7 @@ class DPMSolverSDESchedulerTest(SchedulerCommonTest):
if torch_device in ["mps"]:
assert abs(result_sum.item() - 124.77149200439453) < 1e-2
assert abs(result_mean.item() - 0.16226289014816284) < 1e-3
elif torch_device in ["cuda"]:
elif torch_device in ["cuda", "xpu"]:
assert abs(result_sum.item() - 128.1663360595703) < 1e-2
assert abs(result_mean.item() - 0.16688326001167297) < 1e-3
else:
......@@ -127,7 +127,7 @@ class DPMSolverSDESchedulerTest(SchedulerCommonTest):
if torch_device in ["mps"]:
assert abs(result_sum.item() - 167.46957397460938) < 1e-2
assert abs(result_mean.item() - 0.21805934607982635) < 1e-3
elif torch_device in ["cuda"]:
elif torch_device in ["cuda", "xpu"]:
assert abs(result_sum.item() - 171.59353637695312) < 1e-2
assert abs(result_mean.item() - 0.22342908382415771) < 1e-3
else:
......@@ -159,7 +159,7 @@ class DPMSolverSDESchedulerTest(SchedulerCommonTest):
if torch_device in ["mps"]:
assert abs(result_sum.item() - 176.66974135742188) < 1e-2
assert abs(result_mean.item() - 0.23003872730981811) < 1e-2
elif torch_device in ["cuda"]:
elif torch_device in ["cuda", "xpu"]:
assert abs(result_sum.item() - 177.63653564453125) < 1e-2
assert abs(result_mean.item() - 0.23003872730981811) < 1e-2
else:
......
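The scheduler assertions now branch on torch_device membership so that XPU shares the CUDA reference values. The pattern, condensed, with the fallback numbers as hypothetical placeholders:

    if torch_device in ["mps"]:
        expected_sum, expected_mean = 167.47821044921875, 0.2178705964565277
    elif torch_device in ["cuda", "xpu"]:  # XPU reproduces the CUDA numerics
        expected_sum, expected_mean = 171.59352111816406, 0.22342906892299652
    else:
        expected_sum, expected_mean = 162.5, 0.211  # hypothetical CPU references
    assert abs(result_sum.item() - expected_sum) < 1e-2
    assert abs(result_mean.item() - expected_mean) < 1e-3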