Unverified commit 56f74005 authored by Fanli Lin, committed by GitHub

[tests] enable bnb tests on xpu (#11001)

* enable bnb on xpu

* add 2 more cases

* add missing change

* add missing change

* add one more
parent a34d97ce
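
The changes below all follow one pattern: code that previously assumed CUDA now checks for an Intel XPU device first and falls back to CUDA otherwise, so the bitsandbytes (bnb) tests can run on XPU machines. A minimal sketch of that device-selection logic (the helper name is illustrative, not part of the diffusers API; it assumes a PyTorch build that exposes `torch.xpu`):

```python
import torch


def current_accelerator_device() -> str:
    # Prefer Intel XPU when present, otherwise fall back to CUDA,
    # mirroring the update_device_map() changes in the quantizers below.
    if torch.xpu.is_available():
        return f"xpu:{torch.xpu.current_device()}"
    if torch.cuda.is_available():
        return f"cuda:{torch.cuda.current_device()}"
    raise RuntimeError("No GPU found. A GPU is needed for quantization.")
```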
@@ -427,7 +427,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
                 "It seems like you have activated a device mapping strategy on the pipeline which doesn't allow explicit device placement using `to()`. You can call `reset_device_map()` to remove the existing device map from the pipeline."
             )
 
-        if device_type == "cuda":
+        if device_type in ["cuda", "xpu"]:
             if pipeline_is_sequentially_offloaded and not pipeline_has_bnb:
                 raise ValueError(
                     "It seems like you have activated sequential model offloading by calling `enable_sequential_cpu_offload`, but are now attempting to move the pipeline to GPU. This is not compatible with offloading. Please, move your pipeline `.to('cpu')` or consider removing the move altogether if you use sequential offloading."
@@ -440,7 +440,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
         # Display a warning in this case (the operation succeeds but the benefits are lost)
         pipeline_is_offloaded = any(module_is_offloaded(module) for _, module in self.components.items())
 
-        if pipeline_is_offloaded and device_type == "cuda":
+        if pipeline_is_offloaded and device_type in ["cuda", "xpu"]:
             logger.warning(
                 f"It seems like you have activated model offloading by calling `enable_model_cpu_offload`, but are now manually moving the pipeline to GPU. It is strongly recommended against doing so as memory gains from offloading are likely to be lost. Offloading automatically takes care of moving the individual components {', '.join(self.components.keys())} to GPU when needed. To make sure offloading works as expected, you should consider moving the pipeline back to CPU: `pipeline.to('cpu')` or removing the move altogether if you use offloading."
             )
@@ -61,7 +61,7 @@ class BnB4BitDiffusersQuantizer(DiffusersQuantizer):
             self.modules_to_not_convert = self.quantization_config.llm_int8_skip_modules
 
     def validate_environment(self, *args, **kwargs):
-        if not torch.cuda.is_available():
+        if not (torch.cuda.is_available() or torch.xpu.is_available()):
             raise RuntimeError("No GPU found. A GPU is needed for quantization.")
         if not is_accelerate_available() or is_accelerate_version("<", "0.26.0"):
             raise ImportError(
@@ -238,11 +238,15 @@ class BnB4BitDiffusersQuantizer(DiffusersQuantizer):
 
     def update_device_map(self, device_map):
         if device_map is None:
-            device_map = {"": f"cuda:{torch.cuda.current_device()}"}
+            if torch.xpu.is_available():
+                current_device = f"xpu:{torch.xpu.current_device()}"
+            else:
+                current_device = f"cuda:{torch.cuda.current_device()}"
+            device_map = {"": current_device}
             logger.info(
                 "The device_map was not initialized. "
                 "Setting device_map to {"
-                ": f`cuda:{torch.cuda.current_device()}`}. "
+                ": {current_device}}. "
                 "If you want to use the model for inference, please set device_map ='auto' "
             )
         return device_map
@@ -312,7 +316,10 @@ class BnB4BitDiffusersQuantizer(DiffusersQuantizer):
             logger.info(
                 "Model was found to be on CPU (could happen as a result of `enable_model_cpu_offload()`). So, moving it to GPU. After dequantization, will move the model back to CPU again to preserve the previous device."
             )
-            model.to(torch.cuda.current_device())
+            if torch.xpu.is_available():
+                model.to(torch.xpu.current_device())
+            else:
+                model.to(torch.cuda.current_device())
 
         model = dequantize_and_replace(
             model, self.modules_to_not_convert, quantization_config=self.quantization_config
@@ -343,7 +350,7 @@ class BnB8BitDiffusersQuantizer(DiffusersQuantizer):
             self.modules_to_not_convert = self.quantization_config.llm_int8_skip_modules
 
     def validate_environment(self, *args, **kwargs):
-        if not torch.cuda.is_available():
+        if not (torch.cuda.is_available() or torch.xpu.is_available()):
             raise RuntimeError("No GPU found. A GPU is needed for quantization.")
         if not is_accelerate_available() or is_accelerate_version("<", "0.26.0"):
             raise ImportError(
@@ -402,11 +409,15 @@ class BnB8BitDiffusersQuantizer(DiffusersQuantizer):
     # Copied from diffusers.quantizers.bitsandbytes.bnb_quantizer.BnB4BitDiffusersQuantizer.update_device_map
     def update_device_map(self, device_map):
         if device_map is None:
-            device_map = {"": f"cuda:{torch.cuda.current_device()}"}
+            if torch.xpu.is_available():
+                current_device = f"xpu:{torch.xpu.current_device()}"
+            else:
+                current_device = f"cuda:{torch.cuda.current_device()}"
+            device_map = {"": current_device}
             logger.info(
                 "The device_map was not initialized. "
                 "Setting device_map to {"
-                ": f`cuda:{torch.cuda.current_device()}`}. "
+                ": {current_device}}. "
                 "If you want to use the model for inference, please set device_map ='auto' "
             )
         return device_map
@@ -574,10 +574,10 @@ def load_numpy(arry: Union[str, np.ndarray], local_path: Optional[str] = None) -
     return arry
 
 
-def load_pt(url: str):
+def load_pt(url: str, map_location: str):
     response = requests.get(url)
     response.raise_for_status()
-    arry = torch.load(BytesIO(response.content))
+    arry = torch.load(BytesIO(response.content), map_location=map_location)
     return arry
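
The new `map_location` parameter matters because `torch.load` otherwise tries to restore tensors onto the device they were saved from (typically CUDA), which fails on CUDA-less XPU hosts. A minimal usage sketch of the updated helper, assuming `load_pt` and `torch_device` are importable from `diffusers.utils.testing_utils` as in the tests below:

```python
from diffusers.utils.testing_utils import load_pt, torch_device

# Fetch a saved tensor artifact and place it directly on the active test
# accelerator ("xpu", "cuda", or "cpu") instead of its original save device.
prompt_embeds = load_pt(
    "https://huggingface.co/datasets/hf-internal-testing/bnb-diffusers-testing-artifacts/resolve/main/prompt_embeds.pt",
    map_location=torch_device,
)
print(prompt_embeds.shape, prompt_embeds.device)
```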
@@ -377,9 +377,10 @@ class IPAdapterSDIntegrationTests(IPAdapterNightlyTestsMixin):
         pipeline.set_ip_adapter_scale(0.7)
 
         inputs = self.get_dummy_inputs()
-        id_embeds = load_pt("https://huggingface.co/datasets/fabiorigano/testing-images/resolve/main/ai_face2.ipadpt")[
-            0
-        ]
+        id_embeds = load_pt(
+            "https://huggingface.co/datasets/fabiorigano/testing-images/resolve/main/ai_face2.ipadpt",
+            map_location=torch_device,
+        )[0]
         id_embeds = id_embeds.reshape((2, 1, 1, 512))
         inputs["ip_adapter_image_embeds"] = [id_embeds]
         inputs["ip_adapter_image"] = None
@@ -26,6 +26,7 @@ from diffusers import BitsAndBytesConfig, DiffusionPipeline, FluxTransformer2DMo
 from diffusers.utils import is_accelerate_version, logging
 from diffusers.utils.testing_utils import (
     CaptureLogger,
+    backend_empty_cache,
     is_bitsandbytes_available,
     is_torch_available,
     is_transformers_available,
@@ -35,7 +36,7 @@ from diffusers.utils.testing_utils import (
     require_bitsandbytes_version_greater,
     require_peft_backend,
     require_torch,
-    require_torch_gpu,
+    require_torch_accelerator,
     require_transformers_version_greater,
     slow,
     torch_device,
@@ -66,7 +67,7 @@ if is_bitsandbytes_available():
 @require_bitsandbytes_version_greater("0.43.2")
 @require_accelerate
 @require_torch
-@require_torch_gpu
+@require_torch_accelerator
 @slow
 class Base4bitTests(unittest.TestCase):
     # We need to test on relatively large models (aka >1b parameters otherwise the quantiztion may not work as expected)
@@ -84,13 +85,16 @@ class Base4bitTests(unittest.TestCase):
     def get_dummy_inputs(self):
         prompt_embeds = load_pt(
-            "https://huggingface.co/datasets/hf-internal-testing/bnb-diffusers-testing-artifacts/resolve/main/prompt_embeds.pt"
+            "https://huggingface.co/datasets/hf-internal-testing/bnb-diffusers-testing-artifacts/resolve/main/prompt_embeds.pt",
+            torch_device,
         )
         pooled_prompt_embeds = load_pt(
-            "https://huggingface.co/datasets/hf-internal-testing/bnb-diffusers-testing-artifacts/resolve/main/pooled_prompt_embeds.pt"
+            "https://huggingface.co/datasets/hf-internal-testing/bnb-diffusers-testing-artifacts/resolve/main/pooled_prompt_embeds.pt",
+            torch_device,
         )
         latent_model_input = load_pt(
-            "https://huggingface.co/datasets/hf-internal-testing/bnb-diffusers-testing-artifacts/resolve/main/latent_model_input.pt"
+            "https://huggingface.co/datasets/hf-internal-testing/bnb-diffusers-testing-artifacts/resolve/main/latent_model_input.pt",
+            torch_device,
         )
 
         input_dict_for_transformer = {
@@ -106,7 +110,7 @@ class Base4bitTests(unittest.TestCase):
 class BnB4BitBasicTests(Base4bitTests):
     def setUp(self):
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
         # Models
         self.model_fp16 = SD3Transformer2DModel.from_pretrained(
@@ -128,7 +132,7 @@ class BnB4BitBasicTests(Base4bitTests):
         del self.model_4bit
 
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_quantization_num_parameters(self):
         r"""
@@ -224,7 +228,7 @@ class BnB4BitBasicTests(Base4bitTests):
                 self.assertTrue(module.weight.dtype == torch.uint8)
 
         # test if inference works.
-        with torch.no_grad() and torch.amp.autocast("cuda", dtype=torch.float16):
+        with torch.no_grad() and torch.amp.autocast(torch_device, dtype=torch.float16):
             input_dict_for_transformer = self.get_dummy_inputs()
             model_inputs = {
                 k: v.to(device=torch_device) for k, v in input_dict_for_transformer.items() if not isinstance(v, bool)
@@ -266,9 +270,9 @@ class BnB4BitBasicTests(Base4bitTests):
         self.assertAlmostEqual(self.model_4bit.get_memory_footprint(), mem_before)
 
         # Move back to CUDA device
-        for device in [0, "cuda", "cuda:0", "call()"]:
+        for device in [0, f"{torch_device}", f"{torch_device}:0", "call()"]:
             if device == "call()":
-                self.model_4bit.cuda(0)
+                self.model_4bit.to(f"{torch_device}:0")
             else:
                 self.model_4bit.to(device)
             self.assertEqual(self.model_4bit.device, torch.device(0))
@@ -286,7 +290,7 @@ class BnB4BitBasicTests(Base4bitTests):
         with self.assertRaises(ValueError):
             # Tries with a `device` and `dtype`
-            self.model_4bit.to(device="cuda:0", dtype=torch.float16)
+            self.model_4bit.to(device=f"{torch_device}:0", dtype=torch.float16)
 
         with self.assertRaises(ValueError):
             # Tries with a cast
@@ -297,7 +301,7 @@ class BnB4BitBasicTests(Base4bitTests):
             self.model_4bit.half()
 
         # This should work
-        self.model_4bit.to("cuda")
+        self.model_4bit.to(torch_device)
 
         # Test if we did not break anything
         self.model_fp16 = self.model_fp16.to(dtype=torch.float32, device=torch_device)
@@ -321,7 +325,7 @@ class BnB4BitBasicTests(Base4bitTests):
         _ = self.model_fp16.float()
 
         # Check that this does not throw an error
-        _ = self.model_fp16.cuda()
+        _ = self.model_fp16.to(torch_device)
 
     def test_bnb_4bit_wrong_config(self):
         r"""
@@ -398,7 +402,7 @@ class BnB4BitTrainingTests(Base4bitTests):
         model_inputs.update({k: v for k, v in input_dict_for_transformer.items() if k not in model_inputs})
 
         # Step 4: Check if the gradient is not None
-        with torch.amp.autocast("cuda", dtype=torch.float16):
+        with torch.amp.autocast(torch_device, dtype=torch.float16):
             out = self.model_4bit(**model_inputs)[0]
             out.norm().backward()
 
@@ -412,7 +416,7 @@ class BnB4BitTrainingTests(Base4bitTests):
 class SlowBnb4BitTests(Base4bitTests):
     def setUp(self) -> None:
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
         nf4_config = BitsAndBytesConfig(
             load_in_4bit=True,
@@ -431,7 +435,7 @@ class SlowBnb4BitTests(Base4bitTests):
         del self.pipeline_4bit
 
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_quality(self):
         output = self.pipeline_4bit(
@@ -501,7 +505,7 @@ class SlowBnb4BitTests(Base4bitTests):
         reason="Test will pass after https://github.com/huggingface/accelerate/pull/3223 is in a release.",
         strict=True,
     )
-    def test_pipeline_cuda_placement_works_with_nf4(self):
+    def test_pipeline_device_placement_works_with_nf4(self):
         transformer_nf4_config = BitsAndBytesConfig(
             load_in_4bit=True,
             bnb_4bit_quant_type="nf4",
@@ -532,7 +536,7 @@ class SlowBnb4BitTests(Base4bitTests):
             transformer=transformer_4bit,
             text_encoder_3=text_encoder_3_4bit,
             torch_dtype=torch.float16,
-        ).to("cuda")
+        ).to(torch_device)
 
         # Check if inference works.
         _ = pipeline_4bit("table", max_sequence_length=20, num_inference_steps=2)
@@ -696,7 +700,7 @@ class SlowBnb4BitFluxTests(Base4bitTests):
 class BaseBnb4BitSerializationTests(Base4bitTests):
     def tearDown(self):
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_serialization(self, quant_type="nf4", double_quant=True, safe_serialization=True):
         r"""
@@ -31,6 +31,7 @@ from diffusers import (
 from diffusers.utils import is_accelerate_version
 from diffusers.utils.testing_utils import (
     CaptureLogger,
+    backend_empty_cache,
     is_bitsandbytes_available,
     is_torch_available,
     is_transformers_available,
@@ -40,7 +41,7 @@ from diffusers.utils.testing_utils import (
     require_bitsandbytes_version_greater,
     require_peft_version_greater,
     require_torch,
-    require_torch_gpu,
+    require_torch_accelerator,
     require_transformers_version_greater,
     slow,
     torch_device,
@@ -71,7 +72,7 @@ if is_bitsandbytes_available():
 @require_bitsandbytes_version_greater("0.43.2")
 @require_accelerate
 @require_torch
-@require_torch_gpu
+@require_torch_accelerator
 @slow
 class Base8bitTests(unittest.TestCase):
     # We need to test on relatively large models (aka >1b parameters otherwise the quantiztion may not work as expected)
@@ -111,7 +112,7 @@ class Base8bitTests(unittest.TestCase):
 class BnB8bitBasicTests(Base8bitTests):
     def setUp(self):
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
         # Models
         self.model_fp16 = SD3Transformer2DModel.from_pretrained(
@@ -129,7 +130,7 @@ class BnB8bitBasicTests(Base8bitTests):
         del self.model_8bit
 
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_quantization_num_parameters(self):
         r"""
@@ -279,7 +280,7 @@ class BnB8bitBasicTests(Base8bitTests):
         with self.assertRaises(ValueError):
             # Tries with a `device`
-            self.model_8bit.to(torch.device("cuda:0"))
+            self.model_8bit.to(torch.device(f"{torch_device}:0"))
 
         with self.assertRaises(ValueError):
             # Tries with a `device`
@@ -317,7 +318,7 @@ class BnB8bitBasicTests(Base8bitTests):
 class Bnb8bitDeviceTests(Base8bitTests):
     def setUp(self) -> None:
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
         mixed_int8_config = BitsAndBytesConfig(load_in_8bit=True)
         self.model_8bit = SanaTransformer2DModel.from_pretrained(
@@ -331,7 +332,7 @@ class Bnb8bitDeviceTests(Base8bitTests):
         del self.model_8bit
 
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_buffers_device_assignment(self):
         for buffer_name, buffer in self.model_8bit.named_buffers():
@@ -345,7 +346,7 @@ class Bnb8bitDeviceTests(Base8bitTests):
 class BnB8bitTrainingTests(Base8bitTests):
     def setUp(self):
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
         mixed_int8_config = BitsAndBytesConfig(load_in_8bit=True)
         self.model_8bit = SD3Transformer2DModel.from_pretrained(
@@ -389,7 +390,7 @@ class BnB8bitTrainingTests(Base8bitTests):
 class SlowBnb8bitTests(Base8bitTests):
     def setUp(self) -> None:
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
         mixed_int8_config = BitsAndBytesConfig(load_in_8bit=True)
         model_8bit = SD3Transformer2DModel.from_pretrained(
@@ -404,7 +405,7 @@ class SlowBnb8bitTests(Base8bitTests):
         del self.pipeline_8bit
 
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_quality(self):
         output = self.pipeline_8bit(
@@ -616,7 +617,7 @@ class SlowBnb8bitTests(Base8bitTests):
 class SlowBnb8bitFluxTests(Base8bitTests):
     def setUp(self) -> None:
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
         model_id = "hf-internal-testing/flux.1-dev-int8-pkg"
         t5_8bit = T5EncoderModel.from_pretrained(model_id, subfolder="text_encoder_2")
@@ -633,7 +634,7 @@ class SlowBnb8bitFluxTests(Base8bitTests):
         del self.pipeline_8bit
 
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_quality(self):
         # keep the resolution and max tokens to a lower number for faster execution.
@@ -680,7 +681,7 @@ class SlowBnb8bitFluxTests(Base8bitTests):
 class BaseBnb8bitSerializationTests(Base8bitTests):
     def setUp(self):
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
         quantization_config = BitsAndBytesConfig(
             load_in_8bit=True,
@@ -693,7 +694,7 @@ class BaseBnb8bitSerializationTests(Base8bitTests):
         del self.model_0
 
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def test_serialization(self):
         r"""