Unverified commit 70bbbb92 authored by Chetan Kumar Verma, committed by GitHub

HPU support for unit tests (#1680)

parent d863adb2
@@ -29,8 +29,6 @@ def _(
if A.dtype != torch.uint8:
A = A.view(torch.uint8)
transpose = False if len(A.shape) == 2 and A.shape[0] == 1 else True
A = A.reshape(-1)
if GAUDI_SW_VER and (GAUDI_SW_VER.major < 1 or GAUDI_SW_VER.minor < 22):
@@ -47,7 +45,4 @@ def _(
output = out_dq.reshape(shape)
if transpose:
output = output.t()
return output
@@ -98,3 +98,14 @@ DTYPE_NAMES = {
def describe_dtype(dtype: torch.dtype) -> str:
return DTYPE_NAMES.get(dtype) or str(dtype).rpartition(".")[2]
def is_supported_on_hpu(
quant_type: str = "nf4", dtype: torch.dtype = torch.bfloat16, quant_storage: torch.dtype = torch.uint8
) -> bool:
"""
Check if the given quant_type, dtype and quant_storage are supported on HPU.
"""
if quant_type == "fp4" or dtype == torch.float16 or quant_storage not in (torch.uint8, torch.bfloat16):
return False
return True
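
As a rough usage sketch (not part of the diff itself): the helper is imported from tests.helpers and used as a skip guard in device-parametrized tests, as the hunks below do. The test name, shapes, and parameter values here are illustrative only.

import pytest
import torch

from tests.helpers import get_available_devices, is_supported_on_hpu


@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16])
def test_example_nf4_roundtrip(device, dtype):
    # Skip combinations the HPU backend does not implement (fp4 quant_type,
    # float16 compute dtype, or unsupported quant_storage dtypes).
    if device == "hpu" and not is_supported_on_hpu("nf4", dtype):
        pytest.skip("This configuration is not supported on HPU.")
    A = torch.randn(256, 256, device=device, dtype=dtype)
    # ... exercise quantize/dequantize as the real tests below do ...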
@@ -8,6 +8,7 @@ from tests.helpers import (
describe_dtype,
get_available_devices,
id_formatter,
is_supported_on_hpu,
)
TRANSPOSE_VALS = [(False, True), (False, False)]
@@ -189,6 +190,9 @@ def test_matmul_4bit(
if device == "cpu" and dtype != torch.float32 and any(req_grad) and torch.__version__ < (2, 6):
pytest.xfail("mse_loss fp16 on CPU is not supported in torch < 2.6")
if device == "hpu" and not is_supported_on_hpu(quant_type, dtype):
pytest.skip("This configuration is not supported on HPU.")
for i in range(3):
# normal multiply
if funcs[0] in [torch.mm, torch.matmul]:
......
@@ -16,6 +16,7 @@ from tests.helpers import (
get_available_devices,
get_test_dims,
id_formatter,
is_supported_on_hpu,
)
torch.set_printoptions(precision=5, sci_mode=False, linewidth=120, edgeitems=20, threshold=10000)
@@ -1101,6 +1102,9 @@ class TestQuantize4BitFunctional:
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
@pytest.mark.parametrize("blocksize", [64, 128, 256, 512, 1024, 2048, 4096])
def test_4bit_quant(self, device, dtype, quant_type, blocksize):
if device == "hpu" and not is_supported_on_hpu(quant_type, dtype):
pytest.skip("This configuration is not supported on HPU.")
A1 = torch.randn(1024, 1024, device=device, dtype=dtype)
qa, SA = F.quantize_4bit(A1, blocksize=blocksize, quant_type=quant_type)
A2 = F.dequantize_4bit(qa, SA, blocksize=blocksize, quant_type=quant_type)
@@ -1132,11 +1136,15 @@ class TestQuantize4BitFunctional:
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
@pytest.mark.parametrize("blocksize", [64, 128], ids=id_formatter("blocksize"))
def test_4bit_compressed_stats(self, device, quant_type, blocksize):
@pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=describe_dtype)
def test_4bit_compressed_stats(self, device, quant_type, blocksize, dtype):
if device == "hpu" and not is_supported_on_hpu(quant_type, dtype):
pytest.skip("FP4 quantization is not supported on HPU.")
errs1 = []
errs2 = []
for i in range(10):
A1 = torch.randn(1024, 1024, device=device).half()
A1 = torch.randn(1024, 1024, device=device, dtype=dtype)
q2, SA2 = F.quantize_4bit(A1, blocksize=blocksize, quant_type=quant_type)
q3, SA3 = F.quantize_4bit(A1, blocksize=blocksize, compress_statistics=True, quant_type=quant_type)
A2 = F.dequantize_4bit(q2, SA2, quant_type=quant_type)
@@ -1205,6 +1213,9 @@ class TestQuantize4BitFunctional:
)
@pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim"))
def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind):
if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage):
pytest.skip("This configuration is not supported on HPU.")
errs1 = []
errs2 = []
errs3 = []
@@ -1354,6 +1365,9 @@ class TestQuantize4BitFunctional:
if device == "cpu" and dtype == torch.bfloat16 and torch.__version__ < (2, 3):
pytest.skip("eye doe not support bfloat16 on CPU in torch < 2.3")
if device == "hpu" and not is_supported_on_hpu(storage_type, dtype):
pytest.skip("This configuration is not supported on HPU.")
dims = 10
torch.random.manual_seed(np.random.randint(0, 412424242))
dims = get_test_dims(0, 8192, n=dims)
......
@@ -13,6 +13,7 @@ from tests.helpers import (
describe_dtype,
get_available_devices,
id_formatter,
is_supported_on_hpu,
torch_load_from_buffer,
torch_save_to_buffer,
)
@@ -27,12 +28,17 @@ storage = {
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("quant_storage", ["uint8", "float16", "bfloat16", "float32"])
@pytest.mark.parametrize("original_dtype", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("bias", TRUE_FALSE, ids=id_formatter("bias"))
@pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
@pytest.mark.parametrize("quant_type", ["nf4", "fp4"])
@pytest.mark.parametrize("save_before_forward", TRUE_FALSE, ids=id_formatter("save_before_forward"))
def test_linear_serialization(device, quant_type, compress_statistics, bias, quant_storage, save_before_forward):
original_dtype = torch.float16
def test_linear_serialization(
device, quant_type, original_dtype, compress_statistics, bias, quant_storage, save_before_forward
):
if device == "hpu" and not is_supported_on_hpu(quant_type, original_dtype, storage[quant_storage]):
pytest.skip("This configuration is not supported on HPU.")
compute_dtype = None
layer_shape = (300, 400)
@@ -188,6 +194,9 @@ def test_linear_serialization(device, quant_type, compress_statistics, bias, quant_storage, save_before_forward):
@pytest.mark.parametrize("blocksize", [64, 128])
@pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
def test_copy_param(device, quant_type, blocksize, compress_statistics):
if device == "hpu" and not is_supported_on_hpu(quant_type):
pytest.skip("This configuration is not supported on HPU.")
tensor = torch.randn(300, 400)
param = bnb.nn.Params4bit(
data=tensor,
@@ -207,6 +216,9 @@ def test_copy_param(device, quant_type, blocksize, compress_statistics):
@pytest.mark.parametrize("blocksize", [64, 128])
@pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
def test_deepcopy_param(device, quant_type, blocksize, compress_statistics):
if device == "hpu" and not is_supported_on_hpu(quant_type):
pytest.skip("This configuration is not supported on HPU.")
tensor = torch.randn(300, 400)
param = bnb.nn.Params4bit(
data=tensor,
@@ -233,6 +245,9 @@ def test_deepcopy_param(device, quant_type, blocksize, compress_statistics):
@pytest.mark.parametrize("blocksize", [64, 128])
@pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
def test_params4bit_real_serialization(device, quant_type, blocksize, compress_statistics):
if device == "hpu" and not is_supported_on_hpu(quant_type):
pytest.skip("This configuration is not supported on HPU.")
original_tensor = torch.randn(300, 400)
original_param = bnb.nn.Params4bit(
data=original_tensor,
@@ -270,6 +285,9 @@ def test_params4bit_real_serialization(device, quant_type, blocksize, compress_statistics):
@pytest.mark.parametrize("mode", ["default", "reduce-overhead"], ids=id_formatter("mode"))
@pytest.mark.skipif(torch.__version__ < (2, 4), reason="Not supported in torch < 2.4")
def test_linear4bit_torch_compile(device, quant_type, compute_dtype, compress_statistics, bias, fullgraph, mode):
if device == "hpu" and not is_supported_on_hpu(quant_type):
pytest.skip("This configuration is not supported on HPU.")
if fullgraph and torch.__version__ < (2, 8, 0, "dev"):
pytest.skip("fullgraph mode requires torch 2.8 or higher")
@@ -314,7 +332,8 @@ def test_linear4bit_torch_compile(device, quant_type, compute_dtype, compress_statistics, bias, fullgraph, mode):
ref_output = net(x)
# Compile the model
compiled_net = torch.compile(net, fullgraph=fullgraph, mode=mode)
compile_backend = "hpu_backend" if device == "hpu" else "inductor"
compiled_net = torch.compile(net, fullgraph=fullgraph, mode=mode, backend=compile_backend)
# Get output from compiled model
with torch.no_grad():
......
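
The torch.compile change above selects Habana's compile backend when running on HPU instead of the default inductor backend. A minimal sketch of that selection, assuming an already constructed module (the helper name is illustrative, not part of the change):

import torch
import torch.nn as nn


def compile_for_device(net: nn.Module, device: str, fullgraph: bool = False, mode: str = "default"):
    # HPU graphs go through Habana's "hpu_backend"; every other device keeps
    # the default inductor backend, mirroring the test change above.
    backend = "hpu_backend" if device == "hpu" else "inductor"
    return torch.compile(net, fullgraph=fullgraph, mode=mode, backend=backend)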
@@ -5,7 +5,7 @@ import torch
import bitsandbytes
from bitsandbytes.functional import ipex_xpu
from tests.helpers import TRUE_FALSE, get_available_devices, id_formatter
from tests.helpers import TRUE_FALSE, get_available_devices, id_formatter, is_supported_on_hpu
# torch.library.opcheck is only available in torch 2.4 and later.
# When testing with older versions, we will skip it as a no-op.
@@ -158,6 +158,9 @@ class Test4bitBlockwiseQuantOps:
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
@pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
def test_quantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
if device == "hpu" and not is_supported_on_hpu(quant_type, dtype, storage_dtype):
pytest.skip("This configuration is not supported on HPU.")
A = torch.randn(1024, 1024, dtype=dtype, device=device)
out, absmax = torch.ops.bitsandbytes.quantize_4bit.default(A, blocksize, quant_type, storage_dtype)
@@ -179,6 +182,9 @@ class Test4bitBlockwiseQuantOps:
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
@pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
def test_dequantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
if device == "hpu" and not is_supported_on_hpu(quant_type, dtype, storage_dtype):
pytest.skip("This configuration is not supported on HPU.")
shape = (128, 128)
n = prod(shape)
@@ -210,6 +216,9 @@ class Test4bitBlockwiseQuantOps:
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
@pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
def test_gemv_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
if device == "hpu" and not is_supported_on_hpu(quant_type, dtype, storage_dtype):
pytest.skip("This configuration is not supported on HPU.")
out_features = 1024
in_features = 256
......
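
For reference, a sketch of how the op-level guard above combines with a direct call to the registered quantize op and, on torch >= 2.4, torch.library.opcheck (the wrapper function and tensor sizes are illustrative, not part of the change):

import torch

from tests.helpers import is_supported_on_hpu


def check_quantize_4bit(device, dtype=torch.bfloat16, quant_type="nf4", blocksize=64, storage_dtype=torch.uint8):
    # Skip configurations the HPU backend does not support, as in Test4bitBlockwiseQuantOps.
    if device == "hpu" and not is_supported_on_hpu(quant_type, dtype, storage_dtype):
        return None
    A = torch.randn(1024, 1024, dtype=dtype, device=device)
    # Direct call to the registered custom op, as in test_quantize_4bit.
    out, absmax = torch.ops.bitsandbytes.quantize_4bit.default(A, blocksize, quant_type, storage_dtype)
    # torch.library.opcheck validates the op registration; it only exists in torch >= 2.4.
    if torch.__version__ >= (2, 4):
        torch.library.opcheck(
            torch.ops.bitsandbytes.quantize_4bit.default, (A, blocksize, quant_type, storage_dtype)
        )
    return out, absmax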