Unverified Commit 70bbbb92 authored by Chetan Kumar Verma, committed by GitHub

HPU support for unit tests (#1680)

parent d863adb2
@@ -29,8 +29,6 @@ def _(
     if A.dtype != torch.uint8:
         A = A.view(torch.uint8)
-    transpose = False if len(A.shape) == 2 and A.shape[0] == 1 else True
     A = A.reshape(-1)
     if GAUDI_SW_VER and (GAUDI_SW_VER.major < 1 or GAUDI_SW_VER.minor < 22):
@@ -47,7 +45,4 @@ def _(
     output = out_dq.reshape(shape)
-    if transpose:
-        output = output.t()
     return output
@@ -98,3 +98,14 @@ DTYPE_NAMES = {
 def describe_dtype(dtype: torch.dtype) -> str:
     return DTYPE_NAMES.get(dtype) or str(dtype).rpartition(".")[2]
+
+
+def is_supported_on_hpu(
+    quant_type: str = "nf4", dtype: torch.dtype = torch.bfloat16, quant_storage: torch.dtype = torch.uint8
+) -> bool:
+    """
+    Check if the given quant_type, dtype and quant_storage are supported on HPU.
+    """
+    if quant_type == "fp4" or dtype == torch.float16 or quant_storage not in (torch.uint8, torch.bfloat16):
+        return False
+    return True
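The helper above is the single gate the test suites use for Gaudi/HPU coverage: FP4 quantization, float16 compute, and quant storage dtypes other than uint8/bfloat16 are all reported as unsupported. A minimal sketch of the skip-guard pattern the rest of this PR applies in each test file (the test name and parametrization below are illustrative, not part of the diff):

```python
import pytest
import torch

from tests.helpers import get_available_devices, is_supported_on_hpu


@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16])
def test_example_4bit(device, quant_type, dtype):
    # Hypothetical test: skip parameter combinations the HPU backend cannot run,
    # mirroring the guards added to the real tests in this PR.
    if device == "hpu" and not is_supported_on_hpu(quant_type, dtype):
        pytest.skip("This configuration is not supported on HPU.")
    # ... actual test body would go here ...
```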
@@ -8,6 +8,7 @@ from tests.helpers import (
     describe_dtype,
     get_available_devices,
     id_formatter,
+    is_supported_on_hpu,
 )

 TRANSPOSE_VALS = [(False, True), (False, False)]
@@ -189,6 +190,9 @@ def test_matmul_4bit(
     if device == "cpu" and dtype != torch.float32 and any(req_grad) and torch.__version__ < (2, 6):
         pytest.xfail("mse_loss fp16 on CPU is not supported in torch < 2.6")

+    if device == "hpu" and not is_supported_on_hpu(quant_type, dtype):
+        pytest.skip("This configuration is not supported on HPU.")
+
     for i in range(3):
         # normal multiply
         if funcs[0] in [torch.mm, torch.matmul]:
...
@@ -16,6 +16,7 @@ from tests.helpers import (
     get_available_devices,
     get_test_dims,
     id_formatter,
+    is_supported_on_hpu,
 )

 torch.set_printoptions(precision=5, sci_mode=False, linewidth=120, edgeitems=20, threshold=10000)
@@ -1101,6 +1102,9 @@ class TestQuantize4BitFunctional:
     @pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
     @pytest.mark.parametrize("blocksize", [64, 128, 256, 512, 1024, 2048, 4096])
     def test_4bit_quant(self, device, dtype, quant_type, blocksize):
+        if device == "hpu" and not is_supported_on_hpu(quant_type, dtype):
+            pytest.skip("This configuration is not supported on HPU.")
+
         A1 = torch.randn(1024, 1024, device=device, dtype=dtype)
         qa, SA = F.quantize_4bit(A1, blocksize=blocksize, quant_type=quant_type)
         A2 = F.dequantize_4bit(qa, SA, blocksize=blocksize, quant_type=quant_type)
@@ -1132,11 +1136,15 @@ class TestQuantize4BitFunctional:
     @pytest.mark.parametrize("device", get_available_devices())
     @pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
     @pytest.mark.parametrize("blocksize", [64, 128], ids=id_formatter("blocksize"))
-    def test_4bit_compressed_stats(self, device, quant_type, blocksize):
+    @pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=describe_dtype)
+    def test_4bit_compressed_stats(self, device, quant_type, blocksize, dtype):
+        if device == "hpu" and not is_supported_on_hpu(quant_type, dtype):
+            pytest.skip("FP4 quantization is not supported on HPU.")
+
         errs1 = []
         errs2 = []
         for i in range(10):
-            A1 = torch.randn(1024, 1024, device=device).half()
+            A1 = torch.randn(1024, 1024, device=device, dtype=dtype)
             q2, SA2 = F.quantize_4bit(A1, blocksize=blocksize, quant_type=quant_type)
             q3, SA3 = F.quantize_4bit(A1, blocksize=blocksize, compress_statistics=True, quant_type=quant_type)
             A2 = F.dequantize_4bit(q2, SA2, quant_type=quant_type)
@@ -1205,6 +1213,9 @@ class TestQuantize4BitFunctional:
     )
     @pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim"))
     def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind):
+        if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage):
+            pytest.skip("This configuration is not supported on HPU.")
+
         errs1 = []
         errs2 = []
         errs3 = []
@@ -1354,6 +1365,9 @@ class TestQuantize4BitFunctional:
         if device == "cpu" and dtype == torch.bfloat16 and torch.__version__ < (2, 3):
             pytest.skip("eye doe not support bfloat16 on CPU in torch < 2.3")

+        if device == "hpu" and not is_supported_on_hpu(storage_type, dtype):
+            pytest.skip("This configuration is not supported on HPU.")
+
         dims = 10
         torch.random.manual_seed(np.random.randint(0, 412424242))
         dims = get_test_dims(0, 8192, n=dims)
...
@@ -13,6 +13,7 @@ from tests.helpers import (
     describe_dtype,
     get_available_devices,
     id_formatter,
+    is_supported_on_hpu,
     torch_load_from_buffer,
     torch_save_to_buffer,
 )
@@ -27,12 +28,17 @@ storage = {
 @pytest.mark.parametrize("device", get_available_devices())
 @pytest.mark.parametrize("quant_storage", ["uint8", "float16", "bfloat16", "float32"])
+@pytest.mark.parametrize("original_dtype", [torch.float16, torch.bfloat16])
 @pytest.mark.parametrize("bias", TRUE_FALSE, ids=id_formatter("bias"))
 @pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
 @pytest.mark.parametrize("quant_type", ["nf4", "fp4"])
 @pytest.mark.parametrize("save_before_forward", TRUE_FALSE, ids=id_formatter("save_before_forward"))
-def test_linear_serialization(device, quant_type, compress_statistics, bias, quant_storage, save_before_forward):
-    original_dtype = torch.float16
+def test_linear_serialization(
+    device, quant_type, original_dtype, compress_statistics, bias, quant_storage, save_before_forward
+):
+    if device == "hpu" and not is_supported_on_hpu(quant_type, original_dtype, storage[quant_storage]):
+        pytest.skip("This configuration is not supported on HPU.")
+
     compute_dtype = None
     layer_shape = (300, 400)
@@ -188,6 +194,9 @@ def test_linear_serialization(device, quant_type, compress_statistics, bias, qua
 @pytest.mark.parametrize("blocksize", [64, 128])
 @pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
 def test_copy_param(device, quant_type, blocksize, compress_statistics):
+    if device == "hpu" and not is_supported_on_hpu(quant_type):
+        pytest.skip("This configuration is not supported on HPU.")
+
     tensor = torch.randn(300, 400)
     param = bnb.nn.Params4bit(
         data=tensor,
@@ -207,6 +216,9 @@ def test_copy_param(device, quant_type, blocksize, compress_statistics):
 @pytest.mark.parametrize("blocksize", [64, 128])
 @pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
 def test_deepcopy_param(device, quant_type, blocksize, compress_statistics):
+    if device == "hpu" and not is_supported_on_hpu(quant_type):
+        pytest.skip("This configuration is not supported on HPU.")
+
     tensor = torch.randn(300, 400)
     param = bnb.nn.Params4bit(
         data=tensor,
@@ -233,6 +245,9 @@ def test_deepcopy_param(device, quant_type, blocksize, compress_statistics):
 @pytest.mark.parametrize("blocksize", [64, 128])
 @pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
 def test_params4bit_real_serialization(device, quant_type, blocksize, compress_statistics):
+    if device == "hpu" and not is_supported_on_hpu(quant_type):
+        pytest.skip("This configuration is not supported on HPU.")
+
     original_tensor = torch.randn(300, 400)
     original_param = bnb.nn.Params4bit(
         data=original_tensor,
@@ -270,6 +285,9 @@ def test_params4bit_real_serialization(device, quant_type, blocksize, compress_s
 @pytest.mark.parametrize("mode", ["default", "reduce-overhead"], ids=id_formatter("mode"))
 @pytest.mark.skipif(torch.__version__ < (2, 4), reason="Not supported in torch < 2.4")
 def test_linear4bit_torch_compile(device, quant_type, compute_dtype, compress_statistics, bias, fullgraph, mode):
+    if device == "hpu" and not is_supported_on_hpu(quant_type):
+        pytest.skip("This configuration is not supported on HPU.")
+
     if fullgraph and torch.__version__ < (2, 8, 0, "dev"):
         pytest.skip("fullgraph mode requires torch 2.8 or higher")
@@ -314,7 +332,8 @@ def test_linear4bit_torch_compile(device, quant_type, compute_dtype, compress_st
     ref_output = net(x)

     # Compile the model
-    compiled_net = torch.compile(net, fullgraph=fullgraph, mode=mode)
+    compile_backend = "hpu_backend" if device == "hpu" else "inductor"
+    compiled_net = torch.compile(net, fullgraph=fullgraph, mode=mode, backend=compile_backend)

     # Get output from compiled model
     with torch.no_grad():
...
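One detail worth noting from the test_linear4bit change above: torch.compile defaults to the Inductor backend, which does not target Gaudi, so the test now selects "hpu_backend" when running on HPU and keeps "inductor" otherwise. A standalone sketch of the same device-dependent backend selection, assuming the habana_frameworks PyTorch bridge registers torch.hpu when installed (the toy model is illustrative only):

```python
import torch
import torch.nn as nn

# Fall back to CPU when no Gaudi device/bridge is present (torch.hpu only exists
# once the habana_frameworks PyTorch bridge has been imported).
device = "hpu" if hasattr(torch, "hpu") and torch.hpu.is_available() else "cpu"

net = nn.Sequential(nn.Linear(16, 16), nn.ReLU(), nn.Linear(16, 4)).to(device)
x = torch.randn(8, 16, device=device)

# Same selection as in the diff: Gaudi uses its own compile backend,
# everything else keeps the default "inductor".
compile_backend = "hpu_backend" if device == "hpu" else "inductor"
compiled_net = torch.compile(net, backend=compile_backend)

with torch.no_grad():
    out = compiled_net(x)
print(out.shape)  # torch.Size([8, 4])
```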
@@ -5,7 +5,7 @@ import torch
 import bitsandbytes
 from bitsandbytes.functional import ipex_xpu
-from tests.helpers import TRUE_FALSE, get_available_devices, id_formatter
+from tests.helpers import TRUE_FALSE, get_available_devices, id_formatter, is_supported_on_hpu

 # torch.library.opcheck is only available in torch 2.4 and later.
 # When testing with older versions, we will skip it as a no-op.
@@ -158,6 +158,9 @@ class Test4bitBlockwiseQuantOps:
     @pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
     @pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
     def test_quantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
+        if device == "hpu" and not is_supported_on_hpu(quant_type, dtype, storage_dtype):
+            pytest.skip("This configuration is not supported on HPU.")
+
         A = torch.randn(1024, 1024, dtype=dtype, device=device)

         out, absmax = torch.ops.bitsandbytes.quantize_4bit.default(A, blocksize, quant_type, storage_dtype)
@@ -179,6 +182,9 @@ class Test4bitBlockwiseQuantOps:
     @pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
     @pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
     def test_dequantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
+        if device == "hpu" and not is_supported_on_hpu(quant_type, dtype, storage_dtype):
+            pytest.skip("This configuration is not supported on HPU.")
+
         shape = (128, 128)

         n = prod(shape)
@@ -210,6 +216,9 @@ class Test4bitBlockwiseQuantOps:
     @pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
     @pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
     def test_gemv_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
+        if device == "hpu" and not is_supported_on_hpu(quant_type, dtype, storage_dtype):
+            pytest.skip("This configuration is not supported on HPU.")
+
         out_features = 1024
         in_features = 256
...