Unverified commit 70bbbb92 authored by Chetan Kumar Verma, committed by GitHub

HPU support for unit tests (#1680)

parent d863adb2
@@ -29,8 +29,6 @@ def _(
if A.dtype != torch.uint8:
A = A.view(torch.uint8)
transpose = False if len(A.shape) == 2 and A.shape[0] == 1 else True
A = A.reshape(-1)
if GAUDI_SW_VER and (GAUDI_SW_VER.major < 1 or GAUDI_SW_VER.minor < 22):
@@ -47,7 +45,4 @@ def _(
output = out_dq.reshape(shape)
if transpose:
output = output.t()
return output
@@ -98,3 +98,14 @@ DTYPE_NAMES = {
def describe_dtype(dtype: torch.dtype) -> str:
return DTYPE_NAMES.get(dtype) or str(dtype).rpartition(".")[2]
def is_supported_on_hpu(
quant_type: str = "nf4", dtype: torch.dtype = torch.bfloat16, quant_storage: torch.dtype = torch.uint8
) -> bool:
"""
Check if the given quant_type, dtype and quant_storage are supported on HPU.
"""
if quant_type == "fp4" or dtype == torch.float16 or quant_storage not in (torch.uint8, torch.bfloat16):
return False
return True
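
As a rough usage sketch (not part of the diff itself): the helper is imported from tests.helpers and used as a skip guard in device-parametrized tests, as the hunks below do. The test name, shapes, and parameter values here are illustrative only.

import pytest
import torch

from tests.helpers import get_available_devices, is_supported_on_hpu


@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16])
def test_example_nf4_roundtrip(device, dtype):
    # Skip combinations the HPU backend does not implement (fp4 quant_type,
    # float16 compute dtype, or unsupported quant_storage dtypes).
    if device == "hpu" and not is_supported_on_hpu("nf4", dtype):
        pytest.skip("This configuration is not supported on HPU.")
    A = torch.randn(256, 256, device=device, dtype=dtype)
    # ... exercise quantize/dequantize as the real tests below do ...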
@@ -8,6 +8,7 @@ from tests.helpers import (
describe_dtype,
get_available_devices,
id_formatter,
is_supported_on_hpu,
)
TRANSPOSE_VALS = [(False, True), (False, False)]
@@ -189,6 +190,9 @@ def test_matmul_4bit(
if device == "cpu" and dtype != torch.float32 and any(req_grad) and torch.__version__ < (2, 6):
pytest.xfail("mse_loss fp16 on CPU is not supported in torch < 2.6")
if device == "hpu" and not is_supported_on_hpu(quant_type, dtype):
pytest.skip("This configuration is not supported on HPU.")
for i in range(3):
# normal multiply
if funcs[0] in [torch.mm, torch.matmul]:
......
@@ -16,6 +16,7 @@ from tests.helpers import (
get_available_devices,
get_test_dims,
id_formatter,
is_supported_on_hpu,
)
torch.set_printoptions(precision=5, sci_mode=False, linewidth=120, edgeitems=20, threshold=10000)
@@ -1101,6 +1102,9 @@ class TestQuantize4BitFunctional:
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
@pytest.mark.parametrize("blocksize", [64, 128, 256, 512, 1024, 2048, 4096])
def test_4bit_quant(self, device, dtype, quant_type, blocksize):
if device == "hpu" and not is_supported_on_hpu(quant_type, dtype):
pytest.skip("This configuration is not supported on HPU.")
A1 = torch.randn(1024, 1024, device=device, dtype=dtype)
qa, SA = F.quantize_4bit(A1, blocksize=blocksize, quant_type=quant_type)
A2 = F.dequantize_4bit(qa, SA, blocksize=blocksize, quant_type=quant_type)
@@ -1132,11 +1136,15 @@ class TestQuantize4BitFunctional:
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
@pytest.mark.parametrize("blocksize", [64, 128], ids=id_formatter("blocksize"))
def test_4bit_compressed_stats(self, device, quant_type, blocksize):
@pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=describe_dtype)
def test_4bit_compressed_stats(self, device, quant_type, blocksize, dtype):
if device == "hpu" and not is_supported_on_hpu(quant_type, dtype):
pytest.skip("FP4 quantization is not supported on HPU.")
errs1 = []
errs2 = []
for i in range(10):
A1 = torch.randn(1024, 1024, device=device).half()
A1 = torch.randn(1024, 1024, device=device, dtype=dtype)
q2, SA2 = F.quantize_4bit(A1, blocksize=blocksize, quant_type=quant_type)
q3, SA3 = F.quantize_4bit(A1, blocksize=blocksize, compress_statistics=True, quant_type=quant_type)
A2 = F.dequantize_4bit(q2, SA2, quant_type=quant_type)
@@ -1205,6 +1213,9 @@ class TestQuantize4BitFunctional:
)
@pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim"))
def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind):
if device == "hpu" and not is_supported_on_hpu(storage_type, dtype, quant_storage):
pytest.skip("This configuration is not supported on HPU.")
errs1 = []
errs2 = []
errs3 = []
@@ -1354,6 +1365,9 @@ class TestQuantize4BitFunctional:
if device == "cpu" and dtype == torch.bfloat16 and torch.__version__ < (2, 3):
pytest.skip("eye doe not support bfloat16 on CPU in torch < 2.3")
if device == "hpu" and not is_supported_on_hpu(storage_type, dtype):
pytest.skip("This configuration is not supported on HPU.")
dims = 10
torch.random.manual_seed(np.random.randint(0, 412424242))
dims = get_test_dims(0, 8192, n=dims)
......
@@ -13,6 +13,7 @@ from tests.helpers import (
describe_dtype,
get_available_devices,
id_formatter,
is_supported_on_hpu,
torch_load_from_buffer,
torch_save_to_buffer,
)
@@ -27,12 +28,17 @@ storage = {
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("quant_storage", ["uint8", "float16", "bfloat16", "float32"])
@pytest.mark.parametrize("original_dtype", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("bias", TRUE_FALSE, ids=id_formatter("bias"))
@pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
@pytest.mark.parametrize("quant_type", ["nf4", "fp4"])
@pytest.mark.parametrize("save_before_forward", TRUE_FALSE, ids=id_formatter("save_before_forward"))
def test_linear_serialization(device, quant_type, compress_statistics, bias, quant_storage, save_before_forward):
original_dtype = torch.float16
def test_linear_serialization(
device, quant_type, original_dtype, compress_statistics, bias, quant_storage, save_before_forward
):
if device == "hpu" and not is_supported_on_hpu(quant_type, original_dtype, storage[quant_storage]):
pytest.skip("This configuration is not supported on HPU.")
compute_dtype = None
layer_shape = (300, 400)
@@ -188,6 +194,9 @@ def test_linear_serialization(device, quant_type, compress_statistics, bias, quant_storage, save_before_forward):
@pytest.mark.parametrize("blocksize", [64, 128])
@pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
def test_copy_param(device, quant_type, blocksize, compress_statistics):
if device == "hpu" and not is_supported_on_hpu(quant_type):
pytest.skip("This configuration is not supported on HPU.")
tensor = torch.randn(300, 400)
param = bnb.nn.Params4bit(
data=tensor,
@@ -207,6 +216,9 @@ def test_copy_param(device, quant_type, blocksize, compress_statistics):
@pytest.mark.parametrize("blocksize", [64, 128])
@pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
def test_deepcopy_param(device, quant_type, blocksize, compress_statistics):
if device == "hpu" and not is_supported_on_hpu(quant_type):
pytest.skip("This configuration is not supported on HPU.")
tensor = torch.randn(300, 400)
param = bnb.nn.Params4bit(
data=tensor,
@@ -233,6 +245,9 @@ def test_deepcopy_param(device, quant_type, blocksize, compress_statistics):
@pytest.mark.parametrize("blocksize", [64, 128])
@pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
def test_params4bit_real_serialization(device, quant_type, blocksize, compress_statistics):
if device == "hpu" and not is_supported_on_hpu(quant_type):
pytest.skip("This configuration is not supported on HPU.")
original_tensor = torch.randn(300, 400)
original_param = bnb.nn.Params4bit(
data=original_tensor,
@@ -270,6 +285,9 @@ def test_params4bit_real_serialization(device, quant_type, blocksize, compress_statistics):
@pytest.mark.parametrize("mode", ["default", "reduce-overhead"], ids=id_formatter("mode"))
@pytest.mark.skipif(torch.__version__ < (2, 4), reason="Not supported in torch < 2.4")
def test_linear4bit_torch_compile(device, quant_type, compute_dtype, compress_statistics, bias, fullgraph, mode):
if device == "hpu" and not is_supported_on_hpu(quant_type):
pytest.skip("This configuration is not supported on HPU.")
if fullgraph and torch.__version__ < (2, 8, 0, "dev"):
pytest.skip("fullgraph mode requires torch 2.8 or higher")
@@ -314,7 +332,8 @@ def test_linear4bit_torch_compile(device, quant_type, compute_dtype, compress_statistics, bias, fullgraph, mode):
ref_output = net(x)
# Compile the model
compiled_net = torch.compile(net, fullgraph=fullgraph, mode=mode)
compile_backend = "hpu_backend" if device == "hpu" else "inductor"
compiled_net = torch.compile(net, fullgraph=fullgraph, mode=mode, backend=compile_backend)
# Get output from compiled model
with torch.no_grad():
......
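
The torch.compile change above selects Habana's compile backend when running on HPU instead of the default inductor backend. A minimal sketch of that selection, assuming an already constructed module (the helper name is illustrative, not part of the change):

import torch
import torch.nn as nn


def compile_for_device(net: nn.Module, device: str, fullgraph: bool = False, mode: str = "default"):
    # HPU graphs go through Habana's "hpu_backend"; every other device keeps
    # the default inductor backend, mirroring the test change above.
    backend = "hpu_backend" if device == "hpu" else "inductor"
    return torch.compile(net, fullgraph=fullgraph, mode=mode, backend=backend)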
@@ -5,7 +5,7 @@ import torch
import bitsandbytes
from bitsandbytes.functional import ipex_xpu
from tests.helpers import TRUE_FALSE, get_available_devices, id_formatter
from tests.helpers import TRUE_FALSE, get_available_devices, id_formatter, is_supported_on_hpu
# torch.library.opcheck is only available in torch 2.4 and later.
# When testing with older versions, we will skip it as a no-op.
@@ -158,6 +158,9 @@ class Test4bitBlockwiseQuantOps:
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
@pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
def test_quantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
if device == "hpu" and not is_supported_on_hpu(quant_type, dtype, storage_dtype):
pytest.skip("This configuration is not supported on HPU.")
A = torch.randn(1024, 1024, dtype=dtype, device=device)
out, absmax = torch.ops.bitsandbytes.quantize_4bit.default(A, blocksize, quant_type, storage_dtype)
@@ -179,6 +182,9 @@ class Test4bitBlockwiseQuantOps:
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
@pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
def test_dequantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
if device == "hpu" and not is_supported_on_hpu(quant_type, dtype, storage_dtype):
pytest.skip("This configuration is not supported on HPU.")
shape = (128, 128)
n = prod(shape)
@@ -210,6 +216,9 @@ class Test4bitBlockwiseQuantOps:
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
@pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
def test_gemv_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
if device == "hpu" and not is_supported_on_hpu(quant_type, dtype, storage_dtype):
pytest.skip("This configuration is not supported on HPU.")
out_features = 1024
in_features = 256
......
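
For reference, a sketch of how the op-level guard above combines with a direct call to the registered quantize op and, on torch >= 2.4, torch.library.opcheck (the wrapper function and tensor sizes are illustrative, not part of the change):

import torch

from tests.helpers import is_supported_on_hpu


def check_quantize_4bit(device, dtype=torch.bfloat16, quant_type="nf4", blocksize=64, storage_dtype=torch.uint8):
    # Skip configurations the HPU backend does not support, as in Test4bitBlockwiseQuantOps.
    if device == "hpu" and not is_supported_on_hpu(quant_type, dtype, storage_dtype):
        return None
    A = torch.randn(1024, 1024, dtype=dtype, device=device)
    # Direct call to the registered custom op, as in test_quantize_4bit.
    out, absmax = torch.ops.bitsandbytes.quantize_4bit.default(A, blocksize, quant_type, storage_dtype)
    # torch.library.opcheck validates the op registration; it only exists in torch >= 2.4.
    if torch.__version__ >= (2, 4):
        torch.library.opcheck(
            torch.ops.bitsandbytes.quantize_4bit.default, (A, blocksize, quant_type, storage_dtype)
        )
    return out, absmax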