Unverified Commit 6a07ffe0 authored by Matthew Douglas, committed by GitHub

Test improvements (#1750)

* Test suite improvements for MPS/XPU/HPU

* Skip test on torch==2.8.0+cpu for Windows regression
parent d731fc42
@@ -372,7 +372,7 @@ jobs:
             pypi_index: "https://download.pytorch.org/whl/cu128"
           - cuda_version: "12.9.1"
             torch_version: "2.8.0"
-            pypi_index: "https://download.pytorch.org/whl/test/cu129"
+            pypi_index: "https://download.pytorch.org/whl/cu129"
           # Linux L40S runners
...
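As a quick sanity check that a matrix entry installed the intended wheel (illustrative only, not part of the commit), the torch build metadata can be inspected from Python; for the cu129 entry above the reported CUDA version should be 12.9:

import torch

# Expect something like "2.8.0+cu129" and "12.9" for the matrix entry above.
print(torch.__version__)
print(torch.version.cuda)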
@@ -34,6 +34,8 @@ def pytest_runtest_teardown(item, nextitem):
     gc.collect()
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
+    elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
+        torch.mps.empty_cache()

 @pytest.fixture(scope="session")
...
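For context, a minimal sketch (not the repository's conftest) of the per-test cache-clearing idea the hunk above extends. The xpu branch is an assumption about Intel GPU builds of recent PyTorch and is guarded with hasattr so the helper still runs on builds without that backend:

import gc

import torch


def empty_device_caches() -> None:
    # Release cached allocator memory on whichever accelerator backend is present.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
        torch.mps.empty_cache()
    elif hasattr(torch, "xpu") and torch.xpu.is_available():
        torch.xpu.empty_cache()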
 import math
+import platform
 import random
 import time
 import einops
-import numpy as np
+from packaging import version
 import pytest
 import torch
@@ -101,16 +102,16 @@ class Test8BitBlockwiseQuantizeFunctional:
     def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize, signed):
         iters = 100
-        if device == "cpu":
+        if device != "cuda":
             iters = 10
-            # This test is slow on CPU, so avoid atypical use cases.
+            # This test is slow in our non-CUDA implementations, so avoid atypical use cases.
             if nested:
                 pytest.skip("Not a typical use case.")
             if blocksize != 256:
-                pytest.skip("Only blocksize 256 is used in CPU/XPU")
+                pytest.skip("Only blocksize 256 is used in CPU/MPS/XPU")
             if dtype != torch.float32:
-                pytest.skip("Only float32 is used in CPU/XPU")
+                pytest.skip("Only float32 is used in CPU/MPS/XPU")
         diffs = []
         reldiffs = []
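To illustrate what the test above measures, here is a standalone round-trip sketch using the same bitsandbytes.functional calls; the blocksize, device, and epsilon are arbitrary choices for the example, not values taken from the test suite:

import torch
import bitsandbytes.functional as F

# Quantize a random tensor blockwise to 8-bit, dequantize it, and report the error.
A1 = torch.randn(1024, 1024, device="cpu")
C, SC = F.quantize_blockwise(A1, blocksize=256)
A2 = F.dequantize_blockwise(C, SC)
print("mean abs error:", (A1 - A2).abs().mean().item())
print("mean rel error:", ((A1 - A2).abs() / (A1.abs() + 1e-8)).mean().item())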
@@ -239,7 +240,7 @@ class Test8BitBlockwiseQuantizeFunctional:
         abserr = []
         relerr = []
-        for i in range(100):
+        for i in range(10):
             A1 = torch.randn(1024, 1024, device=device)
             C, SC = F.quantize_blockwise(A1, code=code)
             A2 = F.dequantize_blockwise(C, SC)
@@ -253,7 +254,7 @@ class Test8BitBlockwiseQuantizeFunctional:
         abserr = []
         relerr = []
-        for i in range(100):
+        for i in range(10):
             A1 = torch.rand(1024, 1024, device=device)
             C, SC = F.quantize_blockwise(A1, code=code)
             A2 = F.dequantize_blockwise(C, SC)
@@ -267,7 +268,7 @@ class Test8BitBlockwiseQuantizeFunctional:
         abserr = []
         relerr = []
-        for i in range(100):
+        for i in range(10):
            A1 = torch.randn(1024, 1024, device=device)
            C, SC = F.quantize_blockwise(A1)
            A2 = F.dequantize_blockwise(C, SC)
@@ -1406,20 +1407,21 @@ class TestQuantize4BitFunctional:
     @pytest.mark.parametrize("device", get_available_devices())
     @pytest.mark.parametrize("storage_type", ["nf4", "fp4"], ids=["nf4", "fp4"])
     @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype)
-    @pytest.mark.parametrize("double_quant", [False], ids=["DQ_True"])
     @pytest.mark.skipif(
         HIP_ENVIRONMENT and ROCM_GPU_ARCH == "gfx90a",
         reason="this test is not supported on ROCm with gfx90a architecture yet",
     )
-    def test_gemv_eye_4bit(self, device, storage_type, dtype, double_quant):
+    def test_gemv_eye_4bit(self, device, storage_type, dtype):
         if device == "cpu" and dtype == torch.bfloat16 and torch.__version__ < (2, 3):
             pytest.skip("eye doe not support bfloat16 on CPU in torch < 2.3")
         if device == "hpu" and not is_supported_on_hpu(storage_type, dtype):
             pytest.skip("This configuration is not supported on HPU.")
-        dims = 10
-        torch.random.manual_seed(np.random.randint(0, 412424242))
+        if device == "cpu" and platform.system() == "Windows" and version.parse(torch.__version__).release == (2, 8, 0):
+            pytest.skip("Regression: CPU crash on Windows with torch 2.8.0")
+
+        dims = 4
         dims = get_test_dims(0, 8192, n=dims)
         dims = [dim + (64 - (dim % 64)) for dim in dims]
         # for dim in [576, 5120, 3520, 5184, 1280, 4992, 5312, 2048]:
@@ -1427,7 +1429,7 @@ class TestQuantize4BitFunctional:
             A = torch.normal(0, 0.1, size=(1, 1, dim), dtype=dtype, device=device)
             B = torch.eye(dim, dtype=dtype, device=device)
-            qB, state = F.quantize_4bit(B, quant_type=storage_type, compress_statistics=double_quant)
+            qB, state = F.quantize_4bit(B, quant_type=storage_type, compress_statistics=False)
             C3 = torch.matmul(A, B.t())
             C2 = bnb.matmul_4bit(A, qB.t(), state)
             A.requires_grad = True
...
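The Windows skip added above keys off the parsed release tuple rather than the raw version string, so local build suffixes do not matter. A standalone sketch of the same check, using only the imports the diff relies on (platform, torch, packaging):

import platform

import torch
from packaging import version


def is_torch_280_cpu_regression(device: str) -> bool:
    # True only for CPU runs on Windows with a torch 2.8.0 build; the .release
    # tuple ignores local suffixes such as "+cpu" or "+cu129".
    return (
        device == "cpu"
        and platform.system() == "Windows"
        and version.parse(torch.__version__).release == (2, 8, 0)
    )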
@@ -172,6 +172,10 @@ optimizer_names_32bit = [
 @pytest.mark.parametrize("device", get_available_devices(no_cpu=True), ids=id_formatter("device"))
 @pytest.mark.skipif(not get_available_devices(no_cpu=True), reason="No device")
 def test_optimizer32bit(dim1, dim2, gtype, optim_name, device):
+    if device not in ["cuda", "xpu"]:
+        pytest.skip("Optimizers are only supported on CUDA and XPU")
+
     if optim_name.startswith("paged_") and sys.platform == "win32":
         pytest.skip("Paged optimizers can have issues on Windows.")
@@ -253,6 +257,9 @@ def test_optimizer32bit(dim1, dim2, gtype, optim_name, device):
 @pytest.mark.parametrize("device", get_available_devices(no_cpu=True))
 @pytest.mark.skipif(not get_available_devices(no_cpu=True), reason="No device")
 def test_global_config(dim1, dim2, gtype, device):
+    if device not in ["cuda", "xpu"]:
+        pytest.skip("Optimizers are only supported on CUDA and XPU")
+
     if dim1 == 1 and dim2 == 1:
         return
     p1 = torch.randn(dim1, dim2, device="cpu", dtype=gtype) * 0.1
@@ -310,6 +317,10 @@ optimizer_names_8bit = [
 @pytest.mark.parametrize("device", get_available_devices(no_cpu=True))
 @pytest.mark.skipif(not get_available_devices(no_cpu=True), reason="No device")
 def test_optimizer8bit(dim1, dim2, gtype, optim_name, device):
+    if device not in ["cuda", "xpu"]:
+        pytest.skip("8-bit optimizers are only supported on CUDA and XPU")
+
     torch.set_printoptions(precision=6)
     if dim1 == 1 and dim2 == 1:
...
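The three optimizer tests above now repeat the same device gate; a hypothetical helper (not part of the commit) that would factor it out could look like this:

import pytest


def skip_unless_device(device: str, supported=("cuda", "xpu"), what="Optimizers") -> None:
    # Skip the calling test when the parametrized device is not a supported backend.
    if device not in supported:
        pytest.skip(f"{what} are only supported on {' and '.join(d.upper() for d in supported)}")

Each test would then start with skip_unless_device(device), or skip_unless_device(device, what="8-bit optimizers"), instead of the inlined checks.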