Unverified Commit f1c75741 authored by Aarni Koskela, committed by GitHub

Tests: improve CUDA support detection (#985)

* automatically skip any test that implicitly uses CUDA on a non-CUDA box
* add a `requires_cuda` fixture
parent 53f8af8c
import pytest
import torch


def pytest_runtest_call(item):
    try:
        item.runtest()
    except AssertionError as ae:
        if str(ae) == "Torch not compiled with CUDA enabled":
            pytest.skip("Torch not compiled with CUDA enabled")
        raise


@pytest.fixture(scope="session")
def requires_cuda() -> bool:
    cuda_available = torch.cuda.is_available()
    if not cuda_available:
        pytest.skip("CUDA is required")
    return cuda_available
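A minimal usage sketch (the test below is hypothetical, not part of this commit): requesting the `requires_cuda` fixture skips a test up front on non-CUDA machines, while the `pytest_runtest_call` hook above catches the AssertionError a CPU-only torch build raises on first CUDA use and converts it into a skip.

import torch


# Hypothetical example test: skipped by the fixture when no CUDA device is available.
def test_gpu_roundtrip(requires_cuda):
    x = torch.randn(4, 4, device="cuda")
    assert x.cpu().shape == (4, 4)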
@@ -40,7 +40,6 @@ names = [
     ids=names,
 )
 def test_matmul(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose):
-    if not torch.cuda.is_available(): pytest.skip('No GPU found.')
     if dim2 > 0:
         dim2 = dim2 - (dim2 % 16)
         dim3 = dim3 - (dim3 % 16)
@@ -307,7 +306,6 @@ def test_matmullt(
     has_fp16_weights,
     has_bias
 ):
-    if not torch.cuda.is_available(): pytest.skip('No GPU found.')
     dimA = (dim2, dim3) if not transpose[0] else (dim3, dim2)
     dimB = (dim3, dim4) if not transpose[1] else (dim4, dim3)
     outlier_dim = torch.randint(0, dimA[1], size=(dimA[1] // 8,), device="cuda")
@@ -461,7 +459,6 @@ quant_type = ['fp4', 'nf4']
 values = list(product(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, has_bias, compress_statistics, quant_type))
 str_values = list(product(dim1, dim2, dim3, dim4, str_funcs, dtype, req_grad_str, str_transpose, has_bias, compress_statistics, quant_type))
 names = ["dim1_{}_dim2_{}_dim3_{}_dim4_{}_func_{}_dtype_{}_requires_grad_{}_transpose_{}_has_bias_{}_compress_statistics_{}_quant_type_{}".format(*vals) for vals in str_values]
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="this test requires a GPU")
 @pytest.mark.parametrize( "dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, has_bias, compress_statistics, quant_type", values, ids=names)
 def test_matmul_4bit( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, has_bias, compress_statistics, quant_type):
     dimA = (dim2, dim3) if not transpose[0] else (dim3, dim2)
@@ -551,7 +548,6 @@ has_fp16_weights = [True, False]
 values = list(product(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose))
 str_values = list(product(dim1, dim2, dim3, dim4, str_funcs, dtype, req_grad_str, str_transpose))
 names = ["dim1_{}_dim2_{}_dim3_{}_dim4_{}_func_{}_dtype_{}_requires_grad_{}_transpose_{}".format(*vals) for vals in str_values]
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="this test requires a GPU")
 @pytest.mark.parametrize( "dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose", values, ids=names)
 def test_matmul_fp8( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose):
     dimA = (dim2, dim3) if not transpose[0] else (dim3, dim2)
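The inline `pytest.skip('No GPU found.')` guards removed above are no longer needed: on a torch build without CUDA support, the first CUDA call inside a test raises an AssertionError, which the new `pytest_runtest_call` hook converts into a skip. A minimal illustration (assumes a CPU-only build):

import torch

try:
    # On a CPU-only build this raises AssertionError("Torch not compiled
    # with CUDA enabled") -- the exact message the conftest hook matches
    # before calling pytest.skip().
    torch.zeros(1, device="cuda")
except AssertionError as ae:
    print(ae)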
...
@@ -5,12 +5,12 @@ from pathlib import Path
 # hardcoded test. Not good, but a sanity check for now
 # TODO: improve this
-def test_manual_override():
+def test_manual_override(requires_cuda):
     manual_cuda_path = str(Path('/mmfs1/home/dettmers/data/local/cuda-12.2'))

     pytorch_version = torch.version.cuda.replace('.', '')

-    assert pytorch_version != 122
+    assert pytorch_version != 122 # TODO: this will never be true...

     os.environ['CUDA_HOME']='{manual_cuda_path}'
     os.environ['BNB_CUDA_VERSION']='122'
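Two things worth noting about this test: `torch.version.cuda.replace('.', '')` returns a string, so comparing it against the int 122 is always true (hence the new TODO), and `'{manual_cuda_path}'` is missing its f-prefix, so CUDA_HOME is set to the literal placeholder text. The env vars themselves exercise bitsandbytes' real override mechanism; a hedged sketch of how it is meant to be used (paths and values illustrative):

import os

# Illustrative values; point CUDA_HOME at an actual local CUDA toolkit.
os.environ['CUDA_HOME'] = '/usr/local/cuda-12.2'
os.environ['BNB_CUDA_VERSION'] = '122'  # load the binary built for CUDA 12.2

import bitsandbytes  # the override is read when the library is imported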
...
@@ -617,7 +617,10 @@ def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, trans
         return
     if dtype == torch.int32 and out_order != "col32":
         return
-    func = F.get_transform_func(dtype, orderA, orderOut, transpose)
+    try:
+        func = F.get_transform_func(dtype, orderA, orderOut, transpose)
+    except ValueError as ve:
+        pytest.skip(str(ve))  # skip if not supported
     if dims == 2:
         A = torch.randint(-128, 127, size=(dim1, dim2), device="cuda").to(dtype)
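The try/except added above illustrates a general pattern for parametrized sweeps: configurations the native library does not implement raise ValueError, and the test converts that into a skip rather than a failure. A generic sketch of the pattern (the helper name is illustrative, not from this commit):

import pytest


def call_or_skip(fn, *args, **kwargs):
    # Treat "unsupported configuration" errors as skips so that a
    # parametrized sweep only fails on genuine bugs.
    try:
        return fn(*args, **kwargs)
    except ValueError as ve:
        pytest.skip(str(ve))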
@@ -2278,7 +2281,6 @@ def test_fp4_quant(dtype):
     assert relerr.item() < 0.28


-@pytest.mark.skipif(not torch.cuda.is_available(), reason="this test requires a GPU")
 @pytest.mark.parametrize("quant_type", ['fp4', 'nf4'])
 def test_4bit_compressed_stats(quant_type):
     for blocksize in [128, 64]:
@@ -2317,7 +2319,6 @@ def test_4bit_compressed_stats(quant_type):

-@pytest.mark.skipif(not torch.cuda.is_available(), reason="this test requires a GPU")
 #@pytest.mark.parametrize("quant_type", ['fp4', 'nf4'])
 @pytest.mark.parametrize("quant_type", ['nf4'])
 def test_bench_4bit_dequant(quant_type):
...
@@ -79,7 +79,7 @@ def model_and_tokenizer(request):
 @pytest.mark.parametrize("DQ", [True, False], ids=['DQ_True', 'DQ_False'])
 @pytest.mark.parametrize("inference_kernel", [True, False], ids=['inference_kernel_True', 'inference_kernel_False'])
 #@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=['fp16', 'bf16', 'fp32'])
-def test_pi(model_and_tokenizer, inference_kernel, DQ):
+def test_pi(requires_cuda, model_and_tokenizer, inference_kernel, DQ):
     print('')
     dtype = torch.float16
...
@@ -15,7 +15,6 @@ storage = {
     'float32': torch.float32
 }

-@pytest.mark.skipif(not torch.cuda.is_available(), reason="this test requires a GPU")
 @pytest.mark.parametrize(
     "quant_type, compress_statistics, bias, quant_storage",
     list(product(["nf4", "fp4"], [False, True], [False, True], ['uint8', 'float16', 'bfloat16', 'float32'])),
...
@@ -33,7 +33,6 @@ def test_layout_exact_match():
     assert torch.all(torch.eq(restored_x, x))

-@pytest.mark.skipif(not torch.cuda.is_available(), reason="this test requires a GPU")
 def test_linear_no_igemmlt():
     linear = torch.nn.Linear(1024, 3072)
     x = torch.randn(3, 1024, dtype=torch.half)
@@ -68,7 +67,6 @@ def test_linear_no_igemmlt():
     assert linear_custom.state.CxB is None

-@pytest.mark.skipif(not torch.cuda.is_available(), reason="this test requires a GPU")
 @pytest.mark.parametrize("has_fp16_weights, serialize_before_forward, deserialize_before_cuda, force_no_igemmlt",
                          list(product([False, True], [False, True], [False, True], [False, True])))
 def test_linear_serialization(has_fp16_weights, serialize_before_forward, deserialize_before_cuda, force_no_igemmlt):
...
@@ -520,7 +520,6 @@ modules.append(lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compute_dtype=torch.float
 modules.append(lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compute_dtype=torch.float16))
 modules.append(lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compute_dtype=torch.bfloat16))
 names = ['Int8Lt', '4bit', 'FP4', 'NF4', 'FP4+C', 'NF4+C', 'NF4+fp32', 'NF4+fp16', 'NF4+bf16']

-@pytest.mark.skipif(not torch.cuda.is_available(), reason="this test requires a GPU")
 @pytest.mark.parametrize("module", modules, ids=names)
 def test_kbit_backprop(module):
     b = 17
...