Unverified Commit 503d243e authored by Matthew Douglas's avatar Matthew Douglas Committed by GitHub
Browse files

General cleanup & test improvements (#1646)

* General cleanup & test improvements

* Tests: WA numpy 2 compat issue for torch<2.3

* Tests: update aarch64 cpu min torch version

* Tests: update aarch64 cpu min torch version

* Tests: update aarch64 cpu min torch version
parent e99ac0a1
......@@ -93,24 +93,32 @@ jobs:
path: output/${{ matrix.os }}/${{ matrix.arch }}/*
retention-days: 7
cpu-tests:
test-cpu:
if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
needs: build-cpu
strategy:
fail-fast: false
matrix:
os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025, macos-15]
torch_version: ["2.6.0", "2.7.0"]
# Test with the oldest supported torch version and the two newest.
torch_version: ["2.2.2", "2.6.0", "2.7.0"]
include:
- os: ubuntu-22.04
arch: x86_64
runner: banb-aws-general-8-plus-use1-public-80
- os: ubuntu-22.04-arm
arch: aarch64
- os: ubuntu-22.04-arm
arch: aarch64
torch_version: "2.5.1"
- os: windows-2025
arch: x86_64
- os: macos-15
arch: arm64
exclude:
- os: ubuntu-22.04-arm
torch_version: "2.2.2"
runs-on: ${{ matrix.runner || matrix.os }}
env:
BNB_TEST_DEVICE: cpu
......@@ -135,6 +143,11 @@ jobs:
pip install -e ".[test]"
pip install pytest-cov
# We need to downgrade to numpy<2 for torch<2.3 compatibility.
- name: Downgrade NumPy
if: startsWith(matrix.torch_version, '2.2.')
run: pip install "numpy<2"
- name: Show installed packages
run: pip list
......@@ -144,7 +157,7 @@ jobs:
- name: Run tests
run: pytest --durations=100
# cuda-aarch64-tests:
# test-cuda-aarch64:
# if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
# needs: build-cuda
# strategy:
......@@ -167,7 +180,7 @@ jobs:
cuda-tests:
test-cuda:
if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
needs: build-cuda
strategy:
......@@ -179,7 +192,7 @@ jobs:
cuda_version: ["11.8.0", "12.6.3", "12.8.1"]
include:
- cuda_version: "11.8.0"
torch_version: "2.4.1"
torch_version: "2.2.2"
pypi_index: "https://download.pytorch.org/whl/cu118"
- cuda_version: "12.6.3"
torch_version: "2.6.0"
......@@ -238,6 +251,11 @@ jobs:
pip install -e ".[test]"
pip install pytest-cov
# We need to downgrade to numpy<2 for torch<2.3 compatibility.
- name: Downgrade NumPy
if: startsWith(matrix.torch_version, '2.2.')
run: pip install "numpy<2"
- name: Show installed packages
run: pip list
......
"""
Extracted from tests/test_functional.py
Note: This feature is currently unused! It is kept here for archival purposes.
Usage: pytest benchmarking/int8/row_scale_benchmark.py
"""
import time
import pytest
import torch
from bitsandbytes import functional as F
k = 20
torch.set_printoptions(precision=5, sci_mode=False, linewidth=120, edgeitems=20, threshold=10000)
@pytest.mark.parametrize(
    ("dim1", "dim4", "inner"),
    [
        pytest.param(1024, 12288 * 4, 12288, id="1024, 12288*4, 12288"),
        pytest.param(2048, 4096 * 4, 4096, id="2048, 4096*4, 4096"),
    ],
)
@pytest.mark.skip("Row scale has some bugs for ampere")
@pytest.mark.benchmark
def test_row_scale_bench(dim1: int, dim4: int, inner: int):
    """Benchmark three GEMM variants of A(dim1, inner) @ B(dim4, inner).T.

    Runs `k` iterations of each variant and prints wall-clock seconds:
      1. fp16 ``torch.matmul`` baseline,
      2. int8 matmul with a per-row scale (``row_scale``),
      3. plain vector-wise int8 matmul.

    Requires CUDA. Currently skipped: the row-scale kernel has known bugs on
    Ampere (see the ``skip`` mark above).
    """
    formatB = F.get_special_format_str()
    # NOTE(review): these error accumulators are never filled or asserted;
    # kept only for archival fidelity with the test this was extracted from.
    err1, err2, err3 = [], [], []
    relerr1, relerr2 = [], []
    scale = 1
    A = torch.randn(dim1, inner, device="cuda").half()
    B = torch.randn(dim4, inner, device="cuda").half()
    torch.nn.init.xavier_uniform_(B)

    # warmup
    for i in range(k):
        C1 = torch.matmul(A, B.t())

    # 1) fp16 baseline timing.
    torch.cuda.synchronize()
    t0 = time.time()
    for i in range(k):
        C1 = torch.matmul(A, B.t())
    torch.cuda.synchronize()
    print("16", time.time() - t0)

    # Quantize A (int8, col32 layout) and B (linear vector-wise) once,
    # outside the timed loops, so only the matmul itself is measured.
    C1a, C1b, stats1a, stats1b, coo_tensor = F.int8_double_quant(A)
    CB, absmaxB = F.vectorwise_quant(B, quant_type="linear")
    A2, SA = F.nvidia_transform(C1a, "col32")
    B2, SB = F.nvidia_transform(CB, formatB)
    A1, maxA = F.vectorwise_quant(A, dim=1)

    # Per-row scale: row absmax of A over a constant. Presumably this keeps
    # the int8 output in range — TODO confirm against int8_linear_matmul docs.
    c = 10.0 * inner * scale
    row_scale = maxA / c

    # 2) int8 matmul with row-wise scaling.
    torch.cuda.synchronize()
    t0 = time.time()
    for i in range(k):
        outC32 = F.int8_linear_matmul(A2, B2, dtype=torch.int8, row_scale=row_scale)
    torch.cuda.synchronize()
    print("row-wise", time.time() - t0)

    # 3) plain vector-wise int8 matmul (B re-quantized via double-quant).
    C2a, C2b, stats2a, stats2b, coo_tensor = F.int8_double_quant(B)
    B2, SB = F.nvidia_transform(C2a, formatB)
    torch.cuda.synchronize()
    t0 = time.time()
    for i in range(k):
        outC32 = F.int8_linear_matmul(A2, B2)
    torch.cuda.synchronize()
    print("vector-wise", time.time() - t0)
#!/bin/bash
# Builds libbitsandbytes for CPU and a matrix of CUDA toolkit versions (with
# and without cuBLASLt / 8-bit matmul support), then builds the Python package
# and uploads it to PyPI.
#
# Usage: ./script.sh <BASE_PATH>
#   BASE_PATH: directory containing the CUDA toolkit installs, one per
#              version, e.g. $BASE_PATH/cuda-11.8
#
# Exits with status 64 as soon as any expected shared object is missing.
BASE_PATH=$1

echo "MAKE SURE LD_LIBRARY_PATH IS EMPTY!"
echo $LD_LIBRARY_PATH
if [[ ! -z "${LD_LIBRARY_PATH}" ]]; then
    echo "Compilation unsuccessful!" 1>&2
    exit 64
fi

# FIX: the original used `&&`, which printed the "no module function" notice
# when `module unload` SUCCEEDED. `||` prints it only when `module` is
# unavailable or fails, which is what the message says.
module unload cuda || echo "no module function available. Probably not on a slurm cluster."
module unload gcc || echo "no module function available. Probably not on a slurm cluster."

rm -rf dist build
make cleaneggs
make cleanlibs

# build_one <cuda_home> <make_target> <cuda_version> <expected_lib>
#   Cleans the build tree, compiles one library variant, and aborts the whole
#   script (exit 64) if the expected .so was not produced under ./bitsandbytes.
build_one () {
    rm -rf build/*
    export CUDA_HOME="$1"
    make "$2" CUDA_VERSION="$3"
    if [ ! -f "./bitsandbytes/$4" ]; then
        echo "Compilation unsuccessful!" 1>&2
        exit 64
    fi
}

# CPU-only build (no CUDA toolkit involved).
export CUDA_VERSION=
build_one "" cpuonly "CPU" libbitsandbytes_cpu.so

# Build matrix: <toolkit dir>:<make target>:<CUDA_VERSION>
CUDA_SPECS="
cuda-11.0:cuda110:110
cuda-11.1:cuda11x:111
cuda-11.4:cuda11x:114
cuda-11.5:cuda11x:115
cuda-11.7:cuda11x:117
cuda-11.8:cuda118:118
cuda-12.0:cuda12x:120
cuda-12.1:cuda12x:121
cuda-12.2:cuda12x:122
cuda-12.3:cuda12x:123
"

# Full builds (with cuBLASLt / 8-bit matmul support).
for spec in $CUDA_SPECS; do
    IFS=: read -r dir target ver <<< "$spec"
    build_one "$BASE_PATH/$dir" "$target" "$ver" "libbitsandbytes_cuda${ver}.so"
done

############################# START NO CUBLASLT #############################################
# binaries without 8-bit matmul support START HERE
# ###########################################################################################
for spec in $CUDA_SPECS; do
    IFS=: read -r dir target ver <<< "$spec"
    build_one "$BASE_PATH/$dir" "${target}_nomatmul" "$ver" "libbitsandbytes_cuda${ver}_nocublaslt.so"
done

# Package and publish.
python -m build
python -m twine upload dist/* --verbose
# for cmake build
# Minimal conda environment for bitsandbytes development with a CMake-based
# build: base Python plus test and developer tooling only. Heavyweight ML
# libraries are left commented out and can be enabled when needed.
name: bnb
channels:
  - pytorch
  - nvidia
  - conda-forge
dependencies:
  - python
  #- accelerate
  #- einops
  - scipy
  #- transformers
  # Testing
  - pytest
  - pytest-cases
  - pytest-sugar
  # Developer tooling
  - ipython
  - debugpy
  - yapf
  - monkeytype
  - rich
# Full conda environment for bitsandbytes development with CUDA 11.8.
name: bnb
channels:
  - pytorch
  - nvidia
  - conda-forge
dependencies:
  # Base
  - conda-forge::python=3.8
  # FIX: `=>2.1` is not a valid conda version spec; the operator is `>=`.
  - pytorch::pytorch>=2.1
  - pytorch::pytorch-cuda=11.8
  - nvidia::cuda=11.8
  # Libraries
  - conda-forge::accelerate
  - conda-forge::einops
  - conda-forge::scipy
  - conda-forge::transformers
  # Development
  - conda-forge::pytest
  - conda-forge::build # build Python packages
  - conda-forge::twine # upload Python packages
  - conda-forge::pytest-cases # more readable and composable parametrized tests
  - conda-forge::ipython # better interactive shell
  - conda-forge::debugpy # debugger-support for VSCode
  - conda-forge::ruff # linting
  - conda-forge::yapf # code formatting
  - conda-forge::monkeytype # infer type annotations
  - conda-forge::rich # better, colored tracebacks, etc
  - conda-forge::pytest-sugar # better pytest output
  # - conda-forge::nodejs # for `doc-builder preview` (optional)

## ENV CREATION - steps to reproduce:
# mamba env remove -n bnb
# mamba create -y -n bnb python=3.8 # creating an empty env bypasses conda
# # and leads to much faster env resolution in the next step https://github.com/mamba-org/mamba/issues/633#issuecomment-812272143
# mamba env update -n bnb -f environment.yml
# mamba activate bnb

## PIP dependencies (install *after* ENV CREATION):
# pip install --no-cache-dir --no-deps lion_pytorch triton hf-doc-builder watchdog
## NOTE: conda peft is not up to date, so we install from pip
# pip install -e .  # installs bitsandbytes as an editable development install; run from within the repo root dir

## ENV UPDATE:
# # add new packages to environment.yml, then:
# mamba env update -n bnb -f environment.yml
......@@ -49,6 +49,10 @@ def test_matmullt(
req_grad = list(req_grad)
req_grad[2] = False
if device == "cpu" and dtype != torch.float32 and has_fp16_weights and any(req_grad):
if torch.__version__ < (2, 6):
pytest.xfail("mse_loss bf16/fp16 on CPU is not supported in torch < 2.6")
for i in range(3):
# normal multiply
if funcs[0] in [torch.mm, torch.matmul]:
......@@ -185,6 +189,9 @@ def test_matmul_4bit(
req_grad = list(req_grad)
req_grad[2] = False
if device == "cpu" and dtype != torch.float32 and any(req_grad) and torch.__version__ < (2, 6):
pytest.xfail("mse_loss fp16 on CPU is not supported in torch < 2.6")
for i in range(3):
# normal multiply
if funcs[0] in [torch.mm, torch.matmul]:
......
......@@ -1342,9 +1342,13 @@ class TestQuantize4BitFunctional:
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype)
@pytest.mark.parametrize("double_quant", [False], ids=["DQ_True"])
def test_gemv_eye_4bit(self, device, storage_type, dtype, double_quant):
if device == "cpu" and storage_type != "nf4":
if device == "cpu":
if storage_type != "nf4":
pytest.xfail("fp4 quantization is not supported on CPU")
if dtype == torch.bfloat16 and torch.__version__ < (2, 3):
pytest.xfail("eye does not support bfloat16 on CPU in torch < 2.3")
dims = 10
torch.random.manual_seed(np.random.randint(0, 412424242))
dims = get_test_dims(0, 8192, n=dims)
......
......@@ -6,6 +6,13 @@ import torch
import bitsandbytes
from tests.helpers import TRUE_FALSE, get_available_devices, id_formatter
# torch.library.opcheck is only available in torch 2.4 and later.
# When testing with older versions, we will skip it as a no-op.
if torch.__version__ >= (2, 4):
opcheck = torch.library.opcheck
else:
opcheck = lambda *args, **kwargs: None
class TestLLMInt8Ops:
@pytest.mark.parametrize("device", get_available_devices())
......@@ -18,7 +25,7 @@ class TestLLMInt8Ops:
assert out.dtype == torch.int32
assert out.device == A.device
torch.library.opcheck(torch.ops.bitsandbytes.int8_linear_matmul.default, (A, B))
opcheck(torch.ops.bitsandbytes.int8_linear_matmul.default, (A, B))
@pytest.mark.parametrize("device", get_available_devices())
def test_int8_linear_matmul_out(self, device):
......@@ -32,7 +39,7 @@ class TestLLMInt8Ops:
assert out.dtype == torch.int32
assert out.device == A.device
torch.library.opcheck(torch.ops.bitsandbytes.int8_linear_matmul.out, (A, B, out))
opcheck(torch.ops.bitsandbytes.int8_linear_matmul.out, (A, B, out))
@pytest.mark.parametrize("threshold", [0.0, 6.0])
@pytest.mark.parametrize("device", get_available_devices())
......@@ -57,9 +64,8 @@ class TestLLMInt8Ops:
else:
assert outlier_cols is None
torch.library.opcheck(torch.ops.bitsandbytes.int8_vectorwise_quant, (A,))
torch.library.opcheck(torch.ops.bitsandbytes.int8_vectorwise_quant, (A, threshold))
opcheck(torch.ops.bitsandbytes.int8_vectorwise_quant, (A,))
opcheck(torch.ops.bitsandbytes.int8_vectorwise_quant, (A, threshold))
@pytest.mark.parametrize("device", get_available_devices())
def test_int8_mm_dequant(self, device):
......@@ -72,7 +78,7 @@ class TestLLMInt8Ops:
assert out.dtype == torch.float16
assert out.device == A.device
torch.library.opcheck(torch.ops.bitsandbytes.int8_mm_dequant, (A, row_stats, col_stats))
opcheck(torch.ops.bitsandbytes.int8_mm_dequant, (A, row_stats, col_stats))
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
......@@ -89,7 +95,7 @@ class TestLLMInt8Ops:
assert out.dtype == dtype
assert out.device == A.device
torch.library.opcheck(torch.ops.bitsandbytes.int8_scaled_mm, (A, B, row_stats, col_stats, bias, dtype))
opcheck(torch.ops.bitsandbytes.int8_scaled_mm, (A, B, row_stats, col_stats, bias, dtype))
class TestInt8BlockwiseQuantOps:
......@@ -115,7 +121,7 @@ class TestInt8BlockwiseQuantOps:
assert absmax.device == A.device
assert absmax.dtype == torch.float32
torch.library.opcheck(torch.ops.bitsandbytes.quantize_blockwise, (A, code, blocksize))
opcheck(torch.ops.bitsandbytes.quantize_blockwise, (A, code, blocksize))
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
......@@ -137,7 +143,7 @@ class TestInt8BlockwiseQuantOps:
assert out.dtype == dtype
assert out.device == A.device
torch.library.opcheck(torch.ops.bitsandbytes.dequantize_blockwise.default, (A, absmax, code, blocksize, dtype))
opcheck(torch.ops.bitsandbytes.dequantize_blockwise.default, (A, absmax, code, blocksize, dtype))
class Test4bitBlockwiseQuantOps:
......@@ -163,7 +169,7 @@ class Test4bitBlockwiseQuantOps:
assert absmax.device == A.device
assert absmax.dtype == torch.float32
torch.library.opcheck(torch.ops.bitsandbytes.quantize_4bit, (A, blocksize, quant_type, storage_dtype))
opcheck(torch.ops.bitsandbytes.quantize_4bit, (A, blocksize, quant_type, storage_dtype))
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
......@@ -198,8 +204,9 @@ class Test4bitBlockwiseQuantOps:
assert out.device == A.device
assert out.shape == shape
torch.library.opcheck(
torch.ops.bitsandbytes.dequantize_4bit.default, (A, absmax, blocksize, quant_type, shape, dtype)
opcheck(
torch.ops.bitsandbytes.dequantize_4bit.default,
(A, absmax, blocksize, quant_type, shape, dtype),
)
@pytest.mark.parametrize("device", get_available_devices())
......@@ -226,4 +233,4 @@ class Test4bitBlockwiseQuantOps:
assert out.shape == (1, 1, out_features)
assert out.isreal().all()
torch.library.opcheck(torch.ops.bitsandbytes.gemv_4bit.default, (A, B_q, B.shape, absmax, code, blocksize))
opcheck(torch.ops.bitsandbytes.gemv_4bit.default, (A, B_q, B.shape, absmax, code, blocksize))
......@@ -11,7 +11,7 @@ from tests.helpers import TRUE_FALSE
not is_triton_available() or not torch.cuda.is_available() or not torch.cuda.get_device_capability()[0] >= 8,
reason="This test requires triton and a GPU with compute capability 8.0 or higher.",
)
@pytest.mark.skip("No longer supported.")
@pytest.mark.deprecated
@pytest.mark.parametrize("vector_wise_quantization", TRUE_FALSE)
def test_switchback(vector_wise_quantization):
for dim in [83]:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment