Unverified Commit 503d243e authored by Matthew Douglas's avatar Matthew Douglas Committed by GitHub
Browse files

General cleanup & test improvements (#1646)

* General cleanup & test improvements

* Tests: WA numpy 2 compat issue for torch<2.3

* Tests: update aarch64 cpu min torch version

* Tests: update aarch64 cpu min torch version

* Tests: update aarch64 cpu min torch version
parent e99ac0a1
......@@ -93,24 +93,32 @@ jobs:
path: output/${{ matrix.os }}/${{ matrix.arch }}/*
retention-days: 7
cpu-tests:
test-cpu:
if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
needs: build-cpu
strategy:
fail-fast: false
matrix:
os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025, macos-15]
torch_version: ["2.6.0", "2.7.0"]
# Test with the oldest supported torch version and the two newest.
torch_version: ["2.2.2", "2.6.0", "2.7.0"]
include:
- os: ubuntu-22.04
arch: x86_64
runner: banb-aws-general-8-plus-use1-public-80
- os: ubuntu-22.04-arm
arch: aarch64
- os: ubuntu-22.04-arm
arch: aarch64
torch_version: "2.5.1"
- os: windows-2025
arch: x86_64
- os: macos-15
arch: arm64
exclude:
- os: ubuntu-22.04-arm
torch_version: "2.2.2"
runs-on: ${{ matrix.runner || matrix.os }}
env:
BNB_TEST_DEVICE: cpu
......@@ -135,6 +143,11 @@ jobs:
pip install -e ".[test]"
pip install pytest-cov
# We need to downgrade to numpy<2 for torch<2.3 compatibility.
- name: Downgrade NumPy
if: startsWith(matrix.torch_version, '2.2.')
run: pip install "numpy<2"
- name: Show installed packages
run: pip list
......@@ -144,7 +157,7 @@ jobs:
- name: Run tests
run: pytest --durations=100
# cuda-aarch64-tests:
# test-cuda-aarch64:
# if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
# needs: build-cuda
# strategy:
......@@ -167,7 +180,7 @@ jobs:
cuda-tests:
test-cuda:
if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
needs: build-cuda
strategy:
......@@ -179,7 +192,7 @@ jobs:
cuda_version: ["11.8.0", "12.6.3", "12.8.1"]
include:
- cuda_version: "11.8.0"
torch_version: "2.4.1"
torch_version: "2.2.2"
pypi_index: "https://download.pytorch.org/whl/cu118"
- cuda_version: "12.6.3"
torch_version: "2.6.0"
......@@ -238,6 +251,11 @@ jobs:
pip install -e ".[test]"
pip install pytest-cov
# We need to downgrade to numpy<2 for torch<2.3 compatibility.
- name: Downgrade NumPy
if: startsWith(matrix.torch_version, '2.2.')
run: pip install "numpy<2"
- name: Show installed packages
run: pip list
......
"""
Extracted from tests/test_functional.py
Note: This feature is currently unused! It is kept here for archival purposes.
Usage: pytest benchmarking/int8/row_scale_benchmark.py
"""
import time
import pytest
import torch
from bitsandbytes import functional as F
k = 20
torch.set_printoptions(precision=5, sci_mode=False, linewidth=120, edgeitems=20, threshold=10000)
@pytest.mark.parametrize(
    ("dim1", "dim4", "inner"),
    [
        pytest.param(1024, 12288 * 4, 12288, id="1024, 12288*4, 12288"),
        pytest.param(2048, 4096 * 4, 4096, id="2048, 4096*4, 4096"),
    ],
)
@pytest.mark.skip("Row scale has some bugs for ampere")
@pytest.mark.benchmark
def test_row_scale_bench(dim1: int, dim4: int, inner: int):
    """Benchmark three GEMM variants of A(dim1, inner) @ B(dim4, inner).T.

    Runs `k` iterations of each variant and prints wall-clock seconds:
      1. fp16 ``torch.matmul`` baseline,
      2. int8 matmul with a per-row scale (``row_scale``),
      3. plain vector-wise int8 matmul.

    Requires CUDA. Currently skipped: the row-scale kernel has known bugs on
    Ampere (see the ``skip`` mark above).
    """
    formatB = F.get_special_format_str()
    # NOTE(review): these error accumulators are never filled or asserted;
    # kept only for archival fidelity with the test this was extracted from.
    err1, err2, err3 = [], [], []
    relerr1, relerr2 = [], []
    scale = 1
    A = torch.randn(dim1, inner, device="cuda").half()
    B = torch.randn(dim4, inner, device="cuda").half()
    torch.nn.init.xavier_uniform_(B)

    # warmup
    for i in range(k):
        C1 = torch.matmul(A, B.t())

    # 1) fp16 baseline timing.
    torch.cuda.synchronize()
    t0 = time.time()
    for i in range(k):
        C1 = torch.matmul(A, B.t())
    torch.cuda.synchronize()
    print("16", time.time() - t0)

    # Quantize A (int8, col32 layout) and B (linear vector-wise) once,
    # outside the timed loops, so only the matmul itself is measured.
    C1a, C1b, stats1a, stats1b, coo_tensor = F.int8_double_quant(A)
    CB, absmaxB = F.vectorwise_quant(B, quant_type="linear")
    A2, SA = F.nvidia_transform(C1a, "col32")
    B2, SB = F.nvidia_transform(CB, formatB)
    A1, maxA = F.vectorwise_quant(A, dim=1)

    # Per-row scale: row absmax of A over a constant. Presumably this keeps
    # the int8 output in range — TODO confirm against int8_linear_matmul docs.
    c = 10.0 * inner * scale
    row_scale = maxA / c

    # 2) int8 matmul with row-wise scaling.
    torch.cuda.synchronize()
    t0 = time.time()
    for i in range(k):
        outC32 = F.int8_linear_matmul(A2, B2, dtype=torch.int8, row_scale=row_scale)
    torch.cuda.synchronize()
    print("row-wise", time.time() - t0)

    # 3) plain vector-wise int8 matmul (B re-quantized via double-quant).
    C2a, C2b, stats2a, stats2b, coo_tensor = F.int8_double_quant(B)
    B2, SB = F.nvidia_transform(C2a, formatB)
    torch.cuda.synchronize()
    t0 = time.time()
    for i in range(k):
        outC32 = F.int8_linear_matmul(A2, B2)
    torch.cuda.synchronize()
    print("vector-wise", time.time() - t0)
#!/bin/bash
# Builds libbitsandbytes for CPU and a matrix of CUDA toolkit versions (with
# and without cuBLASLt / 8-bit matmul support), then builds the Python package
# and uploads it to PyPI.
#
# Usage: ./script.sh <BASE_PATH>
#   BASE_PATH: directory containing the CUDA toolkit installs, one per
#              version, e.g. $BASE_PATH/cuda-11.8
#
# Exits with status 64 as soon as any expected shared object is missing.
BASE_PATH=$1

echo "MAKE SURE LD_LIBRARY_PATH IS EMPTY!"
echo $LD_LIBRARY_PATH
if [[ ! -z "${LD_LIBRARY_PATH}" ]]; then
    echo "Compilation unsuccessful!" 1>&2
    exit 64
fi

# FIX: the original used `&&`, which printed the "no module function" notice
# when `module unload` SUCCEEDED. `||` prints it only when `module` is
# unavailable or fails, which is what the message says.
module unload cuda || echo "no module function available. Probably not on a slurm cluster."
module unload gcc || echo "no module function available. Probably not on a slurm cluster."

rm -rf dist build
make cleaneggs
make cleanlibs

# build_one <cuda_home> <make_target> <cuda_version> <expected_lib>
#   Cleans the build tree, compiles one library variant, and aborts the whole
#   script (exit 64) if the expected .so was not produced under ./bitsandbytes.
build_one () {
    rm -rf build/*
    export CUDA_HOME="$1"
    make "$2" CUDA_VERSION="$3"
    if [ ! -f "./bitsandbytes/$4" ]; then
        echo "Compilation unsuccessful!" 1>&2
        exit 64
    fi
}

# CPU-only build (no CUDA toolkit involved).
export CUDA_VERSION=
build_one "" cpuonly "CPU" libbitsandbytes_cpu.so

# Build matrix: <toolkit dir>:<make target>:<CUDA_VERSION>
CUDA_SPECS="
cuda-11.0:cuda110:110
cuda-11.1:cuda11x:111
cuda-11.4:cuda11x:114
cuda-11.5:cuda11x:115
cuda-11.7:cuda11x:117
cuda-11.8:cuda118:118
cuda-12.0:cuda12x:120
cuda-12.1:cuda12x:121
cuda-12.2:cuda12x:122
cuda-12.3:cuda12x:123
"

# Full builds (with cuBLASLt / 8-bit matmul support).
for spec in $CUDA_SPECS; do
    IFS=: read -r dir target ver <<< "$spec"
    build_one "$BASE_PATH/$dir" "$target" "$ver" "libbitsandbytes_cuda${ver}.so"
done

############################# START NO CUBLASLT #############################################
# binaries without 8-bit matmul support START HERE
# ###########################################################################################
for spec in $CUDA_SPECS; do
    IFS=: read -r dir target ver <<< "$spec"
    build_one "$BASE_PATH/$dir" "${target}_nomatmul" "$ver" "libbitsandbytes_cuda${ver}_nocublaslt.so"
done

# Package and publish.
python -m build
python -m twine upload dist/* --verbose
# for cmake build
# Minimal conda environment for bitsandbytes development with a CMake-based
# build: base Python plus test and developer tooling only. Heavyweight ML
# libraries are left commented out and can be enabled when needed.
name: bnb
channels:
  - pytorch
  - nvidia
  - conda-forge
dependencies:
  - python
  #- accelerate
  #- einops
  - scipy
  #- transformers
  # Testing
  - pytest
  - pytest-cases
  - pytest-sugar
  # Developer tooling
  - ipython
  - debugpy
  - yapf
  - monkeytype
  - rich
# Full conda environment for bitsandbytes development with CUDA 11.8.
name: bnb
channels:
  - pytorch
  - nvidia
  - conda-forge
dependencies:
  # Base
  - conda-forge::python=3.8
  # FIX: `=>2.1` is not a valid conda version spec; the operator is `>=`.
  - pytorch::pytorch>=2.1
  - pytorch::pytorch-cuda=11.8
  - nvidia::cuda=11.8
  # Libraries
  - conda-forge::accelerate
  - conda-forge::einops
  - conda-forge::scipy
  - conda-forge::transformers
  # Development
  - conda-forge::pytest
  - conda-forge::build # build Python packages
  - conda-forge::twine # upload Python packages
  - conda-forge::pytest-cases # more readable and composable parametrized tests
  - conda-forge::ipython # better interactive shell
  - conda-forge::debugpy # debugger-support for VSCode
  - conda-forge::ruff # linting
  - conda-forge::yapf # code formatting
  - conda-forge::monkeytype # infer type annotations
  - conda-forge::rich # better, colored tracebacks, etc
  - conda-forge::pytest-sugar # better pytest output
  # - conda-forge::nodejs # for `doc-builder preview` (optional)

## ENV CREATION - steps to reproduce:
# mamba env remove -n bnb
# mamba create -y -n bnb python=3.8 # creating an empty env bypasses conda
# # and leads to much faster env resolution in the next step https://github.com/mamba-org/mamba/issues/633#issuecomment-812272143
# mamba env update -n bnb -f environment.yml
# mamba activate bnb

## PIP dependencies (install *after* ENV CREATION):
# pip install --no-cache-dir --no-deps lion_pytorch triton hf-doc-builder watchdog
## NOTE: conda peft is not up to date, so we install from pip
# pip install -e .  # installs bitsandbytes as an editable development install; run from within the repo root dir

## ENV UPDATE:
# # add new packages to environment.yml, then:
# mamba env update -n bnb -f environment.yml
......@@ -49,6 +49,10 @@ def test_matmullt(
req_grad = list(req_grad)
req_grad[2] = False
if device == "cpu" and dtype != torch.float32 and has_fp16_weights and any(req_grad):
if torch.__version__ < (2, 6):
pytest.xfail("mse_loss bf16/fp16 on CPU is not supported in torch < 2.6")
for i in range(3):
# normal multiply
if funcs[0] in [torch.mm, torch.matmul]:
......@@ -185,6 +189,9 @@ def test_matmul_4bit(
req_grad = list(req_grad)
req_grad[2] = False
if device == "cpu" and dtype != torch.float32 and any(req_grad) and torch.__version__ < (2, 6):
pytest.xfail("mse_loss fp16 on CPU is not supported in torch < 2.6")
for i in range(3):
# normal multiply
if funcs[0] in [torch.mm, torch.matmul]:
......
......@@ -1342,9 +1342,13 @@ class TestQuantize4BitFunctional:
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype)
@pytest.mark.parametrize("double_quant", [False], ids=["DQ_True"])
def test_gemv_eye_4bit(self, device, storage_type, dtype, double_quant):
if device == "cpu" and storage_type != "nf4":
if device == "cpu":
if storage_type != "nf4":
pytest.xfail("fp4 quantization is not supported on CPU")
if dtype == torch.bfloat16 and torch.__version__ < (2, 3):
pytest.xfail("eye does not support bfloat16 on CPU in torch < 2.3")
dims = 10
torch.random.manual_seed(np.random.randint(0, 412424242))
dims = get_test_dims(0, 8192, n=dims)
......
......@@ -6,6 +6,13 @@ import torch
import bitsandbytes
from tests.helpers import TRUE_FALSE, get_available_devices, id_formatter
# torch.library.opcheck is only available in torch 2.4 and later.
# When testing with older versions, we will skip it as a no-op.
if torch.__version__ >= (2, 4):
opcheck = torch.library.opcheck
else:
opcheck = lambda *args, **kwargs: None
class TestLLMInt8Ops:
@pytest.mark.parametrize("device", get_available_devices())
......@@ -18,7 +25,7 @@ class TestLLMInt8Ops:
assert out.dtype == torch.int32
assert out.device == A.device
torch.library.opcheck(torch.ops.bitsandbytes.int8_linear_matmul.default, (A, B))
opcheck(torch.ops.bitsandbytes.int8_linear_matmul.default, (A, B))
@pytest.mark.parametrize("device", get_available_devices())
def test_int8_linear_matmul_out(self, device):
......@@ -32,7 +39,7 @@ class TestLLMInt8Ops:
assert out.dtype == torch.int32
assert out.device == A.device
torch.library.opcheck(torch.ops.bitsandbytes.int8_linear_matmul.out, (A, B, out))
opcheck(torch.ops.bitsandbytes.int8_linear_matmul.out, (A, B, out))
@pytest.mark.parametrize("threshold", [0.0, 6.0])
@pytest.mark.parametrize("device", get_available_devices())
......@@ -57,9 +64,8 @@ class TestLLMInt8Ops:
else:
assert outlier_cols is None
torch.library.opcheck(torch.ops.bitsandbytes.int8_vectorwise_quant, (A,))
torch.library.opcheck(torch.ops.bitsandbytes.int8_vectorwise_quant, (A, threshold))
opcheck(torch.ops.bitsandbytes.int8_vectorwise_quant, (A,))
opcheck(torch.ops.bitsandbytes.int8_vectorwise_quant, (A, threshold))
@pytest.mark.parametrize("device", get_available_devices())
def test_int8_mm_dequant(self, device):
......@@ -72,7 +78,7 @@ class TestLLMInt8Ops:
assert out.dtype == torch.float16
assert out.device == A.device
torch.library.opcheck(torch.ops.bitsandbytes.int8_mm_dequant, (A, row_stats, col_stats))
opcheck(torch.ops.bitsandbytes.int8_mm_dequant, (A, row_stats, col_stats))
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
......@@ -89,7 +95,7 @@ class TestLLMInt8Ops:
assert out.dtype == dtype
assert out.device == A.device
torch.library.opcheck(torch.ops.bitsandbytes.int8_scaled_mm, (A, B, row_stats, col_stats, bias, dtype))
opcheck(torch.ops.bitsandbytes.int8_scaled_mm, (A, B, row_stats, col_stats, bias, dtype))
class TestInt8BlockwiseQuantOps:
......@@ -115,7 +121,7 @@ class TestInt8BlockwiseQuantOps:
assert absmax.device == A.device
assert absmax.dtype == torch.float32
torch.library.opcheck(torch.ops.bitsandbytes.quantize_blockwise, (A, code, blocksize))
opcheck(torch.ops.bitsandbytes.quantize_blockwise, (A, code, blocksize))
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
......@@ -137,7 +143,7 @@ class TestInt8BlockwiseQuantOps:
assert out.dtype == dtype
assert out.device == A.device
torch.library.opcheck(torch.ops.bitsandbytes.dequantize_blockwise.default, (A, absmax, code, blocksize, dtype))
opcheck(torch.ops.bitsandbytes.dequantize_blockwise.default, (A, absmax, code, blocksize, dtype))
class Test4bitBlockwiseQuantOps:
......@@ -163,7 +169,7 @@ class Test4bitBlockwiseQuantOps:
assert absmax.device == A.device
assert absmax.dtype == torch.float32
torch.library.opcheck(torch.ops.bitsandbytes.quantize_4bit, (A, blocksize, quant_type, storage_dtype))
opcheck(torch.ops.bitsandbytes.quantize_4bit, (A, blocksize, quant_type, storage_dtype))
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
......@@ -198,8 +204,9 @@ class Test4bitBlockwiseQuantOps:
assert out.device == A.device
assert out.shape == shape
torch.library.opcheck(
torch.ops.bitsandbytes.dequantize_4bit.default, (A, absmax, blocksize, quant_type, shape, dtype)
opcheck(
torch.ops.bitsandbytes.dequantize_4bit.default,
(A, absmax, blocksize, quant_type, shape, dtype),
)
@pytest.mark.parametrize("device", get_available_devices())
......@@ -226,4 +233,4 @@ class Test4bitBlockwiseQuantOps:
assert out.shape == (1, 1, out_features)
assert out.isreal().all()
torch.library.opcheck(torch.ops.bitsandbytes.gemv_4bit.default, (A, B_q, B.shape, absmax, code, blocksize))
opcheck(torch.ops.bitsandbytes.gemv_4bit.default, (A, B_q, B.shape, absmax, code, blocksize))
......@@ -11,7 +11,7 @@ from tests.helpers import TRUE_FALSE
not is_triton_available() or not torch.cuda.is_available() or not torch.cuda.get_device_capability()[0] >= 8,
reason="This test requires triton and a GPU with compute capability 8.0 or higher.",
)
@pytest.mark.skip("No longer supported.")
@pytest.mark.deprecated
@pytest.mark.parametrize("vector_wise_quantization", TRUE_FALSE)
def test_switchback(vector_wise_quantization):
for dim in [83]:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment