Merge pull request #1160 from matthewdouglas/quant4bit-blocksize4096

Fix 4bit quantization with blocksize = 4096

Merge pull request #1160 from matthewdouglas/quant4bit-blocksize4096
Fix 4bit quantization with blocksize = 4096
76885a41 · Titus · GitHub · 2965c765 · a4714569 · 76885a41
Unverified Commit 76885a41 authored Apr 02, 2024 by Titus Committed by GitHub Apr 02, 2024
Showing with 30 additions and 15 deletions

bitsandbytes/functional.py bitsandbytes/functional.py +4 -3

csrc/ops.cu csrc/ops.cu +1 -1

install_cuda.py install_cuda.py +2 -6

tests/test_functional.py tests/test_functional.py +23 -5

No files found.
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -1087,11 +1087,12 @@ def get_4bit_type(typename, device=None, blocksize=64):
    if data is None:
        raise NotImplementedError(f"Typename {typename} not supported")
-    data = Tensor(data)
+    data = torch.tensor(data, device=device)
-    data /= data.abs().max()
+    data.div_(data.abs().max())
    assert data.numel() == 16
-    return data.to(device)
+    return data
 def quantize_fp4(

--- a/csrc/ops.cu
+++ b/csrc/ops.cu
@@ -58,7 +58,7 @@ template <typename T, int STOCHASTIC, int DATA_TYPE> void quantizeBlockwise(floa
  num_blocks = n % blocksize == 0 ? num_blocks : num_blocks + 1;
  if(blocksize == 4096)
-    kQuantizeBlockwise<T, 4096, 4, STOCHASTIC, 0><<<num_blocks, 1024>>>(code, A, absmax, out, rand, rand_offset, n);
+    kQuantizeBlockwise<T, 4096, 4, STOCHASTIC, DATA_TYPE><<<num_blocks, 1024>>>(code, A, absmax, out, rand, rand_offset, n);
  else if(blocksize == 2048)
    kQuantizeBlockwise<T, 2048, 4, 0, DATA_TYPE><<<num_blocks, 512>>>(code, A, absmax, out, rand, rand_offset, n);
  else if(blocksize == 1024)

--- a/install_cuda.py
+++ b/install_cuda.py
@@ -77,9 +77,7 @@ def main():
    download_path = "/tmp"  # default download path
    if len(sys.argv) < 2:
-        print(
+        print("Usage: python install_cuda.py <version/all> [user/system] [download_path]")
-            "Usage: python install_cuda.py <version/all> [user/system] [download_path]"
-        )
        sys.exit(1)
    version = sys.argv[1]
@@ -100,9 +98,7 @@ def main():
    elif version in cuda_versions:
        install_cuda(version, base_path, download_path)
    else:
-        print(
+        print(f"Invalid CUDA version: {version}. Available versions are: {', '.join(cuda_versions.keys())}")
-            f"Invalid CUDA version: {version}. Available versions are: {', '.join(cuda_versions.keys())}"
-        )
        sys.exit(1)

--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -1928,7 +1928,9 @@ def test_bench_dequantization():
 @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype)
-def test_fp4_quant(dtype):
+@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
+@pytest.mark.parametrize("blocksize", [64, 128, 256, 512, 1024, 2048, 4096])
+def test_4bit_quant(dtype, quant_type, blocksize):
    vals = list(product([0, 1], repeat=4))
    code = {}
@@ -1953,8 +1955,8 @@ def test_fp4_quant(dtype):
        code[idx] = result
    A1 = torch.randn(1024, 1024, device="cuda", dtype=dtype)
-    qa, SA = F.quantize_fp4(A1, blocksize=64)
+    qa, SA = F.quantize_4bit(A1, blocksize=blocksize, quant_type=quant_type)
-    A2 = F.dequantize_fp4(qa, SA)
+    A2 = F.dequantize_4bit(qa, SA, blocksize=blocksize, quant_type=quant_type)
    err = (A1 - A2).abs().float()
    relerr = (err / (A1.abs().float() + 1e-8)).mean()
@@ -1962,8 +1964,24 @@ def test_fp4_quant(dtype):
    err = err.mean()
    assert A2.dtype == dtype
-    assert err.item() < 0.1
-    assert relerr.item() < 0.28
+    # With larger block sizes, we can expect this to blow up.
+    # At blocksize>=1024, don't even bother looking at relerr.
+    if blocksize <= 64:
+        assert err.item() < 0.1
+        assert relerr.item() < 0.28
+    elif blocksize <= 256:
+        assert err.item() < 0.11
+        assert relerr.item() < 0.30
+    elif blocksize <= 512:
+        assert err.item() < 0.12
+        assert relerr.item() < 0.31
+    elif quant_type == "fp4":
+        # 1024 => 0.48, 2048 => 0.52, 4096 => 0.56
+        assert err.item() < 0.08 + math.log2(blocksize) * 4e-2
+    else:
+        # 1024 => 0.8, 2048 => 0.88, 4096 => 0.96
+        assert err.item() < math.log2(blocksize) * 8e-2
 @pytest.mark.parametrize("quant_type", ["fp4", "nf4"])