Unverified commit c85733cf authored by Tim Dettmers, committed by GitHub

Merge branch 'main' into patch-2

parents d76b6ca9 9c63202a
[pytest]
addopts = -rP
; --cov=bitsandbytes
; # contexts: record which test ran which line; can be seen in html coverage report
; --cov-context=test
; --cov-report html
log_cli = True
log_cli_level = INFO
log_file = logs/pytest.log
\ No newline at end of file
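
For context, log_cli = True with log_cli_level = INFO makes pytest echo log records live during the run, and log_file collects them under logs/pytest.log. A minimal sketch of a test whose output these settings surface; the test and logger names are illustrative:

import logging

logger = logging.getLogger(__name__)


def test_logging_is_surfaced():
    # Shown live in the terminal at INFO level; with a matching
    # log_file_level it is also recorded in logs/pytest.log.
    logger.info("selected blockwise quantization kernel")
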
# Copyright 2023 The HuggingFace Team, the AllenNLP library authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Script to close stale issue. Taken in part from the AllenNLP repository.
https://github.com/allenai/allennlp.
"""
import os
from datetime import datetime as dt
from datetime import timezone

from github import Github

# All labels that we don't want to touch
LABELS_TO_EXEMPT = [
    "feature-request",
]


def main():
    g = Github(os.environ["GITHUB_TOKEN"])
    repo = g.get_repo("TimDettmers/bitsandbytes")
    open_issues = repo.get_issues(state="open")

    for issue in open_issues:
        comments = sorted(issue.get_comments(), key=lambda c: c.created_at, reverse=True)
        last_comment = comments[0] if len(comments) > 0 else None
        if (
            last_comment is not None
            and last_comment.user.login == "github-actions[bot]"
            and (dt.now(timezone.utc) - issue.updated_at).days > 7
            and (dt.now(timezone.utc) - issue.created_at).days >= 30
            and not any(label.name.lower() in LABELS_TO_EXEMPT for label in issue.get_labels())
        ):
            issue.edit(state="closed")
        elif (
            (dt.now(timezone.utc) - issue.updated_at).days > 23
            and (dt.now(timezone.utc) - issue.created_at).days >= 30
            and not any(label.name.lower() in LABELS_TO_EXEMPT for label in issue.get_labels())
        ):
            issue.create_comment(
                "This issue has been automatically marked as stale because it has not had "
                "recent activity. If you think this still needs to be addressed "
                "please comment on this thread.\n\n"
            )


if __name__ == "__main__":
    main()
\ No newline at end of file
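
Taken together, the two branches implement a two-phase policy: an issue at least 30 days old gets the stale comment after 23 days without activity, and is closed once the bot's comment has itself sat unanswered for more than 7 days. The sketch below distills that predicate; the function and its name are illustrative only, and the label-exemption check is left out for brevity:

from datetime import datetime, timezone


def stale_action(created_at, updated_at, last_commenter):
    """Return 'close', 'mark_stale', or None, mirroring the thresholds above."""
    now = datetime.now(timezone.utc)
    age_days = (now - created_at).days
    idle_days = (now - updated_at).days
    if last_commenter == "github-actions[bot]" and idle_days > 7 and age_days >= 30:
        return "close"
    if idle_days > 23 and age_days >= 30:
        return "mark_stale"
    return None
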
@@ -18,7 +18,7 @@ def read(fname):
 setup(
     name=f"bitsandbytes",
-    version=f"0.40.2",
+    version=f"0.41.3.post1",
     author="Tim Dettmers",
     author_email="dettmers@cs.washington.edu",
     description="k-bit optimizers and matrix multiplication routines.",
@@ -129,6 +129,7 @@ def test_quantile_quantization():
     assert diff < 0.001

+
 def test_dynamic_quantization():
     diffs = []
     reldiffs = []
@@ -141,8 +142,8 @@ def test_dynamic_quantization():
         diffs.append(diff.mean().item())
         reldiffs.append(reldiff.mean().item())
         assert diff.mean().item() < 0.0135
-    # print(sum(diffs)/len(diffs))
-    # print(sum(reldiffs)/len(reldiffs))
+    print(sum(diffs)/len(diffs))
+    print(sum(reldiffs)/len(reldiffs))

     for i in range(100):
         A1 = torch.rand(1024, 1024, device="cuda")
@@ -157,7 +158,8 @@ def test_dynamic_quantization():
 @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=["fp32", "fp16", "bf16"])
 @pytest.mark.parametrize("nested", [False, True], ids=["False", "True"])
 @pytest.mark.parametrize("blocksize", [4096, 2048, 1024, 512, 256, 128, 64])
-def test_dynamic_blockwise_quantization(dtype, nested, blocksize):
+@pytest.mark.parametrize("signed", [True, False], ids=['signed_True', 'signed_False'])
+def test_dynamic_blockwise_quantization(dtype, nested, blocksize, signed):
     #print('')
     diffs = []
     reldiffs = []
@@ -178,9 +180,10 @@ def test_dynamic_blockwise_quantization(dtype, nested, blocksize):
     assert A2.dtype == dtype

     diffs = []
+    code = F.create_dynamic_map(signed=signed)
     for i in range(100):
         A1 = torch.rand(1024, 1024, device="cuda", dtype=dtype)
-        C, S = F.quantize_blockwise(A1, blocksize=blocksize, nested=nested)
+        C, S = F.quantize_blockwise(A1, blocksize=blocksize, nested=nested, code=code)
         A2 = F.dequantize_blockwise(C, S)
         diff = torch.abs(A1 - A2).float()
         reldiff = diff / torch.abs(A1.float() + 1e-8)
@@ -189,11 +192,15 @@ def test_dynamic_blockwise_quantization(dtype, nested, blocksize):
     #torch.testing.assert_close(A1, A2, atol=1e-2, rtol=0)
     abserr = sum(diffs)/len(diffs)
     relerr = sum(reldiffs)/len(reldiffs)
+    if signed:
+        assert abserr < 0.0035
+        assert relerr < 0.015
+    else:
+        assert abserr < 0.00175
+        assert relerr < 0.012
     assert A2.dtype == dtype
-    #print('nested=', nested, 'rand', blocksize, sum(diffs)/len(diffs))
-    #print('nested=', nested, 'rand', blocksize, sum(reldiffs)/len(reldiffs))
+    #print('signed=', signed, 'nested=', nested, 'rand', blocksize, sum(diffs)/len(diffs))
+    #print('signed=', signed, 'nested=', nested, 'rand', blocksize, sum(reldiffs)/len(reldiffs))
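
The signed parameter added above feeds F.create_dynamic_map, which builds the 256-entry dynamic quantization code. With signed=False the sign bit is reallocated to precision, which is why the unsigned branch can assert half the absolute error of the signed one on the non-negative torch.rand inputs. A hedged illustration follows; the entry count and value ranges are assumptions about the library's output, not something shown in this diff:

import bitsandbytes.functional as F

# Both maps are assumed to have 256 entries; the unsigned map spends the
# sign bit on extra precision and contains no negative code values.
code_signed = F.create_dynamic_map(signed=True)
code_unsigned = F.create_dynamic_map(signed=False)
assert code_signed.numel() == code_unsigned.numel() == 256
assert code_signed.min().item() < 0
assert code_unsigned.min().item() >= 0
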
@@ -2366,7 +2373,7 @@ def test_normal_map_tree():
 def test_gemv_4bit(dtype, storage_type, double_quant, kind):
     for dim in [128, 256, 512, 1024]:
     #for dim in [4*1024]:
-    #for dim in [1*128]:
+    #for dim in [1*16]:
         errs1 = []
         errs2 = []
         errs3 = []
@@ -2446,11 +2453,11 @@ def test_gemv_4bit(dtype, storage_type, double_quant, kind):
         #
         #print('='*80)
         #print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:')
-        #print(C1.flatten()[-20:])
-        #print(C2.flatten()[-20:])
-        #print(f'inference vs training abs: {err1}')
-        #print(f'inference vs training rel: {relerr1}')
-        #print(f'inference vs training max: {maxerr1}')
+        print(C1.flatten()[-20:])
+        print(C2.flatten()[-20:])
+        print(f'inference vs training abs: {err1}')
+        print(f'inference vs training rel: {relerr1}')
+        print(f'inference vs training max: {maxerr1}')
         #print(f'inference vs training vs torch err ratio abs: {absratio}')
         #print(f'inference vs training vs torch err ratio rel: {relratio}')
         #print(f'inference vs training vs torch err ratio max: {maxratio}')
@@ -2478,7 +2485,7 @@ def test_gemv_4bit(dtype, storage_type, double_quant, kind):
             assert maxratio < 1.005 and maxratio > 0.995
         elif dtype == torch.bfloat16:
             if dim <= 512:
-                assert err1 < 5e-4
+                assert err1 < 6e-4
                 assert relerr1 < 0.007
                 assert maxerr1 < 0.015
             else:
import os
from contextlib import nullcontext
from itertools import product
from tempfile import TemporaryDirectory

import pytest
import torch

import bitsandbytes as bnb


@pytest.mark.skipif(not torch.cuda.is_available(), reason="this test requires a GPU")
@pytest.mark.parametrize(
    "quant_type, compress_statistics, bias",
    list(product(["nf4", "fp4"], [False, True], [False, True])),
)
def test_linear_serialization(quant_type, compress_statistics, bias):
    original_dtype = torch.float16
    compute_dtype = None
    device = "cuda"
    layer_shape = (300, 400)

    linear = torch.nn.Linear(*layer_shape, dtype=original_dtype, device="cpu")  # original layer

    # Quantizing original layer
    linear_q = bnb.nn.Linear4bit(
        linear.in_features,
        linear.out_features,
        bias=bias,
        compute_dtype=compute_dtype,
        compress_statistics=compress_statistics,
        quant_type=quant_type,
        device="meta",
    )
    new_weight = bnb.nn.Params4bit(data=linear.weight, requires_grad=False)
    linear_q.weight = new_weight
    if bias:
        linear_q.bias = torch.nn.Parameter(linear.bias)
    linear_q = linear_q.to(device)

    # saving to state_dict:
    sd = linear_q.state_dict()

    # restoring from state_dict:
    bias_data2 = sd.pop("bias", None)
    weight_data2 = sd.pop("weight")
    weight2 = bnb.nn.Params4bit.from_prequantized(quantized_stats=sd, data=weight_data2)

    # creating new layer with same params:
    linear_q2 = bnb.nn.Linear4bit(
        linear.in_features,
        linear.out_features,
        bias=bias,
        compute_dtype=compute_dtype,
        compress_statistics=compress_statistics,
        quant_type=quant_type,
        device="meta",
    )
    # loading weights from state_dict:
    linear_q2.weight = weight2
    if bias:
        linear_q2.bias = torch.nn.Parameter(bias_data2)
    linear_q2 = linear_q2.to(device)

    # MATCHING
    a, b = linear_q.weight, linear_q2.weight
    assert a.device == b.device
    assert a.dtype == b.dtype
    assert torch.equal(a, b)

    q0 = a.quant_state
    q1 = b.quant_state
    for attr in ('code', 'dtype', 'blocksize', 'absmax'):
        c, d = getattr(q0, attr), getattr(q1, attr)
        if isinstance(c, torch.Tensor):
            assert torch.equal(c, d)
        else:
            assert c == d, f"{c} != {d}"

    if q0.state2 is not None:
        for attr in ('code', 'dtype', 'blocksize', 'absmax'):
            c, d = getattr(q0.state2, attr), getattr(q1.state2, attr)
            if isinstance(c, torch.Tensor):
                assert torch.equal(c, d)
            else:
                assert c == d, f"{c} != {d}"

    if bias:
        a, b = linear_q.bias, linear_q2.bias
        assert a.device == b.device
        assert a.dtype == b.dtype
        assert torch.equal(a, b)

    # Forward test
    x = torch.rand(42, layer_shape[0], device=device)
    a = linear_q(x)
    b = linear_q2(x)
    assert a.device == b.device
    assert a.dtype == b.dtype
    assert torch.equal(a, b)

    # Saved size ratio test. Target set for layer_shape == (300, 400) w/ bias
    with TemporaryDirectory() as tmpdir:
        state_path_4bit = os.path.join(tmpdir, "state_4bit.pth")
        state_path = os.path.join(tmpdir, "state.pth")
        torch.save(linear.state_dict(), state_path)
        torch.save(linear_q.state_dict(), state_path_4bit)

        size_orig, size_4 = os.path.getsize(state_path), os.path.getsize(state_path_4bit)
        size_ratio = size_4 / size_orig
        target_compression = 0.143 if original_dtype == torch.float32 else 0.29  # these numbers get lower as weight shape increases
        ratio_error_msg = f"quantized_size {size_4:,} is larger on disk than {target_compression:.2%} of original size {size_orig:,}"
        assert size_ratio < target_compression, ratio_error_msg
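
Outside the test harness, the round trip exercised above reduces to a few lines. A minimal sketch, assuming a CUDA device is available; 'layer.pth' is a hypothetical path:

import torch
import bitsandbytes as bnb

# Quantize an fp16 Linear into a Linear4bit and save it.
fp16_linear = torch.nn.Linear(300, 400, dtype=torch.float16)
q = bnb.nn.Linear4bit(300, 400, quant_type="nf4", device="meta")
q.weight = bnb.nn.Params4bit(data=fp16_linear.weight, requires_grad=False)
q.bias = torch.nn.Parameter(fp16_linear.bias)
q = q.to("cuda")  # quantization happens on the move to the GPU
torch.save(q.state_dict(), "layer.pth")

# Restore: the quantization statistics travel inside the state_dict.
sd = torch.load("layer.pth")
bias_data = sd.pop("bias", None)
weight_data = sd.pop("weight")
q2 = bnb.nn.Linear4bit(300, 400, quant_type="nf4", device="meta")
q2.weight = bnb.nn.Params4bit.from_prequantized(quantized_stats=sd, data=weight_data)
if bias_data is not None:
    q2.bias = torch.nn.Parameter(bias_data)
q2 = q2.to("cuda")
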
@@ -516,7 +516,10 @@ modules.append(bnb.nn.LinearFP4)
 modules.append(bnb.nn.LinearNF4)
 modules.append(lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compress_statistics=True))
 modules.append(lambda d1, d2: bnb.nn.LinearNF4(d1, d2, compress_statistics=True))
-names = ['Int8Lt', '4bit', 'FP4', 'NF4', 'FP4+C', 'NF4+C']
+modules.append(lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compute_dtype=torch.float32))
+modules.append(lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compute_dtype=torch.float16))
+modules.append(lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compute_dtype=torch.bfloat16))
+names = ['Int8Lt', '4bit', 'FP4', 'NF4', 'FP4+C', 'NF4+C', 'NF4+fp32', 'NF4+fp16', 'NF4+bf16']
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="this test requires a GPU")
 @pytest.mark.parametrize("module", modules, ids=names)
 def test_kbit_backprop(module):
@@ -563,10 +566,10 @@ def test_kbit_backprop(module):
         relerrs2.append(relerr2.mean().item())

         if isinstance(module, bnb.nn.Linear8bitLt):
-            torch.testing.assert_close(grad1, grad2, atol=0.008, rtol=0.05)
+            assert_all_approx_close(grad1, grad2, atol=0.008, rtol=0.05, count=1)
             torch.testing.assert_close(bgrad1, bgrad2, atol=0.008, rtol=0.05)
         else:
-            torch.testing.assert_close(grad1, grad2, atol=0.015, rtol=0.05)
+            assert_all_approx_close(grad1, grad2, atol=0.015, rtol=0.05, count=1)
             torch.testing.assert_close(bgrad1, bgrad2, atol=0.02, rtol=0.05)
         ref.zero_grad()
         kbit.zero_grad()
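
The swap from torch.testing.assert_close to assert_all_approx_close with count=1 relaxes the check to tolerate a single outlier element. A hedged sketch of those semantics; the test suite defines its own helper, and this illustration is not its actual implementation:

import torch


def assert_all_approx_close(a, b, rtol, atol, count):
    # Pass if at most `count` elements fall outside the tolerance band.
    close = torch.isclose(a, b, rtol=rtol, atol=atol)
    num_off = (~close).sum().item()
    if num_off > count:
        raise AssertionError(f"{num_off} values not close (allowed: {count})")
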
@@ -608,9 +611,33 @@ def test_fp8linear():
     assert graderr < 0.00002
     assert bgraderr < 0.00002
+
+
+def test_4bit_warnings():
+    dim1 = 64
+
+    with pytest.warns(UserWarning, match=r'inference or training'):
+        net = nn.Sequential(*[bnb.nn.Linear4bit(dim1, dim1, compute_dtype=torch.float32) for i in range(10)])
+        net = net.cuda()
+        inp = torch.rand(10, dim1).cuda().half()
+        net(inp)
+    with pytest.warns(UserWarning, match=r'inference.'):
+        net = nn.Sequential(*[bnb.nn.Linear4bit(dim1, dim1, compute_dtype=torch.float32) for i in range(10)])
+        net = net.cuda()
+        inp = torch.rand(1, dim1).cuda().half()
+        net(inp)
+
+    with pytest.warns(UserWarning) as record:
+        net = nn.Sequential(*[bnb.nn.Linear4bit(dim1, dim1, compute_dtype=torch.float32) for i in range(10)])
+        net = net.cuda()
+        inp = torch.rand(10, dim1).cuda().half()
+        net(inp)
+
+        net = nn.Sequential(*[bnb.nn.Linear4bit(dim1, dim1, compute_dtype=torch.float32) for i in range(10)])
+        net = net.cuda()
+        inp = torch.rand(1, dim1).cuda().half()
+        net(inp)
+    assert len(record) == 2