### 0.41.0
Features:
- Added precompiled CUDA 11.8 binaries to support H100 GPUs without compilation #571
- CUDA SETUP no longer looks for libcuda and libcudart and instead relies on the PyTorch CUDA libraries. To manually override this behavior, see: how_to_use_nonpytorch_cuda.md. Thank you @rapsealk
Bug fixes:
- Fixed a bug where the default type of absmax was undefined, which led to errors if the default type was different from torch.float32. #553
- Fixed a missing scipy dependency in requirements.txt. #544
- Fixed a bug where a view operation could cause an error in 8-bit layers.
- Fixed a bug where CPU bitsandbytes would fail during import. #593 Thank you @bilelomrani
- Fixed a bug where a non-existent LD_LIBRARY_PATH variable led to a failure in python -m bitsandbytes. #588
- Removed outdated get_cuda_lib_handle calls that led to errors. #595 Thank you @ihsanturk
- Fixed a bug where read permission was assumed for a file. #497
- Fixed a bug where prefetchAsync led to errors on GPUs that support unified memory but not prefetching (Maxwell, SM52). #470 #451 #453 #477 Thank you @jllllll and @stoperro
Documentation:
- Improved documentation for GPUs that do not support 8-bit matmul. #529
- Added description and pointers for the NF4 data type. #543
User experience:
- Improved handling of the default compute_dtype for Linear4bit layers, so that compute_dtype = input_dtype if the input data type is stable enough (float32, bfloat16, but not float16).
Performance:
- Improved 4-bit inference performance for A100 GPUs. This slightly degraded performance for A40/RTX 3090 and RTX 4090 GPUs.
### 0.41.1
Bug fixes:
- Fixed bugs in dynamic exponent data type creation. Thank you @RossM, @KohakuBlueleaf, @ArrowM #659 #227 #262 #152
### 0.41.2
Feature:
- 4-bit serialization is now supported. This enables 4-bit load/store. Thank you @poedator #753
### 0.41.3
Bug fixes:
- Fixed an issue where 4-bit serialization would fail for layers without double quantization #868. Thank you, @poedator
- Fixed an issue where calling .to() or .cuda() on a 4-bit layer twice would result in an error #867. Thank you, @jph00
```python
torch.nn.Embedding(...) -> bnb.nn.StableEmbedding(...) # recommended for NLP models
```
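The line above closes the README's drop-in optimizer example; a minimal sketch of the swap it belongs to (the `model` variable and the exact hyperparameters are assumed for illustration):

```python
import torch
import bitsandbytes as bnb

# adam = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.995))  # comment out the old optimizer
adam = bnb.optim.Adam8bit(model.parameters(), lr=0.001, betas=(0.9, 0.995))  # 8-bit drop-in replacement
```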
Note that by default all parameter tensors with less than 4096 elements are kept at 32-bit even if you initialize those parameters with 8-bit optimizers. This is done since such small tensors do not save much memory and often contain highly variable parameters (biases) or parameters that require high precision (batch norm, layer norm). You can change this behavior like so:
```python
# parameter tensors with less than 16384 values are optimized in 32-bit
adam = bnb.optim.Adam8bit(model.parameters(), min_8bit_size=16384)
```
For upcoming features and changes and full history see [Patch Notes](CHANGELOG.md).

To compile from source, you need an installation of CUDA. If `nvcc` is not installed, you can install the CUDA Toolkit with `nvcc` through the following commands.
```bash
# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True
# For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc
bash install_cuda.sh 117 ~/local 1
```
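If you exported the path, open a new shell (or source your .bashrc) and run `nvcc --version` to confirm the toolkit is found.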
To use a specific CUDA version just for a single compile run, you can set the variable `CUDA_HOME`. For example, the following command compiles `libbitsandbytes_cuda117.so` using compiler flags for cuda11x with the CUDA version at `~/local/cuda-11.7`:
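```bash
# sketch of the single-run override described above;
# CUDA_VERSION mirrors the 117 suffix of the target library name
CUDA_HOME=~/local/cuda-11.7 CUDA_VERSION=117 make cuda11x
```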
warn(f'Some matrices hidden dimension is not a multiple of {blocksize} and efficient inference kernels are not supported for these (slow). Matrix input size found: {A.shape}')
self.add_log_entry('CUDA SETUP: Solution 1b): Once the library is found add it to the LD_LIBRARY_PATH: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:FOUND_PATH_FROM_1a')
self.add_log_entry('CUDA SETUP: Solution 1c): For a permanent solution add the export from 1b into your .bashrc file, located at ~/.bashrc')
self.add_log_entry('CUDA SETUP: Solution 2: If no library was found in step 1a) you need to install CUDA.')
self.add_log_entry('CUDA SETUP: Solution 2a): Download CUDA install script: wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/cuda_install.sh')
self.add_log_entry('CUDA SETUP: Solution 2b): Install desired CUDA version to desired location. The syntax is bash cuda_install.sh CUDA_VERSION PATH_TO_INSTALL_INTO.')
self.add_log_entry('CUDA SETUP: Solution 2b): For example, "bash cuda_install.sh 113 ~/local/" will download CUDA 11.3 and install into the folder ~/local')
warnings.warn(f'Input type into Linear4bit is torch.float16, but bnb_4bit_compute_type=torch.float32 (default). This will lead to slow inference or training speed.')
warnings.filterwarnings('ignore',message='.*inference or training')
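The two lines above pair a slow-path warning with a filter that silences it. A minimal sketch of avoiding the warning at the source instead, by giving the layer an explicit compute dtype (the layer sizes here are placeholders):

```python
import torch
import bitsandbytes as bnb

# Match compute_dtype to the float16 inputs so the default-float32
# slow-path warning is never emitted.
layer = bnb.nn.Linear4bit(768, 768, compute_dtype=torch.float16)
```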
By default, the Makefile will look at your `CUDA_HOME` environment variable to find your CUDA version for compiling the library. If this path is not set, it is inferred from the path of your `nvcc` compiler.
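To check which toolkit the build will pick up, you can inspect both sources, for example:

```bash
echo $CUDA_HOME   # explicit override, if set
which nvcc        # otherwise inferred from this path, e.g. /usr/local/cuda-11.7/bin/nvcc
```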