Unverified Commit 41faf4e3 authored by Tim Dettmers, committed by GitHub

Merge branch 'main' into main

parents fea5bc7b 095f7a56
name: "\U0001F41B Bug Report"
description: Submit a bug report to help us improve bitsandbytes
body:
- type: textarea
id: system-info
attributes:
label: System Info
description: Please share your relevant system information with us
placeholder: platform, python version, hardware, ...
validations:
required: true
- type: textarea
id: reproduction
validations:
required: true
attributes:
label: Reproduction
description: |
Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet.
Please provide the simplest possible reproducer so that we can quickly fix the issue.
placeholder: |
Reproducer:
- type: textarea
id: expected-behavior
validations:
required: true
attributes:
label: Expected behavior
description: "A clear and concise description of what you would expect to happen."
\ No newline at end of file
name: "\U0001F680 Feature request"
description: Submit a proposal/request for a new feature
labels: [ "feature" ]
body:
- type: textarea
id: feature-request
validations:
required: true
attributes:
label: Feature request
description: |
A clear and concise description of the feature proposal.
- type: textarea
id: motivation
validations:
required: true
attributes:
label: Motivation
description: |
Please outline the motivation for the proposal. Is your feature request related to a problem?
- type: textarea
id: contribution
validations:
required: true
attributes:
label: Your contribution
description: |
Is there any way that you could help, e.g. by submitting a PR?
\ No newline at end of file
name: Stale Bot
on:
schedule:
- cron: "0 15 * * *"
jobs:
close_stale_issues:
name: Close Stale Issues
if: github.repository == 'TimDettmers/bitsandbytes'
runs-on: ubuntu-latest
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
steps:
- uses: actions/checkout@v3
- name: Setup Python
uses: actions/setup-python@v4
with:
python-version: 3.8
- name: Install requirements
run: |
pip install PyGithub
- name: Close stale issues
run: |
python scripts/stale.py
\ No newline at end of file
...@@ -133,3 +133,4 @@ dmypy.json
dependencies
cuda_build
.vscode/*
[style]
ALIGN_CLOSING_BRACKET_WITH_VISUAL_INDENT = True
ALLOW_MULTILINE_LAMBDAS = True
BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = True
COLUMN_LIMIT = 88
COALESCE_BRACKETS = True
SPACE_BETWEEN_ENDING_COMMA_AND_CLOSING_BRACKET = True
SPACES_BEFORE_COMMENT = 2
SPLIT_BEFORE_BITWISE_OPERATOR = True
SPLIT_BEFORE_FIRST_ARGUMENT = True
SPLIT_BEFORE_LOGICAL_OPERATOR = True
SPLIT_BEFORE_NAMED_ASSIGNS = True
SPLIT_COMPLEX_COMPREHENSION = True
\ No newline at end of file
...@@ -311,7 +311,19 @@ User experience:
Performance:
- improved 4-bit inference performance for A100 GPUs. This degraded performance for A40/RTX3090 and RTX 4090 GPUs slightly.

### 0.41.1

Bug fixes:
- Fixed bugs in dynamic exponent data type creation. Thank you @RossM, @KohakuBlueleaf, @ArrowM #659 #227 #262 #152

### 0.41.2

Feature:
- 4-bit serialization now supported. This enables 4-bit load/store (see the sketch below). Thank you @poedator #753

### 0.41.3

Bug fixes:
- Fixed an issue where 4-bit serialization would fail for layers without double quantization #868. Thank you, @poedator
- Fixed an issue where calling .to() or .cuda() on a 4-bit layer twice would result in an error #867. Thank you, @jph00
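To make the serialization feature above concrete, here is a minimal sketch (assuming a CUDA GPU is available; the layer size and names are illustrative only) of the extra entries a `Linear4bit` layer now writes into its `state_dict`:

```python
# Minimal sketch of the new 4-bit load/store, assuming a CUDA GPU is available.
import bitsandbytes as bnb

layer = bnb.nn.Linear4bit(64, 64, quant_type='nf4').cuda()  # weights are quantized when moved to the GPU
sd = layer.state_dict()

# Besides 'weight' and 'bias', the state_dict now carries the packed quantization state,
# e.g. 'weight.absmax', 'weight.quant_map' and 'weight.quant_state.bitsandbytes__nf4'.
print([k for k in sd if k.startswith('weight.')])
```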
...@@ -38,7 +38,7 @@ python setup.py install
```python
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(
  'decapoda-research/llama-7b-hf',
  device_map='auto',
  load_in_8bit=True,
  max_memory=f'{int(torch.cuda.mem_get_info()[0]/1024**3)-2}GB')
...@@ -146,13 +146,13 @@ For upcoming features and changes and full history see [Patch Notes](CHANGELOG.m
To compile from source, you need an installation of CUDA. If `nvcc` is not installed, you can install the CUDA Toolkit with nvcc through the following commands.
```bash
wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh
# Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH
# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122}
# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True
# For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc
bash install_cuda.sh 117 ~/local 1
```
To use a specific CUDA version just for a single compile run, you can set the variable `CUDA_HOME`, for example the following command compiles `libbitsandbytes_cuda117.so` using compiler flags for cuda11x with the cuda version at `~/local/cuda-11.7`:
......
...@@ -496,7 +496,7 @@ class MatMul4Bit(torch.autograd.Function): ...@@ -496,7 +496,7 @@ class MatMul4Bit(torch.autograd.Function):
# backward is mostly the same, but adds one extra clause (see "elif state.CxB is not None") # backward is mostly the same, but adds one extra clause (see "elif state.CxB is not None")
@staticmethod @staticmethod
def forward(ctx, A, B, out=None, bias=None, state=None): def forward(ctx, A, B, out=None, bias=None, quant_state: F.QuantState = None):
# default of pytorch behavior if inputs are empty # default of pytorch behavior if inputs are empty
ctx.is_empty = False ctx.is_empty = False
if prod(A.shape) == 0: if prod(A.shape) == 0:
...@@ -504,7 +504,7 @@ class MatMul4Bit(torch.autograd.Function): ...@@ -504,7 +504,7 @@ class MatMul4Bit(torch.autograd.Function):
ctx.A = A ctx.A = A
ctx.B = B ctx.B = B
ctx.bias = bias ctx.bias = bias
B_shape = state[1] B_shape = quant_state.shape
if A.shape[-1] == B_shape[0]: if A.shape[-1] == B_shape[0]:
return torch.empty(A.shape[:-1] + B_shape[1:], dtype=A.dtype, device=A.device) return torch.empty(A.shape[:-1] + B_shape[1:], dtype=A.dtype, device=A.device)
else: else:
...@@ -513,10 +513,10 @@ class MatMul4Bit(torch.autograd.Function): ...@@ -513,10 +513,10 @@ class MatMul4Bit(torch.autograd.Function):
# 1. Dequantize # 1. Dequantize
# 2. MatmulnN # 2. MatmulnN
output = torch.nn.functional.linear(A, F.dequantize_4bit(B, state).to(A.dtype).t(), bias) output = torch.nn.functional.linear(A, F.dequantize_4bit(B, quant_state).to(A.dtype).t(), bias)
# 3. Save state # 3. Save state
ctx.state = state ctx.state = quant_state
ctx.dtype_A, ctx.dtype_B, ctx.dtype_bias = A.dtype, B.dtype, None if bias is None else bias.dtype ctx.dtype_A, ctx.dtype_B, ctx.dtype_bias = A.dtype, B.dtype, None if bias is None else bias.dtype
if any(ctx.needs_input_grad[:2]): if any(ctx.needs_input_grad[:2]):
...@@ -534,7 +534,6 @@ class MatMul4Bit(torch.autograd.Function): ...@@ -534,7 +534,6 @@ class MatMul4Bit(torch.autograd.Function):
req_gradA, _, _, req_gradBias, _= ctx.needs_input_grad req_gradA, _, _, req_gradBias, _= ctx.needs_input_grad
A, B = ctx.tensors A, B = ctx.tensors
state = ctx.state
grad_A, grad_B, grad_bias = None, None, None grad_A, grad_B, grad_bias = None, None, None
...@@ -563,12 +562,11 @@ def matmul( ...@@ -563,12 +562,11 @@ def matmul(
return MatMul8bitLt.apply(A, B, out, bias, state) return MatMul8bitLt.apply(A, B, out, bias, state)
def matmul_4bit(A: tensor, B: tensor, quant_state: List, out: tensor = None, bias=None): def matmul_4bit(A: tensor, B: tensor, quant_state: F.QuantState, out: tensor = None, bias=None):
assert quant_state is not None assert quant_state is not None
if A.numel() == A.shape[-1] and A.requires_grad == False: if A.numel() == A.shape[-1] and A.requires_grad == False:
absmax, shape, dtype, blocksize, compressed_stats, quant_type, data_type = quant_state if A.shape[-1] % quant_state.blocksize != 0:
if A.shape[-1] % blocksize != 0: warn(f'Some matrices hidden dimension is not a multiple of {quant_state.blocksize} and efficient inference kernels are not supported for these (slow). Matrix input size found: {A.shape}')
warn(f'Some matrices hidden dimension is not a multiple of {blocksize} and efficient inference kernels are not supported for these (slow). Matrix input size found: {A.shape}')
return MatMul4Bit.apply(A, B, out, bias, quant_state) return MatMul4Bit.apply(A, B, out, bias, quant_state)
else: else:
out = F.gemv_4bit(A, B.t(), out, state=quant_state) out = F.gemv_4bit(A, B.t(), out, state=quant_state)
......
...@@ -8,6 +8,7 @@ def to_be_ignored(env_var: str, value: str) -> bool:
"OLDPWD",
"SSH_AUTH_SOCK",  # SSH stuff, therefore unrelated
"SSH_TTY",
"GOOGLE_VM_CONFIG_LOCK_FILE",  # on GCP setups, requires elevated permissions, causing problems in Jupyter notebooks
"HOME",  # Linux shell default
"TMUX",  # Terminal Multiplexer
"XDG_DATA_DIRS",  # XDG: Desktop environment stuff
......
...@@ -67,6 +67,7 @@ class CUDASetup:
self.add_log_entry('CUDA SETUP: Solution 2a): Download CUDA install script: wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/cuda_install.sh')
self.add_log_entry('CUDA SETUP: Solution 2b): Install desired CUDA version to desired location. The syntax is bash cuda_install.sh CUDA_VERSION PATH_TO_INSTALL_INTO.')
self.add_log_entry('CUDA SETUP: Solution 2b): For example, "bash cuda_install.sh 113 ~/local/" will download CUDA 11.3 and install into the folder ~/local')
return
make_cmd = f'CUDA_VERSION={self.cuda_version_string}'
......
...@@ -13,8 +13,9 @@ from scipy.stats import norm ...@@ -13,8 +13,9 @@ from scipy.stats import norm
import numpy as np import numpy as np
from functools import reduce # Required in Python 3 from functools import reduce # Required in Python 3
from typing import Tuple from typing import Tuple, Any, Dict
from torch import Tensor from torch import Tensor
from bitsandbytes.utils import pack_dict_to_tensor, unpack_tensor_to_dict
from .cextension import COMPILED_WITH_CUDA, lib from .cextension import COMPILED_WITH_CUDA, lib
...@@ -567,6 +568,125 @@ def estimate_quantiles(A: Tensor, out: Tensor = None, offset: float = 1 / 512, n ...@@ -567,6 +568,125 @@ def estimate_quantiles(A: Tensor, out: Tensor = None, offset: float = 1 / 512, n
return out return out
class QuantState:
"""container for quantization state components to work with Params4bit and similar clases"""
valid_quant_types = ('fp4', 'nf4')
valid_qs_type_keys = [f"bitsandbytes__{x}" for x in valid_quant_types]
valid_qs_keys = ['absmax', 'quant_map', 'nested_absmax', 'nested_quant_map', 'quant_state', 'quant_type',
'blocksize', 'dtype', 'shape', 'nested_blocksize', 'nested_dtype', 'nested_offset']
def __init__(self, absmax, shape=None, code=None, blocksize=None, quant_type=None, dtype=None, offset=None, state2=None):
self.absmax = absmax
self.shape = shape
self.code = code
self.dtype = dtype
self.blocksize = blocksize
self.quant_type = quant_type
self.offset = offset
self.state2 = state2
self.nested = state2 is not None
def __getitem__(self, idx):
"""
ensures compatibility with the older quant state scheme based on nested lists.
assumes the following layout:
state = [qabsmax, input_shape, A.dtype, blocksize, [offset, state2], quant_type]
state2 = [absmax, input_shape, A.dtype, blocksize, None, quant_type]
"""
if self.nested:
list_repr = [self.absmax, self.shape, self.dtype, self.blocksize, [self.offset, self.state2], self.quant_type]
else:
list_repr = [self.absmax, self.shape, self.dtype, self.blocksize, None, self.quant_type]
return list_repr[idx]
@classmethod
def from_dict(cls, qs_dict: Dict[str, Any], device: torch.device) -> 'QuantState':
"""
unpacks components of state_dict into QuantState
where necessary, convert into strings, torch.dtype, ints, etc.
qs_dict: based on state_dict, with only relevant keys, stripped of prefixes.
item with key `quant_state.bitsandbytes__[nf4/fp4]` may contain minor and non-tensor quant state items.
"""
# unpacking tensor with non-tensor components
qs_key = [k for k, v in qs_dict.items() if "quant_state" in k and isinstance(v, torch.Tensor)]
if not len(qs_key) and 'quant_type' not in qs_dict:
raise ValueError("Expected packed or unpacked quant_state items, found neither")
elif len(qs_key) != 1 or qs_key[0].split(".")[-1] not in cls.valid_qs_type_keys:
raise ValueError(f"There should be exactly one `quant_state` item with ending from {cls.valid_qs_type_keys}.\nDetected {qs_key}.")
# unpacking minor and non-tensor quant state items if necessary
if len(qs_key) == 1:
qs_key = qs_key[0]
qs_dict.update(unpack_tensor_to_dict(qs_dict.pop(qs_key)))
qs_dict = {k.split('.')[-1]: v for k, v in qs_dict.items()} # strip prefixes
assert set(qs_dict.keys()).issubset(cls.valid_qs_keys)
if 'nested_absmax' in qs_dict:
offset = torch.tensor(float(qs_dict['nested_offset'])).to(device)
state2 = cls(
absmax=qs_dict['nested_absmax'].to(device),
blocksize=qs_dict['nested_blocksize'],
code=qs_dict['nested_quant_map'].to(device),
dtype=getattr(torch, qs_dict['nested_dtype']),
)
else:
offset, state2 = None, None
quant_state = cls(
quant_type=qs_dict['quant_type'],
absmax=qs_dict['absmax'].to(device),
blocksize=qs_dict['blocksize'],
code=qs_dict['quant_map'].to(device),
dtype=getattr(torch, qs_dict['dtype']),
shape=torch.Size(qs_dict['shape']) if qs_dict['shape'] is not None else None,
offset=offset,
state2=state2,
)
return quant_state
def as_dict(self, packed=False):
"""
returns dict of tensors and strings to use in serialization via _save_to_state_dict()
param: packed -- returns dict[str, torch.Tensor] for a state_dict suitable for safetensors saving
"""
qs_dict = {
'quant_type': self.quant_type,
'absmax': self.absmax,
'blocksize': self.blocksize,
'quant_map': self.code,
'dtype': str(self.dtype).strip('torch.'),
'shape': tuple(self.shape),
}
if self.nested:
qs_dict.update({
'nested_absmax': self.state2.absmax,
'nested_blocksize': self.state2.blocksize,
'nested_quant_map': self.state2.code.clone(), # un-shared to avoid restoring it after shared tensors are removed by safetensors
'nested_dtype': str(self.state2.dtype).strip('torch.'),
'nested_offset': self.offset.item(),
})
if not packed:
return qs_dict
# packed format allows serialization of non-tensor components, critical for saving in safetensors format
qs_packed_dict = {k: v for k, v in qs_dict.items() if isinstance(v, torch.Tensor)}
non_tensor_dict = {k: v for k, v in qs_dict.items() if not isinstance(v, torch.Tensor)}
qs_packed_dict["quant_state." + "bitsandbytes__" + self.quant_type] = pack_dict_to_tensor(non_tensor_dict)
return qs_packed_dict
def to(self, device):
# make sure the quantization state is on the right device
self.absmax = self.absmax.to(device)
if self.nested:
self.offset = self.offset.to(device)
self.state2.absmax = self.state2.absmax.to(device)
self.state2.code = self.state2.code.to(device)
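To illustrate how this container is meant to be used, here is a minimal sketch (assuming a CUDA device; tensor sizes are illustrative) that round-trips a `QuantState` through its packed, tensor-only dict form, the same path used by `Linear4bit._save_to_state_dict()` and `Params4bit.from_prequantized()`:

```python
# Sketch: round-trip a QuantState through its packed (tensor-only) dict form.
# Assumes a CUDA device; sizes are illustrative.
import torch
import bitsandbytes.functional as F

W = torch.randn(128, 64, dtype=torch.float16, device='cuda')
W4, state = F.quantize_4bit(W, quant_type='nf4')

packed = state.as_dict(packed=True)                        # dict[str, torch.Tensor], safetensors-friendly
restored = F.QuantState.from_dict(packed, device=torch.device('cuda'))

# Dequantizing with the original and the restored state gives the same result.
assert torch.equal(F.dequantize_4bit(W4, state), F.dequantize_4bit(W4, restored))
```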
def quantize_blockwise(A: Tensor, code: Tensor = None, absmax: Tensor = None, out: Tensor = None, blocksize=4096, nested=False) -> Tensor: def quantize_blockwise(A: Tensor, code: Tensor = None, absmax: Tensor = None, out: Tensor = None, blocksize=4096, nested=False) -> Tensor:
""" """
Quantize tensor A in blocks of size 4096 values. Quantize tensor A in blocks of size 4096 values.
...@@ -633,16 +753,16 @@ def quantize_blockwise(A: Tensor, code: Tensor = None, absmax: Tensor = None, ou ...@@ -633,16 +753,16 @@ def quantize_blockwise(A: Tensor, code: Tensor = None, absmax: Tensor = None, ou
offset = absmax.mean() offset = absmax.mean()
absmax -= offset absmax -= offset
qabsmax, state2 = quantize_blockwise(absmax, blocksize=blocksize, nested=False) qabsmax, state2 = quantize_blockwise(absmax, blocksize=blocksize, nested=False)
state = [qabsmax, code, blocksize, nested, A.dtype, offset, state2] quant_state = QuantState(absmax=qabsmax, code=code, blocksize=blocksize, dtype=A.dtype, offset=offset, state2=state2)
else: else:
state = [absmax, code, blocksize, nested, A.dtype, None, None] quant_state = QuantState(absmax=absmax, code=code, blocksize=blocksize, dtype=A.dtype)
return out, state return out, quant_state
def dequantize_blockwise( def dequantize_blockwise(
A: Tensor, A: Tensor,
quant_state: Tuple[Tensor, Tensor] = None, quant_state: QuantState = None,
absmax: Tensor = None, absmax: Tensor = None,
code: Tensor = None, code: Tensor = None,
out: Tensor = None, out: Tensor = None,
...@@ -659,8 +779,8 @@ def dequantize_blockwise( ...@@ -659,8 +779,8 @@ def dequantize_blockwise(
---------- ----------
A : torch.Tensor A : torch.Tensor
The input 8-bit tensor. The input 8-bit tensor.
quant_state : tuple(torch.Tensor, torch.Tensor) quant_state : QuantState
Tuple of code and absmax values. Object with code, absmax and other quantization state components.
absmax : torch.Tensor absmax : torch.Tensor
The absmax values. The absmax values.
code : torch.Tensor code : torch.Tensor
...@@ -681,36 +801,35 @@ def dequantize_blockwise( ...@@ -681,36 +801,35 @@ def dequantize_blockwise(
code = name2qmap["dynamic"] code = name2qmap["dynamic"]
if quant_state is None: if quant_state is None:
quant_state = (absmax, code, blocksize, False, torch.float32, None, None) quant_state = QuantState(absmax=absmax, code=code, blocksize=blocksize, dtype=torch.float32)
absmax, code, blocksize, nested, dtype, offset, state2 = quant_state absmax = quant_state.absmax
if quant_state.nested:
if nested: absmax = dequantize_blockwise(quant_state.absmax, quant_state.state2)
absmax = dequantize_blockwise(absmax, state2) absmax += quant_state.offset
absmax += offset
if absmax.dtype != torch.float32: absmax = absmax.float() if absmax.dtype != torch.float32: absmax = absmax.float()
if out is None: if out is None:
out = torch.empty(A.shape, dtype=dtype, device=A.device) out = torch.empty(A.shape, dtype=quant_state.dtype, device=A.device)
if A.device.type != 'cpu': if A.device.type != 'cpu':
device = pre_call(A.device) device = pre_call(A.device)
code = code.to(A.device) code = quant_state.code.to(A.device)
if blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]: if quant_state.blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]:
raise ValueError(f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]") raise ValueError(f"The blockwise of {quant_state.blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]")
is_on_gpu([A, absmax, out]) is_on_gpu([A, absmax, out])
if out.dtype == torch.float32: if out.dtype == torch.float32:
lib.cdequantize_blockwise_fp32(get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(A.numel())) lib.cdequantize_blockwise_fp32(get_ptr(quant_state.code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(A.numel()))
elif out.dtype == torch.float16: elif out.dtype == torch.float16:
lib.cdequantize_blockwise_fp16(get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(A.numel())) lib.cdequantize_blockwise_fp16(get_ptr(quant_state.code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(A.numel()))
elif out.dtype == torch.bfloat16: elif out.dtype == torch.bfloat16:
lib.cdequantize_blockwise_bf16(get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(A.numel())) lib.cdequantize_blockwise_bf16(get_ptr(quant_state.code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(A.numel()))
else: else:
raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}") raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
post_call(A.device) post_call(A.device)
else: else:
code = code.cpu() code = quant_state.code.cpu()
lib.cdequantize_blockwise_cpu_fp32(get_ptr(quant_state[1]), get_ptr(A), get_ptr(quant_state[0]), get_ptr(out), ct.c_longlong(blocksize), ct.c_longlong(A.numel())) lib.cdequantize_blockwise_cpu_fp32(get_ptr(code), get_ptr(A), get_ptr(quant_state.absmax), get_ptr(out), ct.c_longlong(quant_state.blocksize), ct.c_longlong(A.numel()))
return out return out
...@@ -765,7 +884,6 @@ def get_4bit_type(typename, device=None, blocksize=64): ...@@ -765,7 +884,6 @@ def get_4bit_type(typename, device=None, blocksize=64):
return data.to(device) return data.to(device)
def quantize_fp4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False): def quantize_fp4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False):
return quantize_4bit(A, absmax, out, blocksize, compress_statistics, 'fp4') return quantize_4bit(A, absmax, out, blocksize, compress_statistics, 'fp4')
...@@ -839,26 +957,26 @@ def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksiz ...@@ -839,26 +957,26 @@ def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksiz
raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}") raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
post_call(A.device) post_call(A.device)
datatype = get_4bit_type(quant_type, device=A.device) code = get_4bit_type(quant_type, device=A.device)
if compress_statistics: if compress_statistics:
offset = absmax.mean() offset = absmax.mean()
absmax -= offset absmax -= offset
qabsmax, state2 = quantize_blockwise(absmax, blocksize=256) qabsmax, state2 = quantize_blockwise(absmax, blocksize=256)
del absmax del absmax
state = [qabsmax, input_shape, A.dtype, blocksize, [offset, state2], quant_type, datatype] state = QuantState(absmax=qabsmax, shape=input_shape, dtype=A.dtype, blocksize=blocksize, code=code, quant_type=quant_type, offset=offset, state2=state2)
else: else:
state = [absmax, input_shape, A.dtype, blocksize, None, quant_type, datatype] state = QuantState(absmax=absmax, shape=input_shape, dtype=A.dtype, blocksize=blocksize, code=code, quant_type=quant_type, )
return out, state return out, state
def dequantize_fp4(A: Tensor, quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64) -> Tensor: def dequantize_fp4(A: Tensor, quant_state: QuantState = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64) -> Tensor:
return dequantize_4bit(A, quant_state, absmax, out, blocksize, 'fp4') return dequantize_4bit(A, quant_state, absmax, out, blocksize, 'fp4')
def dequantize_nf4(A: Tensor, quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64) -> Tensor: def dequantize_nf4(A: Tensor, quant_state: QuantState = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64) -> Tensor:
return dequantize_4bit(A, quant_state, absmax, out, blocksize, 'nf4') return dequantize_4bit(A, quant_state, absmax, out, blocksize, 'nf4')
def dequantize_4bit(A: Tensor,quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64, quant_type='fp4') -> Tensor: def dequantize_4bit(A: Tensor, quant_state: QuantState = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64, quant_type='fp4') -> Tensor:
""" """
Dequantizes FP4 blockwise quantized values. Dequantizes FP4 blockwise quantized values.
...@@ -868,8 +986,8 @@ def dequantize_4bit(A: Tensor,quant_state: Tuple[Tensor, Tensor] = None, absmax: ...@@ -868,8 +986,8 @@ def dequantize_4bit(A: Tensor,quant_state: Tuple[Tensor, Tensor] = None, absmax:
---------- ----------
A : torch.Tensor A : torch.Tensor
The input 8-bit tensor (packed 4-bit values). The input 8-bit tensor (packed 4-bit values).
quant_state : tuple(torch.Tensor, torch.Size, torch.dtype) quant_state : QuantState
Tuple of absmax values, original tensor shape and original dtype. object with quantisation stats, incl. absmax values, original tensor shape and original dtype.
absmax : torch.Tensor absmax : torch.Tensor
The absmax values. The absmax values.
out : torch.Tensor out : torch.Tensor
...@@ -892,41 +1010,40 @@ def dequantize_4bit(A: Tensor,quant_state: Tuple[Tensor, Tensor] = None, absmax: ...@@ -892,41 +1010,40 @@ def dequantize_4bit(A: Tensor,quant_state: Tuple[Tensor, Tensor] = None, absmax:
if quant_state is None: if quant_state is None:
assert absmax is not None and out is not None assert absmax is not None and out is not None
shape = out.shape
dtype = out.dtype quant_state = QuantState(absmax=absmax, shape=out.shape, dtype=out.dtype, blocksize=blocksize, quant_type=quant_type)
else: else:
absmax, shape, dtype, blocksize, compressed_stats, quant_type, data_type = quant_state absmax = quant_state.absmax
if compressed_stats is not None: if quant_state.nested:
offset, state2 = compressed_stats absmax = dequantize_blockwise(quant_state.absmax, quant_state.state2)
absmax = dequantize_blockwise(absmax, state2) absmax += quant_state.offset
absmax += offset
if absmax.dtype != torch.float32: absmax = absmax.float() if absmax.dtype != torch.float32: absmax = absmax.float()
if out is None: if out is None:
out = torch.empty(shape, dtype=dtype, device=A.device) out = torch.empty(quant_state.shape, dtype=quant_state.dtype, device=A.device)
n = out.numel() n = out.numel()
device = pre_call(A.device) device = pre_call(A.device)
is_on_gpu([A, absmax, out]) is_on_gpu([A, absmax, out])
if out.dtype == torch.float32: if out.dtype == torch.float32:
if quant_type == 'fp4': if quant_state.quant_type == 'fp4':
lib.cdequantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n)) lib.cdequantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n))
else: else:
lib.cdequantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n)) lib.cdequantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n))
elif out.dtype == torch.float16: elif out.dtype == torch.float16:
if quant_type == 'fp4': if quant_state.quant_type == 'fp4':
lib.cdequantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n)) lib.cdequantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n))
else: else:
lib.cdequantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n)) lib.cdequantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n))
elif out.dtype == torch.bfloat16: elif out.dtype == torch.bfloat16:
if quant_type == 'fp4': if quant_state.quant_type == 'fp4':
lib.cdequantize_blockwise_bf16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n)) lib.cdequantize_blockwise_bf16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n))
else: else:
lib.cdequantize_blockwise_bf16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n)) lib.cdequantize_blockwise_bf16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n))
else: else:
raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}") raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
post_call(A.device) post_call(A.device)
...@@ -952,22 +1069,22 @@ def quantize(A: Tensor, code: Tensor = None, out: Tensor = None) -> Tensor: ...@@ -952,22 +1069,22 @@ def quantize(A: Tensor, code: Tensor = None, out: Tensor = None) -> Tensor:
def dequantize( def dequantize(
A: Tensor, A: Tensor,
quant_state: Tuple[Tensor, Tensor] = None, state: Tuple[Tensor, Tensor] = None,
absmax: Tensor = None, absmax: Tensor = None,
code: Tensor = None, code: Tensor = None,
out: Tensor = None, out: Tensor = None,
) -> Tensor: ) -> Tensor:
assert quant_state is not None or absmax is not None assert state is not None or absmax is not None
if code is None and quant_state is None: if code is None and state is None:
if "dynamic" not in name2qmap: if "dynamic" not in name2qmap:
name2qmap["dynamic"] = create_dynamic_map().to(A.device) name2qmap["dynamic"] = create_dynamic_map().to(A.device)
code = name2qmap["dynamic"] code = name2qmap["dynamic"]
code = code.to(A.device) code = code.to(A.device)
if quant_state is None: if state is None:
quant_state = (absmax, code) state = (absmax, code)
out = dequantize_no_absmax(A, quant_state[1], out) out = dequantize_no_absmax(A, state[1], out)
return out * quant_state[0] return out * state[0]
def quantize_no_absmax(A: Tensor, code: Tensor, out: Tensor = None) -> Tensor: def quantize_no_absmax(A: Tensor, code: Tensor, out: Tensor = None) -> Tensor:
...@@ -1482,13 +1599,12 @@ def gemv_4bit( ...@@ -1482,13 +1599,12 @@ def gemv_4bit(
if A.numel() != A.shape[-1]: if A.numel() != A.shape[-1]:
raise ValueError(f'Dimensions of A are invalid. Must be a vector with the leading dimensions of "1", e.g. [1, 1, 2048]') raise ValueError(f'Dimensions of A are invalid. Must be a vector with the leading dimensions of "1", e.g. [1, 1, 2048]')
Bshape = state[1] Bshape = state.shape
bout = Bshape[0] bout = Bshape[0]
absmax, shape, dtype, blocksize, compressed_stats, quant_type, data_type = state absmax = state.absmax
if compressed_stats is not None: if state.nested:
offset, state2 = compressed_stats absmax = dequantize_blockwise(state.absmax, state.state2)
absmax = dequantize_blockwise(absmax, state2) absmax += state.offset
absmax += offset
if out is None: if out is None:
if len(A.shape) == 3: if len(A.shape) == 3:
...@@ -1502,7 +1618,7 @@ def gemv_4bit( ...@@ -1502,7 +1618,7 @@ def gemv_4bit(
lda = Bshape[0] lda = Bshape[0]
ldc = Bshape[0] ldc = Bshape[0]
ldb = (A.shape[-1]+1)//2 ldb = (A.shape[-1]+1)//2
is_on_gpu([B, A, out, absmax, state[-1]]) is_on_gpu([B, A, out, absmax, state.code])
m = ct.c_int32(m) m = ct.c_int32(m)
n = ct.c_int32(n) n = ct.c_int32(n)
k = ct.c_int32(k) k = ct.c_int32(k)
...@@ -1512,11 +1628,11 @@ def gemv_4bit( ...@@ -1512,11 +1628,11 @@ def gemv_4bit(
if B.dtype == torch.uint8: if B.dtype == torch.uint8:
if A.dtype == torch.float16: if A.dtype == torch.float16:
lib.cgemm_4bit_inference_naive_fp16(m, n, k, get_ptr(A), get_ptr(B), get_ptr(absmax), get_ptr(state[-1]), get_ptr(out), lda, ldb, ldc, ct.c_int32(state[3])) lib.cgemm_4bit_inference_naive_fp16(m, n, k, get_ptr(A), get_ptr(B), get_ptr(absmax), get_ptr(state.code), get_ptr(out), lda, ldb, ldc, ct.c_int32(state.blocksize))
elif A.dtype == torch.bfloat16: elif A.dtype == torch.bfloat16:
lib.cgemm_4bit_inference_naive_bf16(m, n, k, get_ptr(A), get_ptr(B), get_ptr(absmax), get_ptr(state[-1]), get_ptr(out), lda, ldb, ldc, ct.c_int32(state[3])) lib.cgemm_4bit_inference_naive_bf16(m, n, k, get_ptr(A), get_ptr(B), get_ptr(absmax), get_ptr(state.code), get_ptr(out), lda, ldb, ldc, ct.c_int32(state.blocksize))
elif A.dtype == torch.float32: elif A.dtype == torch.float32:
lib.cgemm_4bit_inference_naive_fp32(m, n, k, get_ptr(A), get_ptr(B), get_ptr(absmax), get_ptr(state[-1]), get_ptr(out), lda, ldb, ldc, ct.c_int32(state[3])) lib.cgemm_4bit_inference_naive_fp32(m, n, k, get_ptr(A), get_ptr(B), get_ptr(absmax), get_ptr(state.code), get_ptr(out), lda, ldb, ldc, ct.c_int32(state.blocksize))
else: else:
raise NotImplementedError(f'Matmul not implemented for data type {A.dtype}') raise NotImplementedError(f'Matmul not implemented for data type {A.dtype}')
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
# #
# This source code is licensed under the MIT license found in the # This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree. # LICENSE file in the root directory of this source tree.
from typing import Optional, TypeVar, Union, overload from typing import Any, Dict, Optional, TypeVar, Union, overload
import warnings import warnings
import torch import torch
...@@ -10,7 +10,7 @@ import torch.nn.functional as F ...@@ -10,7 +10,7 @@ import torch.nn.functional as F
from torch import Tensor, device, dtype, nn from torch import Tensor, device, dtype, nn
import bitsandbytes as bnb import bitsandbytes as bnb
import bitsandbytes.functional from bitsandbytes.functional import QuantState
from bitsandbytes.autograd._functions import undo_layout, get_tile_inds from bitsandbytes.autograd._functions import undo_layout, get_tile_inds
from bitsandbytes.optim import GlobalOptimManager from bitsandbytes.optim import GlobalOptimManager
from bitsandbytes.utils import OutlierTracer, find_outlier_dims from bitsandbytes.utils import OutlierTracer, find_outlier_dims
...@@ -139,8 +139,10 @@ class Embedding(torch.nn.Embedding): ...@@ -139,8 +139,10 @@ class Embedding(torch.nn.Embedding):
return emb return emb
class Params4bit(torch.nn.Parameter): class Params4bit(torch.nn.Parameter):
def __new__(cls, data=None, requires_grad=True, quant_state=None, blocksize=64, compress_statistics=True, quant_type='fp4'):
def __new__(cls, data: Optional[torch.Tensor] = None, requires_grad=True, quant_state: QuantState = None, blocksize: int = 64, compress_statistics: bool = True, quant_type: str = 'fp4') -> "Params4bit":
if data is None: if data is None:
data = torch.empty(0) data = torch.empty(0)
...@@ -152,6 +154,16 @@ class Params4bit(torch.nn.Parameter): ...@@ -152,6 +154,16 @@ class Params4bit(torch.nn.Parameter):
self.data = data self.data = data
return self return self
@classmethod
def from_prequantized(cls, data: torch.Tensor, quantized_stats: Dict[str, Any], requires_grad: bool = False, device='cuda', **kwargs) -> "Params4bit":
self = torch.Tensor._make_subclass(cls, data.to(device))
self.requires_grad = requires_grad
self.quant_state = QuantState.from_dict(qs_dict=quantized_stats, device=device)
self.blocksize = self.quant_state.blocksize
self.compress_statistics = self.quant_state.nested
self.quant_type = self.quant_state.quant_type
return self
def cuda(self, device): def cuda(self, device):
w = self.data.contiguous().half().cuda(device) w = self.data.contiguous().half().cuda(device)
w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics, quant_type=self.quant_type) w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics, quant_type=self.quant_type)
...@@ -178,33 +190,23 @@ class Params4bit(torch.nn.Parameter): ...@@ -178,33 +190,23 @@ class Params4bit(torch.nn.Parameter):
if (device is not None and device.type == "cuda" and self.data.device.type == "cpu"): if (device is not None and device.type == "cuda" and self.data.device.type == "cpu"):
return self.cuda(device) return self.cuda(device)
else: else:
s = self.quant_state if self.quant_state is not None:
if s is not None: self.quant_state.to(device)
# make sure the quantization state is on the right device
s[0] = s[0].to(device)
if self.compress_statistics:
# TODO: refactor this. This is a nightmare
# for 4-bit:
# state = [qabsmax, input_shape, A.dtype, blocksize, [offset, state2], quant_type]
# state2 = [absmax, input_shape, A.dtype, blocksize, None, quant_type]
#s[-2][0] = s[-2][0].to(device) # offset
#s[-2][1][0] = s[-2][1][0].to(device) # nested absmax
# for 8-bit
s[-3][0] = s[-3][0].to(device) # offset
s[-3][1][0] = s[-3][1][0].to(device) # nested quantiation state statitics
s[-3][1][1] = s[-3][1][1].to(device) # nested quantiation codebook
new_param = Params4bit(super().to(device=device, dtype=dtype, non_blocking=non_blocking), new_param = Params4bit(super().to(device=device, dtype=dtype, non_blocking=non_blocking),
requires_grad=self.requires_grad, quant_state=self.quant_state, requires_grad=self.requires_grad, quant_state=self.quant_state,
blocksize=self.blocksize, compress_statistics=self.compress_statistics, blocksize=self.blocksize, compress_statistics=self.compress_statistics,
quant_type=self.quant_type) quant_type=self.quant_type)
return new_param return new_param
class Linear4bit(nn.Linear): class Linear4bit(nn.Linear):
def __init__(self, input_features, output_features, bias=True, compute_dtype=None, compress_statistics=True, quant_type='fp4',device=None):
def __init__(self, input_features, output_features, bias=True, compute_dtype=None, compress_statistics=True, quant_type='fp4', device=None):
super().__init__(input_features, output_features, bias, device) super().__init__(input_features, output_features, bias, device)
self.weight = Params4bit(self.weight.data, requires_grad=False, compress_statistics=compress_statistics, quant_type=quant_type) self.weight = Params4bit(self.weight.data, requires_grad=False, compress_statistics=compress_statistics, quant_type=quant_type)
# self.persistent_buffers = [] # TODO consider as way to save quant state
self.compute_dtype = compute_dtype self.compute_dtype = compute_dtype
self.compute_type_is_set = False self.compute_type_is_set = False
...@@ -224,10 +226,16 @@ class Linear4bit(nn.Linear): ...@@ -224,10 +226,16 @@ class Linear4bit(nn.Linear):
warnings.warn(f'Input type into Linear4bit is torch.float16, but bnb_4bit_compute_type=torch.float32 (default). This will lead to slow inference or training speed.') warnings.warn(f'Input type into Linear4bit is torch.float16, but bnb_4bit_compute_type=torch.float32 (default). This will lead to slow inference or training speed.')
warnings.filterwarnings('ignore', message='.*inference or training') warnings.filterwarnings('ignore', message='.*inference or training')
def _save_to_state_dict(self, destination, prefix, keep_vars):
"""
save weight and bias,
then fill state_dict with components of quant_state
"""
super()._save_to_state_dict(destination, prefix, keep_vars) # saving weight and bias
if getattr(self.weight, "quant_state", None) is not None:
for k, v in self.weight.quant_state.as_dict(packed=True).items():
destination[prefix + "weight." + k] = v if keep_vars else v.detach()
def forward(self, x: torch.Tensor): def forward(self, x: torch.Tensor):
# weights are cast automatically as Int8Params, but the bias has to be cast manually # weights are cast automatically as Int8Params, but the bias has to be cast manually
...@@ -251,10 +259,12 @@ class Linear4bit(nn.Linear): ...@@ -251,10 +259,12 @@ class Linear4bit(nn.Linear):
return out return out
class LinearFP4(Linear4bit): class LinearFP4(Linear4bit):
def __init__(self, input_features, output_features, bias=True, compute_dtype=None, compress_statistics=True,device=None): def __init__(self, input_features, output_features, bias=True, compute_dtype=None, compress_statistics=True, device=None):
super().__init__(input_features, output_features, bias, compute_dtype, compress_statistics, 'fp4', device) super().__init__(input_features, output_features, bias, compute_dtype, compress_statistics, 'fp4', device)
class LinearNF4(Linear4bit): class LinearNF4(Linear4bit):
''' Implements the NF4 data type. ''' Implements the NF4 data type.
...@@ -266,11 +276,10 @@ class LinearNF4(Linear4bit): ...@@ -266,11 +276,10 @@ class LinearNF4(Linear4bit):
Implementation of the NF4 data type in bitsandbytes can be found in the `create_normal_map` function in Implementation of the NF4 data type in bitsandbytes can be found in the `create_normal_map` function in
the `functional.py` file: https://github.com/TimDettmers/bitsandbytes/blob/main/bitsandbytes/functional.py#L236. the `functional.py` file: https://github.com/TimDettmers/bitsandbytes/blob/main/bitsandbytes/functional.py#L236.
''' '''
def __init__(self, input_features, output_features, bias=True, compute_dtype=None, compress_statistics=True,device=None): def __init__(self, input_features, output_features, bias=True, compute_dtype=None, compress_statistics=True, device=None):
super().__init__(input_features, output_features, bias, compute_dtype, compress_statistics, 'nf4', device) super().__init__(input_features, output_features, bias, compute_dtype, compress_statistics, 'nf4', device)
class Int8Params(torch.nn.Parameter): class Int8Params(torch.nn.Parameter):
def __new__( def __new__(
cls, cls,
......
import json
import shlex import shlex
import subprocess import subprocess
import torch import torch
...@@ -99,45 +100,6 @@ def find_outlier_dims(weight, reduction_dim=0, zscore=4.0, topk=None, rdm=False) ...@@ -99,45 +100,6 @@ def find_outlier_dims(weight, reduction_dim=0, zscore=4.0, topk=None, rdm=False)
return idx return idx
def replace_linear(model, linear_replacement, skip_modules=["lm_head"], copy_weights=False, post_processing_function=None):
"""
Replace linear modules with a new Linear module.
Parameters:
model (`torch.nn.Module`):
Input model or `torch.nn.Module` as the function is run recursively.
linear_replacement (`torch.nn.Module`):
The linear module that replaces the old one. Only expects standard arguments.
If other arguments need to be passed, use a lambda.
skip_modules (`List[str]`, *optional*, defaults to `lm_head`):
List of modules names not to convert. Defaults to `lm_head`.
copy_weights (`bool`):
Copy the weights from the old linear module to the new one
post_processing_fun_name (`str`):
A function name of the replacement linear class that is called
after processing.
"""
for name, module in model.named_children():
if len(list(module.children())) > 0:
replace_linear(module, linear_replacement, skip_modules, copy_weights, post_processing_function)
if isinstance(module, torch.nn.Linear) and name not in skip_modules:
old_module = model._modules[name]
model._modules[name] = linear_replacement(
module.in_features,
module.out_features,
module.bias is not None,
)
if copy_weights:
model._modules[name].weight = old_module.weight
model._modules[name].bias = old_module.bias
if post_processing_function is not None:
func = getattr(module, post_processing_function, None)
if func is not None: func(module)
return model
def execute_and_return(command_string: str) -> Tuple[str, str]: def execute_and_return(command_string: str) -> Tuple[str, str]:
def _decode(subprocess_err_out_tuple): def _decode(subprocess_err_out_tuple):
...@@ -197,3 +159,36 @@ def replace_linear(model, linear_replacement, skip_modules=["lm_head"], copy_wei ...@@ -197,3 +159,36 @@ def replace_linear(model, linear_replacement, skip_modules=["lm_head"], copy_wei
if func is not None: func(module) if func is not None: func(module)
return model return model
def pack_dict_to_tensor(source_dict):
"""
Pack a dictionary into a torch tensor for storing quant_state items in state_dict.
Parameters:
- source_dict: The dictionary to be packed.
Returns:
A torch tensor containing the packed data.
"""
json_str = json.dumps(source_dict)
json_bytes = json_str.encode('utf-8')
tensor_data = torch.tensor(list(json_bytes), dtype=torch.uint8)
return tensor_data
def unpack_tensor_to_dict(tensor_data):
"""
Unpack a torch tensor into a Python dictionary.
Parameters:
- tensor_data: The torch tensor containing the packed data.
Returns:
A Python dictionary containing the unpacked data.
"""
json_bytes = bytes(tensor_data.numpy())
json_str = json_bytes.decode('utf-8')
unpacked_dict = json.loads(json_str)
return unpacked_dict
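A short usage sketch of these helpers (the metadata below is illustrative): small non-tensor metadata is carried as a `torch.uint8` tensor of UTF-8 encoded JSON, which keeps tensor-only formats such as safetensors happy.

```python
from bitsandbytes.utils import pack_dict_to_tensor, unpack_tensor_to_dict

# Illustrative metadata; any JSON-serializable dict works.
meta = {"quant_type": "nf4", "blocksize": 64, "dtype": "float16", "shape": [128, 64]}
packed = pack_dict_to_tensor(meta)          # 1-D torch.uint8 tensor holding the UTF-8 encoded JSON
assert unpack_tensor_to_dict(packed) == meta
```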
...@@ -9,13 +9,13 @@ To run these steps you will need to have the nvcc compiler installed that comes
You can install CUDA locally without sudo by following these steps:
```bash
wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh
# Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH
# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121}
# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True
# For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc
bash install_cuda.sh 117 ~/local 1
```
By default, the Makefile will look at your `CUDA_HOME` environmental variable to find your CUDA version for compiling the library. If this path is not set it is inferred from the path of your `nvcc` compiler.
......
name: bnb
channels:
  - pytorch
  - nvidia
  - conda-forge
dependencies:
  # Base
  - conda-forge::python=3.8
  - pytorch::pytorch=>2.1
  - pytorch::pytorch-cuda=11.8
  - nvidia::cuda=11.8
  # Libraries
  - conda-forge::accelerate
  - conda-forge::einops
  - conda-forge::scipy
  - conda-forge::transformers
  # Development
  - conda-forge::pytest
  - conda-forge::build  # build Python packages
  - conda-forge::twine  # upload Python packages
  - conda-forge::pytest-cases  # more readable and composable parametrized tests
  - conda-forge::ipython  # better interactive shell
  - conda-forge::debugpy  # debugger-support for VSCode
  - conda-forge::ruff  # linting
  - conda-forge::yapf  # code formatting
  - conda-forge::monkeytype  # infer type annotations
  - conda-forge::rich  # better, colored tracebacks, etc
  - conda-forge::pytest-sugar  # better pytest output

## ENV CREATION - steps to reproduce:
# mamba env remove -n bnb
# mamba create -y -n bnb python=3.8  # creating an empty env bypasses conda
# # and leads to much faster env resolution in the next step https://github.com/mamba-org/mamba/issues/633#issuecomment-812272143
# mamba env update -n bnb -f environment.yml
# mamba activate bnb

## PIP dependencies (install *after* ENV CREATION):
# pip install --no-cache-dir --no-deps lion_pytorch triton peft
## NOTE: conda peft is not up to date, so we install from pip
# cd pip install -e .  ## installs bitsandbytes as editable development install from within repo root dir

## ENV UPDATE:
# # add new packages to environment.yml, then:
# mamba env update -n bnb -f environment.yml
\ No newline at end of file
...@@ -15,13 +15,13 @@ where XX.X is the CUDA version number.
You can also install the CUDA version that you need locally with a script provided by bitsandbytes as follows:
```bash
wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh
# Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH
# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122}
# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True
# For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc
bash install_cuda.sh 117 ~/local 1
```
## Setting the environmental variables BNB_CUDA_VERSION, and LD_LIBRARY_PATH
......
import os
import sys
import subprocess
from urllib.request import urlretrieve
cuda_versions = {
"92": "https://developer.nvidia.com/compute/cuda/9.2/Prod2/local_installers/cuda_9.2.148_396.37_linux",
"100": "https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/cuda_10.0.130_410.48_linux",
"101": "https://developer.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda_10.1.105_418.39_linux.run",
"102": "https://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/cuda_10.2.89_440.33.01_linux.run",
"110": "https://developer.download.nvidia.com/compute/cuda/11.0.3/local_installers/cuda_11.0.3_450.51.06_linux.run",
"111": "https://developer.download.nvidia.com/compute/cuda/11.1.1/local_installers/cuda_11.1.1_455.32.00_linux.run",
"112": "https://developer.download.nvidia.com/compute/cuda/11.2.2/local_installers/cuda_11.2.2_460.32.03_linux.run",
"113": "https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.19.01_linux.run",
"114": "https://developer.download.nvidia.com/compute/cuda/11.4.4/local_installers/cuda_11.4.4_470.82.01_linux.run",
"115": "https://developer.download.nvidia.com/compute/cuda/11.5.2/local_installers/cuda_11.5.2_495.29.05_linux.run",
"116": "https://developer.download.nvidia.com/compute/cuda/11.6.2/local_installers/cuda_11.6.2_510.47.03_linux.run",
"117": "https://developer.download.nvidia.com/compute/cuda/11.7.0/local_installers/cuda_11.7.0_515.43.04_linux.run",
"118": "https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run",
"120": "https://developer.download.nvidia.com/compute/cuda/12.0.0/local_installers/cuda_12.0.0_525.60.13_linux.run",
"121": "https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run",
"122": "https://developer.download.nvidia.com/compute/cuda/12.2.0/local_installers/cuda_12.2.0_535.54.03_linux.run",
"123": "https://developer.download.nvidia.com/compute/cuda/12.3.1/local_installers/cuda_12.3.1_545.23.08_linux.run",
}
def install_cuda(version, base_path, download_path):
formatted_version = f"{version[:-1]}.{version[-1]}"
folder = f"cuda-{formatted_version}"
install_path = os.path.join(base_path, folder)
if os.path.exists(install_path):
print(f"Removing existing CUDA version {version} at {install_path}...")
subprocess.run(["rm", "-rf", install_path], check=True)
url = cuda_versions[version]
filename = url.split('/')[-1]
filepath = os.path.join(download_path, filename)
if not os.path.exists(filepath):
print(f"Downloading CUDA version {version} from {url}...")
urlretrieve(url, filepath)
else:
print(f"Installer for CUDA version {version} already downloaded.")
# Make the installer executable
subprocess.run(["chmod", "+x", filepath], check=True)
# Install CUDA
print(f"Installing CUDA version {version}...")
install_command = [
"bash", filepath,
"--no-drm", "--no-man-page", "--override",
"--toolkitpath=" + install_path, "--toolkit", "--silent"
]
print(f"Running command: {' '.join(install_command)}")
try:
subprocess.run(install_command, check=True)
except subprocess.CalledProcessError as e:
print(f"Installation failed for CUDA version {version}: {e}")
return
finally:
# Delete the installer file
os.remove(filepath)
print(f"CUDA version {version} installed at {install_path}")
def main():
user_base_path = os.path.expanduser("~/cuda")
system_base_path = "/usr/local/cuda"
base_path = user_base_path # default to user-specific installation
download_path = "/tmp" # default download path
if len(sys.argv) < 2:
print("Usage: python install_cuda.py <version/all> [user/system] [download_path]")
sys.exit(1)
version = sys.argv[1]
if len(sys.argv) > 2:
base_path = system_base_path if sys.argv[2] == "system" else user_base_path
if len(sys.argv) > 3:
download_path = sys.argv[3]
if not os.path.exists(base_path):
os.makedirs(base_path)
if not os.path.exists(download_path):
os.makedirs(download_path)
# Install CUDA version(s)
if version == "all":
for ver in cuda_versions.keys():
install_cuda(ver, base_path, download_path)
elif version in cuda_versions:
install_cuda(version, base_path, download_path)
else:
print(f"Invalid CUDA version: {version}. Available versions are: {', '.join(cuda_versions.keys())}")
sys.exit(1)
if __name__ == "__main__":
main()
\ No newline at end of file
...@@ -14,6 +14,7 @@ URL118=https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installer ...@@ -14,6 +14,7 @@ URL118=https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installer
URL120=https://developer.download.nvidia.com/compute/cuda/12.0.0/local_installers/cuda_12.0.0_525.60.13_linux.run URL120=https://developer.download.nvidia.com/compute/cuda/12.0.0/local_installers/cuda_12.0.0_525.60.13_linux.run
URL121=https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run URL121=https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run
URL122=https://developer.download.nvidia.com/compute/cuda/12.2.0/local_installers/cuda_12.2.0_535.54.03_linux.run URL122=https://developer.download.nvidia.com/compute/cuda/12.2.0/local_installers/cuda_12.2.0_535.54.03_linux.run
URL123=https://developer.download.nvidia.com/compute/cuda/12.3.1/local_installers/cuda_12.3.1_545.23.08_linux.run
CUDA_VERSION=$1 CUDA_VERSION=$1
...@@ -69,11 +70,14 @@ if [[ -n "$CUDA_VERSION" ]]; then ...@@ -69,11 +70,14 @@ if [[ -n "$CUDA_VERSION" ]]; then
elif [[ "$CUDA_VERSION" -eq "122" ]]; then elif [[ "$CUDA_VERSION" -eq "122" ]]; then
URL=$URL122 URL=$URL122
FOLDER=cuda-12.2 FOLDER=cuda-12.2
elif [[ "$CUDA_VERSION" -eq "123" ]]; then
URL=$URL123
FOLDER=cuda-12.3
else else
echo "argument error: No cuda version passed as input. Choose among versions 92 to 121" echo "argument error: No cuda version passed as input. Choose among versions 92 to 123"
fi fi
else else
echo "argument error: No cuda version passed as input. Choose among versions 92 to 112" echo "argument error: No cuda version passed as input. Choose among versions 92 to 123"
fi fi
FILE=$(basename $URL) FILE=$(basename $URL)
......
...@@ -4,3 +4,34 @@ requires = [
"wheel"
]
build-backend = "setuptools.build_meta"
[tool.ruff]
src = [
"bitsandbytes",
"tests",
"benchmarking"
]
fix = true
select = [
"A", # prevent using keywords that clobber python builtins
"B", # bugbear: security warnings
"E", # pycodestyle
"F", # pyflakes
"I", # isort
"ISC", # implicit string concatenation
"UP", # alert you when better syntax is available in your python version
"RUF", # the ruff developer's own rules
]
target-version = "py38"
ignore = [
"E712", # Allow using if x == False, as it's not always equivalent to if x.
"E501", # Supress line-too-long warnings: trust yapf's judgement on this one.
"F401",
]
ignore-init-module-imports = true # allow to expose in __init__.py via imports
[tool.ruff.isort]
combine-as-imports = true
detect-same-package = true
force-sort-within-sections = true
known-first-party = ["bitsandbytes"]
\ No newline at end of file
[pytest]
addopts = -rP
; --cov=bitsandbytes
; # contexts: record which test ran which line; can be seen in html coverage report
; --cov-context=test
; --cov-report html
log_cli = True
log_cli_level = INFO
log_file = logs/pytest.log
\ No newline at end of file