Reformat with ruff-format

5a4263f4 · Ruff · Aarni Koskela · 02e30ca6 · 5a4263f4 · 5a4263f4
Commit 5a4263f4 authored Feb 24, 2024 by Ruff Committed by Aarni Koskela Mar 13, 2024
19 changed files
--- a/bitsandbytes/triton/int8_matmul_mixed_dequantize.py
+++ b/bitsandbytes/triton/int8_matmul_mixed_dequantize.py
@@ -3,14 +3,14 @@ import torch
 from bitsandbytes.triton.triton_utils import is_triton_available

 if not is_triton_available():
-    def int8_matmul_mixed_dequantize(a, b, state_x, state_w, bias): return None
-else:

+    def int8_matmul_mixed_dequantize(a, b, state_x, state_w, bias):
+        return None
+else:
    import triton
    import triton.language as tl
    from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time

-
    # This is a matmul kernel based on triton.ops.matmul
    # It is modified to support rowwise quantized input and global quantized weight
    # It's purpose is fused matmul then dequantize
@@ -27,58 +27,83 @@ else:
                    for block_n in [32, 64, 128, 256]:
                        num_warps = 2 if block_n <= 64 else 4
                        configs.append(
-                            triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': 1},
-                                          num_stages=num_stages, num_warps=num_warps))
+                            triton.Config(
+                                {"BLOCK_M": block_m, "BLOCK_N": block_n, "BLOCK_K": block_k, "SPLIT_K": 1},
+                                num_stages=num_stages,
+                                num_warps=num_warps,
+                            ),
+                        )
                        # split_k
                        for split_k in [2, 4, 8, 16]:
-                            configs.append(triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': split_k},
-                                                         num_stages=num_stages, num_warps=num_warps, pre_hook=init_to_zero('C')))
+                            configs.append(
+                                triton.Config(
+                                    {"BLOCK_M": block_m, "BLOCK_N": block_n, "BLOCK_K": block_k, "SPLIT_K": split_k},
+                                    num_stages=num_stages,
+                                    num_warps=num_warps,
+                                    pre_hook=init_to_zero("C"),
+                                ),
+                            )
        return configs

-
    @triton.autotune(
        configs=[
            # basic configs for compute-bound matmuls
-            triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8),
-            triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8),
-            triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4),
-            triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4),
-            triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4),
-            triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4),
-            triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4),
-            triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4),
-            triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=5, num_warps=2),
+            triton.Config({"BLOCK_M": 128, "BLOCK_N": 256, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=3, num_warps=8),
+            triton.Config({"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=3, num_warps=8),
+            triton.Config({"BLOCK_M": 256, "BLOCK_N": 64, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4),
+            triton.Config({"BLOCK_M": 64, "BLOCK_N": 256, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4),
+            triton.Config({"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4),
+            triton.Config({"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4),
+            triton.Config({"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4),
+            triton.Config({"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4),
+            triton.Config({"BLOCK_M": 64, "BLOCK_N": 32, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=5, num_warps=2),
            # good for int8
-            triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8),
-            triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8),
-            triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4),
-            triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4),
-            triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4),
-            triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4),
-            triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4),
-            triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4),
-            triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=5, num_warps=2),
+            triton.Config({"BLOCK_M": 128, "BLOCK_N": 256, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=3, num_warps=8),
+            triton.Config({"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=3, num_warps=8),
+            triton.Config({"BLOCK_M": 256, "BLOCK_N": 64, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=4, num_warps=4),
+            triton.Config({"BLOCK_M": 64, "BLOCK_N": 256, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=4, num_warps=4),
+            triton.Config({"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=4, num_warps=4),
+            triton.Config({"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=4, num_warps=4),
+            triton.Config({"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=4, num_warps=4),
+            triton.Config({"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=4, num_warps=4),
+            triton.Config({"BLOCK_M": 64, "BLOCK_N": 32, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=5, num_warps=2),
            *get_configs_io_bound(),
        ],
-        key=['M', 'N', 'K'],
-        prune_configs_by={
-            'early_config_prune': early_config_prune,
-            'perf_model': estimate_matmul_time,
-            'top_k': 10
+        key=["M", "N", "K"],
+        prune_configs_by={"early_config_prune": early_config_prune, "perf_model": estimate_matmul_time, "top_k": 10},
+    )
+    @triton.heuristics(
+        {
+            "EVEN_K": lambda args: args["K"] % (args["BLOCK_K"] * args["SPLIT_K"]) == 0,
        },
    )
-    @triton.heuristics({
-        'EVEN_K': lambda args: args['K'] % (args['BLOCK_K'] * args['SPLIT_K']) == 0,
-    })
    @triton.jit
-    def _int8_matmul_mixed_dequantize(A, B, C, bias, state_x_ptr, state_w_ptr, M, N, K, divfactor: tl.constexpr, has_bias : tl.constexpr,
-                stride_am, stride_ak,
-                stride_bk, stride_bn,
-                stride_cm, stride_cn,
-                BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
-                GROUP_M: tl.constexpr, SPLIT_K: tl.constexpr, EVEN_K: tl.constexpr,
-                ACC_TYPE: tl.constexpr
-                ):
+    def _int8_matmul_mixed_dequantize(
+        A,
+        B,
+        C,
+        bias,
+        state_x_ptr,
+        state_w_ptr,
+        M,
+        N,
+        K,
+        divfactor: tl.constexpr,
+        has_bias: tl.constexpr,
+        stride_am,
+        stride_ak,
+        stride_bk,
+        stride_bn,
+        stride_cm,
+        stride_cn,
+        BLOCK_M: tl.constexpr,
+        BLOCK_N: tl.constexpr,
+        BLOCK_K: tl.constexpr,
+        GROUP_M: tl.constexpr,
+        SPLIT_K: tl.constexpr,
+        EVEN_K: tl.constexpr,
+        ACC_TYPE: tl.constexpr,
+    ):
        # matrix multiplication
        pid = tl.program_id(0)
        pid_z = tl.program_id(1)
@@ -115,13 +140,13 @@ else:
                b = tl.load(B)
            else:
                k_remaining = K - k * (BLOCK_K * SPLIT_K)
-                a = tl.load(A, mask=rk[None, :] < k_remaining, other=0.)
-                b = tl.load(B, mask=rk[:, None] < k_remaining, other=0.)
+                a = tl.load(A, mask=rk[None, :] < k_remaining, other=0.0)
+                b = tl.load(B, mask=rk[:, None] < k_remaining, other=0.0)
            acc += tl.dot(a, b)
            A += BLOCK_K * SPLIT_K * stride_ak
            B += BLOCK_K * SPLIT_K * stride_bk

-        acc = (w_factor * (x_factor * (acc * divfactor)))
+        acc = w_factor * (x_factor * (acc * divfactor))
        acc = acc.to(C.dtype.element_ty)

        # conditionally add bias
@@ -137,10 +162,9 @@ else:
        else:
            tl.atomic_add(C, acc, mask=mask)

-
    def int8_matmul_mixed_dequantize(a, b, state_x, state_w, bias):
        device = a.device
-        divfactor = 1. / (127. * 127.)
+        divfactor = 1.0 / (127.0 * 127.0)
        has_bias = 0 if bias is None else 1
        # handle non-contiguous inputs if necessary
        if a.stride(0) > 1 and a.stride(1) > 1:
@@ -154,12 +178,28 @@ else:
        # allocates output
        c = torch.empty((M, N), device=device, dtype=torch.float16)
        # accumulator types
-        ACC_TYPE = tl.float32 #if a.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+        ACC_TYPE = tl.float32  # if a.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
        # launch int8_matmul_mixed_dequantize kernel
-        grid = lambda META: (triton.cdiv(M, META['BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']), META['SPLIT_K'])
-        _int8_matmul_mixed_dequantize[grid](a, b, c, bias, state_x, state_w, M, N, K, divfactor, has_bias,
-                        a.stride(0), a.stride(1),
-                        b.stride(0), b.stride(1),
-                        c.stride(0), c.stride(1),
-                        GROUP_M=8, ACC_TYPE=ACC_TYPE)
+        grid = lambda META: (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]), META["SPLIT_K"])
+        _int8_matmul_mixed_dequantize[grid](
+            a,
+            b,
+            c,
+            bias,
+            state_x,
+            state_w,
+            M,
+            N,
+            K,
+            divfactor,
+            has_bias,
+            a.stride(0),
+            a.stride(1),
+            b.stride(0),
+            b.stride(1),
+            c.stride(0),
+            c.stride(1),
+            GROUP_M=8,
+            ACC_TYPE=ACC_TYPE,
+        )
        return c
--- a/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py
+++ b/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py
@@ -3,7 +3,9 @@ import torch
 from bitsandbytes.triton.triton_utils import is_triton_available

 if not is_triton_available():
-    def int8_matmul_rowwise_dequantize(a, b, state_x, state_w, bias): return None
+
+    def int8_matmul_rowwise_dequantize(a, b, state_x, state_w, bias):
+        return None
 else:
    import triton
    import triton.language as tl
@@ -17,7 +19,6 @@ else:
    def init_to_zero(name):
        return lambda nargs: nargs[name].zero_()

-
    def get_configs_io_bound():
        configs = []
        for num_stages in [2, 3, 4, 5, 6]:
@@ -26,58 +27,83 @@ else:
                    for block_n in [32, 64, 128, 256]:
                        num_warps = 2 if block_n <= 64 else 4
                        configs.append(
-                            triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': 1},
-                                          num_stages=num_stages, num_warps=num_warps))
+                            triton.Config(
+                                {"BLOCK_M": block_m, "BLOCK_N": block_n, "BLOCK_K": block_k, "SPLIT_K": 1},
+                                num_stages=num_stages,
+                                num_warps=num_warps,
+                            ),
+                        )
                        # split_k
                        for split_k in [2, 4, 8, 16]:
-                            configs.append(triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': split_k},
-                                                         num_stages=num_stages, num_warps=num_warps, pre_hook=init_to_zero('C')))
+                            configs.append(
+                                triton.Config(
+                                    {"BLOCK_M": block_m, "BLOCK_N": block_n, "BLOCK_K": block_k, "SPLIT_K": split_k},
+                                    num_stages=num_stages,
+                                    num_warps=num_warps,
+                                    pre_hook=init_to_zero("C"),
+                                ),
+                            )
        return configs

-
    @triton.autotune(
        configs=[
            # basic configs for compute-bound matmuls
-            triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8),
-            triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8),
-            triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4),
-            triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4),
-            triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4),
-            triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4),
-            triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4),
-            triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4),
-            triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=5, num_warps=2),
+            triton.Config({"BLOCK_M": 128, "BLOCK_N": 256, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=3, num_warps=8),
+            triton.Config({"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=3, num_warps=8),
+            triton.Config({"BLOCK_M": 256, "BLOCK_N": 64, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4),
+            triton.Config({"BLOCK_M": 64, "BLOCK_N": 256, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4),
+            triton.Config({"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4),
+            triton.Config({"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4),
+            triton.Config({"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4),
+            triton.Config({"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4),
+            triton.Config({"BLOCK_M": 64, "BLOCK_N": 32, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=5, num_warps=2),
            # good for int8
-            triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8),
-            triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8),
-            triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4),
-            triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4),
-            triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4),
-            triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4),
-            triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4),
-            triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4),
-            triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=5, num_warps=2),
+            triton.Config({"BLOCK_M": 128, "BLOCK_N": 256, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=3, num_warps=8),
+            triton.Config({"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=3, num_warps=8),
+            triton.Config({"BLOCK_M": 256, "BLOCK_N": 64, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=4, num_warps=4),
+            triton.Config({"BLOCK_M": 64, "BLOCK_N": 256, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=4, num_warps=4),
+            triton.Config({"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=4, num_warps=4),
+            triton.Config({"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=4, num_warps=4),
+            triton.Config({"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=4, num_warps=4),
+            triton.Config({"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=4, num_warps=4),
+            triton.Config({"BLOCK_M": 64, "BLOCK_N": 32, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=5, num_warps=2),
            *get_configs_io_bound(),
        ],
-        key=['M', 'N', 'K'],
-        prune_configs_by={
-            'early_config_prune': early_config_prune,
-            'perf_model': estimate_matmul_time,
-            'top_k': 10
+        key=["M", "N", "K"],
+        prune_configs_by={"early_config_prune": early_config_prune, "perf_model": estimate_matmul_time, "top_k": 10},
+    )
+    @triton.heuristics(
+        {
+            "EVEN_K": lambda args: args["K"] % (args["BLOCK_K"] * args["SPLIT_K"]) == 0,
        },
    )
-    @triton.heuristics({
-        'EVEN_K': lambda args: args['K'] % (args['BLOCK_K'] * args['SPLIT_K']) == 0,
-    })
    @triton.jit
-    def _int8_matmul_rowwise_dequantize(A, B, C, bias, state_x_ptr, state_w_ptr, M, N, K, divfactor, has_bias : tl.constexpr,
-                stride_am, stride_ak,
-                stride_bk, stride_bn,
-                stride_cm, stride_cn,
-                BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
-                GROUP_M: tl.constexpr, SPLIT_K: tl.constexpr, EVEN_K: tl.constexpr,
-                ACC_TYPE: tl.constexpr
-                ):
+    def _int8_matmul_rowwise_dequantize(
+        A,
+        B,
+        C,
+        bias,
+        state_x_ptr,
+        state_w_ptr,
+        M,
+        N,
+        K,
+        divfactor,
+        has_bias: tl.constexpr,
+        stride_am,
+        stride_ak,
+        stride_bk,
+        stride_bn,
+        stride_cm,
+        stride_cn,
+        BLOCK_M: tl.constexpr,
+        BLOCK_N: tl.constexpr,
+        BLOCK_K: tl.constexpr,
+        GROUP_M: tl.constexpr,
+        SPLIT_K: tl.constexpr,
+        EVEN_K: tl.constexpr,
+        ACC_TYPE: tl.constexpr,
+    ):
        # matrix multiplication
        pid = tl.program_id(0)
        pid_z = tl.program_id(1)
@@ -114,13 +140,13 @@ else:
                b = tl.load(B)
            else:
                k_remaining = K - k * (BLOCK_K * SPLIT_K)
-                a = tl.load(A, mask=rk[None, :] < k_remaining, other=0.)
-                b = tl.load(B, mask=rk[:, None] < k_remaining, other=0.)
+                a = tl.load(A, mask=rk[None, :] < k_remaining, other=0.0)
+                b = tl.load(B, mask=rk[:, None] < k_remaining, other=0.0)
            acc += tl.dot(a, b)
            A += BLOCK_K * SPLIT_K * stride_ak
            B += BLOCK_K * SPLIT_K * stride_bk

-        acc = (w_factor * (x_factor * (acc * divfactor)))
+        acc = w_factor * (x_factor * (acc * divfactor))
        acc = acc.to(C.dtype.element_ty)

        if has_bias:
@@ -135,9 +161,8 @@ else:
        else:
            tl.atomic_add(C, acc, mask=mask)

-
    def int8_matmul_rowwise_dequantize(a, b, state_x, state_w, bias):
-        divfactor = 1. / (127. * 127.)
+        divfactor = 1.0 / (127.0 * 127.0)

        has_bias = 0 if bias is None else 1

@@ -154,12 +179,28 @@ else:
        # allocates output
        c = torch.empty((M, N), device=device, dtype=torch.float16)
        # accumulator types
-        ACC_TYPE = tl.float32 #if a.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
+        ACC_TYPE = tl.float32  # if a.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32
        # launch int8_matmul_rowwise_dequantize kernel
-        grid = lambda META: (triton.cdiv(M, META['BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']), META['SPLIT_K'])
-        _int8_matmul_rowwise_dequantize[grid](a, b, c, bias, state_x, state_w, M, N, K, divfactor, has_bias,
-                        a.stride(0), a.stride(1),
-                        b.stride(0), b.stride(1),
-                        c.stride(0), c.stride(1),
-                        GROUP_M=8, ACC_TYPE=ACC_TYPE)
+        grid = lambda META: (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]), META["SPLIT_K"])
+        _int8_matmul_rowwise_dequantize[grid](
+            a,
+            b,
+            c,
+            bias,
+            state_x,
+            state_w,
+            M,
+            N,
+            K,
+            divfactor,
+            has_bias,
+            a.stride(0),
+            a.stride(1),
+            b.stride(0),
+            b.stride(1),
+            c.stride(0),
+            c.stride(1),
+            GROUP_M=8,
+            ACC_TYPE=ACC_TYPE,
+        )
        return c
--- a/bitsandbytes/triton/quantize_columnwise_and_transpose.py
+++ b/bitsandbytes/triton/quantize_columnwise_and_transpose.py
@@ -5,9 +5,10 @@ import torch
 from bitsandbytes.triton.triton_utils import is_triton_available

 if not is_triton_available():
-    def quantize_columnwise_and_transpose(x: torch.Tensor): return None
-else:

+    def quantize_columnwise_and_transpose(x: torch.Tensor):
+        return None
+else:
    import triton
    import triton.language as tl

@@ -15,23 +16,23 @@ else:

    # TODO: autotune this better.
    @triton.autotune(
-            configs=[
-                triton.Config({}, num_stages=1),
-                triton.Config({}, num_stages=2),
-                triton.Config({}, num_stages=4),
-                triton.Config({}, num_stages=8),
-                triton.Config({}, num_stages=16),
-                triton.Config({}, num_stages=1, num_warps=8),
-                triton.Config({}, num_stages=2, num_warps=8),
-                triton.Config({}, num_stages=4, num_warps=8),
-                triton.Config({}, num_stages=8, num_warps=8),
-                triton.Config({}, num_stages=16, num_warps=8),
-                triton.Config({}, num_warps=1),
-                triton.Config({}, num_warps=2),
-                triton.Config({}, num_warps=4),
-                triton.Config({}, num_warps=8),
-            ],
-            key=['n_elements']
+        configs=[
+            triton.Config({}, num_stages=1),
+            triton.Config({}, num_stages=2),
+            triton.Config({}, num_stages=4),
+            triton.Config({}, num_stages=8),
+            triton.Config({}, num_stages=16),
+            triton.Config({}, num_stages=1, num_warps=8),
+            triton.Config({}, num_stages=2, num_warps=8),
+            triton.Config({}, num_stages=4, num_warps=8),
+            triton.Config({}, num_stages=8, num_warps=8),
+            triton.Config({}, num_stages=16, num_warps=8),
+            triton.Config({}, num_warps=1),
+            triton.Config({}, num_warps=2),
+            triton.Config({}, num_warps=4),
+            triton.Config({}, num_warps=8),
+        ],
+        key=["n_elements"],
    )
    @triton.jit
    def _quantize_columnwise_and_transpose(
@@ -39,7 +40,8 @@ else:
        output_ptr,
        output_maxs,
        n_elements,
-        M : tl.constexpr, N : tl.constexpr,
+        M: tl.constexpr,
+        N: tl.constexpr,
        BLOCK_SIZE: tl.constexpr,
        P2: tl.constexpr,
    ):
@@ -47,12 +49,12 @@ else:
        block_start = pid
        p2_arange = tl.arange(0, P2)
        p2_arange_mask = p2_arange < M
-        arange =  p2_arange * N
+        arange = p2_arange * N
        offsets = block_start + arange
        x = tl.load(x_ptr + offsets, mask=p2_arange_mask)
        abs_x = tl.abs(x)
        max_val = tl.max(tl.where(p2_arange_mask, abs_x, 0), axis=0)
-        output = tl.libdevice.llrint(127. * (x / max_val))
+        output = tl.libdevice.llrint(127.0 * (x / max_val))

        new_start = pid * M
        new_offsets = new_start + p2_arange
@@ -68,6 +70,6 @@ else:

        assert x.is_cuda and output.is_cuda
        n_elements = output.numel()
-        grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
+        grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)
        _quantize_columnwise_and_transpose[grid](x, output, output_maxs, n_elements, M, N, BLOCK_SIZE=M, P2=P2)
        return output, output_maxs
--- a/bitsandbytes/triton/quantize_global.py
+++ b/bitsandbytes/triton/quantize_global.py
--- a/bitsandbytes/triton/quantize_rowwise.py
+++ b/bitsandbytes/triton/quantize_rowwise.py
@@ -5,9 +5,10 @@ import torch
 from bitsandbytes.triton.triton_utils import is_triton_available

 if not is_triton_available():
-    def quantize_rowwise(x: torch.Tensor): return None
-else:

+    def quantize_rowwise(x: torch.Tensor):
+        return None
+else:
    import triton
    import triton.language as tl

@@ -15,21 +16,21 @@ else:

    # TODO: autotune this better.
    @triton.autotune(
-            configs=[
-                triton.Config({}, num_stages=1, num_warps=8),
-                triton.Config({}, num_stages=2, num_warps=8),
-                triton.Config({}, num_stages=4, num_warps=8),
-                triton.Config({}, num_stages=8, num_warps=8),
-                triton.Config({}, num_stages=1),
-                triton.Config({}, num_stages=2),
-                triton.Config({}, num_stages=4),
-                triton.Config({}, num_stages=8),
-                triton.Config({}, num_warps=1),
-                triton.Config({}, num_warps=2),
-                triton.Config({}, num_warps=4),
-                triton.Config({}, num_warps=8),
-            ],
-            key=['n_elements']
+        configs=[
+            triton.Config({}, num_stages=1, num_warps=8),
+            triton.Config({}, num_stages=2, num_warps=8),
+            triton.Config({}, num_stages=4, num_warps=8),
+            triton.Config({}, num_stages=8, num_warps=8),
+            triton.Config({}, num_stages=1),
+            triton.Config({}, num_stages=2),
+            triton.Config({}, num_stages=4),
+            triton.Config({}, num_stages=8),
+            triton.Config({}, num_warps=1),
+            triton.Config({}, num_warps=2),
+            triton.Config({}, num_warps=4),
+            triton.Config({}, num_warps=8),
+        ],
+        key=["n_elements"],
    )
    @triton.jit
    def _quantize_rowwise(
@@ -49,7 +50,7 @@ else:

        abs_x = tl.abs(x)
        max_val = tl.max(tl.where(row_mask, abs_x, 0), axis=0)
-        output = tl.libdevice.llrint(127. * (x / max_val))
+        output = tl.libdevice.llrint(127.0 * (x / max_val))
        tl.store(output_ptr + offsets, output, mask=row_mask)
        tl.store(output_maxs + pid, max_val)


--- a/bitsandbytes/utils.py
+++ b/bitsandbytes/utils.py
--- a/check_bnb_install.py
+++ b/check_bnb_install.py
--- a/examples/int8_inference_huggingface.py
+++ b/examples/int8_inference_huggingface.py
--- a/install_cuda.py
+++ b/install_cuda.py
--- a/scripts/stale.py
+++ b/scripts/stale.py
--- a/tests/test_autograd.py
+++ b/tests/test_autograd.py
--- a/tests/test_cuda_setup_evaluator.py
+++ b/tests/test_cuda_setup_evaluator.py
@@ -35,7 +35,4 @@ def test_get_cuda_bnb_library_path_override(monkeypatch, cuda120_spec, caplog):

 def test_get_cuda_bnb_library_path_nocublaslt(monkeypatch, cuda111_noblas_spec):
    monkeypatch.delenv("BNB_CUDA_VERSION", raising=False)
-    assert (
-        get_cuda_bnb_library_path(cuda111_noblas_spec).stem
-        == "libbitsandbytes_cuda111_nocublaslt"
-    )
+    assert get_cuda_bnb_library_path(cuda111_noblas_spec).stem == "libbitsandbytes_cuda111_nocublaslt"
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
--- a/tests/test_generation.py
+++ b/tests/test_generation.py
--- a/tests/test_linear4bit.py
+++ b/tests/test_linear4bit.py
--- a/tests/test_linear8bitlt.py
+++ b/tests/test_linear8bitlt.py
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
--- a/tests/test_optim.py
+++ b/tests/test_optim.py
--- a/tests/test_triton.py
+++ b/tests/test_triton.py