Merge pull request #76 from tomaarsen/cleanup

Cleanup involving a handful of failures, some optimization and a lot of code quality improvements

Merge pull request #76 from tomaarsen/cleanup
Cleanup involving a handful of failures, some optimization and a lot of code quality improvements
f0ec93d0 · Tim Dettmers · GitHub · c059bd28 · c91f592a · f0ec93d0
Unverified Commit f0ec93d0 authored Jan 02, 2023 by Tim Dettmers Committed by GitHub Jan 02, 2023
20 changed files
--- a/bitsandbytes/optim/optimizer.py
+++ b/bitsandbytes/optim/optimizer.py
@@ -12,13 +12,13 @@ import torch
 import bitsandbytes.functional as F


-class MockArgs(object):
+class MockArgs:
    def __init__(self, initial_data):
        for key in initial_data:
            setattr(self, key, initial_data[key])


-class GlobalOptimManager(object):
+class GlobalOptimManager:
    _instance = None

    def __init__(self):
@@ -56,9 +56,9 @@ class GlobalOptimManager(object):
        """
        Overrides initial optimizer config for specific parameters.

-        The key-values of the optimizer config for the input parameters are overidden
+        The key-values of the optimizer config for the input parameters are overridden
        This can be both, optimizer parameters like "betas", or "lr" or it can be
-        8-bit specific paramters like "optim_bits", "percentile_clipping".
+        8-bit specific parameters like "optim_bits", "percentile_clipping".

        Parameters
        ----------
@@ -93,13 +93,12 @@ class GlobalOptimManager(object):

 class Optimizer8bit(torch.optim.Optimizer):
    def __init__(self, params, defaults, optim_bits=32):
-        super(Optimizer8bit, self).__init__(params, defaults)
+        super().__init__(params, defaults)
        self.initialized = False
        self.name2qmap = {}

        self.mng = GlobalOptimManager.get_instance()
-        self.non_castable_tensor_keys = set(
-            [
+        self.non_castable_tensor_keys = {
                "qmap1",
                "qmap2",
                "max1",
@@ -112,8 +111,7 @@ class Optimizer8bit(torch.optim.Optimizer):
                "absmax1",
                "absmax2",
                "unorm_vec",
-            ]
-        )
+        }

        if optim_bits == 8:
            self.fill_qmap()
@@ -123,7 +121,7 @@ class Optimizer8bit(torch.optim.Optimizer):
        self.name2qmap["udynamic"] = F.create_dynamic_map(signed=False)

    def __setstate__(self, state):
-        super(Optimizer8bit, self).__setstate__(state)
+        super().__setstate__(state)

    def load_state_dict(self, state_dict):
        r"""Loads the optimizer state.
@@ -155,8 +153,8 @@ class Optimizer8bit(torch.optim.Optimizer):
        id_map = {
            old_id: p
            for old_id, p in zip(
-                chain.from_iterable((g["params"] for g in saved_groups)),
-                chain.from_iterable((g["params"] for g in groups)),
+                chain.from_iterable(g["params"] for g in saved_groups),
+                chain.from_iterable(g["params"] for g in groups),
            )
        }

@@ -284,11 +282,11 @@ class Optimizer8bit(torch.optim.Optimizer):
        return config

    def init_state(self, group, p, gindex, pindex):
-        raise NotImplementedError(f"init_state method needs to be overidden")
+        raise NotImplementedError("init_state method needs to be overridden")

    def update_step(self, group, p, gindex, pindex):
        raise NotImplementedError(
-            f"The update_step method needs to be overidden"
+            "The update_step method needs to be overridden"
        )


@@ -310,9 +308,9 @@ class Optimizer2State(Optimizer8bit):
        skip_zeros=False,
    ):
        if not 0.0 <= lr:
-            raise ValueError("Invalid learning rate: {}".format(lr))
+            raise ValueError(f"Invalid learning rate: {lr}")
        if not 0.0 <= eps:
-            raise ValueError("Invalid epsilon value: {}".format(eps))
+            raise ValueError(f"Invalid epsilon value: {eps}")
        if isinstance(betas, str):
            # format: '(beta1, beta2)'
            betas = betas.replace("(", "").replace(")", "").strip().split(",")
@@ -324,10 +322,10 @@ class Optimizer2State(Optimizer8bit):
                )
        if not 0.0 <= weight_decay:
            raise ValueError(
-                "Invalid weight_decay value: {}".format(weight_decay)
+                f"Invalid weight_decay value: {weight_decay}"
            )
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
-        super(Optimizer2State, self).__init__(params, defaults, optim_bits)
+        super().__init__(params, defaults, optim_bits)

        if args is None:
            args = {}
@@ -542,9 +540,9 @@ class Optimizer1State(Optimizer8bit):
        skip_zeros=False,
    ):
        if not 0.0 <= lr:
-            raise ValueError("Invalid learning rate: {}".format(lr))
+            raise ValueError(f"Invalid learning rate: {lr}")
        if not 0.0 <= eps:
-            raise ValueError("Invalid epsilon value: {}".format(eps))
+            raise ValueError(f"Invalid epsilon value: {eps}")
        for i in range(len(betas)):
            if not 0.0 <= betas[i] < 1.0:
                raise ValueError(
@@ -552,10 +550,10 @@ class Optimizer1State(Optimizer8bit):
                )
        if not 0.0 <= weight_decay:
            raise ValueError(
-                "Invalid weight_decay value: {}".format(weight_decay)
+                f"Invalid weight_decay value: {weight_decay}"
            )
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
-        super(Optimizer1State, self).__init__(params, defaults, optim_bits)
+        super().__init__(params, defaults, optim_bits)

        if args is None:
            args = {}

--- a/bitsandbytes/optim/rmsprop.py
+++ b/bitsandbytes/optim/rmsprop.py
@@ -23,11 +23,11 @@ class RMSprop(Optimizer1State):
    ):
        if alpha == 0:
            raise NotImplementedError(
-                f"RMSprop with alpha==0.0 is not supported!"
+                "RMSprop with alpha==0.0 is not supported!"
            )
        if centered:
-            raise NotImplementedError(f"Centered RMSprop is not supported!")
-        super(RMSprop, self).__init__(
+            raise NotImplementedError("Centered RMSprop is not supported!")
+        super().__init__(
            "rmsprop",
            params,
            lr,
@@ -59,11 +59,11 @@ class RMSprop8bit(Optimizer1State):
    ):
        if alpha == 0:
            raise NotImplementedError(
-                f"RMSprop with alpha==0.0 is not supported!"
+                "RMSprop with alpha==0.0 is not supported!"
            )
        if centered:
-            raise NotImplementedError(f"Centered RMSprop is not supported!")
-        super(RMSprop8bit, self).__init__(
+            raise NotImplementedError("Centered RMSprop is not supported!")
+        super().__init__(
            "rmsprop",
            params,
            lr,
@@ -96,11 +96,11 @@ class RMSprop32bit(Optimizer1State):

        if alpha == 0:
            raise NotImplementedError(
-                f"RMSprop with alpha==0.0 is not supported!"
+                "RMSprop with alpha==0.0 is not supported!"
            )
        if centered:
-            raise NotImplementedError(f"Centered RMSprop is not supported!")
-        super(RMSprop32bit, self).__init__(
+            raise NotImplementedError("Centered RMSprop is not supported!")
+        super().__init__(
            "rmsprop",
            params,
            lr,

--- a/bitsandbytes/optim/sgd.py
+++ b/bitsandbytes/optim/sgd.py
@@ -21,8 +21,8 @@ class SGD(Optimizer1State):
        block_wise=True,
    ):
        if momentum == 0:
-            raise NotImplementedError(f"SGD without momentum is not supported!")
-        super(SGD, self).__init__(
+            raise NotImplementedError("SGD without momentum is not supported!")
+        super().__init__(
            "momentum",
            params,
            lr,
@@ -52,8 +52,8 @@ class SGD8bit(Optimizer1State):
        block_wise=True,
    ):
        if momentum == 0:
-            raise NotImplementedError(f"SGD without momentum is not supported!")
-        super(SGD8bit, self).__init__(
+            raise NotImplementedError("SGD without momentum is not supported!")
+        super().__init__(
            "momentum",
            params,
            lr,
@@ -83,8 +83,8 @@ class SGD32bit(Optimizer1State):
        block_wise=True,
    ):
        if momentum == 0:
-            raise NotImplementedError(f"SGD without momentum is not supported!")
-        super(SGD32bit, self).__init__(
+            raise NotImplementedError("SGD without momentum is not supported!")
+        super().__init__(
            "momentum",
            params,
            lr,

--- a/compile_from_source.md
+++ b/compile_from_source.md
@@ -4,7 +4,7 @@ Basic steps.
 1. `make [target]` where `[target]` is among `cuda92, cuda10x, cuda110, cuda11x, cpuonly`
 2. `CUDA_VERSION=XXX python setup.py install`

-To run these steps you will need to have the nvcc compiler installed that comes with a CUDA installation. If you use anaconda (recommended) then you can figure out which version of CUDA you are using with PyTorch via the command `conda list | grep cudatoolkit`. Then you can install the nvcc compiler by downloading and installing the same CUDA version from the [CUDA toolkit archive](https://developer.nvidia.com/cuda-toolkit-archive). 
+To run these steps you will need to have the nvcc compiler installed that comes with a CUDA installation. If you use anaconda (recommended) then you can figure out which version of CUDA you are using with PyTorch via the command `conda list | grep cudatoolkit`. Then you can install the nvcc compiler by downloading and installing the same CUDA version from the [CUDA toolkit archive](https://developer.nvidia.com/cuda-toolkit-archive).

 For your convenience, there is an installation script in the root directory that installs CUDA 11.1 locally and configures it automatically. After installing you should add the `bin` sub-directory to the `$PATH` variable to make the compiler visible to your system. To do this you can add this to your `.bashrc` by executing these commands:
 ```bash
@@ -13,7 +13,7 @@ echo "export PATH=$PATH:/usr/local/cuda/bin/" >> ~/.bashrc
 source ~/.bashrc
 ```

-By default, the Makefile will look at your `CUDA_HOME` environmental variable to find your CUDA version for compiling the library. If this path is not set it is inferred from the path of your `nvcc` compiler. 
+By default, the Makefile will look at your `CUDA_HOME` environmental variable to find your CUDA version for compiling the library. If this path is not set it is inferred from the path of your `nvcc` compiler.

 Either `nvcc` needs to be in path or the `CUDA_HOME` variable needs to be set to the CUDA directory root (e.g. `/usr/local/cuda`) in order for compilation to succeed.


--- a/csrc/cpu_ops.cpp
+++ b/csrc/cpu_ops.cpp
@@ -62,7 +62,7 @@ void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long

      for (int i = 0; i < valid_chunks; i++)
          int err = pthread_join(threads[i], NULL);
-      
+
      free(threads);
      for (int i = 0; i < valid_chunks; i++)
          free(args[i]);

--- a/csrc/kernels.cu
+++ b/csrc/kernels.cu
-// Copyright (c) Facebook, Inc. and its affiliates. 
-//   
-// This source code is licensed under the MIT license found in the 
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
 // LICENSE file in the root directory of this source tree.

 #include <kernels.cuh>
@@ -303,7 +303,7 @@ __global__ void kCompressMax(T * __restrict__ const A, T* out, unsigned char* ou
  if(threadIdx.x % 32 < 8)
  {
    // offset: 8 values per 256 input values
-    // 
+    //
    int offset = BLOCK_SIZE*blockIdx.x*BLOCK_SIZE/32*8;
  }

@@ -574,7 +574,7 @@ __global__ void kDequantize(float *code, unsigned char *A, float *out, const int

 template<typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS>
 __launch_bounds__(BLOCK_SIZE/NUM_VALS, 1)
-__global__ void kPreconditionOptimizer32bit2State(T* g, T* p, 
+__global__ void kPreconditionOptimizer32bit2State(T* g, T* p,
                float* state1, float* state2, float *unorm,
                const float beta1, const float beta2, const float eps, const float weight_decay,
                const int step, const float lr, const float gnorm_scale, const int n)
@@ -622,7 +622,7 @@ __global__ void kPreconditionOptimizer32bit2State(T* g, T* p,
      {
          switch(OPTIMIZER)
          {
-              case ADAM: 
+              case ADAM:
                  s1_vals[j] = s1_vals[j]*beta1 + ((1.0f -beta1)*((float)g_vals[j]));
                  s2_vals[j] = s2_vals[j]*beta2 + ((1.0f -beta2)*(((float)g_vals[j])*((float)g_vals[j])));
                  s1_vals[j] *= correction1;
@@ -653,7 +653,7 @@ __global__ void kPreconditionOptimizer32bit2State(T* g, T* p,

 template<typename T, int OPTIMIZER>
 __launch_bounds__(TH, 1)
-__global__ void kOptimizer32bit2State(T* g, T* p, 
+__global__ void kOptimizer32bit2State(T* g, T* p,
                float* state1, float* state2, float *unorm, const float max_unorm, const float param_norm,
                const float beta1, const float beta2, const float eps, const float weight_decay,
                const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n)
@@ -716,7 +716,7 @@ __global__ void kOptimizer32bit2State(T* g, T* p,
      {
          switch(OPTIMIZER)
          {
-              case ADAM: 
+              case ADAM:
 									if(!skip_zeros || (skip_zeros && ((float)g_vals[j] != 0.0f)))
 									{
 										s1_vals[j] = s1_vals[j]*beta1 + ((1.0f -beta1)*((float)g_vals[j]));
@@ -741,7 +741,7 @@ __global__ void kOptimizer32bit2State(T* g, T* p,

 template<typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS>
 __launch_bounds__(BLOCK_SIZE/NUM_VALS, 1)
-__global__ void kPreconditionOptimizer32bit1State(T* g, T* p, 
+__global__ void kPreconditionOptimizer32bit1State(T* g, T* p,
                float* state1, float *unorm,
                const float beta1, const float eps, const float weight_decay,
                const int step, const float lr, const float gnorm_scale, const int n)
@@ -783,19 +783,19 @@ __global__ void kPreconditionOptimizer32bit1State(T* g, T* p,
      {
          switch(OPTIMIZER)
          {
-              case MOMENTUM: 
+              case MOMENTUM:
                  if(step == 1)
                    s1_vals[j] = (float)g_vals[j]; // state update
                  else
                    s1_vals[j] = s1_vals[j]*beta1 + ((float)g_vals[j]); // state update
                  s1_vals[j] = s1_vals[j]*s1_vals[j]; // update norm
                  break;
-              case RMSPROP: 
+              case RMSPROP:
                  s1_vals[j] = s1_vals[j]*beta1 + ((1.0f-beta1)*((float)g_vals[j])*((float)g_vals[j])); // state update
                  s1_vals[j] = __fdividef((float)g_vals[j],sqrtf(s1_vals[j])+eps); // update value
                  s1_vals[j] = s1_vals[j]*s1_vals[j]; // update norm
                  break;
-              case ADAGRAD: 
+              case ADAGRAD:
                  s1_vals[j] = s1_vals[j] + ((float)g_vals[j])*((float)g_vals[j]); // state update
                  s1_vals[j] = __fdividef((float)g_vals[j],sqrtf(s1_vals[j])+eps); // update value
                  s1_vals[j] = s1_vals[j]*s1_vals[j]; // update norm
@@ -819,7 +819,7 @@ __global__ void kPreconditionOptimizer32bit1State(T* g, T* p,

 template<typename T, int OPTIMIZER>
 __launch_bounds__(TH, 1)
-__global__ void kOptimizer32bit1State(T *g, T *p, 
+__global__ void kOptimizer32bit1State(T *g, T *p,
                float *state1, float *unorm, const float max_unorm, const float param_norm,
                const float beta1, const float eps, const float weight_decay,
                const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n)
@@ -882,7 +882,7 @@ __global__ void kOptimizer32bit1State(T *g, T *p,
 					{
 						switch(OPTIMIZER)
 						{
-								case MOMENTUM: 
+								case MOMENTUM:
 										if(step == 1)
 											s1_vals[j] = (float)g_vals[j];
 										else
@@ -890,11 +890,11 @@ __global__ void kOptimizer32bit1State(T *g, T *p,

 										p_vals[j] = ((float)p_vals[j]) + update_scale*(-lr*(s1_vals[j]));
 										break;
-								case RMSPROP: 
+								case RMSPROP:
 										s1_vals[j] = s1_vals[j]*beta1 + ((1.0f-beta1)*((float)g_vals[j])*((float)g_vals[j]));
 										p_vals[j] = ((float)p_vals[j]) - update_scale*(lr*__fdividef((float)g_vals[j],sqrtf((float)s1_vals[j])+eps));
 										break;
-								case ADAGRAD: 
+								case ADAGRAD:
 										s1_vals[j] = s1_vals[j] + ((float)g_vals[j])*((float)g_vals[j]);
 										p_vals[j] = ((float)p_vals[j]) - lr*__fdividef((float)g_vals[j],sqrtf((float)s1_vals[j])+eps);
 										break;
@@ -1156,12 +1156,12 @@ kOptimizerStatic8bit2State(T* p, T* const g, unsigned char* state1, unsigned cha
 template<typename T, int OPTIMIZER>
 __global__ void
 __launch_bounds__(NUM_THREADS, 2)
-kPreconditionOptimizerStatic8bit1State(T* p, T* __restrict__ const g, unsigned char*__restrict__  const state1, 
+kPreconditionOptimizerStatic8bit1State(T* p, T* __restrict__ const g, unsigned char*__restrict__  const state1,
                float *unorm,
-                const float beta1, 
+                const float beta1,
                const float eps, const int step,
-                float* __restrict__ const quantiles1, 
-                float* max1, float* new_max1, 
+                float* __restrict__ const quantiles1,
+                float* max1, float* new_max1,
                const float weight_decay,
                const float gnorm_scale, const int n)
 {
@@ -1211,7 +1211,7 @@ kPreconditionOptimizerStatic8bit1State(T* p, T* __restrict__ const g, unsigned c
            s1_vals[j] = smem_quantiles1[m_c1[j]]*max1[0];
            switch(OPTIMIZER)
            {
-                case MOMENTUM: 
+                case MOMENTUM:
                    if(step == 1)
                      s1_vals[j] = (float)g_vals[j];
                    else
@@ -1219,7 +1219,7 @@ kPreconditionOptimizerStatic8bit1State(T* p, T* __restrict__ const g, unsigned c
                    if(unorm != NULL)
                      local_unorm += s1_vals[j]*s1_vals[j];
                    break;
-              case RMSPROP: 
+              case RMSPROP:
                    s1_vals[j] = s1_vals[j]*beta1 + ((1.0f-beta1)*(g_val*g_val));
                  break;
            }
@@ -1244,10 +1244,10 @@ template<typename T, int OPTIMIZER>
 __global__ void
 kOptimizerStatic8bit1State(T* p, T* const g, unsigned char* state1,
                const float *unorm, const float max_unorm, const float param_norm,
-                const float beta1, 
+                const float beta1,
                const float eps, const int step, const float lr,
-                float* __restrict__ const quantiles1, 
-                float* max1, float* new_max1, 
+                float* __restrict__ const quantiles1,
+                float* max1, float* new_max1,
                float weight_decay,
                const float gnorm_scale, const int n)
 {
@@ -1313,7 +1313,7 @@ kOptimizerStatic8bit1State(T* p, T* const g, unsigned char* state1,

            switch(OPTIMIZER)
            {
-                case MOMENTUM: 
+                case MOMENTUM:
                  if(step == 1)
                    s1_vals[j] = g_vals[j];
                  else
@@ -1321,7 +1321,7 @@ kOptimizerStatic8bit1State(T* p, T* const g, unsigned char* state1,

                  p_vals[j] = ((float)p_vals[j]) + (-lr*update_scale*(s1_vals[j]));
                  break;
-              case RMSPROP: 
+              case RMSPROP:
                  s1_vals[j] = s1_vals[j]*beta1 + ((1.0f-beta1)*(g_val*g_val));
                  p_vals[j] = ((float)p_vals[j]) - (lr*__fdividef(g_val,sqrtf(s1_vals[j])+eps));
                  break;
@@ -1401,7 +1401,7 @@ kOptimizerStatic8bit2StateBlockwise(T* p, T* __restrict__ const g, unsigned char
                const float beta1, const float beta2,
                const float eps, const int step, const float lr,
                float* __restrict__ const quantiles1, float* __restrict__ const quantiles2,
-                float* absmax1, float* absmax2, 
+                float* absmax1, float* absmax2,
                float weight_decay,
                const float gnorm_scale, const bool skip_zeros, const int n)
 {
@@ -1545,7 +1545,7 @@ kOptimizerStatic8bit2StateBlockwise(T* p, T* __restrict__ const g, unsigned char
        StoreT(temp_storage.storeh).Store(&(p[i]), g_vals, valid_items);

        //  quantizaztion: 2.67/1.70  -> 3.4/3.3
-        # pragma unroll N_PER_TH 
+        # pragma unroll N_PER_TH
        for(unsigned int j = 0; j < N_PER_TH; j++)
        {
            c1s[j] = quantize_2D<1>(quadrants1, smem_quantiles1[lane_id], __fdividef(s1_vals[j],new_local_abs_max1));
@@ -1658,16 +1658,16 @@ kOptimizerStatic8bit1StateBlockwise(T* p, T* __restrict__ const g, unsigned char

 							switch(OPTIMIZER)
 							{
-									case MOMENTUM: 
+									case MOMENTUM:
 										if(step == 1)
 											s1_vals[j] = g_val;
 										else
 											s1_vals[j] = (s1_vals[j]*beta1) + g_val;
 										break;
-									case RMSPROP: 
+									case RMSPROP:
 										s1_vals[j] = s1_vals[j]*beta1 + ((1.0f-beta1)*(g_val*g_val));
 										break;
-									case ADAGRAD: 
+									case ADAGRAD:
 										s1_vals[j] = s1_vals[j] + (g_val*g_val);
 										break;
 							}
@@ -1698,14 +1698,14 @@ kOptimizerStatic8bit1StateBlockwise(T* p, T* __restrict__ const g, unsigned char
 						{
 							switch(OPTIMIZER)
 							{
-									case MOMENTUM: 
+									case MOMENTUM:
 										p_vals[j] = ((float)p_vals[j]) - lr*(s1_vals[j]);
 										break;
-									case RMSPROP: 
+									case RMSPROP:
 										g_val = g_vals[j];
 										p_vals[j] = ((float)p_vals[j]) - lr*(__fdividef(g_val, sqrtf(s1_vals[j])+eps));
 										break;
-									case ADAGRAD: 
+									case ADAGRAD:
 										g_val = g_vals[j];
 										p_vals[j] = ((float)p_vals[j]) - lr*(__fdividef(g_val, sqrtf(s1_vals[j])+eps));
 										break;
@@ -1718,7 +1718,7 @@ kOptimizerStatic8bit1StateBlockwise(T* p, T* __restrict__ const g, unsigned char
        StoreT(temp_storage.storeh).Store(&(p[i]), p_vals, valid_items);

        //  quantizaztion: 2.67/1.70  -> 3.4/3.3
-        # pragma unroll N_PER_TH 
+        # pragma unroll N_PER_TH
        for(unsigned int j = 0; j < N_PER_TH; j++)
        {
            c1s[j] = quantize_2D<1>(quadrants1, smem_quantiles1[lane_id], __fdividef(s1_vals[j],new_local_abs_max1));
@@ -1895,9 +1895,9 @@ template <int ITEMS_PER_THREAD, int SUBTILE_ROWS, int THREADS>__global__ void kd
 {

  // Strategy: To dequantize we need to load col/row statistics. This can be very expensive
-  // since different row/col stats need to be loaded with each thread. 
+  // since different row/col stats need to be loaded with each thread.
  // (1, bad algorithm) Loading 32 items per thread would only occur 1 row load, but this increases register pressure
-  // and would lead to low global load utilization. 
+  // and would lead to low global load utilization.
  // (2, bad algorithm) If each thread loads some columns and multiple rows one needs to do lot of row loads
  // for each thread and this is duplicated by a factor of 32/num-cols-per-thread.
  // (3, good algorithm) Combining (1) and (2) we use sub-tiles of size 32xk in shared memory per threadblock.
@@ -1905,7 +1905,7 @@ template <int ITEMS_PER_THREAD, int SUBTILE_ROWS, int THREADS>__global__ void kd
  // We can run for example 32x128 sub-tiles and warp-strided loads of 4 elements so that each thread has
  // the same col statistic but needs to load 4 row stats from shared memory. To prevent bank conflicts
  // we use a block-striped shared memory config [1, 31, 63, 95] so no bank conflicts happen during the
-  // shared memory loads. 
+  // shared memory loads.

  // data is in 32 column-tile major with tile width 32 columns and numRows rows
  // L1. Load sub-tile row/col statistics. Each thread only holds 1 col, load rows into shared memory.
@@ -2142,7 +2142,7 @@ template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int T


  // To have efficient loads and stores if we transpose we need 128 consequitive bytes which at 1 byte are 128 values
-  // As such we need: 
+  // As such we need:
  // at least 32*4 shared memory tiles for col32; preferably 32*32
  // at least 32*6 shared memory tiles for col32_ampere: preferably 32*32
  // at least 32*8 shared memory tiles for col4_turing: preferably 32*32
@@ -2152,7 +2152,7 @@ template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int T
  // we have 64k sharded mem per SM in Turing which is 8 blocks per SM which is 2*8 = 32 warps = 100% occupancy
  // for turing and 50% for A100 and 75% for RTX 30s / A40 which is probably good enough
  // register pressure should be low with: 8 registers from local memoryh per block and 64 registers per SM
-  // 
+  //
  // to make the shared memory work with that occupancy we might need to union the block loads/stores

  // each block loads TILE_COLs columns and TILE_ROW rows
@@ -2241,7 +2241,7 @@ template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int T

          switch(FORMAT)
          {
-              case COL32: 
+              case COL32:
                if(TRANSPOSE)
                {
                  // data lies in shared memory in the following way:
@@ -2266,7 +2266,7 @@ template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int T

                    // each 32 columns we have new tile
                    // each tile has size outRows*32 and base_row is done in increments of 32
-                    offset = base_row*outRows; 
+                    offset = base_row*outRows;
                    out[offset + (base_col + jrow + subrow_loop_row)*32 + threadIdx.x] = data;
                  }
                }
@@ -2312,7 +2312,7 @@ template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int T
                    // we increase by row_tile_column every 32 columns
                    // base_row increase in increments of 32
                    //int row_tile_column = 256*outRows/8; // there are outRows/8 row tiles, and each tile is 256 elements
-                    //int col_offset = (base_row/32)*row_tile_column; 
+                    //int col_offset = (base_row/32)*row_tile_column;
                    // -> we can remove the divisions to speed up compute since outRows is always a multiple of 8
                    // 256*outRows/8*base_row/32 = outRows*base_row
                    int col_offset = outRows*base_row;
@@ -2349,7 +2349,7 @@ template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int T
                    // this happends every 8 rows anew (subrow % 8)
                    // one writes 4 columns at once that is (col % 4) for the particular index in the subtile
                    int subcol = warp_lane;
-                    
+
                    // add local offset (4x4 sub-tile)
                    if(subrow % 2 == 1)
                      // odd
@@ -2389,7 +2389,7 @@ template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int T
 											// we increase by row_tile_column every 32 columns
 											// base_row increase in increments of 32
 											//int row_tile_column = 1024*outRows/32; // there are outRows/32 row tiles, and each tile is 1024 elements
-											//int col_offset = (base_row/32)*row_tile_column; 
+											//int col_offset = (base_row/32)*row_tile_column;
 											// -> we can remove the divisions to speed up compute since outRows is always a multiple of 8
 											// 1024*outRows/32*base_row/32 = outRows*base_row
 											int col_offset = outRows*base_row;
@@ -2447,7 +2447,7 @@ template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int T
 #define C 1.0f/127.0f
 #define MAX_SPARSE_COUNT 32
 #define SMEM_SIZE 8*256
-template <typename T, int SPMM_ITEMS, int BITS> 
+template <typename T, int SPMM_ITEMS, int BITS>
 __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, T *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB)
 {

@@ -2577,7 +2577,7 @@ __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *o
          #pragma unroll num_items
          for(int k = 0; k < num_items; k++)
            local_valC[(j/num_items) + k] = (float)local_valC[(j/num_items) + k] + (float)local_valOut[k];
-            
+
          reinterpret_cast<float4*>(out)[idx_val/num_items] = reinterpret_cast<float4(&)[num_items]>(local_valC)[j/num_items];
      }
      else
@@ -2591,11 +2591,11 @@ __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *o

    idx_col_B += blockDim.x*SPMM_ITEMS;
    local_idx_col_B_offset += blockDim.x*SPMM_ITEMS;
-  } 
+  }
 }

 template <int FORMAT> __global__ void kExtractOutliers(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA)
-{  
+{
 	int local_colidx = idx[blockIdx.x];

 	if(FORMAT==COL_TURING)
@@ -2655,7 +2655,7 @@ template <int FORMAT> __global__ void kExtractOutliers(char *A, int *idx, char *
 			out[out_idx] = val;
 		}
 	}
-} 
+}

 //==============================================================
 //                   TEMPLATE DEFINITIONS

--- a/csrc/kernels.cuh
+++ b/csrc/kernels.cuh
-// Copyright (c) Facebook, Inc. and its affiliates. 
-//   
-// This source code is licensed under the MIT license found in the 
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
 // LICENSE file in the root directory of this source tree.

 #include <float.h>
@@ -18,49 +18,49 @@ template<typename T, int BLOCK_SIZE, int NUM_PER_TH, int STOCHASTIC> __global__
 template<typename T, int BLOCK_SIZE, int THREADS, int NUM_PER_TH> __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, T *out, const int n);

 template<typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS>
-__global__ void kPreconditionOptimizer32bit2State(T* g, T* p, 
+__global__ void kPreconditionOptimizer32bit2State(T* g, T* p,
                float* state1, float* state2, float *unorm,
                const float beta1, const float beta2, const float eps, const float weight_decay,
                const int step, const float lr, const float gnorm_scale, const int n);

 template<typename T, int OPTIMIZER>
-__global__ void kOptimizer32bit2State(T* g, T* p, 
+__global__ void kOptimizer32bit2State(T* g, T* p,
                float* state1, float* state2, float *unorm, const float max_unorm, const float param_norm,
                const float beta1, const float beta2, const float eps, const float weight_decay,
                const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n);

 template<typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS>
-__global__ void kPreconditionOptimizer32bit1State(T* g, T* p, 
+__global__ void kPreconditionOptimizer32bit1State(T* g, T* p,
                float* state1, float *unorm,
                const float beta1, const float eps, const float weight_decay,
                const int step, const float lr, const float gnorm_scale, const int n);

 template<typename T, int OPTIMIZER>
-__global__ void kOptimizer32bit1State(T* g, T* p, 
+__global__ void kOptimizer32bit1State(T* g, T* p,
                float* state1,  float *unorm, const float max_unorm, const float param_norm,
                const float beta1, const float eps, const float weight_decay,
                const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n);

 template<typename T, int OPTIMIZER>
 __global__ void
-kPreconditionOptimizerStatic8bit1State(T* p, T* __restrict__ const g, unsigned char*__restrict__  const state1, 
+kPreconditionOptimizerStatic8bit1State(T* p, T* __restrict__ const g, unsigned char*__restrict__  const state1,
                float *unorm,
-                const float beta1, 
-                const float eps, const int step, 
-                float* __restrict__ const quantiles1, 
-                float* max1, float* new_max1, 
+                const float beta1,
+                const float eps, const int step,
+                float* __restrict__ const quantiles1,
+                float* max1, float* new_max1,
                const float weight_decay,
                const float gnorm_scale, const int n);


 template<typename T, int OPTIMIZER>
 __global__ void
-kOptimizerStatic8bit1State(T* p, T* const g, unsigned char* state1, 
+kOptimizerStatic8bit1State(T* p, T* const g, unsigned char* state1,
                const float *unorm, const float max_unorm, const float param_norm,
-                const float beta1, 
-                const float eps, const int step, const float lr, 
-                float* __restrict__ const quantiles1, 
-                float* max1, float* new_max1, 
+                const float beta1,
+                const float eps, const int step, const float lr,
+                float* __restrict__ const quantiles1,
+                float* max1, float* new_max1,
                float weight_decay, const float gnorm_scale, const int n);


@@ -70,7 +70,7 @@ __global__ void
 kPreconditionOptimizerStatic8bit2State(T* p, T* __restrict__ const g, unsigned char*__restrict__  const state1, unsigned char* __restrict__ const state2,
                float *unorm,
                const float beta1, const float beta2,
-                const float eps, const int step, 
+                const float eps, const int step,
                float* __restrict__ const quantiles1, float* __restrict__ const quantiles2,
                float* max1, float* max2, float* new_max1, float* new_max2,
                const float gnorm_scale, const int n);
@@ -81,7 +81,7 @@ __global__ void
 kOptimizerStatic8bit2State(T* p, T* const g, unsigned char* state1, unsigned char* state2,
                const float *unorm, const float max_unorm, const float param_norm,
                const float beta1, const float beta2,
-                const float eps, const int step, const float lr, 
+                const float eps, const int step, const float lr,
                float* __restrict__ const quantiles1, float* __restrict__ const quantiles2,
                float* max1, float* max2, float* new_max1, float* new_max2,
                float weight_decay, const float gnorm_scale, const int n);
@@ -121,5 +121,3 @@ template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int T
 template <int FORMAT> __global__ void kExtractOutliers(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA);

 #endif
-
-
--- a/csrc/ops.cu
+++ b/csrc/ops.cu
-// Copyright (c) Facebook, Inc. and its affiliates. 
-//   
-// This source code is licensed under the MIT license found in the 
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
 // LICENSE file in the root directory of this source tree.

 #include <ops.cuh>
@@ -241,7 +241,7 @@ void gemmex(Context *context, bool transposeA, bool transposeB, int m, int n, in

 }

-void strided_gemmex(Context *context, bool transposeA, bool transposeB, int m, int n, int k, void *A, void *B, void *C, int lda, int ldb, int ldc, 
+void strided_gemmex(Context *context, bool transposeA, bool transposeB, int m, int n, int k, void *A, void *B, void *C, int lda, int ldb, int ldc,
                    long long int strideA, long long int strideB, long long int strideC, int batchCount)
 {
  const int falpha = 1;
@@ -351,7 +351,7 @@ template <typename T, int SRC, int TARGET, bool transpose, int DTYPE> void trans
  cublasLtOrder_t orderOut = get_order<TARGET>();
  int ldA = get_leading_dim<SRC>(dim1, dim2);
  int ldOut = get_leading_dim<TARGET>(dim1, dim2);
-  
+
  cublasLtMatrixLayout_t A_desc = NULL, out_desc = NULL;
  cublasLtMatrixTransformDesc_t A2Out_desc = NULL;
  cublasOperation_t opTranspose = CUBLAS_OP_T;
@@ -397,7 +397,7 @@ template void transform<int8_t, ROW, COL_AMPERE, false, 8>(cublasLtHandle_t ltHa
 template void transform<int8_t, COL32, ROW, false, 8>(cublasLtHandle_t ltHandle, int8_t *A, int8_t *out, int dim1, int dim2);
 template void transform<int32_t, COL32, ROW, false, 32>(cublasLtHandle_t ltHandle, int32_t *A, int32_t *out, int dim1, int dim2);

-template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc) 
+template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
 {
 #ifdef NO_CUBLASLT
  cout << "" << endl;

--- a/csrc/ops.cuh
+++ b/csrc/ops.cuh
-// Copyright (c) Facebook, Inc. and its affiliates. 
-//   
-// This source code is licensed under the MIT license found in the 
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
 // LICENSE file in the root directory of this source tree.


@@ -131,7 +131,7 @@ void dequantize(float *code, unsigned char *A, float *out, int n);
 template <typename T, int STOCHASTIC> void quantizeBlockwise(float * code, T *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
 template<typename T> void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, T *out, int block_size, const int n);

-template<typename T, int OPTIMIZER> void optimizer32bit(T* g, T* p, 
+template<typename T, int OPTIMIZER> void optimizer32bit(T* g, T* p,
                float* state1, float* state2, float *unorm, float max_unorm, float param_norm,
                float beta1, float beta2, float eps, float weight_decay,
                int step, float lr, const float gnorm_scale, bool skip_zeros, int n);
@@ -139,15 +139,15 @@ template<typename T, int OPTIMIZER> void optimizer32bit(T* g, T* p,
 template<typename T, int OPTIMIZER> void optimizerStatic8bit(T* p, T* g, unsigned char* state1, unsigned char* state2,
                float *unorm, float max_unorm, float param_norm,
                float beta1, float beta2,
-                float eps, int step, float lr, 
+                float eps, int step, float lr,
                float* quantiles1, float* quantiles2,
                float* max1, float* max2, float* new_max1, float* new_max2,
                float weight_decay,
                const float gnorm_scale, int n);

 template<typename T, int OPTIMIZER> void optimizerStatic8bitBlockwise(T* p, T* g,
-                unsigned char* state1, unsigned char* state2, float beta1, float beta2, float eps, int step, float lr, 
-                float* quantiles1, float* quantiles2, float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale, 
+                unsigned char* state1, unsigned char* state2, float beta1, float beta2, float eps, int step, float lr,
+                float* quantiles1, float* quantiles2, float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale,
 								bool skip_zeros, int n);

 template<typename T> void percentileClipping(T * g, float *gnorm_vec, int step, const int n);
@@ -155,7 +155,7 @@ template<typename T> void percentileClipping(T * g, float *gnorm_vec, int step,
 void histogramScatterAdd2D(float* histogram, int *index1, int *index2, float *src, int maxidx1, int n);

 void gemmex(Context * context, bool transposeA, bool transposeB, int m, int n, int k, void *A, void *B, void *C, int lda, int ldb, int ldc);
-void strided_gemmex(Context *context, bool transposeA, bool transposeB, int m, int n, int k, void *A, void *B, void *C, int lda, int ldb, int ldc, 
+void strided_gemmex(Context *context, bool transposeA, bool transposeB, int m, int n, int k, void *A, void *B, void *C, int lda, int ldb, int ldc,
                    long long int strideA, long long int strideB, long long int strideC, int batchCount);



--- a/csrc/pythonInterface.c
+++ b/csrc/pythonInterface.c
-// Copyright (c) Facebook, Inc. and its affiliates. 
-//   
-// This source code is licensed under the MIT license found in the 
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
 // LICENSE file in the root directory of this source tree.

 #if BUILD_CUDA
@@ -9,7 +9,7 @@
 #include <cpu_ops.h>

 // We cannot call templated code from C, so we wrap the template in a C compatible call here if necessary.
-// We use macro functions to expand all the different optimizers. Looks ugly, and is ugly, but its better than to 
+// We use macro functions to expand all the different optimizers. Looks ugly, and is ugly, but it's better than to
 // maintain all that boilerplate
 //===================================================================================
 //                               UNMANGLED CALLS
@@ -290,4 +290,3 @@ extern "C"
 	void cquantize_blockwise_cpu_fp32(float *code, float *A, float *absmax, unsigned char *out, long long blocksize, long long n){ quantize_cpu(code, A, absmax, out, blocksize, n); }
 	void cdequantize_blockwise_cpu_fp32(float *code, unsigned char *A, float *absmax, float *out, long long blocksize, long long n){ dequantize_cpu(code, A, absmax, out, blocksize, n); }
 }
-
--- a/cuda_install.sh
+++ b/cuda_install.sh
@@ -76,6 +76,3 @@ if [[ -n "$CUDA_VERSION" ]]; then
 else
  echo ""
 fi
-
-
-
--- a/howto_config_override.md
+++ b/howto_config_override.md
@@ -14,16 +14,16 @@ mng.register_parameters(model.parameters()) # 1. register parameters while still

 model = model.cuda()
 # use 8-bit optimizer states for all parameters
-adam = bnb.optim.Adam(model.parameters(), lr=0.001, optim_bits=8) 
+adam = bnb.optim.Adam(model.parameters(), lr=0.001, optim_bits=8)

 # 2a. override: the parameter model.fc1.weight now uses 32-bit Adam
-mng.override_config(model.fc1.weight, 'optim_bits', 32) 
+mng.override_config(model.fc1.weight, 'optim_bits', 32)

 # 2b. override: the two special layers use
 # sparse optimization + different learning rate + different Adam betas
 mng.override_config([model.special.weight, model.also_special.weight],
-                    key_value_dict ={'is_sparse': True, 'lr': 1e-5, 'betas'=(0.9, 0.98)}) 
-``` 
+                    key_value_dict ={'is_sparse': True, 'lr': 1e-5, 'betas': (0.9, 0.98)})
+```
 Possible options for the config override are: `betas, eps, weight_decay, lr, optim_bits, min_8bit_size, percentile_clipping, block_wise, max_unorm`

 For overrides for particular layers we recommend overriding locally in each module. You can do this by passing the module, the parameter, and its attribute name to the GlobalOptimManager:

--- a/include/Algo-Direct-Common.h
+++ b/include/Algo-Direct-Common.h
@@ -121,7 +121,7 @@ template <unsigned char Gap, typename T>
 struct DirectTraits<true,Gap,T>
 {
    typedef FVec1<SSE, T> fVec1;
-    
+
    static void checkH(T scaler, T H_Times_x0, T xN)
    {
        union {
@@ -177,9 +177,9 @@ struct DirectInfo
            , cst0(fun_t::cst0(H, x[0]))
        {
            myassert(((bws != NULL) && (isAligned(bws,64))), "bucket pointer not allocated or incorrectly aligned");
-            
+
            uint32 nb = 1 + fun_t::f(H, cst0, x[n-1]);
-            
+
            const uint32 npad = Gap-1;
            const uint32 n_sz = n + npad;   // size of padded vector

@@ -320,7 +320,7 @@ struct DirectInfo
        T cst0 = fun_t::cst0(H, px[0]);
        const uint32 maxIndex = fun_t::f(H, cst0, px[n-1]);
        buckets.resize(maxIndex + 1);
-        
+
        data = Data(px, n, H, buckets.begin(), (npad? xi.begin(): NULL));
    }


--- a/include/SIMD.h
+++ b/include/SIMD.h
@@ -203,7 +203,7 @@ struct IVec<SSE, double> : IVecBase<SSE>
 #if 1
        // takes 4 cycles
        __m128i hi = _mm_shuffle_epi32(vec, 2);  // 1 cycle
-        __m128i s = _mm_add_epi32(vec, hi);      
+        __m128i s = _mm_add_epi32(vec, hi);
        int32 x = _mm_cvtsi128_si32(s);
        return -x;
 #else

--- a/setup.py
+++ b/setup.py
@@ -26,9 +26,6 @@ setup(
    keywords="gpu optimizers optimization 8-bit quantization compression",
    url="https://github.com/TimDettmers/bitsandbytes",
    packages=find_packages(),
-    entry_points={
-        "console_scripts": ["debug_cuda = bitsandbytes.debug_cli:cli"],
-    },
    package_data={"": libs},
    long_description=read("README.md"),
    long_description_content_type="text/markdown",

--- a/tests/test_autograd.py
+++ b/tests/test_autograd.py
-from itertools import product, permutations
+from itertools import permutations, product

 import pytest
 import torch
@@ -27,7 +27,7 @@ str_values = list(
    )
 )
 names = [
-    "dim1_{0}_dim2_{1}_dim3_{2}_dim4_{3}_func_{4}_dtype_{5}_requires_grad_{6}_transpose_{7}".format(
+    "dim1_{}_dim2_{}_dim3_{}_dim4_{}_func_{}_dtype_{}_requires_grad_{}_transpose_{}".format(
        *vals
    )
    for vals in str_values
@@ -286,7 +286,7 @@ str_values = list(
        has_bias
    )
 )
-names = ["dim1_{0}_dim2_{1}_dim3_{2}_dim4_{3}_func_{4}_dtype_{5}_requires_grad_{6}_transpose_{7}_decomp_{8}_has_fp16_weights_{9}_has_bias_{10}".format(*vals) for vals in str_values]
+names = ["dim1_{}_dim2_{}_dim3_{}_dim4_{}_func_{}_dtype_{}_requires_grad_{}_transpose_{}_decomp_{}_has_fp16_weights_{}_has_bias_{}".format(*vals) for vals in str_values]


 @pytest.mark.parametrize(
@@ -336,7 +336,7 @@ def test_matmullt(
            )
            bias = None
            bias2 = None
-            if has_bias: 
+            if has_bias:
                bias = torch.randn(dim4, device='cuda', dtype=dtype, requires_grad=req_grad[2])
                bias2 = bias.clone()
            torch.nn.init.xavier_uniform_(B)

--- a/tests/test_cuda_setup_evaluator.py
+++ b/tests/test_cuda_setup_evaluator.py
 import os
-import pytest
-import bitsandbytes as bnb
-
 from typing import List, NamedTuple

+import pytest
+
+import bitsandbytes as bnb
 from bitsandbytes.cuda_setup import (
    CUDA_RUNTIME_LIB,
-    evaluate_cuda_setup,
    determine_cuda_runtime_lib_path,
+    evaluate_cuda_setup,
    extract_candidate_paths,
 )


--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -28,7 +28,7 @@ def assert_all_approx_close(a, b, rtol=1e-3, atol=1e-3, count=0):

 class FFN(torch.nn.Module):
    def __init__(self, input_features, hidden_size, bias=True):
-        super(FFN, self).__init__()
+        super().__init__()
        self.fc1 = torch.nn.Linear(input_features, hidden_size, bias=bias)
        self.fc2 = torch.nn.Linear(hidden_size, input_features, bias=bias)

@@ -42,7 +42,7 @@ class FFN(torch.nn.Module):
        return x


-class Timer(object):
+class Timer:
    def __init__(self):
        self.starts = {}
        self.ends = {}
@@ -69,7 +69,7 @@ class Timer(object):
                self.ends.pop(name)

        if print_ms and name in self.agg:
-            print("{0} took: {1:.5f}s".format(name, self.agg[name] / 1000.0))
+            print(f"{name} took: {self.agg[name] / 1000.0:.5f}s")

        return self.agg[name]

@@ -302,7 +302,7 @@ batched = [False, True]
 values = list(product(dim1, dim2, methods, batched))
 values_names = list(product(dim1, dim2, method_names, batched))
 names = [
-    "dim1_{0}_dim2_{1}_quant_{2}_batched_{3}".format(*vals)
+    "dim1_{}_dim2_{}_quant_{}_batched_{}".format(*vals)
    for vals in values_names
 ]

@@ -360,7 +360,7 @@ seq_dim = torch.randint(16, 256, size=(n,)).tolist()
 transpose = [(False, False), (False, True), (True, False), (True, True)]
 values = list(product(hidden_dim, batch_dim, transpose, seq_dim))
 names = [
-    "hidden_dim_{0}_batch_dim_{1},transpose_{2}_seq_dim_{3}".format(*vals)
+    "hidden_dim_{}_batch_dim_{},transpose_{}_seq_dim_{}".format(*vals)
    for vals in values
 ]

@@ -425,7 +425,7 @@ hidden_dim = torch.randint(32, 1024 * 4, size=(n,)).tolist()
 batch_dim = torch.randint(2, 16, size=(n,)).tolist()
 values = list(product(seq_dim, hidden_dim, batch_dim))
 names = [
-    "seq_dim{0}_hidden_dim{1}_batch_dim{2}".format(*vals) for vals in values
+    "seq_dim{}_hidden_dim{}_batch_dim{}".format(*vals) for vals in values
 ]


@@ -457,7 +457,7 @@ batch_dim = torch.randint(2, 16, size=(n,)).tolist()
 transpose = [False, True]
 values = list(product(seq_dim, hidden_dim, batch_dim, transpose))
 names = [
-    "seq_dim={0}_hidden_dim={1}_batch_dim={2}_transpose{3}".format(*vals)
+    "seq_dim={}_hidden_dim={}_batch_dim={}_transpose{}".format(*vals)
    for vals in values
 ]

@@ -542,7 +542,7 @@ dim4 = torch.randint(32, 256, size=(n,)).tolist()
 transpose = [(False, False), (True, False), (False, True), (True, True)]
 values = list(product(dim1, dim2, dim3, dim4, transpose))
 names = [
-    "dim1_{0}_dim2_{1}_dim3_{2}_dim4_{3}_transpose_{4}".format(*vals)
+    "dim1_{}_dim2_{}_dim3_{}_dim4_{}_transpose_{}".format(*vals)
    for vals in values
 ]

@@ -580,7 +580,7 @@ dim1 = torch.randint(1, 64, size=(n,)).tolist()
 dim2 = torch.randint(32, 128, size=(n,)).tolist()
 dim3 = torch.randint(32, 256, size=(n,)).tolist()
 values = list(product(dim1, dim2, dim3))
-names = ["dim1_{0}_dim2_{1}_dim3_{2}".format(*vals) for vals in values]
+names = ["dim1_{}_dim2_{}_dim3_{}".format(*vals) for vals in values]


 @pytest.mark.parametrize("dim1, dim2, dim3", values, ids=names)
@@ -609,7 +609,7 @@ transpose = [False]
 dims = [2, 3]
 values = list(product(dim1, dim2, dim3, dims, dtype, a_order, out_order, transpose))

-names = ["dim1_{0}_dim2_{1}_dim3_{2}_dims_{3}_dtype_{4}_orderA_{5}_orderOut_{6}_transpose_{7}".format(*vals)for vals in values]
+names = ["dim1_{}_dim2_{}_dim3_{}_dims_{}_dtype_{}_orderA_{}_orderOut_{}_transpose_{}".format(*vals)for vals in values]


 @pytest.mark.parametrize("dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose",values,ids=names)
@@ -691,7 +691,7 @@ ldb = [0]
 # ldb = list(range(256, 1*1024, 256))
 values = list(product(dim1, dim2, dim3, dim4, dims, ldb))
 names = [
-    "dim1_{0}_dim2_{1}_dim3_{2}_dim4_{3}_dims_{4}_ldb_{5}".format(*vals)
+    "dim1_{}_dim2_{}_dim3_{}_dim4_{}_dims_{}_ldb_{}".format(*vals)
    for vals in values
 ]

@@ -739,7 +739,7 @@ dims = (2,)
 # ldb = list(range(256, 1*1024, 256))
 values = list(product(dim1, dim2, dim3, dim4, dims))
 names = [
-    "dim1_{0}_dim2_{1}_dim3_{2}_dim4_{3}_dims_{4}".format(*vals)
+    "dim1_{}_dim2_{}_dim3_{}_dim4_{}_dims_{}".format(*vals)
    for vals in values
 ]

@@ -797,7 +797,7 @@ values = [

 # values = list(product(batch, seq, model, hidden))
 names = [
-    "batch_{0}_seq_{1}_model_{2}_hidden_{3}".format(*vals) for vals in values
+    "batch_{}_seq_{}_model_{}_hidden_{}".format(*vals) for vals in values
 ]


@@ -965,7 +965,7 @@ dims = (2,)
 formatB = ["col_turing", "col_ampere"]
 has_bias = [True, False]
 values = list(product(dim1, dim4, dims, formatB, has_bias))
-names = ["dim1_{0}_dim4_{1}_dims_{2}_formatB_{3}_has_bias_{4}".format(*vals) for vals in values]
+names = ["dim1_{}_dim4_{}_dims_{}_formatB_{}_has_bias_{}".format(*vals) for vals in values]


 @pytest.mark.parametrize("dim1, dim4, dims, formatB, has_bias", values, ids=names)
@@ -1015,7 +1015,7 @@ dim2 = [1 * 1024]
 dims = (2,)
 # ldb = list(range(256, 1*1024, 256))
 values = list(product(dim1, dim2, dims))
-names = ["dim1_{0}_dim2_{1}_dims_{2}".format(*vals) for vals in values]
+names = ["dim1_{}_dim2_{}_dims_{}".format(*vals) for vals in values]


 @pytest.mark.parametrize("dim1, dim2, dims", values, ids=names)
@@ -1071,7 +1071,7 @@ dim1 = torch.randint(1, 4 * 1024, size=(n,)).tolist()
 dim2 = torch.randint(1, 4 * 1024, size=(n,)).tolist()

 values = list(product(dim1, dim2))
-names = ["dim1_{0}_dim2_{1}".format(*vals) for vals in values]
+names = ["dim1_{}_dim2_{}".format(*vals) for vals in values]


 @pytest.mark.parametrize("dim1, dim2", values, ids=names)
@@ -1118,7 +1118,7 @@ dim4 = torch.randint(1, 4 * 1024, size=(n,)).tolist()
 inner = torch.randint(1, 4 * 1024, size=(n,)).tolist()

 values = list(zip(dim1, dim4, inner))
-names = ["dim1_{0}_dim4_{1}_inner_{2}".format(*vals) for vals in values]
+names = ["dim1_{}_dim4_{}_inner_{}".format(*vals) for vals in values]


 @pytest.mark.parametrize("dim1, dim4, inner", values, ids=names)
@@ -1162,7 +1162,7 @@ dim4 = torch.randint(1, 4 * 1024, size=(n,)).tolist()
 inner = torch.randint(1, 4 * 1024, size=(n,)).tolist()

 values = list(zip(dim1, dim4, inner))
-names = ["dim1_{0}_dim4_{1}_inner_{2}".format(*vals) for vals in values]
+names = ["dim1_{}_dim4_{}_inner_{}".format(*vals) for vals in values]


 @pytest.mark.parametrize("dim1, dim4, inner", values, ids=names)
@@ -1237,7 +1237,7 @@ inner = [12288 * 4, 4096 * 4]
 dim4 = [12288, 4096]

 values = list(zip(dim1, dim4, inner))
-names = ["dim1_{0}_dim4_{1}_inner_{2}".format(*vals) for vals in values]
+names = ["dim1_{}_dim4_{}_inner_{}".format(*vals) for vals in values]


 @pytest.mark.parametrize("dim1, dim4, inner", values, ids=names)
@@ -1303,7 +1303,7 @@ values = list(
    product(dim1, dim2, dim3, dims, dtype, a_order, out_order, transpose)
 )
 names = [
-    "dim1_{0}_dim2_{1}_dim3_{2}_dims_{3}_dtype_{4}_orderA_{5}_orderOut_{6}_{7}".format(
+    "dim1_{}_dim2_{}_dim3_{}_dims_{}_dtype_{}_orderA_{}_orderOut_{}_{}".format(
        *vals
    )
    for vals in values
@@ -1354,7 +1354,7 @@ a_order = ["col_turing"]
 out_order = ["row"]
 values = list(product(dim1, dim2, dtype, a_order, out_order))
 names = [
-    "dim1_{0}_dim2_{1}_dtype_{2}_orderA_{3}_orderOut_{4}".format(*vals)
+    "dim1_{}_dim2_{}_dtype_{}_orderA_{}_orderOut_{}".format(*vals)
    for vals in values
 ]

@@ -1380,7 +1380,7 @@ dim2 = torch.randint(1, 4 * 1024, size=(n,)).tolist()
 # dim2 = [5]

 values = list(product(dim1, dim2))
-names = ["dim1_{0}_dim2_{1}".format(*vals) for vals in values]
+names = ["dim1_{}_dim2_{}".format(*vals) for vals in values]


 @pytest.mark.parametrize("dim1, dim2", values, ids=names)
@@ -1417,7 +1417,7 @@ dim2 = torch.randint(1, 1 * 1024, size=(n,)).tolist()
 # dim2 = [11]
 transposed_B = [False, True]
 values = list(product(dim1, dim2, transposed_B))
-names = ["dim1_{0}_dim2_{1}_transposed_B_{2}".format(*vals) for vals in values]
+names = ["dim1_{}_dim2_{}_transposed_B_{}".format(*vals) for vals in values]


 @pytest.mark.parametrize("dim1, dim2, transposed_B", values, ids=names)
@@ -1498,7 +1498,7 @@ n = 2
 dim1 = torch.randint(256, 1 * 1024, size=(n,)).tolist()
 dim2 = torch.randint(256, 1 * 1024, size=(n,)).tolist()
 values = list(product(dim1, dim2))
-names = ["dim1_{0}_dim2_{1}".format(*vals) for vals in values]
+names = ["dim1_{}_dim2_{}".format(*vals) for vals in values]


 @pytest.mark.parametrize("dim1, dim2", values, ids=names)
@@ -1563,7 +1563,7 @@ dtype = [torch.float16]
 out_function = ["zeros", "ones"]
 values = list(product(dim1, dim2, dtype, out_function))
 names = [
-    "dim1_{0}_dim2_{1}_dtype_{2}_out_func_{3}".format(*vals) for vals in values
+    "dim1_{}_dim2_{}_dtype_{}_out_func_{}".format(*vals) for vals in values
 ]


@@ -1680,7 +1680,7 @@ dim2 = [2048]
 # dim2 = [2]
 dtype = [torch.int8]
 values = list(product(dim1, dim2, dtype))
-names = ["dim1_{0}_dim2_{1}_dtype_{2}".format(*vals) for vals in values]
+names = ["dim1_{}_dim2_{}_dtype_{}".format(*vals) for vals in values]


 @pytest.mark.parametrize("dim1, dim2, dtype", values, ids=names)
@@ -1796,7 +1796,7 @@ values.append((batch_size, seqdim, 768, 4 * 768))
 # values.append((batch_size, seqdim, 5140, 4*5140))
 #values.append((batch_size, seqdim, 12288, 4*12288))
 names = [
-    "batch_{0}_seq_{1}_model_{2}_hidden_{3}".format(*vals) for vals in values
+    "batch_{}_seq_{}_model_{}_hidden_{}".format(*vals) for vals in values
 ]



--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -7,7 +7,7 @@ from torch import nn
 import bitsandbytes as bnb


-class MockArgs(object):
+class MockArgs:
    def __init__(self, initial_data):
        for key in initial_data:
            setattr(self, key, initial_data[key])
@@ -15,7 +15,7 @@ class MockArgs(object):

 class MLP8bit(torch.nn.Module):
    def __init__(self, dim1, dim2, has_fp16_weights=True, memory_efficient_backward=False, threshold=0.0):
-        super(MLP8bit, self).__init__()
+        super().__init__()
        self.fc1 = bnb.nn.Linear8bitLt(
            dim1, dim2, has_fp16_weights=has_fp16_weights, memory_efficient_backward=memory_efficient_backward,
            threshold=threshold
@@ -289,7 +289,7 @@ class LinearFunction(torch.autograd.Function):

 class Linear8bit(nn.Module):
    def __init__(self, input_features, output_features, bias=True, args=None):
-        super(Linear8bit, self).__init__()
+        super().__init__()
        self.input_features = input_features
        self.output_features = output_features
        self.args = args
@@ -312,7 +312,7 @@ class Linear8bit(nn.Module):

 threshold = [0.0, 3.0]
 values = threshold
-names = ["threshold_{0}".format(vals) for vals in values]
+names = [f"threshold_{vals}" for vals in values]


 @pytest.mark.parametrize("threshold", values, ids=names)
@@ -378,7 +378,7 @@ def test_linear8bitlt_accumulated_gradient():

 threshold = [0.0, 2.0]
 values = threshold
-names = ["threshold_{0}".format(vals) for vals in values]
+names = [f"threshold_{vals}" for vals in values]


 @pytest.mark.parametrize("threshold", values, ids=names)

--- a/tests/test_optim.py
+++ b/tests/test_optim.py
@@ -18,7 +18,7 @@ k = 20


 def get_temp_dir():
-    path = "/tmp/autoswap/{0}".format(str(uuid.uuid4()))
+    path = f"/tmp/autoswap/{str(uuid.uuid4())}"
    os.makedirs(path, exist_ok=True)
    return path

@@ -116,7 +116,7 @@ gtype = [torch.float32, torch.float16]
 optimizer_names = ["adam", "momentum", "rmsprop", "lars"]
 values = list(product(dim1, dim2, gtype, optimizer_names))
 names = [
-    "dim1_{0}_dim2_{1}_gtype_{2}_optim_{3}".format(*vals) for vals in values
+    "dim1_{}_dim2_{}_gtype_{}_optim_{}".format(*vals) for vals in values
 ]


@@ -187,7 +187,7 @@ dim1 = [1024]
 dim2 = [32, 1024, 4097]
 gtype = [torch.float32, torch.float16]
 values = list(product(dim1, dim2, gtype))
-names = ["dim1_{0}_dim2_{1}_gtype_{2}".format(*vals) for vals in values]
+names = ["dim1_{}_dim2_{}_gtype_{}".format(*vals) for vals in values]


 @pytest.mark.parametrize("dim1, dim2, gtype", values, ids=names)
@@ -250,7 +250,7 @@ optimizer_names = [
 ]
 values = list(product(dim1, dim2, gtype, optimizer_names))
 names = [
-    "dim1_{0}_dim2_{1}_gtype_{2}_optim_{3}".format(*vals) for vals in values
+    "dim1_{}_dim2_{}_gtype_{}_optim_{}".format(*vals) for vals in values
 ]


@@ -391,7 +391,7 @@ gtype = [torch.float32]
 optim_bits = [32, 8]
 values = list(product(dim1, dim2, gtype, optim_bits))
 names = [
-    "dim1_{0}_dim2_{1}_gtype_{2}_optim_bits_{3}".format(*vals)
+    "dim1_{}_dim2_{}_gtype_{}_optim_bits_{}".format(*vals)
    for vals in values
 ]

@@ -495,7 +495,7 @@ gtype = [torch.float32, torch.float16]
 optimizer_names = ["adam8bit_blockwise"]
 values = list(product(dim1, dim2, gtype, optimizer_names))
 names = [
-    "dim1_{0}_dim2_{1}_gtype_{2}_optim_{3}".format(*vals) for vals in values
+    "dim1_{}_dim2_{}_gtype_{}_optim_{}".format(*vals) for vals in values
 ]