Added bitsandbytes

144fd688 · zhaoying1 · 387082e1 · 144fd688 · 144fd688 · 144fd688
Commit 144fd688 authored Jun 08, 2023 by zhaoying1
20 changed files
--- a/bitsandbytes/optim/lamb.py
+++ b/bitsandbytes/optim/lamb.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+from bitsandbytes.optim.optimizer import Optimizer2State
+class LAMB(Optimizer2State):
+    def __init__(
+        self,
+        params,
+        lr=1e-3,
+        bias_correction=True,
+        betas=(0.9, 0.999),
+        eps=1e-8,
+        weight_decay=0,
+        amsgrad=False,
+        adam_w_mode=True,
+        optim_bits=32,
+        args=None,
+        min_8bit_size=4096,
+        percentile_clipping=100,
+        block_wise=False,
+        max_unorm=1.0,
+    ):
+        super(LAMB, self).__init__(
+            "lamb",
+            params,
+            lr,
+            betas,
+            eps,
+            weight_decay,
+            optim_bits,
+            args,
+            min_8bit_size,
+            percentile_clipping,
+            block_wise,
+            max_unorm=1.0,
+        )
+class LAMB8bit(Optimizer2State):
+    def __init__(
+        self,
+        params,
+        lr=1e-3,
+        bias_correction=True,
+        betas=(0.9, 0.999),
+        eps=1e-8,
+        weight_decay=0,
+        amsgrad=False,
+        adam_w_mode=True,
+        args=None,
+        min_8bit_size=4096,
+        percentile_clipping=100,
+        block_wise=False,
+        max_unorm=1.0,
+    ):
+        super(LAMB8bit, self).__init__(
+            "lamb",
+            params,
+            lr,
+            betas,
+            eps,
+            weight_decay,
+            8,
+            args,
+            min_8bit_size,
+            percentile_clipping,
+            block_wise,
+            max_unorm=1.0,
+        )
+class LAMB32bit(Optimizer2State):
+    def __init__(
+        self,
+        params,
+        lr=1e-3,
+        bias_correction=True,
+        betas=(0.9, 0.999),
+        eps=1e-8,
+        weight_decay=0,
+        amsgrad=False,
+        adam_w_mode=True,
+        args=None,
+        min_8bit_size=4096,
+        percentile_clipping=100,
+        block_wise=False,
+        max_unorm=1.0,
+    ):
+        super(LAMB32bit, self).__init__(
+            "lamb",
+            params,
+            lr,
+            betas,
+            eps,
+            weight_decay,
+            32,
+            args,
+            min_8bit_size,
+            percentile_clipping,
+            block_wise,
+            max_unorm=1.0,
+        )
--- a/bitsandbytes/optim/lars.py
+++ b/bitsandbytes/optim/lars.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import torch
+from torch.optim import Optimizer
+from bitsandbytes.optim.optimizer import Optimizer1State
+class LARS(Optimizer1State):
+    def __init__(
+        self,
+        params,
+        lr,
+        momentum=0,
+        dampening=0,
+        weight_decay=0,
+        nesterov=False,
+        optim_bits=32,
+        args=None,
+        min_8bit_size=4096,
+        percentile_clipping=100,
+        max_unorm=0.02,
+    ):
+        if momentum == 0:
+            raise NotImplementedError(
+                f"LARS without momentum is not supported!"
+            )
+        super(LARS, self).__init__(
+            "lars",
+            params,
+            lr,
+            (momentum, dampening),
+            0.0,
+            weight_decay,
+            optim_bits,
+            args,
+            min_8bit_size,
+            percentile_clipping,
+            max_unorm=max_unorm,
+            block_wise=False,
+        )
+class LARS8bit(Optimizer1State):
+    def __init__(
+        self,
+        params,
+        lr,
+        momentum=0,
+        dampening=0,
+        weight_decay=0,
+        nesterov=False,
+        args=None,
+        min_8bit_size=4096,
+        percentile_clipping=100,
+        max_unorm=0.02,
+    ):
+        if momentum == 0:
+            raise NotImplementedError(
+                f"LARS without momentum is not supported!"
+            )
+        super(LARS8bit, self).__init__(
+            "lars",
+            params,
+            lr,
+            (momentum, dampening),
+            0.0,
+            weight_decay,
+            8,
+            args,
+            min_8bit_size,
+            percentile_clipping,
+            max_unorm=max_unorm,
+            block_wise=False,
+        )
+class LARS32bit(Optimizer1State):
+    def __init__(
+        self,
+        params,
+        lr,
+        momentum=0,
+        dampening=0,
+        weight_decay=0,
+        nesterov=False,
+        args=None,
+        min_8bit_size=4096,
+        percentile_clipping=100,
+        max_unorm=0.02,
+    ):
+        if momentum == 0:
+            raise NotImplementedError(
+                f"LARS without momentum is not supported!"
+            )
+        super(LARS32bit, self).__init__(
+            "lars",
+            params,
+            lr,
+            (momentum, dampening),
+            0.0,
+            weight_decay,
+            32,
+            args,
+            min_8bit_size,
+            percentile_clipping,
+            max_unorm=max_unorm,
+            block_wise=False,
+        )
+class PytorchLARS(Optimizer):
+    def __init__(
+        self,
+        params,
+        lr=0.01,
+        momentum=0,
+        dampening=0,
+        weight_decay=0,
+        nesterov=False,
+        max_unorm=0.02,
+    ):
+        if lr < 0.0:
+            raise ValueError("Invalid learning rate: {}".format(lr))
+        if momentum < 0.0:
+            raise ValueError("Invalid momentum value: {}".format(momentum))
+        if weight_decay < 0.0:
+            raise ValueError(
+                "Invalid weight_decay value: {}".format(weight_decay)
+            )
+        defaults = dict(
+            lr=lr,
+            momentum=momentum,
+            dampening=dampening,
+            weight_decay=weight_decay,
+            nesterov=nesterov,
+            max_unorm=max_unorm,
+        )
+        if nesterov and (momentum <= 0 or dampening != 0):
+            raise ValueError(
+                "Nesterov momentum requires a momentum and zero dampening"
+            )
+        super(PytorchLARS, self).__init__(params, defaults)
+    def __setstate__(self, state):
+        super(PytorchLARS, self).__setstate__(state)
+        for group in self.param_groups:
+            group.setdefault("nesterov", False)
+    @torch.no_grad()
+    def step(self, closure=None):
+        """Performs a single optimization step.
+        Args:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        """
+        loss = None
+        if closure is not None:
+            with torch.enable_grad():
+                loss = closure()
+        for group in self.param_groups:
+            params_with_grad = []
+            d_p_list = []
+            momentum_buffer_list = []
+            weight_decay = group["weight_decay"]
+            momentum = group["momentum"]
+            dampening = group["dampening"]
+            nesterov = group["nesterov"]
+            max_unorm = group["max_unorm"]
+            lr = group["lr"]
+            for p in group["params"]:
+                if p.grad is None:
+                    continue
+                state = self.state[p]
+                d_p = p.grad
+                if weight_decay != 0:
+                    d_p = d_p.add(param, alpha=weight_decay)
+                if momentum != 0:
+                    buf = state.get("momentum_buffer", None)
+                    if buf is None:
+                        buf = torch.clone(d_p).detach()
+                        state["momentum_buffer"] = buf
+                    else:
+                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
+                    if nesterov:
+                        update = d_p + buf * momentum
+                    else:
+                        update = buf
+                update_scale = 1.0
+                if max_unorm > 0.0:
+                    assert p.dtype == torch.float32
+                    pnorm = torch.norm(p.detach())
+                    unorm = torch.norm(update)
+                    if unorm > max_unorm * pnorm:
+                        update_scale = max_unorm * pnorm / unorm
+                p.add_(update, alpha=-lr * update_scale)
+        return loss
--- a/bitsandbytes/optim/optimizer.py
+++ b/bitsandbytes/optim/optimizer.py
--- a/bitsandbytes/optim/rmsprop.py
+++ b/bitsandbytes/optim/rmsprop.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+from bitsandbytes.optim.optimizer import Optimizer1State
+class RMSprop(Optimizer1State):
+    def __init__(
+        self,
+        params,
+        lr=1e-2,
+        alpha=0.99,
+        eps=1e-8,
+        weight_decay=0,
+        momentum=0,
+        centered=False,
+        optim_bits=32,
+        args=None,
+        min_8bit_size=4096,
+        percentile_clipping=100,
+        block_wise=True,
+    ):
+        if alpha == 0:
+            raise NotImplementedError(
+                f"RMSprop with alpha==0.0 is not supported!"
+            )
+        if centered:
+            raise NotImplementedError(f"Centered RMSprop is not supported!")
+        super(RMSprop, self).__init__(
+            "rmsprop",
+            params,
+            lr,
+            (alpha, momentum),
+            eps,
+            weight_decay,
+            optim_bits,
+            args,
+            min_8bit_size,
+            percentile_clipping,
+            block_wise,
+        )
+class RMSprop8bit(Optimizer1State):
+    def __init__(
+        self,
+        params,
+        lr=1e-2,
+        alpha=0.99,
+        eps=1e-8,
+        weight_decay=0,
+        momentum=0,
+        centered=False,
+        args=None,
+        min_8bit_size=4096,
+        percentile_clipping=100,
+        block_wise=True,
+    ):
+        if alpha == 0:
+            raise NotImplementedError(
+                f"RMSprop with alpha==0.0 is not supported!"
+            )
+        if centered:
+            raise NotImplementedError(f"Centered RMSprop is not supported!")
+        super(RMSprop8bit, self).__init__(
+            "rmsprop",
+            params,
+            lr,
+            (alpha, momentum),
+            eps,
+            weight_decay,
+            8,
+            args,
+            min_8bit_size,
+            percentile_clipping,
+            block_wise,
+        )
+class RMSprop32bit(Optimizer1State):
+    def __init__(
+        self,
+        params,
+        lr=1e-2,
+        alpha=0.99,
+        eps=1e-8,
+        weight_decay=0,
+        momentum=0,
+        centered=False,
+        args=None,
+        min_8bit_size=4096,
+        percentile_clipping=100,
+        block_wise=True,
+    ):
+        if alpha == 0:
+            raise NotImplementedError(
+                f"RMSprop with alpha==0.0 is not supported!"
+            )
+        if centered:
+            raise NotImplementedError(f"Centered RMSprop is not supported!")
+        super(RMSprop32bit, self).__init__(
+            "rmsprop",
+            params,
+            lr,
+            (alpha, momentum),
+            eps,
+            weight_decay,
+            32,
+            args,
+            min_8bit_size,
+            percentile_clipping,
+            block_wise,
+        )
--- a/bitsandbytes/optim/sgd.py
+++ b/bitsandbytes/optim/sgd.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+from bitsandbytes.optim.optimizer import Optimizer1State
+class SGD(Optimizer1State):
+    def __init__(
+        self,
+        params,
+        lr,
+        momentum=0,
+        dampening=0,
+        weight_decay=0,
+        nesterov=False,
+        optim_bits=32,
+        args=None,
+        min_8bit_size=4096,
+        percentile_clipping=100,
+        block_wise=True,
+    ):
+        if momentum == 0:
+            raise NotImplementedError(f"SGD without momentum is not supported!")
+        super(SGD, self).__init__(
+            "momentum",
+            params,
+            lr,
+            (momentum, dampening),
+            0.0,
+            weight_decay,
+            optim_bits,
+            args,
+            min_8bit_size,
+            percentile_clipping,
+            block_wise,
+        )
+class SGD8bit(Optimizer1State):
+    def __init__(
+        self,
+        params,
+        lr,
+        momentum=0,
+        dampening=0,
+        weight_decay=0,
+        nesterov=False,
+        args=None,
+        min_8bit_size=4096,
+        percentile_clipping=100,
+        block_wise=True,
+    ):
+        if momentum == 0:
+            raise NotImplementedError(f"SGD without momentum is not supported!")
+        super(SGD8bit, self).__init__(
+            "momentum",
+            params,
+            lr,
+            (momentum, dampening),
+            0.0,
+            weight_decay,
+            8,
+            args,
+            min_8bit_size,
+            percentile_clipping,
+            block_wise,
+        )
+class SGD32bit(Optimizer1State):
+    def __init__(
+        self,
+        params,
+        lr,
+        momentum=0,
+        dampening=0,
+        weight_decay=0,
+        nesterov=False,
+        args=None,
+        min_8bit_size=4096,
+        percentile_clipping=100,
+        block_wise=True,
+    ):
+        if momentum == 0:
+            raise NotImplementedError(f"SGD without momentum is not supported!")
+        super(SGD32bit, self).__init__(
+            "momentum",
+            params,
+            lr,
+            (momentum, dampening),
+            0.0,
+            weight_decay,
+            32,
+            args,
+            min_8bit_size,
+            percentile_clipping,
+            block_wise,
+        )
--- a/bitsandbytes/utils.py
+++ b/bitsandbytes/utils.py
+import shlex
+import subprocess
+from typing import Tuple
+def execute_and_return(command_string: str) -> Tuple[str, str]:
+    def _decode(subprocess_err_out_tuple):
+        return tuple(
+            to_decode.decode("UTF-8").strip()
+            for to_decode in subprocess_err_out_tuple
+        )
+    def execute_and_return_decoded_std_streams(command_string):
+        return _decode(
+            subprocess.Popen(
+                shlex.split(command_string),
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+            ).communicate()
+        )
+    std_out, std_err = execute_and_return_decoded_std_streams(command_string)
+    return std_out, std_err
--- a/compile_from_source.md
+++ b/compile_from_source.md
+# Compiling from source
+Basic steps.
+1. `make [target]` where `[target]` is among `cuda92, cuda10x, cuda110, cuda11x, cpuonly`
+2. `CUDA_VERSION=XXX python setup.py install`
+To run these steps you will need to have the nvcc compiler installed that comes with a CUDA installation. If you use anaconda (recommended) then you can figure out which version of CUDA you are using with PyTorch via the command `conda list | grep cudatoolkit`. Then you can install the nvcc compiler by downloading and installing the same CUDA version from the [CUDA toolkit archive](https://developer.nvidia.com/cuda-toolkit-archive). 
+For your convenience, there is an installation script in the root directory that installs CUDA 11.1 locally and configures it automatically. After installing you should add the `bin` sub-directory to the `$PATH` variable to make the compiler visible to your system. To do this you can add this to your `.bashrc` by executing these commands:
+```bash
+echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64/" >> ~/.bashrc
+echo "export PATH=$PATH:/usr/local/cuda/bin/" >> ~/.bashrc
+source ~/.bashrc
+```
+By default, the Makefile looks at your `CUDA_HOME` environment variable to find the CUDA version used for compiling the library. If this variable is not set, the CUDA location is inferred from the path of your `nvcc` compiler. 
+Either `nvcc` needs to be in your path or the `CUDA_HOME` variable needs to be set to the CUDA directory root (e.g. `/usr/local/cuda`) in order for compilation to succeed.
+If you have problems compiling the library with these instructions from source, please open an issue.
--- a/csrc/common.cpp
+++ b/csrc/common.cpp
+#include <common.h>
+#include <float.h>
+void *quantize_block(void *arguments) {
+    // Thread entry point: quantizes the elements [block_idx, block_end) of args->A.
+    // `arguments` must point to a quantize_block_args; the function always returns NULL.
+    // 1. find absmax in block
+    // 2. divide input value by absmax to normalize into [-1.0, 1.0]
+    // 3. do binary search to find the closest value
+    // 4. check minimal distance
+    // 5. store index
+    struct quantize_block_args *args = (quantize_block_args *) arguments;
+    // 1. find absmax in block
+    float absmax_block = -FLT_MAX;
+    for (long long i = args->block_idx; i < args->block_end; i++)
+        absmax_block = fmax(absmax_block, fabs(args->A[i]));
+    args->absmax[args->block_idx / args->blocksize] = absmax_block;
+    // An all-zero block has absmax == 0; dividing by it would produce NaN and
+    // feed an invalid value to the binary search. Such values are searched as 0.0.
+    const bool can_normalize = absmax_block > 0.0f;
+    for (long long i = args->block_idx; i < args->block_end; i++) {
+        // 2. divide input value by absmax to normalize into [-1.0, 1.0]
+        // 3. do binary search to find the closest value
+        float normed_value = can_normalize ? args->A[i] / absmax_block : 0.0f;
+        long long idx = args->bin_searcher->scalar(normed_value);
+        // 4. check minimal distance
+        // The binary search returns always the value to the left, which might not be the closest value
+        if (idx < 255) {
+            float dist_left = fabs(normed_value - (args->code[idx]));
+            float dist_right = fabs(normed_value - (args->code[idx + 1]));
+            if (dist_right < dist_left) { idx += 1; }
+        }
+        // 5. store index
+        args->out[i] = (unsigned char) idx;
+    }
+    return NULL;
+}
--- a/csrc/common.h
+++ b/csrc/common.h
+#include <BinSearch.h>
+#ifndef common
+#define common
+using namespace BinSearch;
+#define BLOCK_SIZE 16384
+// Argument bundle handed to quantize_block(); quantize_cpu() fills one per block.
+struct quantize_block_args {
+    // Search object queried through ->scalar() to find a code index.
+    BinAlgo<Scalar, float, Direct2> *bin_searcher;
+    // Lookup table; quantize_block reads code[idx] and code[idx + 1] for idx < 255.
+    float *code;
+    // Input values to quantize.
+    float *A;
+    // Output: one absolute maximum per block, written at block_idx / blocksize.
+    float *absmax;
+    // Output: one byte-sized code index per input element.
+    unsigned char *out;
+    // Element index one past the last element of this block.
+    long long block_end;
+    // Element index of the first element of this block (an offset, not a block number).
+    long long block_idx;
+    // Set to block_idx / blocksize by quantize_cpu; not read by quantize_block.
+    long long threadidx;
+    // Number of elements per full block.
+		long long blocksize;
+};
+void *quantize_block(void *arguments);
+#endif
--- a/csrc/cpu_ops.cpp
+++ b/csrc/cpu_ops.cpp
+#include <BinSearch.h>
+#include <pthread.h>
+#include <common.h>
+using namespace BinSearch;
+void dequantize_cpu(float *code, unsigned char *A, float *absmax, float *out, long long blocksize, long long n) {
+    // Block-wise dequantization: out[i] = code[A[i]] * absmax[block containing i].
+    // The last block may hold fewer than `blocksize` elements.
+    long long block = 0;
+    for (long long start = 0; start < n; start += blocksize, block++) {
+        const long long remaining = n - start;
+        const long long count = remaining < blocksize ? remaining : blocksize;
+        const long long stop = start + count;
+        for (long long pos = start; pos < stop; pos++)
+            out[pos] = code[A[pos]] * absmax[block];
+    }
+}
+void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long long blocksize, long long n)
+{
+    // Block-wise quantization of the n floats in A. For every block of `blocksize`
+    // elements, quantize_block() writes the block's absolute maximum to `absmax`
+    // and one code index per element to `out`. Blocks are processed by pthreads.
+    // NOTE: code[0] is overwritten, so the caller's code table is modified.
+
+    // the default code has range [-0.993, 1.0] which can cause an error in the binary search algorithm used below
+    code[0] = -1.0f;
+    // Nothing to quantize; this also avoids a division by zero for a non-positive blocksize.
+    if (n <= 0 || blocksize <= 0) { return; }
+    long long num_blocks = n / blocksize;
+    num_blocks += n % blocksize == 0 ? 0 : 1;
+    const uint32 elements_code = 256;
+    BinAlgo<Scalar, float, Direct2> bin_searcher(code, elements_code);
+    const long long thread_wave_size = 256;
+    // we chunk the threads into waves of 256 since the max limit is
+    // between 16k and 64k on Linux (we reach this when running BLOOM-176B with a large batch size)
+    for(long long offset = 0; offset < num_blocks; offset+=thread_wave_size)
+    {
+      long long valid_chunks = num_blocks - offset >= thread_wave_size ? thread_wave_size : num_blocks - offset;
+      pthread_t *threads = (pthread_t *) malloc(sizeof(pthread_t) * valid_chunks);
+      // One contiguous array of argument structs instead of one allocation per block.
+      quantize_block_args *args = (quantize_block_args *) malloc(sizeof(quantize_block_args) * valid_chunks);
+      const bool can_thread = threads != NULL && args != NULL;
+      // Number of threads that were really started; only those are joined.
+      long long launched = 0;
+      for(long long chunk = 0; chunk < valid_chunks; chunk++)
+      {
+          long long block_idx = (offset + chunk) * blocksize;
+          long long valid_items = n - block_idx >= blocksize ? blocksize : n - block_idx;
+          long long block_end = block_idx + valid_items;
+          // Stack fallback used only when the heap allocation above failed.
+          quantize_block_args local_arg;
+          quantize_block_args *arg = can_thread ? &args[chunk] : &local_arg;
+          arg->bin_searcher = &bin_searcher;
+          arg->code = code;
+          arg->A = A;
+          arg->absmax = absmax;
+          arg->out = out;
+          arg->block_end = block_end;
+          arg->block_idx = block_idx;
+          arg->threadidx = block_idx / blocksize;
+          arg->blocksize = blocksize;
+          // If no thread can be started (allocation or pthread_create failure),
+          // quantize the block on the calling thread instead of joining an
+          // uninitialized thread handle later.
+          if(can_thread && pthread_create(&threads[launched], NULL, &quantize_block, (void *) arg) == 0)
+              launched += 1;
+          else
+              quantize_block((void *) arg);
+      }
+      for (long long i = 0; i < launched; i++)
+          pthread_join(threads[i], NULL);
+      free(threads);
+      free(args);
+    }
+}
--- a/csrc/cpu_ops.h
+++ b/csrc/cpu_ops.h
+#ifndef BITSANDBYTES_CPU_OPS_H
+#define BITSANDBYTES_CPU_OPS_H
+#include <iostream>
+#include <stdio.h>
+// Block-wise quantization of the n floats in A: writes one absolute maximum per
+// block of `blocksize` elements to `absmax` and one code index per element to `out`.
+// The implementation overwrites code[0] with -1.0f.
+void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long long blocksize, long long n);
+// Block-wise dequantization of n code indices: out[i] = code[A[i]] * absmax[i / blocksize].
+void dequantize_cpu(float *code, unsigned char *A, float *absmax, float *out, long long blocksize, long long n);
+#endif
\ No newline at end of file
--- a/csrc/kernels.cu
+++ b/csrc/kernels.cu
--- a/csrc/kernels.cuh
+++ b/csrc/kernels.cuh
+// Copyright (c) Facebook, Inc. and its affiliates. 
+//   
+// This source code is licensed under the MIT license found in the 
+// LICENSE file in the root directory of this source tree.
+#include <hip/hip_runtime.h>
+#include <float.h>
+#include "ops.cuh"
+#ifndef kernels
+#define kernels
+template<typename T>__global__ void kEstimateQuantiles(T *__restrict__ const A, float *code, const float offset, const T max_val, const int n);
+__global__ void kQuantize(float * code, float * __restrict__ const A, unsigned char *out, const int n);
+__global__ void kDequantize(float *code, unsigned char *A, float *out, const int n);
+template<typename T, int BLOCK_SIZE, int NUM_PER_TH, int STOCHASTIC> __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n);
+template<typename T, int BLOCK_SIZE, int THREADS, int NUM_PER_TH> __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * absmax, T *out, const int n);
+template<typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS>
+__global__ void kPreconditionOptimizer32bit2State(T* g, T* p, 
+                float* state1, float* state2, float *unorm,
+                const float beta1, const float beta2, const float eps, const float weight_decay,
+                const int step, const float lr, const float gnorm_scale, const int n);
+template<typename T, int OPTIMIZER>
+__global__ void kOptimizer32bit2State(T* g, T* p, 
+                float* state1, float* state2, float *unorm, const float max_unorm, const float param_norm,
+                const float beta1, const float beta2, const float eps, const float weight_decay,
+                const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n);
+template<typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS>
+__global__ void kPreconditionOptimizer32bit1State(T* g, T* p, 
+                float* state1, float *unorm,
+                const float beta1, const float eps, const float weight_decay,
+                const int step, const float lr, const float gnorm_scale, const int n);
+template<typename T, int OPTIMIZER>
+__global__ void kOptimizer32bit1State(T* g, T* p, 
+                float* state1,  float *unorm, const float max_unorm, const float param_norm,
+                const float beta1, const float eps, const float weight_decay,
+                const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n);
+template<typename T, int OPTIMIZER>
+__global__ void
+kPreconditionOptimizerStatic8bit1State(T* p, T* __restrict__ const g, unsigned char*__restrict__  const state1, 
+                float *unorm,
+                const float beta1, 
+                const float eps, const int step, 
+                float* __restrict__ const quantiles1, 
+                float* max1, float* new_max1, 
+                const float weight_decay,
+                const float gnorm_scale, const int n);
+template<typename T, int OPTIMIZER>
+__global__ void
+kOptimizerStatic8bit1State(T* p, T* const g, unsigned char* state1, 
+                const float *unorm, const float max_unorm, const float param_norm,
+                const float beta1, 
+                const float eps, const int step, const float lr, 
+                float* __restrict__ const quantiles1, 
+                float* max1, float* new_max1, 
+                float weight_decay, const float gnorm_scale, const int n);
+template<typename T, int OPTIMIZER>
+__global__ void
+kPreconditionOptimizerStatic8bit2State(T* p, T* __restrict__ const g, unsigned char*__restrict__  const state1, unsigned char* __restrict__ const state2,
+                float *unorm,
+                const float beta1, const float beta2,
+                const float eps, const int step, 
+                float* __restrict__ const quantiles1, float* __restrict__ const quantiles2,
+                float* max1, float* max2, float* new_max1, float* new_max2,
+                const float gnorm_scale, const int n);
+template<typename T, int OPTIMIZER>
+__global__ void
+kOptimizerStatic8bit2State(T* p, T* const g, unsigned char* state1, unsigned char* state2,
+                const float *unorm, const float max_unorm, const float param_norm,
+                const float beta1, const float beta2,
+                const float eps, const int step, const float lr, 
+                float* __restrict__ const quantiles1, float* __restrict__ const quantiles2,
+                float* max1, float* max2, float* new_max1, float* new_max2,
+                float weight_decay, const float gnorm_scale, const int n);
+template<typename T, int OPTIMIZER, int BLOCK_SIZE, int N_PER_TH> __global__ void kOptimizerStatic8bit2StateBlockwise(
+		T* p, T* __restrict__ const g, unsigned char* state1, unsigned char* state2,
+                const float beta1, const float beta2, const float eps, const int step, const float lr,
+                float* __restrict__ const quantiles1, float* __restrict__ const quantiles2,
+                float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale, const bool skip_zeros, const int n);
+template<typename T, int OPTIMIZER, int BLOCK_SIZE, int N_PER_TH> __global__ void kOptimizerStatic8bit1StateBlockwise(
+		T* p, T* __restrict__ const g, unsigned char* state1,
+                const float beta1, const float beta2,
+                const float eps, const int step, const float lr,
+                float* __restrict__ const quantiles1,
+                float* absmax1,
+                float weight_decay,
+                const float gnorm_scale, const bool skip_zeros, const int n);
+template<typename T, int BLOCK_SIZE, int NUM_VALS> __global__ void kPercentileClipping(T * __restrict__ g, float *gnorm_vec, int step, const int n);
+__global__ void kHistogramScatterAdd2D(float* histogram, int *index1, int *index2, float *src, const int maxidx1, const int n);
+template <typename T, int SPMM_ITEMS, int BITS> __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, T *B, half *out,  float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
+template <int ITEMS_PER_THREAD, int SUBTILE_ROWS, int THREADS>__global__ void kdequant_mm_int32_fp16(
+  int *__restrict__ const A, float *__restrict__ const rowStats, float *__restrict__ const colStats,
+  half *out, float* newRowStats, float* newcolStats, half * __restrict__ const bias, const int numRows, const int numCols, const int tileCols, const int n);
+template<typename T, int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int SPARSE_DECOMP> __global__ void kgetColRowStats(T * __restrict__ A, float *rowStats, float *colStats, int * nnz_count_row, float nnz_threshold, int rows, int cols, int tiledRows, int tiledCols);
+template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int SPARSE_DECOMP> __global__ void kDoubleRowColQuant(half *__restrict__ const A, float *__restrict__ const rowStats, float * __restrict__ const colStats, char *out_col_normed, char *out_row_normed, int *rowidx, int *colidx, half *val, int * __restrict__ nnz_block_ptr, float threshold, int rows, int cols, int tiledCols);
+template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int TRANSPOSE, int FORMAT> __global__ void kTransformRowToFormat(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols);
+template <int FORMAT> __global__ void kExtractOutliers(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA);
+#endif
--- a/csrc/ops.cu
+++ b/csrc/ops.cu
--- a/csrc/ops.cuh
+++ b/csrc/ops.cuh
--- a/csrc/pythonInterface.c
+++ b/csrc/pythonInterface.c
--- a/cuda_install.sh
+++ b/cuda_install.sh
+# Downloads a CUDA toolkit installer, installs the toolkit below a base path and
+# appends the matching LD_LIBRARY_PATH / PATH exports to ~/.bashrc.
+#
+# Usage: bash cuda_install.sh CUDA_VERSION BASE_PATH
+#   CUDA_VERSION  version without the dot, e.g. 117 for CUDA 11.7
+#   BASE_PATH     directory below which the cuda-X.Y folder is created
+URL92=https://developer.nvidia.com/compute/cuda/9.2/Prod2/local_installers/cuda_9.2.148_396.37_linux
+URL100=https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/cuda_10.0.130_410.48_linux
+URL101=https://developer.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda_10.1.105_418.39_linux.run
+URL102=https://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/cuda_10.2.89_440.33.01_linux.run
+URL110=https://developer.download.nvidia.com/compute/cuda/11.0.3/local_installers/cuda_11.0.3_450.51.06_linux.run
+URL111=https://developer.download.nvidia.com/compute/cuda/11.1.1/local_installers/cuda_11.1.1_455.32.00_linux.run
+URL112=https://developer.download.nvidia.com/compute/cuda/11.2.2/local_installers/cuda_11.2.2_460.32.03_linux.run
+URL113=https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.19.01_linux.run
+URL114=https://developer.download.nvidia.com/compute/cuda/11.4.4/local_installers/cuda_11.4.4_470.82.01_linux.run
+URL115=https://developer.download.nvidia.com/compute/cuda/11.5.2/local_installers/cuda_11.5.2_495.29.05_linux.run
+URL116=https://developer.download.nvidia.com/compute/cuda/11.6.2/local_installers/cuda_11.6.2_510.47.03_linux.run
+URL117=https://developer.download.nvidia.com/compute/cuda/11.7.0/local_installers/cuda_11.7.0_515.43.04_linux.run
+URL118=https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run
+CUDA_VERSION=$1
+BASE_PATH=$2
+SUPPORTED_VERSIONS="92, 100, 101, 102, 110, 111, 112, 113, 114, 115, 116, 117, 118"
+# Stop instead of continuing with an empty URL. `return` covers the case where
+# the script is sourced, `exit` the case where it is executed.
+if [[ -z "$CUDA_VERSION" ]]; then
+  echo "argument error: No cuda version passed as input. Choose among: {$SUPPORTED_VERSIONS}"
+  return 1 2>/dev/null || exit 1
+fi
+case "$CUDA_VERSION" in
+  92)  URL=$URL92;  FOLDER=cuda-9.2 ;;
+  100) URL=$URL100; FOLDER=cuda-10.0 ;;
+  101) URL=$URL101; FOLDER=cuda-10.1 ;;
+  102) URL=$URL102; FOLDER=cuda-10.2 ;;
+  110) URL=$URL110; FOLDER=cuda-11.0 ;;
+  111) URL=$URL111; FOLDER=cuda-11.1 ;;
+  112) URL=$URL112; FOLDER=cuda-11.2 ;;
+  113) URL=$URL113; FOLDER=cuda-11.3 ;;
+  114) URL=$URL114; FOLDER=cuda-11.4 ;;
+  115) URL=$URL115; FOLDER=cuda-11.5 ;;
+  116) URL=$URL116; FOLDER=cuda-11.6 ;;
+  117) URL=$URL117; FOLDER=cuda-11.7 ;;
+  118) URL=$URL118; FOLDER=cuda-11.8 ;;
+  *)
+    echo "argument error: Unsupported cuda version \"$CUDA_VERSION\". Choose among: {$SUPPORTED_VERSIONS}"
+    return 1 2>/dev/null || exit 1
+    ;;
+esac
+FILE=$(basename "$URL")
+echo "$URL"
+echo "$FILE"
+if ! wget "$URL"; then
+  echo "download error: could not fetch $URL"
+  return 1 2>/dev/null || exit 1
+fi
+# Only touch ~/.bashrc when the installer reported success.
+if ! bash "$FILE" --no-drm --no-man-page --override --toolkitpath="$BASE_PATH/$FOLDER/" --toolkit --silent; then
+  echo "install error: the CUDA installer $FILE failed"
+  return 1 2>/dev/null || exit 1
+fi
+# The current values of LD_LIBRARY_PATH and PATH are expanded now and written
+# into ~/.bashrc together with the new CUDA directories.
+echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$BASE_PATH/$FOLDER/lib64/" >> ~/.bashrc
+echo "export PATH=$PATH:$BASE_PATH/$FOLDER/bin/" >> ~/.bashrc
+source ~/.bashrc
--- a/deploy.sh
+++ b/deploy.sh
--- a/environment.yml
+++ b/environment.yml
--- a/errors_and_solutions.md
+++ b/errors_and_solutions.md