Unverified commit 4955d136, authored by Matthew Douglas, committed by GitHub

Apply clang-format rules (#1678)

parent 61db0859
@@ -26,10 +26,12 @@ void quantize_block(const quantize_block_args& args) {
        if (idx < 255) {
            float dist_left = fabs(normed_value - (args.code[idx]));
            float dist_right = fabs(normed_value - (args.code[idx + 1]));
            if (dist_right < dist_left) {
                idx += 1;
            }
        }

        // 5. store index
        args.out[i] = (unsigned char)idx;
    }
}
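The branch reformatted above is the nearest-neighbour rounding step of the 8-bit quantizer: a binary search over the sorted 256-entry code table yields a candidate index, and the two distance checks pick whichever of the two surrounding code values is closer. Below is a self-contained sketch of just that step; the helper name nearest_code_index is hypothetical and not part of this commit.

#include <algorithm>
#include <cmath>

// Hypothetical helper, for illustration only: given a sorted 256-entry code
// table and a value normalized into the code's range, return the index of the
// closest entry. Mirrors the dist_left/dist_right comparison in the hunk above.
static unsigned char nearest_code_index(const float* code, float normed_value) {
    const float* end = code + 256;
    // lower_bound finds the first entry >= normed_value; stepping back one
    // gives the left neighbour, as the binary search in the library does.
    long idx = std::lower_bound(code, end, normed_value) - code;
    if (idx > 0)
        idx -= 1;
    if (idx < 255) {
        float dist_left = std::fabs(normed_value - code[idx]);
        float dist_right = std::fabs(normed_value - code[idx + 1]);
        if (dist_right < dist_left)
            idx += 1;
    }
    return (unsigned char)idx;
}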
@@ -28,7 +28,8 @@
// The maximum number of resident threads per SM varies by arch.
// For A100/H100 and all prior to Turing, it is 2048, which allows
// for 2 full blocks of 1024 threads per SM.
// Reference:
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications-technical-specifications-per-compute-capability
#if __CUDA_ARCH__ == 750
#define BNB_MAX_THREADS_PER_SM 1024
#elif __CUDA_ARCH__ >= 860 && __CUDA_ARCH__ <= 890
...
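For context on why the resident-thread cap is a macro: a per-architecture limit like BNB_MAX_THREADS_PER_SM is commonly consumed as the minimum-resident-blocks argument of __launch_bounds__, so the compiler limits register usage enough to keep that many threads on an SM. The sketch below is illustrative only; the kernel and EXAMPLE_BLOCK_DIM are placeholders, not code from this commit.

// Illustrative only, not part of this diff. The fallback covers compilation
// passes where __CUDA_ARCH__ (and hence the macro above) is not defined.
#ifndef BNB_MAX_THREADS_PER_SM
#define BNB_MAX_THREADS_PER_SM 2048
#endif

#define EXAMPLE_BLOCK_DIM 1024

// Ask the compiler to keep enough blocks resident to reach the per-arch cap.
__global__ void __launch_bounds__(EXAMPLE_BLOCK_DIM, BNB_MAX_THREADS_PER_SM / EXAMPLE_BLOCK_DIM)
    example_scale_kernel(float* data, float scale, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] *= scale;
}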
@@ -5,21 +5,18 @@
using namespace BinSearch;

struct quantize_block_args {
    BinAlgo<Scalar, float, Direct2>* bin_searcher;
    float* code;
    float* A;
    float* absmax;
    unsigned char* out;
    long long block_end;
    long long block_idx;
    long long threadidx;
    long long blocksize;
};

void quantize_block(const quantize_block_args& args);

#endif
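quantize_block_args is the per-block work descriptor consumed by quantize_block. A hedged sketch of populating it for one block follows; the helper and the threadidx convention are assumptions for illustration, not taken from this commit (the real setup is in quantize_cpu, shown further down).

// Illustrative only: builds the argument struct for one block, assuming the
// caller already owns a BinAlgo searcher over `code` and tracks per-block
// absmax values indexed by block number.
static quantize_block_args make_block_args(
    BinAlgo<Scalar, float, Direct2>* bin_searcher, float* code, float* A, float* absmax, unsigned char* out,
    long long block_idx, long long block_end, long long blocksize
) {
    quantize_block_args args;
    args.bin_searcher = bin_searcher;
    args.code = code;
    args.A = A;
    args.absmax = absmax;
    args.out = out;
    args.block_idx = block_idx;
    args.block_end = block_end;
    args.threadidx = block_idx / blocksize; // assumed convention: one slot per block
    args.blocksize = blocksize;
    return args;
}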
@@ -4,7 +4,7 @@
using namespace BinSearch;

void dequantize_cpu(float* code, unsigned char* A, float* absmax, float* out, long long blocksize, long long n) {
    for (long long block_idx = 0; block_idx < n; block_idx += blocksize) {
        long long valid_items = n - block_idx >= blocksize ? blocksize : n - block_idx;
        long long block_end = block_idx + valid_items;
@@ -13,8 +13,7 @@ void dequantize_cpu(...)
    }
}

void quantize_cpu(float* code, float* A, float* absmax, unsigned char* out, long long blocksize, long long n) {
    // the default code has range [-0.993, 1.0], which can cause an error in the binary search algorithm used below
    code[0] = -1.0f;
@@ -28,15 +27,13 @@ void quantize_cpu(...)
    int thread_wave_size = 256;
    // we chunk the threads into waves of 256 since the max limit is
    // between 16k and 64k on Linux (we reach this when running BLOOM-176B with a large batch size)
    for (long long offset = 0; offset < num_blocks; offset += thread_wave_size) {
        long long valid_chunks = num_blocks - offset >= thread_wave_size ? thread_wave_size : num_blocks - offset;
        std::vector<std::thread> threads(valid_chunks);
        std::vector<quantize_block_args> args(valid_chunks);

        int chunks_processed = 0;
        for (long long block_idx = offset * blocksize; block_idx < n; block_idx += blocksize) {
            long long valid_items = n - block_idx >= blocksize ? blocksize : n - block_idx;
            long long block_end = block_idx + valid_items;
@@ -53,11 +50,12 @@ void quantize_cpu(...)
            threads[chunks_processed] = std::thread([arg] { quantize_block(arg); });
            chunks_processed += 1;
            if (chunks_processed == valid_chunks) {
                break;
            }
        }

        for (int i = 0; i < valid_chunks; i++)
            threads[i].join();
    }
}
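The wave pattern described in the comment above, launching at most thread_wave_size std::threads at a time and joining each wave before starting the next so the OS thread limit is never exceeded, generalizes beyond quantization. A minimal self-contained sketch of the same idea (names are illustrative, not from this commit):

#include <algorithm>
#include <thread>
#include <vector>

// Illustrative only: run `total_jobs` pieces of work with at most `wave_size`
// native threads alive at a time, joining each wave before starting the next.
template <typename Work>
void run_in_waves(long long total_jobs, long long wave_size, Work work) {
    for (long long offset = 0; offset < total_jobs; offset += wave_size) {
        long long batch = std::min(wave_size, total_jobs - offset);
        std::vector<std::thread> threads;
        threads.reserve(batch);
        for (long long j = 0; j < batch; j++)
            threads.emplace_back([&work, offset, j] { work(offset + j); });
        for (auto& t : threads)
            t.join();
    }
}

quantize_cpu above applies the same idea with a wave size of 256 and one job per quantization block.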
@@ -4,7 +4,7 @@
#include <iostream>
#include <stdio.h>

void quantize_cpu(float* code, float* A, float* absmax, unsigned char* out, long long blocksize, long long n);
void dequantize_cpu(float* code, unsigned char* A, float* absmax, float* out, long long blocksize, long long n);

#endif
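Given the two declarations above, a round-trip call might look like the hedged sketch below; the block size, buffer sizes, and the assumption that `code` points at a 256-entry quantization map are illustrative, not taken from this commit.

#include <vector>

// Illustrative only: 8-bit blockwise quantize followed by dequantize, with one
// absmax scale per block of `blocksize` input values.
void roundtrip_example(float* code /* 256-entry map */, const std::vector<float>& input) {
    const long long n = (long long)input.size();
    const long long blocksize = 4096;
    const long long num_blocks = (n + blocksize - 1) / blocksize;

    std::vector<float> A(input);
    std::vector<unsigned char> q(n);
    std::vector<float> absmax(num_blocks);
    std::vector<float> out(n);

    quantize_cpu(code, A.data(), absmax.data(), q.data(), blocksize, n);
    dequantize_cpu(code, q.data(), absmax.data(), out.data(), blocksize, n);
}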
@@ -9,116 +9,129 @@
#ifndef kernels
#define kernels

__global__ void kQuantize(float* code, float* __restrict__ const A, unsigned char* out, const int n);
__global__ void kDequantize(float* code, unsigned char* A, float* out, const int n);

template <typename T, int BLOCK_SIZE, int NUM_PER_TH, int STOCHASTIC, int DATA_TYPE>
__global__ void kQuantizeBlockwise(
    float* code, T* __restrict__ const A, float* absmax, unsigned char* out, float* __restrict__ const rand,
    const int rand_offset, const int n
);
template <typename T, int BLOCK_SIZE, int THREADS, int NUM_PER_TH, int DATA_TYPE>
__global__ void
kDequantizeBlockwise(float* code, unsigned char* A, float* absmax, T* out, const int blocksize, const int n);

template <typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS>
__global__ void kPreconditionOptimizer32bit2State(
    T* g, T* p, float* state1, float* state2, float* unorm, const float beta1, const float beta2, const float eps,
    const float weight_decay, const int step, const float lr, const float gnorm_scale, const int n
);

template <typename T, int OPTIMIZER>
__global__ void kOptimizer32bit2State(
    T* g, T* p, float* state1, float* state2, float* unorm, const float max_unorm, const float param_norm,
    const float beta1, const float beta2, const float beta3, const float alpha, const float eps,
    const float weight_decay, const int step, const float lr, const float gnorm_scale, const bool skip_zeros,
    const int n
);

template <typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS>
__global__ void kPreconditionOptimizer32bit1State(
    T* g, T* p, float* state1, float* unorm, const float beta1, const float beta2, const float eps,
    const float weight_decay, const int step, const float lr, const float gnorm_scale, const int n
);

template <typename T, int OPTIMIZER>
__global__ void kOptimizer32bit1State(
    T* g, T* p, float* state1, float* unorm, const float max_unorm, const float param_norm, const float beta1,
    const float beta2, const float eps, const float weight_decay, const int step, const float lr,
    const float gnorm_scale, const bool skip_zeros, const int n
);

template <typename T, int OPTIMIZER>
__global__ void kPreconditionOptimizerStatic8bit1State(
    T* p, T* __restrict__ const g, unsigned char* __restrict__ const state1, float* unorm, const float beta1,
    const float beta2, const float eps, const int step, float* __restrict__ const quantiles1, float* max1,
    float* new_max1, const float weight_decay, const float gnorm_scale, const int n
);

template <typename T, int OPTIMIZER>
__global__ void kOptimizerStatic8bit1State(
    T* p, T* const g, unsigned char* state1, const float* unorm, const float max_unorm, const float param_norm,
    const float beta1, const float beta2, const float eps, const int step, const float lr,
    float* __restrict__ const quantiles1, float* max1, float* new_max1, float weight_decay, const float gnorm_scale,
    const int n
);

template <typename T, int OPTIMIZER>
__global__ void kPreconditionOptimizerStatic8bit2State(
    T* p, T* __restrict__ const g, unsigned char* __restrict__ const state1, unsigned char* __restrict__ const state2,
    float* unorm, const float beta1, const float beta2, const float eps, const int step,
    float* __restrict__ const quantiles1, float* __restrict__ const quantiles2, float* max1, float* max2,
    float* new_max1, float* new_max2, const float gnorm_scale, const int n
);

template <typename T, int OPTIMIZER>
__global__ void kOptimizerStatic8bit2State(
    T* p, T* const g, unsigned char* state1, unsigned char* state2, const float* unorm, const float max_unorm,
    const float param_norm, const float beta1, const float beta2, const float eps, const int step, const float lr,
    float* __restrict__ const quantiles1, float* __restrict__ const quantiles2, float* max1, float* max2,
    float* new_max1, float* new_max2, float weight_decay, const float gnorm_scale, const int n
);

template <typename T, int OPTIMIZER, int BLOCK_SIZE, int N_PER_TH>
__global__ void kOptimizerStatic8bit2StateBlockwise(
    T* p, T* __restrict__ const g, unsigned char* state1, unsigned char* state2, const float beta1, const float beta2,
    const float beta3, const float alpha, const float eps, const int step, const float lr,
    float* __restrict__ const quantiles1, float* __restrict__ const quantiles2, float* absmax1, float* absmax2,
    float weight_decay, const float gnorm_scale, const bool skip_zeros, const int n
);

template <typename T, int OPTIMIZER, int BLOCK_SIZE, int N_PER_TH>
__global__ void kOptimizerStatic8bit1StateBlockwise(
    T* p, T* __restrict__ const g, unsigned char* state1, const float beta1, const float beta2, const float eps,
    const int step, const float lr, float* __restrict__ const quantiles1, float* absmax1, float weight_decay,
    const float gnorm_scale, const bool skip_zeros, const int n
);

template <typename T, int BLOCK_SIZE, int NUM_VALS>
__global__ void kPercentileClipping(T* __restrict__ g, float* gnorm_vec, int step, const int n);

template <typename T, int SPMM_ITEMS, int BITS>
__global__ void kspmm_coo_very_sparse_naive(
    int* max_count, int* max_idx, int* offset_rowidx, int* rowidx, int* colidx, half* values, T* B, half* out,
    float* __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB
);

template <int ITEMS_PER_THREAD, int THREADS>
__global__ void kdequant_mm_int32_fp16(
    int* __restrict__ const A, float* __restrict__ const rowStats, float* __restrict__ const colStats, half* out,
    half* __restrict__ const bias, const int numRows, const int numCols, const int n
);

template <typename T, int THREADS, int SPARSE_DECOMP>
__global__ void kgetRowStats(T* __restrict__ A, float* rowStats, float threshold, int rows, int cols);
template <typename T, int THREADS, int SPARSE_DECOMP>
__global__ void kInt8VectorQuant(T* __restrict__ A, int8_t* out, float* rowStats, float threshold, int rows, int cols);

template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int TRANSPOSE, int FORMAT>
__global__ void kTransformRowToFormat(
    char* __restrict__ const A, char* out, int rows, int cols, int tiledCols, int outRows, int outCols
);

template <typename T, int BITS, int THREADS>
__global__ void gemm_device(int M, int N, int K, T* __restrict__ const A, T* B, T* out, int lda, int ldb, int ldc);
template <typename T, int THREADS>
__global__ void kgemm_4bit_inference(
    int M, int N, int K, T* __restrict__ const A, unsigned char* B, float* absmax, T* out, int lda, int ldb, int ldc,
    int blocksize
);
template <typename T, int THREADS, int BITS>
__global__ void kgemm_4bit_inference_naive(
    int M, int N, int K, T* __restrict__ const A, unsigned char* B, float* absmax, const float* datatype, T* out,
    int lda, int ldb, int ldc, int blocksize
);

template <typename T, int FUNC> __global__ void kfunc(T* A, T* B, T value, long n);

#endif
@@ -5,37 +5,34 @@
#define NUM 4
#define NUM_BLOCK 4096

static inline MPSGraph* get_graph() {
    static MPSGraph* cur = nil;
    if (!cur) {
        cur = [[MPSGraph alloc] init];
    }
    return cur;
}

static inline id<MTLDevice> get_device() {
    NSError* error = nil;
    static id<MTLDevice> device = nil;
    if (!device) {
        device = MTLCreateSystemDefaultDevice();
    }
    if (!device) {
        NSLog(@"Failed to get MPS device");
        abort();
    }
    return device;
}

static inline id<MTLLibrary> get_library() {
    NSError* error = nil;
    static id<MTLLibrary> library = nil;
    if (!library) {
        library = [get_device() newLibraryWithURL:[NSURL fileURLWithPath:@"bitsandbytes.metallib"] error:&error];
    }
    if (!library) {
        NSLog(@"Failed to load bitsandbytes.metallib");
        abort();
    }
@@ -44,20 +41,18 @@ static inline id<MTLLibrary> get_library()
/*MPSGraphTensor* dequantize_mps(MPSGraphTensor* code, MPSGraphTensor* A, int n)
{
  id out = [get_graph() dequantizeTensor:(MPSGraphTensor*)A scaleTensor:(MPSGraphTensor*)code zeroPoint:0.0
  dataType:MPSDataTypeInt8 axis:0 name:@"out"]; return out;
}*/

// MPSGraph function for quantize
extern "C" MPSGraphTensor* quantize_mps(MPSGraph* graph, MPSGraphTensor* code, MPSGraphTensor* A, int n) {
    id<MTLDevice> device = get_device();
    id<MTLLibrary> library = get_library();

    static id<MTLFunction> kernel = nil;
    if (!kernel) {
        kernel = [library newFunctionWithName:@"quantize"];
        if (!kernel) {
            NSLog(@"Failed to load bitsandbytes.metallib");
            abort();
        }
    }
...
@@ -3,41 +3,41 @@
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.

#ifndef ops_H
#define ops_H

#include <assert.h>
#include <cstdint>
#include <iostream>
#include <stdio.h>

#include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <cusparse.h>

#include <functional>
#include <vector>

#define CUDA_CHECK_RETURN(value)                                                                                  \
    {                                                                                                             \
        cudaError_t _m_cudaStat = value;                                                                          \
        if (_m_cudaStat != cudaSuccess) {                                                                         \
            fprintf(stderr, "Error %s at line %d in file %s\n", cudaGetErrorString(_m_cudaStat), __LINE__, __FILE__); \
            exit(1);                                                                                              \
        }                                                                                                         \
    }

#define CHECK_CUSPARSE(value)                                                                                     \
    {                                                                                                             \
        cusparseStatus_t _m_cudaStat = value;                                                                     \
        if (_m_cudaStat != CUSPARSE_STATUS_SUCCESS) {                                                             \
            fprintf(                                                                                              \
                stderr, "Error %s at line %d in file %s\n", cusparseGetErrorString(_m_cudaStat), __LINE__, __FILE__ \
            );                                                                                                    \
            exit(1);                                                                                              \
        }                                                                                                         \
    }

inline void checkCudaStatus(cudaError_t status) {
    if (status != cudaSuccess) {
@@ -49,19 +49,17 @@ inline void checkCudaStatus(cudaError_t status) {
inline int checkCublasStatus(cublasStatus_t status) {
    if (status != CUBLAS_STATUS_SUCCESS) {
        printf("cuBLAS API failed with status %d\n", status);
        // throw std::logic_error("cuBLAS API failed");
        return 1;
    }
    return 0;
}

typedef enum Operations_t {
    ksmul = 0,
} Operations_t;

typedef enum Optimizer_t {
    ADAM = 0,
    MOMENTUM = 1,
    RMSPROP = 2,
@@ -71,8 +69,7 @@ typedef enum Optimizer_t
    ADEMAMIX = 6
} Optimizer_t;

typedef enum Transform_t {
    ROW = 0,
    COL = 1,
    COL32 = 2,
@@ -80,109 +77,135 @@ typedef enum Transform_t
    COL_AMPERE = 4,
} Transform_t;

typedef enum DataType_t {
    General8bit = 0,
    FP4 = 1,
    NF4 = 2,
} DataType_t;

typedef enum Funcs_t {
    FILL = 0,
    ARANGE = 1,
    _MUL = 2,
} Funcs_t;

class Context {
  public:
    cublasHandle_t m_handle;

    Context() {
        cublasHandle_t handle;
        cublasCreate_v2(&handle);
        m_handle = handle;
    }
};

class ContextLt {
  public:
    cublasLtHandle_t m_handle;

    ContextLt() {
        cublasLtHandle_t handle;
        cublasLtCreate(&handle);
        m_handle = handle;
    }
};

class ContextCusparse {
  public:
    cusparseHandle_t m_handle;

    ContextCusparse() {
        cusparseHandle_t handle;
        cusparseCreate(&handle);
        m_handle = handle;
    }
};

void quantize(float* code, float* A, unsigned char* out, int n);
void dequantize(float* code, unsigned char* A, float* out, int n, cudaStream_t stream);
template <typename T, int STOCHASTIC, int DATA_TYPE>
void quantizeBlockwise(
    float* code, T* A, float* absmax, unsigned char* out, float* rand, int rand_offset, int blocksize, const int n
);
template <typename T, int DATA_TYPE>
void dequantizeBlockwise(
    float* code, unsigned char* A, float* absmax, T* out, int block_size, const int n, cudaStream_t stream
);

template <typename T, int OPTIMIZER>
void optimizer32bit(
    T* g, T* p, float* state1, float* state2, float* unorm, float max_unorm, float param_norm, float beta1, float beta2,
    float beta3, float alpha, float eps, float weight_decay, int step, float lr, const float gnorm_scale,
    bool skip_zeros, int n
);

template <typename T, int OPTIMIZER>
void optimizerStatic8bit(
    T* p, T* g, unsigned char* state1, unsigned char* state2, float* unorm, float max_unorm, float param_norm,
    float beta1, float beta2, float eps, int step, float lr, float* quantiles1, float* quantiles2, float* max1,
    float* max2, float* new_max1, float* new_max2, float weight_decay, const float gnorm_scale, int n
);

template <typename T, int OPTIMIZER>
void optimizerStatic8bitBlockwise(
    T* p, T* g, unsigned char* state1, unsigned char* state2, float beta1, float beta2, float beta3, float alpha,
    float eps, int step, float lr, float* quantiles1, float* quantiles2, float* absmax1, float* absmax2,
    float weight_decay, const float gnorm_scale, bool skip_zeros, int n
);

template <typename T> void percentileClipping(T* g, float* gnorm_vec, int step, const int n);

void gemmex(
    Context* context, bool transposeA, bool transposeB, int m, int n, int k, void* A, void* B, void* C, int lda,
    int ldb, int ldc
);
void strided_gemmex(
    Context* context, bool transposeA, bool transposeB, int m, int n, int k, void* A, void* B, void* C, int lda,
    int ldb, int ldc, long long int strideA, long long int strideB, long long int strideC, int batchCount
);

template <int DTYPE_OUT, int SCALE_ROWS>
int igemmlt(
    cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t* A, const int8_t* B, void* C, float* row_scale,
    int lda, int ldb, int ldc, cudaStream_t stream
);
void cutlass_igemm(
    bool transposeA, bool transposeB, int m, int n, int k, void* A, void* B, void* C, int lda, int ldb, int ldc
);
void dequant_mm_int32_fp16(
    int* A, float* rowStats, float* colStats, half* out, half* bias, int numRows, int numCols, cudaStream_t stream
);
void getRowStats(half* A, float* rowStats, float threshold, int rows, int cols, cudaStream_t stream);
void int8VectorQuant(
    half* __restrict__ A, int8_t* out, float* rowStats, float threshold, int rows, int cols, cudaStream_t stream
);

void spmm_coo(
    cusparseHandle_t handle, int* A_rowidx, int* A_colidx, half* A_vals, int A_nnz, int A_rows, int A_cols, int B_cols,
    int ldb, half* B, int ldc, half* C, bool transposed_B
);
template <typename T, int BITS>
void spmm_coo_very_sparse_naive(
    int* max_count, int* max_idx, int* offset_rowidx, int* rowidx, int* colidx, half* values, T* B, half* out,
    float* dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB
);

void matmul4bite(half* A, unsigned char* B, half* out, int lda, int ldb, int rowsA, int colsA, int colsB);

template <typename T> void gemm_host(int m, int n, int k, T* A, T* B, T* out, int lda, int ldb, int ldc, int bits);
template <typename T>
void gemm_4bit_inference(
    int m, int n, int k, T* A, unsigned char* B, float* absmax, T* out, int lda, int ldb, int ldc, int blocksize
);
template <typename T, int BITS>
void gemm_4bit_inference_naive(
    int m, int n, int k, T* A, unsigned char* B, float* absmax, float* datatype, T* out, int lda, int ldb, int ldc,
    int blocksize, cudaStream_t stream
);

template <typename T, int FUNC> void func(T* A, T* B, T value, long n);

#endif
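As a usage note for the error-checking macros reformatted near the top of this header, here is a hedged sketch of wrapping CUDA runtime calls with CUDA_CHECK_RETURN; the buffer and its size are arbitrary placeholders, not code from this commit.

#include <cuda_runtime_api.h>

// Illustrative only: each runtime call is wrapped so a failure prints the CUDA
// error string with file and line, then exits, per the macro body above.
void check_macro_example() {
    float* d_buf = nullptr;
    CUDA_CHECK_RETURN(cudaMalloc((void**)&d_buf, 1024 * sizeof(float)));
    CUDA_CHECK_RETURN(cudaMemset(d_buf, 0, 1024 * sizeof(float)));
    CUDA_CHECK_RETURN(cudaFree(d_buf));
}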