Unverified Commit 6974920b authored by Aarni Koskela, committed by GitHub

Enable line-ending and other hygiene lints (#1006)

parent 3a630c58
@@ -110,7 +110,7 @@ __device__ float dDequantizeFP4Tree(unsigned char val, float absmax)
          return 1.00000000f*absmax*sign; // 1011
        else
          return 0.66666667f*absmax*sign; // 1010
      else
        if((val & 0b0001) == 1) // 100
          return 5.208333333e-03f*absmax*sign; // 1001
        else
@@ -174,36 +174,36 @@ __device__ half dhDequantizeNF4(unsigned char val)
    if((val & 0b0100) == 4) // 1
      if((val & 0b0010) == 2) // 11
        if((val & 0b0001) == 1) // 111
          return 1.0f;
        else
          return 0.7229568362236023f;
      else
        if((val & 0b0001) == 1) // 110
          return 0.5626170039176941f;
        else
          return 0.44070982933044434f;
    else
      if((val & 0b0010) == 2) //10
        if((val & 0b0001) == 1) // 101
          return 0.33791524171829224f;
        else
          return 0.24611230194568634f;
      else
        if((val & 0b0001) == 1) // 100
          return 0.16093020141124725f;
        else
          return 0.07958029955625534f;
  else
    if((val & 0b0100) == 4) // 0
      if((val & 0b0010) == 2) //01
        if((val & 0b0001) == 1) // 011
          return 0.0f;
        else
          return -0.09105003625154495f;
      else
        if((val & 0b0001) == 1) // 010
          return -0.18477343022823334f;
        else
          return -0.28444138169288635f;
    else
@@ -211,12 +211,12 @@ __device__ half dhDequantizeNF4(unsigned char val)
        if((val & 0b0001) == 1) // 001
          return -0.39491748809814453f;
        else
          return -0.5250730514526367f;
      else
        if((val & 0b0001) == 1) // 000
          return -0.6961928009986877f;
        else
          return -1.0f;
}
@@ -229,36 +229,36 @@ __device__ float dDequantizeNF4(unsigned char val)
    if((val & 0b0100) == 4) // 1
      if((val & 0b0010) == 2) // 11
        if((val & 0b0001) == 1) // 111
          return 1.0f;
        else
          return 0.7229568362236023f;
      else
        if((val & 0b0001) == 1) // 110
          return 0.5626170039176941f;
        else
          return 0.44070982933044434f;
    else
      if((val & 0b0010) == 2) //10
        if((val & 0b0001) == 1) // 101
          return 0.33791524171829224f;
        else
          return 0.24611230194568634f;
      else
        if((val & 0b0001) == 1) // 100
          return 0.16093020141124725f;
        else
          return 0.07958029955625534f;
  else
    if((val & 0b0100) == 4) // 0
      if((val & 0b0010) == 2) //01
        if((val & 0b0001) == 1) // 011
          return 0.0f;
        else
          return -0.09105003625154495f;
      else
        if((val & 0b0001) == 1) // 010
          return -0.18477343022823334f;
        else
          return -0.28444138169288635f;
    else
@@ -266,12 +266,12 @@ __device__ float dDequantizeNF4(unsigned char val)
        if((val & 0b0001) == 1) // 001
          return -0.39491748809814453f;
        else
          return -0.5250730514526367f;
      else
        if((val & 0b0001) == 1) // 000
          return -0.6961928009986877f;
        else
          return -1.0f;
}
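The two branch trees above (half and float variants) encode the 16-entry NF4 codebook by testing one bit per nesting level. For reference, here is the same code-to-value mapping written as a flat lookup table; this is an illustration only, the values are copied from the return statements above, and the helper name `dequantize_nf4_table` is hypothetical rather than part of the library:

```cpp
// Illustration: the NF4 code -> value mapping from the branch trees above,
// expressed as a 16-entry table indexed by the 4-bit code. bitsandbytes itself
// uses the nested bit tests shown above; this helper name is hypothetical.
inline float dequantize_nf4_table(unsigned char val)
{
    static const float nf4[16] = {
        -1.0f,                 -0.6961928009986877f,  -0.5250730514526367f,  -0.39491748809814453f,
        -0.28444138169288635f, -0.18477343022823334f, -0.09105003625154495f,  0.0f,
         0.07958029955625534f,  0.16093020141124725f,  0.24611230194568634f,  0.33791524171829224f,
         0.44070982933044434f,  0.5626170039176941f,   0.7229568362236023f,   1.0f,
    };
    return nf4[val & 0x0F];   // the low four bits select the codebook entry
}
```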
@@ -1863,7 +1863,7 @@ kOptimizerStatic8bit2StateBlockwise(T* p, T* __restrict__ const g, unsigned char
        //float ratio = (g_val*g_val)/fmaxf(s2_vals[j], eps*eps);
        //g_val = ratio > 2.0f ? 2.0f*g_val/ratio : g_val;
        g_val *= gnorm_scale;

        s2_vals[j] = (s2_vals[j]*beta2) + (((1.0f-beta2)*g_val*g_val));

        s1_vals[j] = smem_quantiles1[lane_id][c1s[j]]*absmax1[i/BLOCK_SIZE];
@@ -3069,7 +3069,7 @@ template <int FORMAT> __global__ void kExtractOutliers(char *A, int *idx, char *
//// use k warps per thread block
//// 1. threadblock use read-only cache to read in register tile for A into shared memory
//// 2. each warp loops over shared memory tiles of A of size 8x16 and loads them into fragments
//// 3. each warp reads a segment of values 16x32 from B
//// 4. do dequantization from register of B into second pair of registers
//// 5. store (4) into fragment
//// 6. matmul aggregate into fragment C
@@ -3531,7 +3531,7 @@ template <typename T, int THREADS> __global__ void kgemm_4bit_inference(int M, i
template <typename T, int THREADS, int BITS> __global__ void kgemm_4bit_inference_naive(int M, int N, int K, T * __restrict__ const A, unsigned char *B, float *absmax, const float *datatype, T * out, int lda, int ldb, int ldc, int blocksize)
{
  // per threadblock:
  // load step-by-step in chunks of [32,warps]: 1x32 * [32,warps] -> [1,warps]
  // 4 warps -> 4 loads per iter
  // 1x32 * 32x4 -> 1x4 outputs per thread block
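The comments above sketch the naive 4-bit GEMV layout: each warp in the block owns one output and consumes 1x32 chunks of A against 32 values of its B column per step, so a block with 4 warps produces a 1x4 output tile. Below is a simplified standalone sketch of that access pattern with plain fp32 operands and the 4-bit dequantization omitted; the kernel name, argument list, and column-major B layout are assumptions for illustration, not the library's signature:

```cpp
// Simplified sketch of the per-warp dot-product pattern described above.
// Assumptions (not from the source): fp32 operands, no quantization, one warp
// per output column, column-major B with leading dimension ldb >= K.
__global__ void gemv_warp_per_column(int N, int K, const float* __restrict__ A,
                                     const float* __restrict__ B, float* out, int ldb)
{
    const int warps_per_block = blockDim.x / 32;
    const int warp_id = threadIdx.x / 32;
    const int lane_id = threadIdx.x % 32;
    const int col = blockIdx.x * warps_per_block + warp_id;  // output owned by this warp
    if (col >= N) return;                                    // whole warp exits together

    float acc = 0.0f;
    for (int k = lane_id; k < K; k += 32)                    // one 1x32 chunk of A per step
        acc += A[k] * B[col * ldb + k];                      // against 32 values of this B column

    for (int offset = 16; offset > 0; offset /= 2)           // reduce partial sums within the warp
        acc += __shfl_down_sync(0xffffffffu, acc, offset);

    if (lane_id == 0)
        out[col] = acc;                                      // one output per warp -> [1, warps] per block
}
```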
@@ -3764,7 +3764,7 @@ template <typename T, int FUNC> __global__ void kfunc(T *A, T *B, T value, long
  {
    switch(FUNC)
    {
      case FILL:
        A[i] = (T)value;
        break;
      case ARANGE:
...
@@ -389,7 +389,7 @@ extern "C"
  int hasPrefetch = 0;
  CUDA_CHECK_RETURN(cudaDeviceGetAttribute(&hasPrefetch, cudaDevAttrConcurrentManagedAccess, device)); // 40ns overhead
  if (hasPrefetch == 0) return;

  CUDA_CHECK_RETURN(cudaMemPrefetchAsync(ptr, bytes, device, 0));
  CUDA_CHECK_RETURN(cudaPeekAtLastError());
}
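The hunk above only issues `cudaMemPrefetchAsync` when the device reports `cudaDevAttrConcurrentManagedAccess`, since devices without concurrent managed access cannot prefetch unified memory. A minimal standalone sketch of the same guard around a managed allocation follows; the function names and the bare error handling are illustrative and not the library's wrapper (which uses `CUDA_CHECK_RETURN` as shown above):

```cpp
#include <cuda_runtime.h>

// Illustration of the guarded-prefetch pattern above; prefetch_managed is a
// hypothetical helper, not a bitsandbytes API.
void prefetch_managed(void* ptr, size_t bytes, int device)
{
    int hasPrefetch = 0;
    cudaDeviceGetAttribute(&hasPrefetch, cudaDevAttrConcurrentManagedAccess, device);
    if (hasPrefetch == 0) return;                 // device cannot migrate managed pages
    cudaMemPrefetchAsync(ptr, bytes, device, 0);  // move pages to `device` on the default stream
}

int main()
{
    const size_t n = 1 << 20;
    float* data = nullptr;
    cudaMallocManaged(&data, n * sizeof(float));  // unified memory visible to host and device
    for (size_t i = 0; i < n; ++i) data[i] = 1.0f;
    prefetch_managed(data, n * sizeof(float), 0); // migrate to GPU 0 before kernels touch it
    cudaDeviceSynchronize();
    cudaFree(data);
    return 0;
}
```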
...
- sections:
  - local: index
    title: Bits & Bytes
  - local: quickstart
    title: Quickstart
  - local: installation
    title: Installation
  title: Get started
\ No newline at end of file
@@ -149,10 +149,10 @@ To compile from source, you need an installation of CUDA. If `nvcc` is not insta
wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh
# Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH
# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122}
# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True
# For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc
bash install_cuda.sh 117 ~/local 1
```
To use a specific CUDA version just for a single compile run, you can set the variable `CUDA_HOME`. For example, the following command compiles `libbitsandbytes_cuda117.so` using compiler flags for cuda11x with the CUDA version at `~/local/cuda-11.7`:
@@ -188,4 +188,4 @@ For 8-bit optimizers or quantization routines, please consider citing the follow
  journal={9th International Conference on Learning Representations, ICLR},
  year={2022}
}
```
\ No newline at end of file
@@ -6,7 +6,7 @@
## Minimal example
The following code illustrates the steps above.
```python
```
\ No newline at end of file
@@ -42,4 +42,4 @@ dependencies:
## ENV UPDATE:
# # add new packages to environment.yml, then:
# mamba env update -n bnb -f environment.yml
\ No newline at end of file
@@ -22,6 +22,3 @@ model = AutoModelForCausalLM.from_pretrained(
)
generated_ids = model.generate(input_ids, max_length=MAX_NEW_TOKENS)
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))
@@ -18,7 +18,7 @@ You can also install CUDA version that you need locally with a script provided b
wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh
# Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH
# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122}
# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True
# For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc
...
@@ -49,13 +49,13 @@ def install_cuda(version, base_path, download_path):
    # Install CUDA
    print(f"Installing CUDA version {version}...")
    install_command = [
        "bash", filepath,
        "--no-drm", "--no-man-page", "--override",
        "--toolkitpath=" + install_path, "--toolkit", "--silent"
    ]

    print(f"Running command: {' '.join(install_command)}")

    try:
        subprocess.run(install_command, check=True)
    except subprocess.CalledProcessError as e:
@@ -99,4 +99,4 @@ def main():
        sys.exit(1)

if __name__ == "__main__":
    main()
\ No newline at end of file
@@ -55,4 +55,4 @@ def main():

if __name__ == "__main__":
    main()
\ No newline at end of file
@@ -519,4 +519,3 @@ def test_matmul_fp8( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose):
            torch.testing.assert_close(
                gradB1, gradB2, atol=0.18, rtol=0.3
            )
@@ -19,11 +19,3 @@ def test_manual_override(requires_cuda):
    import bitsandbytes as bnb
    loaded_lib = bnb.cuda_setup.main.CUDASetup.get_instance().binary_name
    #assert loaded_lib == 'libbitsandbytes_cuda122.so'
@@ -2345,5 +2345,3 @@ def test_gemv_eye_4bit(storage_type, dtype, double_quant):
    torch.testing.assert_close(A, C2)
    #torch.testing.assert_close(A, C1, rtol=1e-5, atol=0.00001)
    #torch.testing.assert_close(A, C2, rtol=1e-5, atol=0.080)
@@ -120,6 +120,3 @@ def test_pi(requires_cuda, model_and_tokenizer, inference_kernel, DQ, dtype):
        for out in outputs:
            print(out)
        raise ValueError(f'Failure count: {failure_count}/{n_cases}')
@@ -637,6 +637,3 @@ def test_4bit_warnings():
        net(inp)
    assert len(record) == 2
@@ -58,4 +58,3 @@ def test_switchback(vector_wise_quantization):
    print('GX1', err_sb, err_baseline)
    assert err_sb < 2 * err_baseline