Unverified Commit 6974920b authored by Aarni Koskela's avatar Aarni Koskela Committed by GitHub

Enable line-ending and other hygiene lints (#1006)

parent 3a630c58
......@@ -110,7 +110,7 @@ __device__ float dDequantizeFP4Tree(unsigned char val, float absmax)
        return 1.00000000f*absmax*sign; // 1011
      else
        return 0.66666667f*absmax*sign; // 1010
    else
      if((val & 0b0001) == 1) // 100
        return 5.208333333e-03f*absmax*sign; // 1001
      else
......@@ -174,36 +174,36 @@ __device__ half dhDequantizeNF4(unsigned char val)
    if((val & 0b0100) == 4) // 1
      if((val & 0b0010) == 2) // 11
        if((val & 0b0001) == 1) // 111
          return 1.0f;
        else
          return 0.7229568362236023f;
      else
        if((val & 0b0001) == 1) // 110
          return 0.5626170039176941f;
        else
          return 0.44070982933044434f;
    else
      if((val & 0b0010) == 2) // 10
        if((val & 0b0001) == 1) // 101
          return 0.33791524171829224f;
        else
          return 0.24611230194568634f;
      else
        if((val & 0b0001) == 1) // 100
          return 0.16093020141124725f;
        else
          return 0.07958029955625534f;
  else
    if((val & 0b0100) == 4) // 0
      if((val & 0b0010) == 2) // 01
        if((val & 0b0001) == 1) // 011
          return 0.0f;
        else
          return -0.09105003625154495f;
      else
        if((val & 0b0001) == 1) // 010
          return -0.18477343022823334f;
        else
          return -0.28444138169288635f;
    else
......@@ -211,12 +211,12 @@ __device__ half dhDequantizeNF4(unsigned char val)
        if((val & 0b0001) == 1) // 001
          return -0.39491748809814453f;
        else
          return -0.5250730514526367f;
      else
        if((val & 0b0001) == 1) // 000
          return -0.6961928009986877f;
        else
          return -1.0f;
}
......@@ -229,36 +229,36 @@ __device__ float dDequantizeNF4(unsigned char val)
    if((val & 0b0100) == 4) // 1
      if((val & 0b0010) == 2) // 11
        if((val & 0b0001) == 1) // 111
          return 1.0f;
        else
          return 0.7229568362236023f;
      else
        if((val & 0b0001) == 1) // 110
          return 0.5626170039176941f;
        else
          return 0.44070982933044434f;
    else
      if((val & 0b0010) == 2) // 10
        if((val & 0b0001) == 1) // 101
          return 0.33791524171829224f;
        else
          return 0.24611230194568634f;
      else
        if((val & 0b0001) == 1) // 100
          return 0.16093020141124725f;
        else
          return 0.07958029955625534f;
  else
    if((val & 0b0100) == 4) // 0
      if((val & 0b0010) == 2) // 01
        if((val & 0b0001) == 1) // 011
          return 0.0f;
        else
          return -0.09105003625154495f;
      else
        if((val & 0b0001) == 1) // 010
          return -0.18477343022823334f;
        else
          return -0.28444138169288635f;
    else
......@@ -266,12 +266,12 @@ __device__ float dDequantizeNF4(unsigned char val)
        if((val & 0b0001) == 1) // 001
          return -0.39491748809814453f;
        else
          return -0.5250730514526367f;
      else
        if((val & 0b0001) == 1) // 000
          return -0.6961928009986877f;
        else
          return -1.0f;
}
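The two comparison trees above (half and float variants) encode the same 16-entry NormalFloat (NF4) codebook: the 4-bit code selects one of the constants returned above. As a hedged illustration of what the trees compute (not the library's API), the mapping can be written as a plain table lookup in Python:

```python
# Hedged sketch: the NF4 codebook implied by the comparison trees above.
# Index = 4-bit code, value = the dequantized NormalFloat constant.
NF4_CODEBOOK = [
    -1.0,                  # 0000
    -0.6961928009986877,   # 0001
    -0.5250730514526367,   # 0010
    -0.39491748809814453,  # 0011
    -0.28444138169288635,  # 0100
    -0.18477343022823334,  # 0101
    -0.09105003625154495,  # 0110
    0.0,                   # 0111
    0.07958029955625534,   # 1000
    0.16093020141124725,   # 1001
    0.24611230194568634,   # 1010
    0.33791524171829224,   # 1011
    0.44070982933044434,   # 1100
    0.5626170039176941,    # 1101
    0.7229568362236023,    # 1110
    1.0,                   # 1111
]

def dequantize_nf4(val: int) -> float:
    """Table-lookup equivalent of the dDequantizeNF4 branch tree."""
    return NF4_CODEBOOK[val & 0b1111]
```

On the device, the branch tree trades a memory lookup for a few predicated comparisons; the table form is mainly convenient for checking the constants on the host.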
......@@ -1863,7 +1863,7 @@ kOptimizerStatic8bit2StateBlockwise(T* p, T* __restrict__ const g, unsigned char
//float ratio = (g_val*g_val)/fmaxf(s2_vals[j], eps*eps);
//g_val = ratio > 2.0f ? 2.0f*g_val/ratio : g_val;
g_val *= gnorm_scale;
s2_vals[j] = (s2_vals[j]*beta2) + (((1.0f-beta2)*g_val*g_val));
s1_vals[j] = smem_quantiles1[lane_id][c1s[j]]*absmax1[i/BLOCK_SIZE];
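For context, the `s2_vals[j]` line above is the standard Adam second-moment update, `s2 = beta2*s2 + (1-beta2)*g*g`; in this kernel the state is additionally stored in 8 bits with a per-block `absmax` scale and a shared quantile codebook (`smem_quantiles1`). A rough Python sketch of that blockwise idea, with hypothetical helper names rather than the kernel's actual code:

```python
import numpy as np

def blockwise_second_moment_update(s2_q, absmax, g, beta2, code, block_size=256):
    """Hedged sketch: Adam second-moment update on blockwise 8-bit state.

    s2_q:   uint8 quantized state (one codebook index per element)
    absmax: per-block scale factors, shape [n_blocks]
    code:   256-entry quantization codebook (normalized to [-1, 1])
    """
    g = g.reshape(-1, block_size)
    # 1. dequantize the old state: codebook lookup scaled by the block's absmax
    s2 = code[s2_q.reshape(-1, block_size)] * absmax[:, None]
    # 2. the per-element update performed by the CUDA line above
    s2 = beta2 * s2 + (1.0 - beta2) * g * g
    # 3. requantize: fresh per-block absmax, then nearest-codebook-entry search
    new_absmax = np.maximum(np.abs(s2).max(axis=1), 1e-12)
    normed = s2 / new_absmax[:, None]
    new_q = np.abs(normed[..., None] - code).argmin(axis=-1).astype(np.uint8)
    return new_q, new_absmax
```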
......@@ -3069,7 +3069,7 @@ template <int FORMAT> __global__ void kExtractOutliers(char *A, int *idx, char *
//// use k warps per thread block
//// 1. threadblock use read-only cache to read in register tile for A into shared memory
//// 2. each warp loops over shared memory tiles of A of size 8x16 and loads them into fragments
//// 3. each warp reads a segment of values 16x32 from B
//// 4. do dequantization from register of B into second pair of registers
//// 5. store (4) into fragment
//// 6. matmul aggregate into fragment C
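The numbered plan above is a classic tiled mixed-precision matmul: stage a tile of A, dequantize a tile of B into registers, then multiply-accumulate into a C fragment. A small NumPy sketch of steps 2-6 (a host-side analogy with a single hypothetical scale, not the warp-level kernel):

```python
import numpy as np

def tiled_dequant_matmul(A, B_q, scale, tile_k=16):
    """Hedged sketch of the commented plan: C = A @ dequant(B_q)."""
    M, K = A.shape
    _, N = B_q.shape
    C = np.zeros((M, N), dtype=np.float32)          # fragment C accumulator
    for k0 in range(0, K, tile_k):
        a_tile = A[:, k0:k0 + tile_k]               # step 2: tile of A
        b_tile = B_q[k0:k0 + tile_k, :]             # step 3: segment of B
        b_deq = b_tile.astype(np.float32) * scale   # steps 4-5: dequantize
        C += a_tile @ b_deq                         # step 6: matmul-accumulate
    return C
```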
......@@ -3531,7 +3531,7 @@ template <typename T, int THREADS> __global__ void kgemm_4bit_inference(int M, i
template <typename T, int THREADS, int BITS> __global__ void kgemm_4bit_inference_naive(int M, int N, int K, T * __restrict__ const A, unsigned char *B, float *absmax, const float *datatype, T * out, int lda, int ldb, int ldc, int blocksize)
{
// per threadblock:
// load step-by-step in chunks of [32,warps]: 1x32 * [32,warps] -> [1,warps]
// 4 warps -> 4 loads per iter
// 1x32 * 32x4 -> 1x4 outputs per thread block
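The comment describes a chunked inference GEMV: each iteration multiplies a 1x32 slice of the activation by a 32x4 slice of the weight, so a thread block with 4 warps produces 4 output values. A hedged NumPy illustration of that access pattern (not the kernel itself):

```python
import numpy as np

def chunked_gemv(x, W, chunk=32, warps=4):
    """Hedged sketch: 1xK @ KxN, accumulated in [chunk, warps] slices."""
    K, N = W.shape
    out = np.zeros(N, dtype=np.float32)
    for n0 in range(0, N, warps):        # each thread block owns `warps` columns
        for k0 in range(0, K, chunk):    # load step-by-step in chunks of 32
            out[n0:n0 + warps] += x[k0:k0 + chunk] @ W[k0:k0 + chunk, n0:n0 + warps]
    return out
```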
......@@ -3764,7 +3764,7 @@ template <typename T, int FUNC> __global__ void kfunc(T *A, T *B, T value, long
{
  switch(FUNC)
  {
    case FILL:
      A[i] = (T)value;
      break;
    case ARANGE:
......
......@@ -389,7 +389,7 @@ extern "C"
int hasPrefetch = 0;
CUDA_CHECK_RETURN(cudaDeviceGetAttribute(&hasPrefetch, cudaDevAttrConcurrentManagedAccess, device)); // 40ns overhead
if (hasPrefetch == 0) return;
CUDA_CHECK_RETURN(cudaMemPrefetchAsync(ptr, bytes, device, 0));
CUDA_CHECK_RETURN(cudaPeekAtLastError());
}
......
- sections:
  - local: index
    title: Bits & Bytes
  - local: quickstart
    title: Quickstart
  - local: installation
    title: Installation
  title: Get started
......@@ -149,10 +149,10 @@ To compile from source, you need an installation of CUDA. If `nvcc` is not insta
wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh
# Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH
# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122}
# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True
# For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc
bash install_cuda.sh 117 ~/local 1
```
To use a specific CUDA version for a single compile run, set the variable `CUDA_HOME`. For example, the following command compiles `libbitsandbytes_cuda117.so` using the cuda11x compiler flags and the CUDA installation at `~/local/cuda-11.7`:
......@@ -188,4 +188,4 @@ For 8-bit optimizers or quantization routines, please consider citing the follow
journal={9th International Conference on Learning Representations, ICLR},
year={2022}
}
```
......@@ -6,7 +6,7 @@
## Minimal example
The following code illustrates the steps above.
```python
```
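The fenced block above is empty at this revision of the page. As a hedged placeholder only (a sketch, not the page's official snippet), a minimal use of the library's 8-bit optimizer could look like:

```python
import torch
import bitsandbytes as bnb

model = torch.nn.Linear(512, 512).cuda()
# Drop-in replacement for torch.optim.Adam using 8-bit optimizer state.
optimizer = bnb.optim.Adam8bit(model.parameters(), lr=1e-3)

x = torch.randn(8, 512, device="cuda")
loss = model(x).sum()
loss.backward()
optimizer.step()
optimizer.zero_grad()
```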
......@@ -42,4 +42,4 @@ dependencies:
## ENV UPDATE:
# # add new packages to environment.yml, then:
# mamba env update -n bnb -f environment.yml
......@@ -22,6 +22,3 @@ model = AutoModelForCausalLM.from_pretrained(
)
generated_ids = model.generate(input_ids, max_length=MAX_NEW_TOKENS)
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))
......@@ -18,7 +18,7 @@ You can also install CUDA version that you need locally with a script provided b
wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh
# Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH
# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122}
# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True
# For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc
......
......@@ -49,13 +49,13 @@ def install_cuda(version, base_path, download_path):
    # Install CUDA
    print(f"Installing CUDA version {version}...")
    install_command = [
        "bash", filepath,
        "--no-drm", "--no-man-page", "--override",
        "--toolkitpath=" + install_path, "--toolkit", "--silent"
    ]
    print(f"Running command: {' '.join(install_command)}")
    try:
        subprocess.run(install_command, check=True)
    except subprocess.CalledProcessError as e:
......@@ -99,4 +99,4 @@ def main():
sys.exit(1)
if __name__ == "__main__":
    main()
......@@ -55,4 +55,4 @@ def main():
if __name__ == "__main__":
    main()
......@@ -519,4 +519,3 @@ def test_matmul_fp8( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose):
torch.testing.assert_close(
gradB1, gradB2, atol=0.18, rtol=0.3
)
......@@ -19,11 +19,3 @@ def test_manual_override(requires_cuda):
import bitsandbytes as bnb
loaded_lib = bnb.cuda_setup.main.CUDASetup.get_instance().binary_name
#assert loaded_lib == 'libbitsandbytes_cuda122.so'
......@@ -2345,5 +2345,3 @@ def test_gemv_eye_4bit(storage_type, dtype, double_quant):
torch.testing.assert_close(A, C2)
#torch.testing.assert_close(A, C1, rtol=1e-5, atol=0.00001)
#torch.testing.assert_close(A, C2, rtol=1e-5, atol=0.080)
......@@ -120,6 +120,3 @@ def test_pi(requires_cuda, model_and_tokenizer, inference_kernel, DQ, dtype):
    for out in outputs:
        print(out)
    raise ValueError(f'Failure count: {failure_count}/{n_cases}')
......@@ -637,6 +637,3 @@ def test_4bit_warnings():
net(inp)
assert len(record) == 2
......@@ -58,4 +58,3 @@ def test_switchback(vector_wise_quantization):
print('GX1', err_sb, err_baseline)
assert err_sb < 2 * err_baseline