Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
bitsandbytes
Commits
3ec3dd26
Commit
3ec3dd26
authored
Mar 13, 2024
by
Aarni Koskela
Browse files
Fix type documentation for optimizer `args`
parent
8706830f
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing 9 changed files with 70 additions and 70 deletions
+70
-70
bitsandbytes/optim/adagrad.py
bitsandbytes/optim/adagrad.py
+6
-6
bitsandbytes/optim/adam.py
bitsandbytes/optim/adam.py
+12
-12
bitsandbytes/optim/adamw.py
bitsandbytes/optim/adamw.py
+12
-12
bitsandbytes/optim/lamb.py
bitsandbytes/optim/lamb.py
+6
-6
bitsandbytes/optim/lars.py
bitsandbytes/optim/lars.py
+6
-6
bitsandbytes/optim/lion.py
bitsandbytes/optim/lion.py
+12
-12
bitsandbytes/optim/optimizer.py
bitsandbytes/optim/optimizer.py
+4
-4
bitsandbytes/optim/rmsprop.py
bitsandbytes/optim/rmsprop.py
+6
-6
bitsandbytes/optim/sgd.py
bitsandbytes/optim/sgd.py
+6
-6
No files found.
bitsandbytes/optim/adagrad.py
View file @
3ec3dd26
...
...
@@ -38,8 +38,8 @@ class Adagrad(Optimizer1State):
The epsilon value prevents division by zero in the optimizer.
optim_bits (`int`, defaults to 32):
The number of bits of the optimizer state.
- args (`dict`, defaults to `None`):
- A dictionary with additional arguments.
+ args (`object`, defaults to `None`):
+ An object with additional arguments.
min_8bit_size (`int`, defaults to 4096):
The minimum number of elements of the parameter tensors for 8-bit optimization.
percentile_clipping (`int`, defaults to 100):
...
...
@@ -105,8 +105,8 @@ class Adagrad8bit(Optimizer1State):
The epsilon value prevents division by zero in the optimizer.
optim_bits (`int`, defaults to 8):
The number of bits of the optimizer state.
- args (`dict`, defaults to `None`):
- A dictionary with additional arguments.
+ args (`object`, defaults to `None`):
+ An object with additional arguments.
min_8bit_size (`int`, defaults to 4096):
The minimum number of elements of the parameter tensors for 8-bit optimization.
percentile_clipping (`int`, defaults to 100):
...
...
@@ -173,8 +173,8 @@ class Adagrad32bit(Optimizer1State):
The epsilon value prevents division by zero in the optimizer.
optim_bits (`int`, defaults to 32):
The number of bits of the optimizer state.
- args (`dict`, defaults to `None`):
- A dictionary with additional arguments.
+ args (`object`, defaults to `None`):
+ An object with additional arguments.
min_8bit_size (`int`, defaults to 4096):
The minimum number of elements of the parameter tensors for 8-bit optimization.
percentile_clipping (`int`, defaults to 100):
...
...
bitsandbytes/optim/adam.py
View file @
3ec3dd26
...
...
@@ -47,8 +47,8 @@ class Adam(Optimizer2State):
Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
optim_bits (`int`, defaults to 32):
The number of bits of the optimizer state.
- args (`dict`, defaults to `None`):
- A dictionary with additional arguments.
+ args (`object`, defaults to `None`):
+ An object with additional arguments.
min_8bit_size (`int`, defaults to 4096):
The minimum number of elements of the parameter tensors for 8-bit optimization.
percentile_clipping (`int`, defaults to 100):
...
...
@@ -108,8 +108,8 @@ class Adam8bit(Optimizer2State):
Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
optim_bits (`int`, defaults to 32):
The number of bits of the optimizer state.
- args (`dict`, defaults to `None`):
- A dictionary with additional arguments.
+ args (`object`, defaults to `None`):
+ An object with additional arguments.
min_8bit_size (`int`, defaults to 4096):
The minimum number of elements of the parameter tensors for 8-bit optimization.
percentile_clipping (`int`, defaults to 100):
...
...
@@ -169,8 +169,8 @@ class Adam32bit(Optimizer2State):
Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
optim_bits (`int`, defaults to 32):
The number of bits of the optimizer state.
- args (`dict`, defaults to `None`):
- A dictionary with additional arguments.
+ args (`object`, defaults to `None`):
+ An object with additional arguments.
min_8bit_size (`int`, defaults to 4096):
The minimum number of elements of the parameter tensors for 8-bit optimization.
percentile_clipping (`int`, defaults to 100):
...
...
@@ -230,8 +230,8 @@ class PagedAdam(Optimizer2State):
Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
optim_bits (`int`, defaults to 32):
The number of bits of the optimizer state.
- args (`dict`, defaults to `None`):
- A dictionary with additional arguments.
+ args (`object`, defaults to `None`):
+ An object with additional arguments.
min_8bit_size (`int`, defaults to 4096):
The minimum number of elements of the parameter tensors for 8-bit optimization.
percentile_clipping (`int`, defaults to 100):
...
...
@@ -291,8 +291,8 @@ class PagedAdam8bit(Optimizer2State):
Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
optim_bits (`int`, defaults to 32):
The number of bits of the optimizer state.
- args (`dict`, defaults to `None`):
- A dictionary with additional arguments.
+ args (`object`, defaults to `None`):
+ An object with additional arguments.
min_8bit_size (`int`, defaults to 4096):
The minimum number of elements of the parameter tensors for 8-bit optimization.
percentile_clipping (`int`, defaults to 100):
...
...
@@ -352,8 +352,8 @@ class PagedAdam32bit(Optimizer2State):
Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
optim_bits (`int`, defaults to 32):
The number of bits of the optimizer state.
- args (`dict`, defaults to `None`):
- A dictionary with additional arguments.
+ args (`object`, defaults to `None`):
+ An object with additional arguments.
min_8bit_size (`int`, defaults to 4096):
The minimum number of elements of the parameter tensors for 8-bit optimization.
percentile_clipping (`int`, defaults to 100):
...
...
bitsandbytes/optim/adamw.py
View file @
3ec3dd26
...
...
@@ -39,8 +39,8 @@ class AdamW(Optimizer2State):
Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
optim_bits (`int`, defaults to 32):
The number of bits of the optimizer state.
- args (`dict`, defaults to `None`):
- A dictionary with additional arguments.
+ args (`object`, defaults to `None`):
+ An object with additional arguments.
min_8bit_size (`int`, defaults to 4096):
The minimum number of elements of the parameter tensors for 8-bit optimization.
percentile_clipping (`int`, defaults to 100):
...
...
@@ -100,8 +100,8 @@ class AdamW8bit(Optimizer2State):
Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
optim_bits (`int`, defaults to 32):
The number of bits of the optimizer state.
- args (`dict`, defaults to `None`):
- A dictionary with additional arguments.
+ args (`object`, defaults to `None`):
+ An object with additional arguments.
min_8bit_size (`int`, defaults to 4096):
The minimum number of elements of the parameter tensors for 8-bit optimization.
percentile_clipping (`int`, defaults to 100):
...
...
@@ -161,8 +161,8 @@ class AdamW32bit(Optimizer2State):
Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
optim_bits (`int`, defaults to 32):
The number of bits of the optimizer state.
- args (`dict`, defaults to `None`):
- A dictionary with additional arguments.
+ args (`object`, defaults to `None`):
+ An object with additional arguments.
min_8bit_size (`int`, defaults to 4096):
The minimum number of elements of the parameter tensors for 8-bit optimization.
percentile_clipping (`int`, defaults to 100):
...
...
@@ -221,8 +221,8 @@ class PagedAdamW(Optimizer2State):
Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
optim_bits (`int`, defaults to 32):
The number of bits of the optimizer state.
- args (`dict`, defaults to `None`):
- A dictionary with additional arguments.
+ args (`object`, defaults to `None`):
+ An object with additional arguments.
min_8bit_size (`int`, defaults to 4096):
The minimum number of elements of the parameter tensors for 8-bit optimization.
percentile_clipping (`int`, defaults to 100):
...
...
@@ -281,8 +281,8 @@ class PagedAdamW8bit(Optimizer2State):
Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
optim_bits (`int`, defaults to 32):
The number of bits of the optimizer state.
- args (`dict`, defaults to `None`):
- A dictionary with additional arguments.
+ args (`object`, defaults to `None`):
+ An object with additional arguments.
min_8bit_size (`int`, defaults to 4096):
The minimum number of elements of the parameter tensors for 8-bit optimization.
percentile_clipping (`int`, defaults to 100):
...
...
@@ -341,8 +341,8 @@ class PagedAdamW32bit(Optimizer2State):
Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
optim_bits (`int`, defaults to 32):
The number of bits of the optimizer state.
- args (`dict`, defaults to `None`):
- A dictionary with additional arguments.
+ args (`object`, defaults to `None`):
+ An object with additional arguments.
min_8bit_size (`int`, defaults to 4096):
The minimum number of elements of the parameter tensors for 8-bit optimization.
percentile_clipping (`int`, defaults to 100):
...
...
bitsandbytes/optim/lamb.py
View file @
3ec3dd26
...
...
@@ -45,8 +45,8 @@ class LAMB(Optimizer2State):
Whether to use the AdamW variant.
optim_bits (`int`, defaults to 32):
The number of bits of the optimizer state.
- args (`dict`, defaults to `None`):
- A dictionary with additional arguments.
+ args (`object`, defaults to `None`):
+ An object with additional arguments.
min_8bit_size (`int`, defaults to 4096):
The minimum number of elements of the parameter tensors for 8-bit optimization.
percentile_clipping (`int`, defaults to 100):
...
...
@@ -109,8 +109,8 @@ class LAMB8bit(Optimizer2State):
Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
adam_w_mode (`bool`, defaults to `True`):
Whether to use the AdamW variant.
- args (`dict`, defaults to `None`):
- A dictionary with additional arguments.
+ args (`object`, defaults to `None`):
+ An object with additional arguments.
min_8bit_size (`int`, defaults to 4096):
The minimum number of elements of the parameter tensors for 8-bit optimization.
percentile_clipping (`int`, defaults to 100):
...
...
@@ -173,8 +173,8 @@ class LAMB32bit(Optimizer2State):
Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
adam_w_mode (`bool`, defaults to `True`):
Whether to use the AdamW variant.
- args (`dict`, defaults to `None`):
- A dictionary with additional arguments.
+ args (`object`, defaults to `None`):
+ An object with additional arguments.
min_8bit_size (`int`, defaults to 4096):
The minimum number of elements of the parameter tensors for 8-bit optimization.
percentile_clipping (`int`, defaults to 100):
...
...
bitsandbytes/optim/lars.py
View file @
3ec3dd26
...
...
@@ -41,8 +41,8 @@ class LARS(Optimizer1State):
Whether to use Nesterov momentum.
optim_bits (`int`, defaults to 32):
The number of bits of the optimizer state.
- args (`dict`, defaults to `None`):
- A dictionary with additional arguments.
+ args (`object`, defaults to `None`):
+ An object with additional arguments.
min_8bit_size (`int`, defaults to 4096):
The minimum number of elements of the parameter tensors for 8-bit optimization.
percentile_clipping (`int`, defaults to 100):
...
...
@@ -98,8 +98,8 @@ class LARS8bit(Optimizer1State):
The weight decay value for the optimizer.
nesterov (`bool`, defaults to `False`):
Whether to use Nesterov momentum.
- args (`dict`, defaults to `None`):
- A dictionary with additional arguments.
+ args (`object`, defaults to `None`):
+ An object with additional arguments.
min_8bit_size (`int`, defaults to 4096):
The minimum number of elements of the parameter tensors for 8-bit optimization.
percentile_clipping (`int`, defaults to 100):
...
...
@@ -155,8 +155,8 @@ class LARS32bit(Optimizer1State):
The weight decay value for the optimizer.
nesterov (`bool`, defaults to `False`):
Whether to use Nesterov momentum.
- args (`dict`, defaults to `None`):
- A dictionary with additional arguments.
+ args (`object`, defaults to `None`):
+ An object with additional arguments.
min_8bit_size (`int`, defaults to 4096):
The minimum number of elements of the parameter tensors for 8-bit optimization.
percentile_clipping (`int`, defaults to 100):
...
...
bitsandbytes/optim/lion.py
View file @
3ec3dd26
...
...
@@ -33,8 +33,8 @@ class Lion(Optimizer1State):
The weight decay value for the optimizer.
optim_bits (`int`, defaults to 32):
The number of bits of the optimizer state.
- args (`dict`, defaults to `None`):
- A dictionary with additional arguments.
+ args (`object`, defaults to `None`):
+ An object with additional arguments.
min_8bit_size (`int`, defaults to 4096):
The minimum number of elements of the parameter tensors for 8-bit optimization.
percentile_clipping (`int`, defaults to 100):
...
...
@@ -85,8 +85,8 @@ class Lion8bit(Optimizer1State):
The beta values are the decay rates of the first and second-order moment of the optimizer.
weight_decay (`float`, defaults to 0):
The weight decay value for the optimizer.
- args (`dict`, defaults to `None`):
- A dictionary with additional arguments.
+ args (`object`, defaults to `None`):
+ An object with additional arguments.
min_8bit_size (`int`, defaults to 4096):
The minimum number of elements of the parameter tensors for 8-bit optimization.
percentile_clipping (`int`, defaults to 100):
...
...
@@ -137,8 +137,8 @@ class Lion32bit(Optimizer1State):
The beta values are the decay rates of the first and second-order moment of the optimizer.
weight_decay (`float`, defaults to 0):
The weight decay value for the optimizer.
- args (`dict`, defaults to `None`):
- A dictionary with additional arguments.
+ args (`object`, defaults to `None`):
+ An object with additional arguments.
min_8bit_size (`int`, defaults to 4096):
The minimum number of elements of the parameter tensors for 8-bit optimization.
percentile_clipping (`int`, defaults to 100):
...
...
@@ -191,8 +191,8 @@ class PagedLion(Optimizer1State):
The weight decay value for the optimizer.
optim_bits (`int`, defaults to 32):
The number of bits of the optimizer state.
- args (`dict`, defaults to `None`):
- A dictionary with additional arguments.
+ args (`object`, defaults to `None`):
+ An object with additional arguments.
min_8bit_size (`int`, defaults to 4096):
The minimum number of elements of the parameter tensors for 8-bit optimization.
percentile_clipping (`int`, defaults to 100):
...
...
@@ -242,8 +242,8 @@ class PagedLion8bit(Optimizer1State):
The weight decay value for the optimizer.
optim_bits (`int`, defaults to 32):
The number of bits of the optimizer state.
- args (`dict`, defaults to `None`):
- A dictionary with additional arguments.
+ args (`object`, defaults to `None`):
+ An object with additional arguments.
min_8bit_size (`int`, defaults to 4096):
The minimum number of elements of the parameter tensors for 8-bit optimization.
percentile_clipping (`int`, defaults to 100):
...
...
@@ -293,8 +293,8 @@ class PagedLion32bit(Optimizer1State):
The weight decay value for the optimizer.
optim_bits (`int`, defaults to 32):
The number of bits of the optimizer state.
- args (`dict`, defaults to `None`):
- A dictionary with additional arguments.
+ args (`object`, defaults to `None`):
+ An object with additional arguments.
min_8bit_size (`int`, defaults to 4096):
The minimum number of elements of the parameter tensors for 8-bit optimization.
percentile_clipping (`int`, defaults to 100):
...
...
bitsandbytes/optim/optimizer.py
View file @
3ec3dd26
...
...
@@ -373,8 +373,8 @@ class Optimizer2State(Optimizer8bit):
The weight decay value for the optimizer.
optim_bits (`int`, defaults to 32):
The number of bits of the optimizer state.
- args (`dict`, defaults to `None`):
- A dictionary with additional arguments.
+ args (`object`, defaults to `None`):
+ An object with additional arguments.
min_8bit_size (`int`, defaults to 4096):
The minimum number of elements of the parameter tensors for 8-bit optimization.
percentile_clipping (`int`, defaults to 100):
...
...
@@ -596,8 +596,8 @@ class Optimizer1State(Optimizer8bit):
The weight decay value for the optimizer.
optim_bits (`int`, defaults to 32):
The number of bits of the optimizer state.
- args (`dict`, defaults to `None`):
- A dictionary with additional arguments.
+ args (`object`, defaults to `None`):
+ An object with additional arguments.
min_8bit_size (`int`, defaults to 4096):
The minimum number of elements of the parameter tensors for 8-bit optimization.
percentile_clipping (`int`, defaults to 100):
...
...
bitsandbytes/optim/rmsprop.py
View file @
3ec3dd26
...
...
@@ -41,8 +41,8 @@ class RMSprop(Optimizer1State):
Whether the gradients are normalized by the variance. If `True`, it can help training at the expense of additional compute.
optim_bits (`int`, defaults to 32):
The number of bits of the optimizer state.
- args (`dict`, defaults to `None`):
- A dictionary with additional arguments.
+ args (`object`, defaults to `None`):
+ An object with additional arguments.
min_8bit_size (`int`, defaults to 4096):
The minimum number of elements of the parameter tensors for 8-bit optimization.
percentile_clipping (`int`, defaults to 100):
...
...
@@ -104,8 +104,8 @@ class RMSprop8bit(Optimizer1State):
Whether the gradients are normalized by the variance. If `True`, it can help training at the expense of additional compute.
optim_bits (`int`, defaults to 32):
The number of bits of the optimizer state.
- args (`dict`, defaults to `None`):
- A dictionary with additional arguments.
+ args (`object`, defaults to `None`):
+ An object with additional arguments.
min_8bit_size (`int`, defaults to 4096):
The minimum number of elements of the parameter tensors for 8-bit optimization.
percentile_clipping (`int`, defaults to 100):
...
...
@@ -167,8 +167,8 @@ class RMSprop32bit(Optimizer1State):
Whether the gradients are normalized by the variance. If `True`, it can help training at the expense of additional compute.
optim_bits (`int`, defaults to 32):
The number of bits of the optimizer state.
- args (`dict`, defaults to `None`):
- A dictionary with additional arguments.
+ args (`object`, defaults to `None`):
+ An object with additional arguments.
min_8bit_size (`int`, defaults to 4096):
The minimum number of elements of the parameter tensors for 8-bit optimization.
percentile_clipping (`int`, defaults to 100):
...
...
bitsandbytes/optim/sgd.py
View file @
3ec3dd26
...
...
@@ -38,8 +38,8 @@ class SGD(Optimizer1State):
Whether to use Nesterov momentum.
optim_bits (`int`, defaults to 32):
The number of bits of the optimizer state.
- args (`dict`, defaults to `None`):
- A dictionary with additional arguments.
+ args (`object`, defaults to `None`):
+ An object with additional arguments.
min_8bit_size (`int`, defaults to 4096):
The minimum number of elements of the parameter tensors for 8-bit optimization.
percentile_clipping (`int`, defaults to 100):
...
...
@@ -94,8 +94,8 @@ class SGD8bit(Optimizer1State):
The weight decay value for the optimizer.
nesterov (`bool`, defaults to `False`):
Whether to use Nesterov momentum.
- args (`dict`, defaults to `None`):
- A dictionary with additional arguments.
+ args (`object`, defaults to `None`):
+ An object with additional arguments.
min_8bit_size (`int`, defaults to 4096):
The minimum number of elements of the parameter tensors for 8-bit optimization.
percentile_clipping (`int`, defaults to 100):
...
...
@@ -150,8 +150,8 @@ class SGD32bit(Optimizer1State):
The weight decay value for the optimizer.
nesterov (`bool`, defaults to `False`):
Whether to use Nesterov momentum.
- args (`dict`, defaults to `None`):
- A dictionary with additional arguments.
+ args (`object`, defaults to `None`):
+ An object with additional arguments.
min_8bit_size (`int`, defaults to 4096):
The minimum number of elements of the parameter tensors for 8-bit optimization.
percentile_clipping (`int`, defaults to 100):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment