Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
bitsandbytes
Commits
be5cecb8
Unverified
Commit
be5cecb8
authored
Jan 02, 2023
by
Tim Dettmers
Committed by
GitHub
Jan 02, 2023
Browse files
Merge branch 'main' into main
parents
8724c990
f0ec93d0
Changes
41
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
458 additions
and
263 deletions
+458
-263
bitsandbytes/optim/optimizer.py
bitsandbytes/optim/optimizer.py
+20
-22
bitsandbytes/optim/rmsprop.py
bitsandbytes/optim/rmsprop.py
+9
-9
bitsandbytes/optim/sgd.py
bitsandbytes/optim/sgd.py
+6
-6
compile_from_source.md
compile_from_source.md
+2
-2
csrc/cpu_ops.cpp
csrc/cpu_ops.cpp
+1
-1
csrc/kernels.cu
csrc/kernels.cu
+93
-69
csrc/kernels.cuh
csrc/kernels.cuh
+20
-22
csrc/ops.cu
csrc/ops.cu
+43
-14
csrc/ops.cuh
csrc/ops.cuh
+9
-9
csrc/pythonInterface.c
csrc/pythonInterface.c
+10
-11
cuda_install.sh
cuda_install.sh
+0
-3
howto_config_override.md
howto_config_override.md
+4
-4
include/Algo-Direct-Common.h
include/Algo-Direct-Common.h
+4
-4
include/SIMD.h
include/SIMD.h
+1
-1
setup.py
setup.py
+1
-4
tests/test_autograd.py
tests/test_autograd.py
+4
-4
tests/test_cuda_setup_evaluator.py
tests/test_cuda_setup_evaluator.py
+4
-4
tests/test_functional.py
tests/test_functional.py
+216
-63
tests/test_modules.py
tests/test_modules.py
+5
-5
tests/test_optim.py
tests/test_optim.py
+6
-6
No files found.
bitsandbytes/optim/optimizer.py
View file @
be5cecb8
...
...
@@ -12,13 +12,13 @@ import torch
import
bitsandbytes.functional
as
F
class
MockArgs
(
object
)
:
class
MockArgs
:
def
__init__
(
self
,
initial_data
):
for
key
in
initial_data
:
setattr
(
self
,
key
,
initial_data
[
key
])
class
GlobalOptimManager
(
object
)
:
class
GlobalOptimManager
:
_instance
=
None
def
__init__
(
self
):
...
...
@@ -56,9 +56,9 @@ class GlobalOptimManager(object):
"""
Overrides initial optimizer config for specific parameters.
The key-values of the optimizer config for the input parameters are overidden
The key-values of the optimizer config for the input parameters are over
r
idden
This can be both, optimizer parameters like "betas", or "lr" or it can be
8-bit specific paramters like "optim_bits", "percentile_clipping".
8-bit specific param
e
ters like "optim_bits", "percentile_clipping".
Parameters
----------
...
...
@@ -93,13 +93,12 @@ class GlobalOptimManager(object):
class
Optimizer8bit
(
torch
.
optim
.
Optimizer
):
def
__init__
(
self
,
params
,
defaults
,
optim_bits
=
32
):
super
(
Optimizer8bit
,
self
).
__init__
(
params
,
defaults
)
super
().
__init__
(
params
,
defaults
)
self
.
initialized
=
False
self
.
name2qmap
=
{}
self
.
mng
=
GlobalOptimManager
.
get_instance
()
self
.
non_castable_tensor_keys
=
set
(
[
self
.
non_castable_tensor_keys
=
{
"qmap1"
,
"qmap2"
,
"max1"
,
...
...
@@ -112,8 +111,7 @@ class Optimizer8bit(torch.optim.Optimizer):
"absmax1"
,
"absmax2"
,
"unorm_vec"
,
]
)
}
if
optim_bits
==
8
:
self
.
fill_qmap
()
...
...
@@ -123,7 +121,7 @@ class Optimizer8bit(torch.optim.Optimizer):
self
.
name2qmap
[
"udynamic"
]
=
F
.
create_dynamic_map
(
signed
=
False
)
def
__setstate__
(
self
,
state
):
super
(
Optimizer8bit
,
self
).
__setstate__
(
state
)
super
().
__setstate__
(
state
)
def
load_state_dict
(
self
,
state_dict
):
r
"""Loads the optimizer state.
...
...
@@ -155,8 +153,8 @@ class Optimizer8bit(torch.optim.Optimizer):
id_map
=
{
old_id
:
p
for
old_id
,
p
in
zip
(
chain
.
from_iterable
(
(
g
[
"params"
]
for
g
in
saved_groups
)
)
,
chain
.
from_iterable
(
(
g
[
"params"
]
for
g
in
groups
)
)
,
chain
.
from_iterable
(
g
[
"params"
]
for
g
in
saved_groups
),
chain
.
from_iterable
(
g
[
"params"
]
for
g
in
groups
),
)
}
...
...
@@ -284,11 +282,11 @@ class Optimizer8bit(torch.optim.Optimizer):
return
config
def
init_state
(
self
,
group
,
p
,
gindex
,
pindex
):
raise
NotImplementedError
(
f
"init_state method needs to be overidden"
)
raise
NotImplementedError
(
"init_state method needs to be over
r
idden"
)
def
update_step
(
self
,
group
,
p
,
gindex
,
pindex
):
raise
NotImplementedError
(
f
"The update_step method needs to be overidden"
"The update_step method needs to be over
r
idden"
)
...
...
@@ -310,9 +308,9 @@ class Optimizer2State(Optimizer8bit):
skip_zeros
=
False
,
):
if
not
0.0
<=
lr
:
raise
ValueError
(
"Invalid learning rate: {}"
.
format
(
lr
)
)
raise
ValueError
(
f
"Invalid learning rate:
{
lr
}
"
)
if
not
0.0
<=
eps
:
raise
ValueError
(
"Invalid epsilon value: {
}"
.
format
(
eps
)
)
raise
ValueError
(
f
"Invalid epsilon value:
{
eps
}
"
)
if
isinstance
(
betas
,
str
):
# format: '(beta1, beta2)'
betas
=
betas
.
replace
(
"("
,
""
).
replace
(
")"
,
""
).
strip
().
split
(
","
)
...
...
@@ -324,10 +322,10 @@ class Optimizer2State(Optimizer8bit):
)
if
not
0.0
<=
weight_decay
:
raise
ValueError
(
"Invalid weight_decay value: {
}"
.
format
(
weight_decay
)
f
"Invalid weight_decay value:
{
weight_decay
}
"
)
defaults
=
dict
(
lr
=
lr
,
betas
=
betas
,
eps
=
eps
,
weight_decay
=
weight_decay
)
super
(
Optimizer2State
,
self
).
__init__
(
params
,
defaults
,
optim_bits
)
super
().
__init__
(
params
,
defaults
,
optim_bits
)
if
args
is
None
:
args
=
{}
...
...
@@ -542,9 +540,9 @@ class Optimizer1State(Optimizer8bit):
skip_zeros
=
False
,
):
if
not
0.0
<=
lr
:
raise
ValueError
(
"Invalid learning rate: {}"
.
format
(
lr
)
)
raise
ValueError
(
f
"Invalid learning rate:
{
lr
}
"
)
if
not
0.0
<=
eps
:
raise
ValueError
(
"Invalid epsilon value: {
}"
.
format
(
eps
)
)
raise
ValueError
(
f
"Invalid epsilon value:
{
eps
}
"
)
for
i
in
range
(
len
(
betas
)):
if
not
0.0
<=
betas
[
i
]
<
1.0
:
raise
ValueError
(
...
...
@@ -552,10 +550,10 @@ class Optimizer1State(Optimizer8bit):
)
if
not
0.0
<=
weight_decay
:
raise
ValueError
(
"Invalid weight_decay value: {
}"
.
format
(
weight_decay
)
f
"Invalid weight_decay value:
{
weight_decay
}
"
)
defaults
=
dict
(
lr
=
lr
,
betas
=
betas
,
eps
=
eps
,
weight_decay
=
weight_decay
)
super
(
Optimizer1State
,
self
).
__init__
(
params
,
defaults
,
optim_bits
)
super
().
__init__
(
params
,
defaults
,
optim_bits
)
if
args
is
None
:
args
=
{}
...
...
bitsandbytes/optim/rmsprop.py
View file @
be5cecb8
...
...
@@ -23,11 +23,11 @@ class RMSprop(Optimizer1State):
):
if
alpha
==
0
:
raise
NotImplementedError
(
f
"RMSprop with alpha==0.0 is not supported!"
"RMSprop with alpha==0.0 is not supported!"
)
if
centered
:
raise
NotImplementedError
(
f
"Centered RMSprop is not supported!"
)
super
(
RMSprop
,
self
).
__init__
(
raise
NotImplementedError
(
"Centered RMSprop is not supported!"
)
super
().
__init__
(
"rmsprop"
,
params
,
lr
,
...
...
@@ -59,11 +59,11 @@ class RMSprop8bit(Optimizer1State):
):
if
alpha
==
0
:
raise
NotImplementedError
(
f
"RMSprop with alpha==0.0 is not supported!"
"RMSprop with alpha==0.0 is not supported!"
)
if
centered
:
raise
NotImplementedError
(
f
"Centered RMSprop is not supported!"
)
super
(
RMSprop8bit
,
self
).
__init__
(
raise
NotImplementedError
(
"Centered RMSprop is not supported!"
)
super
().
__init__
(
"rmsprop"
,
params
,
lr
,
...
...
@@ -96,11 +96,11 @@ class RMSprop32bit(Optimizer1State):
if
alpha
==
0
:
raise
NotImplementedError
(
f
"RMSprop with alpha==0.0 is not supported!"
"RMSprop with alpha==0.0 is not supported!"
)
if
centered
:
raise
NotImplementedError
(
f
"Centered RMSprop is not supported!"
)
super
(
RMSprop32bit
,
self
).
__init__
(
raise
NotImplementedError
(
"Centered RMSprop is not supported!"
)
super
().
__init__
(
"rmsprop"
,
params
,
lr
,
...
...
bitsandbytes/optim/sgd.py
View file @
be5cecb8
...
...
@@ -21,8 +21,8 @@ class SGD(Optimizer1State):
block_wise
=
True
,
):
if
momentum
==
0
:
raise
NotImplementedError
(
f
"SGD without momentum is not supported!"
)
super
(
SGD
,
self
).
__init__
(
raise
NotImplementedError
(
"SGD without momentum is not supported!"
)
super
().
__init__
(
"momentum"
,
params
,
lr
,
...
...
@@ -52,8 +52,8 @@ class SGD8bit(Optimizer1State):
block_wise
=
True
,
):
if
momentum
==
0
:
raise
NotImplementedError
(
f
"SGD without momentum is not supported!"
)
super
(
SGD8bit
,
self
).
__init__
(
raise
NotImplementedError
(
"SGD without momentum is not supported!"
)
super
().
__init__
(
"momentum"
,
params
,
lr
,
...
...
@@ -83,8 +83,8 @@ class SGD32bit(Optimizer1State):
block_wise
=
True
,
):
if
momentum
==
0
:
raise
NotImplementedError
(
f
"SGD without momentum is not supported!"
)
super
(
SGD32bit
,
self
).
__init__
(
raise
NotImplementedError
(
"SGD without momentum is not supported!"
)
super
().
__init__
(
"momentum"
,
params
,
lr
,
...
...
compile_from_source.md
View file @
be5cecb8
csrc/cpu_ops.cpp
View file @
be5cecb8
csrc/kernels.cu
View file @
be5cecb8
...
...
@@ -428,16 +428,16 @@ __global__ void kQuantize(float * code, float * __restrict__ const A, unsigned c
}
template
<
typename
T
,
int
BLOCK_SIZE
,
int
NUM_PER_TH
,
int
STOCHASTIC
>
__launch_bounds__
(
TH
,
4
)
//
__launch_bounds__(TH, 4)
__global__
void
kQuantizeBlockwise
(
float
*
code
,
T
*
__restrict__
const
A
,
float
*
absmax
,
unsigned
char
*
out
,
float
*
__restrict__
const
rand
,
const
int
rand_offset
,
const
int
n
)
{
const
int
n_full
=
gridDim
.
x
*
BLOCK_SIZE
;
int
valid_items
=
0
;
const
int
base_idx
=
(
blockIdx
.
x
*
BLOCK_SIZE
);
T
vals
[
NUM
];
float
rand_vals
[
NUM
];
unsigned
char
qvals
[
NUM
];
T
vals
[
NUM
_PER_TH
];
float
rand_vals
[
NUM
_PER_TH
];
unsigned
char
qvals
[
NUM
_PER_TH
];
//float local_abs_max = -FLT_MAX;
float
local_abs_max
=
0.0
f
;
int
local_rand_idx
=
0
;
...
...
@@ -454,8 +454,8 @@ __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float
__shared__
float
smem_code
[
256
];
__shared__
float
smem_absmax_value
[
1
];
if
(
threadIdx
.
x
<
256
)
smem_code
[
threadIdx
.
x
]
=
code
[
threadIdx
.
x
];
for
(
int
i
=
threadIdx
.
x
;
i
<
256
;
i
+=
blockDim
.
x
)
smem_code
[
i
]
=
code
[
i
];
for
(
unsigned
int
i
=
base_idx
;
i
<
n_full
;
i
+=
gridDim
.
x
*
BLOCK_SIZE
)
{
...
...
@@ -510,15 +510,15 @@ __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float
}
template
<
typename
T
,
int
BLOCK_SIZE
,
int
THREADS
,
int
NUM_PER_TH
>
__global__
void
kDequantizeBlockwise
(
float
*
code
,
unsigned
char
*
__restrict__
const
A
,
float
*
__restrict__
const
absmax
,
T
*
out
,
const
int
n
)
__global__
void
kDequantizeBlockwise
(
float
*
code
,
unsigned
char
*
A
,
float
*
absmax
,
T
*
out
,
const
int
n
)
{
const
int
n_full
=
gridDim
.
x
*
BLOCK_SIZE
;
int
valid_items
=
0
;
const
int
base_idx
=
(
blockIdx
.
x
*
BLOCK_SIZE
);
T
vals
[
NUM
];
unsigned
char
qvals
[
NUM
];
T
vals
[
NUM
_PER_TH
];
unsigned
char
qvals
[
NUM
_PER_TH
];
float
local_abs_max
=
-
FLT_MAX
;
typedef
cub
::
BlockLoad
<
unsigned
char
,
THREADS
,
NUM_PER_TH
,
cub
::
BLOCK_LOAD_WARP_TRANSPOSE
>
LoadChar
;
...
...
@@ -526,10 +526,11 @@ __global__ void kDequantizeBlockwise(float *code, unsigned char * __restrict__ c
__shared__
typename
LoadChar
::
TempStorage
loadchar
;
__shared__
typename
StoreT
::
TempStorage
storet
;
__shared__
float
smem_code
[
256
];
//__shared__ float smem_code[256];
//float local_code[16];
if
(
threadIdx
.
x
<
256
)
smem_code
[
threadIdx
.
x
]
=
code
[
threadIdx
.
x
];
//
if(threadIdx.x < 256)
//
smem_code[threadIdx.x] = code[threadIdx.x];
for
(
unsigned
int
i
=
base_idx
;
i
<
n_full
;
i
+=
gridDim
.
x
*
BLOCK_SIZE
)
{
...
...
@@ -539,9 +540,10 @@ __global__ void kDequantizeBlockwise(float *code, unsigned char * __restrict__ c
__syncthreads
();
LoadChar
(
loadchar
).
Load
(
&
(
A
[
i
]),
qvals
,
valid_items
,
128
);
// load code through read-only cache via __ldg
#pragma unroll NUM_PER_TH
for
(
int
j
=
0
;
j
<
NUM_PER_TH
;
j
++
)
vals
[
j
]
=
smem_
code
[
qvals
[
j
]]
*
local_abs_max
;
vals
[
j
]
=
__ldg
(
&
code
[
qvals
[
j
]]
)
*
local_abs_max
;
__syncthreads
();
StoreT
(
storet
).
Store
(
&
(
out
[
i
]),
vals
,
valid_items
);
...
...
@@ -2791,11 +2793,33 @@ template __global__ void kQuantizeBlockwise<half, 4096, 4, 0>(float * code, half
template
__global__
void
kQuantizeBlockwise
<
float
,
4096
,
4
,
0
>(
float
*
code
,
float
*
__restrict__
const
A
,
float
*
absmax
,
unsigned
char
*
out
,
float
*
__restrict__
const
rand
,
const
int
rand_offset
,
const
int
n
);
template
__global__
void
kQuantizeBlockwise
<
half
,
4096
,
4
,
1
>(
float
*
code
,
half
*
__restrict__
const
A
,
float
*
absmax
,
unsigned
char
*
out
,
float
*
__restrict__
const
rand
,
const
int
rand_offset
,
const
int
n
);
template
__global__
void
kQuantizeBlockwise
<
float
,
4096
,
4
,
1
>(
float
*
code
,
float
*
__restrict__
const
A
,
float
*
absmax
,
unsigned
char
*
out
,
float
*
__restrict__
const
rand
,
const
int
rand_offset
,
const
int
n
);
template
__global__
void
kDequantizeBlockwise
<
half
,
4096
,
1024
,
4
>(
float
*
code
,
unsigned
char
*
__restrict__
const
A
,
float
*
__restrict__
const
absmax
,
half
*
out
,
const
int
n
);
template
__global__
void
kDequantizeBlockwise
<
float
,
4096
,
1024
,
4
>(
float
*
code
,
unsigned
char
*
__restrict__
const
A
,
float
*
__restrict__
const
absmax
,
float
*
out
,
const
int
n
);
template
__global__
void
kDequantizeBlockwise
<
half
,
2048
,
512
,
4
>(
float
*
code
,
unsigned
char
*
__restrict__
const
A
,
float
*
__restrict__
const
absmax
,
half
*
out
,
const
int
n
);
template
__global__
void
kDequantizeBlockwise
<
float
,
2048
,
512
,
4
>(
float
*
code
,
unsigned
char
*
__restrict__
const
A
,
float
*
__restrict__
const
absmax
,
float
*
out
,
const
int
n
);
template
__global__
void
kQuantizeBlockwise
<
half
,
2048
,
4
,
0
>(
float
*
code
,
half
*
__restrict__
const
A
,
float
*
absmax
,
unsigned
char
*
out
,
float
*
__restrict__
const
rand
,
const
int
rand_offset
,
const
int
n
);
template
__global__
void
kQuantizeBlockwise
<
float
,
2048
,
4
,
0
>(
float
*
code
,
float
*
__restrict__
const
A
,
float
*
absmax
,
unsigned
char
*
out
,
float
*
__restrict__
const
rand
,
const
int
rand_offset
,
const
int
n
);
template
__global__
void
kQuantizeBlockwise
<
half
,
1024
,
4
,
0
>(
float
*
code
,
half
*
__restrict__
const
A
,
float
*
absmax
,
unsigned
char
*
out
,
float
*
__restrict__
const
rand
,
const
int
rand_offset
,
const
int
n
);
template
__global__
void
kQuantizeBlockwise
<
float
,
1024
,
4
,
0
>(
float
*
code
,
float
*
__restrict__
const
A
,
float
*
absmax
,
unsigned
char
*
out
,
float
*
__restrict__
const
rand
,
const
int
rand_offset
,
const
int
n
);
template
__global__
void
kQuantizeBlockwise
<
half
,
512
,
2
,
0
>(
float
*
code
,
half
*
__restrict__
const
A
,
float
*
absmax
,
unsigned
char
*
out
,
float
*
__restrict__
const
rand
,
const
int
rand_offset
,
const
int
n
);
template
__global__
void
kQuantizeBlockwise
<
float
,
512
,
2
,
0
>(
float
*
code
,
float
*
__restrict__
const
A
,
float
*
absmax
,
unsigned
char
*
out
,
float
*
__restrict__
const
rand
,
const
int
rand_offset
,
const
int
n
);
template
__global__
void
kQuantizeBlockwise
<
half
,
256
,
2
,
0
>(
float
*
code
,
half
*
__restrict__
const
A
,
float
*
absmax
,
unsigned
char
*
out
,
float
*
__restrict__
const
rand
,
const
int
rand_offset
,
const
int
n
);
template
__global__
void
kQuantizeBlockwise
<
float
,
256
,
2
,
0
>(
float
*
code
,
float
*
__restrict__
const
A
,
float
*
absmax
,
unsigned
char
*
out
,
float
*
__restrict__
const
rand
,
const
int
rand_offset
,
const
int
n
);
template
__global__
void
kQuantizeBlockwise
<
half
,
128
,
2
,
0
>(
float
*
code
,
half
*
__restrict__
const
A
,
float
*
absmax
,
unsigned
char
*
out
,
float
*
__restrict__
const
rand
,
const
int
rand_offset
,
const
int
n
);
template
__global__
void
kQuantizeBlockwise
<
float
,
128
,
2
,
0
>(
float
*
code
,
float
*
__restrict__
const
A
,
float
*
absmax
,
unsigned
char
*
out
,
float
*
__restrict__
const
rand
,
const
int
rand_offset
,
const
int
n
);
template
__global__
void
kQuantizeBlockwise
<
half
,
64
,
1
,
0
>(
float
*
code
,
half
*
__restrict__
const
A
,
float
*
absmax
,
unsigned
char
*
out
,
float
*
__restrict__
const
rand
,
const
int
rand_offset
,
const
int
n
);
template
__global__
void
kQuantizeBlockwise
<
float
,
64
,
1
,
0
>(
float
*
code
,
float
*
__restrict__
const
A
,
float
*
absmax
,
unsigned
char
*
out
,
float
*
__restrict__
const
rand
,
const
int
rand_offset
,
const
int
n
);
template
__global__
void
kDequantizeBlockwise
<
half
,
4096
,
1024
,
4
>(
float
*
code
,
unsigned
char
*
A
,
float
*
absmax
,
half
*
out
,
const
int
n
);
template
__global__
void
kDequantizeBlockwise
<
float
,
4096
,
1024
,
4
>(
float
*
code
,
unsigned
char
*
A
,
float
*
absmax
,
float
*
out
,
const
int
n
);
template
__global__
void
kDequantizeBlockwise
<
half
,
2048
,
512
,
4
>(
float
*
code
,
unsigned
char
*
A
,
float
*
absmax
,
half
*
out
,
const
int
n
);
template
__global__
void
kDequantizeBlockwise
<
float
,
2048
,
512
,
4
>(
float
*
code
,
unsigned
char
*
A
,
float
*
absmax
,
float
*
out
,
const
int
n
);
template
__global__
void
kDequantizeBlockwise
<
half
,
1024
,
256
,
4
>(
float
*
code
,
unsigned
char
*
A
,
float
*
absmax
,
half
*
out
,
const
int
n
);
template
__global__
void
kDequantizeBlockwise
<
float
,
1024
,
256
,
4
>(
float
*
code
,
unsigned
char
*
A
,
float
*
absmax
,
float
*
out
,
const
int
n
);
template
__global__
void
kDequantizeBlockwise
<
half
,
512
,
256
,
2
>(
float
*
code
,
unsigned
char
*
A
,
float
*
absmax
,
half
*
out
,
const
int
n
);
template
__global__
void
kDequantizeBlockwise
<
float
,
512
,
256
,
2
>(
float
*
code
,
unsigned
char
*
A
,
float
*
absmax
,
float
*
out
,
const
int
n
);
template
__global__
void
kDequantizeBlockwise
<
half
,
256
,
128
,
2
>(
float
*
code
,
unsigned
char
*
A
,
float
*
absmax
,
half
*
out
,
const
int
n
);
template
__global__
void
kDequantizeBlockwise
<
float
,
256
,
128
,
2
>(
float
*
code
,
unsigned
char
*
A
,
float
*
absmax
,
float
*
out
,
const
int
n
);
template
__global__
void
kDequantizeBlockwise
<
half
,
128
,
64
,
2
>(
float
*
code
,
unsigned
char
*
A
,
float
*
absmax
,
half
*
out
,
const
int
n
);
template
__global__
void
kDequantizeBlockwise
<
float
,
128
,
64
,
2
>(
float
*
code
,
unsigned
char
*
A
,
float
*
absmax
,
float
*
out
,
const
int
n
);
template
__global__
void
kDequantizeBlockwise
<
half
,
64
,
64
,
1
>(
float
*
code
,
unsigned
char
*
A
,
float
*
absmax
,
half
*
out
,
const
int
n
);
template
__global__
void
kDequantizeBlockwise
<
float
,
64
,
64
,
1
>(
float
*
code
,
unsigned
char
*
A
,
float
*
absmax
,
float
*
out
,
const
int
n
);
...
...
csrc/kernels.cuh
View file @
be5cecb8
...
...
@@ -15,7 +15,7 @@ __global__ void kQuantize(float * code, float * __restrict__ const A, unsigned c
__global__
void
kDequantize
(
float
*
code
,
unsigned
char
*
A
,
float
*
out
,
const
int
n
);
template
<
typename
T
,
int
BLOCK_SIZE
,
int
NUM_PER_TH
,
int
STOCHASTIC
>
__global__
void
kQuantizeBlockwise
(
float
*
code
,
T
*
__restrict__
const
A
,
float
*
absmax
,
unsigned
char
*
out
,
float
*
__restrict__
const
rand
,
const
int
rand_offset
,
const
int
n
);
template
<
typename
T
,
int
BLOCK_SIZE
,
int
THREADS
,
int
NUM_PER_TH
>
__global__
void
kDequantizeBlockwise
(
float
*
code
,
unsigned
char
*
__restrict__
const
A
,
float
*
__restrict__
const
absmax
,
T
*
out
,
const
int
n
);
template
<
typename
T
,
int
BLOCK_SIZE
,
int
THREADS
,
int
NUM_PER_TH
>
__global__
void
kDequantizeBlockwise
(
float
*
code
,
unsigned
char
*
A
,
float
*
absmax
,
T
*
out
,
const
int
n
);
template
<
typename
T
,
int
OPTIMIZER
,
int
BLOCK_SIZE
,
int
NUM_VALS
>
__global__
void
kPreconditionOptimizer32bit2State
(
T
*
g
,
T
*
p
,
...
...
@@ -121,5 +121,3 @@ template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int T
template
<
int
FORMAT
>
__global__
void
kExtractOutliers
(
char
*
A
,
int
*
idx
,
char
*
out
,
int
idx_size
,
int
rowsA
,
int
colsA
,
int
tiledRowsA
,
int
tiledColsA
);
#endif
csrc/ops.cu
View file @
be5cecb8
...
...
@@ -50,11 +50,29 @@ void dequantize(float *code, unsigned char *A, float *out, int n)
CUDA_CHECK_RETURN
(
cudaPeekAtLastError
());
}
template
<
typename
T
,
int
STOCHASTIC
>
void
quantizeBlockwise
(
float
*
code
,
T
*
A
,
float
*
absmax
,
unsigned
char
*
out
,
float
*
rand
,
int
rand_offset
,
const
int
n
)
template
<
typename
T
,
int
STOCHASTIC
>
void
quantizeBlockwise
(
float
*
code
,
T
*
A
,
float
*
absmax
,
unsigned
char
*
out
,
float
*
rand
,
int
rand_offset
,
int
blocksize
,
const
int
n
)
{
int
num_blocks
=
n
/
4096
;
num_blocks
=
n
%
4096
==
0
?
num_blocks
:
num_blocks
+
1
;
int
num_blocks
=
n
/
blocksize
;
num_blocks
=
n
%
blocksize
==
0
?
num_blocks
:
num_blocks
+
1
;
if
(
STOCHASTIC
==
1
)
assert
(
blocksize
==
4096
);
if
(
blocksize
==
4096
)
kQuantizeBlockwise
<
T
,
4096
,
4
,
STOCHASTIC
><<<
num_blocks
,
1024
>>>
(
code
,
A
,
absmax
,
out
,
rand
,
rand_offset
,
n
);
else
if
(
blocksize
==
2048
)
kQuantizeBlockwise
<
T
,
2048
,
4
,
0
><<<
num_blocks
,
512
>>>
(
code
,
A
,
absmax
,
out
,
rand
,
rand_offset
,
n
);
else
if
(
blocksize
==
1024
)
kQuantizeBlockwise
<
T
,
1024
,
4
,
0
><<<
num_blocks
,
256
>>>
(
code
,
A
,
absmax
,
out
,
rand
,
rand_offset
,
n
);
else
if
(
blocksize
==
512
)
kQuantizeBlockwise
<
T
,
512
,
2
,
0
><<<
num_blocks
,
256
>>>
(
code
,
A
,
absmax
,
out
,
rand
,
rand_offset
,
n
);
else
if
(
blocksize
==
256
)
kQuantizeBlockwise
<
T
,
256
,
2
,
0
><<<
num_blocks
,
128
>>>
(
code
,
A
,
absmax
,
out
,
rand
,
rand_offset
,
n
);
else
if
(
blocksize
==
128
)
kQuantizeBlockwise
<
T
,
128
,
2
,
0
><<<
num_blocks
,
64
>>>
(
code
,
A
,
absmax
,
out
,
rand
,
rand_offset
,
n
);
else
if
(
blocksize
==
64
)
kQuantizeBlockwise
<
T
,
64
,
1
,
0
><<<
num_blocks
,
64
>>>
(
code
,
A
,
absmax
,
out
,
rand
,
rand_offset
,
n
);
CUDA_CHECK_RETURN
(
cudaPeekAtLastError
());
}
...
...
@@ -66,6 +84,17 @@ template<typename T> void dequantizeBlockwise(float *code, unsigned char *A, flo
kDequantizeBlockwise
<
T
,
4096
,
1024
,
4
><<<
num_blocks
,
4096
/
4
>>>
(
code
,
A
,
absmax
,
out
,
n
);
else
if
(
blocksize
==
2048
)
kDequantizeBlockwise
<
T
,
2048
,
512
,
4
><<<
num_blocks
,
2048
/
4
>>>
(
code
,
A
,
absmax
,
out
,
n
);
else
if
(
blocksize
==
1024
)
kDequantizeBlockwise
<
T
,
1024
,
256
,
4
><<<
num_blocks
,
1024
/
4
>>>
(
code
,
A
,
absmax
,
out
,
n
);
else
if
(
blocksize
==
512
)
kDequantizeBlockwise
<
T
,
512
,
256
,
2
><<<
num_blocks
,
512
/
2
>>>
(
code
,
A
,
absmax
,
out
,
n
);
else
if
(
blocksize
==
256
)
kDequantizeBlockwise
<
T
,
256
,
128
,
2
><<<
num_blocks
,
256
/
2
>>>
(
code
,
A
,
absmax
,
out
,
n
);
else
if
(
blocksize
==
128
)
kDequantizeBlockwise
<
T
,
128
,
64
,
2
><<<
num_blocks
,
128
/
2
>>>
(
code
,
A
,
absmax
,
out
,
n
);
else
if
(
blocksize
==
64
)
kDequantizeBlockwise
<
T
,
64
,
64
,
1
><<<
num_blocks
,
64
/
1
>>>
(
code
,
A
,
absmax
,
out
,
n
);
CUDA_CHECK_RETURN
(
cudaPeekAtLastError
());
}
...
...
@@ -659,10 +688,10 @@ template void transformRowToFormat<COL_AMPERE, 1>(char * A, char *out, int rows,
template
void
estimateQuantiles
(
half
*
A
,
float
*
code
,
float
offset
,
int
n
);
template
void
estimateQuantiles
(
float
*
A
,
float
*
code
,
float
offset
,
int
n
);
template
void
quantizeBlockwise
<
half
,
0
>(
float
*
code
,
half
*
A
,
float
*
absmax
,
unsigned
char
*
out
,
float
*
rand
,
int
rand_offset
,
const
int
n
);
template
void
quantizeBlockwise
<
float
,
0
>(
float
*
code
,
float
*
A
,
float
*
absmax
,
unsigned
char
*
out
,
float
*
rand
,
int
rand_offset
,
const
int
n
);
template
void
quantizeBlockwise
<
half
,
1
>(
float
*
code
,
half
*
A
,
float
*
absmax
,
unsigned
char
*
out
,
float
*
rand
,
int
rand_offset
,
const
int
n
);
template
void
quantizeBlockwise
<
float
,
1
>(
float
*
code
,
float
*
A
,
float
*
absmax
,
unsigned
char
*
out
,
float
*
rand
,
int
rand_offset
,
const
int
n
);
template
void
quantizeBlockwise
<
half
,
0
>(
float
*
code
,
half
*
A
,
float
*
absmax
,
unsigned
char
*
out
,
float
*
rand
,
int
rand_offset
,
int
blocksize
,
const
int
n
);
template
void
quantizeBlockwise
<
float
,
0
>(
float
*
code
,
float
*
A
,
float
*
absmax
,
unsigned
char
*
out
,
float
*
rand
,
int
rand_offset
,
int
blocksize
,
const
int
n
);
template
void
quantizeBlockwise
<
half
,
1
>(
float
*
code
,
half
*
A
,
float
*
absmax
,
unsigned
char
*
out
,
float
*
rand
,
int
rand_offset
,
int
blocksize
,
const
int
n
);
template
void
quantizeBlockwise
<
float
,
1
>(
float
*
code
,
float
*
A
,
float
*
absmax
,
unsigned
char
*
out
,
float
*
rand
,
int
rand_offset
,
int
blocksize
,
const
int
n
);
template
void
dequantizeBlockwise
<
half
>(
float
*
code
,
unsigned
char
*
A
,
float
*
absmax
,
half
*
out
,
int
blocksize
,
const
int
n
);
template
void
dequantizeBlockwise
<
float
>(
float
*
code
,
unsigned
char
*
A
,
float
*
absmax
,
float
*
out
,
int
blocksize
,
const
int
n
);
...
...
csrc/ops.cuh
View file @
be5cecb8
...
...
@@ -128,7 +128,7 @@ template <typename T> void estimateQuantiles(T *A, float *code, float offset, in
void
quantize
(
float
*
code
,
float
*
A
,
unsigned
char
*
out
,
int
n
);
void
dequantize
(
float
*
code
,
unsigned
char
*
A
,
float
*
out
,
int
n
);
template
<
typename
T
,
int
STOCHASTIC
>
void
quantizeBlockwise
(
float
*
code
,
T
*
A
,
float
*
absmax
,
unsigned
char
*
out
,
float
*
rand
,
int
rand_offset
,
const
int
n
);
template
<
typename
T
,
int
STOCHASTIC
>
void
quantizeBlockwise
(
float
*
code
,
T
*
A
,
float
*
absmax
,
unsigned
char
*
out
,
float
*
rand
,
int
rand_offset
,
int
blocksize
,
const
int
n
);
template
<
typename
T
>
void
dequantizeBlockwise
(
float
*
code
,
unsigned
char
*
A
,
float
*
absmax
,
T
*
out
,
int
block_size
,
const
int
n
);
template
<
typename
T
,
int
OPTIMIZER
>
void
optimizer32bit
(
T
*
g
,
T
*
p
,
...
...
csrc/pythonInterface.c
View file @
be5cecb8
...
...
@@ -75,10 +75,10 @@ MAKE_BLOCKWISE8(adagrad, ADAGRAD, float, 32)
void
percentileClipping_g32
(
float
*
g
,
float
*
gnorm_vec
,
int
step
,
const
int
n
){
percentileClipping
<
float
>
(
g
,
gnorm_vec
,
step
,
n
);
}
void
percentileClipping_g16
(
half
*
g
,
float
*
gnorm_vec
,
int
step
,
const
int
n
){
percentileClipping
<
half
>
(
g
,
gnorm_vec
,
step
,
n
);
}
void
quantizeBlockwise_fp16
(
float
*
code
,
half
*
A
,
float
*
absmax
,
unsigned
char
*
out
,
const
int
n
){
quantizeBlockwise
<
half
,
0
>
(
code
,
A
,
absmax
,
out
,
NULL
,
0
,
n
);
}
void
quantizeBlockwise_fp32
(
float
*
code
,
float
*
A
,
float
*
absmax
,
unsigned
char
*
out
,
const
int
n
){
quantizeBlockwise
<
float
,
0
>
(
code
,
A
,
absmax
,
out
,
NULL
,
0
,
n
);
}
void
quantizeBlockwise_stochastic_fp16
(
float
*
code
,
half
*
A
,
float
*
absmax
,
unsigned
char
*
out
,
float
*
rand
,
int
rand_offset
,
const
int
n
){
quantizeBlockwise
<
half
,
1
>
(
code
,
A
,
absmax
,
out
,
rand
,
rand_offset
,
n
);
}
void
quantizeBlockwise_stochastic_fp32
(
float
*
code
,
float
*
A
,
float
*
absmax
,
unsigned
char
*
out
,
float
*
rand
,
int
rand_offset
,
const
int
n
){
quantizeBlockwise
<
float
,
1
>
(
code
,
A
,
absmax
,
out
,
rand
,
rand_offset
,
n
);
}
void
quantizeBlockwise_fp16
(
float
*
code
,
half
*
A
,
float
*
absmax
,
unsigned
char
*
out
,
int
blocksize
,
const
int
n
){
quantizeBlockwise
<
half
,
0
>
(
code
,
A
,
absmax
,
out
,
NULL
,
0
,
blocksize
,
n
);
}
void
quantizeBlockwise_fp32
(
float
*
code
,
float
*
A
,
float
*
absmax
,
unsigned
char
*
out
,
int
blocksize
,
const
int
n
){
quantizeBlockwise
<
float
,
0
>
(
code
,
A
,
absmax
,
out
,
NULL
,
0
,
blocksize
,
n
);
}
void
quantizeBlockwise_stochastic_fp16
(
float
*
code
,
half
*
A
,
float
*
absmax
,
unsigned
char
*
out
,
float
*
rand
,
int
rand_offset
,
const
int
n
){
quantizeBlockwise
<
half
,
1
>
(
code
,
A
,
absmax
,
out
,
rand
,
rand_offset
,
4096
,
n
);
}
void
quantizeBlockwise_stochastic_fp32
(
float
*
code
,
float
*
A
,
float
*
absmax
,
unsigned
char
*
out
,
float
*
rand
,
int
rand_offset
,
const
int
n
){
quantizeBlockwise
<
float
,
1
>
(
code
,
A
,
absmax
,
out
,
rand
,
rand_offset
,
4096
,
n
);
}
void
dequantizeBlockwise_fp16
(
float
*
code
,
unsigned
char
*
A
,
float
*
absmax
,
half
*
out
,
int
blocksize
,
const
int
n
){
dequantizeBlockwise
<
half
>
(
code
,
A
,
absmax
,
out
,
blocksize
,
n
);
}
\
void
dequantizeBlockwise_fp32
(
float
*
code
,
unsigned
char
*
A
,
float
*
absmax
,
float
*
out
,
int
blocksize
,
const
int
n
){
dequantizeBlockwise
<
float
>
(
code
,
A
,
absmax
,
out
,
blocksize
,
n
);
}
...
...
@@ -140,8 +140,8 @@ extern "C"
void
cestimate_quantiles_fp16
(
half
*
A
,
float
*
code
,
float
offset
,
int
n
){
estimateQuantiles_fp16
(
A
,
code
,
offset
,
n
);
}
void
cquantize
(
float
*
code
,
float
*
A
,
unsigned
char
*
out
,
int
n
){
quantize
(
code
,
A
,
out
,
n
);
}
void
cdequantize
(
float
*
code
,
unsigned
char
*
A
,
float
*
out
,
int
n
){
dequantize
(
code
,
A
,
out
,
n
);
}
void
cquantize_blockwise_fp16
(
float
*
code
,
half
*
A
,
float
*
absmax
,
unsigned
char
*
out
,
const
int
n
){
quantizeBlockwise_fp16
(
code
,
A
,
absmax
,
out
,
n
);
}
void
cquantize_blockwise_fp32
(
float
*
code
,
float
*
A
,
float
*
absmax
,
unsigned
char
*
out
,
const
int
n
){
quantizeBlockwise_fp32
(
code
,
A
,
absmax
,
out
,
n
);
}
void
cquantize_blockwise_fp16
(
float
*
code
,
half
*
A
,
float
*
absmax
,
unsigned
char
*
out
,
int
blocksize
,
const
int
n
){
quantizeBlockwise_fp16
(
code
,
A
,
absmax
,
out
,
blocksize
,
n
);
}
void
cquantize_blockwise_fp32
(
float
*
code
,
float
*
A
,
float
*
absmax
,
unsigned
char
*
out
,
int
blocksize
,
const
int
n
){
quantizeBlockwise_fp32
(
code
,
A
,
absmax
,
out
,
blocksize
,
n
);
}
void
cquantize_blockwise_stochastic_fp16
(
float
*
code
,
half
*
A
,
float
*
absmax
,
unsigned
char
*
out
,
float
*
rand
,
int
rand_offset
,
const
int
n
){
quantizeBlockwise_stochastic_fp16
(
code
,
A
,
absmax
,
out
,
rand
,
rand_offset
,
n
);
}
void
cquantize_blockwise_stochastic_fp32
(
float
*
code
,
float
*
A
,
float
*
absmax
,
unsigned
char
*
out
,
float
*
rand
,
int
rand_offset
,
const
int
n
){
quantizeBlockwise_stochastic_fp32
(
code
,
A
,
absmax
,
out
,
rand
,
rand_offset
,
n
);
}
...
...
@@ -290,4 +290,3 @@ extern "C"
void
cquantize_blockwise_cpu_fp32
(
float
*
code
,
float
*
A
,
float
*
absmax
,
unsigned
char
*
out
,
long
long
blocksize
,
long
long
n
){
quantize_cpu
(
code
,
A
,
absmax
,
out
,
blocksize
,
n
);
}
void
cdequantize_blockwise_cpu_fp32
(
float
*
code
,
unsigned
char
*
A
,
float
*
absmax
,
float
*
out
,
long
long
blocksize
,
long
long
n
){
dequantize_cpu
(
code
,
A
,
absmax
,
out
,
blocksize
,
n
);
}
}
cuda_install.sh
View file @
be5cecb8
...
...
@@ -76,6 +76,3 @@ if [[ -n "$CUDA_VERSION" ]]; then
else
echo
""
fi
howto_config_override.md
View file @
be5cecb8
include/Algo-Direct-Common.h
View file @
be5cecb8
include/SIMD.h
View file @
be5cecb8
setup.py
View file @
be5cecb8
...
...
@@ -18,7 +18,7 @@ def read(fname):
setup
(
name
=
f
"bitsandbytes"
,
version
=
f
"0.35.
3
"
,
version
=
f
"0.35.
4
"
,
author
=
"Tim Dettmers"
,
author_email
=
"dettmers@cs.washington.edu"
,
description
=
"8-bit optimizers and matrix multiplication routines."
,
...
...
@@ -26,9 +26,6 @@ setup(
keywords
=
"gpu optimizers optimization 8-bit quantization compression"
,
url
=
"https://github.com/TimDettmers/bitsandbytes"
,
packages
=
find_packages
(),
entry_points
=
{
"console_scripts"
:
[
"debug_cuda = bitsandbytes.debug_cli:cli"
],
},
package_data
=
{
""
:
libs
},
long_description
=
read
(
"README.md"
),
long_description_content_type
=
"text/markdown"
,
...
...
tests/test_autograd.py
View file @
be5cecb8
from
itertools
import
product
,
permutations
from
itertools
import
permutations
,
product
import
pytest
import
torch
...
...
@@ -27,7 +27,7 @@ str_values = list(
)
)
names
=
[
"dim1_{
0
}_dim2_{
1
}_dim3_{
2
}_dim4_{
3
}_func_{
4
}_dtype_{
5
}_requires_grad_{
6
}_transpose_{
7
}"
.
format
(
"dim1_{}_dim2_{}_dim3_{}_dim4_{}_func_{}_dtype_{}_requires_grad_{}_transpose_{}"
.
format
(
*
vals
)
for
vals
in
str_values
...
...
@@ -286,7 +286,7 @@ str_values = list(
has_bias
)
)
names
=
[
"dim1_{
0
}_dim2_{
1
}_dim3_{
2
}_dim4_{
3
}_func_{
4
}_dtype_{
5
}_requires_grad_{
6
}_transpose_{
7
}_decomp_{
8
}_has_fp16_weights_{
9
}_has_bias_{
10
}"
.
format
(
*
vals
)
for
vals
in
str_values
]
names
=
[
"dim1_{}_dim2_{}_dim3_{}_dim4_{}_func_{}_dtype_{}_requires_grad_{}_transpose_{}_decomp_{}_has_fp16_weights_{}_has_bias_{}"
.
format
(
*
vals
)
for
vals
in
str_values
]
@
pytest
.
mark
.
parametrize
(
...
...
tests/test_cuda_setup_evaluator.py
View file @
be5cecb8
import
os
import
pytest
import
bitsandbytes
as
bnb
from
typing
import
List
,
NamedTuple
import
pytest
import
bitsandbytes
as
bnb
from
bitsandbytes.cuda_setup
import
(
CUDA_RUNTIME_LIB
,
evaluate_cuda_setup
,
determine_cuda_runtime_lib_path
,
evaluate_cuda_setup
,
extract_candidate_paths
,
)
...
...
tests/test_functional.py
View file @
be5cecb8
...
...
@@ -6,12 +6,14 @@ from itertools import product
import
einops
import
pytest
import
torch
import
numpy
as
np
import
bitsandbytes
as
bnb
from
bitsandbytes
import
functional
as
F
from
scipy.stats
import
norm
torch
.
set_printoptions
(
precision
=
4
,
sci_mode
=
False
,
linewidth
=
120
,
edgeitems
=
20
,
threshold
=
10000
precision
=
5
,
sci_mode
=
False
,
linewidth
=
120
,
edgeitems
=
20
,
threshold
=
10000
)
k
=
20
...
...
@@ -26,7 +28,7 @@ def assert_all_approx_close(a, b, rtol=1e-3, atol=1e-3, count=0):
class
FFN
(
torch
.
nn
.
Module
):
def
__init__
(
self
,
input_features
,
hidden_size
,
bias
=
True
):
super
(
FFN
,
self
).
__init__
()
super
().
__init__
()
self
.
fc1
=
torch
.
nn
.
Linear
(
input_features
,
hidden_size
,
bias
=
bias
)
self
.
fc2
=
torch
.
nn
.
Linear
(
hidden_size
,
input_features
,
bias
=
bias
)
...
...
@@ -40,7 +42,7 @@ class FFN(torch.nn.Module):
return
x
class
Timer
(
object
)
:
class
Timer
:
def
__init__
(
self
):
self
.
starts
=
{}
self
.
ends
=
{}
...
...
@@ -67,7 +69,7 @@ class Timer(object):
self
.
ends
.
pop
(
name
)
if
print_ms
and
name
in
self
.
agg
:
print
(
"{
0
} took: {
1:.5f}s"
.
format
(
name
,
self
.
agg
[
name
]
/
1000.0
)
)
print
(
f
"
{
name
}
took:
{
self
.
agg
[
name
]
/
1000.0
:.
5
f
}
s"
)
return
self
.
agg
[
name
]
...
...
@@ -149,30 +151,41 @@ def test_dynamic_quantization():
def
test_dynamic_blockwise_quantization
():
#print('')
for
blocksize
in
[
4096
,
2048
,
1024
,
512
]:
diffs
=
[]
reldiffs
=
[]
for
i
in
range
(
100
):
A1
=
torch
.
randn
(
1024
,
1024
,
device
=
"cuda"
)
C
,
S
=
F
.
quantize_blockwise
(
A1
)
A2
=
F
.
dequantize_blockwise
(
C
,
S
)
C
,
S
=
F
.
quantize_blockwise
(
A1
,
blocksize
=
blocksize
)
A2
=
F
.
dequantize_blockwise
(
C
,
S
,
blocksize
=
blocksize
)
diff
=
torch
.
abs
(
A1
-
A2
)
reldiff
=
diff
/
torch
.
abs
(
A1
+
1e-8
)
diffs
.
append
(
diff
.
mean
().
item
())
reldiffs
.
append
(
reldiff
.
mean
().
item
())
assert
diffs
[
-
1
]
<
0.011
# print(sum(diffs)/len(diffs))
# print(sum(reldiffs)/len(reldiffs))
abserr
=
sum
(
diffs
)
/
len
(
diffs
)
relerr
=
sum
(
reldiffs
)
/
len
(
reldiffs
)
assert
abserr
<
0.011
assert
relerr
<
0.018
#print('randn', blocksize, sum(diffs)/len(diffs))
#print('randn', blocksize, sum(reldiffs)/len(reldiffs))
diffs
=
[]
for
i
in
range
(
100
):
A1
=
torch
.
rand
(
1024
,
1024
,
device
=
"cuda"
)
C
,
S
=
F
.
quantize_blockwise
(
A1
)
A2
=
F
.
dequantize_blockwise
(
C
,
S
)
diff
=
torch
.
abs
(
A1
-
A2
).
mean
().
item
()
assert
diff
<
0.0033
diffs
.
append
(
diff
)
torch
.
testing
.
assert_allclose
(
A1
,
A2
,
atol
=
1e-2
,
rtol
=
0
)
# print(sum(diffs)/len(diffs))
C
,
S
=
F
.
quantize_blockwise
(
A1
,
blocksize
=
blocksize
)
A2
=
F
.
dequantize_blockwise
(
C
,
S
,
blocksize
=
blocksize
)
diff
=
torch
.
abs
(
A1
-
A2
)
reldiff
=
diff
/
torch
.
abs
(
A1
+
1e-8
)
diffs
.
append
(
diff
.
mean
().
item
())
reldiffs
.
append
(
reldiff
.
mean
().
item
())
#torch.testing.assert_allclose(A1, A2, atol=1e-2, rtol=0)
abserr
=
sum
(
diffs
)
/
len
(
diffs
)
relerr
=
sum
(
reldiffs
)
/
len
(
reldiffs
)
assert
abserr
<
0.0035
assert
relerr
<
0.015
#print('rand', blocksize, sum(diffs)/len(diffs))
#print('rand', blocksize, sum(reldiffs)/len(reldiffs))
def
test_dynamic_blockwise_stochastic_quantization
():
...
...
@@ -289,7 +302,7 @@ batched = [False, True]
values
=
list
(
product
(
dim1
,
dim2
,
methods
,
batched
))
values_names
=
list
(
product
(
dim1
,
dim2
,
method_names
,
batched
))
names
=
[
"dim1_{
0
}_dim2_{
1
}_quant_{
2
}_batched_{
3
}"
.
format
(
*
vals
)
"dim1_{}_dim2_{}_quant_{}_batched_{}"
.
format
(
*
vals
)
for
vals
in
values_names
]
...
...
@@ -347,7 +360,7 @@ seq_dim = torch.randint(16, 256, size=(n,)).tolist()
transpose
=
[(
False
,
False
),
(
False
,
True
),
(
True
,
False
),
(
True
,
True
)]
values
=
list
(
product
(
hidden_dim
,
batch_dim
,
transpose
,
seq_dim
))
names
=
[
"hidden_dim_{
0
}_batch_dim_{
1
},transpose_{
2
}_seq_dim_{
3
}"
.
format
(
*
vals
)
"hidden_dim_{}_batch_dim_{},transpose_{}_seq_dim_{}"
.
format
(
*
vals
)
for
vals
in
values
]
...
...
@@ -412,7 +425,7 @@ hidden_dim = torch.randint(32, 1024 * 4, size=(n,)).tolist()
batch_dim
=
torch
.
randint
(
2
,
16
,
size
=
(
n
,)).
tolist
()
values
=
list
(
product
(
seq_dim
,
hidden_dim
,
batch_dim
))
names
=
[
"seq_dim{
0
}_hidden_dim{
1
}_batch_dim{
2
}"
.
format
(
*
vals
)
for
vals
in
values
"seq_dim{}_hidden_dim{}_batch_dim{}"
.
format
(
*
vals
)
for
vals
in
values
]
...
...
@@ -444,7 +457,7 @@ batch_dim = torch.randint(2, 16, size=(n,)).tolist()
transpose
=
[
False
,
True
]
values
=
list
(
product
(
seq_dim
,
hidden_dim
,
batch_dim
,
transpose
))
names
=
[
"seq_dim={
0
}_hidden_dim={
1
}_batch_dim={
2
}_transpose{
3
}"
.
format
(
*
vals
)
"seq_dim={}_hidden_dim={}_batch_dim={}_transpose{}"
.
format
(
*
vals
)
for
vals
in
values
]
...
...
@@ -529,7 +542,7 @@ dim4 = torch.randint(32, 256, size=(n,)).tolist()
transpose
=
[(
False
,
False
),
(
True
,
False
),
(
False
,
True
),
(
True
,
True
)]
values
=
list
(
product
(
dim1
,
dim2
,
dim3
,
dim4
,
transpose
))
names
=
[
"dim1_{
0
}_dim2_{
1
}_dim3_{
2
}_dim4_{
3
}_transpose_{
4
}"
.
format
(
*
vals
)
"dim1_{}_dim2_{}_dim3_{}_dim4_{}_transpose_{}"
.
format
(
*
vals
)
for
vals
in
values
]
...
...
@@ -567,7 +580,7 @@ dim1 = torch.randint(1, 64, size=(n,)).tolist()
dim2
=
torch
.
randint
(
32
,
128
,
size
=
(
n
,)).
tolist
()
dim3
=
torch
.
randint
(
32
,
256
,
size
=
(
n
,)).
tolist
()
values
=
list
(
product
(
dim1
,
dim2
,
dim3
))
names
=
[
"dim1_{
0
}_dim2_{
1
}_dim3_{
2
}"
.
format
(
*
vals
)
for
vals
in
values
]
names
=
[
"dim1_{}_dim2_{}_dim3_{}"
.
format
(
*
vals
)
for
vals
in
values
]
@
pytest
.
mark
.
parametrize
(
"dim1, dim2, dim3"
,
values
,
ids
=
names
)
...
...
@@ -596,7 +609,7 @@ transpose = [False]
dims
=
[
2
,
3
]
values
=
list
(
product
(
dim1
,
dim2
,
dim3
,
dims
,
dtype
,
a_order
,
out_order
,
transpose
))
names
=
[
"dim1_{
0
}_dim2_{
1
}_dim3_{
2
}_dims_{
3
}_dtype_{
4
}_orderA_{
5
}_orderOut_{
6
}_transpose_{
7
}"
.
format
(
*
vals
)
for
vals
in
values
]
names
=
[
"dim1_{}_dim2_{}_dim3_{}_dims_{}_dtype_{}_orderA_{}_orderOut_{}_transpose_{}"
.
format
(
*
vals
)
for
vals
in
values
]
@
pytest
.
mark
.
parametrize
(
"dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose"
,
values
,
ids
=
names
)
...
...
@@ -678,7 +691,7 @@ ldb = [0]
# ldb = list(range(256, 1*1024, 256))
values
=
list
(
product
(
dim1
,
dim2
,
dim3
,
dim4
,
dims
,
ldb
))
names
=
[
"dim1_{
0
}_dim2_{
1
}_dim3_{
2
}_dim4_{
3
}_dims_{
4
}_ldb_{
5
}"
.
format
(
*
vals
)
"dim1_{}_dim2_{}_dim3_{}_dim4_{}_dims_{}_ldb_{}"
.
format
(
*
vals
)
for
vals
in
values
]
...
...
@@ -726,7 +739,7 @@ dims = (2,)
# ldb = list(range(256, 1*1024, 256))
values
=
list
(
product
(
dim1
,
dim2
,
dim3
,
dim4
,
dims
))
names
=
[
"dim1_{
0
}_dim2_{
1
}_dim3_{
2
}_dim4_{
3
}_dims_{
4
}"
.
format
(
*
vals
)
"dim1_{}_dim2_{}_dim3_{}_dim4_{}_dims_{}"
.
format
(
*
vals
)
for
vals
in
values
]
...
...
@@ -784,7 +797,7 @@ values = [
# values = list(product(batch, seq, model, hidden))
names
=
[
"batch_{
0
}_seq_{
1
}_model_{
2
}_hidden_{
3
}"
.
format
(
*
vals
)
for
vals
in
values
"batch_{}_seq_{}_model_{}_hidden_{}"
.
format
(
*
vals
)
for
vals
in
values
]
...
...
@@ -952,7 +965,7 @@ dims = (2,)
formatB
=
[
"col_turing"
,
"col_ampere"
]
has_bias
=
[
True
,
False
]
values
=
list
(
product
(
dim1
,
dim4
,
dims
,
formatB
,
has_bias
))
names
=
[
"dim1_{
0
}_dim4_{
1
}_dims_{
2
}_formatB_{
3
}_has_bias_{
4
}"
.
format
(
*
vals
)
for
vals
in
values
]
names
=
[
"dim1_{}_dim4_{}_dims_{}_formatB_{}_has_bias_{}"
.
format
(
*
vals
)
for
vals
in
values
]
@
pytest
.
mark
.
parametrize
(
"dim1, dim4, dims, formatB, has_bias"
,
values
,
ids
=
names
)
...
...
@@ -1002,7 +1015,7 @@ dim2 = [1 * 1024]
dims
=
(
2
,)
# ldb = list(range(256, 1*1024, 256))
values
=
list
(
product
(
dim1
,
dim2
,
dims
))
names
=
[
"dim1_{
0
}_dim2_{
1
}_dims_{
2
}"
.
format
(
*
vals
)
for
vals
in
values
]
names
=
[
"dim1_{}_dim2_{}_dims_{}"
.
format
(
*
vals
)
for
vals
in
values
]
@
pytest
.
mark
.
parametrize
(
"dim1, dim2, dims"
,
values
,
ids
=
names
)
...
...
@@ -1058,7 +1071,7 @@ dim1 = torch.randint(1, 4 * 1024, size=(n,)).tolist()
dim2
=
torch
.
randint
(
1
,
4
*
1024
,
size
=
(
n
,)).
tolist
()
values
=
list
(
product
(
dim1
,
dim2
))
names
=
[
"dim1_{
0
}_dim2_{
1
}"
.
format
(
*
vals
)
for
vals
in
values
]
names
=
[
"dim1_{}_dim2_{}"
.
format
(
*
vals
)
for
vals
in
values
]
@
pytest
.
mark
.
parametrize
(
"dim1, dim2"
,
values
,
ids
=
names
)
...
...
@@ -1105,7 +1118,7 @@ dim4 = torch.randint(1, 4 * 1024, size=(n,)).tolist()
inner
=
torch
.
randint
(
1
,
4
*
1024
,
size
=
(
n
,)).
tolist
()
values
=
list
(
zip
(
dim1
,
dim4
,
inner
))
names
=
[
"dim1_{
0
}_dim4_{
1
}_inner_{
2
}"
.
format
(
*
vals
)
for
vals
in
values
]
names
=
[
"dim1_{}_dim4_{}_inner_{}"
.
format
(
*
vals
)
for
vals
in
values
]
@
pytest
.
mark
.
parametrize
(
"dim1, dim4, inner"
,
values
,
ids
=
names
)
...
...
@@ -1149,7 +1162,7 @@ dim4 = torch.randint(1, 4 * 1024, size=(n,)).tolist()
inner
=
torch
.
randint
(
1
,
4
*
1024
,
size
=
(
n
,)).
tolist
()
values
=
list
(
zip
(
dim1
,
dim4
,
inner
))
names
=
[
"dim1_{
0
}_dim4_{
1
}_inner_{
2
}"
.
format
(
*
vals
)
for
vals
in
values
]
names
=
[
"dim1_{}_dim4_{}_inner_{}"
.
format
(
*
vals
)
for
vals
in
values
]
@
pytest
.
mark
.
parametrize
(
"dim1, dim4, inner"
,
values
,
ids
=
names
)
...
...
@@ -1224,7 +1237,7 @@ inner = [12288 * 4, 4096 * 4]
dim4
=
[
12288
,
4096
]
values
=
list
(
zip
(
dim1
,
dim4
,
inner
))
names
=
[
"dim1_{
0
}_dim4_{
1
}_inner_{
2
}"
.
format
(
*
vals
)
for
vals
in
values
]
names
=
[
"dim1_{}_dim4_{}_inner_{}"
.
format
(
*
vals
)
for
vals
in
values
]
@
pytest
.
mark
.
parametrize
(
"dim1, dim4, inner"
,
values
,
ids
=
names
)
...
...
@@ -1290,7 +1303,7 @@ values = list(
product
(
dim1
,
dim2
,
dim3
,
dims
,
dtype
,
a_order
,
out_order
,
transpose
)
)
names
=
[
"dim1_{
0
}_dim2_{
1
}_dim3_{
2
}_dims_{
3
}_dtype_{
4
}_orderA_{
5
}_orderOut_{
6
}_{
7
}"
.
format
(
"dim1_{}_dim2_{}_dim3_{}_dims_{}_dtype_{}_orderA_{}_orderOut_{}_{}"
.
format
(
*
vals
)
for
vals
in
values
...
...
@@ -1341,7 +1354,7 @@ a_order = ["col_turing"]
out_order
=
[
"row"
]
values
=
list
(
product
(
dim1
,
dim2
,
dtype
,
a_order
,
out_order
))
names
=
[
"dim1_{
0
}_dim2_{
1
}_dtype_{
2
}_orderA_{
3
}_orderOut_{
4
}"
.
format
(
*
vals
)
"dim1_{}_dim2_{}_dtype_{}_orderA_{}_orderOut_{}"
.
format
(
*
vals
)
for
vals
in
values
]
...
...
@@ -1367,7 +1380,7 @@ dim2 = torch.randint(1, 4 * 1024, size=(n,)).tolist()
# dim2 = [5]
values
=
list
(
product
(
dim1
,
dim2
))
names
=
[
"dim1_{
0
}_dim2_{
1
}"
.
format
(
*
vals
)
for
vals
in
values
]
names
=
[
"dim1_{}_dim2_{}"
.
format
(
*
vals
)
for
vals
in
values
]
@
pytest
.
mark
.
parametrize
(
"dim1, dim2"
,
values
,
ids
=
names
)
...
...
@@ -1404,7 +1417,7 @@ dim2 = torch.randint(1, 1 * 1024, size=(n,)).tolist()
# dim2 = [11]
transposed_B
=
[
False
,
True
]
values
=
list
(
product
(
dim1
,
dim2
,
transposed_B
))
names
=
[
"dim1_{
0
}_dim2_{
1
}_transposed_B_{
2
}"
.
format
(
*
vals
)
for
vals
in
values
]
names
=
[
"dim1_{}_dim2_{}_transposed_B_{}"
.
format
(
*
vals
)
for
vals
in
values
]
@
pytest
.
mark
.
parametrize
(
"dim1, dim2, transposed_B"
,
values
,
ids
=
names
)
...
...
@@ -1485,7 +1498,7 @@ n = 2
dim1
=
torch
.
randint
(
256
,
1
*
1024
,
size
=
(
n
,)).
tolist
()
dim2
=
torch
.
randint
(
256
,
1
*
1024
,
size
=
(
n
,)).
tolist
()
values
=
list
(
product
(
dim1
,
dim2
))
names
=
[
"dim1_{
0
}_dim2_{
1
}"
.
format
(
*
vals
)
for
vals
in
values
]
names
=
[
"dim1_{}_dim2_{}"
.
format
(
*
vals
)
for
vals
in
values
]
@
pytest
.
mark
.
parametrize
(
"dim1, dim2"
,
values
,
ids
=
names
)
...
...
@@ -1550,7 +1563,7 @@ dtype = [torch.float16]
out_function
=
[
"zeros"
,
"ones"
]
values
=
list
(
product
(
dim1
,
dim2
,
dtype
,
out_function
))
names
=
[
"dim1_{
0
}_dim2_{
1
}_dtype_{
2
}_out_func_{
3
}"
.
format
(
*
vals
)
for
vals
in
values
"dim1_{}_dim2_{}_dtype_{}_out_func_{}"
.
format
(
*
vals
)
for
vals
in
values
]
...
...
@@ -1616,17 +1629,6 @@ def test_spmm_coo_very_sparse(dim1, dim2, dtype, out_func):
# print(time.time() - t0)
def
test_layout
():
a1
=
torch
.
rand
(
16
,
64
,
device
=
"cuda"
,
dtype
=
torch
.
float16
)
a1
=
torch
.
arange
(
16
*
64
,
device
=
"cuda"
).
reshape
(
16
,
64
).
byte
()
a2
,
s2
=
F
.
transform
(
a1
,
"col_turing"
)
print
(
a2
.
shape
)
print
(
a1
.
flatten
()[
8
*
64
:
8
*
64
+
32
])
for
i
in
range
(
4
):
print
(
a2
.
flatten
()[
i
*
8
*
32
:
i
*
8
*
32
+
32
],
0
)
def
test_coo2csr
():
threshold
=
1
A
=
torch
.
randn
(
128
,
128
).
half
().
cuda
()
...
...
@@ -1678,7 +1680,7 @@ dim2 = [2048]
# dim2 = [2]
dtype
=
[
torch
.
int8
]
values
=
list
(
product
(
dim1
,
dim2
,
dtype
))
names
=
[
"dim1_{
0
}_dim2_{
1
}_dtype_{
2
}"
.
format
(
*
vals
)
for
vals
in
values
]
names
=
[
"dim1_{}_dim2_{}_dtype_{}"
.
format
(
*
vals
)
for
vals
in
values
]
@
pytest
.
mark
.
parametrize
(
"dim1, dim2, dtype"
,
values
,
ids
=
names
)
...
...
@@ -1794,7 +1796,7 @@ values.append((batch_size, seqdim, 768, 4 * 768))
# values.append((batch_size, seqdim, 5140, 4*5140))
#values.append((batch_size, seqdim, 12288, 4*12288))
names
=
[
"batch_{
0
}_seq_{
1
}_model_{
2
}_hidden_{
3
}"
.
format
(
*
vals
)
for
vals
in
values
"batch_{}_seq_{}_model_{}_hidden_{}"
.
format
(
*
vals
)
for
vals
in
values
]
...
...
@@ -2040,3 +2042,154 @@ def test_blockwise_cpu_large():
assert
diffs
[
-
1
]
<
0.011
# print(sum(diffs)/len(diffs))
# print(sum(reldiffs)/len(reldiffs))
def test_fp8_quant():
    """Exercise blockwise quantization with FP8 code maps.

    For every exponent/mantissa split of an 8-bit float (sign bit fixed),
    quantize/dequantize random tensors and collect mean absolute and
    relative errors. No thresholds are asserted here; the loop mainly
    checks the round trip runs cleanly for every split.
    NOTE(review): requires a CUDA device — skipped implicitly otherwise? confirm.
    """
    # e_bits exponent bits, p_bits mantissa bits; 1 bit is the sign.
    for e_bits in range(1, 7):
        p_bits = 7 - e_bits
        code = F.create_fp8_map(True, e_bits, p_bits).cuda()
        print(e_bits, p_bits)
        abserr = []
        relerr = []
        # Round trip on normally distributed data.
        for i in range(100):
            A1 = torch.randn(1024, 1024, device="cuda")
            C, SC = F.quantize_blockwise(A1, code=code)
            A2 = F.dequantize_blockwise(C, SC)
            diff = torch.abs(A1 - A2)
            # 1e-8 guards against division by zero for near-zero entries.
            reldiff = diff / torch.abs(A1 + 1e-8)
            abserr.append(diff.mean().item())
            relerr.append(reldiff.mean().item())
            #assert diff < 0.0075
        #print(sum(abserr)/len(abserr))
        #print(sum(relerr)/len(relerr))

        abserr = []
        relerr = []
        # Round trip on uniformly distributed (all-positive) data.
        for i in range(100):
            A1 = torch.rand(1024, 1024, device="cuda")
            C, SC = F.quantize_blockwise(A1, code=code)
            A2 = F.dequantize_blockwise(C, SC)
            diff = torch.abs(A1 - A2)
            reldiff = diff / torch.abs(A1 + 1e-8)
            abserr.append(diff.mean().item())
            relerr.append(reldiff.mean().item())
            #assert diff < 0.0075
        #print(sum(abserr)/len(abserr))
        #print(sum(relerr)/len(relerr))

        abserr = []
        relerr = []
        # Baseline: default dynamic code map (no explicit code argument).
        for i in range(100):
            A1 = torch.randn(1024, 1024, device="cuda")
            C, SC = F.quantize_blockwise(A1)
            A2 = F.dequantize_blockwise(C, SC)
            diff = torch.abs(A1 - A2)
            reldiff = diff / torch.abs(A1 + 1e-8)
            abserr.append(diff.mean().item())
            relerr.append(reldiff.mean().item())
            #assert diff < 0.0075
        #print(3, sum(abserr)/len(abserr))
        #print(3, sum(relerr)/len(relerr))
def test_few_bit_quant():
    """Check k-bit (2..8 bit) code maps against a brute-force reference.

    For each bit width and code-construction method, verify the code map
    has the expected number of unique values and 256 total entries, then
    compare blockwise quantization indices/values against a per-element
    nearest-neighbour search over the code map.
    NOTE(review): requires a CUDA device — confirm test environment.
    """
    #print('')
    for bits in range(2, 9):
        #print('='*30, bits, '='*30)
        for method in ['linear', 'fp8', 'dynamic', 'quantile']:
            abserrs = []
            relerrs = []
            code = None
            if method == 'linear':
                code = F.create_linear_map(True, total_bits=bits).cuda()
            elif method == 'fp8':
                # Split bits between exponent and mantissa; 1 bit for sign.
                ebits = math.ceil(bits/2)
                pbits = bits-ebits-1
                code = F.create_fp8_map(True, ebits, pbits, bits).cuda()
            elif method == 'dynamic':
                code = F.create_dynamic_map(True, bits-0, bits).cuda()
            elif method == 'quantile':
                # Data-driven code map estimated from a random sample.
                values = torch.randn(2048, 2048, device='cuda')
                code = F.create_quantile_map(values, bits).cuda()
            # for some data types we have no zero
            # for some data types we have one zero
            # for some data types we have two zeros
            assert torch.unique(code).numel() in [2**bits, 2**bits-1], f'bits: {bits}, method: {method}'
            #print(method, (code==0).sum())
            # Code maps are always padded to 256 entries regardless of bits.
            assert code.numel() == 256
            for i in range(10):
                values = torch.randn(1, 32, device='cuda')
                # Normalize into [-1, 1] so values fall within the code range.
                values /= values.abs().max()
                #values[values.abs() < 1e-6] += 1e-5

                # Brute-force reference: nearest code entry per element.
                q1 = []
                v1 = []
                for v in values[0]:
                    idx = torch.abs(v-code).argmin()
                    q1.append(idx.item())
                    v1.append(code[idx].item())

                q1 = torch.Tensor(q1).cuda()
                v1 = torch.Tensor(v1).cuda()

                q2, S2 = F.quantize_blockwise(values, code=code)
                v2 = F.dequantize_blockwise(q2, S2)

                idx = torch.isclose(q1.int(), q2.int())
                err2 = torch.abs(v2-values)
                abserrs.append(err2.mean().item())
                # 1e-10 avoids division by zero for near-zero reference values.
                relerrs.append((err2/(1e-10+values).abs()).mean().item())
                if idx.sum():
                    # some weird cases
                    err1 = torch.abs(v1-values).mean()
                    #assert err2.mean() <= err1

                else:
                    # Indices fully disagree-free path: demand exact match.
                    torch.testing.assert_allclose(q1, q2)
            #print(method, 'abserr:', sum(abserrs)/len(abserrs), 'relerr:', sum(relerrs)/len(relerrs))
    #assert False
def test_kbit_quantile_estimation():
    """Compare estimated quantiles against the analytic normal quantiles.

    F.estimate_quantiles on standard-normal data should approximate
    scipy's norm.ppf at evenly spaced probabilities, both for full
    2**bits quantile counts and for odd (2**bits - 1) counts.
    NOTE(review): requires a CUDA device — confirm test environment.
    """
    for i in range(100):
        data = torch.randn(1024, 1024, device='cuda')
        for bits in range(2, 9):
            # Clamp probabilities away from 0/1 where ppf diverges.
            p = np.linspace(1.3e-4, 1-1.3e-4, 2**bits)
            val1 = torch.Tensor(norm.ppf(p)).cuda()
            val2 = F.estimate_quantiles(data, offset=0, num_quantiles=2**bits)
            err = torch.abs(val1-val2).mean()
            assert err < 0.038

    for i in range(100):
        data = torch.randn(1024, 1024, device='cuda')
        for bits in range(2, 4):
            # Odd quantile count: midpoints of 2*total_values+1 grid.
            total_values = 2**bits-1
            p = np.linspace(0, 1, 2*total_values+1)
            idx = np.arange(1, 2*total_values+1, 2)
            p = p[idx]
            offset = 1/(2*total_values)
            # Equivalent direct construction of the midpoint probabilities.
            p = np.linspace(offset, 1-offset, total_values)
            val1 = torch.Tensor(norm.ppf(p)).cuda()
            val2 = F.estimate_quantiles(data, num_quantiles=2**bits-1)
            err = torch.abs(val1-val2).mean()
            assert err < 0.035
def test_bench_dequantization():
    """Micro-benchmark for blockwise dequantization throughput.

    Times 100 dequantization calls on a 1024x1024 half tensor; the
    theoretical peak (max_theoretical_mu) assumes 672 GB/s memory
    bandwidth — presumably a specific GPU; TODO confirm. No assertion is
    made; results are only printed when the commented lines are enabled.
    NOTE(review): requires a CUDA device.
    """
    a = torch.rand(1024, 1024, device='cuda').half()
    qa, SA = F.quantize_blockwise(a)

    # bytes moved / bandwidth, expressed in microseconds.
    max_theoretical_mu = 1024*1024*2/1024**3/672*1000*1000
    #print(max_theoretical_mu)

    # Synchronize around the timed region so GPU work is fully measured.
    torch.cuda.synchronize()
    t0 = time.time()
    for i in range(100):
        F.dequantize_blockwise(qa, SA, blocksize=2048)
    torch.cuda.synchronize()
    #print((time.time()-t0)/1e6)
tests/test_modules.py
View file @
be5cecb8
...
...
@@ -7,7 +7,7 @@ from torch import nn
import
bitsandbytes
as
bnb
class
MockArgs
(
object
)
:
class
MockArgs
:
def
__init__
(
self
,
initial_data
):
for
key
in
initial_data
:
setattr
(
self
,
key
,
initial_data
[
key
])
...
...
@@ -15,7 +15,7 @@ class MockArgs(object):
class
MLP8bit
(
torch
.
nn
.
Module
):
def
__init__
(
self
,
dim1
,
dim2
,
has_fp16_weights
=
True
,
memory_efficient_backward
=
False
,
threshold
=
0.0
):
super
(
MLP8bit
,
self
).
__init__
()
super
().
__init__
()
self
.
fc1
=
bnb
.
nn
.
Linear8bitLt
(
dim1
,
dim2
,
has_fp16_weights
=
has_fp16_weights
,
memory_efficient_backward
=
memory_efficient_backward
,
threshold
=
threshold
...
...
@@ -289,7 +289,7 @@ class LinearFunction(torch.autograd.Function):
class
Linear8bit
(
nn
.
Module
):
def
__init__
(
self
,
input_features
,
output_features
,
bias
=
True
,
args
=
None
):
super
(
Linear8bit
,
self
).
__init__
()
super
().
__init__
()
self
.
input_features
=
input_features
self
.
output_features
=
output_features
self
.
args
=
args
...
...
@@ -312,7 +312,7 @@ class Linear8bit(nn.Module):
threshold
=
[
0.0
,
3.0
]
values
=
threshold
names
=
[
"threshold_{
0}"
.
format
(
vals
)
for
vals
in
values
]
names
=
[
f
"threshold_
{
vals
}
"
for
vals
in
values
]
@
pytest
.
mark
.
parametrize
(
"threshold"
,
values
,
ids
=
names
)
...
...
@@ -378,7 +378,7 @@ def test_linear8bitlt_accumulated_gradient():
threshold
=
[
0.0
,
2.0
]
values
=
threshold
names
=
[
"threshold_{
0}"
.
format
(
vals
)
for
vals
in
values
]
names
=
[
f
"threshold_
{
vals
}
"
for
vals
in
values
]
@
pytest
.
mark
.
parametrize
(
"threshold"
,
values
,
ids
=
names
)
...
...
tests/test_optim.py
View file @
be5cecb8
...
...
@@ -18,7 +18,7 @@ k = 20
def
get_temp_dir
():
path
=
"/tmp/autoswap/{
0}"
.
format
(
str
(
uuid
.
uuid4
())
)
path
=
f
"/tmp/autoswap/
{
str
(
uuid
.
uuid4
())
}
"
os
.
makedirs
(
path
,
exist_ok
=
True
)
return
path
...
...
@@ -116,7 +116,7 @@ gtype = [torch.float32, torch.float16]
optimizer_names
=
[
"adam"
,
"momentum"
,
"rmsprop"
,
"lars"
]
values
=
list
(
product
(
dim1
,
dim2
,
gtype
,
optimizer_names
))
names
=
[
"dim1_{
0
}_dim2_{
1
}_gtype_{
2
}_optim_{
3
}"
.
format
(
*
vals
)
for
vals
in
values
"dim1_{}_dim2_{}_gtype_{}_optim_{}"
.
format
(
*
vals
)
for
vals
in
values
]
...
...
@@ -187,7 +187,7 @@ dim1 = [1024]
dim2
=
[
32
,
1024
,
4097
]
gtype
=
[
torch
.
float32
,
torch
.
float16
]
values
=
list
(
product
(
dim1
,
dim2
,
gtype
))
names
=
[
"dim1_{
0
}_dim2_{
1
}_gtype_{
2
}"
.
format
(
*
vals
)
for
vals
in
values
]
names
=
[
"dim1_{}_dim2_{}_gtype_{}"
.
format
(
*
vals
)
for
vals
in
values
]
@
pytest
.
mark
.
parametrize
(
"dim1, dim2, gtype"
,
values
,
ids
=
names
)
...
...
@@ -250,7 +250,7 @@ optimizer_names = [
]
values
=
list
(
product
(
dim1
,
dim2
,
gtype
,
optimizer_names
))
names
=
[
"dim1_{
0
}_dim2_{
1
}_gtype_{
2
}_optim_{
3
}"
.
format
(
*
vals
)
for
vals
in
values
"dim1_{}_dim2_{}_gtype_{}_optim_{}"
.
format
(
*
vals
)
for
vals
in
values
]
...
...
@@ -391,7 +391,7 @@ gtype = [torch.float32]
optim_bits
=
[
32
,
8
]
values
=
list
(
product
(
dim1
,
dim2
,
gtype
,
optim_bits
))
names
=
[
"dim1_{
0
}_dim2_{
1
}_gtype_{
2
}_optim_bits_{
3
}"
.
format
(
*
vals
)
"dim1_{}_dim2_{}_gtype_{}_optim_bits_{}"
.
format
(
*
vals
)
for
vals
in
values
]
...
...
@@ -495,7 +495,7 @@ gtype = [torch.float32, torch.float16]
optimizer_names
=
[
"adam8bit_blockwise"
]
values
=
list
(
product
(
dim1
,
dim2
,
gtype
,
optimizer_names
))
names
=
[
"dim1_{
0
}_dim2_{
1
}_gtype_{
2
}_optim_{
3
}"
.
format
(
*
vals
)
for
vals
in
values
"dim1_{}_dim2_{}_gtype_{}_optim_{}"
.
format
(
*
vals
)
for
vals
in
values
]
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment