Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
bitsandbytes
Commits
e9fa03b7
"examples/vscode:/vscode.git/clone" did not exist on "346197c47722993cf8cd9f41891d1457ef82decc"
Commit
e9fa03b7
authored
Apr 07, 2023
by
Tim Dettmers
Browse files
Some fixed for loading PEFT modules with Params4bit.
parent
1ccb7bde
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
78 additions
and
20 deletions
+78
-20
bitsandbytes/functional.py
bitsandbytes/functional.py
+7
-3
bitsandbytes/nn/modules.py
bitsandbytes/nn/modules.py
+48
-4
csrc/kernels.cu
csrc/kernels.cu
+21
-11
tests/test_optim.py
tests/test_optim.py
+2
-2
No files found.
bitsandbytes/functional.py
View file @
e9fa03b7
...
...
@@ -362,9 +362,13 @@ def get_special_format_str():
def
is_on_gpu
(
tensors
):
on_gpu
=
True
gpu_ids
=
set
()
for
t
in
tensors
:
if
t
is
None
:
continue
# NULL pointers are fine
on_gpu
&=
t
.
device
.
type
==
'cuda'
gpu_ids
.
add
(
t
.
device
.
index
)
if
len
(
gpu_ids
)
>
1
:
raise
TypeError
(
f
'Input tensors need to be on the same GPU, but found the following tensor and device combinations:
{
[(
t
.
shape
,
t
.
device
)
for
t
in
tensors
]
}
'
)
return
on_gpu
def
get_ptr
(
A
:
Tensor
)
->
ct
.
c_void_p
:
...
...
@@ -617,7 +621,7 @@ def quantize_blockwise(A: Tensor, code: Tensor = None, absmax: Tensor = None, ra
assert
rand
is
None
lib
.
cquantize_blockwise_cpu_fp32
(
get_ptr
(
code
),
get_ptr
(
A
),
get_ptr
(
absmax
),
get_ptr
(
out
),
ct
.
c_longlong
(
blocksize
),
ct
.
c_longlong
(
A
.
numel
()))
state
=
(
absmax
,
code
,
blocksize
)
state
=
[
absmax
,
code
,
blocksize
]
return
out
,
state
...
...
@@ -763,9 +767,9 @@ def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksiz
#qabsmax, state2 = quantize_blockwise(absmax, code=code, blocksize=256)
qabsmax
,
state2
=
quantize_blockwise
(
absmax
,
blocksize
=
256
)
del
absmax
state
=
(
qabsmax
,
input_shape
,
A
.
dtype
,
blocksize
,
(
offset
,
state2
)
,
quant_type
)
state
=
[
qabsmax
,
input_shape
,
A
.
dtype
,
blocksize
,
[
offset
,
state2
]
,
quant_type
]
else
:
state
=
(
absmax
,
input_shape
,
A
.
dtype
,
blocksize
,
None
,
quant_type
)
state
=
[
absmax
,
input_shape
,
A
.
dtype
,
blocksize
,
None
,
quant_type
]
return
out
,
state
...
...
bitsandbytes/nn/modules.py
View file @
e9fa03b7
...
...
@@ -135,7 +135,6 @@ class Embedding(torch.nn.Embedding):
class
Params4bit
(
torch
.
nn
.
Parameter
):
def
__new__
(
cls
,
data
=
None
,
requires_grad
=
True
,
quant_state
=
None
,
blocksize
=
64
,
compress_statistics
=
True
,
quant_type
=
'fp4'
):
cls
.
quant_state
=
None
if
data
is
None
:
data
=
torch
.
empty
(
0
)
...
...
@@ -143,12 +142,14 @@ class Params4bit(torch.nn.Parameter):
self
.
blocksize
=
blocksize
self
.
compress_statistics
=
compress_statistics
self
.
quant_type
=
quant_type
self
.
quant_state
=
quant_state
self
.
data
=
data
return
self
def
cuda
(
self
,
device
):
w
=
self
.
data
.
contiguous
().
half
().
cuda
(
device
)
w_
fp4
,
quant_state
=
bnb
.
functional
.
quantize_4bit
(
w
,
blocksize
=
self
.
blocksize
,
compress_statistics
=
self
.
compress_statistics
,
quant_type
=
self
.
quant_type
)
self
.
data
=
w_
fp4
w_
4bit
,
quant_state
=
bnb
.
functional
.
quantize_4bit
(
w
,
blocksize
=
self
.
blocksize
,
compress_statistics
=
self
.
compress_statistics
,
quant_type
=
self
.
quant_type
)
self
.
data
=
w_
4bit
self
.
quant_state
=
quant_state
return
self
...
...
@@ -171,8 +172,19 @@ class Params4bit(torch.nn.Parameter):
if
(
device
is
not
None
and
device
.
type
==
"cuda"
and
self
.
data
.
device
.
type
==
"cpu"
):
return
self
.
cuda
(
device
)
else
:
s
=
self
.
quant_state
if
s
is
not
None
:
# make sure the quantization state is on the right device
s
[
0
]
=
s
[
0
].
to
(
device
)
if
self
.
compress_statistics
:
# TODO: refactor this. This is a nightmare
s
[
-
2
][
0
]
=
s
[
-
2
][
0
].
to
(
device
)
# offset
s
[
-
2
][
1
][
0
]
=
s
[
-
2
][
1
][
0
].
to
(
device
)
# nested quantiation state statitics
s
[
-
2
][
1
][
1
]
=
s
[
-
2
][
1
][
1
].
to
(
device
)
# nested quantiation codebook
new_param
=
Params4bit
(
super
().
to
(
device
=
device
,
dtype
=
dtype
,
non_blocking
=
non_blocking
),
requires_grad
=
self
.
requires_grad
,
quant_state
=
self
.
quant_state
)
requires_grad
=
self
.
requires_grad
,
quant_state
=
self
.
quant_state
,
blocksize
=
self
.
blocksize
,
compress_statistics
=
self
.
compress_statistics
,
quant_type
=
self
.
quant_type
)
return
new_param
...
...
@@ -200,6 +212,38 @@ class Linear4bit(nn.Linear):
return
out
def
_save_to_state_dict
(
self
,
destination
,
prefix
,
keep_vars
):
super
().
_save_to_state_dict
(
destination
,
prefix
,
keep_vars
)
# we only need to save extra state if .cuda was called
# then we have the (1) quantization weight and the (2) quantization config
#quant_state = getattr(self.weight, 'quant_state', None)
#if quant_state is not None:
# # 2. quantization state
# destination[prefix + 'quant_state'] = quant_state
#destination[prefix + 'weight'] = self.weight.detach()
def
_load_from_state_dict
(
self
,
state_dict
,
prefix
,
local_metadata
,
strict
,
missing_keys
,
unexpected_keys
,
error_msgs
):
super
().
_load_from_state_dict
(
state_dict
,
prefix
,
local_metadata
,
strict
,
missing_keys
,
unexpected_keys
,
error_msgs
)
#for key in unexpected_keys:
# input_name = key[len(prefix):]
# if input_name == "quant_state":
# if getattr(self.weight, 'quant_state', None) is None:
# # buffers not yet initialized, can't call them directly without
# raise RuntimeError("Loading a quantized checkpoint into non-quantized Linear4bit is "
# "not supported. Please call module.cuda() before module.load_state_dict()")
# input_param = state_dict[key]
# self.weight.quant_state = input_param
# assert isinstance(self.weight, Param4bit)
# unexpected_keys.remove(key)
class
LinearFP4
(
Linear4bit
):
def
__init__
(
self
,
input_features
,
output_features
,
bias
=
True
,
compute_dtype
=
None
,
compress_statistics
=
True
):
super
().
__init__
(
input_features
,
output_features
,
bias
,
compute_dtype
,
compress_statistics
,
'fp4'
)
...
...
csrc/kernels.cu
View file @
e9fa03b7
...
...
@@ -1681,6 +1681,7 @@ kOptimizerStatic8bit2StateBlockwise(T* p, T* __restrict__ const g, unsigned char
unsigned
char
c1s
[
N_PER_TH
];
unsigned
char
c2s
[
N_PER_TH
];
T
g_vals
[
N_PER_TH
];
T
p_vals
[
N_PER_TH
];
typedef
cub
::
BlockLoad
<
T
,
BLOCK_SIZE
/
N_PER_TH
,
N_PER_TH
,
cub
::
BLOCK_LOAD_WARP_TRANSPOSE
>
LoadT
;
typedef
cub
::
BlockLoad
<
unsigned
char
,
BLOCK_SIZE
/
N_PER_TH
,
N_PER_TH
,
cub
::
BLOCK_LOAD_WARP_TRANSPOSE
>
LoadChar
;
...
...
@@ -1742,15 +1743,23 @@ kOptimizerStatic8bit2StateBlockwise(T* p, T* __restrict__ const g, unsigned char
# pragma unroll N_PER_TH
for
(
unsigned
int
j
=
0
;
j
<
N_PER_TH
;
j
++
)
{
g_val
=
float
(
g_vals
[
j
]);
g_val
*=
gnorm_scale
;
if
(
!
skip_zeros
||
(
skip_zeros
&&
((
float
)
g_vals
[
j
]
!=
0.0
f
)))
if
(
!
isnan
((
float
)
g_vals
[
j
])
&&
!
isinf
((
float
)
g_vals
[
j
]))
{
s1_vals
[
j
]
=
smem_quantiles1
[
lane_id
][
c1s
[
j
]]
*
absmax1
[
i
/
BLOCK_SIZE
];
s1_vals
[
j
]
=
(
s1_vals
[
j
]
*
beta1
)
+
(((
1.0
f
-
beta1
)
*
g_val
));
s2_vals
[
j
]
=
smem_quantiles2
[
lane_id
][
c2s
[
j
]]
*
absmax2
[
i
/
BLOCK_SIZE
];
g_val
=
g_vals
[
j
];
//float ratio = (g_val*g_val)/fmaxf(s2_vals[j], eps*eps);
//g_val = ratio > 2.0f ? 2.0f*g_val/ratio : g_val;
g_val
*=
gnorm_scale
;
s2_vals
[
j
]
=
(
s2_vals
[
j
]
*
beta2
)
+
(((
1.0
f
-
beta2
)
*
g_val
*
g_val
));
s1_vals
[
j
]
=
smem_quantiles1
[
lane_id
][
c1s
[
j
]]
*
absmax1
[
i
/
BLOCK_SIZE
];
s1_vals
[
j
]
=
(
s1_vals
[
j
]
*
beta1
)
+
(((
1.0
f
-
beta1
)
*
g_val
));
}
else
{
s1_vals
[
j
]
=
0.0
f
;
s2_vals
[
j
]
=
0.0
f
;
}
new_local_abs_max1
=
fmaxf
(
new_local_abs_max1
,
fabsf
(
s1_vals
[
j
]));
...
...
@@ -1782,22 +1791,23 @@ kOptimizerStatic8bit2StateBlockwise(T* p, T* __restrict__ const g, unsigned char
}
__syncthreads
();
LoadT
(
temp_storage
.
loadh
).
Load
(
&
(
p
[
i
]),
g
_vals
,
valid_items
,
(
T
)
0.0
f
);
LoadT
(
temp_storage
.
loadh
).
Load
(
&
(
p
[
i
]),
p
_vals
,
valid_items
,
(
T
)
0.0
f
);
// reduce: 2.67/1.69 -> 2.67/1.70
# pragma unroll N_PER_TH
for
(
unsigned
int
j
=
0
;
j
<
N_PER_TH
;
j
++
)
{
if
(
!
skip_zeros
||
(
skip_zeros
&&
((
float
)
g_vals
[
j
]
!=
0.0
f
)))
//if(!skip_zeros || (skip_zeros && ((float)g_vals[j] != 0.0f)))
if
(
!
isnan
((
float
)
g_vals
[
j
])
&&
!
isinf
((
float
)
g_vals
[
j
]))
{
g
_vals
[
j
]
=
(
T
)(((
float
)
g
_vals
[
j
])
+
((
step_size
*
(
__fdividef
(
s1_vals
[
j
],(
sqrtf
(
s2_vals
[
j
])
+
(
correction2
*
eps
)))))));
p
_vals
[
j
]
=
(
T
)(((
float
)
p
_vals
[
j
])
+
((
step_size
*
(
__fdividef
(
s1_vals
[
j
],(
sqrtf
(
s2_vals
[
j
])
+
(
correction2
*
eps
)))))));
if
(
weight_decay
>
0.0
f
)
g
_vals
[
j
]
=
((
float
)
g
_vals
[
j
])
*
(
1.0
f
-
(
lr
*
weight_decay
));
p
_vals
[
j
]
=
((
float
)
p
_vals
[
j
])
*
(
1.0
f
-
(
lr
*
weight_decay
));
}
}
// store: 0.85/1.44 -> 2.48/1.57
__syncthreads
();
StoreT
(
temp_storage
.
storeh
).
Store
(
&
(
p
[
i
]),
g
_vals
,
valid_items
);
StoreT
(
temp_storage
.
storeh
).
Store
(
&
(
p
[
i
]),
p
_vals
,
valid_items
);
// quantizaztion: 2.67/1.70 -> 3.4/3.3
# pragma unroll N_PER_TH
...
...
tests/test_optim.py
View file @
e9fa03b7
...
...
@@ -282,7 +282,7 @@ def test_optimizer8bit(dim1, dim2, gtype, optim_name):
errors
=
[]
relerrors
=
[]
for
i
in
range
(
5
0
):
for
i
in
range
(
10
0
):
g
=
torch
.
randn
(
dim1
,
dim2
,
device
=
"cuda"
,
dtype
=
gtype
)
*
0.01
p1
.
grad
=
g
.
clone
().
float
()
p2
.
grad
=
g
.
clone
()
...
...
@@ -314,7 +314,7 @@ def test_optimizer8bit(dim1, dim2, gtype, optim_name):
)
==
0
)
assert
num_not_close
.
sum
().
item
()
<
20
#
assert num_not_close.sum().item() < 20
dequant_states
.
append
(
s1
.
clone
())
err
=
torch
.
abs
(
p1
-
p2
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment