AutoAWQ (OpenDAS) · Commit 9b427ebc
Authored Aug 14, 2023 by Jiaming Tang

Add compatibility with GQA & optimize multi-GPU memory allocation

Parent: dc139757
Showing 2 changed files with 8 additions and 6 deletions:

awq/entry.py                +2 -1
awq/quantize/auto_scale.py  +6 -5
awq/entry.py @ 9b427ebc

@@ -5,6 +5,7 @@ import argparse
 import os
 import json
 from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model, load_checkpoint_in_model
+from accelerate.utils.modeling import get_balanced_memory
 from awq.utils.parallel import auto_parallel
 from awq.quantize.pre_quant import run_awq, apply_awq
 from awq.quantize.quantizer import pseudo_quantize_model_weight, real_quantize_model_weight

@@ -162,7 +163,7 @@ def build_model_and_enc(model_path):
         raise NotImplementedError

     # Move the model to GPU (as much as possible) for LM evaluation
-    kwargs = {"max_memory": max_memory} if len(max_memory) else {}
+    kwargs = {"max_memory": get_balanced_memory(model, max_memory if len(max_memory) > 0 else None)}
     device_map = infer_auto_device_map(
         model,
         # TODO: can we remove this?
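For context, a minimal sketch of the allocation path this hunk switches to; the model path, memory figures, and no_split_module_classes value below are illustrative, not from the commit. Previously, an empty max_memory dict made infer_auto_device_map fill GPU 0 to its capacity before spilling layers onto the next device; accelerate's get_balanced_memory instead computes a per-device budget that spreads the layers roughly evenly across the available GPUs:

    import torch
    from transformers import AutoModelForCausalLM
    from accelerate import infer_auto_device_map, dispatch_model
    from accelerate.utils.modeling import get_balanced_memory

    model = AutoModelForCausalLM.from_pretrained(
        "path/to/model", torch_dtype=torch.float16  # placeholder path
    )

    # max_memory comes from the CLI in entry.py; {} means "no user limit".
    max_memory = {}  # e.g. {0: "20GiB", 1: "20GiB"}
    kwargs = {"max_memory": get_balanced_memory(
        model, max_memory if len(max_memory) > 0 else None
    )}
    device_map = infer_auto_device_map(
        model,
        no_split_module_classes=["LlamaDecoderLayer"],  # illustrative: keep blocks whole
        **kwargs,
    )
    model = dispatch_model(model, device_map=device_map)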
awq/quantize/auto_scale.py @ 9b427ebc

@@ -213,11 +213,12 @@ def auto_scale_block(module, module_kwargs,
             module2inspect=module.self_attn, kwargs=module_kwargs,
         ))
     # attn out
-    scales_list.append(_auto_get_scale(
-        prev_op=module.self_attn.v_proj,
-        layers=[module.self_attn.o_proj],
-        inp=input_feat['self_attn.o_proj'],
-    ))
+    if module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape:
+        scales_list.append(_auto_get_scale(
+            prev_op=module.self_attn.v_proj,
+            layers=[module.self_attn.o_proj],
+            inp=input_feat['self_attn.o_proj'],
+        ))
     # fc1
     scales_list.append(_auto_get_scale(
         prev_op=module.post_attention_layernorm,
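This shape check is what makes auto-scaling compatible with grouped-query attention (GQA). Under standard multi-head attention, v_proj and o_proj are both hidden_size x hidden_size, so a per-channel scale on v_proj's outputs folds one-to-one into o_proj's input channels. Under GQA, v_proj only produces num_kv_heads * head_dim channels while o_proj consumes num_heads * head_dim, so there is no one-to-one folding (each KV head feeds several query heads) and the v_proj -> o_proj scale pair is skipped. A small illustration, using Llama-2-70B-style dimensions chosen for the example:

    import torch.nn as nn

    hidden_size, head_dim = 8192, 128
    num_heads, num_kv_heads = 64, 8  # GQA: 8 KV heads shared across 64 query heads

    v_proj = nn.Linear(hidden_size, num_kv_heads * head_dim, bias=False)
    o_proj = nn.Linear(num_heads * head_dim, hidden_size, bias=False)

    # MHA (num_kv_heads == num_heads): both weights would be (8192, 8192).
    # GQA: v_proj.weight is (1024, 8192) while o_proj.weight is (8192, 8192),
    # so the commit's condition is False and the scale pair is skipped.
    print(v_proj.weight.shape == o_proj.weight.shape)  # False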