OpenDAS / AutoAWQ · Commits · efea69e1

Unverified commit efea69e1, authored Aug 14, 2023 by Ji Lin, committed by GitHub on Aug 14, 2023.

Merge pull request #67 from Sakits/main

Add compatibility with GQA & optimize multi-GPU memory allocation
Parents: dc139757, b190df35

Showing 2 changed files with 9 additions and 6 deletions:

awq/entry.py (+2 -1)
awq/quantize/auto_scale.py (+7 -5)
awq/entry.py @ efea69e1

@@ -5,6 +5,7 @@ import argparse
 import os
 import json
 from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model, load_checkpoint_in_model
+from accelerate.utils.modeling import get_balanced_memory
 from awq.utils.parallel import auto_parallel
 from awq.quantize.pre_quant import run_awq, apply_awq
 from awq.quantize.quantizer import pseudo_quantize_model_weight, real_quantize_model_weight

@@ -162,7 +163,7 @@ def build_model_and_enc(model_path):
         raise NotImplementedError
     # Move the model to GPU (as much as possible) for LM evaluation
-    kwargs = {"max_memory": max_memory} if len(max_memory) else {}
+    kwargs = {"max_memory": get_balanced_memory(model, max_memory if len(max_memory) > 0 else None)}
     device_map = infer_auto_device_map(
         model,
         # TODO: can we remove this?
...
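The entry.py change replaces the raw, user-supplied max_memory dict with accelerate's get_balanced_memory, so infer_auto_device_map spreads the model evenly across GPUs instead of filling earlier devices to their cap first. Below is a minimal sketch of that placement pattern, assuming a multi-GPU host; the checkpoint name and the no_split_module_classes value are illustrative assumptions, not taken from this diff:

```python
# Sketch of the balanced-placement pattern adopted by this commit.
# Assumptions (not from the diff): a Llama-like model, and
# "LlamaDecoderLayer" as the block we refuse to split across devices.
from transformers import AutoConfig, AutoModelForCausalLM
from accelerate import init_empty_weights, infer_auto_device_map
from accelerate.utils.modeling import get_balanced_memory

config = AutoConfig.from_pretrained("huggyllama/llama-7b")  # hypothetical checkpoint
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config)  # meta tensors, no RAM used

# Old behavior: {"max_memory": max_memory} if len(max_memory) else {},
# which lets infer_auto_device_map pack GPU 0 full before touching GPU 1.
# New behavior: compute per-device budgets that balance the model first.
max_memory = get_balanced_memory(model, None)  # None -> balance over all devices
device_map = infer_auto_device_map(
    model,
    max_memory=max_memory,
    no_split_module_classes=["LlamaDecoderLayer"],
)
print(device_map)  # decoder layers assigned roughly evenly across GPUs
```

Judging from the imports in this file, the resulting device_map then feeds load_checkpoint_in_model and dispatch_model to place the actual weights.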
awq/quantize/auto_scale.py @ efea69e1

@@ -213,11 +213,13 @@ def auto_scale_block(module, module_kwargs,
             module2inspect=module.self_attn, kwargs=module_kwargs,
         ))
         # attn out
-        scales_list.append(_auto_get_scale(
-            prev_op=module.self_attn.v_proj,
-            layers=[module.self_attn.o_proj],
-            inp=input_feat['self_attn.o_proj'],
-        ))
+        # Please refer to https://github.com/mit-han-lab/llm-awq/pull/67#issue-1850622696
+        if module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape:
+            scales_list.append(_auto_get_scale(
+                prev_op=module.self_attn.v_proj,
+                layers=[module.self_attn.o_proj],
+                inp=input_feat['self_attn.o_proj'],
+            ))
         # fc1
         scales_list.append(_auto_get_scale(
             prev_op=module.post_attention_layernorm,
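The guard added in auto_scale.py is the GQA-compatibility fix named in the commit message. AWQ's v_proj -> o_proj scale pair divides a per-channel scale out of o_proj's input and folds it into v_proj's output, which only works when the two weight matrices have the same shape. Under grouped-query attention (GQA), v_proj emits num_key_value_heads * head_dim channels while o_proj consumes num_attention_heads * head_dim, so the fold is invalid and the pair is skipped. A small sketch of the shape check, using assumed Llama-2-70B-like dimensions for illustration:

```python
# Why the new shape guard matters for GQA, with assumed 70B-like dimensions.
import torch.nn as nn

hidden = 8192
num_heads, num_kv_heads, head_dim = 64, 8, 128

o_proj = nn.Linear(num_heads * head_dim, hidden, bias=False)

# Multi-head attention: v_proj and o_proj weights are both (8192, 8192),
# so the v_proj -> o_proj scale pair is applied as before.
v_proj_mha = nn.Linear(hidden, num_heads * head_dim, bias=False)
print(v_proj_mha.weight.shape == o_proj.weight.shape)  # True -> scale the pair

# Grouped-query attention: v_proj emits only num_kv_heads * head_dim = 1024
# channels, which get duplicated across query heads before o_proj, so a
# per-channel scale on o_proj's input cannot be folded back into v_proj.
v_proj_gqa = nn.Linear(hidden, num_kv_heads * head_dim, bias=False)
print(v_proj_gqa.weight.shape == o_proj.weight.shape)  # False -> skip scaling
```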