OpenDAS / AutoAWQ · Commits

Commit 3b9f2875 (unverified)
Authored Jul 10, 2023 by Jiaming Tang; committed via GitHub on Jul 10, 2023

Merge pull request #33 from abhinavkulkarni/dev/more_models

Parents: ab536fb1, d2a10bd9

Changes: 3 changed files with 14 additions and 3 deletions
    awq/entry.py                 +2  -2
    awq/quantize/auto_scale.py   +6  -1
    awq/quantize/pre_quant.py    +6  -0
awq/entry.py

@@ -73,9 +73,9 @@ def build_model_and_enc(model_path):
     # all hf model
     config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
     if "mpt" in config.__class__.__name__.lower():
-        enc = AutoTokenizer.from_pretrained(config.tokenizer_name)
+        enc = AutoTokenizer.from_pretrained(config.tokenizer_name, trust_remote_code=True)
     else:
-        enc = AutoTokenizer.from_pretrained(model_path, use_fast=False)
+        enc = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=True)
     if args.load_quant:
         # directly load quantized weights
         print("Loading pre-computed quantized weights...")
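The entry.py change threads trust_remote_code=True through both AutoTokenizer.from_pretrained calls, which is what allows models whose tokenizer code ships inside the model repository (MPT being the case this PR targets) to load at all; without the flag, transformers refuses to execute repository-provided code and raises an error asking the caller to opt in. A minimal sketch of the same loading pattern in isolation; the model id "mosaicml/mpt-7b" is an illustrative assumption, not taken from the diff:

from transformers import AutoConfig, AutoTokenizer

model_path = "mosaicml/mpt-7b"  # hypothetical example id, not from the diff
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)

if "mpt" in config.__class__.__name__.lower():
    # MPT configs name a separate tokenizer repo; fetching it can execute
    # custom code from that repo, so trust_remote_code must be set here too.
    enc = AutoTokenizer.from_pretrained(config.tokenizer_name, trust_remote_code=True)
else:
    enc = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=True)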
awq/quantize/auto_scale.py

@@ -107,11 +107,14 @@ def auto_scale_block(module, module_kwargs,
     def _search_module_scale(block, linears2scale: list, x, kwargs={}):
         # w: co, ci
         # x: n, ci
-        x = x.to(next(block.parameters()).device)
         weight = torch.cat([_m.weight for _m in linears2scale], dim=0)
         w_max = get_weight_scale(
             weight, q_group_size=q_config.get("q_group_size", -1))
+        # Clear GPU memory
+        del weight
+        torch.cuda.empty_cache()
+        x = x.to(next(block.parameters()).device)
         with torch.no_grad():
             org_out = block(x, **kwargs)
             if isinstance(org_out, tuple):

@@ -126,6 +129,8 @@ def auto_scale_block(module, module_kwargs,
         n_grid = 20
         history = []

+        # Clear GPU memory
+        torch.cuda.empty_cache()
         org_sd = {k: v.cpu() for k, v in block.state_dict().items()}
         for ratio in range(n_grid):
             ratio = ratio * 1 / n_grid
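The first hunk also moves the x.to(device) transfer to after the weight statistics are computed and frees the temporary concatenated weight before the forward passes that follow. The point of pairing del with torch.cuda.empty_cache() is that deleting the Python reference only returns the memory to PyTorch's caching allocator, while empty_cache() hands the cached blocks back to the driver. A self-contained demonstration of that distinction, assuming a CUDA device is present (the tensor sizes are made up for the example):

import torch

if torch.cuda.is_available():
    # A large temporary, analogous to the concatenated weight matrix above.
    weight = torch.randn(3 * 4096, 4096, device="cuda")
    w_max = weight.abs().amax(dim=1)  # stand-in for get_weight_scale

    print("reserved before del: ", torch.cuda.memory_reserved())
    del weight  # drop the last reference to the temporary
    # Unchanged: the freed blocks stay cached inside PyTorch's allocator.
    print("reserved after del:  ", torch.cuda.memory_reserved())
    torch.cuda.empty_cache()  # return cached blocks to the driver
    print("reserved after flush:", torch.cuda.memory_reserved())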
awq/quantize/pre_quant.py

@@ -135,6 +135,9 @@ def run_awq(
         # now solve for scaling and clipping
         input_feat = {k: torch.cat(v, dim=0) for k, v in input_feat.items()}

+        # Clear GPU memory
+        torch.cuda.empty_cache()
+
         if auto_scale:  # if it applies, we should also modify the input_feat with scales
             scales_list = auto_scale_block(layer, layer_kwargs,

@@ -146,6 +149,9 @@ def run_awq(
             # append prefix to make names global
             awq_results["scale"] += append_str_prefix(scales_list, get_op_name(model, layer) + ".")

+        # Clear GPU memory
+        torch.cuda.empty_cache()
+
         if mse_range:
             clip_list = auto_clip_block(layer,
                                         w_bit=w_bit, q_config=q_config,
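In pre_quant.py both new calls sit inside run_awq's per-layer loop: one after the calibration features are concatenated, one after scale search, so peak GPU usage stays near a single layer's working set instead of accumulating across layers. A hypothetical, self-contained loop showing the same placement; the Linear stack stands in for transformer blocks and is not run_awq's actual structure:

import torch
import torch.nn as nn

if torch.cuda.is_available():
    layers = [nn.Linear(1024, 1024).cuda() for _ in range(4)]  # stand-in blocks
    x = torch.randn(8, 1024, device="cuda")
    results = []

    with torch.no_grad():
        for layer in layers:
            out = layer(x)
            # Keep per-layer artifacts on CPU, mirroring how auto_scale_block
            # moves the state dict off-GPU, then flush between layers.
            results.append(out.cpu())
            del out
            torch.cuda.empty_cache()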