OpenDAS / AutoAWQ

Commit ab536fb1 (unverified)
Authored Jul 04, 2023 by Jiaming Tang; committed by GitHub on Jul 04, 2023

Merge pull request #22 from abhinavkulkarni/dev/more_models

Parents: 8e7e9ccc, 6371c3a0

Showing 4 changed files with 46 additions and 25 deletions (+46, -25):
awq/entry.py                  +29  -24
awq/quantize/auto_clip.py      +4   -0
awq/quantize/auto_scale.py     +9   -1
awq/quantize/quantizer.py      +4   -0
awq/entry.py
 from lm_eval import evaluator, tasks
-from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, AutoModelForSeq2SeqLM
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
 import torch
 import argparse
 import os
 import json
-from accelerate import init_empty_weights, load_checkpoint_and_dispatch
+from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model, load_checkpoint_and_dispatch
 from awq.utils.parallel import auto_parallel
 from awq.quantize.pre_quant import run_awq, apply_awq
 from awq.quantize.quantizer import pseudo_quantize_model_weight, real_quantize_model_weight
@@ -20,6 +20,12 @@ parser.add_argument('--num_fewshot', type=int, default=0)
 # model config
 parser.add_argument('--parallel', action='store_true',
                     help="enable model parallelism")
+# max memory to offload larger models to CPU
+parser.add_argument('--max_memory', type=str, nargs='*',
+                    help="List of device_id:max_memory pairs to be parsed into a dictionary; " \
+                        + "Example: 0:10GiB 1:10GiB cpu:30GiB; " \
+                        + "mode details here: " \
+                        + "https://huggingface.co/docs/accelerate/usage_guides/big_modeling")
 parser.add_argument('--auto_parallel', action='store_true',
                     help="automatically set parallel and batch_size")
 # quantization config
@@ -43,6 +49,9 @@ parser.add_argument('--load_awq', type=str, default=None,
                     help="load the awq search results")
 args = parser.parse_args()
+max_memory = [v.split(':') for v in (args.max_memory or [])]
+max_memory = {(int(k) if k.isdigit() else k): v for k, v in max_memory}
+
 if args.auto_parallel:
     gpu_list = auto_parallel(args)
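For reference, here is a small, self-contained sketch of what the two parsing lines added above produce; the input list simply mirrors the example given in the --max_memory help string:

# Illustrative only: same parsing as the two added lines, with a hard-coded input.
args_max_memory = ["0:10GiB", "1:10GiB", "cpu:30GiB"]

pairs = [v.split(':') for v in (args_max_memory or [])]
max_memory = {(int(k) if k.isdigit() else k): v for k, v in pairs}

print(max_memory)  # {0: '10GiB', 1: '10GiB', 'cpu': '30GiB'}

GPU ids become integer keys, while "cpu" stays a string, which is the key convention Accelerate expects for its max_memory argument.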
@@ -69,7 +78,6 @@ def build_model_and_enc(model_path):
     enc = AutoTokenizer.from_pretrained(model_path, use_fast=False)
     if args.load_quant:  # directly load quantized weights
-        # no need to really load the fp16 weights... just to get the model structure
         print("Loading pre-computed quantized weights...")
         with init_empty_weights():
             model = AutoModelForCausalLM.from_pretrained(model_path, config=config,
@@ -84,21 +92,14 @@ def build_model_and_enc(model_path):
             )
     else:  # fp16 to quantized
         args.run_awq &= not args.load_awq  # if load_awq, no need to run awq
+        # Init model on CPU:
+        kwargs = {"torch_dtype": torch.float16, "low_cpu_mem_usage": True}
+        model = AutoModelForCausalLM.from_pretrained(model_path, config=config, trust_remote_code=True, **kwargs)
         if args.run_awq:
             assert args.dump_awq, "Please save the awq results with --dump_awq"
-            # Init model on CPU
-            def skip(*args, **kwargs):
-                pass
-            torch.nn.init.kaiming_normal_ = skip
-            torch.nn.init.kaiming_uniform_ = skip
-            torch.nn.init.uniform_ = skip
-            torch.nn.init.normal_ = skip
-            model = AutoModelForCausalLM.from_pretrained(model_path, config=config, trust_remote_code=True, torch_dtype=torch.float16)
             awq_results = run_awq(
                 model, enc,
                 w_bit=args.w_bit, q_config=q_config,
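The deleted skip-init trick (monkey-patching torch.nn.init) served the same purpose as the low_cpu_mem_usage flag used in the new code: avoid paying for random weight initialization before the checkpoint is loaded. A minimal stand-alone sketch of that loading style follows; the checkpoint name is only an illustrative placeholder, not something this script uses:

import torch
from transformers import AutoModelForCausalLM

# low_cpu_mem_usage=True builds the model without materializing random init weights,
# then loads the checkpoint directly as fp16 tensors on the CPU.
model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-125m",          # placeholder checkpoint for illustration
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)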
@@ -112,12 +113,6 @@ def build_model_and_enc(model_path):
             print("AWQ results saved at", args.dump_awq)
             exit(0)
-        else:
-            # Inference with fake quant
-            # Init model on GPUs:
-            kwargs = {"device_map": "balanced", "torch_dtype": torch.float16}
-            model = AutoModelForCausalLM.from_pretrained(model_path, config=config, trust_remote_code=True, **kwargs)
         if args.load_awq:
             print("Loading pre-computed AWQ results from", args.load_awq)
@@ -147,6 +142,17 @@ def build_model_and_enc(model_path):
         else:
             raise NotImplementedError
 
+    # Move the model to GPU (as much as possible) for LM evaluation
+    kwargs = {"max_memory": max_memory} if len(max_memory) else {}
+    device_map = infer_auto_device_map(
+        model,
+        # TODO: can we remove this?
+        no_split_module_classes=[
+            "OPTDecoderLayer", "LlamaDecoderLayer", "BloomBlock", "MPTBlock", "DecoderLayer"],
+        **kwargs)
+    model = dispatch_model(model, device_map=device_map)
+
     return model, enc
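The added dispatch code relies on two Accelerate helpers, infer_auto_device_map and dispatch_model. A self-contained sketch of the same pattern, with a placeholder model and memory budget (both are assumptions for illustration):

import torch
from accelerate import infer_auto_device_map, dispatch_model
from transformers import AutoModelForCausalLM

# Placeholder model; in entry.py the model comes out of build_model_and_enc().
model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-125m", torch_dtype=torch.float16, low_cpu_mem_usage=True)

# Same key convention as the parsed --max_memory dict: int GPU ids, "cpu" for host RAM.
max_memory = {0: "10GiB", "cpu": "30GiB"}

# Plan the placement, keeping each decoder block on a single device,
# then move the weights and register the forward hooks accordingly.
device_map = infer_auto_device_map(
    model,
    max_memory=max_memory,
    no_split_module_classes=["OPTDecoderLayer"],
)
model = dispatch_model(model, device_map=device_map)

Passing no_split_module_classes keeps each decoder block whole on one device, which is why the diff lists the block class names of the supported architectures.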
@@ -163,11 +169,10 @@ def main():
     # a hack here to auto set model group
     model, enc = build_model_and_enc(args.model_path)
 
+    lm_eval_model = LMEvalAdaptor(args.model_path, model, enc, args.batch_size)
     if args.tasks is not None:
         task_names = args.tasks.split(",")
-        lm_eval_model = LMEvalAdaptor(args.model_path, model, enc, args.batch_size)
         results = evaluator.simple_evaluate(
             model=lm_eval_model,
             tasks=task_names,
...
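Putting the new option together with the existing flags, a hypothetical evaluation run could be launched roughly like this (paths, task name, and bit width are placeholders; only flags visible in this diff are assumed to exist):

python -m awq.entry --model_path /PATH/TO/MODEL --tasks wikitext --w_bit 4 \
    --max_memory 0:10GiB 1:10GiB cpu:30GiB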
awq/quantize/auto_clip.py
@@ -75,9 +75,11 @@ def auto_clip_block(module,
         # due to qk bmm, it is hard to clip precisely
         if any([_ in name for _ in ["q_", "k_", "query", "key", "Wqkv"]]):
             continue
+        named_linears[name].cuda()
         max_val = auto_clip_layer(
             named_linears[name].weight, input_feat[name], n_bit=w_bit, q_config=q_config)
         clip_list.append((name, max_val))
+        named_linears[name].cpu()
     return clip_list
@@ -86,8 +88,10 @@ def apply_clip(module, clip_list):
     from ..utils.module import get_op_by_name
     for name, max_val in clip_list:
         layer = get_op_by_name(module, name)
+        layer.cuda()
         max_val = max_val.to(layer.weight.device)
         org_shape = layer.weight.shape
         layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1)
         layer.weight.data = torch.clamp(layer.weight.data, -max_val, max_val)
         layer.weight.data = layer.weight.data.reshape(org_shape)
+        layer.cpu()
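All four added lines in this file follow the same idiom: move a single linear layer to the GPU, run the computation, then move it back so that peak GPU memory stays bounded by one layer at a time. A distilled, self-contained sketch of that idiom (the workload is a stand-in, not auto_clip_layer, and a CUDA device is assumed, as in the original code):

import torch
import torch.nn as nn

def process_one_layer_at_a_time(named_linears, work_fn):
    # Keep at most one layer resident on the GPU at any moment.
    results = {}
    for name, linear in named_linears.items():
        linear.cuda()                       # move just this layer to the GPU
        results[name] = work_fn(linear)     # e.g. search clipping thresholds
        linear.cpu()                        # send it back to free GPU memory
    return results

# Usage sketch with a stand-in workload: max |weight| per output channel.
layers = {"fc1": nn.Linear(16, 32), "fc2": nn.Linear(32, 8)}
stats = process_one_layer_at_a_time(
    layers, lambda m: m.weight.detach().abs().amax(dim=1).cpu())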
awq/quantize/auto_scale.py
@@ -321,6 +321,10 @@ def apply_scale(module, scales_list, input_feat_dict=None):
         prev_op = get_op_by_name(module, prev_op_name)
         layers = [get_op_by_name(module, name) for name in layer_names]
 
+        prev_op.cuda()
+        for layer in layers:
+            layer.cuda()
+
         if isinstance(prev_op, nn.Linear):
             assert len(layers) == 1
             scale_fc_fc(prev_op, layers[0], scales)
@@ -339,3 +343,7 @@ def apply_scale(module, scales_list, input_feat_dict=None):
             for layer_name in layer_names:
                 inp = input_feat_dict[layer_name]
                 inp.div_(scales.view(1, -1).to(inp.device))
+
+        prev_op.cpu()
+        for layer in layers:
+            layer.cpu()
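For context on the unchanged inp.div_ line that the new cuda()/cpu() calls now bracket: it divides the cached input features in place by one scale per channel, broadcasting across the token dimension. A tiny illustration with assumed shapes:

import torch

inp = torch.randn(4, 8)        # assumed: 4 cached tokens, 8 input channels
scales = torch.rand(8) + 0.5   # assumed: one positive scale per channel

# view(1, -1) reshapes the scales to [1, 8] so the in-place divide
# broadcasts over the first (token) dimension of inp.
inp.div_(scales.view(1, -1).to(inp.device))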
awq/quantize/quantizer.py
@@ -98,7 +98,9 @@ def pseudo_quantize_model_weight(
     for i in tqdm(range(len(layers)), desc="pseudo weight quantization..."):
         named_linears = get_named_linears(layers[i])
         for n, m in named_linears.items():
+            m.cuda()
             m.weight.data = pseudo_quantize_tensor(m.weight.data, n_bit=w_bit, **q_config)
+            m.cpu()
 
 
 @torch.no_grad()
@@ -121,11 +123,13 @@ def real_quantize_model_weight(
                 q_linear = WQLinear.from_linear(
                     module, w_bit, q_config['q_group_size'], True)
             else:
+                module.cuda()
                 module.weight.data, scales, zeros = pseudo_quantize_tensor(module.weight.data, n_bit=w_bit, get_scale_zp=True, **q_config)
                 scales = scales.t().contiguous()
                 zeros = zeros.t().contiguous()
                 q_linear = WQLinear.from_linear(
                     module, w_bit, q_config['q_group_size'], False, scales, zeros)
+                module.cpu()
             set_op_by_name(layer, name, q_linear)
             torch.cuda.empty_cache()
             gc.collect()