OpenDAS / AutoAWQ · Commits

Commit 80996d1d, authored Sep 20, 2023 by Casper Hansen

    Retire entry.py and create examples/eval.py

Parent: d3550fec
Showing 2 changed files, with 56 additions and 191 deletions:

    awq/entry.py      +0   -191
    examples/eval.py  +56  -0
awq/entry.py (deleted, 100644 → 0)
import os
import time
import torch
import argparse
from lm_eval import evaluator
from awq import AutoAWQForCausalLM
from awq.quantize.auto_clip import apply_clip
from awq.quantize.auto_scale import apply_scale
from awq.utils.lm_eval_adaptor import LMEvalAdaptor
from transformers import AutoTokenizer, GenerationConfig
def load_search_result_into_memory(model, search_path):
    awq_results = torch.load(search_path, map_location="cpu")

    apply_scale(model, awq_results["scale"])
    apply_clip(model, awq_results["clip"])
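
# The checkpoint loaded above is the artifact produced by the search step
# (run_search below): "scale" holds the activation-aware scaling factors
# found on calibration data and "clip" the weight-clipping ranges. Applying
# both restores the post-search model state without re-running calibration.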
def run_search(model_path, dump_path, quant_config):
    """
    Step 1/2: Search the pile for an optimal scaling factor.
    """
    # Load model
    model = AutoAWQForCausalLM.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    # Quantize
    model.quantize(tokenizer, quant_config=quant_config, run_search=True, run_quant=False)

    # Save search results
    model.save_quantized(dump_path)

    # Save tokenizer
    tokenizer.save_pretrained(dump_path)
def run_quant(model_path, search_path, dump_path, quant_config):
    """
    Step 2/2: Use the search results to quantize model weights
    """
    # Load model and search results
    model = AutoAWQForCausalLM.from_pretrained(model_path)
    load_search_result_into_memory(model.model, search_path)

    # Run actual weight quantization
    model.quantize(quant_config=quant_config, run_search=False, run_quant=True)

    # Save quantized model
    model.save_quantized(dump_path)
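
# A minimal sketch of chaining the two steps programmatically instead of via
# the CLI defined below. Paths mirror the CLI examples in __main__; the
# search step is assumed to write awq_model_search_result.pt into its dump
# directory, as those examples imply:
#
#   quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4}
#   run_search("lmsys/vicuna-7b-v1.5", "vicuna-7b-v1.5-awq", quant_config)
#   run_quant("lmsys/vicuna-7b-v1.5",
#             "vicuna-7b-v1.5-awq/awq_model_search_result.pt",
#             "vicuna-7b-v1.5-awq", quant_config)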
def run_eval(model_path, quant_file, device, tasks, task_batch_size, task_n_shot, task_use_pretrained):
    """
    Post quantization: Evaluate perplexity on wikitext with EleutherAI Evaluation Harness
    """
    # Load model
    if task_use_pretrained:
        model = AutoAWQForCausalLM.from_pretrained(model_path)
    else:
        model = AutoAWQForCausalLM.from_quantized(model_path, quant_file)

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    # Load adapter
    lm_eval_model = LMEvalAdaptor(model_path, model, tokenizer, device, batch_size=task_batch_size)

    # Evaluate perplexity of quantized model
    results = evaluator.simple_evaluate(
        model=lm_eval_model,
        tasks=tasks.split(','),
        batch_size=task_batch_size,
        no_cache=True,
        num_fewshot=task_n_shot,
    )

    print(evaluator.make_table(results))
@torch.inference_mode()
def run_speed(model_path, quant_file, device, n_generate=128, n_context=256, batch_size=1, disable_fused_layers=False):
    def _timer(func):
        start = time.time()
        out = func()
        return out, time.time() - start

    def _warmup(device: str):
        warm_up = torch.randn((4096, 4096)).to(device)
        torch.mm(warm_up, warm_up)

    if quant_file:
        fuse_layers = False if disable_fused_layers else True
        model, load_time = _timer(lambda: AutoAWQForCausalLM.from_quantized(model_path, quant_file, fuse_layers=fuse_layers))
    else:
        model, load_time = _timer(lambda: AutoAWQForCausalLM.from_pretrained(model_path))

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    _warmup(device)

    # Generate random inputs
    n_context = n_context - n_generate
    ids = torch.randint(0, tokenizer.vocab_size, (batch_size, n_context)).cuda()

    # Context stage
    _, context_time = _timer(lambda: model.generate(
        ids,
        generation_config=GenerationConfig(
            max_new_tokens=0,
            min_new_tokens=0,
            use_cache=True
        )
    ))

    # Generation stage
    _, generation_time = _timer(lambda: model.generate(
        ids,
        generation_config=GenerationConfig(
            max_new_tokens=n_context,
            min_new_tokens=n_context,
            forced_eos_token_id=-100,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=-100,
            use_cache=True
        )
    ))

    # Prints
    memory_used = torch.cuda.max_memory_allocated(device) / (1024 ** 2)
    context_tokens_per_second = n_context / context_time * batch_size
    context_ms_per_token = (context_time * 1000) / n_context / batch_size
    inference_tokens_per_second = n_generate / generation_time * batch_size
    inference_ms_per_token = (generation_time * 1000) / n_generate / batch_size

    print(f"[=] Model summary: {model_path} [=]")
    print(f"[*] Load time: {load_time:.2f} seconds")
    print(f"[*] Context speed: {context_tokens_per_second:.2f} tokens/second ({context_ms_per_token:.2f} ms/token)")
    print(f"[*] Generation speed: {inference_tokens_per_second:.2f} tokens/second ({inference_ms_per_token:.2f} ms/token)")
    print(f"[*] VRAM: {memory_used:.2f} MB")
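
# Timing methodology: the "context stage" generates zero new tokens, so it
# times prefill over the random prompt alone; the "generation stage" forces a
# fixed number of new tokens (early stopping is disabled via the -100
# sentinel eos ids) so decode throughput is measured over a known count.
# With the defaults (n_context=256, n_generate=128), the forced length
# equals n_generate, which is what the tokens/second math above divides by.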
if __name__ == '__main__':
    """
    - Run AWQ search and save result:
    python -m awq.entry --entry_type search --model_path lmsys/vicuna-7b-v1.5 --search_path vicuna-7b-v1.5-awq

    - Run AWQ to save the real quantized weights at the quant_path:
    python -m awq.entry --entry_type quant --model_path lmsys/vicuna-7b-v1.5 --search_path vicuna-7b-v1.5-awq/awq_model_search_result.pt --quant_path vicuna-7b-v1.5-awq

    - Run perplexity of quantized model:
    python -m awq.entry --entry_type eval --model_path vicuna-7b-v1.5-awq --quant_file awq_model_w4_g128.pt

    - Run perplexity of unquantized FP16 model:
    python -m awq.entry --entry_type eval --model_path lmsys/vicuna-7b-v1.5 --task_use_pretrained

    - Run a speedtest to benchmark the quantized model:
    python -m awq.entry --entry_type speed --model_path vicuna-7b-v1.5-awq --quant_file awq_model_w4_g128.pt --n_generate 128 --n_context 256

    - Run a speedtest to benchmark the unquantized FP16 model:
    python -m awq.entry --entry_type speed --model_path lmsys/vicuna-7b-v1.5 --n_generate 128 --n_context 256
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--entry_type', type=str, help='The type of task to run (search|quant|eval|speed)')
    parser.add_argument('--model_path', type=str, help='Path to hf model')
    parser.add_argument('--search_path', type=str, help='Path to save/load AWQ search results')
    parser.add_argument('--quant_path', type=str, help='Path to save AWQ model to directory')
    parser.add_argument('--quant_file', type=str, help='Path to quantized AWQ model file')
    parser.add_argument('--device', type=str, default='cuda:0', help='Device to load model to')
    parser.add_argument('--w_bit', type=int, default=4)
    parser.add_argument('--q_group_size', type=int, default=128)
    parser.add_argument('--tasks', type=str, default='wikitext',
                        help='Tasks to evaluate. '
                             'Separate tasks by comma for multiple tasks. '
                             'https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md')
    parser.add_argument("--task_use_pretrained", default=False, action='store_true',
                        help="Pass '--task_use_pretrained' to use a pretrained model running FP16")
    parser.add_argument('--task_batch_size', type=int, default=1)
    parser.add_argument('--task_n_shot', type=int, default=0)
    parser.add_argument('--n_generate', type=int, default=128)
    parser.add_argument('--n_context', type=int, default=256)
    parser.add_argument('--batch_size', type=int, default=1)
    parser.add_argument("--disable_fused_layers", default=False, action='store_true',
                        help="Pass '--disable_fused_layers' to disable fused layers")
    args = parser.parse_args()

    quant_config = {"zero_point": True, "q_group_size": args.q_group_size, "w_bit": args.w_bit}

    if args.entry_type == 'search':
        run_search(args.model_path, args.search_path, quant_config)
    elif args.entry_type == 'quant':
        run_quant(args.model_path, args.search_path, args.quant_path, quant_config)
    elif args.entry_type == 'eval':
        run_eval(args.model_path, args.quant_file, args.device, args.tasks,
                 args.task_batch_size, args.task_n_shot, args.task_use_pretrained)
    elif args.entry_type == 'speed':
        run_speed(args.model_path, args.quant_file, args.device, args.n_generate,
                  args.n_context, args.batch_size, args.disable_fused_layers)
    else:
        raise Exception('--entry_type must be one of (search|quant|eval|speed)')
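
Of the four entry types retired with awq/entry.py, only eval gets a direct replacement in this commit. The old in-tree invocation

    python -m awq.entry --entry_type eval --model_path vicuna-7b-v1.5-awq --quant_file awq_model_w4_g128.pt

maps one-for-one onto the standalone script added below:

    python examples/eval.py --model_path vicuna-7b-v1.5-awq --quant_file awq_model_w4_g128.pt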
examples/eval.py (new file, 0 → 100644)
import argparse
from lm_eval import evaluator
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
from awq.utils.lm_eval_adaptor import LMEvalAdaptor
def run_eval(model_path, quant_file, device, tasks, task_batch_size, task_n_shot, task_use_pretrained):
    """
    Post quantization: Evaluate perplexity on wikitext with EleutherAI Evaluation Harness
    """
    # Load model
    if task_use_pretrained:
        model = AutoAWQForCausalLM.from_pretrained(model_path)
    else:
        model = AutoAWQForCausalLM.from_quantized(model_path, quant_file)

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    # Load adapter
    lm_eval_model = LMEvalAdaptor(model_path, model, tokenizer, device, batch_size=task_batch_size)

    # Evaluate perplexity of quantized model
    results = evaluator.simple_evaluate(
        model=lm_eval_model,
        tasks=tasks.split(','),
        batch_size=task_batch_size,
        no_cache=True,
        num_fewshot=task_n_shot,
    )

    print(evaluator.make_table(results))
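
# Note: `tasks` is a comma-separated string (e.g. "wikitext,lambada_openai")
# that run_eval splits before handing it to the harness. The simple_evaluate
# keywords used here (notably no_cache=True) follow the older lm_eval API
# this code was written against; later harness releases dropped no_cache.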
if __name__ == '__main__':
    """
    - Run perplexity of quantized model:
    python examples/eval.py --model_path vicuna-7b-v1.5-awq --quant_file awq_model_w4_g128.pt

    - Run perplexity of unquantized FP16 model:
    python examples/eval.py --use_pretrained --model_path lmsys/vicuna-7b-v1.5
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path', type=str, help='Path to hf model')
    parser.add_argument('--quant_file', default='', type=str, help='Path to quantized AWQ model file')
    parser.add_argument('--device', type=str, default='cuda:0', help='Device to load model to')
    parser.add_argument("--use_pretrained", default=False, action='store_true',
                        help="Pass '--use_pretrained' to use a pretrained model running FP16")
    parser.add_argument('--tasks', type=str, default='wikitext',
                        help='Tasks to evaluate. '
                             'Separate tasks by comma for multiple tasks. '
                             'https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md')
    parser.add_argument('--batch_size', type=int, default=1)
    parser.add_argument('--n_shot', type=int, default=0)
    args = parser.parse_args()

    run_eval(args.model_path, args.quant_file, args.device, args.tasks,
             args.batch_size, args.n_shot, args.use_pretrained)
\ No newline at end of file
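
For completeness, a minimal sketch of driving the new script programmatically rather than from the shell. The import path is hypothetical (it assumes examples/ is on sys.path so the module is importable as `eval`); the keyword arguments mirror run_eval's signature above:

    # Hypothetical import: assumes examples/ is on sys.path.
    from eval import run_eval

    run_eval(
        model_path="vicuna-7b-v1.5-awq",
        quant_file="awq_model_w4_g128.pt",
        device="cuda:0",
        tasks="wikitext",
        task_batch_size=1,
        task_n_shot=0,
        task_use_pretrained=False,
    )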