OpenDAS / AutoGPTQ · Commits · 6dc028e2

Commit 6dc028e2, authored Nov 28, 2024 by yangql
Upload New File
Parent: 78397def

Showing 1 changed file with 217 additions and 0 deletions.

quant_with_alpaca.py · new file (0 → 100644) · +217 −0
import json
import random
import time
from argparse import ArgumentParser

import torch
from datasets import Dataset
from transformers import AutoTokenizer, TextGenerationPipeline

from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

def load_data(data_path, tokenizer, n_samples):
    """Load Alpaca-style records and tokenize them into calibration examples."""
    with open(data_path, "r", encoding="utf-8") as f:
        raw_data = json.load(f)

    # Draw a random subset of at most n_samples records for calibration.
    raw_data = random.sample(raw_data, k=min(n_samples, len(raw_data)))

    def dummy_gen():
        return raw_data

    def tokenize(examples):
        instructions = examples["instruction"]
        inputs = examples["input"]
        outputs = examples["output"]

        prompts = []
        texts = []
        input_ids = []
        attention_mask = []
        for istr, inp, opt in zip(instructions, inputs, outputs):
            if inp:
                prompt = f"Instruction:\n{istr}\nInput:\n{inp}\nOutput:\n"
            else:
                prompt = f"Instruction:\n{istr}\nOutput:\n"
            text = prompt + opt
            # Skip records whose prompt alone already exceeds the model's context window.
            if len(tokenizer(prompt)["input_ids"]) >= tokenizer.model_max_length:
                continue

            tokenized_data = tokenizer(text)

            input_ids.append(tokenized_data["input_ids"][: tokenizer.model_max_length])
            attention_mask.append(tokenized_data["attention_mask"][: tokenizer.model_max_length])
            prompts.append(prompt)
            texts.append(text)

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "prompt": prompts,
        }

    dataset = Dataset.from_generator(dummy_gen)

    dataset = dataset.map(
        tokenize,
        batched=True,
        batch_size=len(dataset),
        num_proc=1,
        keep_in_memory=True,
        load_from_cache_file=False,
        remove_columns=["instruction", "input"],  # "output" is kept for the generation check below
    )

    dataset = dataset.to_list()

    for sample in dataset:
        sample["input_ids"] = torch.LongTensor(sample["input_ids"])
        sample["attention_mask"] = torch.LongTensor(sample["attention_mask"])

    return dataset

def main():
    parser = ArgumentParser()
    parser.add_argument("--pretrained_model_dir", type=str)
    parser.add_argument(
        "--quantized_model_dir",
        type=str,
        default="./Qwen1.5-32B-Chat-gptq-4-test10.24",
    )
    parser.add_argument("--bits", type=int, default=4, choices=[2, 3, 4, 8])
    parser.add_argument(
        "--group_size",
        type=int,
        default=128,
        help="group size, -1 means no grouping or full rank",
    )
    # NOTE: action="store_true" combined with default=True means this flag is
    # always True, whether or not it is passed on the command line.
    parser.add_argument(
        "--desc_act",
        action="store_true",
        default=True,
        help="whether to quantize with desc_act",
    )
    parser.add_argument(
        "--num_samples",
        type=int,
        default=128,
        help="how many samples will be used to quantize model",
    )
    # NOTE: same caveat as --desc_act: default=True makes this flag always True.
    parser.add_argument(
        "--save_and_reload",
        action="store_true",
        default=True,
        help="whether to save the quantized model to disk and reload it back",
    )
    parser.add_argument("--fast_tokenizer", action="store_true", help="whether to use the fast tokenizer")
    parser.add_argument(
        "--use_triton",
        action="store_true",
        help="whether to use triton to speed up inference",
    )
    parser.add_argument(
        "--per_gpu_max_memory",
        type=int,
        default=None,
        help="max memory used to load the model per GPU",
    )
    parser.add_argument(
        "--cpu_max_memory",
        type=int,
        default=None,
        help="max memory used to offload the model to CPU",
    )
    parser.add_argument(
        "--quant_batch_size",
        type=int,
        default=1,
        help="examples batch size for quantization",
    )
    parser.add_argument(
        "--trust_remote_code",
        action="store_true",
        help="whether to trust remote code when loading model",
    )
    args = parser.parse_args()

    # Build a device -> memory-cap map for sharded loading; fall back to None
    # (no explicit cap) when no limits are given.
    max_memory = {}
    if args.per_gpu_max_memory is not None and args.per_gpu_max_memory > 0:
        if torch.cuda.is_available():
            max_memory.update(
                {i: f"{args.per_gpu_max_memory}GIB" for i in range(torch.cuda.device_count())}
            )
    if args.cpu_max_memory is not None and args.cpu_max_memory > 0 and max_memory:
        max_memory["cpu"] = f"{args.cpu_max_memory}GIB"
    if not max_memory:
        max_memory = None

    tokenizer = AutoTokenizer.from_pretrained(
        args.pretrained_model_dir,
        use_fast=args.fast_tokenizer,
        trust_remote_code=args.trust_remote_code,
    )
    model = AutoGPTQForCausalLM.from_pretrained(
        args.pretrained_model_dir,
        quantize_config=BaseQuantizeConfig(bits=args.bits, group_size=args.group_size, desc_act=args.desc_act),
        max_memory=max_memory,
        trust_remote_code=args.trust_remote_code,
    )

    examples = load_data("dataset/alpaca_data_cleaned.json", tokenizer, args.num_samples)
    examples_for_quant = [
        {"input_ids": example["input_ids"], "attention_mask": example["attention_mask"]}
        for example in examples
    ]

    start = time.time()
    model.quantize(
        examples_for_quant,
        batch_size=args.quant_batch_size,
        use_triton=args.use_triton,
        autotune_warmup_after_quantized=args.use_triton,
    )
    end = time.time()
    print(f"quantization took: {end - start: .4f}s")

    if not args.quantized_model_dir:
        args.quantized_model_dir = args.pretrained_model_dir

    if args.save_and_reload:
        model.save_quantized(args.quantized_model_dir)

        # Free the in-memory model before reloading the quantized copy.
        del model
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        model = AutoGPTQForCausalLM.from_quantized(
            args.quantized_model_dir,
            device="cuda:0",
            use_triton=args.use_triton,
            max_memory=max_memory,
            inject_fused_mlp=True,
            inject_fused_attention=True,
            trust_remote_code=args.trust_remote_code,
        )

    pipeline_init_kwargs = {"model": model, "tokenizer": tokenizer}
    if not max_memory:
        pipeline_init_kwargs["device"] = "cuda:0"
    pipeline = TextGenerationPipeline(**pipeline_init_kwargs)

    # Sanity-check generation on a few random calibration samples.
    for example in random.sample(examples, k=min(4, len(examples))):
        print(f"prompt: {example['prompt']}")
        print("-" * 42)
        print(f"golden: {example['output']}")
        print("-" * 42)
        start = time.time()
        generated_text = pipeline(
            example["prompt"],
            return_full_text=False,
            num_beams=1,
            max_length=len(example["input_ids"]) + 128,
            # use max_length instead of max_new_tokens to silence a UserWarning when integrated with logging
        )[0]["generated_text"]
        end = time.time()
        print(f"quant: {generated_text}")
        num_new_tokens = len(tokenizer(generated_text)["input_ids"])
        print(f"generate {num_new_tokens} tokens using {end - start: .4f}s, {num_new_tokens / (end - start)} tokens/s.")
        print("=" * 42)

if __name__ == "__main__":
    import logging

    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
        level=logging.INFO,
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    main()
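
For reference, the script expects dataset/alpaca_data_cleaned.json to be a JSON array of Alpaca-style records carrying the three fields load_data reads: "instruction", "input", and "output". A minimal sketch of one such record, with invented field values for illustration only:

# Hypothetical record illustrating the shape load_data expects; the strings
# below are invented examples, not entries from alpaca_data_cleaned.json.
sample_record = {
    "instruction": "Summarize the following paragraph.",
    "input": "AutoGPTQ applies post-training GPTQ quantization to language models.",
    "output": "AutoGPTQ quantizes language models after training.",
}
# load_data renders each record into the calibration prompt
#   "Instruction:\n{instruction}\nInput:\n{input}\nOutput:\n"
# (the "Input:" block is omitted when the "input" field is empty)
# and appends "output" to form the full tokenized text.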
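A typical invocation, assuming the dependencies (torch, datasets, transformers, auto_gptq) are installed and the calibration file is in place, might look like: python quant_with_alpaca.py --pretrained_model_dir /path/to/model --bits 4 --group_size 128 --use_triton. The model path here is a hypothetical placeholder; every flag shown is defined by the script's argument parser above. Note that --desc_act and --save_and_reload are effectively always on, since their parser defaults are True.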