Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7d4e1b85
Unverified
Commit
7d4e1b85
authored
Apr 02, 2024
by
Qubitium
Committed by
GitHub
Apr 01, 2024
Browse files
[Misc] Add support for new autogptq checkpoint_format (#3689)
Co-authored-by:
Robert Shaw
<
rshaw@neuralmagic.com
>
parent
93deb0b3
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
83 additions
and
13 deletions
+83
-13
tests/quantization/test_autogptq_marlin_configs.py
tests/quantization/test_autogptq_marlin_configs.py
+68
-0
vllm/config.py
vllm/config.py
+15
-13
No files found.
tests/quantization/test_autogptq_marlin_configs.py
0 → 100644
View file @
7d4e1b85
"""Tests whether Marlin models can be loaded from the autogptq config.
Run `pytest tests/quantization/test_autogptq_marlin_configs.py --forked`.
"""
from
dataclasses
import
dataclass
import
pytest
from
vllm.config
import
ModelConfig
@
dataclass
class
ModelPair
:
model_marlin
:
str
model_gptq
:
str
# Model Id // Expected Kernel
MODELS_QUANT_TYPE
=
[
# compat: autogptq <=0.7.1 is_marlin_format: bool
(
"neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin"
,
"marlin"
),
(
"TheBloke/Llama-2-7B-Chat-GPTQ"
,
"gptq"
),
# compat: autogptq >=0.8.0 use checkpoint_format: str
(
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit"
,
"marlin"
),
(
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
,
"gptq"
)
]
@
pytest
.
mark
.
parametrize
(
"model_quant_type"
,
MODELS_QUANT_TYPE
)
def
test_auto_gptq
(
model_quant_type
:
str
,
)
->
None
:
model_path
,
quant_type
=
model_quant_type
model_config_no_quant_arg
=
ModelConfig
(
model_path
,
model_path
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
False
,
download_dir
=
None
,
load_format
=
"dummy"
,
seed
=
0
,
dtype
=
"float16"
,
revision
=
None
,
quantization
=
None
# case 1
)
model_config_quant_arg
=
ModelConfig
(
model_path
,
model_path
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
False
,
download_dir
=
None
,
load_format
=
"dummy"
,
seed
=
0
,
dtype
=
"float16"
,
revision
=
None
,
quantization
=
"gptq"
# case 2
)
assert
model_config_no_quant_arg
.
quantization
==
quant_type
,
(
f
"Expected quant_type ==
{
quant_type
}
for
{
model_path
}
, "
f
"but found
{
model_config_no_quant_arg
.
quantization
}
"
"for no --quantization None case"
)
assert
model_config_quant_arg
.
quantization
==
quant_type
,
(
f
"Expected quant_type ==
{
quant_type
}
for
{
model_path
}
, "
f
"but found
{
model_config_quant_arg
.
quantization
}
"
"for --quantization gptq case"
)
vllm/config.py
View file @
7d4e1b85
...
...
@@ -171,26 +171,28 @@ class ModelConfig:
self
.
quantization
=
self
.
quantization
.
lower
()
# Parse quantization method from the HF model config, if available.
hf_quant_config
=
getattr
(
self
.
hf_config
,
"quantization_config"
,
None
)
if
hf_quant_config
is
not
None
:
hf_quant_method
=
str
(
hf_quant_config
[
"quant_method"
]).
lower
()
# If the GPTQ model is serialized in marlin format, use marlin.
if
(
hf_quant_method
==
"gptq"
and
"is_marlin_format"
in
hf_quant_config
and
hf_quant_config
[
"is_marlin_format"
]):
quant_cfg
=
getattr
(
self
.
hf_config
,
"quantization_config"
,
None
)
if
quant_cfg
is
not
None
:
quant_method
=
quant_cfg
.
get
(
"quant_method"
,
""
).
lower
()
# compat: autogptq >=0.8.0 use checkpoint_format: str
# compat: autogptq <=0.7.1 is_marlin_format: bool
is_format_marlin
=
(
quant_cfg
.
get
(
"checkpoint_format"
)
==
"marlin"
or
quant_cfg
.
get
(
"is_marlin_format"
,
False
))
# Use marlin if the GPTQ model is serialized in marlin format.
if
quant_method
==
"gptq"
and
is_format_marlin
:
logger
.
info
(
"The model is serialized in Marlin format. "
"Using Marlin kernel."
)
hf_
quant_method
=
"marlin"
quant_method
=
"marlin"
if
self
.
quantization
==
"gptq"
:
self
.
quantization
=
hf_
quant_method
self
.
quantization
=
quant_method
if
self
.
quantization
is
None
:
self
.
quantization
=
hf_
quant_method
elif
self
.
quantization
!=
hf_
quant_method
:
self
.
quantization
=
quant_method
elif
self
.
quantization
!=
quant_method
:
raise
ValueError
(
"Quantization method specified in the model config "
f
"(
{
hf_
quant_method
}
) does not match the quantization "
f
"(
{
quant_method
}
) does not match the quantization "
f
"method specified in the `quantization` argument "
f
"(
{
self
.
quantization
}
)."
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment