Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f192aeba
Unverified
Commit
f192aeba
authored
Nov 09, 2024
by
bnellnm
Committed by
GitHub
Nov 09, 2024
Browse files
[Bugfix] Enable some fp8 and quantized fullgraph tests (#10171)
Signed-off-by:
Bill Nell
<
bill@neuralmagic.com
>
parent
8e1529dc
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
13 additions
and
16 deletions
+13
-16
tests/compile/utils.py
tests/compile/utils.py
+13
-16
No files found.
tests/compile/utils.py
View file @
f192aeba
...
...
@@ -9,29 +9,26 @@ from vllm.platforms import current_platform
TEST_MODELS
=
[
(
"facebook/opt-125m"
,
{}),
# TODO: add fake implementation for compressed-tensors
# ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
# "dtype": torch.float16,
# "quantization": "compressed-tensors"
# }),
(
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
,
{
"dtype"
:
torch
.
float16
,
"quantization"
:
"compressed-tensors"
}),
(
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
,
{
"dtype"
:
torch
.
float16
,
"quantization"
:
"fp8"
}),
# TODO: add fake implementation for compressed-tensors
# ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", {
# "quantization": "compressed-tensors"
# }),
(
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples"
,
{
"quantization"
:
"compressed-tensors"
}),
(
"meta-llama/Meta-Llama-3-8B"
,
{}),
]
# TODO: enable in pytorch 2.5
if
False
and
is_quant_method_supported
(
"aqlm"
):
# noqa: SIM223
if
is_quant_method_supported
(
"aqlm"
):
TEST_MODELS
.
append
((
"ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"
,
{
"quantization"
:
"aqlm"
}))
# TODO:
enable in pytorch 2.5
# TODO:
figure out why this fails.
if
False
and
is_quant_method_supported
(
"gguf"
):
# noqa: SIM223
TEST_MODELS
.
append
((
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
,
{
"quantization"
:
"gguf"
...
...
@@ -71,13 +68,13 @@ def check_full_graph_support(model,
os
.
environ
[
"VLLM_TORCH_COMPILE_LEVEL"
]
=
str
(
optimization_level
)
os
.
environ
[
"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"
]
=
"1"
# Inductor doesn't support fp8 and the base meta llama uses too
# much memory.
quantization
=
model_kwargs
.
get
(
"quantization"
)
if
((
quantization
==
"fp8"
or
model
==
"meta-llama/Meta-Llama-3-8B"
)
# The base meta llama uses too much memory.
if
(
model
==
"meta-llama/Meta-Llama-3-8B"
and
optimization_level
>=
CompilationLevel
.
PIECEWISE
):
return
print
(
f
"MODEL=
{
model
}
"
)
prompts
=
[
"Hello, my name is"
,
"The president of the United States is"
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment