Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
01c977e9
Unverified
Commit
01c977e9
authored
Oct 16, 2025
by
Michael Goin
Committed by
GitHub
Oct 16, 2025
Browse files
[CI] Prune Quantization Tests and skip compilation (#27038)
Signed-off-by:
mgoin
<
mgoin64@gmail.com
>
parent
b3dda72c
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
62 additions
and
134 deletions
+62
-134
tests/quantization/test_auto_round.py
tests/quantization/test_auto_round.py
+1
-1
tests/quantization/test_compressed_tensors.py
tests/quantization/test_compressed_tensors.py
+29
-76
tests/quantization/test_cpu_offload.py
tests/quantization/test_cpu_offload.py
+0
-35
tests/quantization/test_fp8.py
tests/quantization/test_fp8.py
+0
-3
tests/quantization/test_gptq_dynamic.py
tests/quantization/test_gptq_dynamic.py
+3
-1
tests/quantization/test_lm_head.py
tests/quantization/test_lm_head.py
+3
-1
tests/quantization/test_quark.py
tests/quantization/test_quark.py
+9
-6
tests/quantization/test_rtn.py
tests/quantization/test_rtn.py
+3
-2
tests/quantization/test_torchao.py
tests/quantization/test_torchao.py
+14
-9
No files found.
tests/quantization/test_auto_round.py
View file @
01c977e9
...
...
@@ -26,7 +26,7 @@ MODELS = [
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
def
test_auto_round
(
vllm_runner
,
model
):
with
vllm_runner
(
model
)
as
llm
:
with
vllm_runner
(
model
,
enforce_eager
=
True
)
as
llm
:
output
=
llm
.
generate_greedy
([
"The capital of France is"
],
max_tokens
=
8
)
assert
output
print
(
f
"
{
output
[
0
][
1
]
}
"
)
tests/quantization/test_compressed_tensors.py
View file @
01c977e9
...
...
@@ -66,13 +66,6 @@ def enable_pickle(monkeypatch):
2560
,
True
,
),
(
"nm-testing/tinyllama-oneshot-w8-channel-a8-tensor"
,
"channel"
,
QuantizationType
.
INT
,
2560
,
True
,
),
(
"nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama"
,
"tensor"
,
...
...
@@ -138,7 +131,7 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
llm
.
apply_model
(
check_model
)
output
=
llm
.
generate_greedy
([
"Hello my name is"
],
max_tokens
=
20
)
output
=
llm
.
generate_greedy
([
"Hello my name is"
],
max_tokens
=
4
)
assert
output
...
...
@@ -146,12 +139,9 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
"model_path"
,
[
"neuralmagic/Llama-3.2-1B-quantized.w8a8"
,
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym"
,
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym"
,
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym"
,
],
)
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
8
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
10
])
@
pytest
.
mark
.
parametrize
(
"use_aiter"
,
[
True
,
False
]
if
current_platform
.
is_rocm
()
else
[
False
]
...
...
@@ -211,7 +201,7 @@ def test_compressed_tensors_w8a8_logprobs(
def
test_compressed_tensors_no_enforce_eager
(
vllm_runner
):
model_path
=
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
with
vllm_runner
(
model_path
)
as
llm
:
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
20
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
4
)
assert
output
...
...
@@ -219,15 +209,10 @@ def test_compressed_tensors_no_enforce_eager(vllm_runner):
"model_args"
,
[
(
"nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2"
,
"tensor"
),
(
"nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym"
,
"tensor"
),
(
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2"
,
"channel"
,
),
(
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym"
,
"channel"
,
),
],
)
@
pytest
.
mark
.
parametrize
(
...
...
@@ -253,7 +238,7 @@ def test_compressed_tensors_w8a8_dynamic_per_token(
# this will enable VLLM_ROCM_USE_AITER_LINEAR
monkeypatch
.
setenv
(
"VLLM_ROCM_USE_AITER"
,
"1"
)
with
vllm_runner
(
model_path
,
dtype
=
torch
.
float16
)
as
llm
:
with
vllm_runner
(
model_path
,
enforce_eager
=
True
,
dtype
=
torch
.
float16
)
as
llm
:
def
check_model
(
model
):
layer
=
model
.
model
.
layers
[
0
]
...
...
@@ -268,7 +253,7 @@ def test_compressed_tensors_w8a8_dynamic_per_token(
llm
.
apply_model
(
check_model
)
output
=
llm
.
generate_greedy
([
"Hello my name is"
],
max_tokens
=
20
)
output
=
llm
.
generate_greedy
([
"Hello my name is"
],
max_tokens
=
4
)
assert
output
...
...
@@ -283,38 +268,6 @@ def test_compressed_tensors_w8a8_dynamic_per_token(
True
,
False
,
),
(
"nm-testing/tinyllama-oneshot-w4a16-group128-v2"
,
"group"
,
128
,
8
,
True
,
False
,
),
(
"nm-testing/tinyllama-oneshot-w8a16-per-channel"
,
"channel"
,
None
,
4
,
True
,
False
,
),
(
"nm-testing/TinyLlama-1.1B-Chat-v1.0-awq-group128-asym256"
,
"group"
,
128
,
8
,
False
,
False
,
),
(
"nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-Asym-Updated-Channel"
,
"channel"
,
None
,
8
,
False
,
False
,
),
(
"nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-Asym-Updated-ActOrder"
,
"group"
,
...
...
@@ -330,7 +283,7 @@ def test_compressed_tensors_w8a8_dynamic_per_token(
)
def
test_compressed_tensors_wNa16
(
vllm_runner
,
wNa16_args
):
model
,
strategy
,
group
,
pack_factor
,
symmetric
,
has_g_idx
=
wNa16_args
with
vllm_runner
(
model
)
as
llm
:
with
vllm_runner
(
model
,
enforce_eager
=
True
)
as
llm
:
def
check_model
(
model
):
layer
=
model
.
model
.
layers
[
0
]
...
...
@@ -348,7 +301,7 @@ def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
llm
.
apply_model
(
check_model
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
20
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
4
)
assert
output
...
...
@@ -357,7 +310,7 @@ def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
)
def
test_compressed_tensors_w4a16_marlin24
(
vllm_runner
):
model_path
=
"nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"
with
vllm_runner
(
model_path
)
as
llm
:
with
vllm_runner
(
model_path
,
enforce_eager
=
True
)
as
llm
:
def
check_model
(
model
):
layer
=
model
.
model
.
layers
[
0
]
...
...
@@ -370,13 +323,13 @@ def test_compressed_tensors_w4a16_marlin24(vllm_runner):
llm
.
apply_model
(
check_model
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
20
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
4
)
assert
output
def
test_compressed_tensors_fp8
(
vllm_runner
):
model_path
=
"nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
with
vllm_runner
(
model_path
)
as
llm
:
with
vllm_runner
(
model_path
,
enforce_eager
=
True
)
as
llm
:
def
check_model
(
model
):
layer
=
model
.
model
.
layers
[
0
]
...
...
@@ -399,7 +352,7 @@ def test_compressed_tensors_fp8(vllm_runner):
llm
.
apply_model
(
check_model
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
20
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
4
)
assert
output
...
...
@@ -412,8 +365,8 @@ def test_compressed_tensors_fp8(vllm_runner):
)
def
test_compressed_tensors_kv_cache
(
vllm_runner
):
model_path
=
"nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
with
vllm_runner
(
model_path
,
kv_cache_dtype
=
"fp8"
)
as
llm
:
output
=
llm
.
generate_greedy
(
"Hello world!"
,
max_tokens
=
20
)
with
vllm_runner
(
model_path
,
enforce_eager
=
True
,
kv_cache_dtype
=
"fp8"
)
as
llm
:
output
=
llm
.
generate_greedy
(
"Hello world!"
,
max_tokens
=
4
)
assert
output
...
...
@@ -465,7 +418,7 @@ def _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy, format="d
)
def
test_compressed_tensors_2of4_quant_fp8
(
vllm_runner
,
args_2of4
):
model
,
weight_strategy
,
input_strategy
=
args_2of4
with
vllm_runner
(
model
)
as
llm
:
with
vllm_runner
(
model
,
enforce_eager
=
True
)
as
llm
:
def
check_model
(
model
):
layer
=
model
.
model
.
layers
[
0
]
...
...
@@ -476,7 +429,7 @@ def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4):
llm
.
apply_model
(
check_model
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
20
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
4
)
print
(
output
)
assert
output
...
...
@@ -512,7 +465,7 @@ def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4):
)
def
test_compressed_tensors_2of4_quant_fp8_compressed
(
vllm_runner
,
args_2of4
):
model
,
weight_strategy
,
input_strategy
=
args_2of4
with
vllm_runner
(
model
)
as
llm
:
with
vllm_runner
(
model
,
enforce_eager
=
True
)
as
llm
:
def
check_model
(
model
):
layer
=
model
.
model
.
layers
[
0
]
...
...
@@ -528,7 +481,7 @@ def test_compressed_tensors_2of4_quant_fp8_compressed(vllm_runner, args_2of4):
llm
.
apply_model
(
check_model
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
20
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
4
)
print
(
output
)
assert
output
...
...
@@ -564,7 +517,7 @@ def test_compressed_tensors_2of4_quant_fp8_compressed(vllm_runner, args_2of4):
)
def
test_compressed_tensors_2of4_quant_int8_compressed
(
vllm_runner
,
args_2of4
):
model
,
weight_strategy
,
input_strategy
=
args_2of4
with
vllm_runner
(
model
)
as
llm
:
with
vllm_runner
(
model
,
enforce_eager
=
True
)
as
llm
:
def
check_model
(
model
):
layer
=
model
.
model
.
layers
[
0
]
...
...
@@ -580,7 +533,7 @@ def test_compressed_tensors_2of4_quant_int8_compressed(vllm_runner, args_2of4):
llm
.
apply_model
(
check_model
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
20
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
4
)
print
(
output
)
assert
output
...
...
@@ -611,7 +564,7 @@ def test_compressed_tensors_2of4_quant_int8_compressed(vllm_runner, args_2of4):
)
def
test_compressed_tensors_2of4_quant_int8
(
vllm_runner
,
args_2of4
):
model
,
weight_strategy
,
input_strategy
=
args_2of4
with
vllm_runner
(
model
)
as
llm
:
with
vllm_runner
(
model
,
enforce_eager
=
True
)
as
llm
:
def
check_model
(
model
):
layer
=
model
.
model
.
layers
[
0
]
...
...
@@ -622,7 +575,7 @@ def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
llm
.
apply_model
(
check_model
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
20
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
4
)
print
(
output
)
assert
output
...
...
@@ -637,7 +590,7 @@ def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
)
def
test_compressed_tensors_2of4_sparse
(
vllm_runner
,
args_2of4
):
model
=
args_2of4
with
vllm_runner
(
model
)
as
llm
:
with
vllm_runner
(
model
,
enforce_eager
=
True
)
as
llm
:
def
check_model
(
model
):
layer
=
model
.
model
.
layers
[
0
]
...
...
@@ -656,7 +609,7 @@ def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4):
llm
.
apply_model
(
check_model
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
20
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
4
)
print
(
output
)
assert
output
...
...
@@ -670,7 +623,7 @@ def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4):
)
def
test_compressed_tensors_2of4_sparse_compressed
(
vllm_runner
,
args_2of4
):
model
=
args_2of4
with
vllm_runner
(
model
)
as
llm
:
with
vllm_runner
(
model
,
enforce_eager
=
True
)
as
llm
:
def
check_model
(
model
):
layer
=
model
.
model
.
layers
[
0
]
...
...
@@ -689,7 +642,7 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
llm
.
apply_model
(
check_model
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
20
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
4
)
print
(
output
)
assert
output
...
...
@@ -723,7 +676,7 @@ def test_compressed_tensors_nvfp4(vllm_runner, args):
assert
qkv_proj
.
scheme
.
group_size
==
16
llm
.
apply_model
(
check_model
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
20
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
4
)
print
(
output
)
assert
output
...
...
@@ -758,7 +711,7 @@ def test_compressed_tensors_w4a8_fp8(vllm_runner, args):
assert
proj
.
scheme
.
group_size
==
128
llm
.
apply_model
(
check_model
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
20
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
4
)
print
(
output
)
assert
output
...
...
@@ -792,7 +745,7 @@ def test_compressed_tensors_transforms_perplexity(
def
test_compressed_tensors_fp8_block_enabled
(
vllm_runner
):
model_path
=
"RedHatAI/Qwen3-0.6B-FP8-BLOCK"
with
vllm_runner
(
model_path
)
as
llm
:
with
vllm_runner
(
model_path
,
enforce_eager
=
True
)
as
llm
:
fp8_dtype
=
current_platform
.
fp8_dtype
()
def
check_model
(
model
):
...
...
@@ -816,5 +769,5 @@ def test_compressed_tensors_fp8_block_enabled(vllm_runner):
llm
.
apply_model
(
check_model
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
20
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
4
)
assert
output
tests/quantization/test_cpu_offload.py
View file @
01c977e9
...
...
@@ -16,13 +16,6 @@ from ..utils import compare_two_settings
reason
=
"fp8 is not supported on this GPU type."
,
)
def
test_cpu_offload_fp8
():
# Test quantization of an unquantized checkpoint
compare_two_settings
(
"meta-llama/Llama-3.2-1B-Instruct"
,
[
"--quantization"
,
"fp8"
],
[
"--quantization"
,
"fp8"
,
"--cpu-offload-gb"
,
"1"
],
max_wait_seconds
=
480
,
)
# Test loading a quantized checkpoint
compare_two_settings
(
"neuralmagic/Qwen2-1.5B-Instruct-FP8"
,
...
...
@@ -46,13 +39,6 @@ def test_cpu_offload_gptq(monkeypatch):
[
"--cpu-offload-gb"
,
"1"
],
max_wait_seconds
=
480
,
)
# Test GPTQ
compare_two_settings
(
"Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4"
,
[
"--quantization"
,
"gptq"
],
[
"--quantization"
,
"gptq"
,
"--cpu-offload-gb"
,
"1"
],
max_wait_seconds
=
480
,
)
@
pytest
.
mark
.
skipif
(
...
...
@@ -69,13 +55,6 @@ def test_cpu_offload_awq(monkeypatch):
[
"--cpu-offload-gb"
,
"1"
],
max_wait_seconds
=
480
,
)
# Test AWQ
compare_two_settings
(
"Qwen/Qwen2-1.5B-Instruct-AWQ"
,
[
"--quantization"
,
"awq"
],
[
"--quantization"
,
"awq"
,
"--cpu-offload-gb"
,
"1"
],
max_wait_seconds
=
480
,
)
@
pytest
.
mark
.
skipif
(
...
...
@@ -92,17 +71,3 @@ def test_cpu_offload_compressed_tensors(monkeypatch):
[
"--cpu-offload-gb"
,
"1"
],
max_wait_seconds
=
480
,
)
# Test w4a16_marlin24
compare_two_settings
(
"nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"
,
[],
[
"--cpu-offload-gb"
,
"1"
],
max_wait_seconds
=
480
,
)
# Test w8a8
compare_two_settings
(
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
,
[],
[
"--cpu-offload-gb"
,
"1"
],
max_wait_seconds
=
480
,
)
tests/quantization/test_fp8.py
View file @
01c977e9
...
...
@@ -18,7 +18,6 @@ from vllm.platforms import current_platform
MODELS
=
[
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV"
,
"nm-testing/Phi-3-mini-128k-instruct-FP8"
,
"nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV"
,
]
...
...
@@ -49,8 +48,6 @@ def test_model_load_and_run(
KV_CACHE_MODELS
=
[
# Deprecated AutoFP8 format using .kv_scale
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV"
,
# AutoFP8 format using separate .k_scale and .v_scale
"nm-testing/Qwen2-1.5B-Instruct-FP8-K-V"
,
]
...
...
tests/quantization/test_gptq_dynamic.py
View file @
01c977e9
...
...
@@ -40,7 +40,9 @@ def test_gptq_with_dynamic(
GPTQMarlinLinearMethod
if
use_marlin_kernel
else
(
GPTQLinearMethod
)
)
with
vllm_runner
(
model_id
,
dtype
=
torch
.
float16
,
max_model_len
=
2048
)
as
llm
:
with
vllm_runner
(
model_id
,
dtype
=
torch
.
float16
,
max_model_len
=
2048
,
enforce_eager
=
True
)
as
llm
:
def
check_model
(
model
):
for
name
,
submodule
in
model
.
named_modules
():
...
...
tests/quantization/test_lm_head.py
View file @
01c977e9
...
...
@@ -31,7 +31,9 @@ def test_lm_head(
)
->
None
:
# `LLM.apply_model` requires pickling a function.
monkeypatch
.
setenv
(
"VLLM_ALLOW_INSECURE_SERIALIZATION"
,
"1"
)
with
vllm_runner
(
model_id
,
dtype
=
torch
.
float16
,
max_model_len
=
2048
)
as
vllm_model
:
with
vllm_runner
(
model_id
,
dtype
=
torch
.
float16
,
max_model_len
=
2048
,
enforce_eager
=
True
)
as
vllm_model
:
def
check_model
(
model
):
lm_head_layer
=
model
.
lm_head
...
...
tests/quantization/test_quark.py
View file @
01c977e9
...
...
@@ -56,7 +56,10 @@ def enable_pickle(monkeypatch):
def
test_quark_fp8_w_per_tensor_a_per_tensor
(
vllm_runner
,
kv_cache_dtype
,
tp
):
model_path
=
"amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test"
with
vllm_runner
(
model_path
,
kv_cache_dtype
=
kv_cache_dtype
,
tensor_parallel_size
=
tp
model_path
,
enforce_eager
=
True
,
kv_cache_dtype
=
kv_cache_dtype
,
tensor_parallel_size
=
tp
,
)
as
llm
:
def
check_model
(
model
):
...
...
@@ -74,14 +77,14 @@ def test_quark_fp8_w_per_tensor_a_per_tensor(vllm_runner, kv_cache_dtype, tp):
llm
.
apply_model
(
check_model
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
20
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
4
)
assert
output
@
pytest
.
mark
.
parametrize
(
"tp"
,
[
1
])
def
test_quark_fp8_w_per_channel_a_per_token
(
vllm_runner
,
tp
):
model_path
=
"amd/Qwen2.5-1.5B-Instruct-ptpc-Quark-ts"
with
vllm_runner
(
model_path
,
tensor_parallel_size
=
tp
)
as
llm
:
with
vllm_runner
(
model_path
,
enforce_eager
=
True
,
tensor_parallel_size
=
tp
)
as
llm
:
def
check_model
(
model
):
layer
=
model
.
model
.
layers
[
0
]
...
...
@@ -98,14 +101,14 @@ def test_quark_fp8_w_per_channel_a_per_token(vllm_runner, tp):
llm
.
apply_model
(
check_model
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
20
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
4
)
assert
output
@
pytest
.
mark
.
parametrize
(
"tp"
,
[
1
])
def
test_quark_int8_w_per_tensor_a_per_tensor
(
vllm_runner
,
tp
):
model_path
=
"amd/Llama-3.1-8B-Instruct-w-int8-a-int8-sym-test"
with
vllm_runner
(
model_path
,
tensor_parallel_size
=
tp
)
as
llm
:
with
vllm_runner
(
model_path
,
enforce_eager
=
True
,
tensor_parallel_size
=
tp
)
as
llm
:
def
check_model
(
model
):
layer
=
model
.
model
.
layers
[
0
]
...
...
@@ -117,7 +120,7 @@ def test_quark_int8_w_per_tensor_a_per_tensor(vllm_runner, tp):
llm
.
apply_model
(
check_model
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
20
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
4
)
assert
output
...
...
tests/quantization/test_rtn.py
View file @
01c977e9
...
...
@@ -10,7 +10,6 @@ import pytest
from
tests.quantization.utils
import
is_quant_method_supported
MODELS
=
[
"microsoft/Phi-3-mini-4k-instruct"
,
# dense model
"ai21labs/Jamba-tiny-dev"
,
# MoE model
]
...
...
@@ -30,5 +29,7 @@ def test_model_rtn_startup(
dtype
:
str
,
max_tokens
:
int
,
)
->
None
:
with
vllm_runner
(
model
,
dtype
=
dtype
,
quantization
=
"rtn"
)
as
vllm_model
:
with
vllm_runner
(
model
,
enforce_eager
=
True
,
dtype
=
dtype
,
quantization
=
"rtn"
)
as
vllm_model
:
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
tests/quantization/test_torchao.py
View file @
01c977e9
...
...
@@ -19,7 +19,7 @@ def test_pre_quantized_model(vllm_runner):
dtype
=
"bfloat16"
,
enforce_eager
=
True
,
)
as
llm
:
output
=
llm
.
generate_greedy
([
"The capital of France is"
],
max_tokens
=
32
)
output
=
llm
.
generate_greedy
([
"The capital of France is"
],
max_tokens
=
4
)
assert
output
...
...
@@ -39,8 +39,9 @@ def test_opt_125m_int8wo_model_loading_with_params(vllm_runner, pt_load_map_loca
quantization
=
"torchao"
,
dtype
=
"bfloat16"
,
pt_load_map_location
=
pt_load_map_location
,
enforce_eager
=
True
,
)
as
llm
:
output
=
llm
.
generate_greedy
([
"The capital of France is"
],
max_tokens
=
32
)
output
=
llm
.
generate_greedy
([
"The capital of France is"
],
max_tokens
=
4
)
assert
output
...
...
@@ -54,8 +55,9 @@ def test_opt_125m_int4wo_model_per_module_quant(vllm_runner):
quantization
=
"torchao"
,
dtype
=
"bfloat16"
,
pt_load_map_location
=
"cuda:0"
,
enforce_eager
=
True
,
)
as
llm
:
output
=
llm
.
generate_greedy
([
"The capital of France is"
],
max_tokens
=
32
)
output
=
llm
.
generate_greedy
([
"The capital of France is"
],
max_tokens
=
4
)
assert
output
...
...
@@ -69,8 +71,9 @@ def test_qwenvl_int8wo_model_loading_with_params(vllm_runner):
quantization
=
"torchao"
,
dtype
=
"bfloat16"
,
pt_load_map_location
=
"cuda:0"
,
enforce_eager
=
True
,
)
as
llm
:
output
=
llm
.
generate_greedy
([
"The capital of France is"
],
max_tokens
=
32
)
output
=
llm
.
generate_greedy
([
"The capital of France is"
],
max_tokens
=
4
)
assert
output
...
...
@@ -90,7 +93,7 @@ def test_opt_125m_awq_int4wo_model_loading_with_params(vllm_runner):
dtype
=
"bfloat16"
,
pt_load_map_location
=
"cuda:0"
,
)
as
llm
:
output
=
llm
.
generate_greedy
([
"The capital of France is"
],
max_tokens
=
32
)
output
=
llm
.
generate_greedy
([
"The capital of France is"
],
max_tokens
=
4
)
assert
output
...
...
@@ -122,8 +125,9 @@ def test_on_the_fly_quant_config_dict_json(vllm_runner):
pt_load_map_location
=
"cuda:0"
,
quantization
=
"torchao"
,
hf_overrides
=
hf_overrides
,
enforce_eager
=
True
,
)
as
llm
:
output
=
llm
.
generate_greedy
([
"The capital of France is"
],
max_tokens
=
32
)
output
=
llm
.
generate_greedy
([
"The capital of France is"
],
max_tokens
=
4
)
assert
output
...
...
@@ -156,8 +160,9 @@ def test_on_the_fly_quant_config_file(vllm_runner):
pt_load_map_location
=
"cuda:0"
,
quantization
=
"torchao"
,
hf_overrides
=
hf_overrides
,
enforce_eager
=
True
,
)
as
llm
:
output
=
llm
.
generate_greedy
([
"The capital of France is"
],
max_tokens
=
32
)
output
=
llm
.
generate_greedy
([
"The capital of France is"
],
max_tokens
=
4
)
assert
output
...
...
@@ -228,7 +233,7 @@ def test_opt_125m_float8_weight_only_safetensors_model_loading_with_params(vllm_
"torchao-testing/opt-125m-Float8WeightOnlyConfig-v2-0.14.0.dev-safetensors"
)
with
vllm_runner
(
model_name
=
model_name
,
dtype
=
"bfloat16"
)
as
llm
:
output
=
llm
.
generate_greedy
([
"The capital of France is"
],
max_tokens
=
32
)
output
=
llm
.
generate_greedy
([
"The capital of France is"
],
max_tokens
=
4
)
assert
output
...
...
@@ -245,7 +250,7 @@ def test_opt_125m_module_fqn_to_config_regex_model(vllm_runner):
with
vllm_runner
(
model_name
=
model_name
,
dtype
=
"bfloat16"
,
pt_load_map_location
=
"cuda:0"
)
as
llm
:
output
=
llm
.
generate_greedy
([
"The capital of France is"
],
max_tokens
=
32
)
output
=
llm
.
generate_greedy
([
"The capital of France is"
],
max_tokens
=
4
)
assert
output
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment