Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f6aa1226
Unverified
Commit
f6aa1226
authored
Nov 18, 2025
by
Alex
Committed by
GitHub
Nov 18, 2025
Browse files
[CI Sprint] Quantization CI Cleanup (#24130)
Signed-off-by:
Alex Yun
<
alexyun04@gmail.com
>
parent
184b12fd
Changes
10
Show whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
32 additions
and
26 deletions
+32
-26
tests/quantization/test_compressed_tensors.py
tests/quantization/test_compressed_tensors.py
+2
-2
tests/quantization/test_cpu_offload.py
tests/quantization/test_cpu_offload.py
+8
-8
tests/quantization/test_experts_int8.py
tests/quantization/test_experts_int8.py
+4
-2
tests/quantization/test_fp8.py
tests/quantization/test_fp8.py
+8
-5
tests/quantization/test_ipex_quant.py
tests/quantization/test_ipex_quant.py
+2
-2
tests/quantization/test_lm_head.py
tests/quantization/test_lm_head.py
+1
-1
tests/quantization/test_modelopt.py
tests/quantization/test_modelopt.py
+1
-1
tests/quantization/test_ptpc_fp8.py
tests/quantization/test_ptpc_fp8.py
+2
-1
tests/quantization/test_register_quantization_config.py
tests/quantization/test_register_quantization_config.py
+3
-3
tests/quantization/test_torchao.py
tests/quantization/test_torchao.py
+1
-1
No files found.
tests/quantization/test_compressed_tensors.py
View file @
f6aa1226
...
...
@@ -141,7 +141,7 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
"neuralmagic/Llama-3.2-1B-quantized.w8a8"
,
],
)
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
8
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
4
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
10
])
@
pytest
.
mark
.
parametrize
(
"use_aiter"
,
[
True
,
False
]
if
current_platform
.
is_rocm
()
else
[
False
]
...
...
@@ -182,7 +182,7 @@ def test_compressed_tensors_w8a8_logprobs(
example_prompts
,
max_tokens
,
num_logprobs
)
with
vllm_runner
(
model_path
,
dtype
=
dtype
)
as
vllm_model
:
with
vllm_runner
(
model_path
,
dtype
=
dtype
,
enforce_eager
=
True
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
num_logprobs
)
...
...
tests/quantization/test_cpu_offload.py
View file @
f6aa1226
...
...
@@ -19,8 +19,8 @@ def test_cpu_offload_fp8():
# Test loading a quantized checkpoint
compare_two_settings
(
"neuralmagic/Qwen2-1.5B-Instruct-FP8"
,
[],
[
"--cpu-offload-gb"
,
"1"
],
[
"--enforce_eager"
],
[
"--enforce_eager"
,
"--cpu-offload-gb"
,
"1"
],
max_wait_seconds
=
480
,
)
...
...
@@ -35,8 +35,8 @@ def test_cpu_offload_gptq(monkeypatch):
# Test GPTQ Marlin
compare_two_settings
(
"Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4"
,
[],
[
"--cpu-offload-gb"
,
"1"
],
[
"--enforce_eager"
],
[
"--enforce_eager"
,
"--cpu-offload-gb"
,
"1"
],
max_wait_seconds
=
480
,
)
...
...
@@ -51,8 +51,8 @@ def test_cpu_offload_awq(monkeypatch):
# Test AWQ Marlin
compare_two_settings
(
"Qwen/Qwen2-1.5B-Instruct-AWQ"
,
[],
[
"--cpu-offload-gb"
,
"1"
],
[
"--enforce_eager"
],
[
"--enforce_eager"
,
"--cpu-offload-gb"
,
"1"
],
max_wait_seconds
=
480
,
)
...
...
@@ -67,7 +67,7 @@ def test_cpu_offload_compressed_tensors(monkeypatch):
# Test wNa16
compare_two_settings
(
"nm-testing/tinyllama-oneshot-w4a16-channel-v2"
,
[],
[
"--cpu-offload-gb"
,
"1"
],
[
"--enforce_eager"
],
[
"--enforce_eager"
,
"--cpu-offload-gb"
,
"1"
],
max_wait_seconds
=
480
,
)
tests/quantization/test_experts_int8.py
View file @
f6aa1226
...
...
@@ -21,7 +21,7 @@ MODELS = ["ai21labs/Jamba-tiny-random", "pfnet/plamo-2-1b"]
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
10
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
4
])
def
test_model_experts_int8_startup
(
hf_runner
,
vllm_runner
,
...
...
@@ -33,5 +33,7 @@ def test_model_experts_int8_startup(
model_info
=
HF_EXAMPLE_MODELS
.
find_hf_info
(
model
)
model_info
.
check_transformers_version
(
on_fail
=
"skip"
)
with
vllm_runner
(
model
,
dtype
=
dtype
,
quantization
=
"experts_int8"
)
as
vllm_model
:
with
vllm_runner
(
model
,
dtype
=
dtype
,
enforce_eager
=
True
,
quantization
=
"experts_int8"
)
as
vllm_model
:
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
tests/quantization/test_fp8.py
View file @
f6aa1226
...
...
@@ -45,10 +45,10 @@ def test_model_load_and_run(
if
force_marlin
:
monkeypatch
.
setenv
(
"VLLM_TEST_FORCE_FP8_MARLIN"
,
"1"
)
with
vllm_runner
(
model_id
)
as
llm
:
with
vllm_runner
(
model_id
,
enforce_eager
=
True
)
as
llm
:
# note: this does not test accuracy, just that we can run through
# see lm-eval tests for accuracy
outputs
=
llm
.
generate_greedy
([
"Hello my name is"
],
max_tokens
=
10
)
outputs
=
llm
.
generate_greedy
([
"Hello my name is"
],
max_tokens
=
4
)
print
(
outputs
[
0
][
1
])
...
...
@@ -85,7 +85,7 @@ def test_kv_cache_model_load_and_run(
# `LLM.apply_model` requires pickling a function.
monkeypatch
.
setenv
(
"VLLM_ALLOW_INSECURE_SERIALIZATION"
,
"1"
)
with
vllm_runner
(
model_id
,
kv_cache_dtype
=
"fp8"
)
as
llm
:
with
vllm_runner
(
model_id
,
kv_cache_dtype
=
"fp8"
,
enforce_eager
=
True
)
as
llm
:
def
check_model
(
model
):
attn
=
model
.
model
.
layers
[
0
].
self_attn
.
attn
...
...
@@ -112,7 +112,7 @@ def test_kv_cache_model_load_and_run(
# note: this does not test accuracy, just that we can run through
# see lm-eval tests for accuracy
outputs
=
llm
.
generate_greedy
([
"Hello my name is"
],
max_tokens
=
10
)
outputs
=
llm
.
generate_greedy
([
"Hello my name is"
],
max_tokens
=
4
)
print
(
outputs
[
0
][
1
])
...
...
@@ -142,7 +142,10 @@ def test_load_fp16_model(
monkeypatch
.
setenv
(
"VLLM_TEST_FORCE_FP8_MARLIN"
,
"1"
)
with
vllm_runner
(
"facebook/opt-125m"
,
quantization
=
"fp8"
,
kv_cache_dtype
=
kv_cache_dtype
"facebook/opt-125m"
,
quantization
=
"fp8"
,
enforce_eager
=
True
,
kv_cache_dtype
=
kv_cache_dtype
,
)
as
llm
:
def
check_model
(
model
):
...
...
tests/quantization/test_ipex_quant.py
View file @
f6aa1226
...
...
@@ -26,7 +26,7 @@ DTYPE = ["bfloat16"]
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
DTYPE
)
def
test_ipex_quant
(
vllm_runner
,
model
,
dtype
):
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
llm
:
output
=
llm
.
generate_greedy
([
"The capital of France is"
],
max_tokens
=
32
)
with
vllm_runner
(
model
,
dtype
=
dtype
,
enforce_eager
=
True
)
as
llm
:
output
=
llm
.
generate_greedy
([
"The capital of France is"
],
max_tokens
=
4
)
assert
output
print
(
output
)
tests/quantization/test_lm_head.py
View file @
f6aa1226
...
...
@@ -49,4 +49,4 @@ def test_lm_head(
vllm_model
.
apply_model
(
check_model
)
print
(
vllm_model
.
generate_greedy
([
"Hello my name is"
],
max_tokens
=
10
)[
0
][
1
])
print
(
vllm_model
.
generate_greedy
([
"Hello my name is"
],
max_tokens
=
4
)[
0
][
1
])
tests/quantization/test_modelopt.py
View file @
f6aa1226
...
...
@@ -88,6 +88,6 @@ def test_modelopt_fp8_checkpoint_setup(vllm_runner):
llm
.
apply_model
(
check_model
)
# Run a simple generation test to ensure the model works
output
=
llm
.
generate_greedy
([
"Hello my name is"
],
max_tokens
=
20
)
output
=
llm
.
generate_greedy
([
"Hello my name is"
],
max_tokens
=
4
)
assert
output
print
(
f
"ModelOpt FP8 output:
{
output
}
"
)
tests/quantization/test_ptpc_fp8.py
View file @
f6aa1226
...
...
@@ -38,6 +38,7 @@ def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None:
"facebook/opt-125m"
,
dtype
=
dtype
,
quantization
=
"ptpc_fp8"
,
enforce_eager
=
True
,
kv_cache_dtype
=
kv_cache_dtype
,
)
except
AssertionError
as
e
:
...
...
@@ -65,5 +66,5 @@ def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None:
llm
.
apply_model
(
check_model
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
20
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
4
)
assert
output
tests/quantization/test_register_quantization_config.py
View file @
f6aa1226
...
...
@@ -23,8 +23,8 @@ from vllm.model_executor.layers.quantization import (
get_quantization_config
,
register_quantization_config
,
)
from
vllm.model_executor.layers.quantization.base_config
import
(
# noqa: E501
QuantizationConfig
,
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
,
# noqa: E501
)
...
...
@@ -142,5 +142,5 @@ def test_custom_quant(vllm_runner, model, monkeypatch):
llm
.
apply_model
(
check_model
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
20
)
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
1
)
assert
output
tests/quantization/test_torchao.py
View file @
f6aa1226
...
...
@@ -392,7 +392,7 @@ def test_opt_125m_int4wo_model_running_preshuffled_kernel_online_quant(
assert
not
has_int4_preshuffled_tensor
assert
weight_attrs
==
[
False
,
1
,
0
,
True
]
output
=
llm
.
generate_greedy
([
"The capital of France is"
],
max_tokens
=
32
)
output
=
llm
.
generate_greedy
([
"The capital of France is"
],
max_tokens
=
4
)
assert
output
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment