Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
47e9038d
Unverified
Commit
47e9038d
authored
Mar 28, 2025
by
Michael Goin
Committed by
GitHub
Mar 29, 2025
Browse files
Fix cpu offload testing for gptq/awq/ct (#15648)
Signed-off-by:
mgoin
<
mgoin64@gmail.com
>
parent
432cf22a
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
42 additions
and
3 deletions
+42
-3
tests/quantization/test_cpu_offload.py
tests/quantization/test_cpu_offload.py
+9
-3
tests/utils.py
tests/utils.py
+33
-0
No files found.
tests/quantization/test_cpu_offload.py
View file @
47e9038d
...
...
@@ -33,7 +33,9 @@ def test_cpu_offload_fp8():
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"gptq_marlin"
),
reason
=
"gptq_marlin is not supported on this GPU type."
)
def
test_cpu_offload_gptq
():
def
test_cpu_offload_gptq
(
monkeypatch
):
# This quant method is sensitive to dummy weights, so we force real weights
monkeypatch
.
setenv
(
'VLLM_TEST_FORCE_LOAD_FORMAT'
,
'auto'
)
# Test GPTQ Marlin
compare_two_settings
(
"Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4"
,
[],
[
"--cpu-offload-gb"
,
"1"
],
...
...
@@ -47,7 +49,9 @@ def test_cpu_offload_gptq():
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"awq_marlin"
),
reason
=
"awq_marlin is not supported on this GPU type."
)
def
test_cpu_offload_awq
():
def
test_cpu_offload_awq
(
monkeypatch
):
# This quant method is sensitive to dummy weights, so we force real weights
monkeypatch
.
setenv
(
'VLLM_TEST_FORCE_LOAD_FORMAT'
,
'auto'
)
# Test AWQ Marlin
compare_two_settings
(
"Qwen/Qwen2-1.5B-Instruct-AWQ"
,
[],
[
"--cpu-offload-gb"
,
"1"
],
...
...
@@ -61,7 +65,9 @@ def test_cpu_offload_awq():
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"gptq_marlin"
),
reason
=
"gptq_marlin is not supported on this GPU type."
)
def
test_cpu_offload_compressed_tensors
():
def
test_cpu_offload_compressed_tensors
(
monkeypatch
):
# This quant method is sensitive to dummy weights, so we force real weights
monkeypatch
.
setenv
(
'VLLM_TEST_FORCE_LOAD_FORMAT'
,
'auto'
)
# Test wNa16
compare_two_settings
(
"nm-testing/tinyllama-oneshot-w4a16-channel-v2"
,
[],
[
"--cpu-offload-gb"
,
"1"
],
...
...
tests/utils.py
View file @
47e9038d
...
...
@@ -317,6 +317,37 @@ def _test_completion_close(
return
results
def _test_chat(
    client: openai.OpenAI,
    model: str,
    prompt: str,
):
    """Send ``prompt`` as a single-turn chat request and collect the reply.

    Args:
        client: Client whose ``chat.completions.create`` endpoint is called.
        model: Model name forwarded to the endpoint.
        prompt: Text placed in the sole user message.

    Returns:
        A one-element list holding a dict with the keys ``"test"``,
        ``"text"``, ``"finish_reason"`` and ``"usage"``, all taken from the
        first choice of the response (``usage`` from the response itself).
    """
    # Single user turn whose content is one structured text part.
    text_part = {"type": "text", "text": prompt}
    messages = [{"role": "user", "content": [text_part]}]

    # Request at most 5 tokens at temperature 0.0 -- presumably so that the
    # output stays short and comparable between runs (confirm with callers).
    chat_response = client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=5,
        temperature=0.0,
    )
    first_choice = chat_response.choices[0]

    # NOTE(review): the "test" label reads "completion_close" even though
    # this is the chat path; kept as-is because consumers of the label are
    # not visible from here -- confirm before renaming.
    return [{
        "test": "completion_close",
        "text": first_choice.message.content,
        "finish_reason": first_choice.finish_reason,
        "usage": chat_response.usage,
    }]
def
_test_embeddings
(
client
:
openai
.
OpenAI
,
model
:
str
,
...
...
@@ -512,6 +543,8 @@ def compare_all_settings(model: str,
results
+=
_test_completion
(
client
,
model
,
prompt
,
token_ids
)
elif
method
==
"generate_close"
:
results
+=
_test_completion_close
(
client
,
model
,
prompt
)
elif
method
==
"generate_chat"
:
results
+=
_test_chat
(
client
,
model
,
prompt
)
elif
method
==
"generate_with_image"
:
results
+=
_test_image_text
(
client
,
model
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment