Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
e1655287
Unverified
Commit
e1655287
authored
Aug 16, 2024
by
Michael Goin
Committed by
GitHub
Aug 15, 2024
Browse files
[CI] Move quantization cpu offload tests out of fastcheck (#7574)
parent
3b19e39d
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
59 additions
and
55 deletions
+59
-55
tests/basic_correctness/test_cpu_offload.py
tests/basic_correctness/test_cpu_offload.py
+0
-55
tests/quantization/test_cpu_offload.py
tests/quantization/test_cpu_offload.py
+59
-0
No files found.
tests/basic_correctness/test_cpu_offload.py
View file @
e1655287
import
pytest
from
tests.quantization.utils
import
is_quant_method_supported
from
..utils
import
compare_two_settings
def test_cpu_offload():
    """Run compare_two_settings on Llama-2-7b: no extra args vs. 4 GB offload.

    The first argument list is empty; the second passes
    `--cpu-offload-gb 4`.
    """
    model = "meta-llama/Llama-2-7b-hf"
    baseline_args = []
    offload_args = ["--cpu-offload-gb", "4"]
    compare_two_settings(model, baseline_args, offload_args)
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
                    reason="fp8 is not supported on this GPU type.")
def test_cpu_offload_fp8():
    """Compare fp8 runs with and without `--cpu-offload-gb 2`.

    Each case is passed to compare_two_settings as
    (model, args without offload, args with offload).
    Skipped when is_quant_method_supported("fp8") is false.
    """
    cases = (
        # Test quantization of an unquantized checkpoint
        (
            "meta-llama/Meta-Llama-3-8B-Instruct",
            ["--quantization", "fp8"],
            ["--quantization", "fp8", "--cpu-offload-gb", "2"],
        ),
        # Test loading a quantized checkpoint
        (
            "neuralmagic/Meta-Llama-3-8B-Instruct-FP8",
            [],
            ["--cpu-offload-gb", "2"],
        ),
    )
    for model, baseline_args, offload_args in cases:
        compare_two_settings(model, baseline_args, offload_args)
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
                    reason="gptq_marlin is not supported on this GPU type.")
def test_cpu_offload_gptq():
    """Compare a GPTQ checkpoint with and without `--cpu-offload-gb 1`.

    The same model is compared twice: once with no explicit quantization
    flag, once with `--quantization gptq` in both settings.
    Skipped when is_quant_method_supported("gptq_marlin") is false.
    """
    model = "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4"
    cases = (
        # Test GPTQ Marlin
        ([], ["--cpu-offload-gb", "1"]),
        # Test GPTQ
        (
            ["--quantization", "gptq"],
            ["--quantization", "gptq", "--cpu-offload-gb", "1"],
        ),
    )
    for baseline_args, offload_args in cases:
        compare_two_settings(model, baseline_args, offload_args)
@pytest.mark.skipif(not is_quant_method_supported("awq_marlin"),
                    reason="awq_marlin is not supported on this GPU type.")
def test_cpu_offload_awq():
    """Compare an AWQ checkpoint with and without `--cpu-offload-gb 1`.

    The same model is compared twice: once with no explicit quantization
    flag, once with `--quantization awq` in both settings.
    Skipped when is_quant_method_supported("awq_marlin") is false.
    """
    model = "Qwen/Qwen2-1.5B-Instruct-AWQ"
    cases = (
        # Test AWQ Marlin
        ([], ["--cpu-offload-gb", "1"]),
        # Test AWQ
        (
            ["--quantization", "awq"],
            ["--quantization", "awq", "--cpu-offload-gb", "1"],
        ),
    )
    for baseline_args, offload_args in cases:
        compare_two_settings(model, baseline_args, offload_args)
# NOTE(review): this test is gated on gptq_marlin support rather than a
# compressed-tensors-specific check -- presumably intentional; confirm.
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
                    reason="gptq_marlin is not supported on this GPU type.")
def test_cpu_offload_compressed_tensors():
    """Compare three checkpoints with and without `--cpu-offload-gb 1`.

    Every checkpoint is compared with an empty argument list against the
    offload argument list.
    """
    checkpoints = (
        # Test wNa16
        "nm-testing/tinyllama-oneshot-w4a16-channel-v2",
        # Test w4a16_marlin24
        "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
        # Test w8a8
        "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
    )
    for model in checkpoints:
        # Fresh lists per call, as in the original back-to-back calls.
        compare_two_settings(model, [], ["--cpu-offload-gb", "1"])
tests/quantization/test_cpu_offload.py
0 → 100644
View file @
e1655287
# Expanded quantized model tests for CPU offloading
# Base tests: tests/basic_correctness/test_cpu_offload.py
import
pytest
from
tests.quantization.utils
import
is_quant_method_supported
from
..utils
import
compare_two_settings
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
                    reason="fp8 is not supported on this GPU type.")
def test_cpu_offload_fp8():
    """Compare fp8 runs with and without a 2 GB CPU offload.

    Two compare_two_settings calls are made: one that requests fp8 via
    `--quantization fp8`, and one on a checkpoint whose name ends in
    `-FP8` with no quantization flag.
    Skipped when is_quant_method_supported("fp8") is false.
    """
    # Test quantization of an unquantized checkpoint
    fp8_only = ["--quantization", "fp8"]
    fp8_with_offload = ["--quantization", "fp8", "--cpu-offload-gb", "2"]
    compare_two_settings("meta-llama/Meta-Llama-3-8B-Instruct", fp8_only,
                         fp8_with_offload)

    # Test loading a quantized checkpoint
    no_flags = []
    offload_only = ["--cpu-offload-gb", "2"]
    compare_two_settings("neuralmagic/Meta-Llama-3-8B-Instruct-FP8",
                         no_flags, offload_only)
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
                    reason="gptq_marlin is not supported on this GPU type.")
def test_cpu_offload_gptq():
    """Compare a GPTQ checkpoint with and without a 1 GB CPU offload.

    The model is compared first without a quantization flag and then with
    `--quantization gptq` supplied in both settings.
    Skipped when is_quant_method_supported("gptq_marlin") is false.
    """
    checkpoint = "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4"

    # Test GPTQ Marlin
    no_flags = []
    offload_only = ["--cpu-offload-gb", "1"]
    compare_two_settings(checkpoint, no_flags, offload_only)

    # Test GPTQ
    gptq_only = ["--quantization", "gptq"]
    gptq_with_offload = ["--quantization", "gptq", "--cpu-offload-gb", "1"]
    compare_two_settings(checkpoint, gptq_only, gptq_with_offload)
@pytest.mark.skipif(not is_quant_method_supported("awq_marlin"),
                    reason="awq_marlin is not supported on this GPU type.")
def test_cpu_offload_awq():
    """Compare an AWQ checkpoint with and without a 1 GB CPU offload.

    The model is compared first without a quantization flag and then with
    `--quantization awq` supplied in both settings.
    Skipped when is_quant_method_supported("awq_marlin") is false.
    """
    checkpoint = "Qwen/Qwen2-1.5B-Instruct-AWQ"

    # Test AWQ Marlin
    no_flags = []
    offload_only = ["--cpu-offload-gb", "1"]
    compare_two_settings(checkpoint, no_flags, offload_only)

    # Test AWQ
    awq_only = ["--quantization", "awq"]
    awq_with_offload = ["--quantization", "awq", "--cpu-offload-gb", "1"]
    compare_two_settings(checkpoint, awq_only, awq_with_offload)
# NOTE(review): this test is gated on gptq_marlin support rather than a
# compressed-tensors-specific check -- presumably intentional; confirm.
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
                    reason="gptq_marlin is not supported on this GPU type.")
def test_cpu_offload_compressed_tensors():
    """Compare three checkpoints with and without a 1 GB CPU offload.

    Each checkpoint is passed to compare_two_settings with an empty
    argument list and with `--cpu-offload-gb 1`.
    """
    labelled_checkpoints = {
        # Test wNa16
        "wNa16": "nm-testing/tinyllama-oneshot-w4a16-channel-v2",
        # Test w4a16_marlin24
        "w4a16_marlin24": "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
        # Test w8a8
        "w8a8":
        "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
    }
    # Dicts preserve insertion order, so the calls run in the listed order.
    for checkpoint in labelled_checkpoints.values():
        compare_two_settings(checkpoint, [], ["--cpu-offload-gb", "1"])
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment