Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
9e51b6a6
Unverified
Commit
9e51b6a6
authored
Aug 20, 2024
by
youkaichao
Committed by
GitHub
Aug 20, 2024
Browse files
[ci][test] adjust max wait time for cpu offloading test (#7709)
parent
6e4658c7
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
34 additions
and
22 deletions
+34
-22
tests/quantization/test_cpu_offload.py
tests/quantization/test_cpu_offload.py
+18
-9
tests/utils.py
tests/utils.py
+16
-13
No files found.
tests/quantization/test_cpu_offload.py
View file @
9e51b6a6
...
...
@@ -14,10 +14,12 @@ def test_cpu_offload_fp8():
# Test quantization of an unquantized checkpoint
compare_two_settings
(
"meta-llama/Meta-Llama-3-8B-Instruct"
,
[
"--quantization"
,
"fp8"
],
[
"--quantization"
,
"fp8"
,
"--cpu-offload-gb"
,
"2"
])
[
"--quantization"
,
"fp8"
,
"--cpu-offload-gb"
,
"2"
],
max_wait_seconds
=
480
)
# Test loading a quantized checkpoint
compare_two_settings
(
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
,
[],
[
"--cpu-offload-gb"
,
"2"
])
[
"--cpu-offload-gb"
,
"2"
],
max_wait_seconds
=
480
)
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"gptq_marlin"
),
...
...
@@ -25,11 +27,13 @@ def test_cpu_offload_fp8():
def
test_cpu_offload_gptq
():
# Test GPTQ Marlin
compare_two_settings
(
"Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4"
,
[],
[
"--cpu-offload-gb"
,
"1"
])
[
"--cpu-offload-gb"
,
"1"
],
max_wait_seconds
=
480
)
# Test GPTQ
compare_two_settings
(
"Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4"
,
[
"--quantization"
,
"gptq"
],
[
"--quantization"
,
"gptq"
,
"--cpu-offload-gb"
,
"1"
])
[
"--quantization"
,
"gptq"
,
"--cpu-offload-gb"
,
"1"
],
max_wait_seconds
=
480
)
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"awq_marlin"
),
...
...
@@ -37,11 +41,13 @@ def test_cpu_offload_gptq():
def
test_cpu_offload_awq
():
# Test AWQ Marlin
compare_two_settings
(
"Qwen/Qwen2-1.5B-Instruct-AWQ"
,
[],
[
"--cpu-offload-gb"
,
"1"
])
[
"--cpu-offload-gb"
,
"1"
],
max_wait_seconds
=
480
)
# Test AWQ
compare_two_settings
(
"Qwen/Qwen2-1.5B-Instruct-AWQ"
,
[
"--quantization"
,
"awq"
],
[
"--quantization"
,
"awq"
,
"--cpu-offload-gb"
,
"1"
])
[
"--quantization"
,
"awq"
,
"--cpu-offload-gb"
,
"1"
],
max_wait_seconds
=
480
)
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"gptq_marlin"
),
...
...
@@ -49,11 +55,14 @@ def test_cpu_offload_awq():
def
test_cpu_offload_compressed_tensors
():
# Test wNa16
compare_two_settings
(
"nm-testing/tinyllama-oneshot-w4a16-channel-v2"
,
[],
[
"--cpu-offload-gb"
,
"1"
])
[
"--cpu-offload-gb"
,
"1"
],
max_wait_seconds
=
480
)
# Test w4a16_marlin24
compare_two_settings
(
"nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"
,
[],
[
"--cpu-offload-gb"
,
"1"
])
[],
[
"--cpu-offload-gb"
,
"1"
],
max_wait_seconds
=
480
)
# Test w8a8
compare_two_settings
(
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
,
[],
[
"--cpu-offload-gb"
,
"1"
])
[
"--cpu-offload-gb"
,
"1"
],
max_wait_seconds
=
480
)
tests/utils.py
View file @
9e51b6a6
...
...
@@ -56,16 +56,14 @@ VLLM_PATH = Path(__file__).parent.parent
class
RemoteOpenAIServer
:
DUMMY_API_KEY
=
"token-abc123"
# vLLM's OpenAI server does not need API key
MAX_START_WAIT_S
=
240
# wait for server to start for 240 seconds
def
__init__
(
self
,
def
__init__
(
self
,
model
:
str
,
cli_args
:
List
[
str
],
*
,
env_dict
:
Optional
[
Dict
[
str
,
str
]]
=
None
,
auto_port
:
bool
=
True
,
)
->
None
:
max_wait_seconds
:
Optional
[
float
]
=
None
)
->
None
:
if
auto_port
:
if
"-p"
in
cli_args
or
"--port"
in
cli_args
:
raise
ValueError
(
"You have manually specified the port"
...
...
@@ -90,8 +88,9 @@ class RemoteOpenAIServer:
env
=
env
,
stdout
=
sys
.
stdout
,
stderr
=
sys
.
stderr
)
max_wait_seconds
=
max_wait_seconds
or
240
self
.
_wait_for_server
(
url
=
self
.
url_for
(
"health"
),
timeout
=
self
.
MAX_START_WAIT_S
)
timeout
=
max_wait_seconds
)
def
__enter__
(
self
):
return
self
...
...
@@ -145,7 +144,8 @@ def compare_two_settings(model: str,
arg1
:
List
[
str
],
arg2
:
List
[
str
],
env1
:
Optional
[
Dict
[
str
,
str
]]
=
None
,
env2
:
Optional
[
Dict
[
str
,
str
]]
=
None
):
env2
:
Optional
[
Dict
[
str
,
str
]]
=
None
,
max_wait_seconds
:
Optional
[
float
]
=
None
)
->
None
:
"""
Launch API server with two different sets of arguments/environments
and compare the results of the API calls.
...
...
@@ -164,7 +164,10 @@ def compare_two_settings(model: str,
token_ids
=
tokenizer
(
prompt
)[
"input_ids"
]
results
=
[]
for
args
,
env
in
((
arg1
,
env1
),
(
arg2
,
env2
)):
with
RemoteOpenAIServer
(
model
,
args
,
env_dict
=
env
)
as
server
:
with
RemoteOpenAIServer
(
model
,
args
,
env_dict
=
env
,
max_wait_seconds
=
max_wait_seconds
)
as
server
:
client
=
server
.
get_client
()
# test models list
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment