Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f53b8f0d
Unverified
Commit
f53b8f0d
authored
Jul 18, 2024
by
youkaichao
Committed by
GitHub
Jul 18, 2024
Browse files
[ci][test] add correctness test for cpu offloading (#6549)
parent
2d4733ba
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
105 additions
and
85 deletions
+105
-85
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+1
-0
tests/basic_correctness/test_cpu_offload.py
tests/basic_correctness/test_cpu_offload.py
+8
-0
tests/distributed/test_pipeline_parallel.py
tests/distributed/test_pipeline_parallel.py
+2
-85
tests/utils.py
tests/utils.py
+94
-0
No files found.
.buildkite/test-pipeline.yaml
View file @
f53b8f0d
...
@@ -46,6 +46,7 @@ steps:
...
@@ -46,6 +46,7 @@ steps:
commands
:
commands
:
-
pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
-
pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
-
pytest -v -s basic_correctness/test_basic_correctness.py
-
pytest -v -s basic_correctness/test_basic_correctness.py
-
pytest -v -s basic_correctness/test_cpu_offload.py
-
VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
-
VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
-
VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
-
VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
-
VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
-
VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
...
...
tests/basic_correctness/test_cpu_offload.py
0 → 100644
View file @
f53b8f0d
from
..utils
import
compare_two_settings
def test_cpu_offload():
    """Check that enabling --cpu-offload-gb does not change server output.

    For each model, the server is compared with no extra arguments against
    the same server started with the --cpu-offload-gb option.
    """
    # (model name, value passed to --cpu-offload-gb), run in this order.
    cases = (
        ("meta-llama/Llama-2-7b-hf", "4"),
        ("nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t", "1"),
    )
    for model_name, offload_gb in cases:
        compare_two_settings(model_name, [], ["--cpu-offload-gb", offload_gb])
tests/distributed/test_pipeline_parallel.py
View file @
f53b8f0d
import
pytest
import
pytest
from
transformers
import
AutoTokenizer
from
..utils
import
RemoteOpenAIServer
from
..utils
import
compare_two_settings
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
...
@@ -13,7 +12,6 @@ from ..utils import RemoteOpenAIServer
...
@@ -13,7 +12,6 @@ from ..utils import RemoteOpenAIServer
(
1
,
4
,
1
,
0
,
"meta-llama/Meta-Llama-3-8B"
),
(
1
,
4
,
1
,
0
,
"meta-llama/Meta-Llama-3-8B"
),
])
])
def
test_compare_tp
(
TP_SIZE
,
PP_SIZE
,
EAGER_MODE
,
CHUNKED_PREFILL
,
MODEL_NAME
):
def
test_compare_tp
(
TP_SIZE
,
PP_SIZE
,
EAGER_MODE
,
CHUNKED_PREFILL
,
MODEL_NAME
):
tokenizer
=
AutoTokenizer
.
from_pretrained
(
MODEL_NAME
)
pp_args
=
[
pp_args
=
[
# use half precision for speed and memory savings in CI environment
# use half precision for speed and memory savings in CI environment
...
@@ -48,85 +46,4 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME):
...
@@ -48,85 +46,4 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME):
pp_args
.
append
(
"--enforce-eager"
)
pp_args
.
append
(
"--enforce-eager"
)
tp_args
.
append
(
"--enforce-eager"
)
tp_args
.
append
(
"--enforce-eager"
)
prompt
=
"Hello, my name is"
compare_two_settings
(
MODEL_NAME
,
pp_args
,
tp_args
)
token_ids
=
tokenizer
(
prompt
)[
"input_ids"
]
results
=
[]
for
args
in
(
pp_args
,
tp_args
):
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
server
:
client
=
server
.
get_client
()
# test models list
models
=
client
.
models
.
list
()
models
=
models
.
data
served_model
=
models
[
0
]
results
.
append
({
"test"
:
"models_list"
,
"id"
:
served_model
.
id
,
"root"
:
served_model
.
root
,
})
# test with text prompt
completion
=
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
prompt
,
max_tokens
=
5
,
temperature
=
0.0
)
results
.
append
({
"test"
:
"single_completion"
,
"text"
:
completion
.
choices
[
0
].
text
,
"finish_reason"
:
completion
.
choices
[
0
].
finish_reason
,
"usage"
:
completion
.
usage
,
})
# test using token IDs
completion
=
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
token_ids
,
max_tokens
=
5
,
temperature
=
0.0
,
)
results
.
append
({
"test"
:
"token_ids"
,
"text"
:
completion
.
choices
[
0
].
text
,
"finish_reason"
:
completion
.
choices
[
0
].
finish_reason
,
"usage"
:
completion
.
usage
,
})
# test simple list
batch
=
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
[
prompt
,
prompt
],
max_tokens
=
5
,
temperature
=
0.0
,
)
results
.
append
({
"test"
:
"simple_list"
,
"text0"
:
batch
.
choices
[
0
].
text
,
"text1"
:
batch
.
choices
[
1
].
text
,
})
# test streaming
batch
=
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
[
prompt
,
prompt
],
max_tokens
=
5
,
temperature
=
0.0
,
stream
=
True
,
)
texts
=
[
""
]
*
2
for
chunk
in
batch
:
assert
len
(
chunk
.
choices
)
==
1
choice
=
chunk
.
choices
[
0
]
texts
[
choice
.
index
]
+=
choice
.
text
results
.
append
({
"test"
:
"streaming"
,
"texts"
:
texts
,
})
n
=
len
(
results
)
//
2
pp_results
=
results
[:
n
]
tp_results
=
results
[
n
:]
for
pp
,
tp
in
zip
(
pp_results
,
tp_results
):
assert
pp
==
tp
tests/utils.py
View file @
f53b8f0d
...
@@ -10,6 +10,7 @@ from typing import Any, Dict, List
...
@@ -10,6 +10,7 @@ from typing import Any, Dict, List
import
openai
import
openai
import
ray
import
ray
import
requests
import
requests
from
transformers
import
AutoTokenizer
from
vllm.distributed
import
(
ensure_model_parallel_initialized
,
from
vllm.distributed
import
(
ensure_model_parallel_initialized
,
init_distributed_environment
)
init_distributed_environment
)
...
@@ -124,6 +125,99 @@ class RemoteOpenAIServer:
...
@@ -124,6 +125,99 @@ class RemoteOpenAIServer:
)
)
def compare_two_settings(model: str, arg1: List[str], arg2: List[str]):
    """
    Serve `model` through the API server twice, once per argument list, and
    assert that both servers answer the same sequence of requests with the
    same results. `arg1` and `arg2` are the arguments placed after the model
    name.
    """
    tokenizer = AutoTokenizer.from_pretrained(model)
    prompt = "Hello, my name is"
    token_ids = tokenizer(prompt)["input_ids"]

    # Options common to every completion request issued below.
    request_options = {"model": model, "max_tokens": 5, "temperature": 0.0}

    # One list of recorded responses per server configuration.
    outcomes_per_setting = []
    for server_args in (arg1, arg2):
        outcomes = []
        with RemoteOpenAIServer(model, server_args) as server:
            client = server.get_client()

            # List the served models and record the first entry.
            served_model = client.models.list().data[0]
            outcomes.append({
                "test": "models_list",
                "id": served_model.id,
                "root": served_model.root,
            })

            # Completion from the plain-text prompt.
            text_completion = client.completions.create(prompt=prompt,
                                                        **request_options)
            first_choice = text_completion.choices[0]
            outcomes.append({
                "test": "single_completion",
                "text": first_choice.text,
                "finish_reason": first_choice.finish_reason,
                "usage": text_completion.usage,
            })

            # Completion from the same prompt supplied as token IDs.
            ids_completion = client.completions.create(prompt=token_ids,
                                                       **request_options)
            first_choice = ids_completion.choices[0]
            outcomes.append({
                "test": "token_ids",
                "text": first_choice.text,
                "finish_reason": first_choice.finish_reason,
                "usage": ids_completion.usage,
            })

            # Batched request: the prompt is sent twice in one call.
            batch = client.completions.create(prompt=[prompt, prompt],
                                              **request_options)
            outcomes.append({
                "test": "simple_list",
                "text0": batch.choices[0].text,
                "text1": batch.choices[1].text,
            })

            # Same batched request, streamed; chunks are reassembled per
            # choice index.
            stream = client.completions.create(prompt=[prompt, prompt],
                                               stream=True,
                                               **request_options)
            streamed_texts = ["", ""]
            for chunk in stream:
                assert len(chunk.choices) == 1
                piece = chunk.choices[0]
                streamed_texts[piece.index] += piece.text
            outcomes.append({
                "test": "streaming",
                "texts": streamed_texts,
            })
        outcomes_per_setting.append(outcomes)

    arg1_results, arg2_results = outcomes_per_setting
    for arg1_result, arg2_result in zip(arg1_results, arg2_results):
        assert arg1_result == arg2_result, \
            f"Results for {model=} are not the same with {arg1=} and {arg2=}"
def
init_test_distributed_environment
(
def
init_test_distributed_environment
(
tp_size
:
int
,
tp_size
:
int
,
pp_size
:
int
,
pp_size
:
int
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment