Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ad385667
Commit
ad385667
authored
Oct 23, 2024
by
zhuwenwen
Browse files
Merge branch 'v0.6.3.post1-dev'
parents
be0967c1
903593d3
Changes
364
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1316 additions
and
862 deletions
+1316
-862
tests/entrypoints/llm/test_encode.py
tests/entrypoints/llm/test_encode.py
+0
-34
tests/entrypoints/llm/test_generate.py
tests/entrypoints/llm/test_generate.py
+88
-37
tests/entrypoints/llm/test_generate_multiple_loras.py
tests/entrypoints/llm/test_generate_multiple_loras.py
+1
-1
tests/entrypoints/llm/test_guided_generate.py
tests/entrypoints/llm/test_guided_generate.py
+43
-23
tests/entrypoints/llm/test_lazy_outlines.py
tests/entrypoints/llm/test_lazy_outlines.py
+48
-0
tests/entrypoints/llm/test_prompt_validation.py
tests/entrypoints/llm/test_prompt_validation.py
+9
-0
tests/entrypoints/offline_mode/__init__.py
tests/entrypoints/offline_mode/__init__.py
+0
-0
tests/entrypoints/offline_mode/test_offline_mode.py
tests/entrypoints/offline_mode/test_offline_mode.py
+77
-0
tests/entrypoints/openai/test_accuracy.py
tests/entrypoints/openai/test_accuracy.py
+53
-0
tests/entrypoints/openai/test_audio.py
tests/entrypoints/openai/test_audio.py
+257
-0
tests/entrypoints/openai/test_basic.py
tests/entrypoints/openai/test_basic.py
+62
-13
tests/entrypoints/openai/test_chat.py
tests/entrypoints/openai/test_chat.py
+133
-6
tests/entrypoints/openai/test_chat_template.py
tests/entrypoints/openai/test_chat_template.py
+32
-20
tests/entrypoints/openai/test_chunked_prompt.py
tests/entrypoints/openai/test_chunked_prompt.py
+126
-0
tests/entrypoints/openai/test_cli_args.py
tests/entrypoints/openai/test_cli_args.py
+131
-0
tests/entrypoints/openai/test_completion.py
tests/entrypoints/openai/test_completion.py
+45
-8
tests/entrypoints/openai/test_disable_mp.py
tests/entrypoints/openai/test_disable_mp.py
+0
-715
tests/entrypoints/openai/test_embedding.py
tests/entrypoints/openai/test_embedding.py
+76
-5
tests/entrypoints/openai/test_encoder_decoder.py
tests/entrypoints/openai/test_encoder_decoder.py
+52
-0
tests/entrypoints/openai/test_lora_lineage.py
tests/entrypoints/openai/test_lora_lineage.py
+83
-0
No files found.
Too many changes to show.
To preserve performance only
364 of 364+
files are displayed.
Plain diff
Email patch
tests/entrypoints/llm/test_encode.py
View file @
ad385667
...
...
@@ -49,21 +49,6 @@ def assert_outputs_equal(o1: List[EmbeddingRequestOutput],
assert
[
o
.
outputs
for
o
in
o1
]
==
[
o
.
outputs
for
o
in
o2
]
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
'prompt'
,
PROMPTS
)
def
test_v1_v2_api_consistency_single_prompt_string
(
llm
:
LLM
,
prompt
):
pooling_params
=
PoolingParams
()
with
pytest
.
warns
(
DeprecationWarning
,
match
=
"'prompts'"
):
v1_output
=
llm
.
encode
(
prompts
=
prompt
,
pooling_params
=
pooling_params
)
v2_output
=
llm
.
encode
(
prompt
,
pooling_params
=
pooling_params
)
assert_outputs_equal
(
v1_output
,
v2_output
)
v2_output
=
llm
.
encode
({
"prompt"
:
prompt
},
pooling_params
=
pooling_params
)
assert_outputs_equal
(
v1_output
,
v2_output
)
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
'prompt_token_ids'
,
TOKEN_IDS
)
def
test_v1_v2_api_consistency_single_prompt_tokens
(
llm
:
LLM
,
...
...
@@ -79,25 +64,6 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
assert_outputs_equal
(
v1_output
,
v2_output
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_v1_v2_api_consistency_multi_prompt_string
(
llm
:
LLM
):
pooling_params
=
PoolingParams
()
with
pytest
.
warns
(
DeprecationWarning
,
match
=
"'prompts'"
):
v1_output
=
llm
.
encode
(
prompts
=
PROMPTS
,
pooling_params
=
pooling_params
)
v2_output
=
llm
.
encode
(
PROMPTS
,
pooling_params
=
pooling_params
)
assert_outputs_equal
(
v1_output
,
v2_output
)
v2_output
=
llm
.
encode
(
[{
"prompt"
:
p
}
for
p
in
PROMPTS
],
pooling_params
=
pooling_params
,
)
assert_outputs_equal
(
v1_output
,
v2_output
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_v1_v2_api_consistency_multi_prompt_tokens
(
llm
:
LLM
):
pooling_params
=
PoolingParams
()
...
...
tests/entrypoints/llm/test_generate.py
View file @
ad385667
...
...
@@ -6,6 +6,7 @@ import pytest
from
vllm
import
LLM
,
RequestOutput
,
SamplingParams
from
...conftest
import
cleanup
from
..openai.test_vision
import
TEST_IMAGE_URLS
MODEL_NAME
=
"facebook/opt-125m"
...
...
@@ -46,23 +47,6 @@ def assert_outputs_equal(o1: List[RequestOutput], o2: List[RequestOutput]):
assert
[
o
.
outputs
for
o
in
o1
]
==
[
o
.
outputs
for
o
in
o2
]
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
'prompt'
,
PROMPTS
)
def
test_v1_v2_api_consistency_single_prompt_string
(
llm
:
LLM
,
prompt
):
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
top_p
=
1.0
)
with
pytest
.
warns
(
DeprecationWarning
,
match
=
"'prompts'"
):
v1_output
=
llm
.
generate
(
prompts
=
prompt
,
sampling_params
=
sampling_params
)
v2_output
=
llm
.
generate
(
prompt
,
sampling_params
=
sampling_params
)
assert_outputs_equal
(
v1_output
,
v2_output
)
v2_output
=
llm
.
generate
({
"prompt"
:
prompt
},
sampling_params
=
sampling_params
)
assert_outputs_equal
(
v1_output
,
v2_output
)
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
'prompt_token_ids'
,
TOKEN_IDS
)
def
test_v1_v2_api_consistency_single_prompt_tokens
(
llm
:
LLM
,
...
...
@@ -78,26 +62,6 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
assert_outputs_equal
(
v1_output
,
v2_output
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_v1_v2_api_consistency_multi_prompt_string
(
llm
:
LLM
):
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
top_p
=
1.0
)
with
pytest
.
warns
(
DeprecationWarning
,
match
=
"'prompts'"
):
v1_output
=
llm
.
generate
(
prompts
=
PROMPTS
,
sampling_params
=
sampling_params
)
v2_output
=
llm
.
generate
(
PROMPTS
,
sampling_params
=
sampling_params
)
assert_outputs_equal
(
v1_output
,
v2_output
)
v2_output
=
llm
.
generate
(
[{
"prompt"
:
p
}
for
p
in
PROMPTS
],
sampling_params
=
sampling_params
,
)
assert_outputs_equal
(
v1_output
,
v2_output
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_v1_v2_api_consistency_multi_prompt_tokens
(
llm
:
LLM
):
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
top_p
=
1.0
)
...
...
@@ -140,3 +104,90 @@ def test_multiple_sampling_params(llm: LLM):
# sampling_params is None, default params should be applied
outputs
=
llm
.
generate
(
PROMPTS
,
sampling_params
=
None
)
assert
len
(
PROMPTS
)
==
len
(
outputs
)
def test_chat():
    """Run one system+user conversation through ``LLM.chat``.

    The test passes when exactly one output comes back for the single
    conversation that was submitted.
    """
    user_prompt = "Explain the concept of entropy."
    conversation = [
        dict(role="system", content="You are a helpful assistant"),
        dict(role="user", content=user_prompt),
    ]

    engine = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
    replies = engine.chat(conversation)

    assert len(replies) == 1
def test_multi_chat():
    """Submit two conversations to ``LLM.chat`` in a single call.

    Every conversation is a system message followed by one user prompt.
    The test expects one output per conversation, i.e. two in total.
    """
    user_prompts = (
        "Explain the concept of entropy.",
        "Explain what among us is.",
    )
    # Each conversation gets its own message dicts (nothing is shared).
    conversations = [[
        {
            "role": "system",
            "content": "You are a helpful assistant"
        },
        {
            "role": "user",
            "content": user_prompt
        },
    ] for user_prompt in user_prompts]

    engine = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
    replies = engine.chat(conversations)

    assert len(replies) == 2
@pytest.mark.parametrize("image_urls",
                         [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
def test_chat_multi_image(image_urls: List[str]):
    """Send one user message carrying several images through ``LLM.chat``.

    Args:
        image_urls: URLs placed into the message as ``image_url`` content
            parts, ahead of the text question.

    The engine is created with ``limit_mm_per_prompt={"image": 2}`` so that
    two images fit into one prompt.
    """
    llm = LLM(
        model="microsoft/Phi-3.5-vision-instruct",
        dtype="bfloat16",
        max_model_len=4096,
        max_num_seqs=5,
        enforce_eager=True,
        trust_remote_code=True,
        limit_mm_per_prompt={"image": 2},
    )

    image_parts = [{
        "type": "image_url",
        "image_url": {
            "url": image_url
        }
    } for image_url in image_urls]
    messages = [{
        "role":
        "user",
        "content": [
            *image_parts,
            {
                "type": "text",
                "text": "What's in this image?"
            },
        ],
    }]

    outputs = llm.chat(messages)

    # A single conversation was submitted, so exactly one output is
    # expected (same expectation as test_chat).  The previous check,
    # ``len(outputs) >= 0``, could never fail.
    assert len(outputs) == 1
tests/entrypoints/llm/test_generate_multiple_loras.py
View file @
ad385667
...
...
@@ -50,7 +50,7 @@ def zephyr_lora_files():
@
pytest
.
mark
.
skip_global_cleanup
def
test_multiple_lora_requests
(
llm
:
LLM
,
zephyr_lora_files
):
lora_request
=
[
LoRARequest
(
LORA_NAME
,
idx
+
1
,
zephyr_lora_files
)
LoRARequest
(
LORA_NAME
+
str
(
idx
)
,
idx
+
1
,
zephyr_lora_files
)
for
idx
in
range
(
len
(
PROMPTS
))
]
# Multiple SamplingParams should be matched with each prompt
...
...
tests/entrypoints/llm/test_guided_generate.py
View file @
ad385667
...
...
@@ -7,7 +7,7 @@ import pytest
from
vllm.entrypoints.llm
import
LLM
from
vllm.outputs
import
RequestOutput
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
GuidedDecodingParams
,
SamplingParams
from
...conftest
import
cleanup
...
...
@@ -31,14 +31,12 @@ def test_guided_regex(sample_regex, llm):
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
,
)
outputs
=
llm
.
generate
(
prompts
=
[
f
"Give an example IPv4 address with this regex:
{
sample_regex
}
"
]
*
2
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
,
guided_options_request
=
dict
(
guided_regex
=
sample_regex
))
guided_decoding
=
GuidedDecodingParams
(
regex
=
sample_regex
))
outputs
=
llm
.
generate
(
prompts
=
[
f
"Give an example IPv4 address with this regex:
{
sample_regex
}
"
]
*
2
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
)
assert
outputs
is
not
None
for
output
in
outputs
:
...
...
@@ -57,15 +55,13 @@ def test_guided_json_completion(sample_json_schema, llm):
sampling_params
=
SamplingParams
(
temperature
=
1.0
,
max_tokens
=
1000
,
)
outputs
=
llm
.
generate
(
prompts
=
[
f
"Give an example JSON for an employee profile "
f
"that fits this schema:
{
sample_json_schema
}
"
]
*
2
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
,
guided_options_request
=
dict
(
guided_json
=
sample_json_schema
))
guided_decoding
=
GuidedDecodingParams
(
json
=
sample_json_schema
))
outputs
=
llm
.
generate
(
prompts
=
[
f
"Give an example JSON for an employee profile "
f
"that fits this schema:
{
sample_json_schema
}
"
]
*
2
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
)
assert
outputs
is
not
None
...
...
@@ -86,12 +82,11 @@ def test_guided_choice_completion(sample_guided_choice, llm):
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
,
)
guided_decoding
=
GuidedDecodingParams
(
choice
=
sample_guided_choice
)
)
outputs
=
llm
.
generate
(
prompts
=
"The best language for type-safe systems programming is "
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
,
guided_options_request
=
dict
(
guided_choice
=
sample_guided_choice
))
use_tqdm
=
True
)
assert
outputs
is
not
None
for
output
in
outputs
:
...
...
@@ -112,13 +107,13 @@ def test_guided_grammar(sample_sql_statements, llm):
temperature
=
0.8
,
top_p
=
0.95
,
max_tokens
=
1000
,
)
guided_decoding
=
GuidedDecodingParams
(
grammar
=
sample_sql_statements
)
)
outputs
=
llm
.
generate
(
prompts
=
(
"Generate a sql state that select col_1 from "
"table_1 where it is equals to 1"
),
sampling_params
=
sampling_params
,
use_tqdm
=
True
,
guided_options_request
=
dict
(
guided_grammar
=
sample_sql_statements
)
)
)
assert
outputs
is
not
None
for
output
in
outputs
:
...
...
@@ -140,3 +135,28 @@ def test_guided_grammar(sample_sql_statements, llm):
assert
generated_text
.
strip
()
==
ground_truth
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
@pytest.mark.skip_global_cleanup
def test_guided_options_request_deprecation_warning(sample_regex, llm):
    """Passing ``guided_options_request`` must emit a DeprecationWarning.

    The warning message is matched against the text
    ``guided_options_request``.
    """
    params = SamplingParams(temperature=0.8, top_p=0.95)
    legacy_guided_options = {"guided_regex": sample_regex}

    # NOTE: despite the prompt text, this call is expected to warn, not
    # to raise.
    with pytest.warns(DeprecationWarning, match="guided_options_request"):
        llm.generate(
            prompts="This should fail",
            sampling_params=params,
            use_tqdm=True,
            guided_options_request=legacy_guided_options,
        )
@pytest.mark.skip_global_cleanup
def test_validation_against_both_guided_decoding_options(sample_regex, llm):
    """Supplying guided decoding in two places at once must be rejected.

    ``SamplingParams.guided_decoding`` and the ``guided_options_request``
    argument are both set; ``generate`` is expected to raise a ValueError
    whose message matches ``Cannot set both``.
    """
    guided = GuidedDecodingParams(regex=sample_regex)
    params = SamplingParams(temperature=0.8,
                            top_p=0.95,
                            guided_decoding=guided)
    legacy_guided_options = {"guided_regex": sample_regex}

    with pytest.raises(ValueError, match="Cannot set both"):
        llm.generate(
            prompts="This should fail",
            sampling_params=params,
            use_tqdm=True,
            guided_options_request=legacy_guided_options,
        )
tests/entrypoints/llm/test_lazy_outlines.py
0 → 100644
View file @
ad385667
import
sys
from
vllm
import
LLM
,
SamplingParams
def test_lazy_outlines(sample_regex):
    """Check that ``outlines`` never shows up in ``sys.modules``.

    Two generations run against facebook/opt-125m: a plain one, then a
    regex-guided one on an engine configured with the
    ``lm-format-enforcer`` guided-decoding backend.  After each run the
    test asserts that no module named ``outlines`` has been imported.
    """

    def _report(request_outputs):
        # Echo each prompt together with its first completion.
        for item in request_outputs:
            prompt = item.prompt
            generated_text = item.outputs[0].text
            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

    # --- unguided generation -------------------------------------------
    plain_prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    params = SamplingParams(temperature=0.8, top_p=0.95)
    engine = LLM(model="facebook/opt-125m",
                 enforce_eager=True,
                 gpu_memory_utilization=0.3)
    _report(engine.generate(plain_prompts, params))

    # make sure outlines is not imported
    assert 'outlines' not in sys.modules

    # --- guided generation with a non-outlines backend -----------------
    # The same local name is rebound so the first engine is no longer
    # referenced from this frame once the second one exists.
    engine = LLM(model="facebook/opt-125m",
                 enforce_eager=True,
                 guided_decoding_backend="lm-format-enforcer",
                 gpu_memory_utilization=0.3)
    params = SamplingParams(temperature=0.8, top_p=0.95)
    guided_prompt = (
        f"Give an example IPv4 address with this regex: {sample_regex}")
    guided_outputs = engine.generate(
        prompts=[guided_prompt] * 2,
        sampling_params=params,
        use_tqdm=True,
        guided_options_request={"guided_regex": sample_regex},
    )
    _report(guided_outputs)

    # make sure outlines is not imported
    assert 'outlines' not in sys.modules
tests/entrypoints/llm/test_prompt_validation.py
0 → 100644
View file @
ad385667
import
pytest
from
vllm
import
LLM
def test_empty_prompt():
    """An empty prompt string must be rejected by ``LLM.generate``.

    Expects a ValueError whose message matches ``Prompt cannot be empty``.
    """
    engine = LLM(model="gpt2")
    empty_batch = [""]

    with pytest.raises(ValueError, match='Prompt cannot be empty'):
        engine.generate(empty_batch)
tests/entrypoints/offline_mode/__init__.py
0 → 100644
View file @
ad385667
tests/entrypoints/offline_mode/test_offline_mode.py
0 → 100644
View file @
ad385667
"""Tests for HF_HUB_OFFLINE mode"""
import
importlib
import
sys
import
weakref
import
pytest
from
vllm
import
LLM
from
...conftest
import
cleanup
MODEL_NAME
=
"facebook/opt-125m"
@pytest.fixture(scope="module")
def llm():
    """Module-scoped engine for ``MODEL_NAME``, handed out as a weak proxy.

    The proxy is yielded inside ``deprecate_legacy_api()``; afterwards the
    local strong reference is dropped and ``cleanup()`` is called.
    """
    engine_kwargs = dict(
        model=MODEL_NAME,
        max_num_batched_tokens=4096,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.10,
        enforce_eager=True,
    )
    engine = LLM(**engine_kwargs)

    with engine.deprecate_legacy_api():
        # pytest caches the fixture value, so only a weakref.proxy is
        # exposed; that way the cache does not hold a strong reference
        # and the engine can be garbage collected.
        yield weakref.proxy(engine)

        del engine

    cleanup()
@pytest.mark.skip_global_cleanup
def test_offline_mode(llm: LLM, monkeypatch):
    """Construct an ``LLM`` while ``HF_HUB_OFFLINE=1`` is set.

    Args:
        llm: Requested only so the model files are already in the local
            cache; the reference is dropped immediately.
        monkeypatch: pytest fixture used to set and remove the
            environment variable.
    """
    # we use the llm fixture to ensure the model files are in-cache
    del llm

    offline_engine_kwargs = dict(
        model=MODEL_NAME,
        max_num_batched_tokens=4096,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.10,
        enforce_eager=True,
    )

    # Set HF to offline mode and ensure we can still construct an LLM
    try:
        monkeypatch.setenv("HF_HUB_OFFLINE", "1")
        # huggingface_hub and friends have to be re-imported so that they
        # pick up the offline setting
        _re_import_modules()
        # Cached model files should be used in offline mode
        LLM(**offline_engine_kwargs)
    finally:
        # Reset the environment after the test
        # NB: Assuming tests are run in online mode
        monkeypatch.delenv("HF_HUB_OFFLINE")
        _re_import_modules()
def _re_import_modules():
    """Reload every loaded ``huggingface_hub*`` and ``transformers*`` module.

    Modules whose name starts with ``transformers_modules`` are left alone.
    All ``huggingface_hub`` modules are reloaded first, then the
    ``transformers`` ones.

    Raises:
        Exception: if any reload failed, the most recent failure is
            re-raised after every module has been attempted.
    """
    loaded_names = list(sys.modules)
    hub_names = [
        name for name in loaded_names if name.startswith("huggingface_hub")
    ]
    transformers_names = [
        name for name in loaded_names if name.startswith("transformers")
        and not name.startswith("transformers_modules")
    ]

    last_failure = None
    for name in [*hub_names, *transformers_names]:
        try:
            importlib.reload(sys.modules[name])
        except Exception as exc:
            # Remember the error but keep going, so that as much state as
            # possible is refreshed and other tests are less likely to be
            # affected.
            last_failure = exc

    # Error this test if reloading a module failed
    if last_failure is not None:
        raise last_failure
tests/entrypoints/openai/test_accuracy.py
0 → 100644
View file @
ad385667
"""
This file tests the accuracy of the vLLM server via LMEval.
It uses local-completions, which interacts with vLLM
through the OAI API with N concurrent connections.
This simulates real-world usage of the API and makes
sure that the zmq frontend mp RPC message passing and
AsyncLLMEngine are working correctly.
"""
import
lm_eval
import
pytest
from
...utils
import
RemoteOpenAIServer
MODEL_NAME
=
"Qwen/Qwen2-1.5B-Instruct"
NUM_CONCURRENT
=
500
TASK
=
"gsm8k"
FILTER
=
"exact_match,strict-match"
RTOL
=
0.03
EXPECTED_VALUE
=
0.58
DEFAULT_ARGS
=
[
"--max-model-len"
,
"4096"
,
"--disable-log-requests"
]
MORE_ARGS_LIST
=
[
[
"--enable-chunked-prefill"
],
# Chunked
[
"--num-scheduler-steps"
,
"8"
],
# MS
[
"--num-scheduler-steps"
,
"8"
,
"--multi-step-stream-outputs"
]
# MS+Stream
]
@pytest.mark.parametrize("more_args", MORE_ARGS_LIST)
def test_lm_eval_accuracy(more_args):
    """Score a served model with lm_eval and compare against a baseline.

    Args:
        more_args: Extra server CLI flags appended to ``DEFAULT_ARGS``.

    The measured ``FILTER`` metric of ``TASK`` must lie strictly within
    ``RTOL`` of ``EXPECTED_VALUE`` (``RTOL`` is added and subtracted
    directly, not multiplied).
    """
    args = [*DEFAULT_ARGS, *more_args]
    print(f"Running with: {args}")

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        url = f"{remote_server.url_for('v1')}/completions"

        # lm_eval takes its model settings as one comma-separated string.
        model_args = ",".join([
            f"model={MODEL_NAME}",
            f"base_url={url}",
            f"num_concurrent={NUM_CONCURRENT}",
            "tokenized_requests=False",
        ])

        results = lm_eval.simple_evaluate(
            model="local-completions",
            model_args=model_args,
            tasks=TASK,
        )

        measured_value = results["results"][TASK][FILTER]
        not_too_high = measured_value - RTOL < EXPECTED_VALUE
        not_too_low = measured_value + RTOL > EXPECTED_VALUE
        assert (not_too_high and not_too_low
                ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
tests/entrypoints/openai/test_audio.py
0 → 100644
View file @
ad385667
from
typing
import
Dict
,
List
import
openai
import
pytest
import
pytest_asyncio
from
vllm.assets.audio
import
AudioAsset
from
vllm.multimodal.utils
import
encode_audio_base64
,
fetch_audio
from
...utils
import
RemoteOpenAIServer
MODEL_NAME
=
"fixie-ai/ultravox-v0_3"
TEST_AUDIO_URLS
=
[
AudioAsset
(
"winning_call"
).
url
,
]
@pytest.fixture(scope="module")
def server():
    """Start one ``RemoteOpenAIServer`` for ``MODEL_NAME`` per module.

    The server object is yielded from inside its ``with`` block, so the
    context manager exits once the module's tests are done.
    """
    valued_flags = (
        ("--dtype", "bfloat16"),
        ("--max-model-len", "2048"),
        ("--max-num-seqs", "5"),
    )
    args = [token for pair in valued_flags for token in pair]
    args.append("--enforce-eager")

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Yield an async client obtained from ``server.get_async_client()``.

    The client is used as an async context manager, so it is exited after
    the requesting test finishes.
    """
    client_cm = server.get_async_client()
    async with client_cm as openai_client:
        yield openai_client
@pytest.fixture(scope="session")
def base64_encoded_audio() -> Dict[str, str]:
    """Map every URL in ``TEST_AUDIO_URLS`` to its base64-encoded audio.

    Each URL is fetched with ``fetch_audio`` and the result is unpacked
    into ``encode_audio_base64``.
    """
    encoded_by_url: Dict[str, str] = {}
    for audio_url in TEST_AUDIO_URLS:
        fetched = fetch_audio(audio_url)
        encoded_by_url[audio_url] = encode_audio_base64(*fetched)
    return encoded_by_url
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
                                         model_name: str, audio_url: str):
    """Two-turn chat whose first user message references audio by URL.

    First turn: expects one choice, finish reason ``length``, a usage of
    10 completion / 202 prompt / 212 total tokens, and an assistant
    message with at least 10 characters.  Second turn: the assistant
    reply plus a follow-up question are appended and the new reply only
    needs non-None content.
    """
    audio_part = {"type": "audio_url", "audio_url": {"url": audio_url}}
    text_part = {"type": "text", "text": "What's happening in this audio?"}
    messages = [{"role": "user", "content": [audio_part, text_part]}]

    # test single completion
    first_reply = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_tokens=10,
        logprobs=True,
        top_logprobs=5,
    )
    assert len(first_reply.choices) == 1

    only_choice = first_reply.choices[0]
    assert only_choice.finish_reason == "length"
    expected_usage = openai.types.CompletionUsage(completion_tokens=10,
                                                  prompt_tokens=202,
                                                  total_tokens=212)
    assert first_reply.usage == expected_usage

    assistant_msg = only_choice.message
    assert assistant_msg.content is not None
    assert len(assistant_msg.content) >= 10
    assert assistant_msg.role == "assistant"

    # test multi-turn dialogue
    messages.extend([
        {
            "role": "assistant",
            "content": assistant_msg.content
        },
        {
            "role": "user",
            "content": "express your result in json"
        },
    ])
    second_reply = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_tokens=10,
    )
    follow_up = second_reply.choices[0].message
    assert follow_up.content is not None
    # NOTE: the length check below is always true for any sized value.
    assert len(follow_up.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
async def test_single_chat_session_audio_base64encoded(
        client: openai.AsyncOpenAI, model_name: str, audio_url: str,
        base64_encoded_audio: Dict[str, str]):
    """Two-turn chat whose audio is passed inline as a base64 data URL.

    Args:
        base64_encoded_audio: Mapping from audio URL to base64 text; the
            entry for ``audio_url`` is embedded in a
            ``data:audio/wav;base64,`` URL.

    The assertions are the same as for the URL-based variant: one choice,
    finish reason ``length``, usage of 10/202/212 tokens, an assistant
    message of at least 10 characters, then a second turn whose reply
    only needs non-None content.
    """
    data_url = f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}"
    audio_part = {"type": "audio_url", "audio_url": {"url": data_url}}
    text_part = {"type": "text", "text": "What's happening in this audio?"}
    messages = [{"role": "user", "content": [audio_part, text_part]}]

    # test single completion
    first_reply = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_tokens=10,
        logprobs=True,
        top_logprobs=5,
    )
    assert len(first_reply.choices) == 1

    only_choice = first_reply.choices[0]
    assert only_choice.finish_reason == "length"
    expected_usage = openai.types.CompletionUsage(completion_tokens=10,
                                                  prompt_tokens=202,
                                                  total_tokens=212)
    assert first_reply.usage == expected_usage

    assistant_msg = only_choice.message
    assert assistant_msg.content is not None
    assert len(assistant_msg.content) >= 10
    assert assistant_msg.role == "assistant"

    # test multi-turn dialogue
    messages.extend([
        {
            "role": "assistant",
            "content": assistant_msg.content
        },
        {
            "role": "user",
            "content": "express your result in json"
        },
    ])
    second_reply = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_tokens=10,
    )
    follow_up = second_reply.choices[0].message
    assert follow_up.content is not None
    # NOTE: the length check below is always true for any sized value.
    assert len(follow_up.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
                                    model_name: str, audio_url: str):
    """Compare a streamed audio chat completion with the non-streamed one.

    Both requests use ``temperature=0.0`` and ``max_tokens=10``.  The
    streamed text pieces, joined together, must equal the non-streamed
    content; exactly one chunk may carry a finish reason, and the final
    chunk's finish reason must equal the non-streamed one.
    """
    messages = [{
        "role":
        "user",
        "content": [
            {
                "type": "audio_url",
                "audio_url": {
                    "url": audio_url
                }
            },
            {
                "type": "text",
                "text": "What's happening in this audio?"
            },
        ],
    }]
    request_kwargs = dict(
        model=model_name,
        messages=messages,
        max_tokens=10,
        temperature=0.0,
    )

    # test single completion
    reference = await client.chat.completions.create(**request_kwargs)
    reference_choice = reference.choices[0]
    output = reference_choice.message.content
    stop_reason = reference_choice.finish_reason

    # test streaming
    stream = await client.chat.completions.create(**request_kwargs,
                                                  stream=True)
    pieces: List[str] = []
    chunks_with_finish_reason = 0
    async for chunk in stream:
        streamed_choice = chunk.choices[0]
        delta = streamed_choice.delta
        if delta.role:
            assert delta.role == "assistant"
        if delta.content:
            pieces.append(delta.content)
        if streamed_choice.finish_reason is not None:
            chunks_with_finish_reason += 1

    # finish reason should only return in last block
    assert chunks_with_finish_reason == 1
    # ``chunk`` and ``delta`` still refer to the final streamed item here.
    assert chunk.choices[0].finish_reason == stop_reason
    assert delta.content
    assert "".join(pieces) == output
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
                                 audio_url: str):
    """A message with two audio parts must fail with ``BadRequestError``.

    After the rejected request, a plain token-id completion is sent to
    confirm that the server still answers.
    """
    # Two separate (but equal) audio parts followed by the text question.
    audio_parts = [{
        "type": "audio_url",
        "audio_url": {
            "url": audio_url
        }
    } for _ in range(2)]
    question = {"type": "text", "text": "What's happening in this audio?"}
    messages = [{"role": "user", "content": [*audio_parts, question]}]

    # test multi-audio input
    with pytest.raises(openai.BadRequestError):
        await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_tokens=10,
            temperature=0.0,
        )

    # the server should still work afterwards
    follow_up = await client.completions.create(
        model=model_name,
        prompt=[0] * 5,
        max_tokens=5,
        temperature=0.0,
    )
    completion_text = follow_up.choices[0].text
    assert completion_text is not None
    # NOTE: the length check below is always true for any sized value.
    assert len(completion_text) >= 0
tests/entrypoints/openai/test_basic.py
View file @
ad385667
from
http
import
HTTPStatus
from
typing
import
List
import
openai
import
pytest
import
pytest_asyncio
import
requests
from
vllm.version
import
__version__
as
VLLM_VERSION
...
...
@@ -11,8 +13,44 @@ from ...utils import RemoteOpenAIServer
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
@pytest.fixture(scope='module')
def server_args(request: pytest.FixtureRequest) -> List[str]:
    """Supply extra server CLI arguments via indirect parametrization.

    Returns an empty list when the fixture is not parametrized, a
    one-element list when the parameter is a single string, and the
    parameter itself otherwise.

    Usage:

    >>> @pytest.mark.parametrize(
    >>>     "server_args",
    >>>     [
    >>>         ["--disable-frontend-multiprocessing"],
    >>>         [
    >>>             "--model=NousResearch/Hermes-3-Llama-3.1-70B",
    >>>             "--enable-auto-tool-choice",
    >>>         ],
    >>>     ],
    >>>     indirect=True,
    >>> )
    >>> def test_foo(server, client):
    >>>     ...

    This will run `test_foo` twice with servers with:
    - `--disable-frontend-multiprocessing`
    - `--model=NousResearch/Hermes-3-Llama-3.1-70B --enable-auto-tool-choice`.
    """
    try:
        extra = request.param
    except AttributeError:
        # No ``param`` attribute: the test did not parametrize this
        # fixture.
        return []

    return [extra] if isinstance(extra, str) else extra
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
def
server
(
server_args
):
args
=
[
# use half precision for speed and memory savings in CI environment
"--dtype"
,
...
...
@@ -22,17 +60,28 @@ def server():
"--enforce-eager"
,
"--max-num-seqs"
,
"128"
,
*
server_args
,
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
@
pytest
.
fixture
(
scope
=
"module"
)
def
client
(
server
):
return
server
.
get_async_client
()
@
pytest_asyncio
.
fixture
async
def
client
(
server
):
async
with
server
.
get_async_client
()
as
async_client
:
yield
async_client
@
pytest
.
mark
.
parametrize
(
"server_args"
,
[
pytest
.
param
([],
id
=
"default-frontend-multiprocessing"
),
pytest
.
param
([
"--disable-frontend-multiprocessing"
],
id
=
"disable-frontend-multiprocessing"
)
],
indirect
=
True
,
)
@
pytest
.
mark
.
asyncio
async
def
test_show_version
(
client
:
openai
.
AsyncOpenAI
):
base_url
=
str
(
client
.
base_url
)[:
-
3
].
strip
(
"/"
)
...
...
@@ -43,6 +92,15 @@ async def test_show_version(client: openai.AsyncOpenAI):
assert
response
.
json
()
==
{
"version"
:
VLLM_VERSION
}
@
pytest
.
mark
.
parametrize
(
"server_args"
,
[
pytest
.
param
([],
id
=
"default-frontend-multiprocessing"
),
pytest
.
param
([
"--disable-frontend-multiprocessing"
],
id
=
"disable-frontend-multiprocessing"
)
],
indirect
=
True
,
)
@
pytest
.
mark
.
asyncio
async
def
test_check_health
(
client
:
openai
.
AsyncOpenAI
):
base_url
=
str
(
client
.
base_url
)[:
-
3
].
strip
(
"/"
)
...
...
@@ -50,12 +108,3 @@ async def test_check_health(client: openai.AsyncOpenAI):
response
=
requests
.
get
(
base_url
+
"/health"
)
assert
response
.
status_code
==
HTTPStatus
.
OK
@
pytest
.
mark
.
asyncio
async
def
test_log_metrics
(
client
:
openai
.
AsyncOpenAI
):
base_url
=
str
(
client
.
base_url
)[:
-
3
].
strip
(
"/"
)
response
=
requests
.
get
(
base_url
+
"/metrics"
)
assert
response
.
status_code
==
HTTPStatus
.
OK
tests/entrypoints/openai/test_chat.py
View file @
ad385667
# imports for guided decoding tests
import
json
import
re
from
typing
import
List
from
typing
import
Dict
,
List
,
Optional
import
jsonschema
import
openai
# use the official client for correctness check
import
pytest
import
pytest_asyncio
import
torch
from
openai
import
BadRequestError
...
...
@@ -46,9 +47,10 @@ def server(zephyr_lora_files, zephyr_lora_added_tokens_files): # noqa: F811
yield
remote_server
@
pytest
.
fixture
(
scope
=
"module"
)
def
client
(
server
):
return
server
.
get_async_client
()
@
pytest_asyncio
.
fixture
async
def
client
(
server
):
async
with
server
.
get_async_client
()
as
async_client
:
yield
async_client
@
pytest
.
mark
.
asyncio
...
...
@@ -174,6 +176,88 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI,
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
0
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name, prompt_logprobs",
    [(MODEL_NAME, value) for value in (1, 0, -1, None)],
)
async def test_prompt_logprobs_chat(client: openai.AsyncOpenAI,
                                    model_name: str,
                                    prompt_logprobs: Optional[int]):
    """Exercise the ``prompt_logprobs`` extra-body field on chat requests.

    * ``None``: the field is not sent and the response must carry no
      prompt logprobs.
    * negative: the request must fail with ``BadRequestError``.
    * otherwise: the response must carry a non-empty ``prompt_logprobs``.
    """
    history = [
        ("system", "You are a helpful assistant."),
        ("user", "Who won the world series in 2020?"),
        ("assistant",
         "The Los Angeles Dodgers won the World Series in 2020."),
        ("user", "Where was it played?"),
    ]
    params: Dict = {
        "messages": [{
            "role": role,
            "content": content
        } for role, content in history],
        "model": model_name,
    }

    if prompt_logprobs is None:
        completion = await client.chat.completions.create(**params)
        assert completion.prompt_logprobs is None
        return

    params["extra_body"] = {"prompt_logprobs": prompt_logprobs}

    if prompt_logprobs < 0:
        with pytest.raises(BadRequestError):
            await client.chat.completions.create(**params)
        return

    completion = await client.chat.completions.create(**params)
    assert completion.prompt_logprobs is not None
    assert len(completion.prompt_logprobs) > 0
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_more_than_one_prompt_logprobs_chat(client: openai.AsyncOpenAI,
                                                  model_name: str):
    """Send the same chat request with ``prompt_logprobs`` 1, then 2.

    The entry at index 3 of each response's ``prompt_logprobs`` must have
    length 1 for the first request and length 2 for the second.
    """
    history = [
        ("system", "You are a helpful assistant."),
        ("user", "Who won the world series in 2020?"),
        ("assistant",
         "The Los Angeles Dodgers won the World Series in 2020."),
        ("user", "Where was it played?"),
    ]
    params: Dict = {
        "messages": [{
            "role": role,
            "content": content
        } for role, content in history],
        "model": model_name,
        "extra_body": {
            "prompt_logprobs": 1
        },
    }

    with_one = await client.chat.completions.create(**params)

    # Same request again, asking for two logprobs this time.
    params["extra_body"] = {"prompt_logprobs": 2}
    with_two = await client.chat.completions.create(**params)

    assert len(with_one.prompt_logprobs[3]) == 1
    assert len(with_two.prompt_logprobs[3]) == 2
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
...
...
@@ -349,18 +433,28 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
extra_body
=
dict
(
min_tokens
=
10
),
temperature
=
0.0
,
stream
=
True
,
stream_options
=
{
"include_usage"
:
True
,
"continuous_usage_stats"
:
True
"continuous_usage_stats"
:
True
,
},
)
last_completion_tokens
=
0
async
for
chunk
in
stream
:
assert
chunk
.
usage
.
prompt_tokens
>=
0
assert
chunk
.
usage
.
completion_tokens
>=
0
assert
last_completion_tokens
==
0
or
\
chunk
.
usage
.
completion_tokens
>
last_completion_tokens
or
\
(
not
chunk
.
choices
and
chunk
.
usage
.
completion_tokens
==
last_completion_tokens
)
assert
chunk
.
usage
.
total_tokens
==
(
chunk
.
usage
.
prompt_tokens
+
chunk
.
usage
.
completion_tokens
)
last_completion_tokens
=
chunk
.
usage
.
completion_tokens
assert
last_completion_tokens
==
10
# NOTE: Not sure why, but when I place this after `test_guided_regex_chat`
...
...
@@ -755,6 +849,39 @@ async def test_response_format_json_object(client: openai.AsyncOpenAI):
assert
loaded
==
{
"result"
:
2
},
loaded
@pytest.mark.asyncio
async def test_response_format_json_schema(client: openai.AsyncOpenAI):
    """Request a ``json_schema`` response format and check the parsed reply.

    The request is issued twice in a row; each reply's message content must
    be non-None and must decode to exactly ``{"result": 2}``.
    """
    for _ in range(2):
        question = {
            "role":
            "user",
            "content": ('what is 1+1? please respond with a JSON object, '
                        'the format is {"result": 2}')
        }
        schema_format = {
            "type": "json_schema",
            "json_schema": {
                "name": "foo_test",
                "schema": {
                    "type": "object",
                    "properties": {
                        "result": {
                            "type": "integer"
                        },
                    },
                },
            }
        }
        resp = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=[question],
            response_format=schema_format)

        raw_reply = resp.choices[0].message.content
        assert raw_reply is not None

        # Include the decoded object in the failure message for debugging.
        loaded = json.loads(raw_reply)
        assert loaded == {"result": 2}, loaded
@
pytest
.
mark
.
asyncio
async
def
test_extra_fields
(
client
:
openai
.
AsyncOpenAI
):
with
pytest
.
raises
(
BadRequestError
)
as
exc_info
:
...
...
tests/
async_engine
/test_chat_template.py
→
tests/
entrypoints/openai
/test_chat_template.py
View file @
ad385667
import
os
import
pathlib
import
pytest
from
vllm.entrypoints.chat_utils
import
load_chat_template
from
vllm.entrypoints.chat_utils
import
(
apply_hf_chat_template
,
load_chat_template
)
from
vllm.entrypoints.openai.protocol
import
ChatCompletionRequest
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
chatml_jinja_path
=
pathlib
.
Path
(
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))).
parent
.
parent
/
"examples/template_chatml.jinja"
from
...utils
import
VLLM_PATH
chatml_jinja_path
=
VLLM_PATH
/
"examples/template_chatml.jinja"
assert
chatml_jinja_path
.
exists
()
# Define models, templates, and their corresponding expected outputs
MODEL_TEMPLATE_GENERATON_OUTPUT
=
[
(
"facebook/opt-125m"
,
None
,
True
,
"Hello</s>Hi there!</s>What is the capital of</s>"
),
(
"facebook/opt-125m"
,
None
,
False
,
"Hello</s>Hi there!</s>What is the capital of</s>"
),
(
"facebook/opt-125m"
,
chatml_jinja_path
,
True
,
"""<|im_start|>user
(
"facebook/opt-125m"
,
chatml_jinja_path
,
True
,
False
,
"""<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
...
...
@@ -25,12 +20,20 @@ Hi there!<|im_end|>
What is the capital of<|im_end|>
<|im_start|>assistant
"""
),
(
"facebook/opt-125m"
,
chatml_jinja_path
,
False
,
"""<|im_start|>user
(
"facebook/opt-125m"
,
chatml_jinja_path
,
False
,
False
,
"""<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of"""
),
(
"facebook/opt-125m"
,
chatml_jinja_path
,
False
,
True
,
"""<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of"""
)
What is the capital of<|im_end|>
<|im_start|>assistant
The capital of"""
),
]
TEST_MESSAGES
=
[
...
...
@@ -47,6 +50,10 @@ TEST_MESSAGES = [
'content'
:
'What is the capital of'
},
]
# Partial assistant turn that test_get_gen_prompt appends to TEST_MESSAGES
# when it is parametrized with continue_final_message=True.
ASSISTANT_MESSAGE_TO_CONTINUE = {
    'role': 'assistant',
    'content': 'The capital of',
}
def
test_load_chat_template
():
...
...
@@ -78,10 +85,10 @@ def test_no_load_chat_template_literallike():
@
pytest
.
mark
.
parametrize
(
"model,template,add_generation_prompt,expected_output"
,
"model,template,add_generation_prompt,
continue_final_message,
expected_output"
,
MODEL_TEMPLATE_GENERATON_OUTPUT
)
def
test_get_gen_prompt
(
model
,
template
,
add_generation_prompt
,
expected_output
):
continue_final_message
,
expected_output
):
# Initialize the tokenizer
tokenizer
=
get_tokenizer
(
tokenizer_name
=
model
)
template_content
=
load_chat_template
(
chat_template
=
template
)
...
...
@@ -89,15 +96,20 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
# Create a mock request object using keyword arguments
mock_request
=
ChatCompletionRequest
(
model
=
model
,
messages
=
TEST_MESSAGES
,
add_generation_prompt
=
add_generation_prompt
)
messages
=
TEST_MESSAGES
+
[
ASSISTANT_MESSAGE_TO_CONTINUE
]
if
continue_final_message
else
TEST_MESSAGES
,
add_generation_prompt
=
add_generation_prompt
,
continue_final_message
=
continue_final_message
,
)
# Call the function and get the result
result
=
tokenizer
.
apply_chat_template
(
result
=
apply_hf_chat_template
(
tokenizer
,
conversation
=
mock_request
.
messages
,
tokenize
=
False
,
chat_template
=
mock_request
.
chat_template
or
template_content
,
add_generation_prompt
=
mock_request
.
add_generation_prompt
,
chat_template
=
mock_request
.
chat_template
or
template_content
)
continue_final_message
=
mock_request
.
continue_final_message
,
)
# Test assertion
assert
result
==
expected_output
,
(
...
...
tests/entrypoints/openai/test_chunked_prompt.py
0 → 100644
View file @
ad385667
import
openai
# use the official client for correctness check
import
pytest
import
pytest_asyncio
from
...utils
import
RemoteOpenAIServer
# Model served for every test in this module.
# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@pytest.fixture(scope="module")
def server():
    """Start one ``RemoteOpenAIServer`` with chunked prefill for the module.

    The server is entered as a context manager, so it is torn down when the
    module-scoped fixture is finalized.
    """
    cli_flags = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
        # batching / chunked-prefill config below
        "--max-num-seqs",
        "128",
        "--enable-chunked-prefill",
        "--max-num-batched-tokens",
        "1000",
        # large prompts create a lot of output
        "--disable-log-requests",
    ]

    with RemoteOpenAIServer(MODEL_NAME, cli_flags) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Yield an async client obtained from the ``server`` fixture.

    The client is used as an async context manager, so it is closed once
    the requesting test has finished.
    """
    async with server.get_async_client() as openai_client:
        yield openai_client
@pytest.mark.asyncio
async def test_completion_stream_options_and_logprobs_with_long_prompts(
        client: openai.AsyncOpenAI):
    """Stream a completion for a very long prompt with usage stats enabled.

    Every chunk must carry a consistent usage record. Until a finish reason
    has been seen, each chunk must contain non-empty text and is counted;
    from the finishing chunk onwards the reported ``completion_tokens`` must
    equal that count.
    """
    # Test stream with long prompt
    long_prompt = "What is the capital of France?" * 400

    stream = await client.completions.create(
        model=MODEL_NAME,
        prompt=long_prompt,
        max_tokens=5,
        temperature=0.0,
        stream=True,
        stream_options={
            "include_usage": True,
            "continuous_usage_stats": True,
        },
        logprobs=5,
    )

    text_chunks_seen = 0
    done = False
    async for chunk in stream:
        usage = chunk.usage
        assert usage.prompt_tokens >= 0
        assert usage.completion_tokens >= 0
        assert usage.total_tokens == (usage.prompt_tokens +
                                      usage.completion_tokens)

        if not done:
            text_chunks_seen += 1
            assert chunk.choices[0].text

            # ``choices`` is only inspected while the stream is unfinished.
            if chunk.choices[0].finish_reason is not None:
                done = True

        if done:
            assert usage.completion_tokens == text_chunks_seen
@pytest.mark.asyncio
async def test_chat_completion_stream_options_and_logprobs_with_long_prompts(
        client: openai.AsyncOpenAI):
    """Stream a chat completion for a very long prompt with usage stats.

    Every chunk must carry a consistent usage record. Chunks whose delta
    content is the empty string must report zero completion tokens and no
    logprobs, and at most one such chunk may appear. All other chunks seen
    before the finish reason are counted, and from the finishing chunk
    onwards ``completion_tokens`` must equal that count.
    """
    # Test stream with long prompt
    conversation = [{
        "role": "system",
        "content": "You are a helpful assistant."
    }, {
        "role": "user",
        "content": "What is the capital of France?" * 400
    }]
    stream = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=conversation,
        max_tokens=5,
        temperature=0.0,
        stream=True,
        stream_options={
            "include_usage": True,
            "continuous_usage_stats": True,
        },
        logprobs=True,
        top_logprobs=5,
    )

    content_chunks_seen = 0
    empty_chunks_seen = 0
    done = False
    async for chunk in stream:
        usage = chunk.usage
        assert usage.prompt_tokens >= 0
        assert usage.completion_tokens >= 0
        assert usage.total_tokens == (usage.prompt_tokens +
                                      usage.completion_tokens)

        if not done:
            if chunk.choices[0].delta.content == "":
                # no tokens have been generated yet
                assert usage.completion_tokens == 0
                assert chunk.choices[0].logprobs is None
                empty_chunks_seen += 1
            else:
                content_chunks_seen += 1

            if chunk.choices[0].finish_reason is not None:
                done = True

        if done:
            assert usage.completion_tokens == content_chunks_seen

    assert empty_chunks_seen <= 1
tests/entrypoints/openai/test_cli_args.py
0 → 100644
View file @
ad385667
import
json
import
pytest
from
vllm.entrypoints.openai.cli_args
import
(
make_arg_parser
,
validate_parsed_serve_args
)
from
vllm.entrypoints.openai.serving_engine
import
LoRAModulePath
from
vllm.utils
import
FlexibleArgumentParser
from
...utils
import
VLLM_PATH
# JSON-style LoRA module description; the tests below serialize it with
# json.dumps and pass it to --lora-modules.
LORA_MODULE = {
    "name": "module2",
    "path": "/path/to/module2",
    "base_model_name": "llama",
}

# Chat template shipped with the repository examples. Fail at import time
# if it is missing, since the chat-template validation test relies on it.
CHATML_JINJA_PATH = VLLM_PATH / "examples/template_chatml.jinja"
assert CHATML_JINJA_PATH.exists()
@pytest.fixture
def serve_parser():
    """Return the result of ``make_arg_parser`` on a fresh parser."""
    return make_arg_parser(
        FlexibleArgumentParser(description="vLLM's remote OpenAI server."))
### Tests for Lora module parsing
def test_valid_key_value_format(serve_parser):
    """A single ``name=path`` value parses into one ``LoRAModulePath``."""
    # Test old format: name=path
    parsed = serve_parser.parse_args([
        '--lora-modules',
        'module1=/path/to/module1',
    ])
    assert parsed.lora_modules == [
        LoRAModulePath(name='module1', path='/path/to/module1')
    ]
def test_valid_json_format(serve_parser):
    """A JSON-encoded module parses into one ``LoRAModulePath``."""
    # Test valid JSON format input
    parsed = serve_parser.parse_args([
        '--lora-modules',
        json.dumps(LORA_MODULE),
    ])
    assert parsed.lora_modules == [
        LoRAModulePath(name='module2',
                       path='/path/to/module2',
                       base_model_name='llama')
    ]
def test_invalid_json_format(serve_parser):
    """Malformed JSON for ``--lora-modules`` makes the parser exit."""
    # Test invalid JSON format input, missing closing brace
    truncated_json = '{"name": "module3", "path": "/path/to/module3"'
    with pytest.raises(SystemExit):
        serve_parser.parse_args(['--lora-modules', truncated_json])
def test_invalid_type_error(serve_parser):
    """A value that is neither JSON nor ``key=value`` makes the parser exit."""
    # Test type error when values are not JSON or key=value
    unparseable = 'invalid_format'  # This is not JSON or key=value format
    with pytest.raises(SystemExit):
        serve_parser.parse_args(['--lora-modules', unparseable])
def test_invalid_json_field(serve_parser):
    """Well-formed JSON without a ``path`` field makes the parser exit."""
    # Test valid JSON format but missing required fields
    missing_path = '{"name": "module4"}'  # Missing required 'path' field
    with pytest.raises(SystemExit):
        serve_parser.parse_args(['--lora-modules', missing_path])
def test_empty_values(serve_parser):
    """An empty ``--lora-modules`` value yields an empty module list."""
    # Test when no LoRA modules are provided
    parsed = serve_parser.parse_args(['--lora-modules', ''])
    assert parsed.lora_modules == []
def test_multiple_valid_inputs(serve_parser):
    """``name=path`` and JSON values can be mixed; input order is kept."""
    # Test multiple valid inputs (both old and JSON format)
    parsed = serve_parser.parse_args([
        '--lora-modules',
        'module1=/path/to/module1',
        json.dumps(LORA_MODULE),
    ])

    from_key_value = LoRAModulePath(name='module1', path='/path/to/module1')
    from_json = LoRAModulePath(name='module2',
                               path='/path/to/module2',
                               base_model_name='llama')
    assert parsed.lora_modules == [from_key_value, from_json]
### Tests for serve argument validation that run prior to loading
def test_enable_auto_choice_passes_without_tool_call_parser(serve_parser):
    """Ensure validation fails if tool choice is enabled with no call parser.

    NOTE(review): the function name says "passes", but the body expects
    ``validate_parsed_serve_args`` to raise ``TypeError``. The name is left
    unchanged so existing test selections keep working.
    """
    # Parsing itself succeeds; only the later validation step is expected
    # to reject --enable-auto-tool-choice without --tool-call-parser.
    parsed = serve_parser.parse_args(args=["--enable-auto-tool-choice"])
    with pytest.raises(TypeError):
        validate_parsed_serve_args(parsed)
def test_enable_auto_choice_passes_with_tool_call_parser(serve_parser):
    """Ensure validation passes with tool choice enabled with a call parser."""
    cli = [
        "--enable-auto-tool-choice",
        "--tool-call-parser",
        "mistral",
    ]
    parsed = serve_parser.parse_args(args=cli)
    # Must return without raising.
    validate_parsed_serve_args(parsed)
def test_chat_template_validation_for_happy_paths(serve_parser):
    """Ensure validation passes if the chat template exists."""
    template_path = CHATML_JINJA_PATH.absolute().as_posix()
    parsed = serve_parser.parse_args(args=["--chat-template", template_path])
    # Must return without raising.
    validate_parsed_serve_args(parsed)
def test_chat_template_validation_for_sad_paths(serve_parser):
    """Ensure validation fails if the chat template doesn't exist."""
    parsed = serve_parser.parse_args(
        args=["--chat-template", "does/not/exist"])
    with pytest.raises(ValueError):
        validate_parsed_serve_args(parsed)
tests/entrypoints/openai/test_completion.py
View file @
ad385667
...
...
@@ -3,11 +3,12 @@ import json
import
re
import
shutil
from
tempfile
import
TemporaryDirectory
from
typing
import
List
from
typing
import
Dict
,
List
,
Optional
import
jsonschema
import
openai
# use the official client for correctness check
import
pytest
import
pytest_asyncio
# downloading lora to test lora requests
from
huggingface_hub
import
snapshot_download
from
openai
import
BadRequestError
...
...
@@ -87,15 +88,19 @@ def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files,
]
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
(
default_server_args
):
@
pytest
.
fixture
(
scope
=
"module"
,
params
=
[
""
,
"--disable-frontend-multiprocessing"
])
def
server
(
default_server_args
,
request
):
if
request
.
param
:
default_server_args
.
append
(
request
.
param
)
with
RemoteOpenAIServer
(
MODEL_NAME
,
default_server_args
)
as
remote_server
:
yield
remote_server
@
pytest
.
fixture
(
scope
=
"module"
)
def
client
(
server
):
return
server
.
get_async_client
()
@
pytest_asyncio
.
fixture
async
def
client
(
server
):
async
with
server
.
get_async_client
()
as
async_client
:
yield
async_client
@
pytest
.
mark
.
asyncio
...
...
@@ -132,6 +137,7 @@ async def test_single_completion(client: openai.AsyncOpenAI, model_name: str,
temperature
=
0.0
,
)
assert
len
(
completion
.
choices
[
0
].
text
)
>=
1
assert
completion
.
choices
[
0
].
prompt_logprobs
is
None
@
pytest
.
mark
.
asyncio
...
...
@@ -269,6 +275,37 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
assert
len
(
completion
.
choices
[
0
].
text
)
>=
0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name, prompt_logprobs", [(MODEL_NAME, -1),
                                                         (MODEL_NAME, 0),
                                                         (MODEL_NAME, 1),
                                                         (MODEL_NAME, None)])
async def test_prompt_logprobs_completion(client: openai.AsyncOpenAI,
                                          model_name: str,
                                          prompt_logprobs: Optional[int]):
    """Exercise ``prompt_logprobs`` on a two-prompt completion request.

    * ``None``: the option is not sent and the first choice must carry no
      prompt logprobs.
    * negative: the request must fail with ``BadRequestError``.
    * otherwise: both choices must carry a non-empty ``prompt_logprobs``.
    """
    request: Dict = {
        "prompt": ["A robot may not injure another robot", "My name is"],
        "model": model_name,
    }

    if prompt_logprobs is None:
        completion = await client.completions.create(**request)
        assert completion.choices[0].prompt_logprobs is None
        return

    request["extra_body"] = {"prompt_logprobs": prompt_logprobs}

    if prompt_logprobs < 0:
        with pytest.raises(BadRequestError):
            await client.completions.create(**request)
        return

    completion = await client.completions.create(**request)
    # One choice per prompt; index explicitly so a missing choice still
    # surfaces as an IndexError.
    for position in (0, 1):
        assert completion.choices[position].prompt_logprobs is not None
        assert len(completion.choices[position].prompt_logprobs) > 0
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
...
...
@@ -466,8 +503,8 @@ async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
max_tokens
=
5
,
temperature
=
0.0
,
extra_body
=
dict
(
# NOTE: this has to be true for n > 1 in vLLM, but
not necessary
# for official client.
# NOTE: this has to be true for n > 1 in vLLM, but
#
not necessary
for official client.
use_beam_search
=
True
),
)
assert
len
(
batch
.
choices
)
==
4
...
...
tests/entrypoints/openai/test_disable_mp.py
deleted
100644 → 0
View file @
be0967c1
"""
Repeat of tests in test_completion.py with the non-mp backend.
"""
# imports for guided decoding tests
import
json
import
re
import
shutil
from
tempfile
import
TemporaryDirectory
from
typing
import
List
import
jsonschema
import
openai
# use the official client for correctness check
import
pytest
# downloading lora to test lora requests
from
huggingface_hub
import
snapshot_download
from
openai
import
BadRequestError
from
transformers
import
AutoTokenizer
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
...utils
import
RemoteOpenAIServer
# Base model served by the test server.
# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"

# LoRA adapter repository.
# technically these adapters use a different base model,
# but we're not testing generation quality here
LORA_NAME = "typeof/zephyr-7b-beta-lora"

# Prompt-adapter repository.
PA_NAME = "swapnilbp/llama_tweet_ptune"

# if PA_NAME changes, PA_NUM_VIRTUAL_TOKENS might also
# need to change to match the prompt adapter
PA_NUM_VIRTUAL_TOKENS = 8
@pytest.fixture(scope="module")
def zephyr_lora_files():
    """Return what ``snapshot_download`` returns for the LoRA repository."""
    lora_snapshot = snapshot_download(repo_id=LORA_NAME)
    return lora_snapshot
@pytest.fixture(scope="module")
def zephyr_lora_added_tokens_files(zephyr_lora_files):
    """Yield a temporary copy of the LoRA files with three added tokens.

    The adapter directory is copied into a fresh temporary directory, the
    tokenizer for ``MODEL_NAME`` is extended with three special tokens and
    saved into that copy, and the copy's path is yielded.

    The temporary directory is managed by a ``with`` block so it is removed
    both at fixture teardown and when any setup step raises before the
    ``yield`` (the previous manual ``cleanup()`` call only ran on teardown).
    """
    with TemporaryDirectory() as tmp_dir:
        tmp_model_dir = f"{tmp_dir}/zephyr"
        shutil.copytree(zephyr_lora_files, tmp_model_dir)

        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        # Copy tokenizer to adapter and add some unique tokens
        # 32000, 32001, 32002
        added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"],
                                     special_tokens=True)
        assert added == 3
        tokenizer.save_pretrained(tmp_model_dir)

        yield tmp_model_dir
@pytest.fixture(scope="module")
def zephyr_pa_files():
    """Return what ``snapshot_download`` returns for the prompt adapter."""
    pa_snapshot = snapshot_download(repo_id=PA_NAME)
    return pa_snapshot
@pytest.fixture(scope="module")
def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files,
                        zephyr_pa_files):
    """Build the server command-line flags shared by this module's tests.

    The list registers two LoRA modules and two prompt adapters (both
    adapters point at the same files) and ends with
    ``--disable-frontend-multiprocessing``.
    """
    engine_flags = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--max-num-seqs",
        "128",
        "--enforce-eager",
    ]
    # lora config
    lora_flags = [
        "--enable-lora",
        "--lora-modules",
        f"zephyr-lora={zephyr_lora_files}",
        f"zephyr-lora2={zephyr_lora_added_tokens_files}",
        "--max-lora-rank",
        "64",
        "--max-cpu-loras",
        "2",
    ]
    # pa config
    prompt_adapter_flags = [
        "--enable-prompt-adapter",
        "--prompt-adapters",
        f"zephyr-pa={zephyr_pa_files}",
        f"zephyr-pa2={zephyr_pa_files}",
        "--max-prompt-adapters",
        "2",
        "--max-prompt-adapter-token",
        "128",
    ]
    return (engine_flags + lora_flags + prompt_adapter_flags +
            ["--disable-frontend-multiprocessing"])
@pytest.fixture(scope="module")
def server(default_server_args):
    """Run one ``RemoteOpenAIServer`` for the whole module.

    The server is entered as a context manager, so it is torn down when the
    module-scoped fixture is finalized.
    """
    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as running:
        yield running
@pytest.fixture(scope="module")
def client(server):
    """Return a module-scoped async client obtained from ``server``."""
    async_client = server.get_async_client()
    return async_client
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # first test base model, then test loras, then test prompt adapters
    "model_name,num_virtual_tokens",
    [(MODEL_NAME, 0), ("zephyr-lora", 0), ("zephyr-lora2", 0),
     ("zephyr-pa", PA_NUM_VIRTUAL_TOKENS),
     ("zephyr-pa2", PA_NUM_VIRTUAL_TOKENS)],
)
async def test_single_completion(client: openai.AsyncOpenAI, model_name: str,
                                 num_virtual_tokens: int):
    """Check a basic completion for a text prompt and a token-ID prompt.

    For the text prompt the single returned choice must stop because of
    length, and the usage record must report 5 completion tokens and 6
    prompt tokens plus ``num_virtual_tokens``.
    """
    text_completion = await client.completions.create(
        model=model_name,
        prompt="Hello, my name is",
        max_tokens=5,
        temperature=0.0)

    assert text_completion.id is not None
    assert (text_completion.choices is not None
            and len(text_completion.choices) == 1)

    only_choice = text_completion.choices[0]
    assert len(only_choice.text) >= 5
    assert only_choice.finish_reason == "length"

    expected_usage = openai.types.CompletionUsage(
        completion_tokens=5,
        prompt_tokens=6 + num_virtual_tokens,
        total_tokens=11 + num_virtual_tokens)
    assert text_completion.usage == expected_usage

    # test using token IDs
    id_completion = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    assert len(id_completion.choices[0].text) >= 1
@pytest.mark.asyncio
async def test_added_lora_tokens(client: openai.AsyncOpenAI):
    """With ``zephyr-lora2`` the added token IDs echo back as added tokens."""
    # test using token IDs
    prompt_ids = [0, 0, 32000, 32001, 32002]
    completion = await client.completions.create(
        model="zephyr-lora2",
        prompt=prompt_ids,
        echo=True,
        max_tokens=5,
        temperature=0.0,
    )
    echoed_text = completion.choices[0].text
    # Added tokens should appear in tokenized prompt
    assert echoed_text.startswith("<unk><unk>vllm1vllm2vllm3")
@pytest.mark.asyncio
async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI):
    """With the base model the same token IDs must not echo ``vllm`` text."""
    # test using token IDs
    prompt_ids = [0, 0, 32000, 32001, 32002]
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt_ids,
        echo=True,
        max_tokens=5,
        temperature=0.0,
    )
    echoed_text = completion.choices[0].text
    # Added tokens should not appear in tokenized prompt
    assert "vllm" not in echoed_text
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # first test base model, then test loras, then test prompt adapters
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-lora2", "zephyr-pa", "zephyr-pa2"],
)
async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
    """With ``logprobs=None`` the returned choice carries no logprobs."""
    # test using token IDs
    completion = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
        logprobs=None,
    )
    first_choice = completion.choices[0]
    assert first_choice.logprobs is None
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # just test 1 lora and 1 pa hereafter
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
    """With ``logprobs=0`` the first ``top_logprobs`` entry has one item."""
    # test using token IDs
    completion = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
        logprobs=0,
    )
    logprobs = completion.choices[0].logprobs
    assert logprobs is not None
    assert logprobs.token_logprobs is not None
    assert logprobs.top_logprobs is not None
    assert len(logprobs.top_logprobs[0]) == 1
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
    """With ``logprobs=5`` the first ``top_logprobs`` entry has 5 or 6 items."""
    # test using token IDs
    completion = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
        logprobs=5,
    )
    logprobs = completion.choices[0].logprobs
    assert logprobs is not None
    assert logprobs.token_logprobs is not None
    assert logprobs.top_logprobs is not None
    assert 5 <= len(logprobs.top_logprobs[0]) <= 6
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
                                            model_name: str):
    """Requests asking for too many logprobs fail without breaking the server.

    A non-streaming request with ``logprobs=21`` and a streaming request
    with ``logprobs=30`` must each raise ``openai.BadRequestError`` or
    ``openai.APIError``. A plain request must still succeed afterwards.
    """
    rejected_with = (openai.BadRequestError, openai.APIError)

    with pytest.raises(rejected_with):
        # test using token IDs
        await client.completions.create(
            model=model_name,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
            # vLLM has higher default max_logprobs (20 instead of 5) to support
            # both Completion API and Chat Completion API
            logprobs=21,
        )

    with pytest.raises(rejected_with):
        # test using token IDs
        stream = await client.completions.create(
            model=model_name,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
            # vLLM has higher default max_logprobs (20 instead of 5) to support
            # both Completion API and Chat Completion API
            logprobs=30,
            stream=True,
        )
        # Drain the stream in case the error only surfaces while iterating.
        async for _chunk in stream:
            pass

    # the server should still work afterwards
    follow_up = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    assert len(follow_up.choices[0].text) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_completion_streaming(client: openai.AsyncOpenAI,
                                    model_name: str):
    """Streamed text must concatenate to the non-streamed completion.

    Exactly one chunk may carry a finish reason. The last chunk received
    must have finish reason ``"length"`` and non-empty text.
    """
    prompt = "What is an LLM?"

    reference = await client.completions.create(
        model=model_name,
        prompt=prompt,
        max_tokens=5,
        temperature=0.0,
    )
    reference_text = reference.choices[0].text

    stream = await client.completions.create(model=model_name,
                                             prompt=prompt,
                                             max_tokens=5,
                                             temperature=0.0,
                                             stream=True)
    pieces: List[str] = []
    finished_chunks = 0
    last_chunk = None
    async for chunk in stream:
        last_chunk = chunk
        pieces.append(chunk.choices[0].text)
        if chunk.choices[0].finish_reason is not None:
            finished_chunks += 1

    # finish reason should only return in last block
    assert finished_chunks == 1
    assert last_chunk.choices[0].finish_reason == "length"
    assert last_chunk.choices[0].text
    assert "".join(pieces) == reference_text
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_completion_stream_options(client: openai.AsyncOpenAI,
                                         model_name: str):
    """Check ``stream_options`` handling on the completions endpoint.

    Streaming requests cover all four combinations of ``include_usage`` and
    ``continuous_usage_stats``:

    * ``include_usage=False``: no chunk carries usage.
    * ``include_usage=True, continuous_usage_stats=False``: regular chunks
      carry no usage; the chunk after the finishing one carries usage and
      has no choices.
    * both ``True``: every chunk carries usage, and the extra final chunk
      has no choices.

    Non-streaming requests that pass any ``stream_options`` must fail with
    ``BadRequestError``.
    """
    prompt = "What is the capital of France?"

    async def open_stream(include_usage, continuous_usage_stats):
        # All streaming scenarios share one request; only the options vary.
        return await client.completions.create(
            model=model_name,
            prompt=prompt,
            max_tokens=5,
            temperature=0.0,
            stream=True,
            stream_options={
                "include_usage": include_usage,
                "continuous_usage_stats": continuous_usage_stats,
            })

    def check_usage_only_chunk(final_chunk):
        # The trailing chunk reports totals and carries no choices.
        assert final_chunk.usage is not None
        assert final_chunk.usage.prompt_tokens > 0
        assert final_chunk.usage.completion_tokens > 0
        assert final_chunk.usage.total_tokens == (
            final_chunk.usage.prompt_tokens +
            final_chunk.usage.completion_tokens)
        assert final_chunk.choices == []

    # Test stream=True, stream_options=
    #     {"include_usage": False, "continuous_usage_stats": False}
    stream = await open_stream(False, False)
    async for chunk in stream:
        assert chunk.usage is None

    # Test stream=True, stream_options=
    #     {"include_usage": False, "continuous_usage_stats": True}
    stream = await open_stream(False, True)
    async for chunk in stream:
        assert chunk.usage is None

    # Test stream=True, stream_options=
    #     {"include_usage": True, "continuous_usage_stats": False}
    stream = await open_stream(True, False)
    async for chunk in stream:
        is_finishing = chunk.choices[0].finish_reason is not None
        assert chunk.usage is None
        if is_finishing:
            # Pull the usage-only chunk that follows the finishing one.
            check_usage_only_chunk(await stream.__anext__())

    # Test stream=True, stream_options=
    #     {"include_usage": True, "continuous_usage_stats": True}
    stream = await open_stream(True, True)
    async for chunk in stream:
        assert chunk.usage is not None
        assert chunk.usage.prompt_tokens > 0
        assert chunk.usage.completion_tokens > 0
        assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
                                            chunk.usage.completion_tokens)
        if chunk.choices[0].finish_reason is not None:
            check_usage_only_chunk(await stream.__anext__())

    # Test stream=False with each of these stream_options, in order:
    #     {"include_usage": None}
    #     {"include_usage": True}
    #     {"continuous_usage_stats": None}
    #     {"continuous_usage_stats": True}
    rejected_option_sets = (
        {
            "include_usage": None
        },
        {
            "include_usage": True
        },
        {
            "continuous_usage_stats": None
        },
        {
            "continuous_usage_stats": True
        },
    )
    for options in rejected_option_sets:
        with pytest.raises(BadRequestError):
            await client.completions.create(model=model_name,
                                            prompt=prompt,
                                            max_tokens=5,
                                            temperature=0.0,
                                            stream=False,
                                            stream_options=options)
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
    """Check batched prompts for plain, ``n=2`` and streaming requests.

    Each scenario sends the same prompt twice, once as text and once as
    token IDs, and expects both copies to produce the same text.
    """
    text_batch = ["Hello, my name is"] * 2
    token_id_batch = [[0, 0, 0, 0, 0]] * 2

    # test both text and token IDs
    for prompts in (text_batch, token_id_batch):
        # test simple list
        plain = await client.completions.create(
            model=model_name,
            prompt=prompts,
            max_tokens=5,
            temperature=0.0,
        )
        assert len(plain.choices) == 2
        assert plain.choices[0].text == plain.choices[1].text

        # test n = 2
        beams = await client.completions.create(
            model=model_name,
            prompt=prompts,
            n=2,
            max_tokens=5,
            temperature=0.0,
            # NOTE: this has to be true for n > 1 in vLLM, but not necessary
            # for official client.
            extra_body={"use_beam_search": True},
        )
        assert len(beams.choices) == 4
        assert beams.choices[0].text != beams.choices[
            1].text, "beam search should be different"
        assert beams.choices[0].text == beams.choices[
            2].text, "two copies of the same prompt should be the same"
        assert beams.choices[1].text == beams.choices[
            3].text, "two copies of the same prompt should be the same"

        # test streaming
        streamed = await client.completions.create(
            model=model_name,
            prompt=prompts,
            max_tokens=5,
            temperature=0.0,
            stream=True,
        )
        # Accumulate text per prompt, keyed by the choice index.
        per_prompt_text = ["", ""]
        async for chunk in streamed:
            assert len(chunk.choices) == 1
            piece = chunk.choices[0]
            per_prompt_text[piece.index] += piece.text
        assert per_prompt_text[0] == per_prompt_text[1]
@pytest.mark.asyncio
async def test_logits_bias(client: openai.AsyncOpenAI):
    """Check that ``logit_bias`` can force and ban tokens.

    A bias of +100 on one token ID must make the output tokenize like five
    repetitions of that token. A bias of -100 on every token of an unbiased
    response must change the generated text.
    """
    prompt = "Hello, my name is"
    max_tokens = 5
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)

    def token_ids_of(text):
        # Tokenize without special tokens so only the text is compared.
        return tokenizer(text, add_special_tokens=False)["input_ids"]

    # Test exclusive selection
    forced_id = 1000
    forced = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
        logit_bias={str(forced_id): 100},
        seed=42,
    )
    assert len(forced.choices[0].text) >= 5
    got_ids = token_ids_of(forced.choices[0].text)
    want_ids = token_ids_of(tokenizer.decode([forced_id] * 5))
    assert all(got == want for got, want in zip(got_ids, want_ids))

    # Test ban
    unbiased = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
    )
    banned_ids = token_ids_of(unbiased.choices[0].text)
    first_response = unbiased.choices[0].text
    biased = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
        logit_bias={str(token): -100
                    for token in banned_ids},
    )
    assert first_response != biased.choices[0].text
@pytest.mark.asyncio
async def test_allowed_token_ids(client: openai.AsyncOpenAI):
    """The one generated token must come from ``allowed_token_ids``."""
    prompt = "Hello, my name is"
    max_tokens = 1
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)

    # Test exclusive selection
    allowed_ids = [21555, 21557, 21558]
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.0,
        seed=42,
        extra_body={"allowed_token_ids": allowed_ids},
        logprobs=1,
    )
    # The logprobs payload exposes the generated token strings.
    generated = completion.choices[0].logprobs.tokens
    assert len(generated) == 1
    generated_id = tokenizer.convert_tokens_to_ids(generated)[0]
    assert generated_id in allowed_ids
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
async def test_guided_json_completion(client: openai.AsyncOpenAI,
                                      guided_decoding_backend: str,
                                      sample_json_schema):
    """Each of three ``guided_json`` completions must validate as JSON.

    Every returned choice is parsed with ``json.loads`` and validated
    against ``sample_json_schema``.
    """
    guided_options = {
        "guided_json": sample_json_schema,
        "guided_decoding_backend": guided_decoding_backend,
    }
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt=f"Give an example JSON for an employee profile "
        f"that fits this schema: {sample_json_schema}",
        n=3,
        temperature=1.0,
        max_tokens=500,
        extra_body=guided_options)

    assert completion.id is not None
    assert len(completion.choices) == 3
    for choice in completion.choices:
        parsed = json.loads(choice.text)
        jsonschema.validate(instance=parsed, schema=sample_json_schema)
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
async def test_guided_regex_completion(client: openai.AsyncOpenAI,
                                       guided_decoding_backend: str,
                                       sample_regex):
    """Every sampled completion must fully match the guiding regex.

    Three samples are requested with ``guided_regex`` set, once per
    parametrized guided-decoding backend.
    """
    guided_options = {
        "guided_regex": sample_regex,
        "guided_decoding_backend": guided_decoding_backend,
    }

    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt=f"Give an example IPv4 address with this regex: {sample_regex}",
        n=3,
        temperature=1.0,
        max_tokens=20,
        extra_body=guided_options)

    assert completion.id is not None
    assert len(completion.choices) == 3

    # fullmatch: the entire generated text has to be covered by the pattern.
    for choice in completion.choices:
        matched = re.fullmatch(sample_regex, choice.text)
        assert matched is not None
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
async def test_guided_choice_completion(client: openai.AsyncOpenAI,
                                        guided_decoding_backend: str,
                                        sample_guided_choice):
    """Every sampled completion must be one of the guided choices.

    Two samples are requested with ``guided_choice`` set, once per
    parametrized guided-decoding backend.
    """
    guided_options = {
        "guided_choice": sample_guided_choice,
        "guided_decoding_backend": guided_decoding_backend,
    }

    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt="The best language for type-safe systems programming is ",
        n=2,
        temperature=1.0,
        max_tokens=10,
        extra_body=guided_options)

    assert completion.id is not None
    assert len(completion.choices) == 2

    for choice in completion.choices:
        assert choice.text in sample_guided_choice
@pytest.mark.asyncio
async def test_guided_grammar(client: openai.AsyncOpenAI,
                              sample_sql_statements):
    """Grammar-guided output must parse with Lark and equal the expected SQL.

    The grammar fixture is passed as ``guided_grammar``; the returned text is
    parsed with the same grammar and then compared with a space-free SQL
    statement.
    """
    sql_prompt = ("Generate a sql state that select col_1 from "
                  "table_1 where it is equals to 1")
    completion = await client.completions.create(
        model=MODEL_NAME,
        prompt=sql_prompt,
        temperature=1.0,
        max_tokens=500,
        extra_body={"guided_grammar": sample_sql_statements})

    generated = completion.choices[0].text

    # use Lark to parse the output, and make sure it's a valid parse tree
    from lark import Lark
    Lark(sample_sql_statements).parse(generated)

    # remove spaces for comparison b/c we removed them in the grammar
    expected = "".join("SELECT col_1 from table_1 where col_1 = 1".split(" "))

    assert generated.strip() == expected
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # first test base model, then test loras
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
)
@pytest.mark.parametrize("logprobs_arg", [1, 0])
async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
                                       model_name: str, logprobs_arg: int):
    """Check echoed completions and the shape of their logprobs.

    For both a text prompt and a token-id prompt, a completion requested
    with ``echo=True`` must start with the prompt text, and the logprob
    lists must contain more than five entries. The first ``token_logprobs``
    and ``top_logprobs`` entries must be ``None``.

    Args:
        client: Async OpenAI client fixture.
        model_name: Base model or LoRA adapter name under test.
        logprobs_arg: Value passed as ``logprobs`` in the request.
    """
    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
    # test using text and token IDs
    for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]):
        completion = await client.completions.create(model=model_name,
                                                     prompt=prompt,
                                                     max_tokens=5,
                                                     temperature=0.0,
                                                     echo=True,
                                                     logprobs=logprobs_arg)
        choice = completion.choices[0]

        prompt_text = (tokenizer.decode(prompt)
                       if isinstance(prompt, list) else prompt)
        # Compare as a literal prefix: building a regex from the prompt would
        # let any metacharacter in the decoded text alter the match.
        assert choice.text.startswith(prompt_text)

        logprobs = choice.logprobs
        assert logprobs is not None
        assert len(logprobs.text_offset) > 5

        # Separate asserts so a failure shows which condition was violated.
        assert len(logprobs.token_logprobs) > 5
        assert logprobs.token_logprobs[0] is None
        assert len(logprobs.top_logprobs) > 5
        assert logprobs.top_logprobs[0] is None

        # Every later position carries at least one candidate (even when
        # logprobs_arg is 0) and at most logprobs_arg + 1.
        for top_logprobs in logprobs.top_logprobs[1:]:
            assert max(logprobs_arg,
                       1) <= len(top_logprobs) <= logprobs_arg + 1
        assert len(logprobs.tokens) > 5
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
                         ["outlines", "lm-format-enforcer"])
async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
                                          guided_decoding_backend: str,
                                          sample_json_schema, sample_regex):
    """Invalid guided-decoding parameters must be rejected as bad requests.

    Two requests are made, each expected to raise ``openai.BadRequestError``:
    one passing the integer 42 as ``guided_json``, and one passing both
    ``guided_regex`` and ``guided_json`` together.
    """
    # An integer is passed where a JSON schema is expected.
    wrong_type_options = {
        "guided_json": 42,
        "guided_decoding_backend": guided_decoding_backend,
    }
    with pytest.raises(openai.BadRequestError):
        await client.completions.create(
            model=MODEL_NAME,
            prompt="Give an example JSON that fits this schema: 42",
            extra_body=wrong_type_options)

    # Two guided-decoding modes are supplied in the same request.
    conflicting_options = {
        "guided_regex": sample_regex,
        "guided_json": sample_json_schema,
    }
    with pytest.raises(openai.BadRequestError):
        await client.completions.create(
            model=MODEL_NAME,
            prompt="Give an example string that fits this regex",
            extra_body=conflicting_options)
tests/entrypoints/openai/test_embedding.py
View file @
ad385667
...
...
@@ -3,6 +3,7 @@ import base64
import
numpy
as
np
import
openai
import
pytest
import
pytest_asyncio
from
...utils
import
RemoteOpenAIServer
...
...
@@ -24,10 +25,10 @@ def embedding_server():
yield
remote_server
@
pytest
.
mark
.
asyncio
@
pytest
.
fixture
(
scope
=
"module"
)
def
embedding_client
(
embedding_server
)
:
return
embedding_server
.
get_
async_client
()
@pytest_asyncio.fixture
async def embedding_client(embedding_server):
    """Yield an async client obtained from the embedding server fixture.

    The client is used as an async context manager, so it is exited once
    the consuming test has finished.
    """
    async with embedding_server.get_async_client() as api_client:
        yield api_client
@
pytest
.
mark
.
asyncio
...
...
@@ -128,9 +129,79 @@ async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI,
for
data
in
responses_base64
.
data
:
decoded_responses_base64_data
.
append
(
np
.
frombuffer
(
base64
.
b64decode
(
data
.
embedding
),
dtype
=
"float"
).
tolist
())
dtype
=
"float
32
"
).
tolist
())
assert
responses_float
.
data
[
0
].
embedding
==
decoded_responses_base64_data
[
0
]
assert
responses_float
.
data
[
1
].
embedding
==
decoded_responses_base64_data
[
1
]
# Default response is float32 decoded from base64 by OpenAI Client
responses_default
=
await
embedding_client
.
embeddings
.
create
(
input
=
input_texts
,
model
=
model_name
)
assert
responses_float
.
data
[
0
].
embedding
==
responses_default
.
data
[
0
].
embedding
assert
responses_float
.
data
[
1
].
embedding
==
responses_default
.
data
[
1
].
embedding
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [EMBEDDING_MODEL_NAME],
)
async def test_single_embedding_truncation(
        embedding_client: openai.AsyncOpenAI, model_name: str):
    """Check that ``truncate_prompt_tokens`` caps the counted prompt tokens.

    The same checks run for a text input and for a token-id input: one
    embedding of length 4096 is returned and the usage reports exactly 10
    prompt tokens, matching the requested truncation size.
    """
    text_input = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]
    token_input = [
        1, 24428, 289, 18341, 26165, 285, 19323, 283, 289, 26789, 3871, 28728,
        9901, 340, 2229, 385, 340, 315, 28741, 28804, 2
    ]

    # test single embedding, first from text and then from token ids
    for request_input in (text_input, token_input):
        embeddings = await embedding_client.embeddings.create(
            model=model_name,
            input=request_input,
            extra_body={"truncate_prompt_tokens": 10})

        assert embeddings.id is not None
        assert len(embeddings.data) == 1
        assert len(embeddings.data[0].embedding) == 4096

        usage = embeddings.usage
        assert usage.completion_tokens == 0
        assert usage.prompt_tokens == 10
        assert usage.total_tokens == 10
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [EMBEDDING_MODEL_NAME],
)
async def test_single_embedding_truncation_invalid(
        embedding_client: openai.AsyncOpenAI, model_name: str):
    """An oversized ``truncate_prompt_tokens`` must be rejected.

    The request asks for a truncation size of 8193 and is expected to raise
    ``openai.BadRequestError`` whose text carries the server's explanation.

    Args:
        embedding_client: Async OpenAI client fixture.
        model_name: Embedding model under test.
    """
    input_texts = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]

    # Capture the exception: assertions placed after the raising call inside
    # the ``with`` block would never execute, and no response object exists
    # once the call has raised.
    with pytest.raises(openai.BadRequestError) as exc_info:
        await embedding_client.embeddings.create(
            model=model_name,
            input=input_texts,
            extra_body={"truncate_prompt_tokens": 8193})

    expected_message = (
        "truncate_prompt_tokens value is greater than max_model_len. "
        "Please, select a smaller truncation size.")
    assert expected_message in str(exc_info.value)
tests/entrypoints/openai/test_encoder_decoder.py
0 → 100644
View file @
ad385667
import
openai
import
pytest
import
pytest_asyncio
from
...utils
import
RemoteOpenAIServer
# Model launched by the server fixture and requested by the tests.
MODEL_NAME = "facebook/bart-base"
@pytest.fixture(scope="module")
def server():
    """Start one ``RemoteOpenAIServer`` for MODEL_NAME per test module."""
    cli_args = ["--dtype", "bfloat16", "--enforce-eager"]

    with RemoteOpenAIServer(MODEL_NAME, cli_args) as running_server:
        yield running_server
@pytest_asyncio.fixture
async def client(server):
    """Yield an async client obtained from the server fixture.

    The client is used as an async context manager, so it is exited once
    the consuming test has finished.
    """
    async with server.get_async_client() as api_client:
        yield api_client
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
    """Request one greedy completion from text, then one from token ids.

    The text request must return a single choice that stopped on the length
    limit with the expected token usage; the token-id request must return
    non-empty text.
    """
    text_completion = await client.completions.create(
        model=model_name,
        prompt="Hello, my name is",
        max_tokens=5,
        temperature=0.0)

    assert text_completion.id is not None
    assert text_completion.choices is not None
    assert len(text_completion.choices) == 1

    first_choice = text_completion.choices[0]
    assert len(first_choice.text) >= 5
    assert first_choice.finish_reason == "length"

    expected_usage = openai.types.CompletionUsage(completion_tokens=5,
                                                  prompt_tokens=2,
                                                  total_tokens=7)
    assert text_completion.usage == expected_usage

    # test using token IDs
    token_completion = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    assert len(token_completion.choices[0].text) >= 1
tests/entrypoints/openai/test_lora_lineage.py
0 → 100644
View file @
ad385667
import
json
import
openai
# use the official client for correctness check
import
pytest
import
pytest_asyncio
# downloading lora to test lora requests
from
huggingface_hub
import
snapshot_download
from
...utils
import
RemoteOpenAIServer
# Base model for the server; any model with a chat template should work here.
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"

# LoRA adapter repository. Technically this needs Mistral-7B-v0.1 as base,
# but we're not testing generation quality here.
LORA_NAME = "typeof/zephyr-7b-beta-lora"
@pytest.fixture(scope="module")
def zephyr_lora_files():
    """Return the result of ``snapshot_download`` for the LORA_NAME repo."""
    lora_snapshot = snapshot_download(repo_id=LORA_NAME)
    return lora_snapshot
@pytest.fixture(scope="module")
def server_with_lora_modules_json(zephyr_lora_files):
    """Start a server whose LoRA modules are given as JSON strings.

    Two adapters, ``zephyr-lora`` and ``zephyr-lora2``, are registered with
    the same files and with MODEL_NAME as their base model name.
    """
    # Define the json format LoRA module configurations
    lora_module_specs = [{
        "name": adapter_name,
        "path": zephyr_lora_files,
        "base_model_name": MODEL_NAME
    } for adapter_name in ("zephyr-lora", "zephyr-lora2")]

    cli_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
        # lora config below
        "--enable-lora",
        "--lora-modules",
        *(json.dumps(spec) for spec in lora_module_specs),
        "--max-lora-rank",
        "64",
        "--max-cpu-loras",
        "2",
        "--max-num-seqs",
        "64",
    ]

    with RemoteOpenAIServer(MODEL_NAME, cli_args) as running_server:
        yield running_server
@pytest_asyncio.fixture
async def client_for_lora_lineage(server_with_lora_modules_json):
    """Yield an async client obtained from the JSON-configured LoRA server.

    The client is used as an async context manager, so it is exited once
    the consuming test has finished.
    """
    lora_server = server_with_lora_modules_json
    async with lora_server.get_async_client() as api_client:
        yield api_client
@pytest.mark.asyncio
async def test_check_lora_lineage(client_for_lora_lineage: openai.AsyncOpenAI,
                                  zephyr_lora_files):
    """Check ``root`` and ``parent`` of the models listed by the server.

    The first listed model is the base model, with no parent. Every later
    entry is a LoRA adapter whose root is the adapter files and whose parent
    is the base model.
    """
    listing = await client_for_lora_lineage.models.list()
    listed_models = listing.data

    base_model = listed_models[0]
    adapters = listed_models[1:]

    assert base_model.id == MODEL_NAME
    assert base_model.root == MODEL_NAME
    assert base_model.parent is None

    # All roots are checked before any parent.
    for adapter in adapters:
        assert adapter.root == zephyr_lora_files
    for adapter in adapters:
        assert adapter.parent == MODEL_NAME

    assert adapters[0].id == "zephyr-lora"
    assert adapters[1].id == "zephyr-lora2"
Prev
1
…
13
14
15
16
17
18
19
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment