Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
4d3a2c28
Commit
4d3a2c28
authored
Dec 30, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.6.5' into v0.6.5-dev
parents
92ec5d8e
2d1b9baa
Changes
430
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1288 additions
and
422 deletions
+1288
-422
tests/entrypoints/llm/test_chat.py
tests/entrypoints/llm/test_chat.py
+94
-0
tests/entrypoints/llm/test_encode.py
tests/entrypoints/llm/test_encode.py
+5
-40
tests/entrypoints/llm/test_generate.py
tests/entrypoints/llm/test_generate.py
+2
-128
tests/entrypoints/llm/test_generate_multiple_loras.py
tests/entrypoints/llm/test_generate_multiple_loras.py
+2
-3
tests/entrypoints/llm/test_gpu_utilization.py
tests/entrypoints/llm/test_gpu_utilization.py
+27
-0
tests/entrypoints/llm/test_guided_generate.py
tests/entrypoints/llm/test_guided_generate.py
+128
-25
tests/entrypoints/llm/test_init.py
tests/entrypoints/llm/test_init.py
+23
-0
tests/entrypoints/llm/test_lazy_outlines.py
tests/entrypoints/llm/test_lazy_outlines.py
+34
-6
tests/entrypoints/llm/test_prompt_validation.py
tests/entrypoints/llm/test_prompt_validation.py
+16
-1
tests/entrypoints/offline_mode/test_offline_mode.py
tests/entrypoints/offline_mode/test_offline_mode.py
+33
-27
tests/entrypoints/openai/test_accuracy.py
tests/entrypoints/openai/test_accuracy.py
+37
-5
tests/entrypoints/openai/test_async_tokenization.py
tests/entrypoints/openai/test_async_tokenization.py
+139
-0
tests/entrypoints/openai/test_audio.py
tests/entrypoints/openai/test_audio.py
+142
-20
tests/entrypoints/openai/test_basic.py
tests/entrypoints/openai/test_basic.py
+111
-9
tests/entrypoints/openai/test_chat.py
tests/entrypoints/openai/test_chat.py
+111
-71
tests/entrypoints/openai/test_chat_echo.py
tests/entrypoints/openai/test_chat_echo.py
+80
-0
tests/entrypoints/openai/test_chat_template.py
tests/entrypoints/openai/test_chat_template.py
+23
-7
tests/entrypoints/openai/test_chunked_prompt.py
tests/entrypoints/openai/test_chunked_prompt.py
+127
-0
tests/entrypoints/openai/test_cli_args.py
tests/entrypoints/openai/test_cli_args.py
+109
-69
tests/entrypoints/openai/test_completion.py
tests/entrypoints/openai/test_completion.py
+45
-11
No files found.
Too many changes to show.
To preserve performance only
430 of 430+
files are displayed.
Plain diff
Email patch
tests/entrypoints/llm/test_chat.py
0 → 100644
View file @
4d3a2c28
from
typing
import
List
import
os
import
pytest
from
vllm
import
LLM
from
..openai.test_vision
import
TEST_IMAGE_URLS
from
...utils
import
models_path_prefix
def
test_chat
():
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
))
prompt1
=
"Explain the concept of entropy."
messages
=
[
{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
prompt1
},
]
outputs
=
llm
.
chat
(
messages
)
assert
len
(
outputs
)
==
1
def
test_multi_chat
():
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
))
prompt1
=
"Explain the concept of entropy."
prompt2
=
"Explain what among us is."
conversation1
=
[
{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
prompt1
},
]
conversation2
=
[
{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
prompt2
},
]
messages
=
[
conversation1
,
conversation2
]
outputs
=
llm
.
chat
(
messages
)
assert
len
(
outputs
)
==
2
@
pytest
.
mark
.
parametrize
(
"image_urls"
,
[[
TEST_IMAGE_URLS
[
0
],
TEST_IMAGE_URLS
[
1
]]])
def
test_chat_multi_image
(
image_urls
:
List
[
str
]):
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3.5-vision-instruct"
),
dtype
=
"bfloat16"
,
max_model_len
=
4096
,
max_num_seqs
=
5
,
enforce_eager
=
True
,
trust_remote_code
=
True
,
limit_mm_per_prompt
=
{
"image"
:
2
},
)
messages
=
[{
"role"
:
"user"
,
"content"
:
[
*
({
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
image_url
}
}
for
image_url
in
image_urls
),
{
"type"
:
"text"
,
"text"
:
"What's in this image?"
},
],
}]
outputs
=
llm
.
chat
(
messages
)
assert
len
(
outputs
)
>=
0
tests/entrypoints/llm/test_encode.py
View file @
4d3a2c28
...
...
@@ -4,9 +4,8 @@ from typing import List
import
pytest
import
os
from
vllm
import
LLM
,
EmbeddingRequestOutput
,
PoolingParams
from
...conftest
import
cleanup
from
vllm
import
LLM
,
PoolingParams
,
PoolingRequestOutput
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
...utils
import
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"intfloat/e5-mistral-7b-instruct"
)
...
...
@@ -43,29 +42,14 @@ def llm():
del
llm
cleanup
()
cleanup
_dist_env_and_memory
()
def
assert_outputs_equal
(
o1
:
List
[
Embedd
ingRequestOutput
],
o2
:
List
[
Embedd
ingRequestOutput
]):
def
assert_outputs_equal
(
o1
:
List
[
Pool
ingRequestOutput
],
o2
:
List
[
Pool
ingRequestOutput
]):
assert
[
o
.
outputs
for
o
in
o1
]
==
[
o
.
outputs
for
o
in
o2
]
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
'prompt'
,
PROMPTS
)
def
test_v1_v2_api_consistency_single_prompt_string
(
llm
:
LLM
,
prompt
):
pooling_params
=
PoolingParams
()
with
pytest
.
warns
(
DeprecationWarning
,
match
=
"'prompts'"
):
v1_output
=
llm
.
encode
(
prompts
=
prompt
,
pooling_params
=
pooling_params
)
v2_output
=
llm
.
encode
(
prompt
,
pooling_params
=
pooling_params
)
assert_outputs_equal
(
v1_output
,
v2_output
)
v2_output
=
llm
.
encode
({
"prompt"
:
prompt
},
pooling_params
=
pooling_params
)
assert_outputs_equal
(
v1_output
,
v2_output
)
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
'prompt_token_ids'
,
TOKEN_IDS
)
def
test_v1_v2_api_consistency_single_prompt_tokens
(
llm
:
LLM
,
...
...
@@ -81,25 +65,6 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
assert_outputs_equal
(
v1_output
,
v2_output
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_v1_v2_api_consistency_multi_prompt_string
(
llm
:
LLM
):
pooling_params
=
PoolingParams
()
with
pytest
.
warns
(
DeprecationWarning
,
match
=
"'prompts'"
):
v1_output
=
llm
.
encode
(
prompts
=
PROMPTS
,
pooling_params
=
pooling_params
)
v2_output
=
llm
.
encode
(
PROMPTS
,
pooling_params
=
pooling_params
)
assert_outputs_equal
(
v1_output
,
v2_output
)
v2_output
=
llm
.
encode
(
[{
"prompt"
:
p
}
for
p
in
PROMPTS
],
pooling_params
=
pooling_params
,
)
assert_outputs_equal
(
v1_output
,
v2_output
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_v1_v2_api_consistency_multi_prompt_tokens
(
llm
:
LLM
):
pooling_params
=
PoolingParams
()
...
...
tests/entrypoints/llm/test_generate.py
View file @
4d3a2c28
...
...
@@ -5,9 +5,7 @@ import os
import
pytest
from
vllm
import
LLM
,
RequestOutput
,
SamplingParams
from
...conftest
import
cleanup
from
..openai.test_vision
import
TEST_IMAGE_URLS
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
...utils
import
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
...
...
@@ -42,30 +40,13 @@ def llm():
del
llm
cleanup
()
cleanup
_dist_env_and_memory
()
def
assert_outputs_equal
(
o1
:
List
[
RequestOutput
],
o2
:
List
[
RequestOutput
]):
assert
[
o
.
outputs
for
o
in
o1
]
==
[
o
.
outputs
for
o
in
o2
]
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
'prompt'
,
PROMPTS
)
def
test_v1_v2_api_consistency_single_prompt_string
(
llm
:
LLM
,
prompt
):
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
top_p
=
1.0
)
with
pytest
.
warns
(
DeprecationWarning
,
match
=
"'prompts'"
):
v1_output
=
llm
.
generate
(
prompts
=
prompt
,
sampling_params
=
sampling_params
)
v2_output
=
llm
.
generate
(
prompt
,
sampling_params
=
sampling_params
)
assert_outputs_equal
(
v1_output
,
v2_output
)
v2_output
=
llm
.
generate
({
"prompt"
:
prompt
},
sampling_params
=
sampling_params
)
assert_outputs_equal
(
v1_output
,
v2_output
)
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
'prompt_token_ids'
,
TOKEN_IDS
)
def
test_v1_v2_api_consistency_single_prompt_tokens
(
llm
:
LLM
,
...
...
@@ -81,26 +62,6 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
assert_outputs_equal
(
v1_output
,
v2_output
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_v1_v2_api_consistency_multi_prompt_string
(
llm
:
LLM
):
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
top_p
=
1.0
)
with
pytest
.
warns
(
DeprecationWarning
,
match
=
"'prompts'"
):
v1_output
=
llm
.
generate
(
prompts
=
PROMPTS
,
sampling_params
=
sampling_params
)
v2_output
=
llm
.
generate
(
PROMPTS
,
sampling_params
=
sampling_params
)
assert_outputs_equal
(
v1_output
,
v2_output
)
v2_output
=
llm
.
generate
(
[{
"prompt"
:
p
}
for
p
in
PROMPTS
],
sampling_params
=
sampling_params
,
)
assert_outputs_equal
(
v1_output
,
v2_output
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_v1_v2_api_consistency_multi_prompt_tokens
(
llm
:
LLM
):
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
top_p
=
1.0
)
...
...
@@ -143,90 +104,3 @@ def test_multiple_sampling_params(llm: LLM):
# sampling_params is None, default params should be applied
outputs
=
llm
.
generate
(
PROMPTS
,
sampling_params
=
None
)
assert
len
(
PROMPTS
)
==
len
(
outputs
)
def
test_chat
():
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Meta-Llama-3-8B-Instruct"
))
prompt1
=
"Explain the concept of entropy."
messages
=
[
{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
prompt1
},
]
outputs
=
llm
.
chat
(
messages
)
assert
len
(
outputs
)
==
1
def
test_multi_chat
():
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Meta-Llama-3-8B-Instruct"
))
prompt1
=
"Explain the concept of entropy."
prompt2
=
"Explain what among us is."
conversation1
=
[
{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
prompt1
},
]
conversation2
=
[
{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
prompt2
},
]
messages
=
[
conversation1
,
conversation2
]
outputs
=
llm
.
chat
(
messages
)
assert
len
(
outputs
)
==
2
@
pytest
.
mark
.
parametrize
(
"image_urls"
,
[[
TEST_IMAGE_URLS
[
0
],
TEST_IMAGE_URLS
[
1
]]])
def
test_chat_multi_image
(
image_urls
:
List
[
str
]):
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3.5-vision-instruct"
),
dtype
=
"bfloat16"
,
max_model_len
=
4096
,
max_num_seqs
=
5
,
enforce_eager
=
True
,
trust_remote_code
=
True
,
limit_mm_per_prompt
=
{
"image"
:
2
},
)
messages
=
[{
"role"
:
"user"
,
"content"
:
[
*
({
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
image_url
}
}
for
image_url
in
image_urls
),
{
"type"
:
"text"
,
"text"
:
"What's in this image?"
},
],
}]
outputs
=
llm
.
chat
(
messages
)
assert
len
(
outputs
)
>=
0
tests/entrypoints/llm/test_generate_multiple_loras.py
View file @
4d3a2c28
...
...
@@ -6,9 +6,8 @@ import os
from
huggingface_hub
import
snapshot_download
from
vllm
import
LLM
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.lora.request
import
LoRARequest
from
...conftest
import
cleanup
from
...utils
import
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"HuggingFaceH4/zephyr-7b-beta"
)
...
...
@@ -41,7 +40,7 @@ def llm():
del
llm
cleanup
()
cleanup
_dist_env_and_memory
()
@
pytest
.
fixture
(
scope
=
"module"
)
...
...
tests/entrypoints/llm/test_gpu_utilization.py
0 → 100644
View file @
4d3a2c28
import
os
from
vllm
import
LLM
,
SamplingParams
from
...utils
import
models_path_prefix
def
test_gpu_memory_utilization
():
prompts
=
[
"Hello, my name is"
,
"The president of the United States is"
,
"The capital of France is"
,
"The future of AI is"
,
]
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
)
# makes sure gpu_memory_utilization is per-instance limit,
# not a global limit
llms
=
[
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
),
gpu_memory_utilization
=
0.3
,
enforce_eager
=
True
)
for
i
in
range
(
3
)
]
for
llm
in
llms
:
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
for
output
in
outputs
:
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
tests/entrypoints/llm/test_guided_generate.py
View file @
4d3a2c28
...
...
@@ -6,11 +6,11 @@ import jsonschema
import
pytest
import
os
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.entrypoints.llm
import
LLM
from
vllm.outputs
import
RequestOutput
from
vllm.sampling_params
import
SamplingParams
from
...conftest
import
cleanup
from
vllm.sampling_params
import
GuidedDecodingParams
,
SamplingParams
from
...utils
import
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"HuggingFaceH4/zephyr-7b-beta"
)
...
...
@@ -25,7 +25,7 @@ def llm():
with
llm
.
deprecate_legacy_api
():
yield
weakref
.
proxy
(
llm
)
del
llm
cleanup
()
cleanup
_dist_env_and_memory
()
@
pytest
.
mark
.
skip_global_cleanup
...
...
@@ -33,14 +33,12 @@ def test_guided_regex(sample_regex, llm):
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
,
)
outputs
=
llm
.
generate
(
prompts
=
[
f
"Give an example IPv4 address with this regex:
{
sample_regex
}
"
]
*
2
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
,
guided_options_request
=
dict
(
guided_regex
=
sample_regex
))
guided_decoding
=
GuidedDecodingParams
(
regex
=
sample_regex
))
outputs
=
llm
.
generate
(
prompts
=
[
f
"Give an example IPv4 address with this regex:
{
sample_regex
}
"
]
*
2
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
)
assert
outputs
is
not
None
for
output
in
outputs
:
...
...
@@ -59,15 +57,13 @@ def test_guided_json_completion(sample_json_schema, llm):
sampling_params
=
SamplingParams
(
temperature
=
1.0
,
max_tokens
=
1000
,
)
outputs
=
llm
.
generate
(
prompts
=
[
f
"Give an example JSON for an employee profile "
f
"that fits this schema:
{
sample_json_schema
}
"
]
*
2
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
,
guided_options_request
=
dict
(
guided_json
=
sample_json_schema
))
guided_decoding
=
GuidedDecodingParams
(
json
=
sample_json_schema
))
outputs
=
llm
.
generate
(
prompts
=
[
f
"Give an example JSON for an employee profile "
f
"that fits this schema:
{
sample_json_schema
}
"
]
*
2
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
)
assert
outputs
is
not
None
...
...
@@ -83,17 +79,72 @@ def test_guided_json_completion(sample_json_schema, llm):
jsonschema
.
validate
(
instance
=
output_json
,
schema
=
sample_json_schema
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_guided_complex_json_completion
(
sample_complex_json_schema
,
llm
):
sampling_params
=
SamplingParams
(
temperature
=
1.0
,
max_tokens
=
1000
,
guided_decoding
=
GuidedDecodingParams
(
json
=
sample_complex_json_schema
))
outputs
=
llm
.
generate
(
prompts
=
[
f
"Give an example JSON for an assignment grade "
f
"that fits this schema:
{
sample_complex_json_schema
}
"
]
*
2
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
)
assert
outputs
is
not
None
for
output
in
outputs
:
assert
output
is
not
None
assert
isinstance
(
output
,
RequestOutput
)
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
assert
generated_text
is
not
None
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
output_json
=
json
.
loads
(
generated_text
)
jsonschema
.
validate
(
instance
=
output_json
,
schema
=
sample_complex_json_schema
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_guided_definition_json_completion
(
sample_definition_json_schema
,
llm
):
sampling_params
=
SamplingParams
(
temperature
=
1.0
,
max_tokens
=
1000
,
guided_decoding
=
GuidedDecodingParams
(
json
=
sample_definition_json_schema
))
outputs
=
llm
.
generate
(
prompts
=
[
f
"Give an example JSON for solving 8x + 7 = -23 "
f
"that fits this schema:
{
sample_definition_json_schema
}
"
]
*
2
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
)
assert
outputs
is
not
None
for
output
in
outputs
:
assert
output
is
not
None
assert
isinstance
(
output
,
RequestOutput
)
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
assert
generated_text
is
not
None
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
output_json
=
json
.
loads
(
generated_text
)
jsonschema
.
validate
(
instance
=
output_json
,
schema
=
sample_definition_json_schema
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_guided_choice_completion
(
sample_guided_choice
,
llm
):
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
,
)
guided_decoding
=
GuidedDecodingParams
(
choice
=
sample_guided_choice
)
)
outputs
=
llm
.
generate
(
prompts
=
"The best language for type-safe systems programming is "
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
,
guided_options_request
=
dict
(
guided_choice
=
sample_guided_choice
))
use_tqdm
=
True
)
assert
outputs
is
not
None
for
output
in
outputs
:
...
...
@@ -114,13 +165,13 @@ def test_guided_grammar(sample_sql_statements, llm):
temperature
=
0.8
,
top_p
=
0.95
,
max_tokens
=
1000
,
)
guided_decoding
=
GuidedDecodingParams
(
grammar
=
sample_sql_statements
)
)
outputs
=
llm
.
generate
(
prompts
=
(
"Generate a sql state that select col_1 from "
"table_1 where it is equals to 1"
),
sampling_params
=
sampling_params
,
use_tqdm
=
True
,
guided_options_request
=
dict
(
guided_grammar
=
sample_sql_statements
)
)
)
assert
outputs
is
not
None
for
output
in
outputs
:
...
...
@@ -142,3 +193,55 @@ def test_guided_grammar(sample_sql_statements, llm):
assert
generated_text
.
strip
()
==
ground_truth
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_guided_options_request_deprecation_warning
(
sample_regex
,
llm
):
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
)
with
pytest
.
warns
(
DeprecationWarning
,
match
=
"guided_options_request"
):
llm
.
generate
(
prompts
=
"This should fail"
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
,
guided_options_request
=
dict
(
guided_regex
=
sample_regex
))
@
pytest
.
mark
.
skip_global_cleanup
def
test_validation_against_both_guided_decoding_options
(
sample_regex
,
llm
):
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
,
guided_decoding
=
GuidedDecodingParams
(
regex
=
sample_regex
))
with
pytest
.
raises
(
ValueError
,
match
=
"Cannot set both"
):
llm
.
generate
(
prompts
=
"This should fail"
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
,
guided_options_request
=
dict
(
guided_regex
=
sample_regex
))
@
pytest
.
mark
.
skip_global_cleanup
def
test_guided_json_object
(
llm
):
sampling_params
=
SamplingParams
(
temperature
=
1.0
,
max_tokens
=
100
,
guided_decoding
=
GuidedDecodingParams
(
json_object
=
True
))
outputs
=
llm
.
generate
(
prompts
=
(
"Generate a JSON object describing a person with name "
"and age for John Smith who is 31 years old."
),
sampling_params
=
sampling_params
,
use_tqdm
=
True
)
assert
outputs
is
not
None
for
output
in
outputs
:
assert
output
is
not
None
assert
isinstance
(
output
,
RequestOutput
)
generated_text
=
output
.
outputs
[
0
].
text
print
(
generated_text
)
assert
generated_text
is
not
None
# Parse to verify it is valid JSON
parsed_json
=
json
.
loads
(
generated_text
)
assert
isinstance
(
parsed_json
,
dict
)
tests/entrypoints/llm/test_init.py
0 → 100644
View file @
4d3a2c28
import
os
import
pytest
from
vllm
import
LLM
from
...utils
import
error_on_warning
,
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
def
test_pos_args_deprecated
():
with
error_on_warning
(
DeprecationWarning
):
LLM
(
model
=
MODEL_NAME
,
tokenizer
=
MODEL_NAME
)
with
error_on_warning
(
DeprecationWarning
):
LLM
(
MODEL_NAME
,
tokenizer
=
MODEL_NAME
)
with
pytest
.
warns
(
DeprecationWarning
,
match
=
"'tokenizer'"
):
LLM
(
MODEL_NAME
,
MODEL_NAME
)
with
pytest
.
warns
(
DeprecationWarning
,
match
=
"'tokenizer', 'tokenizer_mode'"
):
LLM
(
MODEL_NAME
,
MODEL_NAME
,
"auto"
)
tests/entrypoints/llm/test_lazy_outlines.py
View file @
4d3a2c28
import
sys
import
os
from
contextlib
import
nullcontext
from
vllm_test_utils
import
BlameResult
,
blame
from
vllm
import
LLM
,
SamplingParams
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
...utils
import
models_path_prefix
def
test_lazy_outlines
(
sample_regex
):
"""If users don't use guided decoding, outlines should not be imported.
"""
def
run_normal
():
prompts
=
[
"Hello, my name is"
,
"The president of the United States is"
,
...
...
@@ -16,6 +18,7 @@ def test_lazy_outlines(sample_regex):
]
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
)
# Create an LLM without guided decoding as a baseline.
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
),
enforce_eager
=
True
,
gpu_memory_utilization
=
0.3
)
...
...
@@ -25,9 +28,13 @@ def test_lazy_outlines(sample_regex):
generated_text
=
output
.
outputs
[
0
].
text
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
# make sure outlines is not imported
assert
'outlines'
not
in
sys
.
modules
# Destroy the LLM object and free up the GPU memory.
del
llm
cleanup_dist_env_and_memory
()
def
run_lmfe
(
sample_regex
):
# Create an LLM with guided decoding enabled.
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
),
enforce_eager
=
True
,
guided_decoding_backend
=
"lm-format-enforcer"
,
...
...
@@ -46,5 +53,26 @@ def test_lazy_outlines(sample_regex):
generated_text
=
output
.
outputs
[
0
].
text
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
def
test_lazy_outlines
(
sample_regex
):
"""If users don't use guided decoding, outlines should not be imported.
"""
# make sure outlines is not imported
assert
'outlines'
not
in
sys
.
modules
module_name
=
"outlines"
# In CI, we only check finally if the module is imported.
# If it is indeed imported, we can rerun the test with `use_blame=True`,
# which will trace every function call to find the first import location,
# and help find the root cause.
# We don't run it in CI by default because it is slow.
use_blame
=
False
context
=
blame
(
lambda
:
module_name
in
sys
.
modules
)
if
use_blame
else
nullcontext
()
with
context
as
result
:
run_normal
()
run_lmfe
(
sample_regex
)
if
use_blame
:
assert
isinstance
(
result
,
BlameResult
)
print
(
f
"the first import location is:
\n
{
result
.
trace_stack
}
"
)
assert
module_name
not
in
sys
.
modules
,
(
f
"Module
{
module_name
}
is imported. To see the first"
f
" import location, run the test with `use_blame=True`."
)
tests/entrypoints/llm/test_prompt_validation.py
View file @
4d3a2c28
...
...
@@ -5,7 +5,22 @@ from vllm import LLM
from
...utils
import
models_path_prefix
@
pytest
.
fixture
(
autouse
=
True
)
def
v1
(
run_with_both_engines
):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
def
test_empty_prompt
():
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"gpt2"
))
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"gpt2"
)
,
enforce_eager
=
True
)
with
pytest
.
raises
(
ValueError
,
match
=
'Prompt cannot be empty'
):
llm
.
generate
([
""
])
@
pytest
.
mark
.
skip_v1
def
test_out_of_vocab_token
():
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"gpt2"
),
enforce_eager
=
True
)
with
pytest
.
raises
(
ValueError
,
match
=
'out of vocabulary'
):
llm
.
generate
({
"prompt_token_ids"
:
[
999999
]})
tests/entrypoints/offline_mode/test_offline_mode.py
View file @
4d3a2c28
"""Tests for HF_HUB_OFFLINE mode"""
import
importlib
import
sys
import
weakref
import
os
import
pytest
from
vllm
import
LLM
from
...conftest
import
cleanup
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
...utils
import
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
llm
():
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm
=
LLM
(
model
=
MODEL_NAME
,
max_num_batched_tokens
=
4096
,
tensor_parallel_size
=
1
,
gpu_memory_utilization
=
0.10
,
enforce_eager
=
True
)
MODEL_CONFIGS
=
[
{
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
),
"enforce_eager"
:
True
,
"gpu_memory_utilization"
:
0.20
,
"max_model_len"
:
64
,
"max_num_batched_tokens"
:
64
,
"max_num_seqs"
:
64
,
"tensor_parallel_size"
:
1
,
},
{
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"mistralai/Mistral-7B-Instruct-v0.1"
),
"enforce_eager"
:
True
,
"gpu_memory_utilization"
:
0.95
,
"max_model_len"
:
64
,
"max_num_batched_tokens"
:
64
,
"max_num_seqs"
:
64
,
"tensor_parallel_size"
:
1
,
"tokenizer_mode"
:
"mistral"
,
},
]
with
llm
.
deprecate_legacy_api
():
yield
weakref
.
proxy
(
llm
)
del
llm
@
pytest
.
fixture
(
scope
=
"module"
)
def
cache_models
():
# Cache model files first
for
model_config
in
MODEL_CONFIGS
:
LLM
(
**
model_config
)
cleanup_dist_env_and_memory
()
cleanup
()
yield
@
pytest
.
mark
.
skip_global_cleanup
def
test_offline_mode
(
llm
:
LLM
,
monkeypatch
):
# we use the llm fixture to ensure the model files are in-cache
del
llm
@
pytest
.
mark
.
usefixtures
(
"cache_models"
)
def
test_offline_mode
(
monkeypatch
):
# Set HF to offline mode and ensure we can still construct an LLM
try
:
monkeypatch
.
setenv
(
"HF_HUB_OFFLINE"
,
"1"
)
# Need to re-import huggingface_hub and friends to setup offline mode
_re_import_modules
()
# Cached model files should be used in offline mode
LLM
(
model
=
MODEL_NAME
,
max_num_batched_tokens
=
4096
,
tensor_parallel_size
=
1
,
gpu_memory_utilization
=
0.10
,
enforce_eager
=
True
)
for
model_config
in
MODEL_CONFIGS
:
LLM
(
**
model_config
)
finally
:
# Reset the environment after the test
# NB: Assuming tests are run in online mode
...
...
tests/entrypoints/openai/test_accuracy.py
View file @
4d3a2c28
...
...
@@ -11,6 +11,8 @@ import lm_eval
import
pytest
import
os
from
vllm.platforms
import
current_platform
from
...utils
import
RemoteOpenAIServer
,
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-1.5B-Instruct"
)
...
...
@@ -19,22 +21,33 @@ TASK = "gsm8k"
FILTER
=
"exact_match,strict-match"
RTOL
=
0.03
EXPECTED_VALUE
=
0.58
DEFAULT_ARGS
=
[
"--max-model-len"
,
"
4096
"
,
"--disable-log-requests"
]
DEFAULT_ARGS
=
[
"--max-model-len"
,
"
2048
"
,
"--disable-log-requests"
]
MORE_ARGS_LIST
=
[
[],
# Default
[
"--enable-chunked-prefill"
],
# Chunked
[
"--num-scheduler-steps"
,
"8"
],
# MS
[
"--num-scheduler-steps"
,
"8"
,
"--multi-step-stream-outputs"
]
# MS+Stream
]
MAX_WAIT_SECONDS
=
None
if
current_platform
.
is_tpu
():
MORE_ARGS_LIST
=
[
[],
# Default
# ["--num-scheduler-steps", "8"], # Multi-step << currently fails
]
MAX_WAIT_SECONDS
=
600
def
run_test
(
more_args
):
"""Run the end to end accuracy test."""
@
pytest
.
mark
.
parametrize
(
"more_args"
,
MORE_ARGS_LIST
)
def
test_lm_eval_accuracy
(
more_args
):
args
=
list
(
DEFAULT_ARGS
)
args
.
extend
(
more_args
)
print
(
f
"Running with:
{
args
}
"
)
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
,
max_wait_seconds
=
MAX_WAIT_SECONDS
)
as
remote_server
:
url
=
f
"
{
remote_server
.
url_for
(
'v1'
)
}
/completions"
model_args
=
(
...
...
@@ -52,3 +65,22 @@ def test_lm_eval_accuracy(more_args):
assert
(
measured_value
-
RTOL
<
EXPECTED_VALUE
and
measured_value
+
RTOL
>
EXPECTED_VALUE
),
f
"Expected:
{
EXPECTED_VALUE
}
| Measured:
{
measured_value
}
"
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_cuda
(),
reason
=
"V1 currently only supported on CUDA"
)
def
test_lm_eval_accuracy_v1_engine
(
monkeypatch
):
"""Run with the V1 Engine."""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
run_test
([])
@
pytest
.
mark
.
parametrize
(
"more_args"
,
MORE_ARGS_LIST
)
def
test_lm_eval_accuracy_v0_engine
(
monkeypatch
,
more_args
):
"""Run with the V0 Engine."""
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"0"
)
run_test
(
more_args
)
tests/entrypoints/openai/test_async_tokenization.py
0 → 100644
View file @
4d3a2c28
import
asyncio
import
contextlib
import
random
import
time
from
typing
import
Callable
import
os
import
openai
import
pytest
import
pytest_asyncio
import
requests
from
tests.utils
import
RemoteOpenAIServer
from
...utils
import
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-1.5B-Instruct"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
# noqa: F811
args
=
[
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"bfloat16"
,
"--max-model-len"
,
"8192"
,
"--enforce-eager"
,
"--max-num-seqs"
,
"128"
,
"--load-format"
,
"dummy"
,
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
@
pytest_asyncio
.
fixture
async
def
client
(
server
):
async
with
server
.
get_async_client
()
as
async_client
:
yield
async_client
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
ids
=
[
"completion"
,
"chat"
],
argnames
=
[
"create_func_gen"
,
"content_body"
],
argvalues
=
[
(
lambda
x
:
x
.
completions
.
create
,
{
"prompt"
:
" "
.
join
([
'A'
]
*
10_000
)
}),
(
lambda
x
:
x
.
chat
.
completions
.
create
,
{
"messages"
:
[{
"role"
:
"user"
,
"content"
:
" "
.
join
([
'A'
]
*
10_000
)
}]
}),
],
)
async
def
test_with_and_without_truncate
(
server
:
RemoteOpenAIServer
,
client
:
openai
.
AsyncOpenAI
,
create_func_gen
:
Callable
,
content_body
:
dict
,
):
create_func
=
create_func_gen
(
client
)
body
=
{
"model"
:
MODEL_NAME
,
**
content_body
,
"max_tokens"
:
10
}
num_requests
=
10
truncate_prompt_tokens
=
([
1000
]
*
(
num_requests
//
2
)
+
[
None
]
*
(
num_requests
-
num_requests
//
2
))
random
.
shuffle
(
truncate_prompt_tokens
)
bodies
=
[{
**
body
,
"extra_body"
:
{
'truncate_prompt_tokens'
:
t
}
}
for
t
in
truncate_prompt_tokens
]
async
def
get_status_code
(
**
kwargs
):
try
:
await
create_func
(
**
kwargs
)
return
200
except
openai
.
APIStatusError
as
e
:
return
e
.
status_code
responses
=
await
asyncio
.
gather
(
*
[
get_status_code
(
**
b
)
for
b
in
bodies
])
assert
500
not
in
responses
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
ids
=
[
"single completion"
,
"multiple completions"
,
"chat"
],
argnames
=
[
"create_func_gen"
,
"content_body"
],
argvalues
=
[
(
lambda
x
:
x
.
completions
.
create
,
{
"prompt"
:
" "
.
join
([
'A'
]
*
300_000
)
}),
(
lambda
x
:
x
.
completions
.
create
,
{
"prompt"
:
[
" "
.
join
([
'A'
]
*
300_000
)]
*
2
}),
(
lambda
x
:
x
.
chat
.
completions
.
create
,
{
"messages"
:
[{
"role"
:
"user"
,
"content"
:
" "
.
join
([
'A'
]
*
300_000
)
}]
}),
],
)
async
def
test_healthcheck_response_time
(
server
:
RemoteOpenAIServer
,
client
:
openai
.
AsyncOpenAI
,
create_func_gen
:
Callable
,
content_body
:
dict
,
):
num_requests
=
50
create_func
=
create_func_gen
(
client
)
body
=
{
"model"
:
MODEL_NAME
,
**
content_body
,
"max_tokens"
:
10
}
def
get_response_time
(
url
):
start_time
=
time
.
monotonic
()
res
=
requests
.
get
(
url
)
end_time
=
time
.
monotonic
()
assert
res
.
status_code
==
200
return
end_time
-
start_time
no_load_response_time
=
get_response_time
(
server
.
url_for
(
"health"
))
tasks
=
[
asyncio
.
create_task
(
create_func
(
**
body
))
for
_
in
range
(
num_requests
)
]
await
asyncio
.
sleep
(
1
)
# give the tasks a chance to start running
load_response_time
=
get_response_time
(
server
.
url_for
(
"health"
))
with
contextlib
.
suppress
(
openai
.
APIStatusError
):
await
asyncio
.
gather
(
*
tasks
)
assert
load_response_time
<
100
*
no_load_response_time
assert
load_response_time
<
0.1
tests/entrypoints/openai/test_audio.py
View file @
4d3a2c28
...
...
@@ -24,8 +24,11 @@ def server():
"--dtype"
,
"bfloat16"
,
"--max-model-len"
,
"4096"
,
"2048"
,
"--max-num-seqs"
,
"5"
,
"--enforce-eager"
,
"--trust-remote-code"
,
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
...
...
@@ -69,11 +72,12 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
}]
# test single completion
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
logprobs
=
True
,
top_logprobs
=
5
)
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_completion_tokens
=
10
,
logprobs
=
True
,
top_logprobs
=
5
)
assert
len
(
chat_completion
.
choices
)
==
1
choice
=
chat_completion
.
choices
[
0
]
...
...
@@ -92,7 +96,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
max_
completion_
tokens
=
10
,
)
message
=
chat_completion
.
choices
[
0
].
message
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
0
...
...
@@ -124,11 +128,12 @@ async def test_single_chat_session_audio_base64encoded(
}]
# test single completion
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
logprobs
=
True
,
top_logprobs
=
5
)
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_completion_tokens
=
10
,
logprobs
=
True
,
top_logprobs
=
5
)
assert
len
(
chat_completion
.
choices
)
==
1
choice
=
chat_completion
.
choices
[
0
]
...
...
@@ -147,7 +152,62 @@ async def test_single_chat_session_audio_base64encoded(
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
max_completion_tokens
=
10
,
)
message
=
chat_completion
.
choices
[
0
].
message
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
async def test_single_chat_session_input_audio(
        client: openai.AsyncOpenAI, model_name: str, audio_url: str,
        base64_encoded_audio: Dict[str, str]):
    """Run a two-turn chat whose first user message carries ``input_audio``.

    Args:
        client: OpenAI async client used to send the requests.
        model_name: Model to query.
        audio_url: Key into ``base64_encoded_audio`` selecting the clip.
        base64_encoded_audio: Mapping from audio URL to the data sent in the
            ``input_audio`` content part.
    """
    messages = [{
        "role":
        "user",
        "content": [
            {
                "type": "input_audio",
                "input_audio": {
                    "data": base64_encoded_audio[audio_url],
                    "format": "wav"
                }
            },
            {
                "type": "text",
                "text": "What's happening in this audio?"
            },
        ],
    }]

    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
        top_logprobs=5)
    assert len(chat_completion.choices) == 1

    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=202, total_tokens=212)

    # The original assigned `message` twice in a row (from `choice` and then
    # from `chat_completion.choices[0]`); one lookup is enough.
    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})

    # test multi-turn dialogue
    messages.append({
        "role": "user",
        "content": "express your result in json"
    })
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
    )
    message = chat_completion.choices[0].message
    # NOTE: `len(...) >= 0` is always true, so this only checks that the
    # second turn returned non-None content.
    assert message.content is not None and len(message.content) >= 0
...
...
@@ -179,7 +239,7 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
max_
completion_
tokens
=
10
,
temperature
=
0.0
,
)
output
=
chat_completion
.
choices
[
0
].
message
.
content
...
...
@@ -189,7 +249,67 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
stream
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
max_completion_tokens
=
10
,
temperature
=
0.0
,
stream
=
True
,
)
chunks
:
List
[
str
]
=
[]
finish_reason_count
=
0
async
for
chunk
in
stream
:
delta
=
chunk
.
choices
[
0
].
delta
if
delta
.
role
:
assert
delta
.
role
==
"assistant"
if
delta
.
content
:
chunks
.
append
(
delta
.
content
)
if
chunk
.
choices
[
0
].
finish_reason
is
not
None
:
finish_reason_count
+=
1
# finish reason should only return in last block
assert
finish_reason_count
==
1
assert
chunk
.
choices
[
0
].
finish_reason
==
stop_reason
assert
delta
.
content
assert
""
.
join
(
chunks
)
==
output
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"audio_url"
,
TEST_AUDIO_URLS
)
async
def
test_chat_streaming_input_audio
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
audio_url
:
str
,
base64_encoded_audio
:
Dict
[
str
,
str
]):
messages
=
[{
"role"
:
"user"
,
"content"
:
[
{
"type"
:
"input_audio"
,
"input_audio"
:
{
"data"
:
base64_encoded_audio
[
audio_url
],
"format"
:
"wav"
}
},
{
"type"
:
"text"
,
"text"
:
"What's happening in this audio?"
},
],
}]
# test single completion
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_completion_tokens
=
10
,
temperature
=
0.0
,
)
output
=
chat_completion
.
choices
[
0
].
message
.
content
stop_reason
=
chat_completion
.
choices
[
0
].
finish_reason
# test streaming
stream
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_completion_tokens
=
10
,
temperature
=
0.0
,
stream
=
True
,
)
...
...
@@ -214,7 +334,8 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"audio_url"
,
TEST_AUDIO_URLS
)
async
def
test_multi_audio_input
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
audio_url
:
str
):
audio_url
:
str
,
base64_encoded_audio
:
Dict
[
str
,
str
]):
messages
=
[{
"role"
:
...
...
@@ -227,9 +348,10 @@ async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
}
},
{
"type"
:
"audio_url"
,
"audio_url"
:
{
"url"
:
audio_url
"type"
:
"input_audio"
,
"input_audio"
:
{
"data"
:
base64_encoded_audio
[
audio_url
],
"format"
:
"wav"
}
},
{
...
...
@@ -243,7 +365,7 @@ async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
max_
completion_
tokens
=
10
,
temperature
=
0.0
,
)
...
...
tests/entrypoints/openai/test_basic.py
View file @
4d3a2c28
import
asyncio
from
http
import
HTTPStatus
from
typing
import
List
import
openai
import
pytest
...
...
@@ -13,8 +15,44 @@ from ...utils import RemoteOpenAIServer, models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"HuggingFaceH4/zephyr-7b-beta"
)
@pytest.fixture(scope='module')
def server_args(request: pytest.FixtureRequest) -> List[str]:
    """Supply extra server CLI arguments through indirect parametrization.

    Without parametrization the fixture yields an empty list. A bare string
    parameter is wrapped in a one-element list; any other parameter is
    returned unchanged.

    Usage:

    >>> @pytest.mark.parametrize(
    >>>     "server_args",
    >>>     [
    >>>         ["--disable-frontend-multiprocessing"],
    >>>         [
    >>>             "--model=NousResearch/Hermes-3-Llama-3.1-70B",
    >>>             "--enable-auto-tool-choice",
    >>>         ],
    >>>     ],
    >>>     indirect=True,
    >>> )
    >>> def test_foo(server, client):
    >>>     ...

    `test_foo` then runs twice, against servers started with:
    - `--disable-frontend-multiprocessing`
    - `--model=NousResearch/Hermes-3-Llama-3.1-70B --enable-auto-tool-choice`.
    """
    # A private sentinel separates "no param attribute" from a param whose
    # value happens to be None.
    absent = object()
    extra = getattr(request, "param", absent)
    if extra is absent:
        return []
    return [extra] if isinstance(extra, str) else extra
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
def
server
(
server_args
):
args
=
[
# use half precision for speed and memory savings in CI environment
"--dtype"
,
...
...
@@ -24,6 +62,7 @@ def server():
"--enforce-eager"
,
"--max-num-seqs"
,
"128"
,
*
server_args
,
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
...
...
@@ -36,20 +75,83 @@ async def client(server):
yield
async_client
@
pytest
.
mark
.
parametrize
(
"server_args"
,
[
pytest
.
param
([],
id
=
"default-frontend-multiprocessing"
),
pytest
.
param
([
"--disable-frontend-multiprocessing"
],
id
=
"disable-frontend-multiprocessing"
)
],
indirect
=
True
,
)
@
pytest
.
mark
.
asyncio
async
def
test_show_version
(
client
:
openai
.
AsyncOpenAI
):
base_url
=
str
(
client
.
base_url
)[:
-
3
].
strip
(
"/"
)
response
=
requests
.
get
(
base_url
+
"/version"
)
async
def
test_show_version
(
server
:
RemoteOpenAIServer
):
response
=
requests
.
get
(
server
.
url_for
(
"version"
))
response
.
raise_for_status
()
assert
response
.
json
()
==
{
"version"
:
VLLM_VERSION
}
@
pytest
.
mark
.
parametrize
(
"server_args"
,
[
pytest
.
param
([],
id
=
"default-frontend-multiprocessing"
),
pytest
.
param
([
"--disable-frontend-multiprocessing"
],
id
=
"disable-frontend-multiprocessing"
)
],
indirect
=
True
,
)
@
pytest
.
mark
.
asyncio
async
def
test_check_health
(
client
:
openai
.
AsyncOpenAI
):
base_url
=
str
(
client
.
base_url
)[:
-
3
].
strip
(
"/"
)
response
=
requests
.
get
(
base_url
+
"/health"
)
async
def
test_check_health
(
server
:
RemoteOpenAIServer
):
response
=
requests
.
get
(
server
.
url_for
(
"health"
))
assert
response
.
status_code
==
HTTPStatus
.
OK
@pytest.mark.parametrize(
    "server_args",
    [
        pytest.param(["--max-model-len", "10100"],
                     id="default-frontend-multiprocessing"),
        pytest.param(
            ["--disable-frontend-multiprocessing", "--max-model-len", "10100"],
            id="disable-frontend-multiprocessing"),
    ],
    indirect=True,
)
@pytest.mark.asyncio
async def test_request_cancellation(server: RemoteOpenAIServer):
    """Flood the server with requests that time out, then check it still answers.

    clunky test: send an ungodly amount of load in with short timeouts
    then ensure that it still responds quickly afterwards
    """
    chat_input = [{"role": "user", "content": "Write a long story"}]

    # Request about 2 million tokens (200 requests x 10000 tokens each)
    # through a client that gives up after half a second.
    impatient_client = server.get_async_client(timeout=0.5)
    tasks = [
        asyncio.create_task(
            impatient_client.chat.completions.create(
                messages=chat_input,
                model=MODEL_NAME,
                max_tokens=10000,
                extra_body={"min_tokens": 10000},
            )) for _ in range(200)
    ]

    finished, unfinished = await asyncio.wait(
        tasks, return_when=asyncio.ALL_COMPLETED)

    # Make sure all requests were sent to the server and timed out
    # (We don't want to hide other errors like 400s that would invalidate this
    # test)
    assert len(unfinished) == 0
    for task in finished:
        with pytest.raises(openai.APITimeoutError):
            task.result()

    # If the server had not cancelled all the other requests, then it would not
    # be able to respond to this one within the timeout
    patient_client = server.get_async_client(timeout=5)
    response = await patient_client.chat.completions.create(
        messages=chat_input,
        model=MODEL_NAME,
        max_tokens=10,
    )

    assert len(response.choices) == 1
tests/entrypoints/openai/test_chat.py
View file @
4d3a2c28
...
...
@@ -17,9 +17,6 @@ from .test_completion import zephyr_lora_files # noqa: F401
# any model with a chat template should work here
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"HuggingFaceH4/zephyr-7b-beta"
)
# technically this needs Mistral-7B-v0.1 as base, but we're not testing
# generation quality here
LORA_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"typeof/zephyr-7b-beta-lora"
)
@
pytest
.
fixture
(
scope
=
"module"
)
...
...
@@ -69,11 +66,12 @@ async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
"content"
:
"what is 1+1?"
}]
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
5
,
temperature
=
0.0
,
logprobs
=
False
)
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_completion_tokens
=
5
,
temperature
=
0.0
,
logprobs
=
False
)
choice
=
chat_completion
.
choices
[
0
]
assert
choice
.
logprobs
is
None
...
...
@@ -94,12 +92,13 @@ async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
"content"
:
"what is 1+1?"
}]
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
5
,
temperature
=
0.0
,
logprobs
=
True
,
top_logprobs
=
0
)
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_completion_tokens
=
5
,
temperature
=
0.0
,
logprobs
=
True
,
top_logprobs
=
0
)
choice
=
chat_completion
.
choices
[
0
]
assert
choice
.
logprobs
is
not
None
...
...
@@ -121,12 +120,13 @@ async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
"content"
:
"what is 1+1?"
}]
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
5
,
temperature
=
0.0
,
logprobs
=
True
,
top_logprobs
=
5
)
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_completion_tokens
=
5
,
temperature
=
0.0
,
logprobs
=
True
,
top_logprobs
=
5
)
choice
=
chat_completion
.
choices
[
0
]
assert
choice
.
logprobs
is
not
None
...
...
@@ -153,7 +153,7 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI,
with
pytest
.
raises
((
openai
.
BadRequestError
,
openai
.
APIError
)):
stream
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
max_
completion_
tokens
=
10
,
logprobs
=
True
,
top_logprobs
=
21
,
stream
=
True
)
...
...
@@ -163,16 +163,17 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI,
with
pytest
.
raises
(
openai
.
BadRequestError
):
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
max_
completion_
tokens
=
10
,
logprobs
=
True
,
top_logprobs
=
30
,
stream
=
False
)
# the server should still work afterwards
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
stream
=
False
)
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_completion_tokens
=
10
,
stream
=
False
)
message
=
chat_completion
.
choices
[
0
].
message
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
0
...
...
@@ -275,11 +276,12 @@ async def test_single_chat_session(client: openai.AsyncOpenAI,
}]
# test single completion
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
logprobs
=
True
,
top_logprobs
=
5
)
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_completion_tokens
=
10
,
logprobs
=
True
,
top_logprobs
=
5
)
assert
chat_completion
.
id
is
not
None
assert
len
(
chat_completion
.
choices
)
==
1
...
...
@@ -298,7 +300,7 @@ async def test_single_chat_session(client: openai.AsyncOpenAI,
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
max_
completion_
tokens
=
10
,
)
message
=
chat_completion
.
choices
[
0
].
message
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
0
...
...
@@ -323,7 +325,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
max_
completion_
tokens
=
10
,
temperature
=
0.0
,
)
output
=
chat_completion
.
choices
[
0
].
message
.
content
...
...
@@ -333,7 +335,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
stream
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
max_
completion_
tokens
=
10
,
temperature
=
0.0
,
stream
=
True
,
)
...
...
@@ -377,7 +379,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
stream
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
max_
completion_
tokens
=
10
,
temperature
=
0.0
,
stream
=
True
,
stream_options
=
{
"include_usage"
:
False
})
...
...
@@ -388,7 +390,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
# "continuous_usage_stats": False}}
stream
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
max_
completion_
tokens
=
10
,
temperature
=
0.0
,
stream
=
True
,
stream_options
=
{
...
...
@@ -417,7 +419,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
max_
completion_
tokens
=
10
,
temperature
=
0.0
,
stream
=
False
,
stream_options
=
{
"include_usage"
:
None
})
...
...
@@ -427,7 +429,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
max_
completion_
tokens
=
10
,
temperature
=
0.0
,
stream
=
False
,
stream_options
=
{
"include_usage"
:
True
})
...
...
@@ -437,19 +439,29 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
stream
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_tokens
=
10
,
max_completion_tokens
=
10
,
extra_body
=
dict
(
min_tokens
=
10
),
temperature
=
0.0
,
stream
=
True
,
stream_options
=
{
"include_usage"
:
True
,
"continuous_usage_stats"
:
True
"continuous_usage_stats"
:
True
,
},
)
last_completion_tokens
=
0
async
for
chunk
in
stream
:
assert
chunk
.
usage
.
prompt_tokens
>=
0
assert
chunk
.
usage
.
completion_tokens
>=
0
assert
last_completion_tokens
==
0
or
\
chunk
.
usage
.
completion_tokens
>
last_completion_tokens
or
\
(
not
chunk
.
choices
and
chunk
.
usage
.
completion_tokens
==
last_completion_tokens
)
assert
chunk
.
usage
.
total_tokens
==
(
chunk
.
usage
.
prompt_tokens
+
chunk
.
usage
.
completion_tokens
)
last_completion_tokens
=
chunk
.
usage
.
completion_tokens
assert
last_completion_tokens
==
10
# NOTE: Not sure why, but when I place this after `test_guided_regex_chat`
...
...
@@ -474,7 +486,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
messages
,
max_tokens
=
10
,
max_
completion_
tokens
=
10
,
extra_body
=
dict
(
guided_choice
=
sample_guided_choice
,
guided_decoding_backend
=
guided_decoding_backend
))
choice1
=
chat_completion
.
choices
[
0
].
message
.
content
...
...
@@ -488,7 +500,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
messages
,
max_tokens
=
10
,
max_
completion_
tokens
=
10
,
extra_body
=
dict
(
guided_choice
=
sample_guided_choice
,
guided_decoding_backend
=
guided_decoding_backend
))
choice2
=
chat_completion
.
choices
[
0
].
message
.
content
...
...
@@ -515,7 +527,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI,
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
messages
,
max_tokens
=
1000
,
max_
completion_
tokens
=
1000
,
extra_body
=
dict
(
guided_json
=
sample_json_schema
,
guided_decoding_backend
=
guided_decoding_backend
))
message
=
chat_completion
.
choices
[
0
].
message
...
...
@@ -533,7 +545,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI,
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
messages
,
max_tokens
=
1000
,
max_
completion_
tokens
=
1000
,
extra_body
=
dict
(
guided_json
=
sample_json_schema
,
guided_decoding_backend
=
guided_decoding_backend
))
message
=
chat_completion
.
choices
[
0
].
message
...
...
@@ -561,7 +573,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
messages
,
max_tokens
=
20
,
max_
completion_
tokens
=
20
,
extra_body
=
dict
(
guided_regex
=
sample_regex
,
guided_decoding_backend
=
guided_decoding_backend
))
ip1
=
chat_completion
.
choices
[
0
].
message
.
content
...
...
@@ -573,7 +585,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
messages
,
max_tokens
=
20
,
max_
completion_
tokens
=
20
,
extra_body
=
dict
(
guided_regex
=
sample_regex
,
guided_decoding_backend
=
guided_decoding_backend
))
ip2
=
chat_completion
.
choices
[
0
].
message
.
content
...
...
@@ -621,7 +633,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
messages
,
max_tokens
=
10
,
max_
completion_
tokens
=
10
,
logprobs
=
True
,
top_logprobs
=
5
,
extra_body
=
dict
(
guided_choice
=
sample_guided_choice
,
...
...
@@ -658,7 +670,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
messages
,
max_tokens
=
1000
,
max_
completion_
tokens
=
1000
,
tools
=
[{
"type"
:
"function"
,
"function"
:
{
...
...
@@ -692,7 +704,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
stream
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
messages
,
max_tokens
=
1000
,
max_
completion_
tokens
=
1000
,
tools
=
[{
"type"
:
"function"
,
"function"
:
{
...
...
@@ -748,7 +760,7 @@ async def test_required_tool_use_not_yet_supported(
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
messages
,
max_tokens
=
1000
,
max_
completion_
tokens
=
1000
,
tools
=
[{
"type"
:
"function"
,
"function"
:
{
...
...
@@ -763,7 +775,7 @@ async def test_required_tool_use_not_yet_supported(
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
messages
,
max_tokens
=
1000
,
max_
completion_
tokens
=
1000
,
tools
=
[{
"type"
:
"function"
,
"function"
:
{
...
...
@@ -794,7 +806,7 @@ async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
with
pytest
.
raises
(
openai
.
BadRequestError
):
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
messages
,
max_tokens
=
1000
,
max_
completion_
tokens
=
1000
,
tool_choice
=
{
"type"
:
"function"
,
"function"
:
{
...
...
@@ -807,7 +819,7 @@ async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
messages
,
max_tokens
=
1000
,
max_
completion_
tokens
=
1000
,
tools
=
[{
"type"
:
"function"
,
"function"
:
{
...
...
@@ -822,6 +834,20 @@ async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
"name"
:
"nondefined_function_name"
}
})
with
pytest
.
raises
(
openai
.
BadRequestError
):
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
messages
,
max_completion_tokens
=
1000
,
tools
=
[{
"type"
:
"function"
,
"function"
:
{
"name"
:
"dummy_function_name"
,
"description"
:
"This is a dummy function"
,
"parameters"
:
sample_json_schema
}
}],
tool_choice
=
{})
@
pytest
.
mark
.
asyncio
...
...
@@ -846,14 +872,28 @@ async def test_response_format_json_object(client: openai.AsyncOpenAI):
@
pytest
.
mark
.
asyncio
async
def
test_response_format_json_schema
(
client
:
openai
.
AsyncOpenAI
):
prompt
=
'what is 1+1? The format is "result": 2'
# Check that this prompt cannot lead to a valid JSON without json_schema
for
_
in
range
(
2
):
resp
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
[{
"role"
:
"user"
,
"content"
:
(
'what is 1+1? please respond with a JSON object, '
'the format is {"result": 2}'
)
"role"
:
"user"
,
"content"
:
prompt
}],
)
content
=
resp
.
choices
[
0
].
message
.
content
assert
content
is
not
None
with
pytest
.
raises
((
json
.
JSONDecodeError
,
AssertionError
)):
loaded
=
json
.
loads
(
content
)
assert
loaded
==
{
"result"
:
2
},
loaded
for
_
in
range
(
2
):
resp
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
[{
"role"
:
"user"
,
"content"
:
prompt
}],
response_format
=
{
"type"
:
"json_schema"
,
...
...
@@ -878,19 +918,19 @@ async def test_response_format_json_schema(client: openai.AsyncOpenAI):
@
pytest
.
mark
.
asyncio
async
def
test_extra_fields
(
client
:
openai
.
AsyncOpenAI
):
with
pytest
.
raises
(
BadRequestError
)
as
exc_info
:
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
[{
"role"
:
"system
"
,
"content"
:
"You are a helpful assistant.
"
,
"extra_field"
:
"0"
,
}],
# type: ignore
temperature
=
0
,
seed
=
0
)
assert
"extra_forbidd
en
"
i
n
exc_info
.
value
.
messag
e
async
def
test_extra_fields
_allowed
(
client
:
openai
.
AsyncOpenAI
):
resp
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
[{
"role"
:
"user"
,
"content"
:
"what is 1+1?
"
,
"extra_field"
:
"0
"
,
}],
# type: ignore
temperature
=
0
,
seed
=
0
)
content
=
resp
.
choices
[
0
].
message
.
content
assert
cont
en
t
i
s
not
Non
e
@
pytest
.
mark
.
asyncio
...
...
tests/entrypoints/openai/test_chat_echo.py
0 → 100644
View file @
4d3a2c28
from
typing
import
NamedTuple
import
os
import
openai
# use the official client for correctness check
import
pytest
import
pytest_asyncio
from
...utils
import
RemoteOpenAIServer
,
models_path_prefix
# Any model with a chat template should work here; the path is built by
# joining `models_path_prefix` with the model identifier.
MODEL_NAME = os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct")
DUMMY_CHAT_TEMPLATE
=
"""{% for message in messages %}{{message['role'] + ': ' + message['content'] + '
\\
n'}}{% endfor %}"""
# noqa: E501
@pytest.fixture(scope="module")
def server():
    """Start one server per test module using ``DUMMY_CHAT_TEMPLATE``."""
    cli_args = []
    # use half precision for speed and memory savings in CI environment
    cli_args += ["--dtype", "float16"]
    cli_args.append("--enforce-eager")
    cli_args += ["--max-model-len", "4080"]
    # Pass the template defined at module level.
    cli_args += ["--chat-template", DUMMY_CHAT_TEMPLATE]

    with RemoteOpenAIServer(MODEL_NAME, cli_args) as running_server:
        yield running_server
@pytest_asyncio.fixture
async def client(server):
    """Yield an async client obtained from ``server`` for one test."""
    client_cm = server.get_async_client()
    async with client_cm as openai_client:
        yield openai_client
class TestCase(NamedTuple):
    """Parameter bundle for the echo test: which model to query and the echo flag."""

    # Despite the ``Test`` prefix this is plain data, not a test class. Opting
    # out of collection stops pytest from warning that it cannot collect a
    # class with a ``__new__`` constructor.
    __test__ = False

    # Passed as ``model`` in the chat completion request.
    model_name: str
    # Sent as the ``echo`` field of the request's ``extra_body``.
    echo: bool
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "test_case",
    [TestCase(model_name=MODEL_NAME, echo=flag) for flag in (True, False)],
)
async def test_chat_session_with_echo_and_continue_final_message(
        client: openai.AsyncOpenAI, test_case: TestCase):
    """Continue a partial assistant message and check how ``echo`` affects it.

    With ``echo`` enabled the returned content must contain the partial
    assistant text that was sent; with it disabled the content must not.
    """
    saying: str = "Here is a common saying about apple. An apple a day, keeps"

    conversation = [
        {
            "role": "user",
            "content": "tell me a common saying"
        },
        {
            "role": "assistant",
            "content": saying
        },
    ]
    # test echo with continue_final_message parameter
    extra_options = {
        "echo": test_case.echo,
        "continue_final_message": True,
        "add_generation_prompt": False
    }
    chat_completion = await client.chat.completions.create(
        model=test_case.model_name,
        messages=conversation,
        extra_body=extra_options)

    assert chat_completion.id is not None
    assert len(chat_completion.choices) == 1

    choice = chat_completion.choices[0]
    assert choice.finish_reason == "stop"

    reply = choice.message
    assert reply.content is not None
    # The sent text is present in the reply exactly when echo was requested.
    assert (saying in reply.content) == bool(test_case.echo)
    assert reply.role == "assistant"
tests/entrypoints/openai/test_chat_template.py
View file @
4d3a2c28
...
...
@@ -13,7 +13,7 @@ assert chatml_jinja_path.exists()
# Define models, templates, and their corresponding expected outputs
MODEL_TEMPLATE_GENERATON_OUTPUT
=
[
(
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
),
chatml_jinja_path
,
True
,
"""<|im_start|>user
(
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
),
chatml_jinja_path
,
True
,
False
,
"""<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
...
...
@@ -21,12 +21,20 @@ Hi there!<|im_end|>
What is the capital of<|im_end|>
<|im_start|>assistant
"""
),
(
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
),
chatml_jinja_path
,
False
,
"""<|im_start|>user
(
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
),
chatml_jinja_path
,
False
,
False
,
"""<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of"""
)
What is the capital of"""
),
(
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
),
chatml_jinja_path
,
False
,
True
,
"""<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of<|im_end|>
<|im_start|>assistant
The capital of"""
),
]
TEST_MESSAGES
=
[
...
...
@@ -43,6 +51,10 @@ TEST_MESSAGES = [
'content'
:
'What is the capital of'
},
]
ASSISTANT_MESSAGE_TO_CONTINUE
=
{
'role'
:
'assistant'
,
'content'
:
'The capital of'
}
def
test_load_chat_template
():
...
...
@@ -74,10 +86,10 @@ def test_no_load_chat_template_literallike():
@
pytest
.
mark
.
parametrize
(
"model,template,add_generation_prompt,expected_output"
,
"model,template,add_generation_prompt,
continue_final_message,
expected_output"
,
MODEL_TEMPLATE_GENERATON_OUTPUT
)
def
test_get_gen_prompt
(
model
,
template
,
add_generation_prompt
,
expected_output
):
continue_final_message
,
expected_output
):
# Initialize the tokenizer
tokenizer
=
get_tokenizer
(
tokenizer_name
=
model
)
template_content
=
load_chat_template
(
chat_template
=
template
)
...
...
@@ -85,8 +97,11 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
# Create a mock request object using keyword arguments
mock_request
=
ChatCompletionRequest
(
model
=
model
,
messages
=
TEST_MESSAGES
,
add_generation_prompt
=
add_generation_prompt
)
messages
=
TEST_MESSAGES
+
[
ASSISTANT_MESSAGE_TO_CONTINUE
]
if
continue_final_message
else
TEST_MESSAGES
,
add_generation_prompt
=
add_generation_prompt
,
continue_final_message
=
continue_final_message
,
)
# Call the function and get the result
result
=
apply_hf_chat_template
(
...
...
@@ -94,6 +109,7 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
conversation
=
mock_request
.
messages
,
chat_template
=
mock_request
.
chat_template
or
template_content
,
add_generation_prompt
=
mock_request
.
add_generation_prompt
,
continue_final_message
=
mock_request
.
continue_final_message
,
)
# Test assertion
...
...
tests/entrypoints/openai/test_chunked_prompt.py
0 → 100644
View file @
4d3a2c28
import
os
import
openai
# use the official client for correctness check
import
pytest
import
pytest_asyncio
from
...utils
import
RemoteOpenAIServer
,
models_path_prefix
# Any model with a chat template should work here; the path is built by
# joining `models_path_prefix` with the model identifier.
MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")
@pytest.fixture(scope="module")
def server():
    """Start one server per test module with chunked prefill enabled."""
    cli_args = []
    # use half precision for speed and memory savings in CI environment
    cli_args += ["--dtype", "bfloat16"]
    cli_args += ["--max-model-len", "8192"]
    cli_args.append("--enforce-eager")
    # Sequence limit, plus chunked prefill with --max-num-batched-tokens
    # set to 1000.
    cli_args += ["--max-num-seqs", "128"]
    cli_args.append("--enable-chunked-prefill")
    cli_args += ["--max-num-batched-tokens", "1000"]
    # large prompts create a lot of output
    cli_args.append("--disable-log-requests")

    with RemoteOpenAIServer(MODEL_NAME, cli_args) as running_server:
        yield running_server
@pytest_asyncio.fixture
async def client(server):
    """Yield an async client obtained from ``server`` for one test."""
    client_cm = server.get_async_client()
    async with client_cm as openai_client:
        yield openai_client
@pytest.mark.asyncio
async def test_completion_stream_options_and_logprobs_with_long_prompts(
        client: openai.AsyncOpenAI):
    """Stream a completion for a long prompt and validate per-chunk usage.

    The request enables `include_usage` and `continuous_usage_stats`, so
    every chunk is expected to carry a `usage` object. For each chunk the
    total must equal prompt + completion tokens. Every chunk up to and
    including the one that reports a `finish_reason` must carry non-empty
    text, and from that chunk onwards `completion_tokens` must equal the
    number of text chunks counted so far.
    """
    # Test stream with long prompt
    prompt = "What is the capital of France?" * 400

    stream = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=5,
        temperature=0.0,
        stream=True,
        stream_options={
            "include_usage": True,
            "continuous_usage_stats": True,
        },
        logprobs=5,
    )

    tokens_received = 0
    finished = False
    async for chunk in stream:
        usage = chunk.usage
        assert usage.prompt_tokens >= 0
        assert usage.completion_tokens >= 0
        assert usage.total_tokens == (usage.prompt_tokens +
                                      usage.completion_tokens)

        if not finished:
            # Chunks after the finishing one are not inspected for text;
            # `choices` is only indexed while the stream is unfinished.
            tokens_received += 1
            assert chunk.choices[0].text

            if chunk.choices[0].finish_reason is not None:
                finished = True

        if finished:
            assert usage.completion_tokens == tokens_received

    # Guard against a vacuous pass: an empty stream, or one that ends without
    # ever reporting a finish_reason, would skip every check above that
    # depends on `finished`.
    assert finished, "stream ended without reporting a finish_reason"
@pytest.mark.asyncio
async def test_chat_completion_stream_options_and_logprobs_with_long_prompts(
        client: openai.AsyncOpenAI):
    """Stream a chat completion for a long prompt and validate usage stats.

    The request enables `include_usage` and `continuous_usage_stats`, so
    every chunk is expected to carry a `usage` object whose total equals
    prompt + completion tokens. While the stream is unfinished, a chunk with
    empty delta content must report zero completion tokens and no logprobs;
    any other chunk counts as one received token. From the chunk that
    reports a `finish_reason` onwards, `completion_tokens` must equal the
    number of tokens counted. At most one empty-content chunk is tolerated.
    """
    # Test stream with long prompt
    messages = [{
        "role": "system",
        "content": "You are a helpful assistant."
    }, {
        "role": "user",
        "content": "What is the capital of France?" * 400
    }]

    stream = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_tokens=5,
        temperature=0.0,
        stream=True,
        stream_options={
            "include_usage": True,
            "continuous_usage_stats": True,
        },
        logprobs=True,
        top_logprobs=5,
    )

    tokens_received = 0
    empty_chunks_received = 0
    finished = False
    async for chunk in stream:
        usage = chunk.usage
        assert usage.prompt_tokens >= 0
        assert usage.completion_tokens >= 0
        assert usage.total_tokens == (usage.prompt_tokens +
                                      usage.completion_tokens)

        if not finished:
            if chunk.choices[0].delta.content == "":
                # when there is no tokens generated
                assert usage.completion_tokens == 0
                assert chunk.choices[0].logprobs is None
                empty_chunks_received += 1
            else:
                tokens_received += 1

            if chunk.choices[0].finish_reason is not None:
                finished = True

        if finished:
            assert usage.completion_tokens == tokens_received

    # Guard against a vacuous pass: an empty stream, or one that ends without
    # ever reporting a finish_reason, would skip the completion-count check.
    assert finished, "stream ended without reporting a finish_reason"
    assert empty_chunks_received <= 1
tests/entrypoints/openai/test_cli_args.py
View file @
4d3a2c28
import
json
import
unittest
from
vllm.entrypoints.openai.cli_args
import
make_arg_parser
import
pytest
from
vllm.entrypoints.openai.cli_args
import
(
make_arg_parser
,
validate_parsed_serve_args
)
from
vllm.entrypoints.openai.serving_engine
import
LoRAModulePath
from
vllm.utils
import
FlexibleArgumentParser
from
...utils
import
VLLM_PATH
# JSON-style LoRA module spec reused by the parser tests below.
LORA_MODULE = {
    "name": "module2",
    "path": "/path/to/module2",
    "base_model_name": "llama",
}

# Chat template used by the validation tests; the module-level assert makes
# the whole file fail at import time if the template is missing.
CHATML_JINJA_PATH = VLLM_PATH / "examples/template_chatml.jinja"
assert CHATML_JINJA_PATH.exists()
class
TestLoraParserAction
(
unittest
.
TestCase
):
@pytest.fixture
def serve_parser():
    """Return a fresh serve argument parser produced by `make_arg_parser`."""
    return make_arg_parser(
        FlexibleArgumentParser(description="vLLM's remote OpenAI server."))
def
setUp
(
self
):
# Setting up argparse parser for tests
parser
=
FlexibleArgumentParser
(
description
=
"vLLM's remote OpenAI server."
)
self
.
parser
=
make_arg_parser
(
parser
)
def
test_valid_key_value_format
(
self
):
# Test old format: name=path
args
=
self
.
parser
.
parse_args
([
'--lora-modules'
,
'module1=/path/to/module1'
,
### Tests for Lora module parsing
def test_valid_key_value_format(serve_parser):
    """The old `name=path` syntax parses into a single LoRAModulePath."""
    argv = [
        '--lora-modules',
        'module1=/path/to/module1',
    ]
    parsed = serve_parser.parse_args(argv)

    assert parsed.lora_modules == [
        LoRAModulePath(name='module1', path='/path/to/module1'),
    ]
def test_valid_json_format(serve_parser):
    """A JSON-encoded module spec parses into a LoRAModulePath."""
    json_spec = json.dumps(LORA_MODULE)
    parsed = serve_parser.parse_args(['--lora-modules', json_spec])

    assert parsed.lora_modules == [
        LoRAModulePath(name='module2',
                       path='/path/to/module2',
                       base_model_name='llama'),
    ]
def test_invalid_json_format(serve_parser):
    """A JSON value missing its closing brace makes parsing exit."""
    truncated_json = '{"name": "module3", "path": "/path/to/module3"'

    with pytest.raises(SystemExit):
        serve_parser.parse_args(['--lora-modules', truncated_json])
expected
=
[
LoRAModulePath
(
name
=
'module1'
,
path
=
'/path/to/module1'
)]
self
.
assertEqual
(
args
.
lora_modules
,
expected
)
def
test_valid_json_format
(
self
):
# Test valid JSON format input
args
=
self
.
parser
.
parse_args
([
def
test_invalid_type_error
(
serve_parser
):
# Test type error when values are not JSON or key=value
with
pytest
.
raises
(
SystemExit
):
serve_parser
.
parse_args
([
'--lora-modules'
,
json
.
dumps
(
LORA_MODULE
),
'invalid_format'
# This is not JSON or key=value format
])
expected
=
[
LoRAModulePath
(
name
=
'module2'
,
path
=
'/path/to/module2'
,
base_model_name
=
'llama'
)
]
self
.
assertEqual
(
args
.
lora_modules
,
expected
)
def
test_invalid_json_format
(
self
):
# Test invalid JSON format input, missing closing brace
with
self
.
assertRaises
(
SystemExit
):
self
.
parser
.
parse_args
([
'--lora-modules'
,
'{"name": "module3", "path": "/path/to/module3"'
])
def
test_invalid_type_error
(
self
):
# Test type error when values are not JSON or key=value
with
self
.
assertRaises
(
SystemExit
):
self
.
parser
.
parse_args
([
'--lora-modules'
,
'invalid_format'
# This is not JSON or key=value format
])
def
test_invalid_json_field
(
self
):
# Test valid JSON format but missing required fields
with
self
.
assertRaises
(
SystemExit
):
self
.
parser
.
parse_args
([
'--lora-modules'
,
'{"name": "module4"}'
# Missing required 'path' field
])
def
test_empty_values
(
self
):
# Test when no LoRA modules are provided
args
=
self
.
parser
.
parse_args
([
'--lora-modules'
,
''
])
self
.
assertEqual
(
args
.
lora_modules
,
[])
def
test_multiple_valid_inputs
(
self
):
# Test multiple valid inputs (both old and JSON format)
args
=
self
.
parser
.
parse_args
([
def
test_invalid_json_field
(
serve_parser
):
# Test valid JSON format but missing required fields
with
pytest
.
raises
(
SystemExit
):
serve_parser
.
parse_args
([
'--lora-modules'
,
'module1=/path/to/module1'
,
json
.
dumps
(
LORA_MODULE
),
'{"name": "module4"}'
# Missing required 'path' field
])
expected
=
[
LoRAModulePath
(
name
=
'module1'
,
path
=
'/path/to/module1'
),
LoRAModulePath
(
name
=
'module2'
,
path
=
'/path/to/module2'
,
base_model_name
=
'llama'
)
]
self
.
assertEqual
(
args
.
lora_modules
,
expected
)
if
__name__
==
'__main__'
:
unittest
.
main
()
def test_empty_values(serve_parser):
    """An empty `--lora-modules` value yields an empty module list."""
    parsed = serve_parser.parse_args(['--lora-modules', ''])
    assert parsed.lora_modules == []
def test_multiple_valid_inputs(serve_parser):
    """Old `name=path` and JSON specs can be mixed in one invocation."""
    argv = [
        '--lora-modules',
        'module1=/path/to/module1',
        json.dumps(LORA_MODULE),
    ]
    parsed = serve_parser.parse_args(argv)

    # Results are compared as an ordered list, matching the argument order.
    assert parsed.lora_modules == [
        LoRAModulePath(name='module1', path='/path/to/module1'),
        LoRAModulePath(name='module2',
                       path='/path/to/module2',
                       base_model_name='llama'),
    ]
### Tests for serve argument validation that run prior to loading
def test_enable_auto_choice_passes_without_tool_call_parser(serve_parser):
    """Ensure validation fails if tool choice is enabled with no call parser.

    NOTE: despite "passes" in the function name, the expected outcome here is
    a TypeError raised by `validate_parsed_serve_args`.
    """
    # Parsing itself succeeds; the failure is expected from validation only.
    parsed = serve_parser.parse_args(args=["--enable-auto-tool-choice"])

    with pytest.raises(TypeError):
        validate_parsed_serve_args(parsed)
def test_enable_auto_choice_passes_with_tool_call_parser(serve_parser):
    """Ensure validation passes with tool choice enabled with a call parser."""
    argv = [
        "--enable-auto-tool-choice",
        "--tool-call-parser",
        "mistral",
    ]
    parsed = serve_parser.parse_args(args=argv)

    # No exception means the combination was accepted.
    validate_parsed_serve_args(parsed)
def test_chat_template_validation_for_happy_paths(serve_parser):
    """Ensure validation passes if the chat template exists."""
    template_path = CHATML_JINJA_PATH.absolute().as_posix()
    parsed = serve_parser.parse_args(args=["--chat-template", template_path])

    # No exception means the template path was accepted.
    validate_parsed_serve_args(parsed)
def test_chat_template_validation_for_sad_paths(serve_parser):
    """Ensure validation fails if the chat template doesn't exist."""
    argv = ["--chat-template", "does/not/exist"]
    parsed = serve_parser.parse_args(args=argv)

    with pytest.raises(ValueError):
        validate_parsed_serve_args(parsed)
tests/entrypoints/openai/test_completion.py
View file @
4d3a2c28
...
...
@@ -160,15 +160,15 @@ async def test_added_lora_tokens(client: openai.AsyncOpenAI):
@
pytest
.
mark
.
asyncio
async
def
test_added_lora_tokens_base_model
(
client
:
openai
.
AsyncOpenAI
):
# test using token IDs
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
[
0
,
0
,
32000
,
32001
,
32002
],
echo
=
True
,
max_tokens
=
5
,
temperature
=
0.0
,
)
# Added tokens should not appear in tokenized prompt
assert
"vllm"
not
in
completion
.
choices
[
0
].
text
with
pytest
.
raises
(
openai
.
BadRequestError
,
match
=
"out of vocabulary"
):
# Added tokens should be rejected by the base model
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
prompt
=
[
0
,
0
,
32000
,
32001
,
32002
]
,
echo
=
True
,
max_tokens
=
5
,
temperature
=
0.0
,
)
@
pytest
.
mark
.
asyncio
...
...
@@ -343,6 +343,40 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
assert
""
.
join
(
chunks
)
==
single_output
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str):
    """Streaming for parallel sampling.

    Tokens from the `n` samples arrive flattened into a single stream; each
    chunk's choice carries an index saying which sample it belongs to. The
    test regroups the text by that index, then checks that every sample
    reported a finish reason and produced exactly `max_tokens` chunks.
    """
    prompt = "What is an LLM?"
    n = 3
    max_tokens = 5

    stream = await client.completions.create(
        model=model_name,
        prompt=prompt,
        max_tokens=max_tokens,
        n=n,
        stream=True,
    )

    # One bucket of text pieces per sample, addressed by the choice index.
    texts_by_sample: List[List[str]] = [[] for _ in range(n)]
    finished_samples = 0
    async for chunk in stream:
        choice = chunk.choices[0]
        texts_by_sample[choice.index].append(choice.text)
        if choice.finish_reason is not None:
            finished_samples += 1

    assert finished_samples == n
    for sample_texts in texts_by_sample:
        assert len(sample_texts) == max_tokens
        print("".join(sample_texts))
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
...
...
@@ -506,8 +540,8 @@ async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
max_tokens
=
5
,
temperature
=
0.0
,
extra_body
=
dict
(
# NOTE: this has to be true for n > 1 in vLLM, but
not necessary
# for official client.
# NOTE: this has to be true for n > 1 in vLLM, but
#
not necessary
for official client.
use_beam_search
=
True
),
)
assert
len
(
batch
.
choices
)
==
4
...
...
Prev
1
…
17
18
19
20
21
22
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment