Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
4d3a2c28
Commit
4d3a2c28
authored
Dec 30, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.6.5' into v0.6.5-dev
parents
92ec5d8e
2d1b9baa
Changes
430
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1288 additions
and
422 deletions
+1288
-422
tests/entrypoints/llm/test_chat.py
tests/entrypoints/llm/test_chat.py
+94
-0
tests/entrypoints/llm/test_encode.py
tests/entrypoints/llm/test_encode.py
+5
-40
tests/entrypoints/llm/test_generate.py
tests/entrypoints/llm/test_generate.py
+2
-128
tests/entrypoints/llm/test_generate_multiple_loras.py
tests/entrypoints/llm/test_generate_multiple_loras.py
+2
-3
tests/entrypoints/llm/test_gpu_utilization.py
tests/entrypoints/llm/test_gpu_utilization.py
+27
-0
tests/entrypoints/llm/test_guided_generate.py
tests/entrypoints/llm/test_guided_generate.py
+128
-25
tests/entrypoints/llm/test_init.py
tests/entrypoints/llm/test_init.py
+23
-0
tests/entrypoints/llm/test_lazy_outlines.py
tests/entrypoints/llm/test_lazy_outlines.py
+34
-6
tests/entrypoints/llm/test_prompt_validation.py
tests/entrypoints/llm/test_prompt_validation.py
+16
-1
tests/entrypoints/offline_mode/test_offline_mode.py
tests/entrypoints/offline_mode/test_offline_mode.py
+33
-27
tests/entrypoints/openai/test_accuracy.py
tests/entrypoints/openai/test_accuracy.py
+37
-5
tests/entrypoints/openai/test_async_tokenization.py
tests/entrypoints/openai/test_async_tokenization.py
+139
-0
tests/entrypoints/openai/test_audio.py
tests/entrypoints/openai/test_audio.py
+142
-20
tests/entrypoints/openai/test_basic.py
tests/entrypoints/openai/test_basic.py
+111
-9
tests/entrypoints/openai/test_chat.py
tests/entrypoints/openai/test_chat.py
+111
-71
tests/entrypoints/openai/test_chat_echo.py
tests/entrypoints/openai/test_chat_echo.py
+80
-0
tests/entrypoints/openai/test_chat_template.py
tests/entrypoints/openai/test_chat_template.py
+23
-7
tests/entrypoints/openai/test_chunked_prompt.py
tests/entrypoints/openai/test_chunked_prompt.py
+127
-0
tests/entrypoints/openai/test_cli_args.py
tests/entrypoints/openai/test_cli_args.py
+109
-69
tests/entrypoints/openai/test_completion.py
tests/entrypoints/openai/test_completion.py
+45
-11
No files found.
Too many changes to show.
To preserve performance only
430 of 430+
files are displayed.
Plain diff
Email patch
tests/entrypoints/llm/test_chat.py
0 → 100644
View file @
4d3a2c28
from
typing
import
List
import
os
import
pytest
from
vllm
import
LLM
from
..openai.test_vision
import
TEST_IMAGE_URLS
from
...utils
import
models_path_prefix
def test_chat():
    """Send one system+user conversation through ``LLM.chat``.

    The conversation is passed as a flat list of message dicts, and the
    test expects exactly one output back for it.
    """
    model_path = os.path.join(models_path_prefix,
                              "meta-llama/Llama-3.2-1B-Instruct")
    engine = LLM(model=model_path)

    user_prompt = "Explain the concept of entropy."
    conversation = [
        {
            "role": "system",
            "content": "You are a helpful assistant"
        },
        {
            "role": "user",
            "content": user_prompt
        },
    ]

    results = engine.chat(conversation)
    assert len(results) == 1
def test_multi_chat():
    """Send two independent conversations through ``LLM.chat`` at once.

    Each conversation is a system message followed by one user prompt; the
    test expects one output per conversation.
    """
    model_path = os.path.join(models_path_prefix,
                              "meta-llama/Llama-3.2-1B-Instruct")
    engine = LLM(model=model_path)

    user_prompts = (
        "Explain the concept of entropy.",
        "Explain what among us is.",
    )

    # Build a fresh message list (and fresh dicts) per conversation so the
    # two conversations share no state.
    conversations = [[
        {
            "role": "system",
            "content": "You are a helpful assistant"
        },
        {
            "role": "user",
            "content": user_prompt
        },
    ] for user_prompt in user_prompts]

    results = engine.chat(conversations)
    assert len(results) == 2
@pytest.mark.parametrize("image_urls",
                         [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
def test_chat_multi_image(image_urls: List[str]):
    """Check that ``LLM.chat`` accepts one user turn carrying several images.

    Args:
        image_urls: URLs of the images attached to the single user message.
            The engine is configured with ``limit_mm_per_prompt`` of two
            images, which matches the parametrized input.
    """
    llm = LLM(
        model=os.path.join(models_path_prefix,
                           "microsoft/Phi-3.5-vision-instruct"),
        dtype="bfloat16",
        max_model_len=4096,
        max_num_seqs=5,
        enforce_eager=True,
        trust_remote_code=True,
        limit_mm_per_prompt={"image": 2},
    )

    # One content part per image, followed by the text question.
    content = [{
        "type": "image_url",
        "image_url": {
            "url": image_url
        }
    } for image_url in image_urls]
    content.append({"type": "text", "text": "What's in this image?"})

    messages = [{"role": "user", "content": content}]

    outputs = llm.chat(messages)
    # A single conversation must yield exactly one output. The previous
    # ``len(outputs) >= 0`` check was always true and verified nothing.
    assert len(outputs) == 1
tests/entrypoints/llm/test_encode.py
View file @
4d3a2c28
...
@@ -4,9 +4,8 @@ from typing import List
...
@@ -4,9 +4,8 @@ from typing import List
import
pytest
import
pytest
import
os
import
os
from
vllm
import
LLM
,
EmbeddingRequestOutput
,
PoolingParams
from
vllm
import
LLM
,
PoolingParams
,
PoolingRequestOutput
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
...conftest
import
cleanup
from
...utils
import
models_path_prefix
from
...utils
import
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"intfloat/e5-mistral-7b-instruct"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"intfloat/e5-mistral-7b-instruct"
)
...
@@ -43,29 +42,14 @@ def llm():
...
@@ -43,29 +42,14 @@ def llm():
del
llm
del
llm
cleanup
()
cleanup
_dist_env_and_memory
()
def
assert_outputs_equal
(
o1
:
List
[
Embedd
ingRequestOutput
],
def
assert_outputs_equal
(
o1
:
List
[
Pool
ingRequestOutput
],
o2
:
List
[
Embedd
ingRequestOutput
]):
o2
:
List
[
Pool
ingRequestOutput
]):
assert
[
o
.
outputs
for
o
in
o1
]
==
[
o
.
outputs
for
o
in
o2
]
assert
[
o
.
outputs
for
o
in
o1
]
==
[
o
.
outputs
for
o
in
o2
]
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
'prompt'
,
PROMPTS
)
def
test_v1_v2_api_consistency_single_prompt_string
(
llm
:
LLM
,
prompt
):
pooling_params
=
PoolingParams
()
with
pytest
.
warns
(
DeprecationWarning
,
match
=
"'prompts'"
):
v1_output
=
llm
.
encode
(
prompts
=
prompt
,
pooling_params
=
pooling_params
)
v2_output
=
llm
.
encode
(
prompt
,
pooling_params
=
pooling_params
)
assert_outputs_equal
(
v1_output
,
v2_output
)
v2_output
=
llm
.
encode
({
"prompt"
:
prompt
},
pooling_params
=
pooling_params
)
assert_outputs_equal
(
v1_output
,
v2_output
)
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
'prompt_token_ids'
,
TOKEN_IDS
)
@
pytest
.
mark
.
parametrize
(
'prompt_token_ids'
,
TOKEN_IDS
)
def
test_v1_v2_api_consistency_single_prompt_tokens
(
llm
:
LLM
,
def
test_v1_v2_api_consistency_single_prompt_tokens
(
llm
:
LLM
,
...
@@ -81,25 +65,6 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
...
@@ -81,25 +65,6 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
assert_outputs_equal
(
v1_output
,
v2_output
)
assert_outputs_equal
(
v1_output
,
v2_output
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_v1_v2_api_consistency_multi_prompt_string
(
llm
:
LLM
):
pooling_params
=
PoolingParams
()
with
pytest
.
warns
(
DeprecationWarning
,
match
=
"'prompts'"
):
v1_output
=
llm
.
encode
(
prompts
=
PROMPTS
,
pooling_params
=
pooling_params
)
v2_output
=
llm
.
encode
(
PROMPTS
,
pooling_params
=
pooling_params
)
assert_outputs_equal
(
v1_output
,
v2_output
)
v2_output
=
llm
.
encode
(
[{
"prompt"
:
p
}
for
p
in
PROMPTS
],
pooling_params
=
pooling_params
,
)
assert_outputs_equal
(
v1_output
,
v2_output
)
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
skip_global_cleanup
def
test_v1_v2_api_consistency_multi_prompt_tokens
(
llm
:
LLM
):
def
test_v1_v2_api_consistency_multi_prompt_tokens
(
llm
:
LLM
):
pooling_params
=
PoolingParams
()
pooling_params
=
PoolingParams
()
...
...
tests/entrypoints/llm/test_generate.py
View file @
4d3a2c28
...
@@ -5,9 +5,7 @@ import os
...
@@ -5,9 +5,7 @@ import os
import
pytest
import
pytest
from
vllm
import
LLM
,
RequestOutput
,
SamplingParams
from
vllm
import
LLM
,
RequestOutput
,
SamplingParams
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
...conftest
import
cleanup
from
..openai.test_vision
import
TEST_IMAGE_URLS
from
...utils
import
models_path_prefix
from
...utils
import
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
...
@@ -42,30 +40,13 @@ def llm():
...
@@ -42,30 +40,13 @@ def llm():
del
llm
del
llm
cleanup
()
cleanup
_dist_env_and_memory
()
def
assert_outputs_equal
(
o1
:
List
[
RequestOutput
],
o2
:
List
[
RequestOutput
]):
def
assert_outputs_equal
(
o1
:
List
[
RequestOutput
],
o2
:
List
[
RequestOutput
]):
assert
[
o
.
outputs
for
o
in
o1
]
==
[
o
.
outputs
for
o
in
o2
]
assert
[
o
.
outputs
for
o
in
o1
]
==
[
o
.
outputs
for
o
in
o2
]
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
'prompt'
,
PROMPTS
)
def
test_v1_v2_api_consistency_single_prompt_string
(
llm
:
LLM
,
prompt
):
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
top_p
=
1.0
)
with
pytest
.
warns
(
DeprecationWarning
,
match
=
"'prompts'"
):
v1_output
=
llm
.
generate
(
prompts
=
prompt
,
sampling_params
=
sampling_params
)
v2_output
=
llm
.
generate
(
prompt
,
sampling_params
=
sampling_params
)
assert_outputs_equal
(
v1_output
,
v2_output
)
v2_output
=
llm
.
generate
({
"prompt"
:
prompt
},
sampling_params
=
sampling_params
)
assert_outputs_equal
(
v1_output
,
v2_output
)
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
parametrize
(
'prompt_token_ids'
,
TOKEN_IDS
)
@
pytest
.
mark
.
parametrize
(
'prompt_token_ids'
,
TOKEN_IDS
)
def
test_v1_v2_api_consistency_single_prompt_tokens
(
llm
:
LLM
,
def
test_v1_v2_api_consistency_single_prompt_tokens
(
llm
:
LLM
,
...
@@ -81,26 +62,6 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
...
@@ -81,26 +62,6 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
assert_outputs_equal
(
v1_output
,
v2_output
)
assert_outputs_equal
(
v1_output
,
v2_output
)
@
pytest
.
mark
.
skip_global_cleanup
def
test_v1_v2_api_consistency_multi_prompt_string
(
llm
:
LLM
):
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
top_p
=
1.0
)
with
pytest
.
warns
(
DeprecationWarning
,
match
=
"'prompts'"
):
v1_output
=
llm
.
generate
(
prompts
=
PROMPTS
,
sampling_params
=
sampling_params
)
v2_output
=
llm
.
generate
(
PROMPTS
,
sampling_params
=
sampling_params
)
assert_outputs_equal
(
v1_output
,
v2_output
)
v2_output
=
llm
.
generate
(
[{
"prompt"
:
p
}
for
p
in
PROMPTS
],
sampling_params
=
sampling_params
,
)
assert_outputs_equal
(
v1_output
,
v2_output
)
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
skip_global_cleanup
def
test_v1_v2_api_consistency_multi_prompt_tokens
(
llm
:
LLM
):
def
test_v1_v2_api_consistency_multi_prompt_tokens
(
llm
:
LLM
):
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
top_p
=
1.0
)
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
top_p
=
1.0
)
...
@@ -143,90 +104,3 @@ def test_multiple_sampling_params(llm: LLM):
...
@@ -143,90 +104,3 @@ def test_multiple_sampling_params(llm: LLM):
# sampling_params is None, default params should be applied
# sampling_params is None, default params should be applied
outputs
=
llm
.
generate
(
PROMPTS
,
sampling_params
=
None
)
outputs
=
llm
.
generate
(
PROMPTS
,
sampling_params
=
None
)
assert
len
(
PROMPTS
)
==
len
(
outputs
)
assert
len
(
PROMPTS
)
==
len
(
outputs
)
def
test_chat
():
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Meta-Llama-3-8B-Instruct"
))
prompt1
=
"Explain the concept of entropy."
messages
=
[
{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
prompt1
},
]
outputs
=
llm
.
chat
(
messages
)
assert
len
(
outputs
)
==
1
def
test_multi_chat
():
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Meta-Llama-3-8B-Instruct"
))
prompt1
=
"Explain the concept of entropy."
prompt2
=
"Explain what among us is."
conversation1
=
[
{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
prompt1
},
]
conversation2
=
[
{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
prompt2
},
]
messages
=
[
conversation1
,
conversation2
]
outputs
=
llm
.
chat
(
messages
)
assert
len
(
outputs
)
==
2
@
pytest
.
mark
.
parametrize
(
"image_urls"
,
[[
TEST_IMAGE_URLS
[
0
],
TEST_IMAGE_URLS
[
1
]]])
def
test_chat_multi_image
(
image_urls
:
List
[
str
]):
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3.5-vision-instruct"
),
dtype
=
"bfloat16"
,
max_model_len
=
4096
,
max_num_seqs
=
5
,
enforce_eager
=
True
,
trust_remote_code
=
True
,
limit_mm_per_prompt
=
{
"image"
:
2
},
)
messages
=
[{
"role"
:
"user"
,
"content"
:
[
*
({
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
image_url
}
}
for
image_url
in
image_urls
),
{
"type"
:
"text"
,
"text"
:
"What's in this image?"
},
],
}]
outputs
=
llm
.
chat
(
messages
)
assert
len
(
outputs
)
>=
0
tests/entrypoints/llm/test_generate_multiple_loras.py
View file @
4d3a2c28
...
@@ -6,9 +6,8 @@ import os
...
@@ -6,9 +6,8 @@ import os
from
huggingface_hub
import
snapshot_download
from
huggingface_hub
import
snapshot_download
from
vllm
import
LLM
from
vllm
import
LLM
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
from
...conftest
import
cleanup
from
...utils
import
models_path_prefix
from
...utils
import
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"HuggingFaceH4/zephyr-7b-beta"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"HuggingFaceH4/zephyr-7b-beta"
)
...
@@ -41,7 +40,7 @@ def llm():
...
@@ -41,7 +40,7 @@ def llm():
del
llm
del
llm
cleanup
()
cleanup
_dist_env_and_memory
()
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
...
...
tests/entrypoints/llm/test_gpu_utilization.py
0 → 100644
View file @
4d3a2c28
import
os
from
vllm
import
LLM
,
SamplingParams
from
...utils
import
models_path_prefix
def test_gpu_memory_utilization():
    """Create three engines side by side, each with a 0.3 memory budget.

    All three ``LLM`` instances are constructed before any of them
    generates, so the test only passes if ``gpu_memory_utilization`` is
    applied per instance rather than as one global cap.
    """
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    # makes sure gpu_memory_utilization is per-instance limit,
    # not a global limit
    llms = [
        LLM(model=os.path.join(models_path_prefix, "facebook/opt-125m"),
            gpu_memory_utilization=0.3,
            enforce_eager=True) for _ in range(3)
    ]
    for llm in llms:
        outputs = llm.generate(prompts, sampling_params)
        # Every engine must answer every prompt, not merely avoid crashing.
        assert len(outputs) == len(prompts)
        for output in outputs:
            prompt = output.prompt
            generated_text = output.outputs[0].text
            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
tests/entrypoints/llm/test_guided_generate.py
View file @
4d3a2c28
...
@@ -6,11 +6,11 @@ import jsonschema
...
@@ -6,11 +6,11 @@ import jsonschema
import
pytest
import
pytest
import
os
import
os
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.entrypoints.llm
import
LLM
from
vllm.entrypoints.llm
import
LLM
from
vllm.outputs
import
RequestOutput
from
vllm.outputs
import
RequestOutput
from
vllm.sampling_params
import
SamplingParams
from
...conftest
import
cleanup
from
vllm.sampling_params
import
GuidedDecodingParams
,
SamplingParams
from
...utils
import
models_path_prefix
from
...utils
import
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"HuggingFaceH4/zephyr-7b-beta"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"HuggingFaceH4/zephyr-7b-beta"
)
...
@@ -25,7 +25,7 @@ def llm():
...
@@ -25,7 +25,7 @@ def llm():
with
llm
.
deprecate_legacy_api
():
with
llm
.
deprecate_legacy_api
():
yield
weakref
.
proxy
(
llm
)
yield
weakref
.
proxy
(
llm
)
del
llm
del
llm
cleanup
()
cleanup
_dist_env_and_memory
()
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
skip_global_cleanup
...
@@ -33,14 +33,12 @@ def test_guided_regex(sample_regex, llm):
...
@@ -33,14 +33,12 @@ def test_guided_regex(sample_regex, llm):
sampling_params
=
SamplingParams
(
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
temperature
=
0.8
,
top_p
=
0.95
,
top_p
=
0.95
,
)
guided_decoding
=
GuidedDecodingParams
(
regex
=
sample_regex
))
outputs
=
llm
.
generate
(
outputs
=
llm
.
generate
(
prompts
=
[
prompts
=
[
f
"Give an example IPv4 address with this regex:
{
sample_regex
}
"
f
"Give an example IPv4 address with this regex:
{
sample_regex
}
"
]
*
2
,
]
*
2
,
sampling_params
=
sampling_params
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
)
use_tqdm
=
True
,
guided_options_request
=
dict
(
guided_regex
=
sample_regex
))
assert
outputs
is
not
None
assert
outputs
is
not
None
for
output
in
outputs
:
for
output
in
outputs
:
...
@@ -59,15 +57,13 @@ def test_guided_json_completion(sample_json_schema, llm):
...
@@ -59,15 +57,13 @@ def test_guided_json_completion(sample_json_schema, llm):
sampling_params
=
SamplingParams
(
sampling_params
=
SamplingParams
(
temperature
=
1.0
,
temperature
=
1.0
,
max_tokens
=
1000
,
max_tokens
=
1000
,
)
guided_decoding
=
GuidedDecodingParams
(
json
=
sample_json_schema
))
outputs
=
llm
.
generate
(
outputs
=
llm
.
generate
(
prompts
=
[
prompts
=
[
f
"Give an example JSON for an employee profile "
f
"Give an example JSON for an employee profile "
f
"that fits this schema:
{
sample_json_schema
}
"
f
"that fits this schema:
{
sample_json_schema
}
"
]
*
2
,
]
*
2
,
sampling_params
=
sampling_params
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
)
use_tqdm
=
True
,
guided_options_request
=
dict
(
guided_json
=
sample_json_schema
))
assert
outputs
is
not
None
assert
outputs
is
not
None
...
@@ -83,17 +79,72 @@ def test_guided_json_completion(sample_json_schema, llm):
...
@@ -83,17 +79,72 @@ def test_guided_json_completion(sample_json_schema, llm):
jsonschema
.
validate
(
instance
=
output_json
,
schema
=
sample_json_schema
)
jsonschema
.
validate
(
instance
=
output_json
,
schema
=
sample_json_schema
)
@pytest.mark.skip_global_cleanup
def test_guided_complex_json_completion(sample_complex_json_schema, llm):
    """Generate JSON guided by the complex schema and validate the result.

    The schema is passed through ``GuidedDecodingParams(json=...)`` on the
    sampling params; each generated text is parsed with ``json.loads`` and
    checked against the same schema with ``jsonschema.validate``.
    """
    guided = GuidedDecodingParams(json=sample_complex_json_schema)
    params = SamplingParams(temperature=1.0,
                            max_tokens=1000,
                            guided_decoding=guided)

    request = (f"Give an example JSON for an assignment grade "
               f"that fits this schema: {sample_complex_json_schema}")
    results = llm.generate(prompts=[request] * 2,
                           sampling_params=params,
                           use_tqdm=True)

    assert results is not None
    for result in results:
        assert result is not None
        assert isinstance(result, RequestOutput)

        prompt = result.prompt
        generated_text = result.outputs[0].text
        assert generated_text is not None
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

        # Parsing fails the test if the text is not valid JSON.
        parsed = json.loads(generated_text)
        jsonschema.validate(instance=parsed,
                            schema=sample_complex_json_schema)
@pytest.mark.skip_global_cleanup
def test_guided_definition_json_completion(sample_definition_json_schema,
                                           llm):
    """Generate JSON guided by the definition schema and validate it.

    The schema is passed through ``GuidedDecodingParams(json=...)`` on the
    sampling params; each generated text is parsed with ``json.loads`` and
    checked against the same schema with ``jsonschema.validate``.
    """
    guided = GuidedDecodingParams(json=sample_definition_json_schema)
    params = SamplingParams(temperature=1.0,
                            max_tokens=1000,
                            guided_decoding=guided)

    request = (f"Give an example JSON for solving 8x + 7 = -23 "
               f"that fits this schema: {sample_definition_json_schema}")
    results = llm.generate(prompts=[request] * 2,
                           sampling_params=params,
                           use_tqdm=True)

    assert results is not None
    for result in results:
        assert result is not None
        assert isinstance(result, RequestOutput)

        prompt = result.prompt
        generated_text = result.outputs[0].text
        assert generated_text is not None
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

        # Parsing fails the test if the text is not valid JSON.
        parsed = json.loads(generated_text)
        jsonschema.validate(instance=parsed,
                            schema=sample_definition_json_schema)
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
skip_global_cleanup
def
test_guided_choice_completion
(
sample_guided_choice
,
llm
):
def
test_guided_choice_completion
(
sample_guided_choice
,
llm
):
sampling_params
=
SamplingParams
(
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
temperature
=
0.8
,
top_p
=
0.95
,
top_p
=
0.95
,
)
guided_decoding
=
GuidedDecodingParams
(
choice
=
sample_guided_choice
)
)
outputs
=
llm
.
generate
(
outputs
=
llm
.
generate
(
prompts
=
"The best language for type-safe systems programming is "
,
prompts
=
"The best language for type-safe systems programming is "
,
sampling_params
=
sampling_params
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
,
use_tqdm
=
True
)
guided_options_request
=
dict
(
guided_choice
=
sample_guided_choice
))
assert
outputs
is
not
None
assert
outputs
is
not
None
for
output
in
outputs
:
for
output
in
outputs
:
...
@@ -114,13 +165,13 @@ def test_guided_grammar(sample_sql_statements, llm):
...
@@ -114,13 +165,13 @@ def test_guided_grammar(sample_sql_statements, llm):
temperature
=
0.8
,
temperature
=
0.8
,
top_p
=
0.95
,
top_p
=
0.95
,
max_tokens
=
1000
,
max_tokens
=
1000
,
)
guided_decoding
=
GuidedDecodingParams
(
grammar
=
sample_sql_statements
)
)
outputs
=
llm
.
generate
(
outputs
=
llm
.
generate
(
prompts
=
(
"Generate a sql state that select col_1 from "
prompts
=
(
"Generate a sql state that select col_1 from "
"table_1 where it is equals to 1"
),
"table_1 where it is equals to 1"
),
sampling_params
=
sampling_params
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
,
use_tqdm
=
True
,
guided_options_request
=
dict
(
guided_grammar
=
sample_sql_statements
)
)
)
assert
outputs
is
not
None
assert
outputs
is
not
None
for
output
in
outputs
:
for
output
in
outputs
:
...
@@ -142,3 +193,55 @@ def test_guided_grammar(sample_sql_statements, llm):
...
@@ -142,3 +193,55 @@ def test_guided_grammar(sample_sql_statements, llm):
assert
generated_text
.
strip
()
==
ground_truth
assert
generated_text
.
strip
()
==
ground_truth
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
@pytest.mark.skip_global_cleanup
def test_guided_options_request_deprecation_warning(sample_regex, llm):
    """Expect a ``DeprecationWarning`` when ``guided_options_request`` is used.

    The warning message must mention ``guided_options_request``.
    """
    params = SamplingParams(temperature=0.8, top_p=0.95)
    legacy_options = dict(guided_regex=sample_regex)

    with pytest.warns(DeprecationWarning, match="guided_options_request"):
        llm.generate(prompts="This should fail",
                     sampling_params=params,
                     use_tqdm=True,
                     guided_options_request=legacy_options)
@pytest.mark.skip_global_cleanup
def test_validation_against_both_guided_decoding_options(sample_regex, llm):
    """Expect a ``ValueError`` when guided decoding is configured twice.

    The regex is supplied both through ``SamplingParams.guided_decoding``
    and through the ``guided_options_request`` argument; ``generate`` is
    expected to reject that with a "Cannot set both" error.
    """
    guided = GuidedDecodingParams(regex=sample_regex)
    params = SamplingParams(temperature=0.8,
                            top_p=0.95,
                            guided_decoding=guided)
    legacy_options = dict(guided_regex=sample_regex)

    with pytest.raises(ValueError, match="Cannot set both"):
        llm.generate(prompts="This should fail",
                     sampling_params=params,
                     use_tqdm=True,
                     guided_options_request=legacy_options)
@pytest.mark.skip_global_cleanup
def test_guided_json_object(llm):
    """Request free-form JSON-object output and check that it parses to a dict.

    No schema is given: ``GuidedDecodingParams(json_object=True)`` is the
    only constraint passed to the sampler.
    """
    guided = GuidedDecodingParams(json_object=True)
    params = SamplingParams(temperature=1.0,
                            max_tokens=100,
                            guided_decoding=guided)

    request = ("Generate a JSON object describing a person with name "
               "and age for John Smith who is 31 years old.")
    results = llm.generate(prompts=request,
                           sampling_params=params,
                           use_tqdm=True)

    assert results is not None
    for result in results:
        assert result is not None
        assert isinstance(result, RequestOutput)

        generated_text = result.outputs[0].text
        print(generated_text)
        assert generated_text is not None

        # Parse to verify it is valid JSON
        decoded = json.loads(generated_text)
        assert isinstance(decoded, dict)
tests/entrypoints/llm/test_init.py
0 → 100644
View file @
4d3a2c28
import
os
import
pytest
from
vllm
import
LLM
from
...utils
import
error_on_warning
,
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
def test_pos_args_deprecated():
    """Exercise the positional-argument deprecation paths of ``LLM(...)``.

    The first two constructions run under ``error_on_warning`` (presumably
    turning a ``DeprecationWarning`` into a failure; confirm against the
    helper in ``tests/utils``). The last two must emit a
    ``DeprecationWarning`` naming the parameters that were passed
    positionally.
    """
    # All-keyword construction.
    with error_on_warning(DeprecationWarning):
        LLM(model=MODEL_NAME, tokenizer=MODEL_NAME)

    # Model passed positionally, tokenizer by keyword.
    with error_on_warning(DeprecationWarning):
        LLM(MODEL_NAME, tokenizer=MODEL_NAME)

    # Tokenizer passed positionally.
    tokenizer_only = "'tokenizer'"
    with pytest.warns(DeprecationWarning, match=tokenizer_only):
        LLM(MODEL_NAME, MODEL_NAME)

    # Tokenizer and tokenizer mode both passed positionally.
    tokenizer_and_mode = "'tokenizer', 'tokenizer_mode'"
    with pytest.warns(DeprecationWarning, match=tokenizer_and_mode):
        LLM(MODEL_NAME, MODEL_NAME, "auto")
tests/entrypoints/llm/test_lazy_outlines.py
View file @
4d3a2c28
import
sys
import
sys
import
os
import
os
from
contextlib
import
nullcontext
from
vllm_test_utils
import
BlameResult
,
blame
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
...utils
import
models_path_prefix
from
...utils
import
models_path_prefix
def
test_lazy_outlines
(
sample_regex
):
def
run_normal
():
"""If users don't use guided decoding, outlines should not be imported.
"""
prompts
=
[
prompts
=
[
"Hello, my name is"
,
"Hello, my name is"
,
"The president of the United States is"
,
"The president of the United States is"
,
...
@@ -16,6 +18,7 @@ def test_lazy_outlines(sample_regex):
...
@@ -16,6 +18,7 @@ def test_lazy_outlines(sample_regex):
]
]
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
)
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
)
# Create an LLM without guided decoding as a baseline.
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
),
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
),
enforce_eager
=
True
,
enforce_eager
=
True
,
gpu_memory_utilization
=
0.3
)
gpu_memory_utilization
=
0.3
)
...
@@ -25,9 +28,13 @@ def test_lazy_outlines(sample_regex):
...
@@ -25,9 +28,13 @@ def test_lazy_outlines(sample_regex):
generated_text
=
output
.
outputs
[
0
].
text
generated_text
=
output
.
outputs
[
0
].
text
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
# make sure outlines is not imported
# Destroy the LLM object and free up the GPU memory.
assert
'outlines'
not
in
sys
.
modules
del
llm
cleanup_dist_env_and_memory
()
def
run_lmfe
(
sample_regex
):
# Create an LLM with guided decoding enabled.
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
),
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
),
enforce_eager
=
True
,
enforce_eager
=
True
,
guided_decoding_backend
=
"lm-format-enforcer"
,
guided_decoding_backend
=
"lm-format-enforcer"
,
...
@@ -46,5 +53,26 @@ def test_lazy_outlines(sample_regex):
...
@@ -46,5 +53,26 @@ def test_lazy_outlines(sample_regex):
generated_text
=
output
.
outputs
[
0
].
text
generated_text
=
output
.
outputs
[
0
].
text
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
def
test_lazy_outlines
(
sample_regex
):
"""If users don't use guided decoding, outlines should not be imported.
"""
# make sure outlines is not imported
# make sure outlines is not imported
assert
'outlines'
not
in
sys
.
modules
module_name
=
"outlines"
# In CI, we only check finally if the module is imported.
# If it is indeed imported, we can rerun the test with `use_blame=True`,
# which will trace every function call to find the first import location,
# and help find the root cause.
# We don't run it in CI by default because it is slow.
use_blame
=
False
context
=
blame
(
lambda
:
module_name
in
sys
.
modules
)
if
use_blame
else
nullcontext
()
with
context
as
result
:
run_normal
()
run_lmfe
(
sample_regex
)
if
use_blame
:
assert
isinstance
(
result
,
BlameResult
)
print
(
f
"the first import location is:
\n
{
result
.
trace_stack
}
"
)
assert
module_name
not
in
sys
.
modules
,
(
f
"Module
{
module_name
}
is imported. To see the first"
f
" import location, run the test with `use_blame=True`."
)
tests/entrypoints/llm/test_prompt_validation.py
View file @
4d3a2c28
...
@@ -5,7 +5,22 @@ from vllm import LLM
...
@@ -5,7 +5,22 @@ from vllm import LLM
from
...utils
import
models_path_prefix
from
...utils
import
models_path_prefix
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    """Autouse wrapper that pulls in the ``run_with_both_engines`` fixture.

    Simple autouse wrapper to run both engines for each test. This can be
    promoted up to conftest.py to run for every test in a package.
    """
def
test_empty_prompt
():
def
test_empty_prompt
():
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"gpt2"
))
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"gpt2"
)
,
enforce_eager
=
True
)
with
pytest
.
raises
(
ValueError
,
match
=
'Prompt cannot be empty'
):
with
pytest
.
raises
(
ValueError
,
match
=
'Prompt cannot be empty'
):
llm
.
generate
([
""
])
llm
.
generate
([
""
])
@pytest.mark.skip_v1
def test_out_of_vocab_token():
    """Expect a ``ValueError`` for a prompt token id outside the vocabulary.

    Token id 999999 is sent to a gpt2 engine as a raw ``prompt_token_ids``
    prompt; the error message must mention "out of vocabulary".
    """
    model_path = os.path.join(models_path_prefix, "gpt2")
    engine = LLM(model=model_path, enforce_eager=True)

    bad_prompt = {"prompt_token_ids": [999999]}
    with pytest.raises(ValueError, match='out of vocabulary'):
        engine.generate(bad_prompt)
tests/entrypoints/offline_mode/test_offline_mode.py
View file @
4d3a2c28
"""Tests for HF_HUB_OFFLINE mode"""
"""Tests for HF_HUB_OFFLINE mode"""
import
importlib
import
importlib
import
sys
import
sys
import
weakref
import
os
import
os
import
pytest
import
pytest
from
vllm
import
LLM
from
vllm
import
LLM
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
...conftest
import
cleanup
from
...utils
import
models_path_prefix
from
...utils
import
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
@
pytest
.
fixture
(
scope
=
"module"
)
MODEL_CONFIGS
=
[
def
llm
():
{
# pytest caches the fixture so we use weakref.proxy to
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
),
# enable garbage collection
"enforce_eager"
:
True
,
llm
=
LLM
(
model
=
MODEL_NAME
,
"gpu_memory_utilization"
:
0.20
,
max_num_batched_tokens
=
4096
,
"max_model_len"
:
64
,
tensor_parallel_size
=
1
,
"max_num_batched_tokens"
:
64
,
gpu_memory_utilization
=
0.10
,
"max_num_seqs"
:
64
,
enforce_eager
=
True
)
"tensor_parallel_size"
:
1
,
},
{
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"mistralai/Mistral-7B-Instruct-v0.1"
),
"enforce_eager"
:
True
,
"gpu_memory_utilization"
:
0.95
,
"max_model_len"
:
64
,
"max_num_batched_tokens"
:
64
,
"max_num_seqs"
:
64
,
"tensor_parallel_size"
:
1
,
"tokenizer_mode"
:
"mistral"
,
},
]
with
llm
.
deprecate_legacy_api
():
yield
weakref
.
proxy
(
llm
)
del
llm
@
pytest
.
fixture
(
scope
=
"module"
)
def
cache_models
():
# Cache model files first
for
model_config
in
MODEL_CONFIGS
:
LLM
(
**
model_config
)
cleanup_dist_env_and_memory
()
cleanup
()
yield
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
skip_global_cleanup
def
test_offline_mode
(
llm
:
LLM
,
monkeypatch
):
@
pytest
.
mark
.
usefixtures
(
"cache_models"
)
# we use the llm fixture to ensure the model files are in-cache
def
test_offline_mode
(
monkeypatch
):
del
llm
# Set HF to offline mode and ensure we can still construct an LLM
# Set HF to offline mode and ensure we can still construct an LLM
try
:
try
:
monkeypatch
.
setenv
(
"HF_HUB_OFFLINE"
,
"1"
)
monkeypatch
.
setenv
(
"HF_HUB_OFFLINE"
,
"1"
)
# Need to re-import huggingface_hub and friends to setup offline mode
# Need to re-import huggingface_hub and friends to setup offline mode
_re_import_modules
()
_re_import_modules
()
# Cached model files should be used in offline mode
# Cached model files should be used in offline mode
LLM
(
model
=
MODEL_NAME
,
for
model_config
in
MODEL_CONFIGS
:
max_num_batched_tokens
=
4096
,
LLM
(
**
model_config
)
tensor_parallel_size
=
1
,
gpu_memory_utilization
=
0.10
,
enforce_eager
=
True
)
finally
:
finally
:
# Reset the environment after the test
# Reset the environment after the test
# NB: Assuming tests are run in online mode
# NB: Assuming tests are run in online mode
...
...
tests/entrypoints/openai/test_accuracy.py
View file @
4d3a2c28
...
@@ -11,6 +11,8 @@ import lm_eval
...
@@ -11,6 +11,8 @@ import lm_eval
import
pytest
import
pytest
import
os
import
os
from
vllm.platforms
import
current_platform
from
...utils
import
RemoteOpenAIServer
,
models_path_prefix
from
...utils
import
RemoteOpenAIServer
,
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-1.5B-Instruct"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-1.5B-Instruct"
)
...
@@ -19,22 +21,33 @@ TASK = "gsm8k"
...
@@ -19,22 +21,33 @@ TASK = "gsm8k"
FILTER
=
"exact_match,strict-match"
FILTER
=
"exact_match,strict-match"
RTOL
=
0.03
RTOL
=
0.03
EXPECTED_VALUE
=
0.58
EXPECTED_VALUE
=
0.58
DEFAULT_ARGS
=
[
"--max-model-len"
,
"
4096
"
,
"--disable-log-requests"
]
DEFAULT_ARGS
=
[
"--max-model-len"
,
"
2048
"
,
"--disable-log-requests"
]
MORE_ARGS_LIST
=
[
MORE_ARGS_LIST
=
[
[],
# Default
[
"--enable-chunked-prefill"
],
# Chunked
[
"--enable-chunked-prefill"
],
# Chunked
[
"--num-scheduler-steps"
,
"8"
],
# MS
[
"--num-scheduler-steps"
,
"8"
],
# MS
[
"--num-scheduler-steps"
,
"8"
,
"--multi-step-stream-outputs"
]
# MS+Stream
[
"--num-scheduler-steps"
,
"8"
,
"--multi-step-stream-outputs"
]
# MS+Stream
]
]
MAX_WAIT_SECONDS
=
None
if
current_platform
.
is_tpu
():
MORE_ARGS_LIST
=
[
[],
# Default
# ["--num-scheduler-steps", "8"], # Multi-step << currently fails
]
MAX_WAIT_SECONDS
=
600
def
run_test
(
more_args
):
"""Run the end to end accuracy test."""
@
pytest
.
mark
.
parametrize
(
"more_args"
,
MORE_ARGS_LIST
)
def
test_lm_eval_accuracy
(
more_args
):
args
=
list
(
DEFAULT_ARGS
)
args
=
list
(
DEFAULT_ARGS
)
args
.
extend
(
more_args
)
args
.
extend
(
more_args
)
print
(
f
"Running with:
{
args
}
"
)
print
(
f
"Running with:
{
args
}
"
)
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
,
max_wait_seconds
=
MAX_WAIT_SECONDS
)
as
remote_server
:
url
=
f
"
{
remote_server
.
url_for
(
'v1'
)
}
/completions"
url
=
f
"
{
remote_server
.
url_for
(
'v1'
)
}
/completions"
model_args
=
(
model_args
=
(
...
@@ -52,3 +65,22 @@ def test_lm_eval_accuracy(more_args):
...
@@ -52,3 +65,22 @@ def test_lm_eval_accuracy(more_args):
assert
(
measured_value
-
RTOL
<
EXPECTED_VALUE
assert
(
measured_value
-
RTOL
<
EXPECTED_VALUE
and
measured_value
+
RTOL
>
EXPECTED_VALUE
and
measured_value
+
RTOL
>
EXPECTED_VALUE
),
f
"Expected:
{
EXPECTED_VALUE
}
| Measured:
{
measured_value
}
"
),
f
"Expected:
{
EXPECTED_VALUE
}
| Measured:
{
measured_value
}
"
@pytest.mark.skipif(not current_platform.is_cuda(),
                    reason="V1 currently only supported on CUDA")
def test_lm_eval_accuracy_v1_engine(monkeypatch):
    """Run the accuracy check with ``VLLM_USE_V1`` set to ``"1"``.

    The environment override is applied inside ``monkeypatch.context()``
    and ``run_test`` is invoked with an empty list of extra arguments.
    """
    with monkeypatch.context() as scoped_env:
        scoped_env.setenv("VLLM_USE_V1", "1")
        run_test([])
@pytest.mark.parametrize("more_args", MORE_ARGS_LIST)
def test_lm_eval_accuracy_v0_engine(monkeypatch, more_args):
    """Run the accuracy check with ``VLLM_USE_V1`` set to ``"0"``.

    Parametrized over every entry of ``MORE_ARGS_LIST``; each entry is
    forwarded unchanged to ``run_test`` while the environment override
    from ``monkeypatch.context()`` is active.
    """
    with monkeypatch.context() as scoped_env:
        scoped_env.setenv("VLLM_USE_V1", "0")
        run_test(more_args)
tests/entrypoints/openai/test_async_tokenization.py
0 → 100644
View file @
4d3a2c28
import
asyncio
import
contextlib
import
random
import
time
from
typing
import
Callable
import
os
import
openai
import
pytest
import
pytest_asyncio
import
requests
from
tests.utils
import
RemoteOpenAIServer
from
...utils
import
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-1.5B-Instruct"
)
@pytest.fixture(scope="module")
def server():  # noqa: F811
    """Start one ``RemoteOpenAIServer`` for ``MODEL_NAME`` per module.

    The server object is yielded from inside the ``with`` block, so the
    context manager exits once the module's tests are done.
    """
    # Flags and their values are kept on the same line for readability.
    launch_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype", "bfloat16",
        "--max-model-len", "8192",
        "--enforce-eager",
        "--max-num-seqs", "128",
        "--load-format", "dummy",
    ]

    with RemoteOpenAIServer(MODEL_NAME, launch_args) as running_server:
        yield running_server
@pytest_asyncio.fixture
async def client(server):
    """Yield the async client obtained from ``server.get_async_client()``.

    The client is used as an async context manager, so it is exited when
    the requesting test finishes.
    """
    async with server.get_async_client() as openai_client:
        yield openai_client
@pytest.mark.asyncio
@pytest.mark.parametrize(
    ids=["completion", "chat"],
    argnames=["create_func_gen", "content_body"],
    argvalues=[
        (
            lambda api: api.completions.create,
            {"prompt": " ".join(['A'] * 10_000)},
        ),
        (
            lambda api: api.chat.completions.create,
            {
                "messages": [{
                    "role": "user",
                    "content": " ".join(['A'] * 10_000),
                }]
            },
        ),
    ],
)
async def test_with_and_without_truncate(
    server: RemoteOpenAIServer,
    client: openai.AsyncOpenAI,
    create_func_gen: Callable,
    content_body: dict,
):
    """Send concurrent requests, half with ``truncate_prompt_tokens=1000``.

    Ten requests are issued at once through ``asyncio.gather``: five carry
    ``truncate_prompt_tokens=1000`` and five carry ``None``, in shuffled
    order.  Each request is reduced to an HTTP status code (200 on
    success, otherwise the ``APIStatusError`` status); the test only
    requires that no request produced a 500.
    """
    send = create_func_gen(client)
    base_request = {"model": MODEL_NAME, **content_body, "max_tokens": 10}

    num_requests = 10
    truncated_count = num_requests // 2
    truncation_limits = [1000] * truncated_count
    truncation_limits += [None] * (num_requests - truncated_count)
    random.shuffle(truncation_limits)

    async def status_of(request_kwargs):
        # Map the outcome of one request onto its HTTP status code.
        try:
            await send(**request_kwargs)
        except openai.APIStatusError as err:
            return err.status_code
        return 200

    in_flight = []
    for limit in truncation_limits:
        request_kwargs = dict(base_request)
        request_kwargs["extra_body"] = {'truncate_prompt_tokens': limit}
        in_flight.append(status_of(request_kwargs))

    status_codes = await asyncio.gather(*in_flight)
    assert 500 not in status_codes
@pytest.mark.asyncio
@pytest.mark.parametrize(
    ids=["single completion", "multiple completions", "chat"],
    argnames=["create_func_gen", "content_body"],
    argvalues=[
        (lambda x: x.completions.create, {
            "prompt": " ".join(['A'] * 300_000)
        }),
        (lambda x: x.completions.create, {
            "prompt": [" ".join(['A'] * 300_000)] * 2
        }),
        (lambda x: x.chat.completions.create, {
            "messages": [{
                "role": "user",
                "content": " ".join(['A'] * 300_000)
            }]
        }),
    ],
)
async def test_healthcheck_response_time(
    server: RemoteOpenAIServer,
    client: openai.AsyncOpenAI,
    create_func_gen: Callable,
    content_body: dict,
):
    """Check that ``/health`` stays responsive while requests are in flight.

    The health endpoint is timed once with no load, then again one second
    after 50 requests with very large prompts have been scheduled.  The
    loaded response time must be below both 100x the unloaded time and
    0.1 seconds.

    ``openai.APIStatusError`` raised by the load requests is suppressed;
    any other exception from the first failing request still propagates.
    """
    num_requests = 50

    create_func = create_func_gen(client)
    body = {"model": MODEL_NAME, **content_body, "max_tokens": 10}

    def get_response_time(url):
        start_time = time.monotonic()
        # A timeout keeps a wedged server from hanging the test forever;
        # it is far above the 0.1 s bound asserted below.
        res = requests.get(url, timeout=30)
        end_time = time.monotonic()
        assert res.status_code == 200
        return end_time - start_time

    no_load_response_time = get_response_time(server.url_for("health"))
    tasks = [
        asyncio.create_task(create_func(**body)) for _ in range(num_requests)
    ]
    try:
        await asyncio.sleep(1)  # give the tasks a chance to start running
        load_response_time = get_response_time(server.url_for("health"))

        with contextlib.suppress(openai.APIStatusError):
            await asyncio.gather(*tasks)
    finally:
        # gather() propagates only the first exception and leaves the other
        # tasks running, so wait for every task to settle before the test
        # (and the function-scoped client) goes away.
        await asyncio.gather(*tasks, return_exceptions=True)

    assert load_response_time < 100 * no_load_response_time
    assert load_response_time < 0.1
tests/entrypoints/openai/test_audio.py
View file @
4d3a2c28
...
@@ -24,8 +24,11 @@ def server():
...
@@ -24,8 +24,11 @@ def server():
"--dtype"
,
"--dtype"
,
"bfloat16"
,
"bfloat16"
,
"--max-model-len"
,
"--max-model-len"
,
"4096"
,
"2048"
,
"--max-num-seqs"
,
"5"
,
"--enforce-eager"
,
"--enforce-eager"
,
"--trust-remote-code"
,
]
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
...
@@ -69,11 +72,12 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
...
@@ -69,11 +72,12 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
}]
}]
# test single completion
# test single completion
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
chat_completion
=
await
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model_name
,
max_tokens
=
10
,
messages
=
messages
,
logprobs
=
True
,
max_completion_tokens
=
10
,
top_logprobs
=
5
)
logprobs
=
True
,
top_logprobs
=
5
)
assert
len
(
chat_completion
.
choices
)
==
1
assert
len
(
chat_completion
.
choices
)
==
1
choice
=
chat_completion
.
choices
[
0
]
choice
=
chat_completion
.
choices
[
0
]
...
@@ -92,7 +96,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
...
@@ -92,7 +96,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
chat_completion
=
await
client
.
chat
.
completions
.
create
(
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
model
=
model_name
,
messages
=
messages
,
messages
=
messages
,
max_tokens
=
10
,
max_
completion_
tokens
=
10
,
)
)
message
=
chat_completion
.
choices
[
0
].
message
message
=
chat_completion
.
choices
[
0
].
message
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
0
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
0
...
@@ -124,11 +128,12 @@ async def test_single_chat_session_audio_base64encoded(
...
@@ -124,11 +128,12 @@ async def test_single_chat_session_audio_base64encoded(
}]
}]
# test single completion
# test single completion
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
chat_completion
=
await
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model_name
,
max_tokens
=
10
,
messages
=
messages
,
logprobs
=
True
,
max_completion_tokens
=
10
,
top_logprobs
=
5
)
logprobs
=
True
,
top_logprobs
=
5
)
assert
len
(
chat_completion
.
choices
)
==
1
assert
len
(
chat_completion
.
choices
)
==
1
choice
=
chat_completion
.
choices
[
0
]
choice
=
chat_completion
.
choices
[
0
]
...
@@ -147,7 +152,62 @@ async def test_single_chat_session_audio_base64encoded(
...
@@ -147,7 +152,62 @@ async def test_single_chat_session_audio_base64encoded(
chat_completion
=
await
client
.
chat
.
completions
.
create
(
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
model
=
model_name
,
messages
=
messages
,
messages
=
messages
,
max_tokens
=
10
,
max_completion_tokens
=
10
,
)
message
=
chat_completion
.
choices
[
0
].
message
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
0
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"audio_url"
,
TEST_AUDIO_URLS
)
async
def
test_single_chat_session_input_audio
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
audio_url
:
str
,
base64_encoded_audio
:
Dict
[
str
,
str
]):
messages
=
[{
"role"
:
"user"
,
"content"
:
[
{
"type"
:
"input_audio"
,
"input_audio"
:
{
"data"
:
base64_encoded_audio
[
audio_url
],
"format"
:
"wav"
}
},
{
"type"
:
"text"
,
"text"
:
"What's happening in this audio?"
},
],
}]
# test single completion
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_completion_tokens
=
10
,
logprobs
=
True
,
top_logprobs
=
5
)
assert
len
(
chat_completion
.
choices
)
==
1
choice
=
chat_completion
.
choices
[
0
]
assert
choice
.
finish_reason
==
"length"
assert
chat_completion
.
usage
==
openai
.
types
.
CompletionUsage
(
completion_tokens
=
10
,
prompt_tokens
=
202
,
total_tokens
=
212
)
message
=
choice
.
message
message
=
chat_completion
.
choices
[
0
].
message
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
10
assert
message
.
role
==
"assistant"
messages
.
append
({
"role"
:
"assistant"
,
"content"
:
message
.
content
})
# test multi-turn dialogue
messages
.
append
({
"role"
:
"user"
,
"content"
:
"express your result in json"
})
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_completion_tokens
=
10
,
)
)
message
=
chat_completion
.
choices
[
0
].
message
message
=
chat_completion
.
choices
[
0
].
message
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
0
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
0
...
@@ -179,7 +239,7 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
...
@@ -179,7 +239,7 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
chat_completion
=
await
client
.
chat
.
completions
.
create
(
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
model
=
model_name
,
messages
=
messages
,
messages
=
messages
,
max_tokens
=
10
,
max_
completion_
tokens
=
10
,
temperature
=
0.0
,
temperature
=
0.0
,
)
)
output
=
chat_completion
.
choices
[
0
].
message
.
content
output
=
chat_completion
.
choices
[
0
].
message
.
content
...
@@ -189,7 +249,67 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
...
@@ -189,7 +249,67 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
stream
=
await
client
.
chat
.
completions
.
create
(
stream
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
model
=
model_name
,
messages
=
messages
,
messages
=
messages
,
max_tokens
=
10
,
max_completion_tokens
=
10
,
temperature
=
0.0
,
stream
=
True
,
)
chunks
:
List
[
str
]
=
[]
finish_reason_count
=
0
async
for
chunk
in
stream
:
delta
=
chunk
.
choices
[
0
].
delta
if
delta
.
role
:
assert
delta
.
role
==
"assistant"
if
delta
.
content
:
chunks
.
append
(
delta
.
content
)
if
chunk
.
choices
[
0
].
finish_reason
is
not
None
:
finish_reason_count
+=
1
# finish reason should only return in last block
assert
finish_reason_count
==
1
assert
chunk
.
choices
[
0
].
finish_reason
==
stop_reason
assert
delta
.
content
assert
""
.
join
(
chunks
)
==
output
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"audio_url"
,
TEST_AUDIO_URLS
)
async
def
test_chat_streaming_input_audio
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
audio_url
:
str
,
base64_encoded_audio
:
Dict
[
str
,
str
]):
messages
=
[{
"role"
:
"user"
,
"content"
:
[
{
"type"
:
"input_audio"
,
"input_audio"
:
{
"data"
:
base64_encoded_audio
[
audio_url
],
"format"
:
"wav"
}
},
{
"type"
:
"text"
,
"text"
:
"What's happening in this audio?"
},
],
}]
# test single completion
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_completion_tokens
=
10
,
temperature
=
0.0
,
)
output
=
chat_completion
.
choices
[
0
].
message
.
content
stop_reason
=
chat_completion
.
choices
[
0
].
finish_reason
# test streaming
stream
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
max_completion_tokens
=
10
,
temperature
=
0.0
,
temperature
=
0.0
,
stream
=
True
,
stream
=
True
,
)
)
...
@@ -214,7 +334,8 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
...
@@ -214,7 +334,8 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"audio_url"
,
TEST_AUDIO_URLS
)
@
pytest
.
mark
.
parametrize
(
"audio_url"
,
TEST_AUDIO_URLS
)
async
def
test_multi_audio_input
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
async
def
test_multi_audio_input
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
audio_url
:
str
):
audio_url
:
str
,
base64_encoded_audio
:
Dict
[
str
,
str
]):
messages
=
[{
messages
=
[{
"role"
:
"role"
:
...
@@ -227,9 +348,10 @@ async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
...
@@ -227,9 +348,10 @@ async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
}
}
},
},
{
{
"type"
:
"audio_url"
,
"type"
:
"input_audio"
,
"audio_url"
:
{
"input_audio"
:
{
"url"
:
audio_url
"data"
:
base64_encoded_audio
[
audio_url
],
"format"
:
"wav"
}
}
},
},
{
{
...
@@ -243,7 +365,7 @@ async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
...
@@ -243,7 +365,7 @@ async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
await
client
.
chat
.
completions
.
create
(
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
model
=
model_name
,
messages
=
messages
,
messages
=
messages
,
max_tokens
=
10
,
max_
completion_
tokens
=
10
,
temperature
=
0.0
,
temperature
=
0.0
,
)
)
...
...
tests/entrypoints/openai/test_basic.py
View file @
4d3a2c28
import
asyncio
from
http
import
HTTPStatus
from
http
import
HTTPStatus
from
typing
import
List
import
openai
import
openai
import
pytest
import
pytest
...
@@ -13,8 +15,44 @@ from ...utils import RemoteOpenAIServer, models_path_prefix
...
@@ -13,8 +15,44 @@ from ...utils import RemoteOpenAIServer, models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"HuggingFaceH4/zephyr-7b-beta"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"HuggingFaceH4/zephyr-7b-beta"
)
@pytest.fixture(scope='module')
def server_args(request: pytest.FixtureRequest) -> List[str]:
    """Supply extra server arguments through indirect parametrization.

    Returns an empty list when the fixture is not parametrized, a
    one-element list when the parameter is a single string, and the
    parameter itself otherwise.

    Usage:

    >>> @pytest.mark.parametrize(
    >>>     "server_args",
    >>>     [
    >>>         ["--disable-frontend-multiprocessing"],
    >>>         [
    >>>             "--model=NousResearch/Hermes-3-Llama-3.1-70B",
    >>>             "--enable-auto-tool-choice",
    >>>         ],
    >>>     ],
    >>>     indirect=True,
    >>> )
    >>> def test_foo(server, client):
    >>>     ...

    This runs `test_foo` twice, once per server configuration:
    - `--disable-frontend-multiprocessing`
    - `--model=NousResearch/Hermes-3-Llama-3.1-70B --enable-auto-tool-choice`.
    """
    if hasattr(request, "param"):
        extra = request.param
        # A bare string is treated as a single argument.
        return [extra] if isinstance(extra, str) else extra
    return []
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
def
server
(
server_args
):
args
=
[
args
=
[
# use half precision for speed and memory savings in CI environment
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"--dtype"
,
...
@@ -24,6 +62,7 @@ def server():
...
@@ -24,6 +62,7 @@ def server():
"--enforce-eager"
,
"--enforce-eager"
,
"--max-num-seqs"
,
"--max-num-seqs"
,
"128"
,
"128"
,
*
server_args
,
]
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
...
@@ -36,20 +75,83 @@ async def client(server):
...
@@ -36,20 +75,83 @@ async def client(server):
yield
async_client
yield
async_client
@
pytest
.
mark
.
parametrize
(
"server_args"
,
[
pytest
.
param
([],
id
=
"default-frontend-multiprocessing"
),
pytest
.
param
([
"--disable-frontend-multiprocessing"
],
id
=
"disable-frontend-multiprocessing"
)
],
indirect
=
True
,
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_show_version
(
client
:
openai
.
AsyncOpenAI
):
async
def
test_show_version
(
server
:
RemoteOpenAIServer
):
base_url
=
str
(
client
.
base_url
)[:
-
3
].
strip
(
"/"
)
response
=
requests
.
get
(
server
.
url_for
(
"version"
))
response
=
requests
.
get
(
base_url
+
"/version"
)
response
.
raise_for_status
()
response
.
raise_for_status
()
assert
response
.
json
()
==
{
"version"
:
VLLM_VERSION
}
assert
response
.
json
()
==
{
"version"
:
VLLM_VERSION
}
@
pytest
.
mark
.
parametrize
(
"server_args"
,
[
pytest
.
param
([],
id
=
"default-frontend-multiprocessing"
),
pytest
.
param
([
"--disable-frontend-multiprocessing"
],
id
=
"disable-frontend-multiprocessing"
)
],
indirect
=
True
,
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_check_health
(
client
:
openai
.
AsyncOpenAI
):
async
def
test_check_health
(
server
:
RemoteOpenAIServer
):
base_url
=
str
(
client
.
base_url
)[:
-
3
].
strip
(
"/"
)
response
=
requests
.
get
(
server
.
url_for
(
"health"
))
response
=
requests
.
get
(
base_url
+
"/health"
)
assert
response
.
status_code
==
HTTPStatus
.
OK
assert
response
.
status_code
==
HTTPStatus
.
OK
@pytest.mark.parametrize(
    "server_args",
    [
        pytest.param(["--max-model-len", "10100"],
                     id="default-frontend-multiprocessing"),
        pytest.param(
            ["--disable-frontend-multiprocessing", "--max-model-len", "10100"],
            id="disable-frontend-multiprocessing")
    ],
    indirect=True,
)
@pytest.mark.asyncio
async def test_request_cancellation(server: RemoteOpenAIServer):
    """Check that timed-out requests do not keep the server busy.

    200 chat requests of 10000 tokens each are sent through a client with
    a 0.5 s timeout, and every one of them must fail with
    ``openai.APITimeoutError``.  A follow-up short request through a
    client with a 5 s timeout must then succeed with exactly one choice.
    """
    # clunky test: send an ungodly amount of load in with short timeouts
    # then ensure that it still responds quickly afterwards

    chat_input = [{"role": "user", "content": "Write a long story"}]

    # Both clients are used as async context managers so that they are
    # closed even when an assertion below fails.
    async with server.get_async_client(timeout=0.5) as client:
        # Request about 2 million tokens
        tasks = [
            asyncio.create_task(
                client.chat.completions.create(
                    messages=chat_input,
                    model=MODEL_NAME,
                    max_tokens=10000,
                    extra_body={"min_tokens": 10000}))
            for _ in range(200)
        ]

        done, pending = await asyncio.wait(
            tasks, return_when=asyncio.ALL_COMPLETED)

        # Make sure all requests were sent to the server and timed out
        # (We don't want to hide other errors like 400s that would
        # invalidate this test)
        assert len(pending) == 0
        for d in done:
            with pytest.raises(openai.APITimeoutError):
                d.result()

    # If the server had not cancelled all the other requests, then it would
    # not be able to respond to this one within the timeout
    async with server.get_async_client(timeout=5) as client:
        response = await client.chat.completions.create(messages=chat_input,
                                                        model=MODEL_NAME,
                                                        max_tokens=10)

    assert len(response.choices) == 1
tests/entrypoints/openai/test_chat.py
View file @
4d3a2c28
...
@@ -17,9 +17,6 @@ from .test_completion import zephyr_lora_files # noqa: F401
...
@@ -17,9 +17,6 @@ from .test_completion import zephyr_lora_files # noqa: F401
# any model with a chat template should work here
# any model with a chat template should work here
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"HuggingFaceH4/zephyr-7b-beta"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"HuggingFaceH4/zephyr-7b-beta"
)
# technically this needs Mistral-7B-v0.1 as base, but we're not testing
# generation quality here
LORA_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"typeof/zephyr-7b-beta-lora"
)
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
...
@@ -69,11 +66,12 @@ async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
...
@@ -69,11 +66,12 @@ async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
"content"
:
"what is 1+1?"
"content"
:
"what is 1+1?"
}]
}]
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
chat_completion
=
await
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model_name
,
max_tokens
=
5
,
messages
=
messages
,
temperature
=
0.0
,
max_completion_tokens
=
5
,
logprobs
=
False
)
temperature
=
0.0
,
logprobs
=
False
)
choice
=
chat_completion
.
choices
[
0
]
choice
=
chat_completion
.
choices
[
0
]
assert
choice
.
logprobs
is
None
assert
choice
.
logprobs
is
None
...
@@ -94,12 +92,13 @@ async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
...
@@ -94,12 +92,13 @@ async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
"content"
:
"what is 1+1?"
"content"
:
"what is 1+1?"
}]
}]
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
chat_completion
=
await
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model_name
,
max_tokens
=
5
,
messages
=
messages
,
temperature
=
0.0
,
max_completion_tokens
=
5
,
logprobs
=
True
,
temperature
=
0.0
,
top_logprobs
=
0
)
logprobs
=
True
,
top_logprobs
=
0
)
choice
=
chat_completion
.
choices
[
0
]
choice
=
chat_completion
.
choices
[
0
]
assert
choice
.
logprobs
is
not
None
assert
choice
.
logprobs
is
not
None
...
@@ -121,12 +120,13 @@ async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
...
@@ -121,12 +120,13 @@ async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
"content"
:
"what is 1+1?"
"content"
:
"what is 1+1?"
}]
}]
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
chat_completion
=
await
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model_name
,
max_tokens
=
5
,
messages
=
messages
,
temperature
=
0.0
,
max_completion_tokens
=
5
,
logprobs
=
True
,
temperature
=
0.0
,
top_logprobs
=
5
)
logprobs
=
True
,
top_logprobs
=
5
)
choice
=
chat_completion
.
choices
[
0
]
choice
=
chat_completion
.
choices
[
0
]
assert
choice
.
logprobs
is
not
None
assert
choice
.
logprobs
is
not
None
...
@@ -153,7 +153,7 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI,
...
@@ -153,7 +153,7 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI,
with
pytest
.
raises
((
openai
.
BadRequestError
,
openai
.
APIError
)):
with
pytest
.
raises
((
openai
.
BadRequestError
,
openai
.
APIError
)):
stream
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
stream
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
messages
=
messages
,
max_tokens
=
10
,
max_
completion_
tokens
=
10
,
logprobs
=
True
,
logprobs
=
True
,
top_logprobs
=
21
,
top_logprobs
=
21
,
stream
=
True
)
stream
=
True
)
...
@@ -163,16 +163,17 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI,
...
@@ -163,16 +163,17 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI,
with
pytest
.
raises
(
openai
.
BadRequestError
):
with
pytest
.
raises
(
openai
.
BadRequestError
):
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
messages
=
messages
,
max_tokens
=
10
,
max_
completion_
tokens
=
10
,
logprobs
=
True
,
logprobs
=
True
,
top_logprobs
=
30
,
top_logprobs
=
30
,
stream
=
False
)
stream
=
False
)
# the server should still work afterwards
# the server should still work afterwards
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
chat_completion
=
await
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model_name
,
max_tokens
=
10
,
messages
=
messages
,
stream
=
False
)
max_completion_tokens
=
10
,
stream
=
False
)
message
=
chat_completion
.
choices
[
0
].
message
message
=
chat_completion
.
choices
[
0
].
message
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
0
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
0
...
@@ -275,11 +276,12 @@ async def test_single_chat_session(client: openai.AsyncOpenAI,
...
@@ -275,11 +276,12 @@ async def test_single_chat_session(client: openai.AsyncOpenAI,
}]
}]
# test single completion
# test single completion
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
chat_completion
=
await
client
.
chat
.
completions
.
create
(
messages
=
messages
,
model
=
model_name
,
max_tokens
=
10
,
messages
=
messages
,
logprobs
=
True
,
max_completion_tokens
=
10
,
top_logprobs
=
5
)
logprobs
=
True
,
top_logprobs
=
5
)
assert
chat_completion
.
id
is
not
None
assert
chat_completion
.
id
is
not
None
assert
len
(
chat_completion
.
choices
)
==
1
assert
len
(
chat_completion
.
choices
)
==
1
...
@@ -298,7 +300,7 @@ async def test_single_chat_session(client: openai.AsyncOpenAI,
...
@@ -298,7 +300,7 @@ async def test_single_chat_session(client: openai.AsyncOpenAI,
chat_completion
=
await
client
.
chat
.
completions
.
create
(
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
model
=
model_name
,
messages
=
messages
,
messages
=
messages
,
max_tokens
=
10
,
max_
completion_
tokens
=
10
,
)
)
message
=
chat_completion
.
choices
[
0
].
message
message
=
chat_completion
.
choices
[
0
].
message
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
0
assert
message
.
content
is
not
None
and
len
(
message
.
content
)
>=
0
...
@@ -323,7 +325,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
...
@@ -323,7 +325,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
chat_completion
=
await
client
.
chat
.
completions
.
create
(
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
model
=
model_name
,
messages
=
messages
,
messages
=
messages
,
max_tokens
=
10
,
max_
completion_
tokens
=
10
,
temperature
=
0.0
,
temperature
=
0.0
,
)
)
output
=
chat_completion
.
choices
[
0
].
message
.
content
output
=
chat_completion
.
choices
[
0
].
message
.
content
...
@@ -333,7 +335,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
...
@@ -333,7 +335,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
stream
=
await
client
.
chat
.
completions
.
create
(
stream
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
model
=
model_name
,
messages
=
messages
,
messages
=
messages
,
max_tokens
=
10
,
max_
completion_
tokens
=
10
,
temperature
=
0.0
,
temperature
=
0.0
,
stream
=
True
,
stream
=
True
,
)
)
...
@@ -377,7 +379,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
...
@@ -377,7 +379,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
stream
=
await
client
.
chat
.
completions
.
create
(
stream
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
model
=
model_name
,
messages
=
messages
,
messages
=
messages
,
max_tokens
=
10
,
max_
completion_
tokens
=
10
,
temperature
=
0.0
,
temperature
=
0.0
,
stream
=
True
,
stream
=
True
,
stream_options
=
{
"include_usage"
:
False
})
stream_options
=
{
"include_usage"
:
False
})
...
@@ -388,7 +390,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
...
@@ -388,7 +390,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
# "continuous_usage_stats": False}}
# "continuous_usage_stats": False}}
stream
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
stream
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
messages
=
messages
,
messages
=
messages
,
max_tokens
=
10
,
max_
completion_
tokens
=
10
,
temperature
=
0.0
,
temperature
=
0.0
,
stream
=
True
,
stream
=
True
,
stream_options
=
{
stream_options
=
{
...
@@ -417,7 +419,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
...
@@ -417,7 +419,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
await
client
.
chat
.
completions
.
create
(
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
model
=
model_name
,
messages
=
messages
,
messages
=
messages
,
max_tokens
=
10
,
max_
completion_
tokens
=
10
,
temperature
=
0.0
,
temperature
=
0.0
,
stream
=
False
,
stream
=
False
,
stream_options
=
{
"include_usage"
:
None
})
stream_options
=
{
"include_usage"
:
None
})
...
@@ -427,7 +429,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
...
@@ -427,7 +429,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
await
client
.
chat
.
completions
.
create
(
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
model
=
model_name
,
messages
=
messages
,
messages
=
messages
,
max_tokens
=
10
,
max_
completion_
tokens
=
10
,
temperature
=
0.0
,
temperature
=
0.0
,
stream
=
False
,
stream
=
False
,
stream_options
=
{
"include_usage"
:
True
})
stream_options
=
{
"include_usage"
:
True
})
...
@@ -437,19 +439,29 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
...
@@ -437,19 +439,29 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
stream
=
await
client
.
chat
.
completions
.
create
(
stream
=
await
client
.
chat
.
completions
.
create
(
model
=
model_name
,
model
=
model_name
,
messages
=
messages
,
messages
=
messages
,
max_tokens
=
10
,
max_completion_tokens
=
10
,
extra_body
=
dict
(
min_tokens
=
10
),
temperature
=
0.0
,
temperature
=
0.0
,
stream
=
True
,
stream
=
True
,
stream_options
=
{
stream_options
=
{
"include_usage"
:
True
,
"include_usage"
:
True
,
"continuous_usage_stats"
:
True
"continuous_usage_stats"
:
True
,
},
},
)
)
last_completion_tokens
=
0
async
for
chunk
in
stream
:
async
for
chunk
in
stream
:
assert
chunk
.
usage
.
prompt_tokens
>=
0
assert
chunk
.
usage
.
prompt_tokens
>=
0
assert
chunk
.
usage
.
completion_tokens
>=
0
assert
last_completion_tokens
==
0
or
\
chunk
.
usage
.
completion_tokens
>
last_completion_tokens
or
\
(
not
chunk
.
choices
and
chunk
.
usage
.
completion_tokens
==
last_completion_tokens
)
assert
chunk
.
usage
.
total_tokens
==
(
chunk
.
usage
.
prompt_tokens
+
assert
chunk
.
usage
.
total_tokens
==
(
chunk
.
usage
.
prompt_tokens
+
chunk
.
usage
.
completion_tokens
)
chunk
.
usage
.
completion_tokens
)
last_completion_tokens
=
chunk
.
usage
.
completion_tokens
assert
last_completion_tokens
==
10
# NOTE: Not sure why, but when I place this after `test_guided_regex_chat`
# NOTE: Not sure why, but when I place this after `test_guided_regex_chat`
...
@@ -474,7 +486,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
...
@@ -474,7 +486,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
chat_completion
=
await
client
.
chat
.
completions
.
create
(
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
MODEL_NAME
,
messages
=
messages
,
messages
=
messages
,
max_tokens
=
10
,
max_
completion_
tokens
=
10
,
extra_body
=
dict
(
guided_choice
=
sample_guided_choice
,
extra_body
=
dict
(
guided_choice
=
sample_guided_choice
,
guided_decoding_backend
=
guided_decoding_backend
))
guided_decoding_backend
=
guided_decoding_backend
))
choice1
=
chat_completion
.
choices
[
0
].
message
.
content
choice1
=
chat_completion
.
choices
[
0
].
message
.
content
...
@@ -488,7 +500,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
...
@@ -488,7 +500,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
chat_completion
=
await
client
.
chat
.
completions
.
create
(
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
MODEL_NAME
,
messages
=
messages
,
messages
=
messages
,
max_tokens
=
10
,
max_
completion_
tokens
=
10
,
extra_body
=
dict
(
guided_choice
=
sample_guided_choice
,
extra_body
=
dict
(
guided_choice
=
sample_guided_choice
,
guided_decoding_backend
=
guided_decoding_backend
))
guided_decoding_backend
=
guided_decoding_backend
))
choice2
=
chat_completion
.
choices
[
0
].
message
.
content
choice2
=
chat_completion
.
choices
[
0
].
message
.
content
...
@@ -515,7 +527,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI,
...
@@ -515,7 +527,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI,
chat_completion
=
await
client
.
chat
.
completions
.
create
(
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
MODEL_NAME
,
messages
=
messages
,
messages
=
messages
,
max_tokens
=
1000
,
max_
completion_
tokens
=
1000
,
extra_body
=
dict
(
guided_json
=
sample_json_schema
,
extra_body
=
dict
(
guided_json
=
sample_json_schema
,
guided_decoding_backend
=
guided_decoding_backend
))
guided_decoding_backend
=
guided_decoding_backend
))
message
=
chat_completion
.
choices
[
0
].
message
message
=
chat_completion
.
choices
[
0
].
message
...
@@ -533,7 +545,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI,
...
@@ -533,7 +545,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI,
chat_completion
=
await
client
.
chat
.
completions
.
create
(
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
MODEL_NAME
,
messages
=
messages
,
messages
=
messages
,
max_tokens
=
1000
,
max_
completion_
tokens
=
1000
,
extra_body
=
dict
(
guided_json
=
sample_json_schema
,
extra_body
=
dict
(
guided_json
=
sample_json_schema
,
guided_decoding_backend
=
guided_decoding_backend
))
guided_decoding_backend
=
guided_decoding_backend
))
message
=
chat_completion
.
choices
[
0
].
message
message
=
chat_completion
.
choices
[
0
].
message
...
@@ -561,7 +573,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
...
@@ -561,7 +573,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
chat_completion
=
await
client
.
chat
.
completions
.
create
(
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
MODEL_NAME
,
messages
=
messages
,
messages
=
messages
,
max_tokens
=
20
,
max_
completion_
tokens
=
20
,
extra_body
=
dict
(
guided_regex
=
sample_regex
,
extra_body
=
dict
(
guided_regex
=
sample_regex
,
guided_decoding_backend
=
guided_decoding_backend
))
guided_decoding_backend
=
guided_decoding_backend
))
ip1
=
chat_completion
.
choices
[
0
].
message
.
content
ip1
=
chat_completion
.
choices
[
0
].
message
.
content
...
@@ -573,7 +585,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
...
@@ -573,7 +585,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
chat_completion
=
await
client
.
chat
.
completions
.
create
(
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
MODEL_NAME
,
messages
=
messages
,
messages
=
messages
,
max_tokens
=
20
,
max_
completion_
tokens
=
20
,
extra_body
=
dict
(
guided_regex
=
sample_regex
,
extra_body
=
dict
(
guided_regex
=
sample_regex
,
guided_decoding_backend
=
guided_decoding_backend
))
guided_decoding_backend
=
guided_decoding_backend
))
ip2
=
chat_completion
.
choices
[
0
].
message
.
content
ip2
=
chat_completion
.
choices
[
0
].
message
.
content
...
@@ -621,7 +633,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
...
@@ -621,7 +633,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
chat_completion
=
await
client
.
chat
.
completions
.
create
(
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
MODEL_NAME
,
messages
=
messages
,
messages
=
messages
,
max_tokens
=
10
,
max_
completion_
tokens
=
10
,
logprobs
=
True
,
logprobs
=
True
,
top_logprobs
=
5
,
top_logprobs
=
5
,
extra_body
=
dict
(
guided_choice
=
sample_guided_choice
,
extra_body
=
dict
(
guided_choice
=
sample_guided_choice
,
...
@@ -658,7 +670,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
...
@@ -658,7 +670,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
chat_completion
=
await
client
.
chat
.
completions
.
create
(
chat_completion
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
MODEL_NAME
,
messages
=
messages
,
messages
=
messages
,
max_tokens
=
1000
,
max_
completion_
tokens
=
1000
,
tools
=
[{
tools
=
[{
"type"
:
"function"
,
"type"
:
"function"
,
"function"
:
{
"function"
:
{
...
@@ -692,7 +704,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
...
@@ -692,7 +704,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
stream
=
await
client
.
chat
.
completions
.
create
(
stream
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
MODEL_NAME
,
messages
=
messages
,
messages
=
messages
,
max_tokens
=
1000
,
max_
completion_
tokens
=
1000
,
tools
=
[{
tools
=
[{
"type"
:
"function"
,
"type"
:
"function"
,
"function"
:
{
"function"
:
{
...
@@ -748,7 +760,7 @@ async def test_required_tool_use_not_yet_supported(
...
@@ -748,7 +760,7 @@ async def test_required_tool_use_not_yet_supported(
await
client
.
chat
.
completions
.
create
(
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
MODEL_NAME
,
messages
=
messages
,
messages
=
messages
,
max_tokens
=
1000
,
max_
completion_
tokens
=
1000
,
tools
=
[{
tools
=
[{
"type"
:
"function"
,
"type"
:
"function"
,
"function"
:
{
"function"
:
{
...
@@ -763,7 +775,7 @@ async def test_required_tool_use_not_yet_supported(
...
@@ -763,7 +775,7 @@ async def test_required_tool_use_not_yet_supported(
await
client
.
chat
.
completions
.
create
(
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
MODEL_NAME
,
messages
=
messages
,
messages
=
messages
,
max_tokens
=
1000
,
max_
completion_
tokens
=
1000
,
tools
=
[{
tools
=
[{
"type"
:
"function"
,
"type"
:
"function"
,
"function"
:
{
"function"
:
{
...
@@ -794,7 +806,7 @@ async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
...
@@ -794,7 +806,7 @@ async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
with
pytest
.
raises
(
openai
.
BadRequestError
):
with
pytest
.
raises
(
openai
.
BadRequestError
):
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
messages
,
messages
=
messages
,
max_tokens
=
1000
,
max_
completion_
tokens
=
1000
,
tool_choice
=
{
tool_choice
=
{
"type"
:
"function"
,
"type"
:
"function"
,
"function"
:
{
"function"
:
{
...
@@ -807,7 +819,7 @@ async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
...
@@ -807,7 +819,7 @@ async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
await
client
.
chat
.
completions
.
create
(
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
MODEL_NAME
,
messages
=
messages
,
messages
=
messages
,
max_tokens
=
1000
,
max_
completion_
tokens
=
1000
,
tools
=
[{
tools
=
[{
"type"
:
"function"
,
"type"
:
"function"
,
"function"
:
{
"function"
:
{
...
@@ -822,6 +834,20 @@ async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
...
@@ -822,6 +834,20 @@ async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
"name"
:
"nondefined_function_name"
"name"
:
"nondefined_function_name"
}
}
})
})
with
pytest
.
raises
(
openai
.
BadRequestError
):
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
messages
,
max_completion_tokens
=
1000
,
tools
=
[{
"type"
:
"function"
,
"function"
:
{
"name"
:
"dummy_function_name"
,
"description"
:
"This is a dummy function"
,
"parameters"
:
sample_json_schema
}
}],
tool_choice
=
{})
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
@@ -846,14 +872,28 @@ async def test_response_format_json_object(client: openai.AsyncOpenAI):
...
@@ -846,14 +872,28 @@ async def test_response_format_json_object(client: openai.AsyncOpenAI):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_response_format_json_schema
(
client
:
openai
.
AsyncOpenAI
):
async
def
test_response_format_json_schema
(
client
:
openai
.
AsyncOpenAI
):
prompt
=
'what is 1+1? The format is "result": 2'
# Check that this prompt cannot lead to a valid JSON without json_schema
for
_
in
range
(
2
):
for
_
in
range
(
2
):
resp
=
await
client
.
chat
.
completions
.
create
(
resp
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
MODEL_NAME
,
messages
=
[{
messages
=
[{
"role"
:
"role"
:
"user"
,
"user"
,
"content"
:
prompt
"content"
:
(
'what is 1+1? please respond with a JSON object, '
}],
'the format is {"result": 2}'
)
)
content
=
resp
.
choices
[
0
].
message
.
content
assert
content
is
not
None
with
pytest
.
raises
((
json
.
JSONDecodeError
,
AssertionError
)):
loaded
=
json
.
loads
(
content
)
assert
loaded
==
{
"result"
:
2
},
loaded
for
_
in
range
(
2
):
resp
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
[{
"role"
:
"user"
,
"content"
:
prompt
}],
}],
response_format
=
{
response_format
=
{
"type"
:
"json_schema"
,
"type"
:
"json_schema"
,
...
@@ -878,19 +918,19 @@ async def test_response_format_json_schema(client: openai.AsyncOpenAI):
...
@@ -878,19 +918,19 @@ async def test_response_format_json_schema(client: openai.AsyncOpenAI):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_extra_fields
(
client
:
openai
.
AsyncOpenAI
):
async
def
test_extra_fields
_allowed
(
client
:
openai
.
AsyncOpenAI
):
with
pytest
.
raises
(
BadRequestError
)
as
exc_info
:
resp
=
await
client
.
chat
.
completions
.
create
(
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
MODEL_NAME
,
messages
=
[{
messages
=
[{
"role"
:
"user"
,
"role"
:
"system
"
,
"content"
:
"what is 1+1?
"
,
"content"
:
"You are a helpful assistant.
"
,
"extra_field"
:
"0
"
,
"extra_field"
:
"0"
,
}],
# type: ignore
}],
# type: ignore
temperature
=
0
,
temperature
=
0
,
seed
=
0
)
seed
=
0
)
content
=
resp
.
choices
[
0
].
message
.
content
assert
"extra_forbidd
en
"
i
n
exc_info
.
value
.
messag
e
assert
cont
en
t
i
s
not
Non
e
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
...
tests/entrypoints/openai/test_chat_echo.py
0 → 100644
View file @
4d3a2c28
from
typing
import
NamedTuple
import
os
import
openai
# use the official client for correctness check
import
pytest
import
pytest_asyncio
from
...utils
import
RemoteOpenAIServer
,
models_path_prefix
# Any model with a chat template should work here.
MODEL_NAME = os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct")

# Minimal Jinja chat template passed to the server via --chat-template:
# for every message it emits "<role>: <content>" followed by the Jinja string
# literal '\n' (written as "\\n" here so Python keeps the backslash).
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}"""  # noqa: E501
@pytest.fixture(scope="module")
def server():
    """Yield a ``RemoteOpenAIServer`` for ``MODEL_NAME``, shared module-wide.

    The server is entered as a context manager, so it is torn down when the
    last test of the module has finished with the fixture.
    """
    # use half precision for speed and memory savings in CI environment
    cli_flags = ["--dtype", "float16"]
    cli_flags += ["--enforce-eager"]
    cli_flags += ["--max-model-len", "4080"]
    # render prompts with the simple "<role>: <content>" template
    cli_flags += ["--chat-template", DUMMY_CHAT_TEMPLATE]

    with RemoteOpenAIServer(MODEL_NAME, cli_flags) as running_server:
        yield running_server
@pytest_asyncio.fixture
async def client(server):
    """Yield an async client obtained from the ``server`` fixture.

    The client comes from ``server.get_async_client()`` and is released when
    the ``async with`` block exits after the requesting test completes.
    """
    async with server.get_async_client() as openai_client:
        yield openai_client
# One parametrized scenario for the echo test:
#   model_name -- model identifier sent with the request
#   echo       -- value of the ``echo`` flag forwarded through ``extra_body``
TestCase = NamedTuple("TestCase", [("model_name", str), ("echo", bool)])
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "test_case",
    [TestCase(model_name=MODEL_NAME, echo=flag) for flag in (True, False)],
)
async def test_chat_session_with_echo_and_continue_final_message(
        client: openai.AsyncOpenAI, test_case: TestCase):
    """Check ``echo`` together with ``continue_final_message``.

    The conversation ends with a partial assistant message.  The request asks
    the server to continue that message (no new generation prompt) and the
    reply must contain the partial text exactly when ``echo`` was requested.
    """
    saying: str = "Here is a common saying about apple. An apple a day, keeps"

    conversation = [
        {
            "role": "user",
            "content": "tell me a common saying"
        },
        {
            "role": "assistant",
            "content": saying
        },
    ]
    request_extras = {
        "echo": test_case.echo,
        "continue_final_message": True,
        "add_generation_prompt": False,
    }

    # test echo with continue_final_message parameter
    chat_completion = await client.chat.completions.create(
        model=test_case.model_name,
        messages=conversation,
        extra_body=request_extras)

    assert chat_completion.id is not None
    assert len(chat_completion.choices) == 1

    only_choice = chat_completion.choices[0]
    assert only_choice.finish_reason == "stop"

    reply = only_choice.message
    assert reply.content is not None
    # The partial saying is present iff echo was requested.
    assert (saying in reply.content) == test_case.echo

    assert reply.role == "assistant"
tests/entrypoints/openai/test_chat_template.py
View file @
4d3a2c28
...
@@ -13,7 +13,7 @@ assert chatml_jinja_path.exists()
...
@@ -13,7 +13,7 @@ assert chatml_jinja_path.exists()
# Define models, templates, and their corresponding expected outputs
# Define models, templates, and their corresponding expected outputs
MODEL_TEMPLATE_GENERATON_OUTPUT
=
[
MODEL_TEMPLATE_GENERATON_OUTPUT
=
[
(
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
),
chatml_jinja_path
,
True
,
"""<|im_start|>user
(
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
),
chatml_jinja_path
,
True
,
False
,
"""<|im_start|>user
Hello<|im_end|>
Hello<|im_end|>
<|im_start|>assistant
<|im_start|>assistant
Hi there!<|im_end|>
Hi there!<|im_end|>
...
@@ -21,12 +21,20 @@ Hi there!<|im_end|>
...
@@ -21,12 +21,20 @@ Hi there!<|im_end|>
What is the capital of<|im_end|>
What is the capital of<|im_end|>
<|im_start|>assistant
<|im_start|>assistant
"""
),
"""
),
(
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
),
chatml_jinja_path
,
False
,
"""<|im_start|>user
(
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
),
chatml_jinja_path
,
False
,
False
,
"""<|im_start|>user
Hello<|im_end|>
Hello<|im_end|>
<|im_start|>assistant
<|im_start|>assistant
Hi there!<|im_end|>
Hi there!<|im_end|>
<|im_start|>user
<|im_start|>user
What is the capital of"""
)
What is the capital of"""
),
(
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
),
chatml_jinja_path
,
False
,
True
,
"""<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of<|im_end|>
<|im_start|>assistant
The capital of"""
),
]
]
TEST_MESSAGES
=
[
TEST_MESSAGES
=
[
...
@@ -43,6 +51,10 @@ TEST_MESSAGES = [
...
@@ -43,6 +51,10 @@ TEST_MESSAGES = [
'content'
:
'What is the capital of'
'content'
:
'What is the capital of'
},
},
]
]
ASSISTANT_MESSAGE_TO_CONTINUE
=
{
'role'
:
'assistant'
,
'content'
:
'The capital of'
}
def
test_load_chat_template
():
def
test_load_chat_template
():
...
@@ -74,10 +86,10 @@ def test_no_load_chat_template_literallike():
...
@@ -74,10 +86,10 @@ def test_no_load_chat_template_literallike():
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"model,template,add_generation_prompt,expected_output"
,
"model,template,add_generation_prompt,
continue_final_message,
expected_output"
,
MODEL_TEMPLATE_GENERATON_OUTPUT
)
MODEL_TEMPLATE_GENERATON_OUTPUT
)
def
test_get_gen_prompt
(
model
,
template
,
add_generation_prompt
,
def
test_get_gen_prompt
(
model
,
template
,
add_generation_prompt
,
expected_output
):
continue_final_message
,
expected_output
):
# Initialize the tokenizer
# Initialize the tokenizer
tokenizer
=
get_tokenizer
(
tokenizer_name
=
model
)
tokenizer
=
get_tokenizer
(
tokenizer_name
=
model
)
template_content
=
load_chat_template
(
chat_template
=
template
)
template_content
=
load_chat_template
(
chat_template
=
template
)
...
@@ -85,8 +97,11 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
...
@@ -85,8 +97,11 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
# Create a mock request object using keyword arguments
# Create a mock request object using keyword arguments
mock_request
=
ChatCompletionRequest
(
mock_request
=
ChatCompletionRequest
(
model
=
model
,
model
=
model
,
messages
=
TEST_MESSAGES
,
messages
=
TEST_MESSAGES
+
[
ASSISTANT_MESSAGE_TO_CONTINUE
]
add_generation_prompt
=
add_generation_prompt
)
if
continue_final_message
else
TEST_MESSAGES
,
add_generation_prompt
=
add_generation_prompt
,
continue_final_message
=
continue_final_message
,
)
# Call the function and get the result
# Call the function and get the result
result
=
apply_hf_chat_template
(
result
=
apply_hf_chat_template
(
...
@@ -94,6 +109,7 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
...
@@ -94,6 +109,7 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
conversation
=
mock_request
.
messages
,
conversation
=
mock_request
.
messages
,
chat_template
=
mock_request
.
chat_template
or
template_content
,
chat_template
=
mock_request
.
chat_template
or
template_content
,
add_generation_prompt
=
mock_request
.
add_generation_prompt
,
add_generation_prompt
=
mock_request
.
add_generation_prompt
,
continue_final_message
=
mock_request
.
continue_final_message
,
)
)
# Test assertion
# Test assertion
...
...
tests/entrypoints/openai/test_chunked_prompt.py
0 → 100644
View file @
4d3a2c28
import
os
import
openai
# use the official client for correctness check
import
pytest
import
pytest_asyncio
from
...utils
import
RemoteOpenAIServer
,
models_path_prefix
# Any model with a chat template should work here; the path is resolved
# relative to the shared models_path_prefix imported from the test utilities.
MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")
@pytest.fixture(scope="module")
def server():
    """Yield a ``RemoteOpenAIServer`` for ``MODEL_NAME`` with chunked prefill.

    The fixture is module-scoped and wraps the server in a context manager,
    so the same instance is reused by every test in this module.
    """
    # use half precision for speed and memory savings in CI environment
    precision_flags = ["--dtype", "bfloat16"]
    capacity_flags = ["--max-model-len", "8192", "--enforce-eager"]
    # scheduler limits: chunked prefill with a 1000-token batch budget
    scheduling_flags = [
        "--max-num-seqs",
        "128",
        "--enable-chunked-prefill",
        "--max-num-batched-tokens",
        "1000",
    ]
    # large prompts create a lot of output
    logging_flags = ["--disable-log-requests"]

    cli_flags = (precision_flags + capacity_flags + scheduling_flags +
                 logging_flags)
    with RemoteOpenAIServer(MODEL_NAME, cli_flags) as running_server:
        yield running_server
@pytest_asyncio.fixture
async def client(server):
    """Provide each test with an async client created by ``server``.

    ``server.get_async_client()`` is used as an async context manager; the
    client is handed to the test and released once the test returns.
    """
    async with server.get_async_client() as openai_client:
        yield openai_client
@pytest.mark.asyncio
async def test_completion_stream_options_and_logprobs_with_long_prompts(
        client: openai.AsyncOpenAI):
    """Stream a completion for a very long prompt and validate usage stats.

    Every chunk must carry consistent usage numbers.  Until a finish reason
    is seen each chunk must contain text, and from that point on the reported
    completion token count must equal the number of text chunks received.
    """
    # Test stream with long prompt
    prompt = "What is the capital of France?" * 400
    usage_options = {
        "include_usage": True,
        "continuous_usage_stats": True,
    }

    stream = await client.completions.create(
        model=MODEL_NAME,
        prompt=prompt,
        max_tokens=5,
        temperature=0.0,
        stream=True,
        stream_options=usage_options,
        logprobs=5,
    )

    text_chunks_seen = 0
    saw_finish = False
    async for chunk in stream:
        usage = chunk.usage
        assert usage.prompt_tokens >= 0
        assert usage.completion_tokens >= 0
        assert usage.total_tokens == (usage.prompt_tokens +
                                      usage.completion_tokens)

        if not saw_finish:
            text_chunks_seen += 1
            first_choice = chunk.choices[0]
            assert first_choice.text
            saw_finish = first_choice.finish_reason is not None

        if saw_finish:
            # One text chunk per generated token is expected.
            assert usage.completion_tokens == text_chunks_seen
@pytest.mark.asyncio
async def test_chat_completion_stream_options_and_logprobs_with_long_prompts(
        client: openai.AsyncOpenAI):
    """Stream a chat completion for a very long prompt and validate usage.

    Chunks with empty delta content must report zero completion tokens and
    no logprobs, and at most one such chunk may arrive.  Once a finish reason
    is seen, the reported completion token count must equal the number of
    non-empty chunks received.
    """
    # Test stream with long prompt
    long_question = "What is the capital of France?" * 400
    messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant."
        },
        {
            "role": "user",
            "content": long_question
        },
    ]
    usage_options = {
        "include_usage": True,
        "continuous_usage_stats": True,
    }

    stream = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        max_tokens=5,
        temperature=0.0,
        stream=True,
        stream_options=usage_options,
        logprobs=True,
        top_logprobs=5,
    )

    content_chunks_seen = 0
    empty_chunks_seen = 0
    saw_finish = False
    async for chunk in stream:
        usage = chunk.usage
        assert usage.prompt_tokens >= 0
        assert usage.completion_tokens >= 0
        assert usage.total_tokens == (usage.prompt_tokens +
                                      usage.completion_tokens)

        if not saw_finish:
            first_choice = chunk.choices[0]
            if first_choice.delta.content == "":
                # when there is no tokens generated
                assert usage.completion_tokens == 0
                assert first_choice.logprobs is None
                empty_chunks_seen += 1
            else:
                content_chunks_seen += 1
            saw_finish = first_choice.finish_reason is not None

        if saw_finish:
            assert usage.completion_tokens == content_chunks_seen

    assert empty_chunks_seen <= 1
tests/entrypoints/openai/test_cli_args.py
View file @
4d3a2c28
import
json
import
json
import
unittest
from
vllm.entrypoints.openai.cli_args
import
make_arg_parser
import
pytest
from
vllm.entrypoints.openai.cli_args
import
(
make_arg_parser
,
validate_parsed_serve_args
)
from
vllm.entrypoints.openai.serving_engine
import
LoRAModulePath
from
vllm.entrypoints.openai.serving_engine
import
LoRAModulePath
from
vllm.utils
import
FlexibleArgumentParser
from
vllm.utils
import
FlexibleArgumentParser
from
...utils
import
VLLM_PATH
LORA_MODULE
=
{
LORA_MODULE
=
{
"name"
:
"module2"
,
"name"
:
"module2"
,
"path"
:
"/path/to/module2"
,
"path"
:
"/path/to/module2"
,
"base_model_name"
:
"llama"
"base_model_name"
:
"llama"
}
}
CHATML_JINJA_PATH
=
VLLM_PATH
/
"examples/template_chatml.jinja"
assert
CHATML_JINJA_PATH
.
exists
()
class
TestLoraParserAction
(
unittest
.
TestCase
):
@
pytest
.
fixture
def
serve_parser
():
parser
=
FlexibleArgumentParser
(
description
=
"vLLM's remote OpenAI server."
)
return
make_arg_parser
(
parser
)
def
setUp
(
self
):
# Setting up argparse parser for tests
parser
=
FlexibleArgumentParser
(
description
=
"vLLM's remote OpenAI server."
)
self
.
parser
=
make_arg_parser
(
parser
)
def
test_valid_key_value_format
(
self
):
### Tests for Lora module parsing
# Test old format: name=path
def
test_valid_key_value_format
(
serve_parser
):
args
=
self
.
parser
.
parse_args
([
# Test old format: name=path
'--lora-modules'
,
args
=
serve_parser
.
parse_args
([
'module1=/path/to/module1'
,
'--lora-modules'
,
'module1=/path/to/module1'
,
])
expected
=
[
LoRAModulePath
(
name
=
'module1'
,
path
=
'/path/to/module1'
)]
assert
args
.
lora_modules
==
expected
def
test_valid_json_format
(
serve_parser
):
# Test valid JSON format input
args
=
serve_parser
.
parse_args
([
'--lora-modules'
,
json
.
dumps
(
LORA_MODULE
),
])
expected
=
[
LoRAModulePath
(
name
=
'module2'
,
path
=
'/path/to/module2'
,
base_model_name
=
'llama'
)
]
assert
args
.
lora_modules
==
expected
def
test_invalid_json_format
(
serve_parser
):
# Test invalid JSON format input, missing closing brace
with
pytest
.
raises
(
SystemExit
):
serve_parser
.
parse_args
([
'--lora-modules'
,
'{"name": "module3", "path": "/path/to/module3"'
])
])
expected
=
[
LoRAModulePath
(
name
=
'module1'
,
path
=
'/path/to/module1'
)]
self
.
assertEqual
(
args
.
lora_modules
,
expected
)
def
test_valid_json_format
(
self
):
# Test valid JSON format input
def
test_invalid_type_error
(
serve_parser
):
args
=
self
.
parser
.
parse_args
([
# Test type error when values are not JSON or key=value
with
pytest
.
raises
(
SystemExit
):
serve_parser
.
parse_args
([
'--lora-modules'
,
'--lora-modules'
,
json
.
dumps
(
LORA_MODULE
),
'invalid_format'
# This is not JSON or key=value format
])
])
expected
=
[
LoRAModulePath
(
name
=
'module2'
,
path
=
'/path/to/module2'
,
def
test_invalid_json_field
(
serve_parser
):
base_model_name
=
'llama'
)
# Test valid JSON format but missing required fields
]
with
pytest
.
raises
(
SystemExit
):
self
.
assertEqual
(
args
.
lora_modules
,
expected
)
serve_parser
.
parse_args
([
def
test_invalid_json_format
(
self
):
# Test invalid JSON format input, missing closing brace
with
self
.
assertRaises
(
SystemExit
):
self
.
parser
.
parse_args
([
'--lora-modules'
,
'{"name": "module3", "path": "/path/to/module3"'
])
def
test_invalid_type_error
(
self
):
# Test type error when values are not JSON or key=value
with
self
.
assertRaises
(
SystemExit
):
self
.
parser
.
parse_args
([
'--lora-modules'
,
'invalid_format'
# This is not JSON or key=value format
])
def
test_invalid_json_field
(
self
):
# Test valid JSON format but missing required fields
with
self
.
assertRaises
(
SystemExit
):
self
.
parser
.
parse_args
([
'--lora-modules'
,
'{"name": "module4"}'
# Missing required 'path' field
])
def
test_empty_values
(
self
):
# Test when no LoRA modules are provided
args
=
self
.
parser
.
parse_args
([
'--lora-modules'
,
''
])
self
.
assertEqual
(
args
.
lora_modules
,
[])
def
test_multiple_valid_inputs
(
self
):
# Test multiple valid inputs (both old and JSON format)
args
=
self
.
parser
.
parse_args
([
'--lora-modules'
,
'--lora-modules'
,
'module1=/path/to/module1'
,
'{"name": "module4"}'
# Missing required 'path' field
json
.
dumps
(
LORA_MODULE
),
])
])
expected
=
[
LoRAModulePath
(
name
=
'module1'
,
path
=
'/path/to/module1'
),
LoRAModulePath
(
name
=
'module2'
,
path
=
'/path/to/module2'
,
base_model_name
=
'llama'
)
]
self
.
assertEqual
(
args
.
lora_modules
,
expected
)
if
__name__
==
'__main__'
:
def
test_empty_values
(
serve_parser
):
unittest
.
main
()
# Test when no LoRA modules are provided
args
=
serve_parser
.
parse_args
([
'--lora-modules'
,
''
])
assert
args
.
lora_modules
==
[]
def test_multiple_valid_inputs(serve_parser):
    """Parse one ``name=path`` module and one JSON module in a single flag."""
    # Test multiple valid inputs (both old and JSON format)
    cli_tokens = [
        '--lora-modules',
        'module1=/path/to/module1',
        json.dumps(LORA_MODULE),
    ]
    parsed = serve_parser.parse_args(cli_tokens)

    legacy_module = LoRAModulePath(name='module1', path='/path/to/module1')
    json_module = LoRAModulePath(name='module2',
                                 path='/path/to/module2',
                                 base_model_name='llama')
    assert parsed.lora_modules == [legacy_module, json_module]
### Tests for serve argument validation that run prior to loading
def test_enable_auto_choice_passes_without_tool_call_parser(serve_parser):
    """Ensure validation fails if tool choice is enabled with no call parser.

    Despite the "passes" in the test name, the expected outcome is a
    ``TypeError`` from ``validate_parsed_serve_args``.
    """
    # If we enable-auto-tool-choice, explode with no tool-call-parser
    parsed = serve_parser.parse_args(args=["--enable-auto-tool-choice"])
    with pytest.raises(TypeError):
        validate_parsed_serve_args(parsed)
def test_enable_auto_choice_passes_with_tool_call_parser(serve_parser):
    """Ensure validation passes with tool choice enabled with a call parser."""
    cli_tokens = [
        "--enable-auto-tool-choice",
        "--tool-call-parser",
        "mistral",
    ]
    parsed = serve_parser.parse_args(args=cli_tokens)
    # Must not raise.
    validate_parsed_serve_args(parsed)
def test_chat_template_validation_for_happy_paths(serve_parser):
    """Ensure validation passes if the chat template exists."""
    template_path = CHATML_JINJA_PATH.absolute().as_posix()
    parsed = serve_parser.parse_args(args=["--chat-template", template_path])
    # Must not raise.
    validate_parsed_serve_args(parsed)
def test_chat_template_validation_for_sad_paths(serve_parser):
    """Ensure validation fails if the chat template doesn't exist."""
    missing_template = "does/not/exist"
    parsed = serve_parser.parse_args(
        args=["--chat-template", missing_template])
    with pytest.raises(ValueError):
        validate_parsed_serve_args(parsed)
tests/entrypoints/openai/test_completion.py
View file @
4d3a2c28
...
@@ -160,15 +160,15 @@ async def test_added_lora_tokens(client: openai.AsyncOpenAI):
...
@@ -160,15 +160,15 @@ async def test_added_lora_tokens(client: openai.AsyncOpenAI):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_added_lora_tokens_base_model
(
client
:
openai
.
AsyncOpenAI
):
async
def
test_added_lora_tokens_base_model
(
client
:
openai
.
AsyncOpenAI
):
# test using token IDs
# test using token IDs
completion
=
await
client
.
completions
.
create
(
with
pytest
.
raises
(
openai
.
BadRequestError
,
match
=
"out of vocabulary"
):
model
=
MODEL_NAME
,
# Added tokens should be rejected by the base model
prompt
=
[
0
,
0
,
32000
,
32001
,
32002
],
await
client
.
completions
.
create
(
echo
=
True
,
model
=
MODEL_NAME
,
max_tokens
=
5
,
prompt
=
[
0
,
0
,
32000
,
32001
,
32002
]
,
temperature
=
0.0
,
echo
=
True
,
)
max_tokens
=
5
,
# Added tokens should not appear in tokenized prompt
temperature
=
0.0
,
assert
"vllm"
not
in
completion
.
choices
[
0
].
text
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
@@ -343,6 +343,40 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
...
@@ -343,6 +343,40 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
assert
""
.
join
(
chunks
)
==
single_output
assert
""
.
join
(
chunks
)
==
single_output
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
)
async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str):
    """Check streaming when several samples are requested at once (n > 1).

    All samples arrive interleaved on one stream; each chunk's choice index
    tells which sample it belongs to.  Every sample must finish exactly once
    and deliver ``max_tokens`` chunks.
    """
    prompt = "What is an LLM?"
    n = 3
    max_tokens = 5

    stream = await client.completions.create(model=model_name,
                                             prompt=prompt,
                                             max_tokens=max_tokens,
                                             n=n,
                                             stream=True)

    # One bucket of text pieces per requested sample.
    texts_by_sample: List[List[str]] = [[] for _ in range(n)]
    finished_samples = 0
    async for chunk in stream:
        first_choice = chunk.choices[0]
        texts_by_sample[first_choice.index].append(first_choice.text)
        if first_choice.finish_reason is not None:
            finished_samples += 1

    assert finished_samples == n
    for sample_texts in texts_by_sample:
        assert len(sample_texts) == max_tokens
        print("".join(sample_texts))
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"model_name"
,
"model_name"
,
...
@@ -506,8 +540,8 @@ async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
...
@@ -506,8 +540,8 @@ async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
max_tokens
=
5
,
max_tokens
=
5
,
temperature
=
0.0
,
temperature
=
0.0
,
extra_body
=
dict
(
extra_body
=
dict
(
# NOTE: this has to be true for n > 1 in vLLM, but
not necessary
# NOTE: this has to be true for n > 1 in vLLM, but
# for official client.
#
not necessary
for official client.
use_beam_search
=
True
),
use_beam_search
=
True
),
)
)
assert
len
(
batch
.
choices
)
==
4
assert
len
(
batch
.
choices
)
==
4
...
...
Prev
1
…
17
18
19
20
21
22
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment