Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
469e903b
Commit
469e903b
authored
Mar 28, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.8.2' into v0.8.2-dev
parents
389ebcf7
25f560a6
Changes
535
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
352 additions
and
113 deletions
+352
-113
tests/entrypoints/llm/test_prompt_validation.py
tests/entrypoints/llm/test_prompt_validation.py
+2
-7
tests/entrypoints/offline_mode/test_offline_mode.py
tests/entrypoints/offline_mode/test_offline_mode.py
+30
-25
tests/entrypoints/openai/correctness/test_lmeval.py
tests/entrypoints/openai/correctness/test_lmeval.py
+4
-3
tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
.../openai/correctness/test_transcription_api_correctness.py
+1
-2
tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py
...nai/reasoning_parsers/test_deepseekr1_reasoning_parser.py
+1
-3
tests/entrypoints/openai/reasoning_parsers/utils.py
tests/entrypoints/openai/reasoning_parsers/utils.py
+7
-7
tests/entrypoints/openai/test_audio.py
tests/entrypoints/openai/test_audio.py
+10
-14
tests/entrypoints/openai/test_basic.py
tests/entrypoints/openai/test_basic.py
+49
-2
tests/entrypoints/openai/test_chat.py
tests/entrypoints/openai/test_chat.py
+36
-4
tests/entrypoints/openai/test_chat_echo.py
tests/entrypoints/openai/test_chat_echo.py
+0
-3
tests/entrypoints/openai/test_chat_template.py
tests/entrypoints/openai/test_chat_template.py
+2
-0
tests/entrypoints/openai/test_chat_with_tool_reasoning.py
tests/entrypoints/openai/test_chat_with_tool_reasoning.py
+145
-0
tests/entrypoints/openai/test_cli_args.py
tests/entrypoints/openai/test_cli_args.py
+1
-1
tests/entrypoints/openai/test_completion.py
tests/entrypoints/openai/test_completion.py
+4
-4
tests/entrypoints/openai/test_embedding.py
tests/entrypoints/openai/test_embedding.py
+14
-14
tests/entrypoints/openai/test_encoder_decoder.py
tests/entrypoints/openai/test_encoder_decoder.py
+2
-2
tests/entrypoints/openai/test_metrics.py
tests/entrypoints/openai/test_metrics.py
+10
-2
tests/entrypoints/openai/test_pooling.py
tests/entrypoints/openai/test_pooling.py
+2
-2
tests/entrypoints/openai/test_rerank.py
tests/entrypoints/openai/test_rerank.py
+2
-4
tests/entrypoints/openai/test_return_tokens_as_ids.py
tests/entrypoints/openai/test_return_tokens_as_ids.py
+30
-14
No files found.
Too many changes to show.
To preserve performance only
535 of 535+
files are displayed.
Plain diff
Email patch
tests/entrypoints/llm/test_prompt_validation.py
View file @
469e903b
...
@@ -5,7 +5,6 @@ import os
...
@@ -5,7 +5,6 @@ import os
from
vllm
import
LLM
from
vllm
import
LLM
from
...utils
import
models_path_prefix
from
...utils
import
models_path_prefix
from
vllm.config
import
LoadFormat
@
pytest
.
fixture
(
autouse
=
True
)
@
pytest
.
fixture
(
autouse
=
True
)
...
@@ -17,17 +16,13 @@ def v1(run_with_both_engines):
...
@@ -17,17 +16,13 @@ def v1(run_with_both_engines):
def
test_empty_prompt
():
def
test_empty_prompt
():
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"gpt2"
),
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"openai-community/gpt2"
),
enforce_eager
=
True
)
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
enforce_eager
=
True
)
with
pytest
.
raises
(
ValueError
,
match
=
'Prompt cannot be empty'
):
with
pytest
.
raises
(
ValueError
,
match
=
'Prompt cannot be empty'
):
llm
.
generate
([
""
])
llm
.
generate
([
""
])
@
pytest
.
mark
.
skip_v1
@
pytest
.
mark
.
skip_v1
def
test_out_of_vocab_token
():
def
test_out_of_vocab_token
():
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"gpt2"
),
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"openai-community/gpt2"
),
enforce_eager
=
True
)
load_format
=
LoadFormat
.
RUNAI_STREAMER
,
enforce_eager
=
True
)
with
pytest
.
raises
(
ValueError
,
match
=
'out of vocabulary'
):
with
pytest
.
raises
(
ValueError
,
match
=
'out of vocabulary'
):
llm
.
generate
({
"prompt_token_ids"
:
[
999999
]})
llm
.
generate
({
"prompt_token_ids"
:
[
999999
]})
tests/entrypoints/offline_mode/test_offline_mode.py
View file @
469e903b
...
@@ -56,32 +56,37 @@ def cache_models():
...
@@ -56,32 +56,37 @@ def cache_models():
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
usefixtures
(
"cache_models"
)
@
pytest
.
mark
.
usefixtures
(
"cache_models"
)
def
test_offline_mode
(
monkeypatch
):
def
test_offline_mode
(
monkeypatch
:
pytest
.
MonkeyPatch
):
# Set HF to offline mode and ensure we can still construct an LLM
# Set HF to offline mode and ensure we can still construct an LLM
try
:
with
monkeypatch
.
context
()
as
m
:
monkeypatch
.
setenv
(
"HF_HUB_OFFLINE"
,
"1"
)
try
:
monkeypatch
.
setenv
(
"VLLM_NO_USAGE_STATS"
,
"1"
)
m
.
setenv
(
"HF_HUB_OFFLINE"
,
"1"
)
m
.
setenv
(
"VLLM_NO_USAGE_STATS"
,
"1"
)
def
disable_connect
(
*
args
,
**
kwargs
):
raise
RuntimeError
(
"No http calls allowed"
)
def
disable_connect
(
*
args
,
**
kwargs
):
raise
RuntimeError
(
"No http calls allowed"
)
monkeypatch
.
setattr
(
urllib3
.
connection
.
HTTPConnection
,
"connect"
,
disable_connect
)
m
.
setattr
(
monkeypatch
.
setattr
(
urllib3
.
connection
.
HTTPSConnection
,
"connect"
,
urllib3
.
connection
.
HTTPConnection
,
disable_connect
)
"connect"
,
disable_connect
,
# Need to re-import huggingface_hub and friends to setup offline mode
)
_re_import_modules
()
m
.
setattr
(
# Cached model files should be used in offline mode
urllib3
.
connection
.
HTTPSConnection
,
for
model_config
in
MODEL_CONFIGS
:
"connect"
,
LLM
(
**
model_config
)
disable_connect
,
finally
:
)
# Reset the environment after the test
# NB: Assuming tests are run in online mode
# Need to re-import huggingface_hub
monkeypatch
.
delenv
(
"HF_HUB_OFFLINE"
)
# and friends to setup offline mode
monkeypatch
.
delenv
(
"VLLM_NO_USAGE_STATS"
)
_re_import_modules
()
_re_import_modules
()
# Cached model files should be used in offline mode
pass
for
model_config
in
MODEL_CONFIGS
:
LLM
(
**
model_config
)
finally
:
# Reset the environment after the test
# NB: Assuming tests are run in online mode
_re_import_modules
()
def
_re_import_modules
():
def
_re_import_modules
():
...
...
tests/entrypoints/openai/correctness/test_lmeval.py
View file @
469e903b
...
@@ -21,7 +21,7 @@ NUM_CONCURRENT = 500
...
@@ -21,7 +21,7 @@ NUM_CONCURRENT = 500
TASK
=
"gsm8k"
TASK
=
"gsm8k"
FILTER
=
"exact_match,strict-match"
FILTER
=
"exact_match,strict-match"
RTOL
=
0.03
RTOL
=
0.03
EXPECTED_VALUE
=
0.5
8
EXPECTED_VALUE
=
0.5
4
DEFAULT_ARGS
=
[
"--max-model-len"
,
"4096"
,
"--disable-log-requests"
]
DEFAULT_ARGS
=
[
"--max-model-len"
,
"4096"
,
"--disable-log-requests"
]
MORE_ARGS_LIST
=
[
MORE_ARGS_LIST
=
[
[],
# Default
[],
# Default
...
@@ -71,7 +71,7 @@ def run_test(more_args):
...
@@ -71,7 +71,7 @@ def run_test(more_args):
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_cuda
()
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_cuda
()
and
not
current_platform
.
is_tpu
(),
and
not
current_platform
.
is_tpu
(),
reason
=
"V1 currently only supported on CUDA and TPU"
)
reason
=
"V1 currently only supported on CUDA and TPU"
)
def
test_lm_eval_accuracy_v1_engine
(
monkeypatch
):
def
test_lm_eval_accuracy_v1_engine
(
monkeypatch
:
pytest
.
MonkeyPatch
):
"""Run with the V1 Engine."""
"""Run with the V1 Engine."""
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
...
@@ -86,7 +86,8 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch):
...
@@ -86,7 +86,8 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch):
@
pytest
.
mark
.
parametrize
(
"more_args"
,
MORE_ARGS_LIST
)
@
pytest
.
mark
.
parametrize
(
"more_args"
,
MORE_ARGS_LIST
)
def
test_lm_eval_accuracy_v0_engine
(
monkeypatch
,
more_args
):
def
test_lm_eval_accuracy_v0_engine
(
monkeypatch
:
pytest
.
MonkeyPatch
,
more_args
):
"""Run with the V0 Engine."""
"""Run with the V0 Engine."""
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
...
...
tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
View file @
469e903b
...
@@ -10,7 +10,6 @@ import asyncio
...
@@ -10,7 +10,6 @@ import asyncio
import
io
import
io
import
time
import
time
from
statistics
import
mean
,
median
from
statistics
import
mean
,
median
from
typing
import
List
import
librosa
import
librosa
import
pytest
import
pytest
...
@@ -67,7 +66,7 @@ async def process_dataset(model, client, data, concurrent_request):
...
@@ -67,7 +66,7 @@ async def process_dataset(model, client, data, concurrent_request):
audio
,
sr
=
data
[
0
][
"audio"
][
"array"
],
data
[
0
][
"audio"
][
"sampling_rate"
]
audio
,
sr
=
data
[
0
][
"audio"
][
"array"
],
data
[
0
][
"audio"
][
"sampling_rate"
]
_
=
await
bound_transcribe
(
model
,
sem
,
client
,
(
audio
,
sr
),
""
)
_
=
await
bound_transcribe
(
model
,
sem
,
client
,
(
audio
,
sr
),
""
)
tasks
:
L
ist
[
asyncio
.
Task
]
=
[]
tasks
:
l
ist
[
asyncio
.
Task
]
=
[]
for
sample
in
data
:
for
sample
in
data
:
audio
,
sr
=
sample
[
"audio"
][
"array"
],
sample
[
"audio"
][
"sampling_rate"
]
audio
,
sr
=
sample
[
"audio"
][
"array"
],
sample
[
"audio"
][
"sampling_rate"
]
task
=
asyncio
.
create_task
(
task
=
asyncio
.
create_task
(
...
...
tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
from
typing
import
List
import
pytest
import
pytest
from
transformers
import
AutoTokenizer
from
transformers
import
AutoTokenizer
...
@@ -180,7 +178,7 @@ def test_reasoning(
...
@@ -180,7 +178,7 @@ def test_reasoning(
):
):
output
=
tokenizer
.
tokenize
(
param_dict
[
"output"
])
output
=
tokenizer
.
tokenize
(
param_dict
[
"output"
])
# decode everything to tokens
# decode everything to tokens
output_tokens
:
L
ist
[
str
]
=
[
output_tokens
:
l
ist
[
str
]
=
[
tokenizer
.
convert_tokens_to_string
([
token
])
for
token
in
output
tokenizer
.
convert_tokens_to_string
([
token
])
for
token
in
output
]
]
parser
:
ReasoningParser
=
ReasoningParserManager
.
get_reasoning_parser
(
parser
:
ReasoningParser
=
ReasoningParserManager
.
get_reasoning_parser
(
...
...
tests/entrypoints/openai/reasoning_parsers/utils.py
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
from
typing
import
List
,
Optional
,
Tuple
,
Union
from
typing
import
Optional
,
Union
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
DeltaMessage
)
DeltaMessage
)
...
@@ -33,10 +33,10 @@ class StreamingReasoningReconstructor:
...
@@ -33,10 +33,10 @@ class StreamingReasoningReconstructor:
def
run_reasoning_extraction
(
def
run_reasoning_extraction
(
reasoning_parser
:
ReasoningParser
,
reasoning_parser
:
ReasoningParser
,
model_output
:
L
ist
[
str
],
model_output
:
l
ist
[
str
],
request
:
Union
[
ChatCompletionRequest
,
None
]
=
None
,
request
:
Union
[
ChatCompletionRequest
,
None
]
=
None
,
streaming
:
bool
=
False
,
streaming
:
bool
=
False
,
)
->
T
uple
[
Optional
[
str
],
Optional
[
str
]]:
)
->
t
uple
[
Optional
[
str
],
Optional
[
str
]]:
if
streaming
:
if
streaming
:
reconstructor
=
run_reasoning_extraction_streaming
(
reconstructor
=
run_reasoning_extraction_streaming
(
reasoning_parser
,
reasoning_parser
,
...
@@ -55,9 +55,9 @@ def run_reasoning_extraction(
...
@@ -55,9 +55,9 @@ def run_reasoning_extraction(
def
run_reasoning_extraction_nonstreaming
(
def
run_reasoning_extraction_nonstreaming
(
reasoning_parser
:
ReasoningParser
,
reasoning_parser
:
ReasoningParser
,
model_output
:
L
ist
[
str
],
model_output
:
l
ist
[
str
],
request
:
Union
[
ChatCompletionRequest
,
None
]
=
None
,
request
:
Union
[
ChatCompletionRequest
,
None
]
=
None
,
)
->
T
uple
[
Optional
[
str
],
Optional
[
str
]]:
)
->
t
uple
[
Optional
[
str
],
Optional
[
str
]]:
request
=
request
or
ChatCompletionRequest
(
messages
=
[],
model
=
"test-model"
)
request
=
request
or
ChatCompletionRequest
(
messages
=
[],
model
=
"test-model"
)
return
reasoning_parser
.
extract_reasoning_content
(
return
reasoning_parser
.
extract_reasoning_content
(
model_output
=
''
.
join
(
model_output
),
request
=
request
)
model_output
=
''
.
join
(
model_output
),
request
=
request
)
...
@@ -65,13 +65,13 @@ def run_reasoning_extraction_nonstreaming(
...
@@ -65,13 +65,13 @@ def run_reasoning_extraction_nonstreaming(
def
run_reasoning_extraction_streaming
(
def
run_reasoning_extraction_streaming
(
reasoning_parser
:
ReasoningParser
,
reasoning_parser
:
ReasoningParser
,
model_deltas
:
L
ist
[
str
],
model_deltas
:
l
ist
[
str
],
request
:
Union
[
ChatCompletionRequest
,
None
]
=
None
,
request
:
Union
[
ChatCompletionRequest
,
None
]
=
None
,
)
->
StreamingReasoningReconstructor
:
)
->
StreamingReasoningReconstructor
:
request
=
request
or
ChatCompletionRequest
(
messages
=
[],
model
=
"test-model"
)
request
=
request
or
ChatCompletionRequest
(
messages
=
[],
model
=
"test-model"
)
reconstructor
=
StreamingReasoningReconstructor
()
reconstructor
=
StreamingReasoningReconstructor
()
previous_text
=
""
previous_text
=
""
previous_tokens
:
L
ist
[
int
]
=
[]
previous_tokens
:
l
ist
[
int
]
=
[]
for
delta
in
model_deltas
:
for
delta
in
model_deltas
:
token_delta
=
[
token_delta
=
[
reasoning_parser
.
vocab
.
get
(
token
)
reasoning_parser
.
vocab
.
get
(
token
)
...
...
tests/entrypoints/openai/test_audio.py
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Dict
,
List
import
openai
import
openai
import
pytest
import
pytest
import
os
import
os
...
@@ -20,8 +18,6 @@ TEST_AUDIO_URLS = [
...
@@ -20,8 +18,6 @@ TEST_AUDIO_URLS = [
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
def
server
():
args
=
[
args
=
[
"--dtype"
,
"bfloat16"
,
"--max-model-len"
,
"--max-model-len"
,
"2048"
,
"2048"
,
"--max-num-seqs"
,
"--max-num-seqs"
,
...
@@ -41,7 +37,7 @@ async def client(server):
...
@@ -41,7 +37,7 @@ async def client(server):
@
pytest
.
fixture
(
scope
=
"session"
)
@
pytest
.
fixture
(
scope
=
"session"
)
def
base64_encoded_audio
()
->
D
ict
[
str
,
str
]:
def
base64_encoded_audio
()
->
d
ict
[
str
,
str
]:
return
{
return
{
audio_url
:
encode_audio_base64
(
*
fetch_audio
(
audio_url
))
audio_url
:
encode_audio_base64
(
*
fetch_audio
(
audio_url
))
for
audio_url
in
TEST_AUDIO_URLS
for
audio_url
in
TEST_AUDIO_URLS
...
@@ -83,7 +79,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
...
@@ -83,7 +79,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
choice
=
chat_completion
.
choices
[
0
]
choice
=
chat_completion
.
choices
[
0
]
assert
choice
.
finish_reason
==
"length"
assert
choice
.
finish_reason
==
"length"
assert
chat_completion
.
usage
==
openai
.
types
.
CompletionUsage
(
assert
chat_completion
.
usage
==
openai
.
types
.
CompletionUsage
(
completion_tokens
=
10
,
prompt_tokens
=
20
1
,
total_tokens
=
21
1
)
completion_tokens
=
10
,
prompt_tokens
=
20
2
,
total_tokens
=
21
2
)
message
=
choice
.
message
message
=
choice
.
message
message
=
chat_completion
.
choices
[
0
].
message
message
=
chat_completion
.
choices
[
0
].
message
...
@@ -107,7 +103,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
...
@@ -107,7 +103,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
@
pytest
.
mark
.
parametrize
(
"audio_url"
,
TEST_AUDIO_URLS
)
@
pytest
.
mark
.
parametrize
(
"audio_url"
,
TEST_AUDIO_URLS
)
async
def
test_single_chat_session_audio_base64encoded
(
async
def
test_single_chat_session_audio_base64encoded
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
audio_url
:
str
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
audio_url
:
str
,
base64_encoded_audio
:
D
ict
[
str
,
str
]):
base64_encoded_audio
:
d
ict
[
str
,
str
]):
messages
=
[{
messages
=
[{
"role"
:
"role"
:
...
@@ -140,7 +136,7 @@ async def test_single_chat_session_audio_base64encoded(
...
@@ -140,7 +136,7 @@ async def test_single_chat_session_audio_base64encoded(
choice
=
chat_completion
.
choices
[
0
]
choice
=
chat_completion
.
choices
[
0
]
assert
choice
.
finish_reason
==
"length"
assert
choice
.
finish_reason
==
"length"
assert
chat_completion
.
usage
==
openai
.
types
.
CompletionUsage
(
assert
chat_completion
.
usage
==
openai
.
types
.
CompletionUsage
(
completion_tokens
=
10
,
prompt_tokens
=
20
1
,
total_tokens
=
21
1
)
completion_tokens
=
10
,
prompt_tokens
=
20
2
,
total_tokens
=
21
2
)
message
=
choice
.
message
message
=
choice
.
message
message
=
chat_completion
.
choices
[
0
].
message
message
=
chat_completion
.
choices
[
0
].
message
...
@@ -165,7 +161,7 @@ async def test_single_chat_session_audio_base64encoded(
...
@@ -165,7 +161,7 @@ async def test_single_chat_session_audio_base64encoded(
@
pytest
.
mark
.
parametrize
(
"audio_url"
,
TEST_AUDIO_URLS
)
@
pytest
.
mark
.
parametrize
(
"audio_url"
,
TEST_AUDIO_URLS
)
async
def
test_single_chat_session_input_audio
(
async
def
test_single_chat_session_input_audio
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
audio_url
:
str
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
audio_url
:
str
,
base64_encoded_audio
:
D
ict
[
str
,
str
]):
base64_encoded_audio
:
d
ict
[
str
,
str
]):
messages
=
[{
messages
=
[{
"role"
:
"role"
:
"user"
,
"user"
,
...
@@ -196,7 +192,7 @@ async def test_single_chat_session_input_audio(
...
@@ -196,7 +192,7 @@ async def test_single_chat_session_input_audio(
choice
=
chat_completion
.
choices
[
0
]
choice
=
chat_completion
.
choices
[
0
]
assert
choice
.
finish_reason
==
"length"
assert
choice
.
finish_reason
==
"length"
assert
chat_completion
.
usage
==
openai
.
types
.
CompletionUsage
(
assert
chat_completion
.
usage
==
openai
.
types
.
CompletionUsage
(
completion_tokens
=
10
,
prompt_tokens
=
20
1
,
total_tokens
=
21
1
)
completion_tokens
=
10
,
prompt_tokens
=
20
2
,
total_tokens
=
21
2
)
message
=
choice
.
message
message
=
choice
.
message
message
=
chat_completion
.
choices
[
0
].
message
message
=
chat_completion
.
choices
[
0
].
message
...
@@ -255,7 +251,7 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
...
@@ -255,7 +251,7 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
temperature
=
0.0
,
temperature
=
0.0
,
stream
=
True
,
stream
=
True
,
)
)
chunks
:
L
ist
[
str
]
=
[]
chunks
:
l
ist
[
str
]
=
[]
finish_reason_count
=
0
finish_reason_count
=
0
async
for
chunk
in
stream
:
async
for
chunk
in
stream
:
delta
=
chunk
.
choices
[
0
].
delta
delta
=
chunk
.
choices
[
0
].
delta
...
@@ -277,7 +273,7 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
...
@@ -277,7 +273,7 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
@
pytest
.
mark
.
parametrize
(
"audio_url"
,
TEST_AUDIO_URLS
)
@
pytest
.
mark
.
parametrize
(
"audio_url"
,
TEST_AUDIO_URLS
)
async
def
test_chat_streaming_input_audio
(
client
:
openai
.
AsyncOpenAI
,
async
def
test_chat_streaming_input_audio
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
audio_url
:
str
,
model_name
:
str
,
audio_url
:
str
,
base64_encoded_audio
:
D
ict
[
str
,
base64_encoded_audio
:
d
ict
[
str
,
str
]):
str
]):
messages
=
[{
messages
=
[{
"role"
:
"role"
:
...
@@ -315,7 +311,7 @@ async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI,
...
@@ -315,7 +311,7 @@ async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI,
temperature
=
0.0
,
temperature
=
0.0
,
stream
=
True
,
stream
=
True
,
)
)
chunks
:
L
ist
[
str
]
=
[]
chunks
:
l
ist
[
str
]
=
[]
finish_reason_count
=
0
finish_reason_count
=
0
async
for
chunk
in
stream
:
async
for
chunk
in
stream
:
delta
=
chunk
.
choices
[
0
].
delta
delta
=
chunk
.
choices
[
0
].
delta
...
@@ -337,7 +333,7 @@ async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI,
...
@@ -337,7 +333,7 @@ async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI,
@
pytest
.
mark
.
parametrize
(
"audio_url"
,
TEST_AUDIO_URLS
)
@
pytest
.
mark
.
parametrize
(
"audio_url"
,
TEST_AUDIO_URLS
)
async
def
test_multi_audio_input
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
async
def
test_multi_audio_input
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
audio_url
:
str
,
audio_url
:
str
,
base64_encoded_audio
:
D
ict
[
str
,
str
]):
base64_encoded_audio
:
d
ict
[
str
,
str
]):
messages
=
[{
messages
=
[{
"role"
:
"role"
:
...
...
tests/entrypoints/openai/test_basic.py
View file @
469e903b
...
@@ -2,7 +2,6 @@
...
@@ -2,7 +2,6 @@
import
asyncio
import
asyncio
from
http
import
HTTPStatus
from
http
import
HTTPStatus
from
typing
import
List
import
openai
import
openai
import
pytest
import
pytest
...
@@ -18,7 +17,7 @@ MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")
...
@@ -18,7 +17,7 @@ MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")
@
pytest
.
fixture
(
scope
=
'module'
)
@
pytest
.
fixture
(
scope
=
'module'
)
def
server_args
(
request
:
pytest
.
FixtureRequest
)
->
L
ist
[
str
]:
def
server_args
(
request
:
pytest
.
FixtureRequest
)
->
l
ist
[
str
]:
""" Provide extra arguments to the server via indirect parametrization
""" Provide extra arguments to the server via indirect parametrization
Usage:
Usage:
...
@@ -173,3 +172,51 @@ async def test_request_wrong_content_type(server: RemoteOpenAIServer):
...
@@ -173,3 +172,51 @@ async def test_request_wrong_content_type(server: RemoteOpenAIServer):
extra_headers
=
{
extra_headers
=
{
"Content-Type"
:
"application/x-www-form-urlencoded"
"Content-Type"
:
"application/x-www-form-urlencoded"
})
})
@
pytest
.
mark
.
parametrize
(
"server_args"
,
[
pytest
.
param
([
"--enable-server-load-tracking"
],
id
=
"enable-server-load-tracking"
)
],
indirect
=
True
,
)
@
pytest
.
mark
.
asyncio
async
def
test_server_load
(
server
:
RemoteOpenAIServer
):
# Check initial server load
response
=
requests
.
get
(
server
.
url_for
(
"load"
))
assert
response
.
status_code
==
HTTPStatus
.
OK
assert
response
.
json
().
get
(
"server_load"
)
==
0
def
make_long_completion_request
():
return
requests
.
post
(
server
.
url_for
(
"v1/completions"
),
headers
=
{
"Content-Type"
:
"application/json"
},
json
=
{
"prompt"
:
"Give me a long story"
,
"max_tokens"
:
1000
,
"temperature"
:
0
,
},
)
# Start the completion request in a background thread.
completion_future
=
asyncio
.
create_task
(
asyncio
.
to_thread
(
make_long_completion_request
))
# Give a short delay to ensure the request has started.
await
asyncio
.
sleep
(
0.1
)
# Check server load while the completion request is running.
response
=
requests
.
get
(
server
.
url_for
(
"load"
))
assert
response
.
status_code
==
HTTPStatus
.
OK
assert
response
.
json
().
get
(
"server_load"
)
==
1
# Wait for the completion request to finish.
await
completion_future
await
asyncio
.
sleep
(
0.1
)
# Check server load after the completion request has finished.
response
=
requests
.
get
(
server
.
url_for
(
"load"
))
assert
response
.
status_code
==
HTTPStatus
.
OK
assert
response
.
json
().
get
(
"server_load"
)
==
0
tests/entrypoints/openai/test_chat.py
View file @
469e903b
...
@@ -3,13 +3,14 @@
...
@@ -3,13 +3,14 @@
# imports for guided decoding tests
# imports for guided decoding tests
import
json
import
json
import
re
import
re
from
typing
import
Dict
,
List
,
Optional
from
typing
import
Optional
import
jsonschema
import
jsonschema
import
openai
# use the official client for correctness check
import
openai
# use the official client for correctness check
import
pytest
import
pytest
import
os
import
os
import
pytest_asyncio
import
pytest_asyncio
import
requests
import
torch
import
torch
from
openai
import
BadRequestError
from
openai
import
BadRequestError
...
@@ -190,7 +191,7 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI,
...
@@ -190,7 +191,7 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI,
async
def
test_prompt_logprobs_chat
(
client
:
openai
.
AsyncOpenAI
,
async
def
test_prompt_logprobs_chat
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
model_name
:
str
,
prompt_logprobs
:
Optional
[
int
]):
prompt_logprobs
:
Optional
[
int
]):
params
:
D
ict
=
{
params
:
d
ict
=
{
"messages"
:
[{
"messages"
:
[{
"role"
:
"system"
,
"role"
:
"system"
,
"content"
:
"You are a helpful assistant."
"content"
:
"You are a helpful assistant."
...
@@ -232,7 +233,7 @@ async def test_prompt_logprobs_chat(client: openai.AsyncOpenAI,
...
@@ -232,7 +233,7 @@ async def test_prompt_logprobs_chat(client: openai.AsyncOpenAI,
)
)
async
def
test_more_than_one_prompt_logprobs_chat
(
client
:
openai
.
AsyncOpenAI
,
async
def
test_more_than_one_prompt_logprobs_chat
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
model_name
:
str
):
params
:
D
ict
=
{
params
:
d
ict
=
{
"messages"
:
[{
"messages"
:
[{
"role"
:
"system"
,
"role"
:
"system"
,
"content"
:
"You are a helpful assistant."
"content"
:
"You are a helpful assistant."
...
@@ -343,7 +344,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
...
@@ -343,7 +344,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
temperature
=
0.0
,
temperature
=
0.0
,
stream
=
True
,
stream
=
True
,
)
)
chunks
:
L
ist
[
str
]
=
[]
chunks
:
l
ist
[
str
]
=
[]
finish_reason_count
=
0
finish_reason_count
=
0
async
for
chunk
in
stream
:
async
for
chunk
in
stream
:
delta
=
chunk
.
choices
[
0
].
delta
delta
=
chunk
.
choices
[
0
].
delta
...
@@ -1001,3 +1002,34 @@ async def test_long_seed(client: openai.AsyncOpenAI):
...
@@ -1001,3 +1002,34 @@ async def test_long_seed(client: openai.AsyncOpenAI):
assert
(
"greater_than_equal"
in
exc_info
.
value
.
message
assert
(
"greater_than_equal"
in
exc_info
.
value
.
message
or
"less_than_equal"
in
exc_info
.
value
.
message
)
or
"less_than_equal"
in
exc_info
.
value
.
message
)
@
pytest
.
mark
.
asyncio
async
def
test_http_chat_wo_model_name
(
server
:
RemoteOpenAIServer
):
url
=
f
"http://localhost:
{
server
.
port
}
/v1/chat/completions"
headers
=
{
"Content-Type"
:
"application/json"
,
}
data
=
{
# model_name is avoided here.
"messages"
:
[{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant."
},
{
"role"
:
"user"
,
"content"
:
"what is 1+1?"
}],
"max_tokens"
:
5
}
response
=
requests
.
post
(
url
,
headers
=
headers
,
json
=
data
)
response_data
=
response
.
json
()
print
(
response_data
)
choice
=
response_data
.
get
(
"choices"
)[
0
]
message
=
choice
.
get
(
"message"
)
assert
message
is
not
None
content
=
message
.
get
(
"content"
)
assert
content
is
not
None
assert
len
(
content
)
>
0
tests/entrypoints/openai/test_chat_echo.py
View file @
469e903b
...
@@ -11,7 +11,6 @@ from ...utils import RemoteOpenAIServer, models_path_prefix
...
@@ -11,7 +11,6 @@ from ...utils import RemoteOpenAIServer, models_path_prefix
# # any model with a chat template should work here
# # any model with a chat template should work here
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-1.5B-Instruct"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-1.5B-Instruct"
)
DUMMY_CHAT_TEMPLATE
=
"""{% for message in messages %}{{message['role'] + ': ' + message['content'] + '
\\
n'}}{% endfor %}"""
# noqa: E501
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
...
@@ -23,8 +22,6 @@ def server():
...
@@ -23,8 +22,6 @@ def server():
"--enforce-eager"
,
"--enforce-eager"
,
"--max-model-len"
,
"--max-model-len"
,
"4080"
,
"4080"
,
"--chat-template"
,
DUMMY_CHAT_TEMPLATE
,
]
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
...
...
tests/entrypoints/openai/test_chat_template.py
View file @
469e903b
...
@@ -108,8 +108,10 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
...
@@ -108,8 +108,10 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
# Call the function and get the result
# Call the function and get the result
result
=
apply_hf_chat_template
(
result
=
apply_hf_chat_template
(
tokenizer
,
tokenizer
,
trust_remote_code
=
True
,
conversation
=
mock_request
.
messages
,
conversation
=
mock_request
.
messages
,
chat_template
=
mock_request
.
chat_template
or
template_content
,
chat_template
=
mock_request
.
chat_template
or
template_content
,
tools
=
None
,
add_generation_prompt
=
mock_request
.
add_generation_prompt
,
add_generation_prompt
=
mock_request
.
add_generation_prompt
,
continue_final_message
=
mock_request
.
continue_final_message
,
continue_final_message
=
mock_request
.
continue_final_message
,
)
)
...
...
tests/entrypoints/openai/test_chat_with_tool_reasoning.py
0 → 100644
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
import
openai
# use the official client for correctness check
import
pytest
import
pytest_asyncio
from
...utils
import
RemoteOpenAIServer
# a reasoning and tool calling model
MODEL_NAME
=
"Qwen/QwQ-32B"
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
# noqa: F811
args
=
[
"--max-model-len"
,
"8192"
,
"--enforce-eager"
,
"--enable-reasoning"
,
"--reasoning-parser"
,
"deepseek_r1"
,
"--enable-auto-tool-choice"
,
"--tool-call-parser"
,
"hermes"
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
@
pytest_asyncio
.
fixture
async
def
client
(
server
):
async
with
server
.
get_async_client
()
as
async_client
:
yield
async_client
TOOLS
=
[{
"type"
:
"function"
,
"function"
:
{
"name"
:
"get_current_weather"
,
"description"
:
"Get the current weather in a given location"
,
"parameters"
:
{
"type"
:
"object"
,
"properties"
:
{
"city"
:
{
"type"
:
"string"
,
"description"
:
"The city to find the weather for, e.g. 'San Francisco'"
},
"state"
:
{
"type"
:
"string"
,
"description"
:
"the two-letter abbreviation for the state that the city is"
" in, e.g. 'CA' which would mean 'California'"
},
"unit"
:
{
"type"
:
"string"
,
"description"
:
"The unit to fetch the temperature in"
,
"enum"
:
[
"celsius"
,
"fahrenheit"
]
}
},
"required"
:
[
"city"
,
"state"
,
"unit"
]
}
}
}]
MESSAGES
=
[{
"role"
:
"user"
,
"content"
:
"Hi! How are you doing today?"
},
{
"role"
:
"assistant"
,
"content"
:
"I'm doing well! How can I help you?"
},
{
"role"
:
"user"
,
"content"
:
"Can you tell me what the temperate will be in Dallas, in fahrenheit?"
}]
FUNC_NAME
=
"get_current_weather"
FUNC_ARGS
=
"""{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}"""
def
extract_reasoning_and_calls
(
chunks
:
list
):
reasoning_content
=
""
tool_call_idx
=
-
1
arguments
=
[]
function_names
=
[]
for
chunk
in
chunks
:
if
chunk
.
choices
[
0
].
delta
.
tool_calls
:
tool_call
=
chunk
.
choices
[
0
].
delta
.
tool_calls
[
0
]
if
tool_call
.
index
!=
tool_call_idx
:
tool_call_idx
=
chunk
.
choices
[
0
].
delta
.
tool_calls
[
0
].
index
arguments
.
append
(
""
)
function_names
.
append
(
""
)
if
tool_call
.
function
:
if
tool_call
.
function
.
name
:
function_names
[
tool_call_idx
]
=
tool_call
.
function
.
name
if
tool_call
.
function
.
arguments
:
arguments
[
tool_call_idx
]
+=
tool_call
.
function
.
arguments
else
:
if
hasattr
(
chunk
.
choices
[
0
].
delta
,
"reasoning_content"
):
reasoning_content
+=
chunk
.
choices
[
0
].
delta
.
reasoning_content
return
reasoning_content
,
arguments
,
function_names
# test streaming
@
pytest
.
mark
.
asyncio
async
def
test_chat_streaming_of_tool_and_reasoning
(
client
:
openai
.
AsyncOpenAI
):
stream
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
messages
=
MESSAGES
,
tools
=
TOOLS
,
temperature
=
0.0
,
stream
=
True
,
)
chunks
=
[]
async
for
chunk
in
stream
:
chunks
.
append
(
chunk
)
reasoning_content
,
arguments
,
function_names
=
extract_reasoning_and_calls
(
chunks
)
assert
len
(
reasoning_content
)
>
0
assert
len
(
function_names
)
>
0
and
function_names
[
0
]
==
FUNC_NAME
assert
len
(
arguments
)
>
0
and
arguments
[
0
]
==
FUNC_ARGS
# Non-streaming path: the whole answer arrives in a single response object.
@pytest.mark.asyncio
async def test_chat_full_of_tool_and_reasoning(client: openai.AsyncOpenAI):
    """Request a full tool-enabled chat completion and check its message.

    The first choice's message must carry non-empty ``reasoning_content`` and
    its first tool call must match ``FUNC_NAME`` and ``FUNC_ARGS``.
    """
    response = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES,
        tools=TOOLS,
        temperature=0.0,
        stream=False,
    )
    message = response.choices[0].message

    assert len(message.reasoning_content) > 0

    # Look up the tool call only after the reasoning check, as before.
    called = message.tool_calls[0].function
    assert called.name == FUNC_NAME
    assert message.tool_calls[0].function.arguments == FUNC_ARGS
tests/entrypoints/openai/test_cli_args.py
View file @
469e903b
...
@@ -26,7 +26,7 @@ def serve_parser():
...
@@ -26,7 +26,7 @@ def serve_parser():
return
make_arg_parser
(
parser
)
return
make_arg_parser
(
parser
)
### Tests for Lo
ra
module parsing
### Tests for Lo
RA
module parsing
def
test_valid_key_value_format
(
serve_parser
):
def
test_valid_key_value_format
(
serve_parser
):
# Test old format: name=path
# Test old format: name=path
args
=
serve_parser
.
parse_args
([
args
=
serve_parser
.
parse_args
([
...
...
tests/entrypoints/openai/test_completion.py
View file @
469e903b
...
@@ -5,7 +5,7 @@ import json
...
@@ -5,7 +5,7 @@ import json
import
re
import
re
import
shutil
import
shutil
from
tempfile
import
TemporaryDirectory
from
tempfile
import
TemporaryDirectory
from
typing
import
Dict
,
List
,
Optional
from
typing
import
Optional
import
jsonschema
import
jsonschema
import
openai
# use the official client for correctness check
import
openai
# use the official client for correctness check
...
@@ -290,7 +290,7 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
...
@@ -290,7 +290,7 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
async
def
test_prompt_logprobs_completion
(
client
:
openai
.
AsyncOpenAI
,
async
def
test_prompt_logprobs_completion
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
model_name
:
str
,
prompt_logprobs
:
Optional
[
int
]):
prompt_logprobs
:
Optional
[
int
]):
params
:
D
ict
=
{
params
:
d
ict
=
{
"prompt"
:
[
"A robot may not injure another robot"
,
"My name is"
],
"prompt"
:
[
"A robot may not injure another robot"
,
"My name is"
],
"model"
:
model_name
,
"model"
:
model_name
,
}
}
...
@@ -334,7 +334,7 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
...
@@ -334,7 +334,7 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
max_tokens
=
5
,
max_tokens
=
5
,
temperature
=
0.0
,
temperature
=
0.0
,
stream
=
True
)
stream
=
True
)
chunks
:
L
ist
[
str
]
=
[]
chunks
:
l
ist
[
str
]
=
[]
finish_reason_count
=
0
finish_reason_count
=
0
async
for
chunk
in
stream
:
async
for
chunk
in
stream
:
chunks
.
append
(
chunk
.
choices
[
0
].
text
)
chunks
.
append
(
chunk
.
choices
[
0
].
text
)
...
@@ -367,7 +367,7 @@ async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str):
...
@@ -367,7 +367,7 @@ async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str):
max_tokens
=
max_tokens
,
max_tokens
=
max_tokens
,
n
=
n
,
n
=
n
,
stream
=
True
)
stream
=
True
)
chunks
:
L
ist
[
L
ist
[
str
]]
=
[[]
for
i
in
range
(
n
)]
chunks
:
l
ist
[
l
ist
[
str
]]
=
[[]
for
i
in
range
(
n
)]
finish_reason_count
=
0
finish_reason_count
=
0
async
for
chunk
in
stream
:
async
for
chunk
in
stream
:
index
=
chunk
.
choices
[
0
].
index
index
=
chunk
.
choices
[
0
].
index
...
...
tests/entrypoints/openai/test_embedding.py
View file @
469e903b
...
@@ -14,7 +14,7 @@ from vllm.transformers_utils.tokenizer import get_tokenizer
...
@@ -14,7 +14,7 @@ from vllm.transformers_utils.tokenizer import get_tokenizer
from
...utils
import
RemoteOpenAIServer
,
models_path_prefix
from
...utils
import
RemoteOpenAIServer
,
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"intfloat/
e5-mistral-7b-instruct
"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"intfloat/
multilingual-e5-small
"
)
DUMMY_CHAT_TEMPLATE
=
"""{% for message in messages %}{{message['role'] + ': ' + message['content'] + '
\\
n'}}{% endfor %}"""
# noqa: E501
DUMMY_CHAT_TEMPLATE
=
"""{% for message in messages %}{{message['role'] + ': ' + message['content'] + '
\\
n'}}{% endfor %}"""
# noqa: E501
...
@@ -28,7 +28,7 @@ def server():
...
@@ -28,7 +28,7 @@ def server():
"bfloat16"
,
"bfloat16"
,
"--enforce-eager"
,
"--enforce-eager"
,
"--max-model-len"
,
"--max-model-len"
,
"
819
2"
,
"
51
2"
,
"--chat-template"
,
"--chat-template"
,
DUMMY_CHAT_TEMPLATE
,
DUMMY_CHAT_TEMPLATE
,
]
]
...
@@ -61,10 +61,10 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
...
@@ -61,10 +61,10 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
assert
embeddings
.
id
is
not
None
assert
embeddings
.
id
is
not
None
assert
len
(
embeddings
.
data
)
==
1
assert
len
(
embeddings
.
data
)
==
1
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
4096
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
384
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
prompt_tokens
==
9
assert
embeddings
.
usage
.
prompt_tokens
==
11
assert
embeddings
.
usage
.
total_tokens
==
9
assert
embeddings
.
usage
.
total_tokens
==
11
# test using token IDs
# test using token IDs
input_tokens
=
[
1
,
1
,
1
,
1
,
1
]
input_tokens
=
[
1
,
1
,
1
,
1
,
1
]
...
@@ -78,7 +78,7 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
...
@@ -78,7 +78,7 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
assert
embeddings
.
id
is
not
None
assert
embeddings
.
id
is
not
None
assert
len
(
embeddings
.
data
)
==
1
assert
len
(
embeddings
.
data
)
==
1
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
4096
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
384
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
prompt_tokens
==
5
assert
embeddings
.
usage
.
prompt_tokens
==
5
assert
embeddings
.
usage
.
total_tokens
==
5
assert
embeddings
.
usage
.
total_tokens
==
5
...
@@ -87,7 +87,7 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
...
@@ -87,7 +87,7 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
async
def
test_batch_embedding
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
async
def
test_batch_embedding
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
# test
L
ist[str]
# test
l
ist[str]
input_texts
=
[
input_texts
=
[
"The cat sat on the mat."
,
"A feline was resting on a rug."
,
"The cat sat on the mat."
,
"A feline was resting on a rug."
,
"Stars twinkle brightly in the night sky."
"Stars twinkle brightly in the night sky."
...
@@ -102,12 +102,12 @@ async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str):
...
@@ -102,12 +102,12 @@ async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str):
assert
embeddings
.
id
is
not
None
assert
embeddings
.
id
is
not
None
assert
len
(
embeddings
.
data
)
==
3
assert
len
(
embeddings
.
data
)
==
3
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
4096
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
384
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
prompt_tokens
==
3
2
assert
embeddings
.
usage
.
prompt_tokens
==
3
3
assert
embeddings
.
usage
.
total_tokens
==
3
2
assert
embeddings
.
usage
.
total_tokens
==
3
3
# test
L
ist[
L
ist[int]]
# test
l
ist[
l
ist[int]]
input_tokens
=
[[
4
,
5
,
7
,
9
,
20
],
[
15
,
29
,
499
],
[
24
,
24
,
24
,
24
,
24
],
input_tokens
=
[[
4
,
5
,
7
,
9
,
20
],
[
15
,
29
,
499
],
[
24
,
24
,
24
,
24
,
24
],
[
25
,
32
,
64
,
77
]]
[
25
,
32
,
64
,
77
]]
embedding_response
=
await
client
.
embeddings
.
create
(
embedding_response
=
await
client
.
embeddings
.
create
(
...
@@ -120,7 +120,7 @@ async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str):
...
@@ -120,7 +120,7 @@ async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str):
assert
embeddings
.
id
is
not
None
assert
embeddings
.
id
is
not
None
assert
len
(
embeddings
.
data
)
==
4
assert
len
(
embeddings
.
data
)
==
4
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
4096
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
384
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
prompt_tokens
==
17
assert
embeddings
.
usage
.
prompt_tokens
==
17
assert
embeddings
.
usage
.
total_tokens
==
17
assert
embeddings
.
usage
.
total_tokens
==
17
...
@@ -235,7 +235,7 @@ async def test_single_embedding_truncation(client: openai.AsyncOpenAI,
...
@@ -235,7 +235,7 @@ async def test_single_embedding_truncation(client: openai.AsyncOpenAI,
assert
embeddings
.
id
is
not
None
assert
embeddings
.
id
is
not
None
assert
len
(
embeddings
.
data
)
==
1
assert
len
(
embeddings
.
data
)
==
1
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
4096
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
384
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
prompt_tokens
==
10
assert
embeddings
.
usage
.
prompt_tokens
==
10
assert
embeddings
.
usage
.
total_tokens
==
10
assert
embeddings
.
usage
.
total_tokens
==
10
...
@@ -253,7 +253,7 @@ async def test_single_embedding_truncation(client: openai.AsyncOpenAI,
...
@@ -253,7 +253,7 @@ async def test_single_embedding_truncation(client: openai.AsyncOpenAI,
assert
embeddings
.
id
is
not
None
assert
embeddings
.
id
is
not
None
assert
len
(
embeddings
.
data
)
==
1
assert
len
(
embeddings
.
data
)
==
1
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
4096
assert
len
(
embeddings
.
data
[
0
].
embedding
)
==
384
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
completion_tokens
==
0
assert
embeddings
.
usage
.
prompt_tokens
==
10
assert
embeddings
.
usage
.
prompt_tokens
==
10
assert
embeddings
.
usage
.
total_tokens
==
10
assert
embeddings
.
usage
.
total_tokens
==
10
...
...
tests/entrypoints/openai/test_encoder_decoder.py
View file @
469e903b
...
@@ -7,7 +7,7 @@ import pytest_asyncio
...
@@ -7,7 +7,7 @@ import pytest_asyncio
from
...utils
import
RemoteOpenAIServer
,
models_path_prefix
from
...utils
import
RemoteOpenAIServer
,
models_path_prefix
from
vllm.attention.backends.utils
import
STR_NOT_IMPL_ENC_DEC_ROCM_HIP
from
vllm.attention.backends.utils
import
STR_NOT_IMPL_ENC_DEC_ROCM_HIP
from
vllm.
util
s
import
is_hip
from
vllm.
platform
s
import
current_platform
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/bart-base"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"facebook/bart-base"
)
...
@@ -30,7 +30,7 @@ async def client(server):
...
@@ -30,7 +30,7 @@ async def client(server):
yield
async_client
yield
async_client
@
pytest
.
mark
.
skipif
(
is_hip
(),
@
pytest
.
mark
.
skipif
(
current_platform
.
is_rocm
(),
reason
=
STR_NOT_IMPL_ENC_DEC_ROCM_HIP
)
reason
=
STR_NOT_IMPL_ENC_DEC_ROCM_HIP
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
...
...
tests/entrypoints/openai/test_metrics.py
View file @
469e903b
...
@@ -228,9 +228,11 @@ EXPECTED_METRICS_V1 = [
...
@@ -228,9 +228,11 @@ EXPECTED_METRICS_V1 = [
"vllm:gpu_cache_usage_perc"
,
"vllm:gpu_cache_usage_perc"
,
"vllm:gpu_prefix_cache_queries"
,
"vllm:gpu_prefix_cache_queries"
,
"vllm:gpu_prefix_cache_hits"
,
"vllm:gpu_prefix_cache_hits"
,
"vllm:num_preemptions_total"
,
"vllm:prompt_tokens_total"
,
"vllm:prompt_tokens_total"
,
"vllm:generation_tokens_total"
,
"vllm:generation_tokens_total"
,
"vllm:iteration_tokens_total"
,
"vllm:iteration_tokens_total"
,
"vllm:cache_config_info"
,
"vllm:request_success_total"
,
"vllm:request_success_total"
,
"vllm:request_prompt_tokens_sum"
,
"vllm:request_prompt_tokens_sum"
,
"vllm:request_prompt_tokens_bucket"
,
"vllm:request_prompt_tokens_bucket"
,
...
@@ -238,6 +240,12 @@ EXPECTED_METRICS_V1 = [
...
@@ -238,6 +240,12 @@ EXPECTED_METRICS_V1 = [
"vllm:request_generation_tokens_sum"
,
"vllm:request_generation_tokens_sum"
,
"vllm:request_generation_tokens_bucket"
,
"vllm:request_generation_tokens_bucket"
,
"vllm:request_generation_tokens_count"
,
"vllm:request_generation_tokens_count"
,
"vllm:request_params_n_sum"
,
"vllm:request_params_n_bucket"
,
"vllm:request_params_n_count"
,
"vllm:request_params_max_tokens_sum"
,
"vllm:request_params_max_tokens_bucket"
,
"vllm:request_params_max_tokens_count"
,
"vllm:time_to_first_token_seconds_sum"
,
"vllm:time_to_first_token_seconds_sum"
,
"vllm:time_to_first_token_seconds_bucket"
,
"vllm:time_to_first_token_seconds_bucket"
,
"vllm:time_to_first_token_seconds_count"
,
"vllm:time_to_first_token_seconds_count"
,
...
@@ -281,7 +289,7 @@ async def test_metrics_exist(server: RemoteOpenAIServer,
...
@@ -281,7 +289,7 @@ async def test_metrics_exist(server: RemoteOpenAIServer,
def
test_metrics_exist_run_batch
(
use_v1
:
bool
):
def
test_metrics_exist_run_batch
(
use_v1
:
bool
):
if
use_v1
:
if
use_v1
:
pytest
.
skip
(
"Skipping test on vllm V1"
)
pytest
.
skip
(
"Skipping test on vllm V1"
)
input_batch
=
"""{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/
e5-mistral-7b-instruct
", "input": "You are a helpful assistant."}}"""
# noqa: E501
input_batch
=
"""{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/
multilingual-e5-small
", "input": "You are a helpful assistant."}}"""
# noqa: E501
#base_url = "0.0.0.0"
#base_url = "0.0.0.0"
base_url
=
"localhost"
base_url
=
"localhost"
...
@@ -302,7 +310,7 @@ def test_metrics_exist_run_batch(use_v1: bool):
...
@@ -302,7 +310,7 @@ def test_metrics_exist_run_batch(use_v1: bool):
"-o"
,
"-o"
,
output_file
.
name
,
output_file
.
name
,
"--model"
,
"--model"
,
os
.
path
.
join
(
models_path_prefix
,
"intfloat/
e5-mistral-7b-instruct
"
),
os
.
path
.
join
(
models_path_prefix
,
"intfloat/
multilingual-e5-small
"
),
"--enable-metrics"
,
"--enable-metrics"
,
"--url"
,
"--url"
,
base_url
,
base_url
,
...
...
tests/entrypoints/openai/test_pooling.py
View file @
469e903b
...
@@ -84,7 +84,7 @@ async def test_single_pooling(server: RemoteOpenAIServer, model_name: str):
...
@@ -84,7 +84,7 @@ async def test_single_pooling(server: RemoteOpenAIServer, model_name: str):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
async
def
test_batch_pooling
(
server
:
RemoteOpenAIServer
,
model_name
:
str
):
async
def
test_batch_pooling
(
server
:
RemoteOpenAIServer
,
model_name
:
str
):
# test
L
ist[str]
# test
l
ist[str]
input_texts
=
[
input_texts
=
[
"The cat sat on the mat."
,
"A feline was resting on a rug."
,
"The cat sat on the mat."
,
"A feline was resting on a rug."
,
"Stars twinkle brightly in the night sky."
"Stars twinkle brightly in the night sky."
...
@@ -107,7 +107,7 @@ async def test_batch_pooling(server: RemoteOpenAIServer, model_name: str):
...
@@ -107,7 +107,7 @@ async def test_batch_pooling(server: RemoteOpenAIServer, model_name: str):
assert
poolings
.
usage
.
prompt_tokens
==
25
assert
poolings
.
usage
.
prompt_tokens
==
25
assert
poolings
.
usage
.
total_tokens
==
25
assert
poolings
.
usage
.
total_tokens
==
25
# test
L
ist[
L
ist[int]]
# test
l
ist[
l
ist[int]]
input_tokens
=
[[
4
,
5
,
7
,
9
,
20
],
[
15
,
29
,
499
],
[
24
,
24
,
24
,
24
,
24
],
input_tokens
=
[[
4
,
5
,
7
,
9
,
20
],
[
15
,
29
,
499
],
[
24
,
24
,
24
,
24
,
24
],
[
25
,
32
,
64
,
77
]]
[
25
,
32
,
64
,
77
]]
response
=
requests
.
post
(
response
=
requests
.
post
(
...
...
tests/entrypoints/openai/test_rerank.py
View file @
469e903b
...
@@ -8,17 +8,17 @@ from vllm.entrypoints.openai.protocol import RerankResponse
...
@@ -8,17 +8,17 @@ from vllm.entrypoints.openai.protocol import RerankResponse
from
...utils
import
RemoteOpenAIServer
from
...utils
import
RemoteOpenAIServer
MODEL_NAME
=
"BAAI/bge-reranker-base"
MODEL_NAME
=
"BAAI/bge-reranker-base"
DTYPE
=
"bfloat16"
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
def
server
():
args
=
[
"--enforce-eager"
,
"--max-model-len"
,
"100"
]
args
=
[
"--enforce-eager"
,
"--max-model-len"
,
"100"
,
"--dtype"
,
DTYPE
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
yield
remote_server
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
def
test_rerank_texts
(
server
:
RemoteOpenAIServer
,
model_name
:
str
):
def
test_rerank_texts
(
server
:
RemoteOpenAIServer
,
model_name
:
str
):
query
=
"What is the capital of France?"
query
=
"What is the capital of France?"
...
@@ -42,7 +42,6 @@ def test_rerank_texts(server: RemoteOpenAIServer, model_name: str):
...
@@ -42,7 +42,6 @@ def test_rerank_texts(server: RemoteOpenAIServer, model_name: str):
assert
rerank
.
results
[
1
].
relevance_score
<=
0.01
assert
rerank
.
results
[
1
].
relevance_score
<=
0.01
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
def
test_top_n
(
server
:
RemoteOpenAIServer
,
model_name
:
str
):
def
test_top_n
(
server
:
RemoteOpenAIServer
,
model_name
:
str
):
query
=
"What is the capital of France?"
query
=
"What is the capital of France?"
...
@@ -68,7 +67,6 @@ def test_top_n(server: RemoteOpenAIServer, model_name: str):
...
@@ -68,7 +67,6 @@ def test_top_n(server: RemoteOpenAIServer, model_name: str):
assert
rerank
.
results
[
1
].
relevance_score
<=
0.01
assert
rerank
.
results
[
1
].
relevance_score
<=
0.01
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
def
test_rerank_max_model_len
(
server
:
RemoteOpenAIServer
,
model_name
:
str
):
def
test_rerank_max_model_len
(
server
:
RemoteOpenAIServer
,
model_name
:
str
):
...
...
tests/entrypoints/openai/test_return_tokens_as_ids.py
View file @
469e903b
...
@@ -17,18 +17,28 @@ from .test_completion import MODEL_NAME
...
@@ -17,18 +17,28 @@ from .test_completion import MODEL_NAME
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
server_with_return_tokens_as_token_ids_flag
(
def
server_fixture
(
request
,
default_server_args
):
# noqa: F811
default_server_args
):
# noqa: F811
use_server_flag
=
request
.
param
args_with_flag
=
default_server_args
+
[
"--return-tokens-as-token-ids"
]
if
use_server_flag
:
with
RemoteOpenAIServer
(
MODEL_NAME
,
args_with_flag
)
as
remote_server
:
args_with_flag
=
default_server_args
+
[
"--return-tokens-as-token-ids"
]
yield
remote_server
with
RemoteOpenAIServer
(
MODEL_NAME
,
args_with_flag
)
as
remote_server
:
yield
(
remote_server
,
True
)
else
:
with
RemoteOpenAIServer
(
MODEL_NAME
,
default_server_args
)
as
remote_server
:
yield
(
remote_server
,
False
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"server_fixture"
,
[
True
,
False
],
indirect
=
True
)
async
def
test_completion_return_tokens_as_token_ids_completion
(
async
def
test_completion_return_tokens_as_token_ids_completion
(
server_with_return_tokens_as_token_ids_flag
):
server_fixture
):
async
with
server_with_return_tokens_as_token_ids_flag
.
get_async_client
(
server
,
use_server_flag
=
server_fixture
)
as
client
:
request_args
=
{}
if
not
use_server_flag
:
request_args
[
"return_tokens_as_token_ids"
]
=
True
async
with
server
.
get_async_client
()
as
client
:
completion
=
await
client
.
completions
.
create
(
completion
=
await
client
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
MODEL_NAME
,
...
@@ -39,7 +49,8 @@ async def test_completion_return_tokens_as_token_ids_completion(
...
@@ -39,7 +49,8 @@ async def test_completion_return_tokens_as_token_ids_completion(
echo
=
True
,
echo
=
True
,
temperature
=
0
,
temperature
=
0
,
max_tokens
=
10
,
max_tokens
=
10
,
logprobs
=
1
)
logprobs
=
1
,
extra_body
=
request_args
)
text
=
completion
.
choices
[
0
].
text
text
=
completion
.
choices
[
0
].
text
token_strs
=
completion
.
choices
[
0
].
logprobs
.
tokens
token_strs
=
completion
.
choices
[
0
].
logprobs
.
tokens
...
@@ -60,10 +71,14 @@ async def test_completion_return_tokens_as_token_ids_completion(
...
@@ -60,10 +71,14 @@ async def test_completion_return_tokens_as_token_ids_completion(
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_chat_return_tokens_as_token_ids_completion
(
@
pytest
.
mark
.
parametrize
(
"server_fixture"
,
[
True
,
False
],
indirect
=
True
)
server_with_return_tokens_as_token_ids_flag
):
async
def
test_chat_return_tokens_as_token_ids_completion
(
server_fixture
):
async
with
server_with_return_tokens_as_token_ids_flag
.
get_async_client
(
server
,
use_server_flag
=
server_fixture
)
as
client
:
request_args
=
{}
if
not
use_server_flag
:
request_args
[
"return_tokens_as_token_ids"
]
=
True
async
with
server
.
get_async_client
()
as
client
:
response
=
await
client
.
chat
.
completions
.
create
(
response
=
await
client
.
chat
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
MODEL_NAME
,
# Include Unicode characters to test for dividing a single
# Include Unicode characters to test for dividing a single
...
@@ -78,7 +93,8 @@ async def test_chat_return_tokens_as_token_ids_completion(
...
@@ -78,7 +93,8 @@ async def test_chat_return_tokens_as_token_ids_completion(
}],
}],
temperature
=
0
,
temperature
=
0
,
max_tokens
=
8
,
max_tokens
=
8
,
logprobs
=
True
)
logprobs
=
True
,
extra_body
=
request_args
)
text
=
response
.
choices
[
0
].
message
.
content
text
=
response
.
choices
[
0
].
message
.
content
tokenizer
=
get_tokenizer
(
tokenizer_name
=
MODEL_NAME
)
tokenizer
=
get_tokenizer
(
tokenizer_name
=
MODEL_NAME
)
...
...
Prev
1
…
14
15
16
17
18
19
20
21
22
…
27
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment