Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
99324e25
Commit
99324e25
authored
Jul 12, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.9.2' into v0.9.2-ori
parents
cc7f22a8
a5dd03c1
Changes
475
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
662 additions
and
129 deletions
+662
-129
tests/distributed/test_sequence_parallel.py
tests/distributed/test_sequence_parallel.py
+52
-56
tests/engine/test_arg_utils.py
tests/engine/test_arg_utils.py
+50
-10
tests/engine/test_options.py
tests/engine/test_options.py
+0
-3
tests/entrypoints/llm/test_encode.py
tests/entrypoints/llm/test_encode.py
+20
-4
tests/entrypoints/llm/test_generate.py
tests/entrypoints/llm/test_generate.py
+4
-1
tests/entrypoints/openai/correctness/test_mteb_embed.py
tests/entrypoints/openai/correctness/test_mteb_embed.py
+6
-10
tests/entrypoints/openai/correctness/test_mteb_score.py
tests/entrypoints/openai/correctness/test_mteb_score.py
+62
-0
tests/entrypoints/openai/test_chat_template.py
tests/entrypoints/openai/test_chat_template.py
+2
-2
tests/entrypoints/openai/test_completion.py
tests/entrypoints/openai/test_completion.py
+54
-0
tests/entrypoints/openai/test_embedding.py
tests/entrypoints/openai/test_embedding.py
+8
-0
tests/entrypoints/openai/test_optional_middleware.py
tests/entrypoints/openai/test_optional_middleware.py
+116
-0
tests/entrypoints/openai/test_pooling.py
tests/entrypoints/openai/test_pooling.py
+11
-4
tests/entrypoints/openai/test_rerank.py
tests/entrypoints/openai/test_rerank.py
+8
-0
tests/entrypoints/openai/test_score.py
tests/entrypoints/openai/test_score.py
+9
-0
tests/entrypoints/openai/test_serving_chat.py
tests/entrypoints/openai/test_serving_chat.py
+3
-2
tests/entrypoints/openai/test_transcription_validation.py
tests/entrypoints/openai/test_transcription_validation.py
+53
-24
tests/entrypoints/openai/test_translation_validation.py
tests/entrypoints/openai/test_translation_validation.py
+172
-0
tests/entrypoints/openai/test_video.py
tests/entrypoints/openai/test_video.py
+1
-1
tests/entrypoints/openai/test_vision.py
tests/entrypoints/openai/test_vision.py
+27
-4
tests/entrypoints/test_chat_utils.py
tests/entrypoints/test_chat_utils.py
+4
-8
No files found.
Too many changes to show.
To preserve performance only
475 of 475+
files are displayed.
Plain diff
Email patch
tests/distributed/test_sequence_parallel.py
View file @
99324e25
...
@@ -28,7 +28,7 @@ VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
...
@@ -28,7 +28,7 @@ VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
class
ParallelSetup
(
NamedTuple
):
class
ParallelSetup
(
NamedTuple
):
tp_size
:
int
tp_size
:
int
pp_size
:
int
pp_size
:
int
sp_
enable
d
:
bool
enable
_fusion
:
bool
eager_mode
:
bool
eager_mode
:
bool
chunked_prefill
:
bool
chunked_prefill
:
bool
...
@@ -67,49 +67,18 @@ class SPTestSettings:
...
@@ -67,49 +67,18 @@ class SPTestSettings:
task
:
TaskOption
=
"auto"
,
task
:
TaskOption
=
"auto"
,
load_format
:
Optional
[
str
]
=
None
,
load_format
:
Optional
[
str
]
=
None
,
):
):
parallel_setups
=
[]
for
eager_mode_val
in
[
False
,
True
]:
for
pp_multiplier
in
[
1
,
2
]:
for
chunked_prefill_val
in
[
False
,
True
]:
parallel_setups
.
append
(
ParallelSetup
(
tp_size
=
tp_base
,
pp_size
=
pp_multiplier
*
pp_base
,
enable_fusion
=
False
,
eager_mode
=
eager_mode_val
,
chunked_prefill
=
chunked_prefill_val
))
return
SPTestSettings
(
return
SPTestSettings
(
parallel_setups
=
[
parallel_setups
=
parallel_setups
,
ParallelSetup
(
tp_size
=
tp_base
,
pp_size
=
pp_base
,
sp_enabled
=
True
,
eager_mode
=
False
,
chunked_prefill
=
False
),
ParallelSetup
(
tp_size
=
tp_base
,
pp_size
=
pp_base
,
sp_enabled
=
True
,
eager_mode
=
False
,
chunked_prefill
=
True
),
ParallelSetup
(
tp_size
=
tp_base
,
pp_size
=
pp_base
,
sp_enabled
=
True
,
eager_mode
=
True
,
chunked_prefill
=
False
),
ParallelSetup
(
tp_size
=
tp_base
,
pp_size
=
pp_base
,
sp_enabled
=
True
,
eager_mode
=
True
,
chunked_prefill
=
True
),
ParallelSetup
(
tp_size
=
tp_base
,
pp_size
=
2
*
pp_base
,
sp_enabled
=
True
,
eager_mode
=
False
,
chunked_prefill
=
False
),
ParallelSetup
(
tp_size
=
tp_base
,
pp_size
=
2
*
pp_base
,
sp_enabled
=
True
,
eager_mode
=
False
,
chunked_prefill
=
True
),
ParallelSetup
(
tp_size
=
tp_base
,
pp_size
=
2
*
pp_base
,
sp_enabled
=
True
,
eager_mode
=
True
,
chunked_prefill
=
False
),
ParallelSetup
(
tp_size
=
tp_base
,
pp_size
=
2
*
pp_base
,
sp_enabled
=
True
,
eager_mode
=
True
,
chunked_prefill
=
True
)
],
distributed_backends
=
[
"mp"
,
"ray"
],
distributed_backends
=
[
"mp"
,
"ray"
],
vllm_major_versions
=
[
"1"
,
"1"
],
vllm_major_versions
=
[
"1"
,
"1"
],
task
=
task
,
task
=
task
,
...
@@ -126,19 +95,44 @@ class SPTestSettings:
...
@@ -126,19 +95,44 @@ class SPTestSettings:
multi_node_only
:
bool
=
False
,
multi_node_only
:
bool
=
False
,
load_format
:
Optional
[
str
]
=
None
,
load_format
:
Optional
[
str
]
=
None
,
):
):
parallel_setups
=
[]
for
eager_mode_val
in
[
False
,
True
]:
for
pp_multiplier
in
[
1
,
2
]:
for
chunked_prefill_val
in
[
False
,
True
]:
parallel_setups
.
append
(
ParallelSetup
(
tp_size
=
tp_base
,
pp_size
=
pp_multiplier
*
pp_base
,
enable_fusion
=
False
,
eager_mode
=
eager_mode_val
,
chunked_prefill
=
chunked_prefill_val
))
return
SPTestSettings
(
return
SPTestSettings
(
parallel_setups
=
[
parallel_setups
=
parallel_setups
,
distributed_backends
=
[
"mp"
,
"ray"
],
vllm_major_versions
=
[
"1"
,
"1"
],
task
=
task
,
test_options
=
SPTestOptions
(
multi_node_only
=
multi_node_only
,
load_format
=
load_format
),
)
@
staticmethod
def
fp8_quant
(
*
,
tp_base
:
int
=
2
,
pp_base
:
int
=
1
,
task
:
TaskOption
=
"auto"
,
multi_node_only
:
bool
=
False
,
load_format
:
Optional
[
str
]
=
None
,
):
parallel_setups
=
[]
for
fusion_val
in
[
False
,
True
]:
parallel_setups
.
append
(
ParallelSetup
(
tp_size
=
tp_base
,
ParallelSetup
(
tp_size
=
tp_base
,
pp_size
=
pp_base
,
pp_size
=
pp_base
,
sp_enabled
=
True
,
enable_fusion
=
fusion_val
,
eager_mode
=
False
,
eager_mode
=
True
,
chunked_prefill
=
False
),
chunked_prefill
=
False
))
ParallelSetup
(
tp_size
=
tp_base
,
return
SPTestSettings
(
pp_size
=
2
*
pp_base
,
parallel_setups
=
parallel_setups
,
sp_enabled
=
True
,
eager_mode
=
False
,
chunked_prefill
=
False
),
],
distributed_backends
=
[
"mp"
,
"ray"
],
distributed_backends
=
[
"mp"
,
"ray"
],
vllm_major_versions
=
[
"1"
,
"1"
],
vllm_major_versions
=
[
"1"
,
"1"
],
task
=
task
,
task
=
task
,
...
@@ -171,7 +165,7 @@ def _compare_sp(
...
@@ -171,7 +165,7 @@ def _compare_sp(
(
(
tp_size
,
tp_size
,
pp_size
,
pp_size
,
sp_
enable
d
,
enable
_fusion
,
eager_mode
,
eager_mode
,
chunked_prefill
,
chunked_prefill
,
)
=
parallel_setup
)
=
parallel_setup
...
@@ -240,9 +234,9 @@ def _compare_sp(
...
@@ -240,9 +234,9 @@ def _compare_sp(
'compile_sizes'
:
[
4
,
8
],
'compile_sizes'
:
[
4
,
8
],
'splitting_ops'
:
[],
'splitting_ops'
:
[],
'pass_config'
:
{
'pass_config'
:
{
'enable_sequence_parallelism'
:
sp_enabled
,
'enable_sequence_parallelism'
:
True
,
'enable_fusion'
:
enable_fusion
,
'enable_noop'
:
True
,
'enable_noop'
:
True
,
'enable_fusion'
:
True
,
},
},
}
}
...
@@ -291,12 +285,14 @@ def _compare_sp(
...
@@ -291,12 +285,14 @@ def _compare_sp(
SP_TEXT_GENERATION_MODELS
=
{
SP_TEXT_GENERATION_MODELS
=
{
# [Decoder-only]
# [Decoder-only]
"meta-llama/Llama-3.2-1B-Instruct"
:
SPTestSettings
.
fast
(),
"meta-llama/Llama-3.2-1B-Instruct"
:
SPTestSettings
.
fast
(),
"RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"
:
SPTestSettings
.
fp8_quant
(),
}
}
SP_TEST_MODELS
=
[
SP_TEST_MODELS
=
[
# TODO support other models
# TODO support other models
# [LANGUAGE GENERATION]
# [LANGUAGE GENERATION]
"meta-llama/Llama-3.2-1B-Instruct"
,
"meta-llama/Llama-3.2-1B-Instruct"
,
"RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"
]
]
...
...
tests/engine/test_arg_utils.py
View file @
99324e25
...
@@ -231,6 +231,38 @@ def test_limit_mm_per_prompt_parser(arg, expected):
...
@@ -231,6 +231,38 @@ def test_limit_mm_per_prompt_parser(arg, expected):
assert
args
.
limit_mm_per_prompt
==
expected
assert
args
.
limit_mm_per_prompt
==
expected
@
pytest
.
mark
.
parametrize
(
(
"arg"
,
"expected"
),
[
(
None
,
dict
()),
(
'{"video": {"num_frames": 123} }'
,
{
"video"
:
{
"num_frames"
:
123
}
}),
(
'{"video": {"num_frames": 123, "fps": 1.0, "foo": "bar"}, "image": {"foo": "bar"} }'
,
# noqa
{
"video"
:
{
"num_frames"
:
123
,
"fps"
:
1.0
,
"foo"
:
"bar"
},
"image"
:
{
"foo"
:
"bar"
}
}),
])
def
test_media_io_kwargs_parser
(
arg
,
expected
):
parser
=
EngineArgs
.
add_cli_args
(
FlexibleArgumentParser
())
if
arg
is
None
:
args
=
parser
.
parse_args
([])
else
:
args
=
parser
.
parse_args
([
"--media-io-kwargs"
,
arg
])
assert
args
.
media_io_kwargs
==
expected
def
test_compilation_config
():
def
test_compilation_config
():
parser
=
EngineArgs
.
add_cli_args
(
FlexibleArgumentParser
())
parser
=
EngineArgs
.
add_cli_args
(
FlexibleArgumentParser
())
...
@@ -239,32 +271,40 @@ def test_compilation_config():
...
@@ -239,32 +271,40 @@ def test_compilation_config():
assert
args
.
compilation_config
==
CompilationConfig
()
assert
args
.
compilation_config
==
CompilationConfig
()
# set to O3
# set to O3
args
=
parser
.
parse_args
([
"-O
3
"
])
args
=
parser
.
parse_args
([
"-O
0
"
])
assert
args
.
compilation_config
.
level
==
3
assert
args
.
compilation_config
.
level
==
0
# set to O 3 (space)
# set to O 3 (space)
args
=
parser
.
parse_args
([
"-O"
,
"
3
"
])
args
=
parser
.
parse_args
([
"-O"
,
"
1
"
])
assert
args
.
compilation_config
.
level
==
3
assert
args
.
compilation_config
.
level
==
1
# set to O 3 (equals)
# set to O 3 (equals)
args
=
parser
.
parse_args
([
"-O=3"
])
args
=
parser
.
parse_args
([
"-O=2"
])
assert
args
.
compilation_config
.
level
==
2
# set to O.level 3
args
=
parser
.
parse_args
([
"-O.level"
,
"3"
])
assert
args
.
compilation_config
.
level
==
3
assert
args
.
compilation_config
.
level
==
3
# set to string form of a dict
# set to string form of a dict
args
=
parser
.
parse_args
([
args
=
parser
.
parse_args
([
"--compilation-config"
,
"-O"
,
'{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}'
,
'{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
'"use_inductor": false}'
,
])
])
assert
(
args
.
compilation_config
.
level
==
3
and
assert
(
args
.
compilation_config
.
level
==
3
and
args
.
compilation_config
.
cudagraph_capture_sizes
==
[
1
,
2
,
4
,
8
])
args
.
compilation_config
.
cudagraph_capture_sizes
==
[
1
,
2
,
4
,
8
]
and
not
args
.
compilation_config
.
use_inductor
)
# set to string form of a dict
# set to string form of a dict
args
=
parser
.
parse_args
([
args
=
parser
.
parse_args
([
"--compilation-config="
"--compilation-config="
'{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}'
,
'{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
'"use_inductor": true}'
,
])
])
assert
(
args
.
compilation_config
.
level
==
3
and
assert
(
args
.
compilation_config
.
level
==
3
and
args
.
compilation_config
.
cudagraph_capture_sizes
==
[
1
,
2
,
4
,
8
])
args
.
compilation_config
.
cudagraph_capture_sizes
==
[
1
,
2
,
4
,
8
]
and
args
.
compilation_config
.
use_inductor
)
def
test_prefix_cache_default
():
def
test_prefix_cache_default
():
...
...
tests/engine/test_options.py
View file @
99324e25
...
@@ -48,9 +48,6 @@ def test_enable_prompt_embeds(hf_runner, model: str,
...
@@ -48,9 +48,6 @@ def test_enable_prompt_embeds(hf_runner, model: str,
ctx
=
(
nullcontext
()
if
enable_prompt_embeds
else
pytest
.
raises
(
ctx
=
(
nullcontext
()
if
enable_prompt_embeds
else
pytest
.
raises
(
ValueError
,
match
=
"set `--enable-prompt-embeds`"
))
ValueError
,
match
=
"set `--enable-prompt-embeds`"
))
# This test checks if the flag skip_tokenizer_init skips the initialization
# of tokenizer and detokenizer. The generated output is expected to contain
# token ids.
llm
=
LLM
(
llm
=
LLM
(
model
=
model
,
model
=
model
,
enable_prompt_embeds
=
enable_prompt_embeds
,
enable_prompt_embeds
=
enable_prompt_embeds
,
...
...
tests/entrypoints/llm/test_encode.py
View file @
99324e25
...
@@ -8,6 +8,8 @@ import pytest
...
@@ -8,6 +8,8 @@ import pytest
from
vllm
import
LLM
,
PoolingParams
,
PoolingRequestOutput
from
vllm
import
LLM
,
PoolingParams
,
PoolingRequestOutput
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
...models.utils
import
check_embeddings_close
MODEL_NAME
=
"intfloat/multilingual-e5-small"
MODEL_NAME
=
"intfloat/multilingual-e5-small"
PROMPTS
=
[
PROMPTS
=
[
...
@@ -27,6 +29,14 @@ TOKEN_IDS = [
...
@@ -27,6 +29,14 @@ TOKEN_IDS = [
]
]
@
pytest
.
fixture
(
autouse
=
True
)
def
v1
(
run_with_both_engines
):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
llm
():
def
llm
():
# pytest caches the fixture so we use weakref.proxy to
# pytest caches the fixture so we use weakref.proxy to
...
@@ -46,9 +56,15 @@ def llm():
...
@@ -46,9 +56,15 @@ def llm():
cleanup_dist_env_and_memory
()
cleanup_dist_env_and_memory
()
def
assert_outputs_
equal
(
o1
:
list
[
PoolingRequestOutput
],
def
assert_outputs_
match
(
o1
:
list
[
PoolingRequestOutput
],
o2
:
list
[
PoolingRequestOutput
]):
o2
:
list
[
PoolingRequestOutput
]):
assert
[
o
.
outputs
for
o
in
o1
]
==
[
o
.
outputs
for
o
in
o2
]
check_embeddings_close
(
embeddings_0_lst
=
[
o
.
outputs
.
data
for
o
in
o1
],
embeddings_1_lst
=
[
o
.
outputs
.
data
for
o
in
o2
],
name_0
=
"hf"
,
name_1
=
"vllm"
,
tol
=
1e-2
,
)
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
skip_global_cleanup
...
@@ -63,7 +79,7 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
...
@@ -63,7 +79,7 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
v2_output
=
llm
.
encode
({
"prompt_token_ids"
:
prompt_token_ids
},
v2_output
=
llm
.
encode
({
"prompt_token_ids"
:
prompt_token_ids
},
pooling_params
=
pooling_params
)
pooling_params
=
pooling_params
)
assert_outputs_
equal
(
v1_output
,
v2_output
)
assert_outputs_
match
(
v1_output
,
v2_output
)
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
skip_global_cleanup
...
@@ -80,7 +96,7 @@ def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM):
...
@@ -80,7 +96,7 @@ def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM):
}
for
p
in
TOKEN_IDS
],
}
for
p
in
TOKEN_IDS
],
pooling_params
=
pooling_params
,
pooling_params
=
pooling_params
,
)
)
assert_outputs_
equal
(
v1_output
,
v2_output
)
assert_outputs_
match
(
v1_output
,
v2_output
)
@
pytest
.
mark
.
skip_global_cleanup
@
pytest
.
mark
.
skip_global_cleanup
...
...
tests/entrypoints/llm/test_generate.py
View file @
99324e25
...
@@ -125,4 +125,7 @@ def test_max_model_len():
...
@@ -125,4 +125,7 @@ def test_max_model_len():
for
output
in
outputs
:
for
output
in
outputs
:
num_total_tokens
=
len
(
output
.
prompt_token_ids
)
+
len
(
num_total_tokens
=
len
(
output
.
prompt_token_ids
)
+
len
(
output
.
outputs
[
0
].
token_ids
)
output
.
outputs
[
0
].
token_ids
)
assert
num_total_tokens
==
max_model_len
# Total tokens must not exceed max_model_len.
# It can be less if generation finishes due to other reasons (e.g., EOS)
# before reaching the absolute model length limit.
assert
num_total_tokens
<=
max_model_len
tests/entrypoints/openai/correctness/test_mteb.py
→
tests/entrypoints/openai/correctness/test_mteb
_embed
.py
View file @
99324e25
...
@@ -7,34 +7,30 @@ import pytest
...
@@ -7,34 +7,30 @@ import pytest
from
tests.models.language.pooling.mteb_utils
import
(
MTEB_EMBED_TASKS
,
from
tests.models.language.pooling.mteb_utils
import
(
MTEB_EMBED_TASKS
,
MTEB_EMBED_TOL
,
MTEB_EMBED_TOL
,
OpenAIClientMtebEncoder
,
OpenAIClientMtebEncoder
,
run_mteb_embed_task
,
run_mteb_embed_task
)
run_mteb_embed_task_st
)
from
tests.utils
import
RemoteOpenAIServer
from
tests.utils
import
RemoteOpenAIServer
os
.
environ
[
"VLLM_LOGGING_LEVEL"
]
=
"WARNING"
os
.
environ
[
"VLLM_LOGGING_LEVEL"
]
=
"WARNING"
MODEL_NAME
=
"BAAI/bge-m3"
MODEL_NAME
=
"intfloat/e5-small"
DTYPE
=
"float16"
MAIN_SCORE
=
0.7422994752439667
MAIN_SCORE
=
0.7873427091972599
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
def
server
():
args
=
[
args
=
[
"--task"
,
"embed"
,
"--dtype"
,
DTYPE
,
"--enforce-eager"
,
"--task"
,
"embed"
,
"--enforce-eager"
,
"--disable-uvicorn-access-log"
"--max-model-len"
,
"512"
]
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
yield
remote_server
def
test_mteb
(
server
):
def
test_mteb
_embed
(
server
):
client
=
server
.
get_client
()
client
=
server
.
get_client
()
encoder
=
OpenAIClientMtebEncoder
(
MODEL_NAME
,
client
)
encoder
=
OpenAIClientMtebEncoder
(
MODEL_NAME
,
client
)
vllm_main_score
=
run_mteb_embed_task
(
encoder
,
MTEB_EMBED_TASKS
)
vllm_main_score
=
run_mteb_embed_task
(
encoder
,
MTEB_EMBED_TASKS
)
st_main_score
=
MAIN_SCORE
or
run_mteb_embed_task_st
(
st_main_score
=
MAIN_SCORE
MODEL_NAME
,
MTEB_EMBED_TASKS
)
print
(
"VLLM main score: "
,
vllm_main_score
)
print
(
"VLLM main score: "
,
vllm_main_score
)
print
(
"SentenceTransformer main score: "
,
st_main_score
)
print
(
"SentenceTransformer main score: "
,
st_main_score
)
...
...
tests/entrypoints/openai/correctness/test_mteb_score.py
0 → 100644
View file @
99324e25
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
pytest
# yapf conflicts with isort for this block
# yapf: disable
from
tests.models.language.pooling.mteb_utils
import
(
MTEB_RERANK_LANGS
,
MTEB_RERANK_TASKS
,
MTEB_RERANK_TOL
,
RerankClientMtebEncoder
,
ScoreClientMtebEncoder
,
mteb_test_rerank_models_hf
,
run_mteb_rerank
)
# yapf: enable
from
tests.utils
import
RemoteOpenAIServer
os
.
environ
[
"VLLM_LOGGING_LEVEL"
]
=
"WARNING"
MODEL_NAME
=
"cross-encoder/ms-marco-MiniLM-L-6-v2"
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
args
=
[
"--task"
,
"score"
,
"--enforce-eager"
,
"--disable-uvicorn-access-log"
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
@
pytest
.
fixture
(
scope
=
"module"
)
def
st_main_score
(
hf_runner
):
# The main score related to the version of the dependency.
# So we need to recalculate every time.
main_score
,
st_dtype
=
mteb_test_rerank_models_hf
(
hf_runner
,
MODEL_NAME
)
return
main_score
def
test_mteb_score
(
server
,
st_main_score
):
url
=
server
.
url_for
(
"score"
)
encoder
=
ScoreClientMtebEncoder
(
MODEL_NAME
,
url
)
vllm_main_score
=
run_mteb_rerank
(
encoder
,
MTEB_RERANK_TASKS
,
MTEB_RERANK_LANGS
)
print
(
"VLLM main score: "
,
vllm_main_score
)
print
(
"SentenceTransformer main score: "
,
st_main_score
)
print
(
"Difference: "
,
st_main_score
-
vllm_main_score
)
assert
st_main_score
==
pytest
.
approx
(
vllm_main_score
,
abs
=
MTEB_RERANK_TOL
)
def
test_mteb_rerank
(
server
,
st_main_score
):
url
=
server
.
url_for
(
"rerank"
)
encoder
=
RerankClientMtebEncoder
(
MODEL_NAME
,
url
)
vllm_main_score
=
run_mteb_rerank
(
encoder
,
MTEB_RERANK_TASKS
,
MTEB_RERANK_LANGS
)
print
(
"VLLM main score: "
,
vllm_main_score
)
print
(
"SentenceTransformer main score: "
,
st_main_score
)
print
(
"Difference: "
,
st_main_score
-
vllm_main_score
)
assert
st_main_score
==
pytest
.
approx
(
vllm_main_score
,
abs
=
MTEB_RERANK_TOL
)
tests/entrypoints/openai/test_chat_template.py
View file @
99324e25
...
@@ -16,7 +16,7 @@ chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
...
@@ -16,7 +16,7 @@ chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
assert
chatml_jinja_path
.
exists
()
assert
chatml_jinja_path
.
exists
()
# Define models, templates, and their corresponding expected outputs
# Define models, templates, and their corresponding expected outputs
MODEL_TEMPLATE_GENERATON_OUTPUT
=
[
MODEL_TEMPLATE_GENERAT
I
ON_OUTPUT
=
[
(
"facebook/opt-125m"
,
chatml_jinja_path
,
True
,
False
,
"""<|im_start|>user
(
"facebook/opt-125m"
,
chatml_jinja_path
,
True
,
False
,
"""<|im_start|>user
Hello<|im_end|>
Hello<|im_end|>
<|im_start|>assistant
<|im_start|>assistant
...
@@ -91,7 +91,7 @@ def test_no_load_chat_template_literallike():
...
@@ -91,7 +91,7 @@ def test_no_load_chat_template_literallike():
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"model,template,add_generation_prompt,continue_final_message,expected_output"
,
"model,template,add_generation_prompt,continue_final_message,expected_output"
,
MODEL_TEMPLATE_GENERATON_OUTPUT
)
MODEL_TEMPLATE_GENERAT
I
ON_OUTPUT
)
def
test_get_gen_prompt
(
model
,
template
,
add_generation_prompt
,
def
test_get_gen_prompt
(
model
,
template
,
add_generation_prompt
,
continue_final_message
,
expected_output
):
continue_final_message
,
expected_output
):
model_info
=
HF_EXAMPLE_MODELS
.
find_hf_info
(
model
)
model_info
=
HF_EXAMPLE_MODELS
.
find_hf_info
(
model
)
...
...
tests/entrypoints/openai/test_completion.py
View file @
99324e25
...
@@ -779,3 +779,57 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
...
@@ -779,3 +779,57 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
prompt
=
"Give an example string that fits this regex"
,
prompt
=
"Give an example string that fits this regex"
,
extra_body
=
dict
(
guided_regex
=
sample_regex
,
extra_body
=
dict
(
guided_regex
=
sample_regex
,
guided_json
=
sample_json_schema
))
guided_json
=
sample_json_schema
))
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name,stream,echo"
,
[
(
MODEL_NAME
,
False
,
False
),
(
MODEL_NAME
,
False
,
True
),
(
MODEL_NAME
,
True
,
False
),
(
MODEL_NAME
,
True
,
True
)
# should not raise BadRequestError error
],
)
async
def
test_echo_stream_completion
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
stream
:
bool
,
echo
:
bool
):
saying
:
str
=
"Hello, my name is"
result
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
saying
,
max_tokens
=
10
,
temperature
=
0.0
,
echo
=
echo
,
stream
=
stream
)
stop_reason
=
"length"
if
not
stream
:
completion
=
result
assert
completion
.
id
is
not
None
assert
completion
.
choices
is
not
None
and
len
(
completion
.
choices
)
==
1
choice
=
completion
.
choices
[
0
]
assert
len
(
choice
.
text
)
>=
5
assert
choice
.
finish_reason
==
stop_reason
if
echo
:
assert
choice
.
text
is
not
None
and
saying
in
choice
.
text
else
:
assert
choice
.
text
is
not
None
and
saying
not
in
choice
.
text
else
:
chunks
:
list
[
str
]
=
[]
final_finish_reason
=
None
async
for
chunk
in
result
:
if
chunk
.
choices
and
chunk
.
choices
[
0
].
text
:
chunks
.
append
(
chunk
.
choices
[
0
].
text
)
if
chunk
.
choices
and
chunk
.
choices
[
0
].
finish_reason
:
final_finish_reason
=
chunk
.
choices
[
0
].
finish_reason
assert
final_finish_reason
==
stop_reason
content
=
""
.
join
(
chunks
)
if
echo
:
assert
content
is
not
None
and
saying
in
content
else
:
assert
content
is
not
None
and
saying
not
in
content
tests/entrypoints/openai/test_embedding.py
View file @
99324e25
...
@@ -21,6 +21,14 @@ DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' +
...
@@ -21,6 +21,14 @@ DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' +
DTYPE
=
"bfloat16"
DTYPE
=
"bfloat16"
@
pytest
.
fixture
(
autouse
=
True
)
def
v1
(
run_with_both_engines
):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
def
server
():
args
=
[
args
=
[
...
...
tests/entrypoints/openai/test_optional_middleware.py
0 → 100644
View file @
99324e25
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Tests for middleware that's off by default and can be toggled through
server arguments, mainly --api-key and --enable-request-id-headers.
"""
from
http
import
HTTPStatus
import
pytest
import
requests
from
...utils
import
RemoteOpenAIServer
# Use a small embeddings model for faster startup and smaller memory footprint.
# Since we are not testing any chat functionality,
# using a chat capable model is overkill.
MODEL_NAME
=
"intfloat/multilingual-e5-small"
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
(
request
:
pytest
.
FixtureRequest
):
passed_params
=
[]
if
hasattr
(
request
,
"param"
):
passed_params
=
request
.
param
if
isinstance
(
passed_params
,
str
):
passed_params
=
[
passed_params
]
args
=
[
"--task"
,
"embed"
,
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"float16"
,
"--max-model-len"
,
"512"
,
"--enforce-eager"
,
"--max-num-seqs"
,
"2"
,
*
passed_params
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
yield
remote_server
@
pytest
.
mark
.
asyncio
async
def
test_no_api_token
(
server
:
RemoteOpenAIServer
):
response
=
requests
.
get
(
server
.
url_for
(
"v1/models"
))
assert
response
.
status_code
==
HTTPStatus
.
OK
@
pytest
.
mark
.
asyncio
async
def
test_no_request_id_header
(
server
:
RemoteOpenAIServer
):
response
=
requests
.
get
(
server
.
url_for
(
"health"
))
assert
"X-Request-Id"
not
in
response
.
headers
@
pytest
.
mark
.
parametrize
(
"server"
,
[[
"--api-key"
,
"test"
]],
indirect
=
True
,
)
@
pytest
.
mark
.
asyncio
async
def
test_missing_api_token
(
server
:
RemoteOpenAIServer
):
response
=
requests
.
get
(
server
.
url_for
(
"v1/models"
))
assert
response
.
status_code
==
HTTPStatus
.
UNAUTHORIZED
@
pytest
.
mark
.
parametrize
(
"server"
,
[[
"--api-key"
,
"test"
]],
indirect
=
True
,
)
@
pytest
.
mark
.
asyncio
async
def
test_passed_api_token
(
server
:
RemoteOpenAIServer
):
response
=
requests
.
get
(
server
.
url_for
(
"v1/models"
),
headers
=
{
"Authorization"
:
"Bearer test"
})
assert
response
.
status_code
==
HTTPStatus
.
OK
@
pytest
.
mark
.
parametrize
(
"server"
,
[[
"--api-key"
,
"test"
]],
indirect
=
True
,
)
@
pytest
.
mark
.
asyncio
async
def
test_not_v1_api_token
(
server
:
RemoteOpenAIServer
):
# Authorization check is skipped for any paths that
# don't start with /v1 (e.g. /v1/chat/completions).
response
=
requests
.
get
(
server
.
url_for
(
"health"
))
assert
response
.
status_code
==
HTTPStatus
.
OK
@
pytest
.
mark
.
parametrize
(
"server"
,
[
"--enable-request-id-headers"
],
indirect
=
True
,
)
@
pytest
.
mark
.
asyncio
async
def
test_enable_request_id_header
(
server
:
RemoteOpenAIServer
):
response
=
requests
.
get
(
server
.
url_for
(
"health"
))
assert
"X-Request-Id"
in
response
.
headers
assert
len
(
response
.
headers
.
get
(
"X-Request-Id"
,
""
))
==
32
@
pytest
.
mark
.
parametrize
(
"server"
,
[
"--enable-request-id-headers"
],
indirect
=
True
,
)
@
pytest
.
mark
.
asyncio
async
def
test_custom_request_id_header
(
server
:
RemoteOpenAIServer
):
response
=
requests
.
get
(
server
.
url_for
(
"health"
),
headers
=
{
"X-Request-Id"
:
"Custom"
})
assert
"X-Request-Id"
in
response
.
headers
assert
response
.
headers
.
get
(
"X-Request-Id"
)
==
"Custom"
tests/entrypoints/openai/test_pooling.py
View file @
99324e25
...
@@ -7,6 +7,7 @@ import numpy as np
...
@@ -7,6 +7,7 @@ import numpy as np
import
pytest
import
pytest
import
requests
import
requests
from
tests.models.utils
import
check_embeddings_close
from
vllm.entrypoints.openai.protocol
import
PoolingResponse
from
vllm.entrypoints.openai.protocol
import
PoolingResponse
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
...
@@ -223,8 +224,11 @@ async def test_batch_base64_pooling(server: RemoteOpenAIServer,
...
@@ -223,8 +224,11 @@ async def test_batch_base64_pooling(server: RemoteOpenAIServer,
np
.
frombuffer
(
base64
.
b64decode
(
data
.
data
),
np
.
frombuffer
(
base64
.
b64decode
(
data
.
data
),
dtype
=
"float32"
).
tolist
())
dtype
=
"float32"
).
tolist
())
assert
responses_float
.
data
[
0
].
data
==
decoded_responses_base64_data
[
0
]
check_embeddings_close
(
assert
responses_float
.
data
[
1
].
data
==
decoded_responses_base64_data
[
1
]
embeddings_0_lst
=
[
d
.
data
for
d
in
responses_float
.
data
],
embeddings_1_lst
=
decoded_responses_base64_data
,
name_0
=
"float32"
,
name_1
=
"base64"
)
# Default response is float32 decoded from base64 by OpenAI Client
# Default response is float32 decoded from base64 by OpenAI Client
default_response
=
requests
.
post
(
default_response
=
requests
.
post
(
...
@@ -237,5 +241,8 @@ async def test_batch_base64_pooling(server: RemoteOpenAIServer,
...
@@ -237,5 +241,8 @@ async def test_batch_base64_pooling(server: RemoteOpenAIServer,
default_response
.
raise_for_status
()
default_response
.
raise_for_status
()
responses_default
=
PoolingResponse
.
model_validate
(
default_response
.
json
())
responses_default
=
PoolingResponse
.
model_validate
(
default_response
.
json
())
assert
responses_float
.
data
[
0
].
data
==
responses_default
.
data
[
0
].
data
check_embeddings_close
(
assert
responses_float
.
data
[
1
].
data
==
responses_default
.
data
[
1
].
data
embeddings_0_lst
=
[
d
.
data
for
d
in
responses_default
.
data
],
embeddings_1_lst
=
[
d
.
data
for
d
in
responses_default
.
data
],
name_0
=
"float32"
,
name_1
=
"base64"
)
tests/entrypoints/openai/test_rerank.py
View file @
99324e25
...
@@ -12,6 +12,14 @@ MODEL_NAME = "BAAI/bge-reranker-base"
...
@@ -12,6 +12,14 @@ MODEL_NAME = "BAAI/bge-reranker-base"
DTYPE
=
"bfloat16"
DTYPE
=
"bfloat16"
@
pytest
.
fixture
(
autouse
=
True
)
def
v1
(
run_with_both_engines
):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
def
server
():
args
=
[
"--enforce-eager"
,
"--max-model-len"
,
"100"
,
"--dtype"
,
DTYPE
]
args
=
[
"--enforce-eager"
,
"--max-model-len"
,
"100"
,
"--dtype"
,
DTYPE
]
...
...
tests/entrypoints/openai/test_score.py
View file @
99324e25
...
@@ -11,6 +11,15 @@ from vllm.entrypoints.openai.protocol import ScoreResponse
...
@@ -11,6 +11,15 @@ from vllm.entrypoints.openai.protocol import ScoreResponse
from
...utils
import
RemoteOpenAIServer
from
...utils
import
RemoteOpenAIServer
@
pytest
.
fixture
(
autouse
=
True
)
def
v1
(
run_with_both_engines
):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
MODELS
=
[
MODELS
=
[
{
{
"name"
:
"BAAI/bge-reranker-v2-m3"
,
"name"
:
"BAAI/bge-reranker-v2-m3"
,
...
...
tests/entrypoints/openai/test_serving_chat.py
View file @
99324e25
...
@@ -3,8 +3,8 @@
...
@@ -3,8 +3,8 @@
import
asyncio
import
asyncio
from
contextlib
import
suppress
from
contextlib
import
suppress
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
,
field
from
typing
import
Optional
from
typing
import
Any
,
Optional
from
unittest.mock
import
MagicMock
from
unittest.mock
import
MagicMock
from
vllm.config
import
MultiModalConfig
from
vllm.config
import
MultiModalConfig
...
@@ -40,6 +40,7 @@ class MockModelConfig:
...
@@ -40,6 +40,7 @@ class MockModelConfig:
allowed_local_media_path
:
str
=
""
allowed_local_media_path
:
str
=
""
encoder_config
=
None
encoder_config
=
None
generation_config
:
str
=
"auto"
generation_config
:
str
=
"auto"
media_io_kwargs
:
dict
[
str
,
dict
[
str
,
Any
]]
=
field
(
default_factory
=
dict
)
def
get_diff_sampling_param
(
self
):
def
get_diff_sampling_param
(
self
):
return
self
.
diff_sampling_param
or
{}
return
self
.
diff_sampling_param
or
{}
...
...
tests/entrypoints/openai/test_transcription_validation.py
View file @
99324e25
...
@@ -37,7 +37,6 @@ async def test_basic_audio(mary_had_lamb):
...
@@ -37,7 +37,6 @@ async def test_basic_audio(mary_had_lamb):
model_name
=
"openai/whisper-large-v3-turbo"
model_name
=
"openai/whisper-large-v3-turbo"
server_args
=
[
"--enforce-eager"
]
server_args
=
[
"--enforce-eager"
]
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
prompt
=
"THE FIRST WORDS I SPOKE"
with
RemoteOpenAIServer
(
model_name
,
server_args
)
as
remote_server
:
with
RemoteOpenAIServer
(
model_name
,
server_args
)
as
remote_server
:
client
=
remote_server
.
get_async_client
()
client
=
remote_server
.
get_async_client
()
transcription
=
await
client
.
audio
.
transcriptions
.
create
(
transcription
=
await
client
.
audio
.
transcriptions
.
create
(
...
@@ -48,16 +47,6 @@ async def test_basic_audio(mary_had_lamb):
...
@@ -48,16 +47,6 @@ async def test_basic_audio(mary_had_lamb):
temperature
=
0.0
)
temperature
=
0.0
)
out
=
json
.
loads
(
transcription
)[
'text'
]
out
=
json
.
loads
(
transcription
)[
'text'
]
assert
"Mary had a little lamb,"
in
out
assert
"Mary had a little lamb,"
in
out
# This should "force" whisper to continue prompt in all caps
transcription_wprompt
=
await
client
.
audio
.
transcriptions
.
create
(
model
=
model_name
,
file
=
mary_had_lamb
,
language
=
"en"
,
response_format
=
"text"
,
prompt
=
prompt
,
temperature
=
0.0
)
out_capital
=
json
.
loads
(
transcription_wprompt
)[
'text'
]
assert
prompt
not
in
out_capital
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
@@ -74,19 +63,31 @@ async def test_bad_requests(mary_had_lamb):
...
@@ -74,19 +63,31 @@ async def test_bad_requests(mary_had_lamb):
language
=
"hh"
,
language
=
"hh"
,
temperature
=
0.0
)
temperature
=
0.0
)
# Expect audio too long: repeat the timeseries
mary_had_lamb
.
seek
(
0
)
@
pytest
.
mark
.
asyncio
audio
,
sr
=
librosa
.
load
(
mary_had_lamb
)
async
def
test_long_audio_request
(
mary_had_lamb
):
repeated_audio
=
np
.
tile
(
audio
,
10
)
model_name
=
"openai/whisper-large-v3-turbo"
# Repeated audio to buffer
server_args
=
[
"--enforce-eager"
]
buffer
=
io
.
BytesIO
()
sf
.
write
(
buffer
,
repeated_audio
,
sr
,
format
=
'WAV'
)
mary_had_lamb
.
seek
(
0
)
buffer
.
seek
(
0
)
audio
,
sr
=
librosa
.
load
(
mary_had_lamb
)
with
pytest
.
raises
(
openai
.
BadRequestError
):
# Add small silence after each audio for repeatability in the split process
await
client
.
audio
.
transcriptions
.
create
(
model
=
model_name
,
audio
=
np
.
pad
(
audio
,
(
0
,
1600
))
file
=
buffer
,
repeated_audio
=
np
.
tile
(
audio
,
10
)
language
=
"en"
,
# Repeated audio to buffer
temperature
=
0.0
)
buffer
=
io
.
BytesIO
()
sf
.
write
(
buffer
,
repeated_audio
,
sr
,
format
=
'WAV'
)
buffer
.
seek
(
0
)
with
RemoteOpenAIServer
(
model_name
,
server_args
)
as
remote_server
:
client
=
remote_server
.
get_async_client
()
transcription
=
await
client
.
audio
.
transcriptions
.
create
(
model
=
model_name
,
file
=
buffer
,
language
=
"en"
,
response_format
=
"text"
,
temperature
=
0.0
)
out
=
json
.
loads
(
transcription
)[
'text'
]
assert
out
.
count
(
"Mary had a little lamb"
)
==
10
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
@@ -226,3 +227,31 @@ async def test_sampling_params(mary_had_lamb):
...
@@ -226,3 +227,31 @@ async def test_sampling_params(mary_had_lamb):
extra_body
=
dict
(
seed
=
42
))
extra_body
=
dict
(
seed
=
42
))
assert
greedy_transcription
.
text
!=
transcription
.
text
assert
greedy_transcription
.
text
!=
transcription
.
text
@
pytest
.
mark
.
asyncio
async
def
test_audio_prompt
(
mary_had_lamb
):
model_name
=
"openai/whisper-large-v3-turbo"
server_args
=
[
"--enforce-eager"
]
prompt
=
"This is a speech, recorded in a phonograph."
with
RemoteOpenAIServer
(
model_name
,
server_args
)
as
remote_server
:
#Prompts should not omit the part of original prompt while transcribing.
prefix
=
"The first words I spoke in the original phonograph"
client
=
remote_server
.
get_async_client
()
transcription
=
await
client
.
audio
.
transcriptions
.
create
(
model
=
model_name
,
file
=
mary_had_lamb
,
language
=
"en"
,
response_format
=
"text"
,
temperature
=
0.0
)
out
=
json
.
loads
(
transcription
)[
'text'
]
assert
prefix
in
out
transcription_wprompt
=
await
client
.
audio
.
transcriptions
.
create
(
model
=
model_name
,
file
=
mary_had_lamb
,
language
=
"en"
,
response_format
=
"text"
,
prompt
=
prompt
,
temperature
=
0.0
)
out_prompt
=
json
.
loads
(
transcription_wprompt
)[
'text'
]
assert
prefix
in
out_prompt
tests/entrypoints/openai/test_translation_validation.py
0 → 100644
View file @
99324e25
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
io
# imports for guided decoding tests
import
json
from
unittest.mock
import
patch
import
librosa
import
numpy
as
np
import
pytest
import
soundfile
as
sf
from
openai._base_client
import
AsyncAPIClient
from
vllm.assets.audio
import
AudioAsset
from
...utils
import
RemoteOpenAIServer
@
pytest
.
fixture
def
foscolo
():
# Test translation it->en
path
=
AudioAsset
(
'azacinto_foscolo'
).
get_local_path
()
with
open
(
str
(
path
),
"rb"
)
as
f
:
yield
f
# NOTE: (NickLucche) the large-v3-turbo model was not trained on translation!
@
pytest
.
mark
.
asyncio
async
def
test_basic_audio
(
foscolo
):
model_name
=
"openai/whisper-small"
server_args
=
[
"--enforce-eager"
]
with
RemoteOpenAIServer
(
model_name
,
server_args
)
as
remote_server
:
client
=
remote_server
.
get_async_client
()
translation
=
await
client
.
audio
.
translations
.
create
(
model
=
model_name
,
file
=
foscolo
,
response_format
=
"text"
,
# TODO remove once language detection is implemented
extra_body
=
dict
(
language
=
"it"
),
temperature
=
0.0
)
out
=
json
.
loads
(
translation
)[
'text'
].
strip
()
assert
"Nor will I ever touch the sacred"
in
out
@
pytest
.
mark
.
asyncio
async
def
test_audio_prompt
(
foscolo
):
model_name
=
"openai/whisper-small"
server_args
=
[
"--enforce-eager"
]
# Condition whisper on starting text
prompt
=
"Nor have I ever"
with
RemoteOpenAIServer
(
model_name
,
server_args
)
as
remote_server
:
client
=
remote_server
.
get_async_client
()
transcription
=
await
client
.
audio
.
translations
.
create
(
model
=
model_name
,
file
=
foscolo
,
prompt
=
prompt
,
extra_body
=
dict
(
language
=
"it"
),
response_format
=
"text"
,
temperature
=
0.0
)
out
=
json
.
loads
(
transcription
)[
'text'
]
assert
"Nor will I ever touch the sacred"
not
in
out
assert
prompt
not
in
out
@
pytest
.
mark
.
asyncio
async
def
test_non_asr_model
(
foscolo
):
# text to text model
model_name
=
"JackFram/llama-68m"
server_args
=
[
"--enforce-eager"
]
with
RemoteOpenAIServer
(
model_name
,
server_args
)
as
remote_server
:
client
=
remote_server
.
get_async_client
()
res
=
await
client
.
audio
.
translations
.
create
(
model
=
model_name
,
file
=
foscolo
,
temperature
=
0.0
)
assert
res
.
code
==
400
and
not
res
.
text
assert
res
.
message
==
"The model does not support Translations API"
@
pytest
.
mark
.
asyncio
async
def
test_streaming_response
(
foscolo
):
model_name
=
"openai/whisper-small"
server_args
=
[
"--enforce-eager"
]
translation
=
""
with
RemoteOpenAIServer
(
model_name
,
server_args
)
as
remote_server
:
client
=
remote_server
.
get_async_client
()
res_no_stream
=
await
client
.
audio
.
translations
.
create
(
model
=
model_name
,
file
=
foscolo
,
response_format
=
"json"
,
extra_body
=
dict
(
language
=
"it"
),
temperature
=
0.0
)
# Unfortunately this only works when the openai client is patched
# to use streaming mode, not exposed in the translation api.
original_post
=
AsyncAPIClient
.
post
async
def
post_with_stream
(
*
args
,
**
kwargs
):
kwargs
[
'stream'
]
=
True
return
await
original_post
(
*
args
,
**
kwargs
)
with
patch
.
object
(
AsyncAPIClient
,
"post"
,
new
=
post_with_stream
):
client
=
remote_server
.
get_async_client
()
res
=
await
client
.
audio
.
translations
.
create
(
model
=
model_name
,
file
=
foscolo
,
temperature
=
0.0
,
extra_body
=
dict
(
stream
=
True
,
language
=
"it"
))
# Reconstruct from chunks and validate
async
for
chunk
in
res
:
# just a chunk
text
=
chunk
.
choices
[
0
][
'delta'
][
'content'
]
translation
+=
text
assert
translation
==
res_no_stream
.
text
@
pytest
.
mark
.
asyncio
async
def
test_stream_options
(
foscolo
):
model_name
=
"openai/whisper-small"
server_args
=
[
"--enforce-eager"
]
with
RemoteOpenAIServer
(
model_name
,
server_args
)
as
remote_server
:
original_post
=
AsyncAPIClient
.
post
async
def
post_with_stream
(
*
args
,
**
kwargs
):
kwargs
[
'stream'
]
=
True
return
await
original_post
(
*
args
,
**
kwargs
)
with
patch
.
object
(
AsyncAPIClient
,
"post"
,
new
=
post_with_stream
):
client
=
remote_server
.
get_async_client
()
res
=
await
client
.
audio
.
translations
.
create
(
model
=
model_name
,
file
=
foscolo
,
temperature
=
0.0
,
extra_body
=
dict
(
language
=
"it"
,
stream
=
True
,
stream_include_usage
=
True
,
stream_continuous_usage_stats
=
True
))
final
=
False
continuous
=
True
async
for
chunk
in
res
:
if
not
len
(
chunk
.
choices
):
# final usage sent
final
=
True
else
:
continuous
=
continuous
and
hasattr
(
chunk
,
'usage'
)
assert
final
and
continuous
@
pytest
.
mark
.
asyncio
async
def
test_long_audio_request
(
foscolo
):
model_name
=
"openai/whisper-small"
server_args
=
[
"--enforce-eager"
]
foscolo
.
seek
(
0
)
audio
,
sr
=
librosa
.
load
(
foscolo
)
repeated_audio
=
np
.
tile
(
audio
,
2
)
# Repeated audio to buffer
buffer
=
io
.
BytesIO
()
sf
.
write
(
buffer
,
repeated_audio
,
sr
,
format
=
'WAV'
)
buffer
.
seek
(
0
)
with
RemoteOpenAIServer
(
model_name
,
server_args
)
as
remote_server
:
client
=
remote_server
.
get_async_client
()
translation
=
await
client
.
audio
.
translations
.
create
(
model
=
model_name
,
file
=
buffer
,
extra_body
=
dict
(
language
=
"it"
),
response_format
=
"text"
,
temperature
=
0.0
)
out
=
json
.
loads
(
translation
)[
'text'
].
strip
().
lower
()
# TODO investigate higher model uncertainty in for longer translations.
assert
out
.
count
(
"nor will i ever"
)
==
2
tests/entrypoints/openai/test_video.py
View file @
99324e25
...
@@ -50,7 +50,7 @@ async def client(server):
...
@@ -50,7 +50,7 @@ async def client(server):
@
pytest
.
fixture
(
scope
=
"session"
)
@
pytest
.
fixture
(
scope
=
"session"
)
def
base64_encoded_video
()
->
dict
[
str
,
str
]:
def
base64_encoded_video
()
->
dict
[
str
,
str
]:
return
{
return
{
video_url
:
encode_video_base64
(
fetch_video
(
video_url
))
video_url
:
encode_video_base64
(
fetch_video
(
video_url
)
[
0
]
)
for
video_url
in
TEST_VIDEO_URLS
for
video_url
in
TEST_VIDEO_URLS
}
}
...
...
tests/entrypoints/openai/test_vision.py
View file @
99324e25
...
@@ -25,6 +25,25 @@ TEST_IMAGE_URLS = [
...
@@ -25,6 +25,25 @@ TEST_IMAGE_URLS = [
"https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png"
,
"https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png"
,
]
]
EXPECTED_MM_BEAM_SEARCH_RES
=
[
[
"The image shows a wooden boardwalk leading through a"
,
"The image shows a wooden boardwalk extending into a"
,
],
[
"The image shows two parrots perched on"
,
"The image shows two birds perched on a cur"
,
],
[
"The image shows a Venn diagram with three over"
,
"This image shows a Venn diagram with three over"
,
],
[
"This image displays a gradient of colors ranging from"
,
"This image displays a gradient of colors transitioning from"
,
],
]
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
def
server
():
...
@@ -270,10 +289,13 @@ async def test_single_chat_session_image_base64encoded(
...
@@ -270,10 +289,13 @@ async def test_single_chat_session_image_base64encoded(
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"image_
url"
,
TEST_IMAGE_URLS
)
@
pytest
.
mark
.
parametrize
(
"image_
idx"
,
list
(
range
(
len
(
TEST_IMAGE_URLS
)
)))
async
def
test_single_chat_session_image_base64encoded_beamsearch
(
async
def
test_single_chat_session_image_base64encoded_beamsearch
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
image_
url
:
str
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
image_
idx
:
int
,
base64_encoded_image
:
dict
[
str
,
str
]):
base64_encoded_image
:
dict
[
str
,
str
]):
# NOTE: This test also validates that we pass MM data through beam search
image_url
=
TEST_IMAGE_URLS
[
image_idx
]
expected_res
=
EXPECTED_MM_BEAM_SEARCH_RES
[
image_idx
]
messages
=
[{
messages
=
[{
"role"
:
"role"
:
...
@@ -297,10 +319,11 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
...
@@ -297,10 +319,11 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
messages
=
messages
,
messages
=
messages
,
n
=
2
,
n
=
2
,
max_completion_tokens
=
10
,
max_completion_tokens
=
10
,
temperature
=
0.0
,
extra_body
=
dict
(
use_beam_search
=
True
))
extra_body
=
dict
(
use_beam_search
=
True
))
assert
len
(
chat_completion
.
choices
)
==
2
assert
len
(
chat_completion
.
choices
)
==
2
assert
chat_completion
.
choices
[
for
actual
,
expected_str
in
zip
(
chat_completion
.
choices
,
expected_res
):
0
]
.
message
.
content
!
=
chat_completion
.
choices
[
1
].
message
.
content
assert
actual
.
message
.
content
=
=
expected_str
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
...
tests/entrypoints/test_chat_utils.py
View file @
99324e25
...
@@ -264,10 +264,8 @@ def test_parse_chat_messages_multiple_images(
...
@@ -264,10 +264,8 @@ def test_parse_chat_messages_multiple_images(
"url"
:
image_url
"url"
:
image_url
}
}
},
{
},
{
"type"
:
"image_url"
,
"type"
:
"image_pil"
,
"image_url"
:
{
"image_pil"
:
ImageAsset
(
'cherry_blossom'
).
pil_image
"url"
:
image_url
}
},
{
},
{
"type"
:
"text"
,
"type"
:
"text"
,
"text"
:
"What's in these images?"
"text"
:
"What's in these images?"
...
@@ -303,10 +301,8 @@ async def test_parse_chat_messages_multiple_images_async(
...
@@ -303,10 +301,8 @@ async def test_parse_chat_messages_multiple_images_async(
"url"
:
image_url
"url"
:
image_url
}
}
},
{
},
{
"type"
:
"image_url"
,
"type"
:
"image_pil"
,
"image_url"
:
{
"image_pil"
:
ImageAsset
(
'cherry_blossom'
).
pil_image
"url"
:
image_url
}
},
{
},
{
"type"
:
"text"
,
"type"
:
"text"
,
"text"
:
"What's in these images?"
"text"
:
"What's in these images?"
...
...
Prev
1
…
11
12
13
14
15
16
17
18
19
…
24
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment