Commit 99324e25 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.9.2' into v0.9.2-ori

parents cc7f22a8 a5dd03c1
...@@ -28,7 +28,7 @@ VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1" ...@@ -28,7 +28,7 @@ VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
class ParallelSetup(NamedTuple): class ParallelSetup(NamedTuple):
tp_size: int tp_size: int
pp_size: int pp_size: int
sp_enabled: bool enable_fusion: bool
eager_mode: bool eager_mode: bool
chunked_prefill: bool chunked_prefill: bool
...@@ -67,49 +67,18 @@ class SPTestSettings: ...@@ -67,49 +67,18 @@ class SPTestSettings:
task: TaskOption = "auto", task: TaskOption = "auto",
load_format: Optional[str] = None, load_format: Optional[str] = None,
): ):
parallel_setups = []
for eager_mode_val in [False, True]:
for pp_multiplier in [1, 2]:
for chunked_prefill_val in [False, True]:
parallel_setups.append(
ParallelSetup(tp_size=tp_base,
pp_size=pp_multiplier * pp_base,
enable_fusion=False,
eager_mode=eager_mode_val,
chunked_prefill=chunked_prefill_val))
return SPTestSettings( return SPTestSettings(
parallel_setups=[ parallel_setups=parallel_setups,
ParallelSetup(tp_size=tp_base,
pp_size=pp_base,
sp_enabled=True,
eager_mode=False,
chunked_prefill=False),
ParallelSetup(tp_size=tp_base,
pp_size=pp_base,
sp_enabled=True,
eager_mode=False,
chunked_prefill=True),
ParallelSetup(tp_size=tp_base,
pp_size=pp_base,
sp_enabled=True,
eager_mode=True,
chunked_prefill=False),
ParallelSetup(tp_size=tp_base,
pp_size=pp_base,
sp_enabled=True,
eager_mode=True,
chunked_prefill=True),
ParallelSetup(tp_size=tp_base,
pp_size=2 * pp_base,
sp_enabled=True,
eager_mode=False,
chunked_prefill=False),
ParallelSetup(tp_size=tp_base,
pp_size=2 * pp_base,
sp_enabled=True,
eager_mode=False,
chunked_prefill=True),
ParallelSetup(tp_size=tp_base,
pp_size=2 * pp_base,
sp_enabled=True,
eager_mode=True,
chunked_prefill=False),
ParallelSetup(tp_size=tp_base,
pp_size=2 * pp_base,
sp_enabled=True,
eager_mode=True,
chunked_prefill=True)
],
distributed_backends=["mp", "ray"], distributed_backends=["mp", "ray"],
vllm_major_versions=["1", "1"], vllm_major_versions=["1", "1"],
task=task, task=task,
...@@ -126,19 +95,44 @@ class SPTestSettings: ...@@ -126,19 +95,44 @@ class SPTestSettings:
multi_node_only: bool = False, multi_node_only: bool = False,
load_format: Optional[str] = None, load_format: Optional[str] = None,
): ):
parallel_setups = []
for eager_mode_val in [False, True]:
for pp_multiplier in [1, 2]:
for chunked_prefill_val in [False, True]:
parallel_setups.append(
ParallelSetup(tp_size=tp_base,
pp_size=pp_multiplier * pp_base,
enable_fusion=False,
eager_mode=eager_mode_val,
chunked_prefill=chunked_prefill_val))
return SPTestSettings( return SPTestSettings(
parallel_setups=[ parallel_setups=parallel_setups,
distributed_backends=["mp", "ray"],
vllm_major_versions=["1", "1"],
task=task,
test_options=SPTestOptions(multi_node_only=multi_node_only,
load_format=load_format),
)
@staticmethod
def fp8_quant(
*,
tp_base: int = 2,
pp_base: int = 1,
task: TaskOption = "auto",
multi_node_only: bool = False,
load_format: Optional[str] = None,
):
parallel_setups = []
for fusion_val in [False, True]:
parallel_setups.append(
ParallelSetup(tp_size=tp_base, ParallelSetup(tp_size=tp_base,
pp_size=pp_base, pp_size=pp_base,
sp_enabled=True, enable_fusion=fusion_val,
eager_mode=False, eager_mode=True,
chunked_prefill=False), chunked_prefill=False))
ParallelSetup(tp_size=tp_base, return SPTestSettings(
pp_size=2 * pp_base, parallel_setups=parallel_setups,
sp_enabled=True,
eager_mode=False,
chunked_prefill=False),
],
distributed_backends=["mp", "ray"], distributed_backends=["mp", "ray"],
vllm_major_versions=["1", "1"], vllm_major_versions=["1", "1"],
task=task, task=task,
...@@ -171,7 +165,7 @@ def _compare_sp( ...@@ -171,7 +165,7 @@ def _compare_sp(
( (
tp_size, tp_size,
pp_size, pp_size,
sp_enabled, enable_fusion,
eager_mode, eager_mode,
chunked_prefill, chunked_prefill,
) = parallel_setup ) = parallel_setup
...@@ -240,9 +234,9 @@ def _compare_sp( ...@@ -240,9 +234,9 @@ def _compare_sp(
'compile_sizes': [4, 8], 'compile_sizes': [4, 8],
'splitting_ops': [], 'splitting_ops': [],
'pass_config': { 'pass_config': {
'enable_sequence_parallelism': sp_enabled, 'enable_sequence_parallelism': True,
'enable_fusion': enable_fusion,
'enable_noop': True, 'enable_noop': True,
'enable_fusion': True,
}, },
} }
...@@ -291,12 +285,14 @@ def _compare_sp( ...@@ -291,12 +285,14 @@ def _compare_sp(
SP_TEXT_GENERATION_MODELS = { SP_TEXT_GENERATION_MODELS = {
# [Decoder-only] # [Decoder-only]
"meta-llama/Llama-3.2-1B-Instruct": SPTestSettings.fast(), "meta-llama/Llama-3.2-1B-Instruct": SPTestSettings.fast(),
"RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8": SPTestSettings.fp8_quant(),
} }
SP_TEST_MODELS = [ SP_TEST_MODELS = [
# TODO support other models # TODO support other models
# [LANGUAGE GENERATION] # [LANGUAGE GENERATION]
"meta-llama/Llama-3.2-1B-Instruct", "meta-llama/Llama-3.2-1B-Instruct",
"RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"
] ]
......
...@@ -231,6 +231,38 @@ def test_limit_mm_per_prompt_parser(arg, expected): ...@@ -231,6 +231,38 @@ def test_limit_mm_per_prompt_parser(arg, expected):
assert args.limit_mm_per_prompt == expected assert args.limit_mm_per_prompt == expected
@pytest.mark.parametrize(
    ("arg", "expected"),
    [
        # Flag omitted entirely: the parsed value is an empty dict.
        (None, {}),
        # One modality carrying a single option.
        ('{"video": {"num_frames": 123} }', {"video": {"num_frames": 123}}),
        # Two modalities with int, float and str option values.
        (
            '{"video": {"num_frames": 123, "fps": 1.0, "foo": "bar"}, "image": {"foo": "bar"} }',  # noqa
            {
                "video": {"num_frames": 123, "fps": 1.0, "foo": "bar"},
                "image": {"foo": "bar"},
            },
        ),
    ],
)
def test_media_io_kwargs_parser(arg, expected):
    """Check how ``--media-io-kwargs`` is parsed from the command line.

    ``arg`` is the raw JSON string handed to the flag, or ``None`` to run the
    parser with no arguments at all. ``expected`` is the nested dict that
    must end up in ``args.media_io_kwargs``.
    """
    cli_args = [] if arg is None else ["--media-io-kwargs", arg]
    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
    parsed = parser.parse_args(cli_args)
    assert parsed.media_io_kwargs == expected
def test_compilation_config(): def test_compilation_config():
parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
...@@ -239,32 +271,40 @@ def test_compilation_config(): ...@@ -239,32 +271,40 @@ def test_compilation_config():
assert args.compilation_config == CompilationConfig() assert args.compilation_config == CompilationConfig()
# set to O3 # set to O0
args = parser.parse_args(["-O3"]) args = parser.parse_args(["-O0"])
assert args.compilation_config.level == 3 assert args.compilation_config.level == 0
# set to O 3 (space) # set to O 1 (space)
args = parser.parse_args(["-O", "3"]) args = parser.parse_args(["-O", "1"])
assert args.compilation_config.level == 3 assert args.compilation_config.level == 1
# set to O 3 (equals) # set to O 2 (equals)
args = parser.parse_args(["-O=3"]) args = parser.parse_args(["-O=2"])
assert args.compilation_config.level == 2
# set to O.level 3
args = parser.parse_args(["-O.level", "3"])
assert args.compilation_config.level == 3 assert args.compilation_config.level == 3
# set to string form of a dict # set to string form of a dict
args = parser.parse_args([ args = parser.parse_args([
"--compilation-config", "-O",
'{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}', '{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
'"use_inductor": false}',
]) ])
assert (args.compilation_config.level == 3 and assert (args.compilation_config.level == 3 and
args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]) args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
and not args.compilation_config.use_inductor)
# set to string form of a dict # set to string form of a dict
args = parser.parse_args([ args = parser.parse_args([
"--compilation-config=" "--compilation-config="
'{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}', '{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
'"use_inductor": true}',
]) ])
assert (args.compilation_config.level == 3 and assert (args.compilation_config.level == 3 and
args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]) args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
and args.compilation_config.use_inductor)
def test_prefix_cache_default(): def test_prefix_cache_default():
......
...@@ -48,9 +48,6 @@ def test_enable_prompt_embeds(hf_runner, model: str, ...@@ -48,9 +48,6 @@ def test_enable_prompt_embeds(hf_runner, model: str,
ctx = (nullcontext() if enable_prompt_embeds else pytest.raises( ctx = (nullcontext() if enable_prompt_embeds else pytest.raises(
ValueError, match="set `--enable-prompt-embeds`")) ValueError, match="set `--enable-prompt-embeds`"))
# This test checks if the flag skip_tokenizer_init skips the initialization
# of tokenizer and detokenizer. The generated output is expected to contain
# token ids.
llm = LLM( llm = LLM(
model=model, model=model,
enable_prompt_embeds=enable_prompt_embeds, enable_prompt_embeds=enable_prompt_embeds,
......
...@@ -8,6 +8,8 @@ import pytest ...@@ -8,6 +8,8 @@ import pytest
from vllm import LLM, PoolingParams, PoolingRequestOutput from vllm import LLM, PoolingParams, PoolingRequestOutput
from vllm.distributed import cleanup_dist_env_and_memory from vllm.distributed import cleanup_dist_env_and_memory
from ...models.utils import check_embeddings_close
MODEL_NAME = "intfloat/multilingual-e5-small" MODEL_NAME = "intfloat/multilingual-e5-small"
PROMPTS = [ PROMPTS = [
...@@ -27,6 +29,14 @@ TOKEN_IDS = [ ...@@ -27,6 +29,14 @@ TOKEN_IDS = [
] ]
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    """Autouse shim that pulls ``run_with_both_engines`` into every test.

    The body is intentionally empty: requesting the fixture is the whole
    effect, so each test in this module runs with both engines. The shim
    could be moved up to ``conftest.py`` to apply to a whole package.
    """
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def llm(): def llm():
# pytest caches the fixture so we use weakref.proxy to # pytest caches the fixture so we use weakref.proxy to
...@@ -46,9 +56,15 @@ def llm(): ...@@ -46,9 +56,15 @@ def llm():
cleanup_dist_env_and_memory() cleanup_dist_env_and_memory()
def assert_outputs_equal(o1: list[PoolingRequestOutput], def assert_outputs_match(o1: list[PoolingRequestOutput],
o2: list[PoolingRequestOutput]): o2: list[PoolingRequestOutput]):
assert [o.outputs for o in o1] == [o.outputs for o in o2] check_embeddings_close(
embeddings_0_lst=[o.outputs.data for o in o1],
embeddings_1_lst=[o.outputs.data for o in o2],
name_0="hf",
name_1="vllm",
tol=1e-2,
)
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
...@@ -63,7 +79,7 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM, ...@@ -63,7 +79,7 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
v2_output = llm.encode({"prompt_token_ids": prompt_token_ids}, v2_output = llm.encode({"prompt_token_ids": prompt_token_ids},
pooling_params=pooling_params) pooling_params=pooling_params)
assert_outputs_equal(v1_output, v2_output) assert_outputs_match(v1_output, v2_output)
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
...@@ -80,7 +96,7 @@ def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM): ...@@ -80,7 +96,7 @@ def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM):
} for p in TOKEN_IDS], } for p in TOKEN_IDS],
pooling_params=pooling_params, pooling_params=pooling_params,
) )
assert_outputs_equal(v1_output, v2_output) assert_outputs_match(v1_output, v2_output)
@pytest.mark.skip_global_cleanup @pytest.mark.skip_global_cleanup
......
...@@ -125,4 +125,7 @@ def test_max_model_len(): ...@@ -125,4 +125,7 @@ def test_max_model_len():
for output in outputs: for output in outputs:
num_total_tokens = len(output.prompt_token_ids) + len( num_total_tokens = len(output.prompt_token_ids) + len(
output.outputs[0].token_ids) output.outputs[0].token_ids)
assert num_total_tokens == max_model_len # Total tokens must not exceed max_model_len.
# It can be less if generation finishes due to other reasons (e.g., EOS)
# before reaching the absolute model length limit.
assert num_total_tokens <= max_model_len
...@@ -7,34 +7,30 @@ import pytest ...@@ -7,34 +7,30 @@ import pytest
from tests.models.language.pooling.mteb_utils import (MTEB_EMBED_TASKS, from tests.models.language.pooling.mteb_utils import (MTEB_EMBED_TASKS,
MTEB_EMBED_TOL, MTEB_EMBED_TOL,
OpenAIClientMtebEncoder, OpenAIClientMtebEncoder,
run_mteb_embed_task, run_mteb_embed_task)
run_mteb_embed_task_st)
from tests.utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
os.environ["VLLM_LOGGING_LEVEL"] = "WARNING" os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
MODEL_NAME = "BAAI/bge-m3" MODEL_NAME = "intfloat/e5-small"
DTYPE = "float16" MAIN_SCORE = 0.7422994752439667
MAIN_SCORE = 0.7873427091972599
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): def server():
args = [ args = [
"--task", "embed", "--dtype", DTYPE, "--enforce-eager", "--task", "embed", "--enforce-eager", "--disable-uvicorn-access-log"
"--max-model-len", "512"
] ]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server yield remote_server
def test_mteb(server): def test_mteb_embed(server):
client = server.get_client() client = server.get_client()
encoder = OpenAIClientMtebEncoder(MODEL_NAME, client) encoder = OpenAIClientMtebEncoder(MODEL_NAME, client)
vllm_main_score = run_mteb_embed_task(encoder, MTEB_EMBED_TASKS) vllm_main_score = run_mteb_embed_task(encoder, MTEB_EMBED_TASKS)
st_main_score = MAIN_SCORE or run_mteb_embed_task_st( st_main_score = MAIN_SCORE
MODEL_NAME, MTEB_EMBED_TASKS)
print("VLLM main score: ", vllm_main_score) print("VLLM main score: ", vllm_main_score)
print("SentenceTransformer main score: ", st_main_score) print("SentenceTransformer main score: ", st_main_score)
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest
# yapf conflicts with isort for this block
# yapf: disable
from tests.models.language.pooling.mteb_utils import (
MTEB_RERANK_LANGS, MTEB_RERANK_TASKS, MTEB_RERANK_TOL,
RerankClientMtebEncoder, ScoreClientMtebEncoder,
mteb_test_rerank_models_hf, run_mteb_rerank)
# yapf: enable
from tests.utils import RemoteOpenAIServer
# Sets VLLM_LOGGING_LEVEL to WARNING at import time
# (presumably to keep vLLM log output quiet during the run — confirm).
os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
# Model passed to RemoteOpenAIServer and to the MTEB client encoders below.
MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
@pytest.fixture(scope="module")
def server():
    """Start one ``RemoteOpenAIServer`` for ``MODEL_NAME`` and yield it.

    Module scope means all tests in this file share the same server; the
    ``with`` block closes it once the last of them has finished.
    """
    server_args = [
        "--task",
        "score",
        "--enforce-eager",
        "--disable-uvicorn-access-log",
    ]
    with RemoteOpenAIServer(MODEL_NAME, server_args) as running_server:
        yield running_server
@pytest.fixture(scope="module")
def st_main_score(hf_runner):
    """Return the reference main score computed through ``hf_runner``.

    The score depends on the installed dependency versions, so it is
    recomputed on every run instead of being hard-coded.
    """
    # The helper also returns a dtype, which this fixture does not need.
    reference_score, _unused_dtype = mteb_test_rerank_models_hf(
        hf_runner, MODEL_NAME)
    return reference_score
def test_mteb_score(server, st_main_score):
    """Compare the MTEB rerank score obtained via the ``score`` endpoint
    with the reference score, within ``MTEB_RERANK_TOL``."""
    score_encoder = ScoreClientMtebEncoder(MODEL_NAME, server.url_for("score"))
    vllm_main_score = run_mteb_rerank(score_encoder, MTEB_RERANK_TASKS,
                                      MTEB_RERANK_LANGS)
    difference = st_main_score - vllm_main_score

    print("VLLM main score: ", vllm_main_score)
    print("SentenceTransformer main score: ", st_main_score)
    print("Difference: ", difference)

    assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL)
def test_mteb_rerank(server, st_main_score):
    """Compare the MTEB rerank score obtained via the ``rerank`` endpoint
    with the reference score, within ``MTEB_RERANK_TOL``."""
    rerank_encoder = RerankClientMtebEncoder(MODEL_NAME,
                                             server.url_for("rerank"))
    vllm_main_score = run_mteb_rerank(rerank_encoder, MTEB_RERANK_TASKS,
                                      MTEB_RERANK_LANGS)
    difference = st_main_score - vllm_main_score

    print("VLLM main score: ", vllm_main_score)
    print("SentenceTransformer main score: ", st_main_score)
    print("Difference: ", difference)

    assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL)
...@@ -16,7 +16,7 @@ chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja" ...@@ -16,7 +16,7 @@ chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
assert chatml_jinja_path.exists() assert chatml_jinja_path.exists()
# Define models, templates, and their corresponding expected outputs # Define models, templates, and their corresponding expected outputs
MODEL_TEMPLATE_GENERATON_OUTPUT = [ MODEL_TEMPLATE_GENERATION_OUTPUT = [
("facebook/opt-125m", chatml_jinja_path, True, False, """<|im_start|>user ("facebook/opt-125m", chatml_jinja_path, True, False, """<|im_start|>user
Hello<|im_end|> Hello<|im_end|>
<|im_start|>assistant <|im_start|>assistant
...@@ -91,7 +91,7 @@ def test_no_load_chat_template_literallike(): ...@@ -91,7 +91,7 @@ def test_no_load_chat_template_literallike():
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model,template,add_generation_prompt,continue_final_message,expected_output", "model,template,add_generation_prompt,continue_final_message,expected_output",
MODEL_TEMPLATE_GENERATON_OUTPUT) MODEL_TEMPLATE_GENERATION_OUTPUT)
def test_get_gen_prompt(model, template, add_generation_prompt, def test_get_gen_prompt(model, template, add_generation_prompt,
continue_final_message, expected_output): continue_final_message, expected_output):
model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
......
...@@ -779,3 +779,57 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI, ...@@ -779,3 +779,57 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
prompt="Give an example string that fits this regex", prompt="Give an example string that fits this regex",
extra_body=dict(guided_regex=sample_regex, extra_body=dict(guided_regex=sample_regex,
guided_json=sample_json_schema)) guided_json=sample_json_schema))
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name,stream,echo",
    [
        (MODEL_NAME, False, False),
        (MODEL_NAME, False, True),
        (MODEL_NAME, True, False),
        (MODEL_NAME, True, True)  # should not raise BadRequestError error
    ],
)
async def test_echo_stream_completion(client: openai.AsyncOpenAI,
                                      model_name: str, stream: bool,
                                      echo: bool):
    """Check ``echo`` for both streaming and non-streaming completions.

    For every (stream, echo) combination the request must succeed, finish
    with reason ``"length"``, and contain the prompt text in the returned
    text exactly when ``echo`` is set.
    """
    saying: str = "Hello, my name is"
    result = await client.completions.create(model=model_name,
                                             prompt=saying,
                                             max_tokens=10,
                                             temperature=0.0,
                                             echo=echo,
                                             stream=stream)
    stop_reason = "length"

    if stream:
        # Rebuild the full text from the chunks and remember the last
        # finish_reason that any chunk reported.
        pieces: list[str] = []
        final_finish_reason = None
        async for chunk in result:
            if not chunk.choices:
                continue
            first_choice = chunk.choices[0]
            if first_choice.text:
                pieces.append(first_choice.text)
            if first_choice.finish_reason:
                final_finish_reason = first_choice.finish_reason
        assert final_finish_reason == stop_reason
        text = "".join(pieces)
    else:
        completion = result
        assert completion.id is not None
        assert completion.choices is not None and len(completion.choices) == 1
        choice = completion.choices[0]
        # Check for None before len(): with the checks the other way round
        # the None guard could never fire (len(None) raises TypeError first).
        assert choice.text is not None
        assert len(choice.text) >= 5
        assert choice.finish_reason == stop_reason
        text = choice.text

    if echo:
        assert saying in text
    else:
        assert saying not in text
...@@ -21,6 +21,14 @@ DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + ...@@ -21,6 +21,14 @@ DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' +
DTYPE = "bfloat16" DTYPE = "bfloat16"
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    """Autouse shim that pulls ``run_with_both_engines`` into every test.

    The body is intentionally empty: requesting the fixture is the whole
    effect, so each test in this module runs with both engines. The shim
    could be moved up to ``conftest.py`` to apply to a whole package.
    """
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): def server():
args = [ args = [
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Tests for middleware that's off by default and can be toggled through
server arguments, mainly --api-key and --enable-request-id-headers.
"""
from http import HTTPStatus
import pytest
import requests
from ...utils import RemoteOpenAIServer
# Use a small embeddings model for faster startup and smaller memory footprint.
# Since we are not testing any chat functionality,
# using a chat capable model is overkill.
MODEL_NAME = "intfloat/multilingual-e5-small"
@pytest.fixture(scope="module")
def server(request: pytest.FixtureRequest):
    """Start a ``RemoteOpenAIServer`` with optional extra CLI arguments.

    Extra arguments arrive through indirect parametrization as
    ``request.param``: either a list of strings or a single string, which
    is wrapped into a one-element list. Without parametrization only the
    base arguments are used.
    """
    extra_args = getattr(request, "param", [])
    if isinstance(extra_args, str):
        extra_args = [extra_args]

    base_args = [
        "--task",
        "embed",
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--max-model-len",
        "512",
        "--enforce-eager",
        "--max-num-seqs",
        "2",
    ]
    with RemoteOpenAIServer(MODEL_NAME,
                            [*base_args, *extra_args]) as remote_server:
        yield remote_server
@pytest.mark.asyncio
async def test_no_api_token(server: RemoteOpenAIServer):
    """With no ``--api-key`` passed, ``v1/models`` answers 200 without
    credentials."""
    models_url = server.url_for("v1/models")
    resp = requests.get(models_url)
    assert resp.status_code == HTTPStatus.OK
@pytest.mark.asyncio
async def test_no_request_id_header(server: RemoteOpenAIServer):
    """With no ``--enable-request-id-headers`` passed, responses carry no
    ``X-Request-Id`` header."""
    health_url = server.url_for("health")
    resp = requests.get(health_url)
    assert "X-Request-Id" not in resp.headers
@pytest.mark.parametrize(
    "server",
    [["--api-key", "test"]],
    indirect=True,
)
@pytest.mark.asyncio
async def test_missing_api_token(server: RemoteOpenAIServer):
    """With ``--api-key`` set, a ``v1/models`` request that sends no
    credentials gets 401."""
    models_url = server.url_for("v1/models")
    resp = requests.get(models_url)
    assert resp.status_code == HTTPStatus.UNAUTHORIZED
@pytest.mark.parametrize(
    "server",
    [["--api-key", "test"]],
    indirect=True,
)
@pytest.mark.asyncio
async def test_passed_api_token(server: RemoteOpenAIServer):
    """With ``--api-key`` set, sending the matching bearer token to
    ``v1/models`` gets 200."""
    auth_headers = {"Authorization": "Bearer test"}
    resp = requests.get(server.url_for("v1/models"), headers=auth_headers)
    assert resp.status_code == HTTPStatus.OK
@pytest.mark.parametrize(
    "server",
    [["--api-key", "test"]],
    indirect=True,
)
@pytest.mark.asyncio
async def test_not_v1_api_token(server: RemoteOpenAIServer):
    """With ``--api-key`` set, ``health`` still answers 200 without
    credentials."""
    # The authorization check only applies to paths that start with /v1
    # (such as /v1/chat/completions); "health" is outside that prefix, so
    # no token is required.
    health_url = server.url_for("health")
    resp = requests.get(health_url)
    assert resp.status_code == HTTPStatus.OK
@pytest.mark.parametrize(
    "server",
    ["--enable-request-id-headers"],
    indirect=True,
)
@pytest.mark.asyncio
async def test_enable_request_id_header(server: RemoteOpenAIServer):
    """With ``--enable-request-id-headers`` set, responses carry a
    32-character ``X-Request-Id`` header."""
    resp = requests.get(server.url_for("health"))
    assert "X-Request-Id" in resp.headers
    request_id = resp.headers.get("X-Request-Id", "")
    assert len(request_id) == 32
@pytest.mark.parametrize(
    "server",
    ["--enable-request-id-headers"],
    indirect=True,
)
@pytest.mark.asyncio
async def test_custom_request_id_header(server: RemoteOpenAIServer):
    """With ``--enable-request-id-headers`` set, an ``X-Request-Id`` sent by
    the caller is returned unchanged."""
    sent_headers = {"X-Request-Id": "Custom"}
    resp = requests.get(server.url_for("health"), headers=sent_headers)
    assert "X-Request-Id" in resp.headers
    assert resp.headers.get("X-Request-Id") == "Custom"
...@@ -7,6 +7,7 @@ import numpy as np ...@@ -7,6 +7,7 @@ import numpy as np
import pytest import pytest
import requests import requests
from tests.models.utils import check_embeddings_close
from vllm.entrypoints.openai.protocol import PoolingResponse from vllm.entrypoints.openai.protocol import PoolingResponse
from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.transformers_utils.tokenizer import get_tokenizer
...@@ -223,8 +224,11 @@ async def test_batch_base64_pooling(server: RemoteOpenAIServer, ...@@ -223,8 +224,11 @@ async def test_batch_base64_pooling(server: RemoteOpenAIServer,
np.frombuffer(base64.b64decode(data.data), np.frombuffer(base64.b64decode(data.data),
dtype="float32").tolist()) dtype="float32").tolist())
assert responses_float.data[0].data == decoded_responses_base64_data[0] check_embeddings_close(
assert responses_float.data[1].data == decoded_responses_base64_data[1] embeddings_0_lst=[d.data for d in responses_float.data],
embeddings_1_lst=decoded_responses_base64_data,
name_0="float32",
name_1="base64")
# Default response is float32 decoded from base64 by OpenAI Client # Default response is float32 decoded from base64 by OpenAI Client
default_response = requests.post( default_response = requests.post(
...@@ -237,5 +241,8 @@ async def test_batch_base64_pooling(server: RemoteOpenAIServer, ...@@ -237,5 +241,8 @@ async def test_batch_base64_pooling(server: RemoteOpenAIServer,
default_response.raise_for_status() default_response.raise_for_status()
responses_default = PoolingResponse.model_validate(default_response.json()) responses_default = PoolingResponse.model_validate(default_response.json())
assert responses_float.data[0].data == responses_default.data[0].data check_embeddings_close(
assert responses_float.data[1].data == responses_default.data[1].data embeddings_0_lst=[d.data for d in responses_default.data],
embeddings_1_lst=[d.data for d in responses_default.data],
name_0="float32",
name_1="base64")
...@@ -12,6 +12,14 @@ MODEL_NAME = "BAAI/bge-reranker-base" ...@@ -12,6 +12,14 @@ MODEL_NAME = "BAAI/bge-reranker-base"
DTYPE = "bfloat16" DTYPE = "bfloat16"
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    """Autouse shim that pulls ``run_with_both_engines`` into every test.

    The body is intentionally empty: requesting the fixture is the whole
    effect, so each test in this module runs with both engines. The shim
    could be moved up to ``conftest.py`` to apply to a whole package.
    """
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): def server():
args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE] args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
......
...@@ -11,6 +11,15 @@ from vllm.entrypoints.openai.protocol import ScoreResponse ...@@ -11,6 +11,15 @@ from vllm.entrypoints.openai.protocol import ScoreResponse
from ...utils import RemoteOpenAIServer from ...utils import RemoteOpenAIServer
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    """Autouse shim that pulls ``run_with_both_engines`` into every test.

    The body is intentionally empty: requesting the fixture is the whole
    effect, so each test in this module runs with both engines. The shim
    could be moved up to ``conftest.py`` to apply to a whole package.
    """
MODELS = [ MODELS = [
{ {
"name": "BAAI/bge-reranker-v2-m3", "name": "BAAI/bge-reranker-v2-m3",
......
...@@ -3,8 +3,8 @@ ...@@ -3,8 +3,8 @@
import asyncio import asyncio
from contextlib import suppress from contextlib import suppress
from dataclasses import dataclass from dataclasses import dataclass, field
from typing import Optional from typing import Any, Optional
from unittest.mock import MagicMock from unittest.mock import MagicMock
from vllm.config import MultiModalConfig from vllm.config import MultiModalConfig
...@@ -40,6 +40,7 @@ class MockModelConfig: ...@@ -40,6 +40,7 @@ class MockModelConfig:
allowed_local_media_path: str = "" allowed_local_media_path: str = ""
encoder_config = None encoder_config = None
generation_config: str = "auto" generation_config: str = "auto"
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
def get_diff_sampling_param(self): def get_diff_sampling_param(self):
return self.diff_sampling_param or {} return self.diff_sampling_param or {}
......
...@@ -37,7 +37,6 @@ async def test_basic_audio(mary_had_lamb): ...@@ -37,7 +37,6 @@ async def test_basic_audio(mary_had_lamb):
model_name = "openai/whisper-large-v3-turbo" model_name = "openai/whisper-large-v3-turbo"
server_args = ["--enforce-eager"] server_args = ["--enforce-eager"]
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb. # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
prompt = "THE FIRST WORDS I SPOKE"
with RemoteOpenAIServer(model_name, server_args) as remote_server: with RemoteOpenAIServer(model_name, server_args) as remote_server:
client = remote_server.get_async_client() client = remote_server.get_async_client()
transcription = await client.audio.transcriptions.create( transcription = await client.audio.transcriptions.create(
...@@ -48,16 +47,6 @@ async def test_basic_audio(mary_had_lamb): ...@@ -48,16 +47,6 @@ async def test_basic_audio(mary_had_lamb):
temperature=0.0) temperature=0.0)
out = json.loads(transcription)['text'] out = json.loads(transcription)['text']
assert "Mary had a little lamb," in out assert "Mary had a little lamb," in out
# This should "force" whisper to continue prompt in all caps
transcription_wprompt = await client.audio.transcriptions.create(
model=model_name,
file=mary_had_lamb,
language="en",
response_format="text",
prompt=prompt,
temperature=0.0)
out_capital = json.loads(transcription_wprompt)['text']
assert prompt not in out_capital
@pytest.mark.asyncio @pytest.mark.asyncio
...@@ -74,19 +63,31 @@ async def test_bad_requests(mary_had_lamb): ...@@ -74,19 +63,31 @@ async def test_bad_requests(mary_had_lamb):
language="hh", language="hh",
temperature=0.0) temperature=0.0)
# Expect audio too long: repeat the timeseries
mary_had_lamb.seek(0) @pytest.mark.asyncio
audio, sr = librosa.load(mary_had_lamb) async def test_long_audio_request(mary_had_lamb):
repeated_audio = np.tile(audio, 10) model_name = "openai/whisper-large-v3-turbo"
# Repeated audio to buffer server_args = ["--enforce-eager"]
buffer = io.BytesIO()
sf.write(buffer, repeated_audio, sr, format='WAV') mary_had_lamb.seek(0)
buffer.seek(0) audio, sr = librosa.load(mary_had_lamb)
with pytest.raises(openai.BadRequestError): # Add small silence after each audio for repeatability in the split process
await client.audio.transcriptions.create(model=model_name, audio = np.pad(audio, (0, 1600))
file=buffer, repeated_audio = np.tile(audio, 10)
language="en", # Repeated audio to buffer
temperature=0.0) buffer = io.BytesIO()
sf.write(buffer, repeated_audio, sr, format='WAV')
buffer.seek(0)
with RemoteOpenAIServer(model_name, server_args) as remote_server:
client = remote_server.get_async_client()
transcription = await client.audio.transcriptions.create(
model=model_name,
file=buffer,
language="en",
response_format="text",
temperature=0.0)
out = json.loads(transcription)['text']
assert out.count("Mary had a little lamb") == 10
@pytest.mark.asyncio @pytest.mark.asyncio
...@@ -226,3 +227,31 @@ async def test_sampling_params(mary_had_lamb): ...@@ -226,3 +227,31 @@ async def test_sampling_params(mary_had_lamb):
extra_body=dict(seed=42)) extra_body=dict(seed=42))
assert greedy_transcription.text != transcription.text assert greedy_transcription.text != transcription.text
@pytest.mark.asyncio
async def test_audio_prompt(mary_had_lamb):
    """Transcribe the same clip with and without a prompt.

    Both transcripts must contain the same opening words, i.e. supplying a
    prompt must not cause the start of the transcript to be dropped.
    """
    model_name = "openai/whisper-large-v3-turbo"
    server_args = ["--enforce-eager"]
    prompt = "This is a speech, recorded in a phonograph."
    # Opening words expected in the transcript whether or not a prompt is
    # supplied.
    prefix = "The first words I spoke in the original phonograph"

    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()
        # Arguments shared by both requests; only `prompt` differs.
        shared_kwargs = dict(model=model_name,
                             file=mary_had_lamb,
                             language="en",
                             response_format="text",
                             temperature=0.0)

        plain = await client.audio.transcriptions.create(**shared_kwargs)
        assert prefix in json.loads(plain)['text']

        prompted = await client.audio.transcriptions.create(**shared_kwargs,
                                                            prompt=prompt)
        assert prefix in json.loads(prompted)['text']
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import io
# json is used to decode the text responses returned by the server
import json
from unittest.mock import patch
import librosa
import numpy as np
import pytest
import soundfile as sf
from openai._base_client import AsyncAPIClient
from vllm.assets.audio import AudioAsset
from ...utils import RemoteOpenAIServer
@pytest.fixture
def foscolo():
    """Yield the ``azacinto_foscolo`` audio asset opened in binary mode.

    Used as the input for the it->en translation tests; the file is closed
    when the requesting test finishes.
    """
    audio_path = str(AudioAsset('azacinto_foscolo').get_local_path())
    with open(audio_path, "rb") as audio_file:
        yield audio_file
# NOTE: (NickLucche) the large-v3-turbo model was not trained on translation!
@pytest.mark.asyncio
async def test_basic_audio(foscolo):
    """Translate the Italian clip and check for a known English phrase."""
    model_name = "openai/whisper-small"
    server_args = ["--enforce-eager"]
    expected_phrase = "Nor will I ever touch the sacred"

    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()
        translation = await client.audio.translations.create(
            model=model_name,
            file=foscolo,
            response_format="text",
            # TODO remove once language detection is implemented
            extra_body=dict(language="it"),
            temperature=0.0)
        translated_text = json.loads(translation)['text'].strip()
        assert expected_phrase in translated_text
@pytest.mark.asyncio
async def test_audio_prompt(foscolo):
    """Translate the clip while conditioning the model on a starting text.

    The output must not contain the phrase "Nor will I ever touch the
    sacred", and must not contain the prompt itself.
    """
    model_name = "openai/whisper-small"
    server_args = ["--enforce-eager"]
    # Condition whisper on starting text
    prompt = "Nor have I ever"
    unprompted_phrase = "Nor will I ever touch the sacred"

    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()
        translation = await client.audio.translations.create(
            model=model_name,
            file=foscolo,
            prompt=prompt,
            extra_body=dict(language="it"),
            response_format="text",
            temperature=0.0)
        translated_text = json.loads(translation)['text']
        assert unprompted_phrase not in translated_text
        assert prompt not in translated_text
@pytest.mark.asyncio
async def test_non_asr_model(foscolo):
    """A text-to-text model must reject Translations API requests."""
    # text to text model
    model_name = "JackFram/llama-68m"
    with RemoteOpenAIServer(model_name,
                            ["--enforce-eager"]) as remote_server:
        client = remote_server.get_async_client()
        res = await client.audio.translations.create(
            model=model_name,
            file=foscolo,
            temperature=0.0,
        )
        # The error is returned as a response object, not raised.
        assert res.code == 400
        assert not res.text
        assert res.message == "The model does not support Translations API"
@pytest.mark.asyncio
async def test_streaming_response(foscolo):
    """Streamed translation chunks must concatenate to the non-streamed text."""
    model_name = "openai/whisper-small"
    with RemoteOpenAIServer(model_name,
                            ["--enforce-eager"]) as remote_server:
        # Reference result obtained without streaming.
        client = remote_server.get_async_client()
        res_no_stream = await client.audio.translations.create(
            model=model_name,
            file=foscolo,
            response_format="json",
            extra_body={"language": "it"},
            temperature=0.0,
        )

        # Unfortunately this only works when the openai client is patched
        # to use streaming mode, not exposed in the translation api.
        original_post = AsyncAPIClient.post

        async def forced_stream_post(*args, **kwargs):
            kwargs.update(stream=True)
            return await original_post(*args, **kwargs)

        with patch.object(AsyncAPIClient, "post", new=forced_stream_post):
            client = remote_server.get_async_client()
            res = await client.audio.translations.create(
                model=model_name,
                file=foscolo,
                temperature=0.0,
                extra_body={"stream": True, "language": "it"},
            )
            # Reconstruct from chunks and validate
            pieces = [
                chunk.choices[0]['delta']['content'] async for chunk in res
            ]

        translation = "".join(pieces)
        assert translation == res_no_stream.text
@pytest.mark.asyncio
async def test_stream_options(foscolo):
    """Check usage reporting options on a streamed translation.

    The stream must contain a chunk with no choices (the final usage
    message), and every chunk that does have choices must expose a
    ``usage`` attribute.
    """
    model_name = "openai/whisper-small"
    with RemoteOpenAIServer(model_name,
                            ["--enforce-eager"]) as remote_server:
        original_post = AsyncAPIClient.post

        async def forced_stream_post(*args, **kwargs):
            kwargs.update(stream=True)
            return await original_post(*args, **kwargs)

        with patch.object(AsyncAPIClient, "post", new=forced_stream_post):
            client = remote_server.get_async_client()
            stream_opts = {
                "language": "it",
                "stream": True,
                "stream_include_usage": True,
                "stream_continuous_usage_stats": True,
            }
            res = await client.audio.translations.create(
                model=model_name,
                file=foscolo,
                temperature=0.0,
                extra_body=stream_opts,
            )
            saw_final_usage = False
            usage_on_every_chunk = True
            async for chunk in res:
                if len(chunk.choices) == 0:
                    # final usage sent
                    saw_final_usage = True
                elif usage_on_every_chunk and not hasattr(chunk, 'usage'):
                    usage_on_every_chunk = False
            assert saw_final_usage and usage_on_every_chunk
@pytest.mark.asyncio
async def test_long_audio_request(foscolo):
    """Translate the sample played twice and expect the phrase twice."""
    model_name = "openai/whisper-small"

    # Rewind before decoding the audio from the file handle.
    foscolo.seek(0)
    audio, sr = librosa.load(foscolo)

    # Repeated audio to buffer
    buffer = io.BytesIO()
    sf.write(buffer, np.tile(audio, 2), sr, format='WAV')
    buffer.seek(0)

    with RemoteOpenAIServer(model_name,
                            ["--enforce-eager"]) as remote_server:
        request_kwargs = dict(
            model=model_name,
            file=buffer,
            extra_body={"language": "it"},
            response_format="text",
            temperature=0.0,
        )
        client = remote_server.get_async_client()
        translation = await client.audio.translations.create(
            **request_kwargs)
        payload = json.loads(translation)
        out = payload['text'].strip().lower()
        # TODO investigate higher model uncertainty in for longer translations.
        assert out.count("nor will i ever") == 2
...@@ -50,7 +50,7 @@ async def client(server): ...@@ -50,7 +50,7 @@ async def client(server):
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def base64_encoded_video() -> dict[str, str]: def base64_encoded_video() -> dict[str, str]:
return { return {
video_url: encode_video_base64(fetch_video(video_url)) video_url: encode_video_base64(fetch_video(video_url)[0])
for video_url in TEST_VIDEO_URLS for video_url in TEST_VIDEO_URLS
} }
......
...@@ -25,6 +25,25 @@ TEST_IMAGE_URLS = [ ...@@ -25,6 +25,25 @@ TEST_IMAGE_URLS = [
"https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
] ]
# Expected beam-search completions, indexed in parallel with TEST_IMAGE_URLS:
# entry i holds the expected texts for the image at TEST_IMAGE_URLS[i].
# The beam-search test compares each entry pairwise (via zip) against the
# choices returned for a request made with n=2.
EXPECTED_MM_BEAM_SEARCH_RES = [
[
"The image shows a wooden boardwalk leading through a",
"The image shows a wooden boardwalk extending into a",
],
[
"The image shows two parrots perched on",
"The image shows two birds perched on a cur",
],
[
"The image shows a Venn diagram with three over",
"This image shows a Venn diagram with three over",
],
[
"This image displays a gradient of colors ranging from",
"This image displays a gradient of colors transitioning from",
],
]
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): def server():
...@@ -270,10 +289,13 @@ async def test_single_chat_session_image_base64encoded( ...@@ -270,10 +289,13 @@ async def test_single_chat_session_image_base64encoded(
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) @pytest.mark.parametrize("image_idx", list(range(len(TEST_IMAGE_URLS))))
async def test_single_chat_session_image_base64encoded_beamsearch( async def test_single_chat_session_image_base64encoded_beamsearch(
client: openai.AsyncOpenAI, model_name: str, image_url: str, client: openai.AsyncOpenAI, model_name: str, image_idx: int,
base64_encoded_image: dict[str, str]): base64_encoded_image: dict[str, str]):
# NOTE: This test also validates that we pass MM data through beam search
image_url = TEST_IMAGE_URLS[image_idx]
expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx]
messages = [{ messages = [{
"role": "role":
...@@ -297,10 +319,11 @@ async def test_single_chat_session_image_base64encoded_beamsearch( ...@@ -297,10 +319,11 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
messages=messages, messages=messages,
n=2, n=2,
max_completion_tokens=10, max_completion_tokens=10,
temperature=0.0,
extra_body=dict(use_beam_search=True)) extra_body=dict(use_beam_search=True))
assert len(chat_completion.choices) == 2 assert len(chat_completion.choices) == 2
assert chat_completion.choices[ for actual, expected_str in zip(chat_completion.choices, expected_res):
0].message.content != chat_completion.choices[1].message.content assert actual.message.content == expected_str
@pytest.mark.asyncio @pytest.mark.asyncio
......
...@@ -264,10 +264,8 @@ def test_parse_chat_messages_multiple_images( ...@@ -264,10 +264,8 @@ def test_parse_chat_messages_multiple_images(
"url": image_url "url": image_url
} }
}, { }, {
"type": "image_url", "type": "image_pil",
"image_url": { "image_pil": ImageAsset('cherry_blossom').pil_image
"url": image_url
}
}, { }, {
"type": "text", "type": "text",
"text": "What's in these images?" "text": "What's in these images?"
...@@ -303,10 +301,8 @@ async def test_parse_chat_messages_multiple_images_async( ...@@ -303,10 +301,8 @@ async def test_parse_chat_messages_multiple_images_async(
"url": image_url "url": image_url
} }
}, { }, {
"type": "image_url", "type": "image_pil",
"image_url": { "image_pil": ImageAsset('cherry_blossom').pil_image
"url": image_url
}
}, { }, {
"type": "text", "type": "text",
"text": "What's in these images?" "text": "What's in these images?"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment