Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
38d80967
Commit
38d80967
authored
Sep 12, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.10.2rc2' into v0.10.2rc2-ori
parents
33650733
880c741b
Changes
544
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
353 additions
and
153 deletions
+353
-153
tests/entrypoints/conftest.py
tests/entrypoints/conftest.py
+29
-0
tests/entrypoints/llm/test_chat.py
tests/entrypoints/llm/test_chat.py
+3
-2
tests/entrypoints/offline_mode/test_offline_mode.py
tests/entrypoints/offline_mode/test_offline_mode.py
+2
-2
tests/entrypoints/openai/conftest.py
tests/entrypoints/openai/conftest.py
+27
-0
tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
.../openai/correctness/test_transcription_api_correctness.py
+1
-1
tests/entrypoints/openai/test_chat.py
tests/entrypoints/openai/test_chat.py
+1
-56
tests/entrypoints/openai/test_chat_template.py
tests/entrypoints/openai/test_chat_template.py
+3
-1
tests/entrypoints/openai/test_completion.py
tests/entrypoints/openai/test_completion.py
+0
-26
tests/entrypoints/openai/test_completion_with_prompt_embeds.py
.../entrypoints/openai/test_completion_with_prompt_embeds.py
+1
-26
tests/entrypoints/openai/test_encoder_decoder.py
tests/entrypoints/openai/test_encoder_decoder.py
+1
-0
tests/entrypoints/openai/test_lora_adapters.py
tests/entrypoints/openai/test_lora_adapters.py
+0
-8
tests/entrypoints/openai/test_metrics.py
tests/entrypoints/openai/test_metrics.py
+15
-7
tests/entrypoints/openai/test_models.py
tests/entrypoints/openai/test_models.py
+0
-8
tests/entrypoints/openai/test_prompt_validation.py
tests/entrypoints/openai/test_prompt_validation.py
+9
-5
tests/entrypoints/openai/test_response_api_with_harmony.py
tests/entrypoints/openai/test_response_api_with_harmony.py
+18
-1
tests/entrypoints/openai/test_return_token_ids.py
tests/entrypoints/openai/test_return_token_ids.py
+1
-1
tests/entrypoints/openai/test_return_tokens_as_ids.py
tests/entrypoints/openai/test_return_tokens_as_ids.py
+0
-2
tests/entrypoints/openai/test_serving_chat.py
tests/entrypoints/openai/test_serving_chat.py
+238
-3
tests/entrypoints/openai/test_skip_tokenizer.py
tests/entrypoints/openai/test_skip_tokenizer.py
+4
-2
tests/entrypoints/openai/test_tokenization.py
tests/entrypoints/openai/test_tokenization.py
+0
-2
No files found.
Too many changes to show.
To preserve performance only
544 of 544+
files are displayed.
Plain diff
Email patch
tests/entrypoints/conftest.py
View file @
38d80967
...
...
@@ -201,3 +201,32 @@ table: "table_1" | "table_2"
condition: column "=" number
number: "1" | "2"
"""
)
@pytest.fixture(scope="session")
def zephyr_lora_files():
    """Fetch the zephyr LoRA adapter snapshot a single time per test session.

    Returns:
        Whatever ``snapshot_download`` returns for the adapter repository
        (presumably the local snapshot directory -- confirm against the
        huggingface_hub documentation).
    """
    # Imported lazily so huggingface_hub is only loaded when the fixture is
    # actually requested.
    from huggingface_hub import snapshot_download

    lora_repo_id = "typeof/zephyr-7b-beta-lora"
    return snapshot_download(repo_id=lora_repo_id)
@pytest.fixture(scope="session")
def zephyr_lora_added_tokens_files(zephyr_lora_files):
    """Create zephyr LoRA files with added tokens once per test session.

    The adapter files from ``zephyr_lora_files`` are copied into a temporary
    directory, and a tokenizer extended with three extra special tokens is
    saved next to them.

    Args:
        zephyr_lora_files: Location of the downloaded LoRA adapter files.

    Yields:
        Path (str) of the temporary ``zephyr`` directory holding the adapter
        files plus the extended tokenizer.
    """
    import shutil
    from tempfile import TemporaryDirectory

    from transformers import AutoTokenizer

    # The context manager removes the directory on fixture teardown and also
    # when setup fails before the fixture reaches ``yield``.
    with TemporaryDirectory() as tmp_dir:
        tmp_model_dir = f"{tmp_dir}/zephyr"
        shutil.copytree(zephyr_lora_files, tmp_model_dir)
        tokenizer = AutoTokenizer.from_pretrained(
            "HuggingFaceH4/zephyr-7b-beta")
        # Copy tokenizer to adapter and add some unique tokens
        # 32000, 32001, 32002
        added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"],
                                     special_tokens=True)
        assert added == 3
        tokenizer.save_pretrained(tmp_model_dir)
        yield tmp_model_dir
tests/entrypoints/llm/test_chat.py
View file @
38d80967
...
...
@@ -7,7 +7,7 @@ import pytest
from
vllm
import
LLM
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
..openai.test_vision
import
TEST_IMAGE_
URL
S
from
..openai.test_vision
import
TEST_IMAGE_
ASSET
S
@
pytest
.
fixture
(
scope
=
"function"
)
...
...
@@ -95,7 +95,8 @@ def vision_llm():
@
pytest
.
mark
.
parametrize
(
"image_urls"
,
[[
TEST_IMAGE_URLS
[
0
],
TEST_IMAGE_URLS
[
1
]]])
[[
TEST_IMAGE_ASSETS
[
0
],
TEST_IMAGE_ASSETS
[
1
]]],
indirect
=
True
)
def
test_chat_multi_image
(
vision_llm
,
image_urls
:
list
[
str
]):
messages
=
[{
"role"
:
...
...
tests/entrypoints/offline_mode/test_offline_mode.py
View file @
38d80967
...
...
@@ -79,7 +79,7 @@ def test_offline_mode(monkeypatch: pytest.MonkeyPatch):
)
# Need to re-import huggingface_hub
# and friends to setup offline mode
# and friends to set
up offline mode
_re_import_modules
()
# Cached model files should be used in offline mode
for
model_config
in
MODEL_CONFIGS
:
...
...
@@ -136,7 +136,7 @@ def test_model_from_huggingface_offline(monkeypatch: pytest.MonkeyPatch):
disable_connect
,
)
# Need to re-import huggingface_hub
# and friends to setup offline mode
# and friends to set
up offline mode
_re_import_modules
()
engine_args
=
EngineArgs
(
model
=
"facebook/opt-125m"
)
LLM
(
**
dataclasses
.
asdict
(
engine_args
))
...
...
tests/entrypoints/openai/conftest.py
0 → 100644
View file @
38d80967
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
from
vllm.assets.audio
import
AudioAsset
@pytest.fixture
def mary_had_lamb():
    """Yield the 'mary_had_lamb' audio asset opened for binary reading.

    The file handle is closed when the fixture is torn down.
    """
    audio_path = str(AudioAsset('mary_had_lamb').get_local_path())
    with open(audio_path, "rb") as audio_file:
        yield audio_file
@pytest.fixture
def winning_call():
    """Yield the 'winning_call' audio asset opened for binary reading.

    The file handle is closed when the fixture is torn down.
    """
    audio_path = str(AudioAsset('winning_call').get_local_path())
    with open(audio_path, "rb") as audio_file:
        yield audio_file
@pytest.fixture
def foscolo():
    """Yield the 'azacinto_foscolo' audio asset opened for binary reading.

    Used for the Italian -> English translation test. The file handle is
    closed when the fixture is torn down.
    """
    # Test translation it->en
    audio_path = str(AudioAsset('azacinto_foscolo').get_local_path())
    with open(audio_path, "rb") as audio_file:
        yield audio_file
tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
View file @
38d80967
...
...
@@ -32,7 +32,7 @@ def to_bytes(y, sr):
async
def
transcribe_audio
(
client
,
tokenizer
,
y
,
sr
):
# Send loaded audio directly instead of loading from disk,
# dont account for that time though
# don
'
t account for that time though
with
to_bytes
(
y
,
sr
)
as
f
:
start_time
=
time
.
perf_counter
()
transcription
=
await
client
.
audio
.
transcriptions
.
create
(
...
...
tests/entrypoints/openai/test_chat.py
View file @
38d80967
...
...
@@ -12,11 +12,9 @@ import pytest_asyncio
import
regex
as
re
import
requests
import
torch
from
openai
import
BadRequestError
,
OpenAI
from
openai
import
BadRequestError
from
...utils
import
RemoteOpenAIServer
from
.test_completion
import
zephyr_lora_added_tokens_files
# noqa: F401
from
.test_completion
import
zephyr_lora_files
# noqa: F401
# any model with a chat template should work here
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
...
...
@@ -970,59 +968,6 @@ async def test_long_seed(client: openai.AsyncOpenAI):
or
"less_than_equal"
in
exc_info
.
value
.
message
)
@pytest.mark.asyncio
async def test_http_chat_no_model_name_with_curl(server: RemoteOpenAIServer):
    """POST a chat completion without a ``model`` field over raw HTTP.

    Checks that the response reports ``MODEL_NAME`` as the model and that
    the first choice carries a non-empty message content.
    """
    endpoint = f"http://localhost:{server.port}/v1/chat/completions"
    chat_messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant."
        },
        {
            "role": "user",
            "content": "what is 1+1?"
        },
    ]
    # model_name is avoided here.
    payload = {"messages": chat_messages, "max_tokens": 5}

    response = requests.post(
        endpoint,
        headers={"Content-Type": "application/json"},
        json=payload,
    )
    body = response.json()
    print(body)
    assert body.get("model") == MODEL_NAME

    first_choice = body.get("choices")[0]
    reply = first_choice.get("message")
    assert reply is not None
    reply_text = reply.get("content")
    assert reply_text is not None
    assert len(reply_text) > 0
@pytest.mark.asyncio
async def test_http_chat_no_model_name_with_openai(server: RemoteOpenAIServer):
    """Send a chat completion with an empty model string via the OpenAI client.

    Checks that the response reports ``MODEL_NAME`` as the model.
    """
    client = OpenAI(
        api_key="EMPTY",
        base_url=f"http://localhost:{server.port}/v1",
    )
    response = client.chat.completions.create(
        model="",  # empty string
        messages=[{
            "role": "user",
            "content": "Hello, vLLM!"
        }],
    )
    assert response.model == MODEL_NAME
@
pytest
.
mark
.
asyncio
async
def
test_invocations
(
server
:
RemoteOpenAIServer
,
client
:
openai
.
AsyncOpenAI
):
...
...
tests/entrypoints/openai/test_chat_template.py
View file @
38d80967
...
...
@@ -104,7 +104,9 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
trust_remote_code
=
model_info
.
trust_remote_code
,
revision
=
model_info
.
revision
,
hf_overrides
=
model_info
.
hf_overrides
,
)
skip_tokenizer_init
=
model_info
.
skip_tokenizer_init
,
enforce_eager
=
model_info
.
enforce_eager
,
dtype
=
model_info
.
dtype
)
# Initialize the tokenizer
tokenizer
=
get_tokenizer
(
...
...
tests/entrypoints/openai/test_completion.py
View file @
38d80967
...
...
@@ -3,8 +3,6 @@
# imports for guided decoding tests
import
json
import
os
import
shutil
from
tempfile
import
TemporaryDirectory
from
typing
import
Optional
import
jsonschema
...
...
@@ -14,9 +12,7 @@ import pytest_asyncio
import
regex
as
re
import
requests
# downloading lora to test lora requests
from
huggingface_hub
import
snapshot_download
from
openai
import
BadRequestError
from
transformers
import
AutoTokenizer
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
...
...
@@ -26,32 +22,10 @@ from ...utils import RemoteOpenAIServer
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
# technically these adapters use a different base model,
# but we're not testing generation quality here
LORA_NAME
=
"typeof/zephyr-7b-beta-lora"
GUIDED_DECODING_BACKENDS
=
[
"outlines"
,
"xgrammar"
,
"guidance"
]
@pytest.fixture(scope="module")
def zephyr_lora_files():
    """Fetch the LoRA adapter snapshot named by ``LORA_NAME`` once per module.

    Returns:
        Whatever ``snapshot_download`` returns for that repository
        (presumably the local snapshot directory -- confirm against the
        huggingface_hub documentation).
    """
    lora_snapshot = snapshot_download(repo_id=LORA_NAME)
    return lora_snapshot
@pytest.fixture(scope="module")
def zephyr_lora_added_tokens_files(zephyr_lora_files):
    """Build a copy of the LoRA adapter with an extended tokenizer.

    The adapter files from ``zephyr_lora_files`` are copied into a temporary
    directory, and the ``MODEL_NAME`` tokenizer, extended with three extra
    special tokens, is saved next to them.

    Args:
        zephyr_lora_files: Location of the downloaded LoRA adapter files.

    Yields:
        Path (str) of the temporary ``zephyr`` directory holding the adapter
        files plus the extended tokenizer.
    """
    # The context manager removes the directory on fixture teardown and also
    # when setup fails before the fixture reaches ``yield``.
    with TemporaryDirectory() as tmp_dir:
        tmp_model_dir = f"{tmp_dir}/zephyr"
        shutil.copytree(zephyr_lora_files, tmp_model_dir)
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        # Copy tokenizer to adapter and add some unique tokens
        # 32000, 32001, 32002
        added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"],
                                     special_tokens=True)
        assert added == 3
        tokenizer.save_pretrained(tmp_model_dir)
        yield tmp_model_dir
@
pytest
.
fixture
(
scope
=
"module"
)
def
default_server_args
(
zephyr_lora_files
,
zephyr_lora_added_tokens_files
):
return
[
...
...
tests/entrypoints/openai/test_completion_with_prompt_embeds.py
View file @
38d80967
...
...
@@ -3,48 +3,23 @@
import
base64
import
io
import
shutil
from
tempfile
import
TemporaryDirectory
import
openai
# use the official client for correctness check
import
pytest
import
pytest_asyncio
import
torch
# downloading lora to test lora requests
from
huggingface_hub
import
snapshot_download
from
openai
import
BadRequestError
from
transformers
import
AutoConfig
,
AutoTokenizer
from
transformers
import
AutoConfig
from
...utils
import
RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
LORA_NAME
=
"typeof/zephyr-7b-beta-lora"
CONFIG
=
AutoConfig
.
from_pretrained
(
MODEL_NAME
)
@pytest.fixture(scope="module")
def zephyr_lora_files():
    """Fetch the LoRA adapter snapshot named by ``LORA_NAME`` once per module.

    Returns:
        Whatever ``snapshot_download`` returns for that repository
        (presumably the local snapshot directory -- confirm against the
        huggingface_hub documentation).
    """
    lora_snapshot = snapshot_download(repo_id=LORA_NAME)
    return lora_snapshot
@pytest.fixture(scope="module")
def zephyr_lora_added_tokens_files(zephyr_lora_files):
    """Build a copy of the LoRA adapter with an extended tokenizer.

    The adapter files from ``zephyr_lora_files`` are copied into a temporary
    directory, and the ``MODEL_NAME`` tokenizer, extended with three extra
    special tokens, is saved next to them.

    Args:
        zephyr_lora_files: Location of the downloaded LoRA adapter files.

    Yields:
        Path (str) of the temporary ``zephyr`` directory holding the adapter
        files plus the extended tokenizer.
    """
    # The context manager removes the directory on fixture teardown and also
    # when setup fails before the fixture reaches ``yield``.
    with TemporaryDirectory() as tmp_dir:
        tmp_model_dir = f"{tmp_dir}/zephyr"
        shutil.copytree(zephyr_lora_files, tmp_model_dir)
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        # Copy tokenizer to adapter and add some unique tokens
        # 32000, 32001, 32002
        added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"],
                                     special_tokens=True)
        assert added == 3
        tokenizer.save_pretrained(tmp_model_dir)
        yield tmp_model_dir
@
pytest
.
fixture
(
scope
=
"module"
)
def
default_server_args
(
zephyr_lora_files
,
...
...
tests/entrypoints/openai/test_encoder_decoder.py
View file @
38d80967
...
...
@@ -30,6 +30,7 @@ async def client(server):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
skip
(
reason
=
"bart is not yet supported in V1"
)
async
def
test_single_completion
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
completion
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
"Hello, my name is"
,
...
...
tests/entrypoints/openai/test_lora_adapters.py
View file @
38d80967
...
...
@@ -9,8 +9,6 @@ from contextlib import suppress
import
openai
# use the official client for correctness check
import
pytest
import
pytest_asyncio
# downloading lora to test lora requests
from
huggingface_hub
import
snapshot_download
from
...utils
import
RemoteOpenAIServer
...
...
@@ -18,7 +16,6 @@ from ...utils import RemoteOpenAIServer
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
# technically this needs Mistral-7B-v0.1 as base, but we're not testing
# generation quality here
LORA_NAME
=
"typeof/zephyr-7b-beta-lora"
BADREQUEST_CASES
=
[
(
...
...
@@ -48,11 +45,6 @@ BADREQUEST_CASES = [
]
@pytest.fixture(scope="module")
def zephyr_lora_files():
    """Fetch the LoRA adapter snapshot named by ``LORA_NAME`` once per module.

    Returns:
        Whatever ``snapshot_download`` returns for that repository
        (presumably the local snapshot directory -- confirm against the
        huggingface_hub documentation).
    """
    lora_snapshot = snapshot_download(repo_id=LORA_NAME)
    return lora_snapshot
@
pytest
.
fixture
(
scope
=
"module"
)
def
monkeypatch_module
():
from
_pytest.monkeypatch
import
MonkeyPatch
...
...
tests/entrypoints/openai/test_metrics.py
View file @
38d80967
...
...
@@ -250,12 +250,15 @@ EXPECTED_METRICS_V1 = [
"vllm:request_params_max_tokens_sum"
,
"vllm:request_params_max_tokens_bucket"
,
"vllm:request_params_max_tokens_count"
,
"vllm:time_to_first_token_seconds_sum"
,
"vllm:time_to_first_token_seconds_bucket"
,
"vllm:time_to_first_token_seconds_count"
,
"vllm:time_per_output_token_seconds_sum"
,
"vllm:time_per_output_token_seconds_bucket"
,
"vllm:time_per_output_token_seconds_count"
,
"vllm:time_to_first_token_seconds_sum"
,
"vllm:time_to_first_token_seconds_bucket"
,
"vllm:time_to_first_token_seconds_count"
,
"vllm:inter_token_latency_seconds_sum"
,
"vllm:inter_token_latency_seconds_bucket"
,
"vllm:inter_token_latency_seconds_count"
,
"vllm:e2e_request_latency_seconds_sum"
,
"vllm:e2e_request_latency_seconds_bucket"
,
"vllm:e2e_request_latency_seconds_count"
,
...
...
@@ -273,7 +276,11 @@ EXPECTED_METRICS_V1 = [
"vllm:request_decode_time_seconds_count"
,
]
HIDDEN_DEPRECATED_METRICS
:
list
[
str
]
=
[]
HIDDEN_DEPRECATED_METRICS
:
list
[
str
]
=
[
"vllm:time_per_output_token_seconds_sum"
,
"vllm:time_per_output_token_seconds_bucket"
,
"vllm:time_per_output_token_seconds_count"
,
]
@
pytest
.
mark
.
asyncio
...
...
@@ -289,9 +296,10 @@ async def test_metrics_exist(server: RemoteOpenAIServer,
assert
response
.
status_code
==
HTTPStatus
.
OK
for
metric
in
(
EXPECTED_METRICS_V1
if
use_v1
else
EXPECTED_METRICS
):
if
(
not
server
.
show_hidden_metrics
and
metric
not
in
HIDDEN_DEPRECATED_METRICS
):
assert
metric
in
response
.
text
if
(
metric
in
HIDDEN_DEPRECATED_METRICS
and
not
server
.
show_hidden_metrics
):
continue
assert
metric
in
response
.
text
@
pytest
.
mark
.
asyncio
...
...
tests/entrypoints/openai/test_models.py
View file @
38d80967
...
...
@@ -4,8 +4,6 @@
import
openai
# use the official client for correctness check
import
pytest
import
pytest_asyncio
# downloading lora to test lora requests
from
huggingface_hub
import
snapshot_download
from
...utils
import
RemoteOpenAIServer
...
...
@@ -13,12 +11,6 @@ from ...utils import RemoteOpenAIServer
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
# technically this needs Mistral-7B-v0.1 as base, but we're not testing
# generation quality here
LORA_NAME
=
"typeof/zephyr-7b-beta-lora"
@pytest.fixture(scope="module")
def zephyr_lora_files():
    """Fetch the LoRA adapter snapshot named by ``LORA_NAME`` once per module.

    Returns:
        Whatever ``snapshot_download`` returns for that repository
        (presumably the local snapshot directory -- confirm against the
        huggingface_hub documentation).
    """
    lora_snapshot = snapshot_download(repo_id=LORA_NAME)
    return lora_snapshot
@
pytest
.
fixture
(
scope
=
"module"
)
...
...
tests/entrypoints/openai/test_prompt_validation.py
View file @
38d80967
...
...
@@ -10,7 +10,7 @@ import pytest
import
regex
as
re
import
torch
from
vllm.entrypoints.
openai.serving_engine
import
OpenAIServing
from
vllm.entrypoints.
renderer
import
BaseRenderer
from
...utils
import
RemoteOpenAIServer
...
...
@@ -27,12 +27,16 @@ async def test_empty_prompt():
with
RemoteOpenAIServer
(
model_name
,
server_args
)
as
remote_server
:
client
=
remote_server
.
get_async_client
()
with
pytest
.
raises
(
openai
.
BadRequestError
,
match
=
"decoder prompt cannot be empty"
):
with
pytest
.
raises
(
openai
.
BadRequestError
,
match
=
"Either prompt or prompt_embeds must be provided and non-empty."
):
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
""
,
max_tokens
=
5
,
temperature
=
0.0
)
temperature
=
0.0
,
extra_body
=
{
"prompt_embeds"
:
[]})
@
pytest
.
mark
.
asyncio
...
...
@@ -83,7 +87,7 @@ def test_load_prompt_embeds(dtype: torch.dtype, layout: torch.layout,
buffer
.
seek
(
0
)
encoded_tensor
=
pybase64
.
b64encode
(
buffer
.
getvalue
())
loaded_prompt_embeds
=
OpenAIServing
.
_
load_prompt_embeds
(
encoded_tensor
)
loaded_prompt_embeds
=
BaseRenderer
.
load_prompt_embeds
(
encoded_tensor
)
assert
len
(
loaded_prompt_embeds
)
==
1
loaded_tensor
=
loaded_prompt_embeds
[
0
][
"prompt_embeds"
]
assert
loaded_tensor
.
device
.
type
==
"cpu"
...
...
tests/entrypoints/openai/test_response_api_with_harmony.py
View file @
38d80967
...
...
@@ -275,7 +275,8 @@ async def test_stateful_multi_turn(client: OpenAI, model_name: str):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
async
def
test_streaming
(
client
:
OpenAI
,
model_name
:
str
):
@
pytest
.
mark
.
parametrize
(
"background"
,
[
True
,
False
])
async
def
test_streaming
(
client
:
OpenAI
,
model_name
:
str
,
background
:
bool
):
# TODO: Add back when web search and code interpreter are available in CI
prompts
=
[
"tell me a story about a cat in 20 words"
,
...
...
@@ -300,11 +301,16 @@ async def test_streaming(client: OpenAI, model_name: str):
# },
],
stream
=
True
,
background
=
background
,
)
events
=
[]
current_event_mode
=
None
resp_id
=
None
async
for
event
in
response
:
if
event
.
type
==
"response.created"
:
resp_id
=
event
.
response
.
id
if
current_event_mode
!=
event
.
type
:
current_event_mode
=
event
.
type
print
(
f
"
\n
[
{
event
.
type
}
] "
,
end
=
""
,
flush
=
True
)
...
...
@@ -322,6 +328,17 @@ async def test_streaming(client: OpenAI, model_name: str):
assert
len
(
events
)
>
0
if
background
:
starting_after
=
5
async
with
await
client
.
responses
.
retrieve
(
response_id
=
resp_id
,
stream
=
True
,
starting_after
=
starting_after
)
as
stream
:
counter
=
starting_after
async
for
event
in
stream
:
counter
+=
1
assert
event
==
events
[
counter
]
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
...
...
tests/entrypoints/openai/test_return_token_ids.py
View file @
38d80967
...
...
@@ -224,7 +224,7 @@ async def test_comparison_with_prompt_logprobs_and_logprobs(server):
logprobs_token_ids
.
append
(
token_id
)
# When echo=True, the logprobs include both prompt and response tokens
# The token_ids field should match the
the
suffix of response portion
# The token_ids field should match the suffix of response portion
# The prompt_token_ids should match the prompt portion
assert
len
(
completion
.
choices
[
0
].
token_ids
)
<
len
(
logprobs_token_ids
)
response_token_ids_length
=
len
(
completion
.
choices
[
0
].
token_ids
)
...
...
tests/entrypoints/openai/test_return_tokens_as_ids.py
View file @
38d80967
...
...
@@ -11,8 +11,6 @@ from vllm.transformers_utils.tokenizer import get_tokenizer
from
...utils
import
RemoteOpenAIServer
from
.test_completion
import
default_server_args
# noqa: F401
from
.test_completion
import
zephyr_lora_added_tokens_files
# noqa: F401
from
.test_completion
import
zephyr_lora_files
# noqa: F401
from
.test_completion
import
MODEL_NAME
...
...
tests/entrypoints/openai/test_serving_chat.py
View file @
38d80967
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
__future__
import
annotations
import
asyncio
from
contextlib
import
suppress
from
dataclasses
import
dataclass
,
field
from
typing
import
Any
,
Optional
from
typing
import
TYPE_CHECKING
,
Any
,
Optional
from
unittest.mock
import
MagicMock
import
pytest
import
pytest_asyncio
from
vllm.config
import
MultiModalConfig
from
vllm.engine.multiprocessing.client
import
MQLLMEngineClient
...
...
@@ -17,9 +20,205 @@ from vllm.entrypoints.openai.serving_models import (BaseModelPath,
OpenAIServingModels
)
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
...utils
import
RemoteOpenAIServer
if
TYPE_CHECKING
:
from
openai
import
OpenAI
GPT_OSS_MODEL_NAME
=
"openai/gpt-oss-20b"
@pytest.fixture(scope="module")
def monkeypatch_module():
    """Provide a module-scoped ``MonkeyPatch`` and undo its patches afterwards.

    Yields:
        A fresh ``MonkeyPatch`` instance; ``undo()`` is called on it once the
        fixture is torn down.
    """
    from _pytest.monkeypatch import MonkeyPatch

    module_patch = MonkeyPatch()
    yield module_patch
    module_patch.undo()
@pytest.fixture(
    scope="module",
    params=[True, False],
    ids=["with_tool_parser", "without_tool_parser"],
)
def with_tool_parser(request) -> bool:
    """Parametrize dependent fixtures/tests over tool-parser on and off.

    Returns:
        The current parameter value: ``True`` for the "with_tool_parser"
        case, ``False`` for "without_tool_parser".
    """
    tool_parser_enabled = request.param
    return tool_parser_enabled
@pytest.fixture(scope="module")
def default_server_args(with_tool_parser: bool):
    """Build the server CLI argument list for the gpt-oss tests.

    Args:
        with_tool_parser: When true, the tool-call parser and automatic tool
            choice flags are appended to the base arguments.

    Returns:
        A list of command-line argument strings.
    """
    # NOTE(review): the original comment here mentioned half precision, but
    # no dtype flag is passed in this list -- confirm whether one is intended.
    server_args = [
        "--enforce-eager",
        "--max-model-len",
        "4096",
        "--reasoning-parser",
        "openai_gptoss",
        "--gpu-memory-utilization",
        "0.8",
    ]
    if not with_tool_parser:
        return server_args

    tool_parser_flags = [
        "--tool-call-parser",
        "openai",
        "--enable-auto-tool-choice",
    ]
    return server_args + tool_parser_flags
@pytest.fixture(scope="module")
def gptoss_server(monkeypatch_module: pytest.MonkeyPatch,
                  default_server_args: list[str]):
    """Start a ``RemoteOpenAIServer`` for ``GPT_OSS_MODEL_NAME``.

    ``VLLM_ATTENTION_BACKEND`` is set to ``TRITON_ATTN_VLLM_V1`` inside a
    monkeypatch context that stays open for as long as the server is in use.

    Args:
        monkeypatch_module: Module-scoped ``MonkeyPatch`` instance.
        default_server_args: CLI arguments passed to the server.

    Yields:
        The running ``RemoteOpenAIServer``.
    """
    with monkeypatch_module.context() as patch_ctx:
        patch_ctx.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN_VLLM_V1")
        server_cm = RemoteOpenAIServer(GPT_OSS_MODEL_NAME,
                                       default_server_args)
        with server_cm as running_server:
            yield running_server
@pytest_asyncio.fixture
async def gptoss_client(gptoss_server):
    """Yield an async client obtained from the gpt-oss test server.

    The client is used as an async context manager, so it is exited when the
    fixture is torn down.
    """
    client_cm = gptoss_server.get_async_client()
    async with client_cm as client:
        yield client
@pytest.mark.asyncio
async def test_gpt_oss_chat_tool_call_streaming(gptoss_client: OpenAI,
                                                with_tool_parser: bool):
    """Stream a weather question and check tool-call vs. plain-text output.

    With the tool parser, the stream must carry a tool-call name and
    non-empty arguments. Without it, no tool call may appear and the
    streamed text content must be non-empty.
    """
    weather_parameters = {
        "type": "object",
        "properties": {
            "city": {
                "type": "string"
            },
            "state": {
                "type": "string"
            },
            "unit": {
                "type": "string",
                "enum": ["celsius", "fahrenheit"],
            },
        },
        "required": ["city", "state", "unit"],
    }
    tools = [{
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": weather_parameters,
        },
    }]
    messages = [
        {
            "role": "user",
            "content": "What is the weather in Dallas, TX?"
        },
    ]

    stream = await gptoss_client.chat.completions.create(
        model=GPT_OSS_MODEL_NAME,
        messages=messages,
        # Tools are only offered when the server runs with the tool parser.
        tools=tools if with_tool_parser else None,
        stream=True)

    tool_name = None
    argument_parts = []
    content_parts = []
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.tool_calls:
            # Only the first tool call of each delta is inspected.
            function = delta.tool_calls[0].function
            if function and function.name:
                tool_name = function.name
            if function and function.arguments:
                argument_parts.append(function.arguments)
        text_piece = getattr(delta, "content", None)
        if text_piece:
            content_parts.append(text_piece)

    arguments_text = "".join(argument_parts)
    content_text = "".join(content_parts)
    if with_tool_parser:
        assert tool_name is not None
        assert len(arguments_text) > 0
    else:
        assert tool_name is None
        assert len(arguments_text) == 0
        assert len(content_text) > 0
@pytest.mark.asyncio
async def test_gpt_oss_multi_turn_chat(gptoss_client: OpenAI,
                                       with_tool_parser: bool):
    """Run a two-turn chat where the first turn must produce a tool call.

    The first response must call ``get_current_weather`` with non-empty
    arguments. Those arguments are fed back as an assistant message, and the
    second response must contain either text content or another tool call.
    Skipped when the server runs without the tool parser.
    """
    if not with_tool_parser:
        pytest.skip("skip non-tool for multi-turn tests")

    weather_parameters = {
        "type": "object",
        "properties": {
            "city": {
                "type": "string"
            },
            "state": {
                "type": "string"
            },
            "unit": {
                "type": "string",
                "enum": ["celsius", "fahrenheit"],
            },
        },
        "required": ["city", "state", "unit"],
    }
    tools = [{
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": weather_parameters,
        },
    }]
    messages = [
        {
            "role": "system",
            "content": "you are a helpful assistant"
        },
        {
            "role": "user",
            "content": "What is the weather in Dallas, TX with celsius?"
        },
    ]

    # Both turns share every request option except the growing message list.
    request_options = {
        "model": GPT_OSS_MODEL_NAME,
        "tools": tools,
        "temperature": 0.0,
    }

    first = await gptoss_client.chat.completions.create(messages=messages,
                                                        **request_options)
    first_msg = first.choices[0].message
    assert first_msg.tool_calls is not None and len(first_msg.tool_calls) > 0
    first_call = first_msg.tool_calls[0]
    assert (first_call.function is not None
            and first_call.function.name == "get_current_weather")
    first_call_args = first_call.function.arguments
    assert first_call_args is not None and len(first_call_args) > 0

    # Feed the tool-call arguments back as the assistant turn, then follow up.
    messages.extend([
        {
            "role": "assistant",
            "content": first_call_args
        },
        {
            "role": "user",
            "content": "Now convert to celsius and return JSON only"
        },
    ])

    second = await gptoss_client.chat.completions.create(messages=messages,
                                                         **request_options)
    second_msg = second.choices[0].message
    has_content = (second_msg.content is not None
                   and len(second_msg.content) > 0)
    has_tool_calls = (second_msg.tool_calls is not None
                      and len(second_msg.tool_calls) > 0)
    assert has_content or has_tool_calls
MODEL_NAME
=
"openai-community/gpt2"
MODEL_NAME_SHORT
=
"gpt2"
CHAT_TEMPLATE
=
"Dummy chat template for testing {}"
BASE_MODEL_PATHS
=
[
BaseModelPath
(
name
=
MODEL_NAME
,
model_path
=
MODEL_NAME
)]
BASE_MODEL_PATHS
=
[
BaseModelPath
(
name
=
MODEL_NAME
,
model_path
=
MODEL_NAME
),
BaseModelPath
(
name
=
MODEL_NAME_SHORT
,
model_path
=
MODEL_NAME_SHORT
)
]
@
dataclass
...
...
@@ -75,6 +274,42 @@ def test_async_serving_chat_init():
assert
serving_completion
.
chat_template
==
CHAT_TEMPLATE
@pytest.mark.asyncio
async def test_serving_chat_returns_correct_model_name():
    """Check which model name is handed to the full chat generator.

    ``chat_completion_full_generator`` is replaced by a stub that returns its
    fourth positional argument, so ``create_chat_completion`` returns that
    value directly. It must equal ``MODEL_NAME`` whether the request names
    the short model, an empty string, or no model at all.
    """
    engine = MagicMock(spec=MQLLMEngineClient)
    engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
    engine.errored = False

    serving_models = OpenAIServingModels(engine_client=engine,
                                         base_model_paths=BASE_MODEL_PATHS,
                                         model_config=MockModelConfig())
    serving_chat = OpenAIServingChat(engine,
                                     MockModelConfig(),
                                     serving_models,
                                     response_role="assistant",
                                     chat_template=CHAT_TEMPLATE,
                                     chat_template_content_format="auto",
                                     request_logger=None)
    messages = [{"role": "user", "content": "what is 1+1?"}]

    async def return_model_name(*args):
        # Echo back the fourth positional argument the generator receives.
        return args[3]

    serving_chat.chat_completion_full_generator = return_model_name

    model_variants = (
        # Test that full name is returned when short name is requested
        {
            "model": MODEL_NAME_SHORT
        },
        # Test that full name is returned when empty string is specified
        {
            "model": ""
        },
        # Test that full name is returned when no model is specified
        {},
    )
    for model_kwargs in model_variants:
        req = ChatCompletionRequest(messages=messages, **model_kwargs)
        assert await serving_chat.create_chat_completion(req) == MODEL_NAME
@
pytest
.
mark
.
asyncio
async
def
test_serving_chat_should_set_correct_max_tokens
():
mock_engine
=
MagicMock
(
spec
=
MQLLMEngineClient
)
...
...
@@ -313,7 +548,7 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
}],
)
# By default cache_salt in the engine prompt is not set
# By default
,
cache_salt in the engine prompt is not set
with
suppress
(
Exception
):
await
serving_chat
.
create_chat_completion
(
req
)
assert
"cache_salt"
not
in
mock_engine
.
generate
.
call_args
.
args
[
0
]
...
...
tests/entrypoints/openai/test_skip_tokenizer.py
View file @
38d80967
...
...
@@ -11,7 +11,7 @@ import torch
from
...utils
import
RemoteOpenAIServer
MODEL_NAME
=
"
christian-pinto
/Prithvi-EO-2.0-300M-TL-
VLLM
"
MODEL_NAME
=
"
ibm-nasa-geospatial
/Prithvi-EO-2.0-300M-TL-
Sen1Floods11
"
DTYPE
=
"float16"
...
...
@@ -35,7 +35,9 @@ def server():
"--trust-remote-code"
,
"--skip-tokenizer-init"
,
"--max-num-seqs"
,
"32"
"32"
,
"--model-impl"
,
"terratorch"
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
...
...
tests/entrypoints/openai/test_tokenization.py
View file @
38d80967
...
...
@@ -8,8 +8,6 @@ import requests
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
...utils
import
RemoteOpenAIServer
from
.test_completion
import
zephyr_lora_added_tokens_files
# noqa: F401
from
.test_completion
import
zephyr_lora_files
# noqa: F401
# any model with a chat template should work here
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
...
...
Prev
1
…
6
7
8
9
10
11
12
13
14
…
28
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment