Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
38d80967
Commit
38d80967
authored
Sep 12, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.10.2rc2' into v0.10.2rc2-ori
parents
33650733
880c741b
Changes
544
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
353 additions
and
153 deletions
+353
-153
tests/entrypoints/conftest.py
tests/entrypoints/conftest.py
+29
-0
tests/entrypoints/llm/test_chat.py
tests/entrypoints/llm/test_chat.py
+3
-2
tests/entrypoints/offline_mode/test_offline_mode.py
tests/entrypoints/offline_mode/test_offline_mode.py
+2
-2
tests/entrypoints/openai/conftest.py
tests/entrypoints/openai/conftest.py
+27
-0
tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
.../openai/correctness/test_transcription_api_correctness.py
+1
-1
tests/entrypoints/openai/test_chat.py
tests/entrypoints/openai/test_chat.py
+1
-56
tests/entrypoints/openai/test_chat_template.py
tests/entrypoints/openai/test_chat_template.py
+3
-1
tests/entrypoints/openai/test_completion.py
tests/entrypoints/openai/test_completion.py
+0
-26
tests/entrypoints/openai/test_completion_with_prompt_embeds.py
.../entrypoints/openai/test_completion_with_prompt_embeds.py
+1
-26
tests/entrypoints/openai/test_encoder_decoder.py
tests/entrypoints/openai/test_encoder_decoder.py
+1
-0
tests/entrypoints/openai/test_lora_adapters.py
tests/entrypoints/openai/test_lora_adapters.py
+0
-8
tests/entrypoints/openai/test_metrics.py
tests/entrypoints/openai/test_metrics.py
+15
-7
tests/entrypoints/openai/test_models.py
tests/entrypoints/openai/test_models.py
+0
-8
tests/entrypoints/openai/test_prompt_validation.py
tests/entrypoints/openai/test_prompt_validation.py
+9
-5
tests/entrypoints/openai/test_response_api_with_harmony.py
tests/entrypoints/openai/test_response_api_with_harmony.py
+18
-1
tests/entrypoints/openai/test_return_token_ids.py
tests/entrypoints/openai/test_return_token_ids.py
+1
-1
tests/entrypoints/openai/test_return_tokens_as_ids.py
tests/entrypoints/openai/test_return_tokens_as_ids.py
+0
-2
tests/entrypoints/openai/test_serving_chat.py
tests/entrypoints/openai/test_serving_chat.py
+238
-3
tests/entrypoints/openai/test_skip_tokenizer.py
tests/entrypoints/openai/test_skip_tokenizer.py
+4
-2
tests/entrypoints/openai/test_tokenization.py
tests/entrypoints/openai/test_tokenization.py
+0
-2
No files found.
Too many changes to show.
To preserve performance only
544 of 544+
files are displayed.
Plain diff
Email patch
tests/entrypoints/conftest.py
View file @
38d80967
...
@@ -201,3 +201,32 @@ table: "table_1" | "table_2"
...
@@ -201,3 +201,32 @@ table: "table_1" | "table_2"
condition: column "=" number
condition: column "=" number
number: "1" | "2"
number: "1" | "2"
"""
)
"""
)
@
pytest
.
fixture
(
scope
=
"session"
)
def
zephyr_lora_files
():
"""Download zephyr LoRA files once per test session."""
from
huggingface_hub
import
snapshot_download
return
snapshot_download
(
repo_id
=
"typeof/zephyr-7b-beta-lora"
)
@
pytest
.
fixture
(
scope
=
"session"
)
def
zephyr_lora_added_tokens_files
(
zephyr_lora_files
):
"""Create zephyr LoRA files with added tokens once per test session."""
import
shutil
from
tempfile
import
TemporaryDirectory
from
transformers
import
AutoTokenizer
tmp_dir
=
TemporaryDirectory
()
tmp_model_dir
=
f
"
{
tmp_dir
.
name
}
/zephyr"
shutil
.
copytree
(
zephyr_lora_files
,
tmp_model_dir
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
"HuggingFaceH4/zephyr-7b-beta"
)
# Copy tokenizer to adapter and add some unique tokens
# 32000, 32001, 32002
added
=
tokenizer
.
add_tokens
([
"vllm1"
,
"vllm2"
,
"vllm3"
],
special_tokens
=
True
)
assert
added
==
3
tokenizer
.
save_pretrained
(
tmp_model_dir
)
yield
tmp_model_dir
tmp_dir
.
cleanup
()
tests/entrypoints/llm/test_chat.py
View file @
38d80967
...
@@ -7,7 +7,7 @@ import pytest
...
@@ -7,7 +7,7 @@ import pytest
from
vllm
import
LLM
from
vllm
import
LLM
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
..openai.test_vision
import
TEST_IMAGE_
URL
S
from
..openai.test_vision
import
TEST_IMAGE_
ASSET
S
@
pytest
.
fixture
(
scope
=
"function"
)
@
pytest
.
fixture
(
scope
=
"function"
)
...
@@ -95,7 +95,8 @@ def vision_llm():
...
@@ -95,7 +95,8 @@ def vision_llm():
@
pytest
.
mark
.
parametrize
(
"image_urls"
,
@
pytest
.
mark
.
parametrize
(
"image_urls"
,
[[
TEST_IMAGE_URLS
[
0
],
TEST_IMAGE_URLS
[
1
]]])
[[
TEST_IMAGE_ASSETS
[
0
],
TEST_IMAGE_ASSETS
[
1
]]],
indirect
=
True
)
def
test_chat_multi_image
(
vision_llm
,
image_urls
:
list
[
str
]):
def
test_chat_multi_image
(
vision_llm
,
image_urls
:
list
[
str
]):
messages
=
[{
messages
=
[{
"role"
:
"role"
:
...
...
tests/entrypoints/offline_mode/test_offline_mode.py
View file @
38d80967
...
@@ -79,7 +79,7 @@ def test_offline_mode(monkeypatch: pytest.MonkeyPatch):
...
@@ -79,7 +79,7 @@ def test_offline_mode(monkeypatch: pytest.MonkeyPatch):
)
)
# Need to re-import huggingface_hub
# Need to re-import huggingface_hub
# and friends to setup offline mode
# and friends to set
up offline mode
_re_import_modules
()
_re_import_modules
()
# Cached model files should be used in offline mode
# Cached model files should be used in offline mode
for
model_config
in
MODEL_CONFIGS
:
for
model_config
in
MODEL_CONFIGS
:
...
@@ -136,7 +136,7 @@ def test_model_from_huggingface_offline(monkeypatch: pytest.MonkeyPatch):
...
@@ -136,7 +136,7 @@ def test_model_from_huggingface_offline(monkeypatch: pytest.MonkeyPatch):
disable_connect
,
disable_connect
,
)
)
# Need to re-import huggingface_hub
# Need to re-import huggingface_hub
# and friends to setup offline mode
# and friends to set
up offline mode
_re_import_modules
()
_re_import_modules
()
engine_args
=
EngineArgs
(
model
=
"facebook/opt-125m"
)
engine_args
=
EngineArgs
(
model
=
"facebook/opt-125m"
)
LLM
(
**
dataclasses
.
asdict
(
engine_args
))
LLM
(
**
dataclasses
.
asdict
(
engine_args
))
...
...
tests/entrypoints/openai/conftest.py
0 → 100644
View file @
38d80967
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
from
vllm.assets.audio
import
AudioAsset
@
pytest
.
fixture
def
mary_had_lamb
():
path
=
AudioAsset
(
'mary_had_lamb'
).
get_local_path
()
with
open
(
str
(
path
),
"rb"
)
as
f
:
yield
f
@
pytest
.
fixture
def
winning_call
():
path
=
AudioAsset
(
'winning_call'
).
get_local_path
()
with
open
(
str
(
path
),
"rb"
)
as
f
:
yield
f
@
pytest
.
fixture
def
foscolo
():
# Test translation it->en
path
=
AudioAsset
(
'azacinto_foscolo'
).
get_local_path
()
with
open
(
str
(
path
),
"rb"
)
as
f
:
yield
f
tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
View file @
38d80967
...
@@ -32,7 +32,7 @@ def to_bytes(y, sr):
...
@@ -32,7 +32,7 @@ def to_bytes(y, sr):
async
def
transcribe_audio
(
client
,
tokenizer
,
y
,
sr
):
async
def
transcribe_audio
(
client
,
tokenizer
,
y
,
sr
):
# Send loaded audio directly instead of loading from disk,
# Send loaded audio directly instead of loading from disk,
# dont account for that time though
# don
'
t account for that time though
with
to_bytes
(
y
,
sr
)
as
f
:
with
to_bytes
(
y
,
sr
)
as
f
:
start_time
=
time
.
perf_counter
()
start_time
=
time
.
perf_counter
()
transcription
=
await
client
.
audio
.
transcriptions
.
create
(
transcription
=
await
client
.
audio
.
transcriptions
.
create
(
...
...
tests/entrypoints/openai/test_chat.py
View file @
38d80967
...
@@ -12,11 +12,9 @@ import pytest_asyncio
...
@@ -12,11 +12,9 @@ import pytest_asyncio
import
regex
as
re
import
regex
as
re
import
requests
import
requests
import
torch
import
torch
from
openai
import
BadRequestError
,
OpenAI
from
openai
import
BadRequestError
from
...utils
import
RemoteOpenAIServer
from
...utils
import
RemoteOpenAIServer
from
.test_completion
import
zephyr_lora_added_tokens_files
# noqa: F401
from
.test_completion
import
zephyr_lora_files
# noqa: F401
# any model with a chat template should work here
# any model with a chat template should work here
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
...
@@ -970,59 +968,6 @@ async def test_long_seed(client: openai.AsyncOpenAI):
...
@@ -970,59 +968,6 @@ async def test_long_seed(client: openai.AsyncOpenAI):
or
"less_than_equal"
in
exc_info
.
value
.
message
)
or
"less_than_equal"
in
exc_info
.
value
.
message
)
@
pytest
.
mark
.
asyncio
async
def
test_http_chat_no_model_name_with_curl
(
server
:
RemoteOpenAIServer
):
url
=
f
"http://localhost:
{
server
.
port
}
/v1/chat/completions"
headers
=
{
"Content-Type"
:
"application/json"
,
}
data
=
{
# model_name is avoided here.
"messages"
:
[{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant."
},
{
"role"
:
"user"
,
"content"
:
"what is 1+1?"
}],
"max_tokens"
:
5
}
response
=
requests
.
post
(
url
,
headers
=
headers
,
json
=
data
)
response_data
=
response
.
json
()
print
(
response_data
)
assert
response_data
.
get
(
"model"
)
==
MODEL_NAME
choice
=
response_data
.
get
(
"choices"
)[
0
]
message
=
choice
.
get
(
"message"
)
assert
message
is
not
None
content
=
message
.
get
(
"content"
)
assert
content
is
not
None
assert
len
(
content
)
>
0
@
pytest
.
mark
.
asyncio
async
def
test_http_chat_no_model_name_with_openai
(
server
:
RemoteOpenAIServer
):
openai_api_key
=
"EMPTY"
openai_api_base
=
f
"http://localhost:
{
server
.
port
}
/v1"
client
=
OpenAI
(
api_key
=
openai_api_key
,
base_url
=
openai_api_base
,
)
messages
=
[
{
"role"
:
"user"
,
"content"
:
"Hello, vLLM!"
},
]
response
=
client
.
chat
.
completions
.
create
(
model
=
""
,
# empty string
messages
=
messages
,
)
assert
response
.
model
==
MODEL_NAME
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_invocations
(
server
:
RemoteOpenAIServer
,
async
def
test_invocations
(
server
:
RemoteOpenAIServer
,
client
:
openai
.
AsyncOpenAI
):
client
:
openai
.
AsyncOpenAI
):
...
...
tests/entrypoints/openai/test_chat_template.py
View file @
38d80967
...
@@ -104,7 +104,9 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
...
@@ -104,7 +104,9 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
trust_remote_code
=
model_info
.
trust_remote_code
,
trust_remote_code
=
model_info
.
trust_remote_code
,
revision
=
model_info
.
revision
,
revision
=
model_info
.
revision
,
hf_overrides
=
model_info
.
hf_overrides
,
hf_overrides
=
model_info
.
hf_overrides
,
)
skip_tokenizer_init
=
model_info
.
skip_tokenizer_init
,
enforce_eager
=
model_info
.
enforce_eager
,
dtype
=
model_info
.
dtype
)
# Initialize the tokenizer
# Initialize the tokenizer
tokenizer
=
get_tokenizer
(
tokenizer
=
get_tokenizer
(
...
...
tests/entrypoints/openai/test_completion.py
View file @
38d80967
...
@@ -3,8 +3,6 @@
...
@@ -3,8 +3,6 @@
# imports for guided decoding tests
# imports for guided decoding tests
import
json
import
json
import
os
import
os
import
shutil
from
tempfile
import
TemporaryDirectory
from
typing
import
Optional
from
typing
import
Optional
import
jsonschema
import
jsonschema
...
@@ -14,9 +12,7 @@ import pytest_asyncio
...
@@ -14,9 +12,7 @@ import pytest_asyncio
import
regex
as
re
import
regex
as
re
import
requests
import
requests
# downloading lora to test lora requests
# downloading lora to test lora requests
from
huggingface_hub
import
snapshot_download
from
openai
import
BadRequestError
from
openai
import
BadRequestError
from
transformers
import
AutoTokenizer
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
...
@@ -26,32 +22,10 @@ from ...utils import RemoteOpenAIServer
...
@@ -26,32 +22,10 @@ from ...utils import RemoteOpenAIServer
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
# technically these adapters use a different base model,
# technically these adapters use a different base model,
# but we're not testing generation quality here
# but we're not testing generation quality here
LORA_NAME
=
"typeof/zephyr-7b-beta-lora"
GUIDED_DECODING_BACKENDS
=
[
"outlines"
,
"xgrammar"
,
"guidance"
]
GUIDED_DECODING_BACKENDS
=
[
"outlines"
,
"xgrammar"
,
"guidance"
]
@
pytest
.
fixture
(
scope
=
"module"
)
def
zephyr_lora_files
():
return
snapshot_download
(
repo_id
=
LORA_NAME
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
zephyr_lora_added_tokens_files
(
zephyr_lora_files
):
tmp_dir
=
TemporaryDirectory
()
tmp_model_dir
=
f
"
{
tmp_dir
.
name
}
/zephyr"
shutil
.
copytree
(
zephyr_lora_files
,
tmp_model_dir
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
MODEL_NAME
)
# Copy tokenizer to adapter and add some unique tokens
# 32000, 32001, 32002
added
=
tokenizer
.
add_tokens
([
"vllm1"
,
"vllm2"
,
"vllm3"
],
special_tokens
=
True
)
assert
added
==
3
tokenizer
.
save_pretrained
(
tmp_model_dir
)
yield
tmp_model_dir
tmp_dir
.
cleanup
()
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
default_server_args
(
zephyr_lora_files
,
zephyr_lora_added_tokens_files
):
def
default_server_args
(
zephyr_lora_files
,
zephyr_lora_added_tokens_files
):
return
[
return
[
...
...
tests/entrypoints/openai/test_completion_with_prompt_embeds.py
View file @
38d80967
...
@@ -3,48 +3,23 @@
...
@@ -3,48 +3,23 @@
import
base64
import
base64
import
io
import
io
import
shutil
from
tempfile
import
TemporaryDirectory
import
openai
# use the official client for correctness check
import
openai
# use the official client for correctness check
import
pytest
import
pytest
import
pytest_asyncio
import
pytest_asyncio
import
torch
import
torch
# downloading lora to test lora requests
# downloading lora to test lora requests
from
huggingface_hub
import
snapshot_download
from
openai
import
BadRequestError
from
openai
import
BadRequestError
from
transformers
import
AutoConfig
,
AutoTokenizer
from
transformers
import
AutoConfig
from
...utils
import
RemoteOpenAIServer
from
...utils
import
RemoteOpenAIServer
# any model with a chat template should work here
# any model with a chat template should work here
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
LORA_NAME
=
"typeof/zephyr-7b-beta-lora"
CONFIG
=
AutoConfig
.
from_pretrained
(
MODEL_NAME
)
CONFIG
=
AutoConfig
.
from_pretrained
(
MODEL_NAME
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
zephyr_lora_files
():
return
snapshot_download
(
repo_id
=
LORA_NAME
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
zephyr_lora_added_tokens_files
(
zephyr_lora_files
):
tmp_dir
=
TemporaryDirectory
()
tmp_model_dir
=
f
"
{
tmp_dir
.
name
}
/zephyr"
shutil
.
copytree
(
zephyr_lora_files
,
tmp_model_dir
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
MODEL_NAME
)
# Copy tokenizer to adapter and add some unique tokens
# 32000, 32001, 32002
added
=
tokenizer
.
add_tokens
([
"vllm1"
,
"vllm2"
,
"vllm3"
],
special_tokens
=
True
)
assert
added
==
3
tokenizer
.
save_pretrained
(
tmp_model_dir
)
yield
tmp_model_dir
tmp_dir
.
cleanup
()
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
default_server_args
(
def
default_server_args
(
zephyr_lora_files
,
zephyr_lora_files
,
...
...
tests/entrypoints/openai/test_encoder_decoder.py
View file @
38d80967
...
@@ -30,6 +30,7 @@ async def client(server):
...
@@ -30,6 +30,7 @@ async def client(server):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
skip
(
reason
=
"bart is not yet supported in V1"
)
async
def
test_single_completion
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
async
def
test_single_completion
(
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
completion
=
await
client
.
completions
.
create
(
model
=
model_name
,
completion
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
"Hello, my name is"
,
prompt
=
"Hello, my name is"
,
...
...
tests/entrypoints/openai/test_lora_adapters.py
View file @
38d80967
...
@@ -9,8 +9,6 @@ from contextlib import suppress
...
@@ -9,8 +9,6 @@ from contextlib import suppress
import
openai
# use the official client for correctness check
import
openai
# use the official client for correctness check
import
pytest
import
pytest
import
pytest_asyncio
import
pytest_asyncio
# downloading lora to test lora requests
from
huggingface_hub
import
snapshot_download
from
...utils
import
RemoteOpenAIServer
from
...utils
import
RemoteOpenAIServer
...
@@ -18,7 +16,6 @@ from ...utils import RemoteOpenAIServer
...
@@ -18,7 +16,6 @@ from ...utils import RemoteOpenAIServer
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
# technically this needs Mistral-7B-v0.1 as base, but we're not testing
# technically this needs Mistral-7B-v0.1 as base, but we're not testing
# generation quality here
# generation quality here
LORA_NAME
=
"typeof/zephyr-7b-beta-lora"
BADREQUEST_CASES
=
[
BADREQUEST_CASES
=
[
(
(
...
@@ -48,11 +45,6 @@ BADREQUEST_CASES = [
...
@@ -48,11 +45,6 @@ BADREQUEST_CASES = [
]
]
@
pytest
.
fixture
(
scope
=
"module"
)
def
zephyr_lora_files
():
return
snapshot_download
(
repo_id
=
LORA_NAME
)
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
monkeypatch_module
():
def
monkeypatch_module
():
from
_pytest.monkeypatch
import
MonkeyPatch
from
_pytest.monkeypatch
import
MonkeyPatch
...
...
tests/entrypoints/openai/test_metrics.py
View file @
38d80967
...
@@ -250,12 +250,15 @@ EXPECTED_METRICS_V1 = [
...
@@ -250,12 +250,15 @@ EXPECTED_METRICS_V1 = [
"vllm:request_params_max_tokens_sum"
,
"vllm:request_params_max_tokens_sum"
,
"vllm:request_params_max_tokens_bucket"
,
"vllm:request_params_max_tokens_bucket"
,
"vllm:request_params_max_tokens_count"
,
"vllm:request_params_max_tokens_count"
,
"vllm:time_to_first_token_seconds_sum"
,
"vllm:time_to_first_token_seconds_bucket"
,
"vllm:time_to_first_token_seconds_count"
,
"vllm:time_per_output_token_seconds_sum"
,
"vllm:time_per_output_token_seconds_sum"
,
"vllm:time_per_output_token_seconds_bucket"
,
"vllm:time_per_output_token_seconds_bucket"
,
"vllm:time_per_output_token_seconds_count"
,
"vllm:time_per_output_token_seconds_count"
,
"vllm:time_to_first_token_seconds_sum"
,
"vllm:time_to_first_token_seconds_bucket"
,
"vllm:time_to_first_token_seconds_count"
,
"vllm:inter_token_latency_seconds_sum"
,
"vllm:inter_token_latency_seconds_bucket"
,
"vllm:inter_token_latency_seconds_count"
,
"vllm:e2e_request_latency_seconds_sum"
,
"vllm:e2e_request_latency_seconds_sum"
,
"vllm:e2e_request_latency_seconds_bucket"
,
"vllm:e2e_request_latency_seconds_bucket"
,
"vllm:e2e_request_latency_seconds_count"
,
"vllm:e2e_request_latency_seconds_count"
,
...
@@ -273,7 +276,11 @@ EXPECTED_METRICS_V1 = [
...
@@ -273,7 +276,11 @@ EXPECTED_METRICS_V1 = [
"vllm:request_decode_time_seconds_count"
,
"vllm:request_decode_time_seconds_count"
,
]
]
HIDDEN_DEPRECATED_METRICS
:
list
[
str
]
=
[]
HIDDEN_DEPRECATED_METRICS
:
list
[
str
]
=
[
"vllm:time_per_output_token_seconds_sum"
,
"vllm:time_per_output_token_seconds_bucket"
,
"vllm:time_per_output_token_seconds_count"
,
]
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
@@ -289,9 +296,10 @@ async def test_metrics_exist(server: RemoteOpenAIServer,
...
@@ -289,9 +296,10 @@ async def test_metrics_exist(server: RemoteOpenAIServer,
assert
response
.
status_code
==
HTTPStatus
.
OK
assert
response
.
status_code
==
HTTPStatus
.
OK
for
metric
in
(
EXPECTED_METRICS_V1
if
use_v1
else
EXPECTED_METRICS
):
for
metric
in
(
EXPECTED_METRICS_V1
if
use_v1
else
EXPECTED_METRICS
):
if
(
not
server
.
show_hidden_metrics
if
(
metric
in
HIDDEN_DEPRECATED_METRICS
and
metric
not
in
HIDDEN_DEPRECATED_METRICS
):
and
not
server
.
show_hidden_metrics
):
assert
metric
in
response
.
text
continue
assert
metric
in
response
.
text
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
...
tests/entrypoints/openai/test_models.py
View file @
38d80967
...
@@ -4,8 +4,6 @@
...
@@ -4,8 +4,6 @@
import
openai
# use the official client for correctness check
import
openai
# use the official client for correctness check
import
pytest
import
pytest
import
pytest_asyncio
import
pytest_asyncio
# downloading lora to test lora requests
from
huggingface_hub
import
snapshot_download
from
...utils
import
RemoteOpenAIServer
from
...utils
import
RemoteOpenAIServer
...
@@ -13,12 +11,6 @@ from ...utils import RemoteOpenAIServer
...
@@ -13,12 +11,6 @@ from ...utils import RemoteOpenAIServer
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
# technically this needs Mistral-7B-v0.1 as base, but we're not testing
# technically this needs Mistral-7B-v0.1 as base, but we're not testing
# generation quality here
# generation quality here
LORA_NAME
=
"typeof/zephyr-7b-beta-lora"
@
pytest
.
fixture
(
scope
=
"module"
)
def
zephyr_lora_files
():
return
snapshot_download
(
repo_id
=
LORA_NAME
)
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
...
...
tests/entrypoints/openai/test_prompt_validation.py
View file @
38d80967
...
@@ -10,7 +10,7 @@ import pytest
...
@@ -10,7 +10,7 @@ import pytest
import
regex
as
re
import
regex
as
re
import
torch
import
torch
from
vllm.entrypoints.
openai.serving_engine
import
OpenAIServing
from
vllm.entrypoints.
renderer
import
BaseRenderer
from
...utils
import
RemoteOpenAIServer
from
...utils
import
RemoteOpenAIServer
...
@@ -27,12 +27,16 @@ async def test_empty_prompt():
...
@@ -27,12 +27,16 @@ async def test_empty_prompt():
with
RemoteOpenAIServer
(
model_name
,
server_args
)
as
remote_server
:
with
RemoteOpenAIServer
(
model_name
,
server_args
)
as
remote_server
:
client
=
remote_server
.
get_async_client
()
client
=
remote_server
.
get_async_client
()
with
pytest
.
raises
(
openai
.
BadRequestError
,
with
pytest
.
raises
(
match
=
"decoder prompt cannot be empty"
):
openai
.
BadRequestError
,
match
=
"Either prompt or prompt_embeds must be provided and non-empty."
):
await
client
.
completions
.
create
(
model
=
model_name
,
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
""
,
prompt
=
""
,
max_tokens
=
5
,
max_tokens
=
5
,
temperature
=
0.0
)
temperature
=
0.0
,
extra_body
=
{
"prompt_embeds"
:
[]})
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
@@ -83,7 +87,7 @@ def test_load_prompt_embeds(dtype: torch.dtype, layout: torch.layout,
...
@@ -83,7 +87,7 @@ def test_load_prompt_embeds(dtype: torch.dtype, layout: torch.layout,
buffer
.
seek
(
0
)
buffer
.
seek
(
0
)
encoded_tensor
=
pybase64
.
b64encode
(
buffer
.
getvalue
())
encoded_tensor
=
pybase64
.
b64encode
(
buffer
.
getvalue
())
loaded_prompt_embeds
=
OpenAIServing
.
_
load_prompt_embeds
(
encoded_tensor
)
loaded_prompt_embeds
=
BaseRenderer
.
load_prompt_embeds
(
encoded_tensor
)
assert
len
(
loaded_prompt_embeds
)
==
1
assert
len
(
loaded_prompt_embeds
)
==
1
loaded_tensor
=
loaded_prompt_embeds
[
0
][
"prompt_embeds"
]
loaded_tensor
=
loaded_prompt_embeds
[
0
][
"prompt_embeds"
]
assert
loaded_tensor
.
device
.
type
==
"cpu"
assert
loaded_tensor
.
device
.
type
==
"cpu"
...
...
tests/entrypoints/openai/test_response_api_with_harmony.py
View file @
38d80967
...
@@ -275,7 +275,8 @@ async def test_stateful_multi_turn(client: OpenAI, model_name: str):
...
@@ -275,7 +275,8 @@ async def test_stateful_multi_turn(client: OpenAI, model_name: str):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
async
def
test_streaming
(
client
:
OpenAI
,
model_name
:
str
):
@
pytest
.
mark
.
parametrize
(
"background"
,
[
True
,
False
])
async
def
test_streaming
(
client
:
OpenAI
,
model_name
:
str
,
background
:
bool
):
# TODO: Add back when web search and code interpreter are available in CI
# TODO: Add back when web search and code interpreter are available in CI
prompts
=
[
prompts
=
[
"tell me a story about a cat in 20 words"
,
"tell me a story about a cat in 20 words"
,
...
@@ -300,11 +301,16 @@ async def test_streaming(client: OpenAI, model_name: str):
...
@@ -300,11 +301,16 @@ async def test_streaming(client: OpenAI, model_name: str):
# },
# },
],
],
stream
=
True
,
stream
=
True
,
background
=
background
,
)
)
events
=
[]
events
=
[]
current_event_mode
=
None
current_event_mode
=
None
resp_id
=
None
async
for
event
in
response
:
async
for
event
in
response
:
if
event
.
type
==
"response.created"
:
resp_id
=
event
.
response
.
id
if
current_event_mode
!=
event
.
type
:
if
current_event_mode
!=
event
.
type
:
current_event_mode
=
event
.
type
current_event_mode
=
event
.
type
print
(
f
"
\n
[
{
event
.
type
}
] "
,
end
=
""
,
flush
=
True
)
print
(
f
"
\n
[
{
event
.
type
}
] "
,
end
=
""
,
flush
=
True
)
...
@@ -322,6 +328,17 @@ async def test_streaming(client: OpenAI, model_name: str):
...
@@ -322,6 +328,17 @@ async def test_streaming(client: OpenAI, model_name: str):
assert
len
(
events
)
>
0
assert
len
(
events
)
>
0
if
background
:
starting_after
=
5
async
with
await
client
.
responses
.
retrieve
(
response_id
=
resp_id
,
stream
=
True
,
starting_after
=
starting_after
)
as
stream
:
counter
=
starting_after
async
for
event
in
stream
:
counter
+=
1
assert
event
==
events
[
counter
]
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
...
...
tests/entrypoints/openai/test_return_token_ids.py
View file @
38d80967
...
@@ -224,7 +224,7 @@ async def test_comparison_with_prompt_logprobs_and_logprobs(server):
...
@@ -224,7 +224,7 @@ async def test_comparison_with_prompt_logprobs_and_logprobs(server):
logprobs_token_ids
.
append
(
token_id
)
logprobs_token_ids
.
append
(
token_id
)
# When echo=True, the logprobs include both prompt and response tokens
# When echo=True, the logprobs include both prompt and response tokens
# The token_ids field should match the
the
suffix of response portion
# The token_ids field should match the suffix of response portion
# The prompt_token_ids should match the prompt portion
# The prompt_token_ids should match the prompt portion
assert
len
(
completion
.
choices
[
0
].
token_ids
)
<
len
(
logprobs_token_ids
)
assert
len
(
completion
.
choices
[
0
].
token_ids
)
<
len
(
logprobs_token_ids
)
response_token_ids_length
=
len
(
completion
.
choices
[
0
].
token_ids
)
response_token_ids_length
=
len
(
completion
.
choices
[
0
].
token_ids
)
...
...
tests/entrypoints/openai/test_return_tokens_as_ids.py
View file @
38d80967
...
@@ -11,8 +11,6 @@ from vllm.transformers_utils.tokenizer import get_tokenizer
...
@@ -11,8 +11,6 @@ from vllm.transformers_utils.tokenizer import get_tokenizer
from
...utils
import
RemoteOpenAIServer
from
...utils
import
RemoteOpenAIServer
from
.test_completion
import
default_server_args
# noqa: F401
from
.test_completion
import
default_server_args
# noqa: F401
from
.test_completion
import
zephyr_lora_added_tokens_files
# noqa: F401
from
.test_completion
import
zephyr_lora_files
# noqa: F401
from
.test_completion
import
MODEL_NAME
from
.test_completion
import
MODEL_NAME
...
...
tests/entrypoints/openai/test_serving_chat.py
View file @
38d80967
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
__future__
import
annotations
import
asyncio
import
asyncio
from
contextlib
import
suppress
from
contextlib
import
suppress
from
dataclasses
import
dataclass
,
field
from
dataclasses
import
dataclass
,
field
from
typing
import
Any
,
Optional
from
typing
import
TYPE_CHECKING
,
Any
,
Optional
from
unittest.mock
import
MagicMock
from
unittest.mock
import
MagicMock
import
pytest
import
pytest
import
pytest_asyncio
from
vllm.config
import
MultiModalConfig
from
vllm.config
import
MultiModalConfig
from
vllm.engine.multiprocessing.client
import
MQLLMEngineClient
from
vllm.engine.multiprocessing.client
import
MQLLMEngineClient
...
@@ -17,9 +20,205 @@ from vllm.entrypoints.openai.serving_models import (BaseModelPath,
...
@@ -17,9 +20,205 @@ from vllm.entrypoints.openai.serving_models import (BaseModelPath,
OpenAIServingModels
)
OpenAIServingModels
)
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
...utils
import
RemoteOpenAIServer
if
TYPE_CHECKING
:
from
openai
import
OpenAI
GPT_OSS_MODEL_NAME
=
"openai/gpt-oss-20b"
@
pytest
.
fixture
(
scope
=
"module"
)
def
monkeypatch_module
():
from
_pytest.monkeypatch
import
MonkeyPatch
mpatch
=
MonkeyPatch
()
yield
mpatch
mpatch
.
undo
()
@
pytest
.
fixture
(
scope
=
"module"
,
params
=
[
True
,
False
],
ids
=
[
"with_tool_parser"
,
"without_tool_parser"
])
def
with_tool_parser
(
request
)
->
bool
:
return
request
.
param
@
pytest
.
fixture
(
scope
=
"module"
)
def
default_server_args
(
with_tool_parser
:
bool
):
args
=
[
# use half precision for speed and memory savings in CI environment
"--enforce-eager"
,
"--max-model-len"
,
"4096"
,
"--reasoning-parser"
,
"openai_gptoss"
,
"--gpu-memory-utilization"
,
"0.8"
,
]
if
with_tool_parser
:
args
.
extend
([
"--tool-call-parser"
,
"openai"
,
"--enable-auto-tool-choice"
,
])
return
args
@
pytest
.
fixture
(
scope
=
"module"
)
def
gptoss_server
(
monkeypatch_module
:
pytest
.
MonkeyPatch
,
default_server_args
:
list
[
str
]):
with
monkeypatch_module
.
context
()
as
m
:
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"TRITON_ATTN_VLLM_V1"
)
with
RemoteOpenAIServer
(
GPT_OSS_MODEL_NAME
,
default_server_args
)
as
remote_server
:
yield
remote_server
@
pytest_asyncio
.
fixture
async
def
gptoss_client
(
gptoss_server
):
async
with
gptoss_server
.
get_async_client
()
as
async_client
:
yield
async_client
@
pytest
.
mark
.
asyncio
async
def
test_gpt_oss_chat_tool_call_streaming
(
gptoss_client
:
OpenAI
,
with_tool_parser
:
bool
):
tools
=
[{
"type"
:
"function"
,
"function"
:
{
"name"
:
"get_current_weather"
,
"description"
:
"Get the current weather in a given location"
,
"parameters"
:
{
"type"
:
"object"
,
"properties"
:
{
"city"
:
{
"type"
:
"string"
},
"state"
:
{
"type"
:
"string"
},
"unit"
:
{
"type"
:
"string"
,
"enum"
:
[
"celsius"
,
"fahrenheit"
],
},
},
"required"
:
[
"city"
,
"state"
,
"unit"
],
},
},
}]
messages
=
[
{
"role"
:
"user"
,
"content"
:
"What is the weather in Dallas, TX?"
},
]
stream
=
await
gptoss_client
.
chat
.
completions
.
create
(
model
=
GPT_OSS_MODEL_NAME
,
messages
=
messages
,
tools
=
tools
if
with_tool_parser
else
None
,
stream
=
True
)
name
=
None
args_buf
=
""
content_buf
=
""
async
for
chunk
in
stream
:
delta
=
chunk
.
choices
[
0
].
delta
if
delta
.
tool_calls
:
tc
=
delta
.
tool_calls
[
0
]
if
tc
.
function
and
tc
.
function
.
name
:
name
=
tc
.
function
.
name
if
tc
.
function
and
tc
.
function
.
arguments
:
args_buf
+=
tc
.
function
.
arguments
if
getattr
(
delta
,
"content"
,
None
):
content_buf
+=
delta
.
content
if
with_tool_parser
:
assert
name
is
not
None
assert
len
(
args_buf
)
>
0
else
:
assert
name
is
None
assert
len
(
args_buf
)
==
0
assert
len
(
content_buf
)
>
0
@pytest.mark.asyncio
async def test_gpt_oss_multi_turn_chat(gptoss_client: OpenAI,
                                       with_tool_parser: bool):
    """Run a two-turn chat with tools enabled.

    The first turn must produce a ``get_current_weather`` tool call with
    non-empty arguments. Those arguments are appended to the conversation
    as assistant content, followed by a new user message. The second turn
    must return either non-empty content or at least one tool call. The
    test is skipped when ``with_tool_parser`` is false.
    """
    if not with_tool_parser:
        pytest.skip("skip non-tool for multi-turn tests")

    weather_parameters = {
        "type": "object",
        "properties": {
            "city": {
                "type": "string"
            },
            "state": {
                "type": "string"
            },
            "unit": {
                "type": "string",
                "enum": ["celsius", "fahrenheit"],
            },
        },
        "required": ["city", "state", "unit"],
    }
    tools = [{
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": weather_parameters,
        },
    }]
    conversation = [
        {
            "role": "system",
            "content": "you are a helpful assistant"
        },
        {
            "role": "user",
            "content": "What is the weather in Dallas, TX with celsius?"
        },
    ]

    # Both turns share every request option except the message history.
    request_options = {
        "model": GPT_OSS_MODEL_NAME,
        "tools": tools,
        "temperature": 0.0,
    }

    # --- Turn 1: expect a tool call for get_current_weather. ---
    first = await gptoss_client.chat.completions.create(
        messages=conversation, **request_options)
    first_reply = first.choices[0].message
    first_calls = first_reply.tool_calls
    assert first_calls is not None and len(first_calls) > 0

    function = first_calls[0].function
    assert function is not None and function.name == "get_current_weather"
    call_arguments = function.arguments
    assert call_arguments is not None and len(call_arguments) > 0

    # --- Turn 2: replay the arguments as assistant text, then follow up. ---
    conversation.extend([
        {
            "role": "assistant",
            "content": call_arguments
        },
        {
            "role": "user",
            "content": "Now convert to celsius and return JSON only"
        },
    ])
    second = await gptoss_client.chat.completions.create(
        messages=conversation, **request_options)
    second_reply = second.choices[0].message

    has_content = (second_reply.content is not None
                   and len(second_reply.content) > 0)
    has_tool_calls = (second_reply.tool_calls is not None
                      and len(second_reply.tool_calls) > 0)
    assert has_content or has_tool_calls
MODEL_NAME
=
"openai-community/gpt2"
MODEL_NAME
=
"openai-community/gpt2"
MODEL_NAME_SHORT
=
"gpt2"
CHAT_TEMPLATE
=
"Dummy chat template for testing {}"
CHAT_TEMPLATE
=
"Dummy chat template for testing {}"
BASE_MODEL_PATHS
=
[
BaseModelPath
(
name
=
MODEL_NAME
,
model_path
=
MODEL_NAME
)]
BASE_MODEL_PATHS
=
[
BaseModelPath
(
name
=
MODEL_NAME
,
model_path
=
MODEL_NAME
),
BaseModelPath
(
name
=
MODEL_NAME_SHORT
,
model_path
=
MODEL_NAME_SHORT
)
]
@
dataclass
@
dataclass
...
@@ -75,6 +274,42 @@ def test_async_serving_chat_init():
...
@@ -75,6 +274,42 @@ def test_async_serving_chat_init():
assert
serving_completion
.
chat_template
==
CHAT_TEMPLATE
assert
serving_completion
.
chat_template
==
CHAT_TEMPLATE
@pytest.mark.asyncio
async def test_serving_chat_returns_correct_model_name():
    """Check that chat completion is always given the full model name.

    ``chat_completion_full_generator`` is replaced by a stub that returns
    its fourth positional argument, which this test expects to equal
    ``MODEL_NAME`` whether the request names the short model, passes an
    empty model string, or omits the model entirely.
    """
    engine = MagicMock(spec=MQLLMEngineClient)
    engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
    engine.errored = False

    serving_models = OpenAIServingModels(engine_client=engine,
                                         base_model_paths=BASE_MODEL_PATHS,
                                         model_config=MockModelConfig())
    serving_chat = OpenAIServingChat(engine,
                                     MockModelConfig(),
                                     serving_models,
                                     response_role="assistant",
                                     chat_template=CHAT_TEMPLATE,
                                     chat_template_content_format="auto",
                                     request_logger=None)
    messages = [{"role": "user", "content": "what is 1+1?"}]

    async def echo_fourth_argument(*call_args):
        # Index 3 is the value the assertions below compare to MODEL_NAME.
        return call_args[3]

    serving_chat.chat_completion_full_generator = echo_fourth_argument

    model_kwargs_cases = (
        # Full name is returned when the short name is requested.
        {"model": MODEL_NAME_SHORT},
        # Full name is returned when an empty string is specified.
        {"model": ""},
        # Full name is returned when no model is specified.
        {},
    )
    for model_kwargs in model_kwargs_cases:
        req = ChatCompletionRequest(messages=messages, **model_kwargs)
        assert await serving_chat.create_chat_completion(req) == MODEL_NAME
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_serving_chat_should_set_correct_max_tokens
():
async
def
test_serving_chat_should_set_correct_max_tokens
():
mock_engine
=
MagicMock
(
spec
=
MQLLMEngineClient
)
mock_engine
=
MagicMock
(
spec
=
MQLLMEngineClient
)
...
@@ -313,7 +548,7 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
...
@@ -313,7 +548,7 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
}],
}],
)
)
# By default cache_salt in the engine prompt is not set
# By default
,
cache_salt in the engine prompt is not set
with
suppress
(
Exception
):
with
suppress
(
Exception
):
await
serving_chat
.
create_chat_completion
(
req
)
await
serving_chat
.
create_chat_completion
(
req
)
assert
"cache_salt"
not
in
mock_engine
.
generate
.
call_args
.
args
[
0
]
assert
"cache_salt"
not
in
mock_engine
.
generate
.
call_args
.
args
[
0
]
...
...
tests/entrypoints/openai/test_skip_tokenizer.py
View file @
38d80967
...
@@ -11,7 +11,7 @@ import torch
...
@@ -11,7 +11,7 @@ import torch
from
...utils
import
RemoteOpenAIServer
from
...utils
import
RemoteOpenAIServer
MODEL_NAME
=
"
christian-pinto
/Prithvi-EO-2.0-300M-TL-
VLLM
"
MODEL_NAME
=
"
ibm-nasa-geospatial
/Prithvi-EO-2.0-300M-TL-
Sen1Floods11
"
DTYPE
=
"float16"
DTYPE
=
"float16"
...
@@ -35,7 +35,9 @@ def server():
...
@@ -35,7 +35,9 @@ def server():
"--trust-remote-code"
,
"--trust-remote-code"
,
"--skip-tokenizer-init"
,
"--skip-tokenizer-init"
,
"--max-num-seqs"
,
"--max-num-seqs"
,
"32"
"32"
,
"--model-impl"
,
"terratorch"
]
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
...
...
tests/entrypoints/openai/test_tokenization.py
View file @
38d80967
...
@@ -8,8 +8,6 @@ import requests
...
@@ -8,8 +8,6 @@ import requests
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
...utils
import
RemoteOpenAIServer
from
...utils
import
RemoteOpenAIServer
from
.test_completion
import
zephyr_lora_added_tokens_files
# noqa: F401
from
.test_completion
import
zephyr_lora_files
# noqa: F401
# any model with a chat template should work here
# any model with a chat template should work here
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
...
...
Prev
1
…
6
7
8
9
10
11
12
13
14
…
28
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment