Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
cbd14ed5
Unverified
Commit
cbd14ed5
authored
Jul 11, 2025
by
Cyrus Leung
Committed by
GitHub
Jul 11, 2025
Browse files
[Bugfix] Refactor `/invocations` to be task-agnostic (#20764)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
7bd4c37a
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
352 additions
and
75 deletions
+352
-75
tests/entrypoints/openai/test_chat.py
tests/entrypoints/openai/test_chat.py
+33
-4
tests/entrypoints/openai/test_classification.py
tests/entrypoints/openai/test_classification.py
+22
-0
tests/entrypoints/openai/test_completion.py
tests/entrypoints/openai/test_completion.py
+25
-0
tests/entrypoints/openai/test_embedding.py
tests/entrypoints/openai/test_embedding.py
+60
-0
tests/entrypoints/openai/test_pooling.py
tests/entrypoints/openai/test_pooling.py
+91
-22
tests/entrypoints/openai/test_rerank.py
tests/entrypoints/openai/test_rerank.py
+27
-0
tests/entrypoints/openai/test_score.py
tests/entrypoints/openai/test_score.py
+25
-0
vllm/entrypoints/openai/api_server.py
vllm/entrypoints/openai/api_server.py
+45
-47
vllm/entrypoints/openai/protocol.py
vllm/entrypoints/openai/protocol.py
+24
-2
No files found.
tests/entrypoints/openai/test_chat.py
View file @
cbd14ed5
...
@@ -1113,10 +1113,7 @@ async def test_http_chat_no_model_name_with_curl(server: RemoteOpenAIServer):
...
@@ -1113,10 +1113,7 @@ async def test_http_chat_no_model_name_with_curl(server: RemoteOpenAIServer):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
,
""
])
async
def
test_http_chat_no_model_name_with_openai
(
server
:
RemoteOpenAIServer
):
async
def
test_http_chat_no_model_name_with_openai
(
server
:
RemoteOpenAIServer
,
model_name
:
str
):
openai_api_key
=
"EMPTY"
openai_api_key
=
"EMPTY"
openai_api_base
=
f
"http://localhost:
{
server
.
port
}
/v1"
openai_api_base
=
f
"http://localhost:
{
server
.
port
}
/v1"
...
@@ -1135,3 +1132,35 @@ async def test_http_chat_no_model_name_with_openai(server: RemoteOpenAIServer,
...
@@ -1135,3 +1132,35 @@ async def test_http_chat_no_model_name_with_openai(server: RemoteOpenAIServer,
messages
=
messages
,
messages
=
messages
,
)
)
assert
response
.
model
==
MODEL_NAME
assert
response
.
model
==
MODEL_NAME
@
pytest
.
mark
.
asyncio
async
def
test_invocations
(
server
:
RemoteOpenAIServer
,
client
:
openai
.
AsyncOpenAI
):
messages
=
[{
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
"what is 1+1?"
}]
request_args
=
{
"model"
:
MODEL_NAME
,
"messages"
:
messages
,
"max_completion_tokens"
:
5
,
"temperature"
:
0.0
,
"logprobs"
:
False
,
}
chat_completion
=
await
client
.
chat
.
completions
.
create
(
**
request_args
)
invocation_response
=
requests
.
post
(
server
.
url_for
(
"invocations"
),
json
=
request_args
)
invocation_response
.
raise_for_status
()
chat_output
=
chat_completion
.
model_dump
()
invocation_output
=
invocation_response
.
json
()
assert
chat_output
.
keys
()
==
invocation_output
.
keys
()
assert
chat_output
[
"choices"
]
==
invocation_output
[
"choices"
]
tests/entrypoints/openai/test_classification.py
View file @
cbd14ed5
...
@@ -155,3 +155,25 @@ def test_batch_classification_empty_list(server: RemoteOpenAIServer,
...
@@ -155,3 +155,25 @@ def test_batch_classification_empty_list(server: RemoteOpenAIServer,
assert
output
.
object
==
"list"
assert
output
.
object
==
"list"
assert
isinstance
(
output
.
data
,
list
)
assert
isinstance
(
output
.
data
,
list
)
assert
len
(
output
.
data
)
==
0
assert
len
(
output
.
data
)
==
0
@
pytest
.
mark
.
asyncio
async
def
test_invocations
(
server
:
RemoteOpenAIServer
):
request_args
=
{
"model"
:
MODEL_NAME
,
"input"
:
"This product was excellent and exceeded my expectations"
}
classification_response
=
requests
.
post
(
server
.
url_for
(
"classify"
),
json
=
request_args
)
classification_response
.
raise_for_status
()
invocation_response
=
requests
.
post
(
server
.
url_for
(
"invocations"
),
json
=
request_args
)
invocation_response
.
raise_for_status
()
classification_output
=
classification_response
.
json
()
invocation_output
=
invocation_response
.
json
()
assert
classification_output
.
keys
()
==
invocation_output
.
keys
()
assert
classification_output
[
"data"
]
==
invocation_output
[
"data"
]
tests/entrypoints/openai/test_completion.py
View file @
cbd14ed5
...
@@ -11,6 +11,7 @@ import openai # use the official client for correctness check
...
@@ -11,6 +11,7 @@ import openai # use the official client for correctness check
import
pytest
import
pytest
import
pytest_asyncio
import
pytest_asyncio
import
regex
as
re
import
regex
as
re
import
requests
# downloading lora to test lora requests
# downloading lora to test lora requests
from
huggingface_hub
import
snapshot_download
from
huggingface_hub
import
snapshot_download
from
openai
import
BadRequestError
from
openai
import
BadRequestError
...
@@ -833,3 +834,27 @@ async def test_echo_stream_completion(client: openai.AsyncOpenAI,
...
@@ -833,3 +834,27 @@ async def test_echo_stream_completion(client: openai.AsyncOpenAI,
assert
content
is
not
None
and
saying
in
content
assert
content
is
not
None
and
saying
in
content
else
:
else
:
assert
content
is
not
None
and
saying
not
in
content
assert
content
is
not
None
and
saying
not
in
content
@
pytest
.
mark
.
asyncio
async
def
test_invocations
(
server
:
RemoteOpenAIServer
,
client
:
openai
.
AsyncOpenAI
):
request_args
=
{
"model"
:
MODEL_NAME
,
"prompt"
:
"Hello, my name is"
,
"max_tokens"
:
5
,
"temperature"
:
0.0
,
"logprobs"
:
None
,
}
completion
=
await
client
.
completions
.
create
(
**
request_args
)
invocation_response
=
requests
.
post
(
server
.
url_for
(
"invocations"
),
json
=
request_args
)
invocation_response
.
raise_for_status
()
completion_output
=
completion
.
model_dump
()
invocation_output
=
invocation_response
.
json
()
assert
completion_output
.
keys
()
==
invocation_output
.
keys
()
assert
completion_output
[
"choices"
]
==
invocation_output
[
"choices"
]
tests/entrypoints/openai/test_embedding.py
View file @
cbd14ed5
...
@@ -296,3 +296,63 @@ async def test_single_embedding_truncation_invalid(client: openai.AsyncOpenAI,
...
@@ -296,3 +296,63 @@ async def test_single_embedding_truncation_invalid(client: openai.AsyncOpenAI,
assert
"error"
in
response
.
object
assert
"error"
in
response
.
object
assert
"truncate_prompt_tokens value is greater than max_model_len. "
\
assert
"truncate_prompt_tokens value is greater than max_model_len. "
\
"Please, select a smaller truncation size."
in
response
.
message
"Please, select a smaller truncation size."
in
response
.
message
@
pytest
.
mark
.
asyncio
async
def
test_invocations
(
server
:
RemoteOpenAIServer
,
client
:
openai
.
AsyncOpenAI
):
input_texts
=
[
"The chef prepared a delicious meal."
,
]
request_args
=
{
"model"
:
MODEL_NAME
,
"input"
:
input_texts
,
"encoding_format"
:
"float"
,
}
completion_response
=
await
client
.
embeddings
.
create
(
**
request_args
)
invocation_response
=
requests
.
post
(
server
.
url_for
(
"invocations"
),
json
=
request_args
)
invocation_response
.
raise_for_status
()
completion_output
=
completion_response
.
model_dump
()
invocation_output
=
invocation_response
.
json
()
assert
completion_output
.
keys
()
==
invocation_output
.
keys
()
assert
completion_output
[
"data"
]
==
invocation_output
[
"data"
]
@
pytest
.
mark
.
asyncio
async
def
test_invocations_conversation
(
server
:
RemoteOpenAIServer
):
messages
=
[{
"role"
:
"user"
,
"content"
:
"The cat sat on the mat."
,
},
{
"role"
:
"assistant"
,
"content"
:
"A feline was resting on a rug."
,
},
{
"role"
:
"user"
,
"content"
:
"Stars twinkle brightly in the night sky."
,
}]
request_args
=
{
"model"
:
MODEL_NAME
,
"messages"
:
messages
,
"encoding_format"
:
"float"
,
}
chat_response
=
requests
.
post
(
server
.
url_for
(
"v1/embeddings"
),
json
=
request_args
)
chat_response
.
raise_for_status
()
invocation_response
=
requests
.
post
(
server
.
url_for
(
"invocations"
),
json
=
request_args
)
invocation_response
.
raise_for_status
()
chat_output
=
chat_response
.
json
()
invocation_output
=
invocation_response
.
json
()
assert
chat_output
.
keys
()
==
invocation_output
.
keys
()
assert
chat_output
[
"data"
]
==
invocation_output
[
"data"
]
tests/entrypoints/openai/test_pooling.py
View file @
cbd14ed5
...
@@ -13,7 +13,7 @@ from vllm.transformers_utils.tokenizer import get_tokenizer
...
@@ -13,7 +13,7 @@ from vllm.transformers_utils.tokenizer import get_tokenizer
from
...utils
import
RemoteOpenAIServer
from
...utils
import
RemoteOpenAIServer
MODEL_NAME
=
"
jason9693/Qwen2.5-1.5B-apeach
"
MODEL_NAME
=
"
internlm/internlm2-1_8b-reward
"
DUMMY_CHAT_TEMPLATE
=
"""{% for message in messages %}{{message['role'] + ': ' + message['content'] + '
\\
n'}}{% endfor %}"""
# noqa: E501
DUMMY_CHAT_TEMPLATE
=
"""{% for message in messages %}{{message['role'] + ': ' + message['content'] + '
\\
n'}}{% endfor %}"""
# noqa: E501
...
@@ -21,15 +21,16 @@ DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' +
...
@@ -21,15 +21,16 @@ DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' +
def
server
():
def
server
():
args
=
[
args
=
[
"--task"
,
"--task"
,
"
classify
"
,
"
reward
"
,
# use half precision for speed and memory savings in CI environment
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"--dtype"
,
"bfloat16"
,
"bfloat16"
,
"--enforce-eager"
,
"--enforce-eager"
,
"--max-model-len"
,
"--max-model-len"
,
"
819
2"
,
"
51
2"
,
"--chat-template"
,
"--chat-template"
,
DUMMY_CHAT_TEMPLATE
,
DUMMY_CHAT_TEMPLATE
,
"--trust-remote-code"
,
]
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
...
@@ -57,10 +58,10 @@ async def test_single_pooling(server: RemoteOpenAIServer, model_name: str):
...
@@ -57,10 +58,10 @@ async def test_single_pooling(server: RemoteOpenAIServer, model_name: str):
assert
poolings
.
id
is
not
None
assert
poolings
.
id
is
not
None
assert
len
(
poolings
.
data
)
==
1
assert
len
(
poolings
.
data
)
==
1
assert
len
(
poolings
.
data
[
0
].
data
)
==
2
assert
len
(
poolings
.
data
[
0
].
data
)
==
8
assert
poolings
.
usage
.
completion_tokens
==
0
assert
poolings
.
usage
.
completion_tokens
==
0
assert
poolings
.
usage
.
prompt_tokens
==
7
assert
poolings
.
usage
.
prompt_tokens
==
8
assert
poolings
.
usage
.
total_tokens
==
7
assert
poolings
.
usage
.
total_tokens
==
8
# test using token IDs
# test using token IDs
input_tokens
=
[
1
,
1
,
1
,
1
,
1
]
input_tokens
=
[
1
,
1
,
1
,
1
,
1
]
...
@@ -77,7 +78,7 @@ async def test_single_pooling(server: RemoteOpenAIServer, model_name: str):
...
@@ -77,7 +78,7 @@ async def test_single_pooling(server: RemoteOpenAIServer, model_name: str):
assert
poolings
.
id
is
not
None
assert
poolings
.
id
is
not
None
assert
len
(
poolings
.
data
)
==
1
assert
len
(
poolings
.
data
)
==
1
assert
len
(
poolings
.
data
[
0
].
data
)
==
2
assert
len
(
poolings
.
data
[
0
].
data
)
==
5
assert
poolings
.
usage
.
completion_tokens
==
0
assert
poolings
.
usage
.
completion_tokens
==
0
assert
poolings
.
usage
.
prompt_tokens
==
5
assert
poolings
.
usage
.
prompt_tokens
==
5
assert
poolings
.
usage
.
total_tokens
==
5
assert
poolings
.
usage
.
total_tokens
==
5
...
@@ -104,10 +105,10 @@ async def test_batch_pooling(server: RemoteOpenAIServer, model_name: str):
...
@@ -104,10 +105,10 @@ async def test_batch_pooling(server: RemoteOpenAIServer, model_name: str):
assert
poolings
.
id
is
not
None
assert
poolings
.
id
is
not
None
assert
len
(
poolings
.
data
)
==
3
assert
len
(
poolings
.
data
)
==
3
assert
len
(
poolings
.
data
[
0
].
data
)
==
2
assert
len
(
poolings
.
data
[
0
].
data
)
==
8
assert
poolings
.
usage
.
completion_tokens
==
0
assert
poolings
.
usage
.
completion_tokens
==
0
assert
poolings
.
usage
.
prompt_tokens
==
2
5
assert
poolings
.
usage
.
prompt_tokens
==
2
9
assert
poolings
.
usage
.
total_tokens
==
2
5
assert
poolings
.
usage
.
total_tokens
==
2
9
# test list[list[int]]
# test list[list[int]]
input_tokens
=
[[
4
,
5
,
7
,
9
,
20
],
[
15
,
29
,
499
],
[
24
,
24
,
24
,
24
,
24
],
input_tokens
=
[[
4
,
5
,
7
,
9
,
20
],
[
15
,
29
,
499
],
[
24
,
24
,
24
,
24
,
24
],
...
@@ -125,7 +126,7 @@ async def test_batch_pooling(server: RemoteOpenAIServer, model_name: str):
...
@@ -125,7 +126,7 @@ async def test_batch_pooling(server: RemoteOpenAIServer, model_name: str):
assert
poolings
.
id
is
not
None
assert
poolings
.
id
is
not
None
assert
len
(
poolings
.
data
)
==
4
assert
len
(
poolings
.
data
)
==
4
assert
len
(
poolings
.
data
[
0
].
data
)
==
2
assert
len
(
poolings
.
data
[
0
].
data
)
==
5
assert
poolings
.
usage
.
completion_tokens
==
0
assert
poolings
.
usage
.
completion_tokens
==
0
assert
poolings
.
usage
.
prompt_tokens
==
17
assert
poolings
.
usage
.
prompt_tokens
==
17
assert
poolings
.
usage
.
total_tokens
==
17
assert
poolings
.
usage
.
total_tokens
==
17
...
@@ -157,7 +158,11 @@ async def test_conversation_pooling(server: RemoteOpenAIServer,
...
@@ -157,7 +158,11 @@ async def test_conversation_pooling(server: RemoteOpenAIServer,
chat_response
.
raise_for_status
()
chat_response
.
raise_for_status
()
chat_poolings
=
PoolingResponse
.
model_validate
(
chat_response
.
json
())
chat_poolings
=
PoolingResponse
.
model_validate
(
chat_response
.
json
())
tokenizer
=
get_tokenizer
(
tokenizer_name
=
model_name
,
tokenizer_mode
=
"fast"
)
tokenizer
=
get_tokenizer
(
tokenizer_name
=
model_name
,
tokenizer_mode
=
"fast"
,
trust_remote_code
=
True
,
)
prompt
=
tokenizer
.
apply_chat_template
(
prompt
=
tokenizer
.
apply_chat_template
(
messages
,
messages
,
chat_template
=
DUMMY_CHAT_TEMPLATE
,
chat_template
=
DUMMY_CHAT_TEMPLATE
,
...
@@ -206,6 +211,9 @@ async def test_batch_base64_pooling(server: RemoteOpenAIServer,
...
@@ -206,6 +211,9 @@ async def test_batch_base64_pooling(server: RemoteOpenAIServer,
)
)
float_response
.
raise_for_status
()
float_response
.
raise_for_status
()
responses_float
=
PoolingResponse
.
model_validate
(
float_response
.
json
())
responses_float
=
PoolingResponse
.
model_validate
(
float_response
.
json
())
float_data
=
[
np
.
array
(
d
.
data
).
squeeze
(
-
1
).
tolist
()
for
d
in
responses_float
.
data
]
base64_response
=
requests
.
post
(
base64_response
=
requests
.
post
(
server
.
url_for
(
"pooling"
),
server
.
url_for
(
"pooling"
),
...
@@ -224,11 +232,10 @@ async def test_batch_base64_pooling(server: RemoteOpenAIServer,
...
@@ -224,11 +232,10 @@ async def test_batch_base64_pooling(server: RemoteOpenAIServer,
np
.
frombuffer
(
base64
.
b64decode
(
data
.
data
),
np
.
frombuffer
(
base64
.
b64decode
(
data
.
data
),
dtype
=
"float32"
).
tolist
())
dtype
=
"float32"
).
tolist
())
check_embeddings_close
(
check_embeddings_close
(
embeddings_0_lst
=
float_data
,
embeddings_0_lst
=
[
d
.
data
for
d
in
responses_float
.
data
],
embeddings_1_lst
=
decoded_responses_base64_data
,
embeddings_1_lst
=
decoded_responses_base64_data
,
name_0
=
"float32"
,
name_0
=
"float32"
,
name_1
=
"base64"
)
name_1
=
"base64"
)
# Default response is float32 decoded from base64 by OpenAI Client
# Default response is float32 decoded from base64 by OpenAI Client
default_response
=
requests
.
post
(
default_response
=
requests
.
post
(
...
@@ -240,9 +247,71 @@ async def test_batch_base64_pooling(server: RemoteOpenAIServer,
...
@@ -240,9 +247,71 @@ async def test_batch_base64_pooling(server: RemoteOpenAIServer,
)
)
default_response
.
raise_for_status
()
default_response
.
raise_for_status
()
responses_default
=
PoolingResponse
.
model_validate
(
default_response
.
json
())
responses_default
=
PoolingResponse
.
model_validate
(
default_response
.
json
())
default_data
=
[
np
.
array
(
d
.
data
).
squeeze
(
-
1
).
tolist
()
for
d
in
responses_default
.
data
]
check_embeddings_close
(
embeddings_0_lst
=
float_data
,
embeddings_1_lst
=
default_data
,
name_0
=
"float32"
,
name_1
=
"default"
)
@
pytest
.
mark
.
asyncio
async
def
test_invocations
(
server
:
RemoteOpenAIServer
):
input_texts
=
[
"The chef prepared a delicious meal."
,
]
request_args
=
{
"model"
:
MODEL_NAME
,
"input"
:
input_texts
,
"encoding_format"
:
"float"
,
}
completion_response
=
requests
.
post
(
server
.
url_for
(
"pooling"
),
json
=
request_args
)
completion_response
.
raise_for_status
()
invocation_response
=
requests
.
post
(
server
.
url_for
(
"invocations"
),
json
=
request_args
)
invocation_response
.
raise_for_status
()
completion_output
=
completion_response
.
json
()
invocation_output
=
invocation_response
.
json
()
assert
completion_output
.
keys
()
==
invocation_output
.
keys
()
assert
completion_output
[
"data"
]
==
invocation_output
[
"data"
]
@
pytest
.
mark
.
asyncio
async
def
test_invocations_conversation
(
server
:
RemoteOpenAIServer
):
messages
=
[{
"role"
:
"user"
,
"content"
:
"The cat sat on the mat."
,
},
{
"role"
:
"assistant"
,
"content"
:
"A feline was resting on a rug."
,
},
{
"role"
:
"user"
,
"content"
:
"Stars twinkle brightly in the night sky."
,
}]
request_args
=
{
"model"
:
MODEL_NAME
,
"messages"
:
messages
,
"encoding_format"
:
"float"
,
}
chat_response
=
requests
.
post
(
server
.
url_for
(
"pooling"
),
json
=
request_args
)
chat_response
.
raise_for_status
()
invocation_response
=
requests
.
post
(
server
.
url_for
(
"invocations"
),
json
=
request_args
)
invocation_response
.
raise_for_status
()
chat_output
=
chat_response
.
json
()
invocation_output
=
invocation_response
.
json
()
check_embeddings_close
(
assert
chat_output
.
keys
()
==
invocation_output
.
keys
()
embeddings_0_lst
=
[
d
.
data
for
d
in
responses_default
.
data
],
assert
chat_output
[
"data"
]
==
invocation_output
[
"data"
]
embeddings_1_lst
=
[
d
.
data
for
d
in
responses_default
.
data
],
name_0
=
"float32"
,
name_1
=
"base64"
)
tests/entrypoints/openai/test_rerank.py
View file @
cbd14ed5
...
@@ -94,3 +94,30 @@ def test_rerank_max_model_len(server: RemoteOpenAIServer, model_name: str):
...
@@ -94,3 +94,30 @@ def test_rerank_max_model_len(server: RemoteOpenAIServer, model_name: str):
# Assert just a small fragments of the response
# Assert just a small fragments of the response
assert
"Please reduce the length of the input."
in
\
assert
"Please reduce the length of the input."
in
\
rerank_response
.
text
rerank_response
.
text
def
test_invocations
(
server
:
RemoteOpenAIServer
):
query
=
"What is the capital of France?"
documents
=
[
"The capital of Brazil is Brasilia."
,
"The capital of France is Paris."
]
request_args
=
{
"model"
:
MODEL_NAME
,
"query"
:
query
,
"documents"
:
documents
,
}
rerank_response
=
requests
.
post
(
server
.
url_for
(
"rerank"
),
json
=
request_args
)
rerank_response
.
raise_for_status
()
invocation_response
=
requests
.
post
(
server
.
url_for
(
"invocations"
),
json
=
request_args
)
invocation_response
.
raise_for_status
()
rerank_output
=
rerank_response
.
json
()
invocation_output
=
invocation_response
.
json
()
assert
rerank_output
.
keys
()
==
invocation_output
.
keys
()
assert
rerank_output
[
"results"
]
==
invocation_output
[
"results"
]
tests/entrypoints/openai/test_score.py
View file @
cbd14ed5
...
@@ -191,3 +191,28 @@ class TestModel:
...
@@ -191,3 +191,28 @@ class TestModel:
assert
score_response
.
status_code
==
400
assert
score_response
.
status_code
==
400
assert
"Please, select a smaller truncation size."
in
\
assert
"Please, select a smaller truncation size."
in
\
score_response
.
text
score_response
.
text
def
test_invocations
(
self
,
server
:
RemoteOpenAIServer
,
model
:
dict
[
str
,
Any
]):
text_1
=
"What is the capital of France?"
text_2
=
"The capital of France is Paris."
request_args
=
{
"model"
:
model
[
"name"
],
"text_1"
:
text_1
,
"text_2"
:
text_2
,
}
score_response
=
requests
.
post
(
server
.
url_for
(
"score"
),
json
=
request_args
)
score_response
.
raise_for_status
()
invocation_response
=
requests
.
post
(
server
.
url_for
(
"invocations"
),
json
=
request_args
)
invocation_response
.
raise_for_status
()
score_output
=
score_response
.
json
()
invocation_output
=
invocation_response
.
json
()
assert
score_output
.
keys
()
==
invocation_output
.
keys
()
assert
score_output
[
"data"
]
==
invocation_output
[
"data"
]
vllm/entrypoints/openai/api_server.py
View file @
cbd14ed5
...
@@ -18,7 +18,7 @@ from collections.abc import AsyncIterator, Awaitable
...
@@ -18,7 +18,7 @@ from collections.abc import AsyncIterator, Awaitable
from
contextlib
import
asynccontextmanager
from
contextlib
import
asynccontextmanager
from
functools
import
partial
from
functools
import
partial
from
http
import
HTTPStatus
from
http
import
HTTPStatus
from
typing
import
Annotated
,
Any
,
Optional
from
typing
import
Annotated
,
Any
,
Callable
,
Optional
import
prometheus_client
import
prometheus_client
import
pydantic
import
pydantic
...
@@ -61,13 +61,9 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
...
@@ -61,13 +61,9 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
CompletionResponse
,
CompletionResponse
,
DetokenizeRequest
,
DetokenizeRequest
,
DetokenizeResponse
,
DetokenizeResponse
,
EmbeddingChatRequest
,
EmbeddingCompletionRequest
,
EmbeddingRequest
,
EmbeddingRequest
,
EmbeddingResponse
,
ErrorResponse
,
EmbeddingResponse
,
ErrorResponse
,
LoadLoRAAdapterRequest
,
LoadLoRAAdapterRequest
,
PoolingChatRequest
,
PoolingCompletionRequest
,
PoolingRequest
,
PoolingResponse
,
PoolingRequest
,
PoolingResponse
,
RerankRequest
,
RerankResponse
,
RerankRequest
,
RerankResponse
,
ResponsesRequest
,
ResponsesRequest
,
...
@@ -434,6 +430,7 @@ async def get_server_load_metrics(request: Request):
...
@@ -434,6 +430,7 @@ async def get_server_load_metrics(request: Request):
# - /v1/chat/completions
# - /v1/chat/completions
# - /v1/completions
# - /v1/completions
# - /v1/audio/transcriptions
# - /v1/audio/transcriptions
# - /v1/audio/translations
# - /v1/embeddings
# - /v1/embeddings
# - /pooling
# - /pooling
# - /classify
# - /classify
...
@@ -957,31 +954,6 @@ async def do_rerank_v2(request: RerankRequest, raw_request: Request):
...
@@ -957,31 +954,6 @@ async def do_rerank_v2(request: RerankRequest, raw_request: Request):
return
await
do_rerank
(
request
,
raw_request
)
return
await
do_rerank
(
request
,
raw_request
)
TASK_HANDLERS
:
dict
[
str
,
dict
[
str
,
tuple
]]
=
{
"generate"
:
{
"messages"
:
(
ChatCompletionRequest
,
create_chat_completion
),
"default"
:
(
CompletionRequest
,
create_completion
),
},
"embed"
:
{
"messages"
:
(
EmbeddingChatRequest
,
create_embedding
),
"default"
:
(
EmbeddingCompletionRequest
,
create_embedding
),
},
"score"
:
{
"default"
:
(
RerankRequest
,
do_rerank
)
},
"rerank"
:
{
"default"
:
(
RerankRequest
,
do_rerank
)
},
"reward"
:
{
"messages"
:
(
PoolingChatRequest
,
create_pooling
),
"default"
:
(
PoolingCompletionRequest
,
create_pooling
),
},
"classify"
:
{
"messages"
:
(
PoolingChatRequest
,
create_pooling
),
"default"
:
(
PoolingCompletionRequest
,
create_pooling
),
},
}
if
envs
.
VLLM_SERVER_DEV_MODE
:
if
envs
.
VLLM_SERVER_DEV_MODE
:
logger
.
warning
(
"SECURITY WARNING: Development endpoints are enabled! "
logger
.
warning
(
"SECURITY WARNING: Development endpoints are enabled! "
"This should NOT be used in production!"
)
"This should NOT be used in production!"
)
...
@@ -1033,6 +1005,30 @@ if envs.VLLM_SERVER_DEV_MODE:
...
@@ -1033,6 +1005,30 @@ if envs.VLLM_SERVER_DEV_MODE:
return
JSONResponse
(
content
=
{
"is_sleeping"
:
is_sleeping
})
return
JSONResponse
(
content
=
{
"is_sleeping"
:
is_sleeping
})
# TODO: RequestType = TypeForm[BaseModel] when recognized by type checkers
# (requires typing_extensions >= 4.13)
RequestType
=
Any
GetHandlerFn
=
Callable
[[
Request
],
Optional
[
OpenAIServing
]]
EndpointFn
=
Callable
[[
RequestType
,
Request
],
Awaitable
[
Any
]]
# NOTE: Items defined earlier take higher priority
INVOCATION_TYPES
:
list
[
tuple
[
RequestType
,
tuple
[
GetHandlerFn
,
EndpointFn
]]]
=
[
(
ChatCompletionRequest
,
(
chat
,
create_chat_completion
)),
(
CompletionRequest
,
(
completion
,
create_completion
)),
(
EmbeddingRequest
,
(
embedding
,
create_embedding
)),
(
ClassificationRequest
,
(
classify
,
create_classify
)),
(
ScoreRequest
,
(
score
,
create_score
)),
(
RerankRequest
,
(
rerank
,
do_rerank
)),
(
PoolingRequest
,
(
pooling
,
create_pooling
)),
]
# NOTE: Construct the TypeAdapters only once
INVOCATION_VALIDATORS
=
[
(
pydantic
.
TypeAdapter
(
request_type
),
(
get_handler
,
endpoint
))
for
request_type
,
(
get_handler
,
endpoint
)
in
INVOCATION_TYPES
]
@
router
.
post
(
"/invocations"
,
@
router
.
post
(
"/invocations"
,
dependencies
=
[
Depends
(
validate_json_request
)],
dependencies
=
[
Depends
(
validate_json_request
)],
responses
=
{
responses
=
{
...
@@ -1047,32 +1043,34 @@ if envs.VLLM_SERVER_DEV_MODE:
...
@@ -1047,32 +1043,34 @@ if envs.VLLM_SERVER_DEV_MODE:
},
},
})
})
async
def
invocations
(
raw_request
:
Request
):
async
def
invocations
(
raw_request
:
Request
):
"""
"""For SageMaker, routes requests based on the request type."""
For SageMaker, routes requests to other handlers based on model `task`.
"""
try
:
try
:
body
=
await
raw_request
.
json
()
body
=
await
raw_request
.
json
()
except
json
.
JSONDecodeError
as
e
:
except
json
.
JSONDecodeError
as
e
:
raise
HTTPException
(
status_code
=
HTTPStatus
.
BAD_REQUEST
.
value
,
raise
HTTPException
(
status_code
=
HTTPStatus
.
BAD_REQUEST
.
value
,
detail
=
f
"JSON decode error:
{
e
}
"
)
from
e
detail
=
f
"JSON decode error:
{
e
}
"
)
from
e
task
=
raw_request
.
app
.
state
.
task
valid_endpoints
=
[(
validator
,
endpoint
)
for
validator
,
(
get_handler
,
endpoint
)
in
INVOCATION_VALIDATORS
if
get_handler
(
raw_request
)
is
not
None
]
if
task
not
in
TASK_HANDLERS
:
for
request_validator
,
endpoint
in
valid_endpoints
:
raise
HTTPException
(
try
:
status_code
=
400
,
request
=
request_validator
.
validate_python
(
body
)
detail
=
f
"Unsupported task: '
{
task
}
' for '/invocations'. "
except
pydantic
.
ValidationError
:
f
"Expected one of
{
set
(
TASK_HANDLERS
.
keys
())
}
"
)
continue
handler_config
=
TASK_HANDLERS
[
task
]
return
await
endpoint
(
request
,
raw_request
)
if
"messages"
in
body
:
request_model
,
handler
=
handler_config
[
"messages"
]
else
:
request_model
,
handler
=
handler_config
[
"default"
]
# this is required since we lose the FastAPI automatic casting
type_names
=
[
request
=
request_model
.
model_validate
(
body
)
t
.
__name__
if
isinstance
(
t
:
=
validator
.
_type
,
type
)
else
str
(
t
)
return
await
handler
(
request
,
raw_request
)
for
validator
,
_
in
valid_endpoints
]
msg
=
(
"Cannot find suitable handler for request. "
f
"Expected one of:
{
type_names
}
"
)
res
=
base
(
raw_request
).
create_error_response
(
message
=
msg
)
return
JSONResponse
(
content
=
res
.
model_dump
(),
status_code
=
res
.
code
)
if
envs
.
VLLM_TORCH_PROFILER_DIR
:
if
envs
.
VLLM_TORCH_PROFILER_DIR
:
...
...
vllm/entrypoints/openai/protocol.py
View file @
cbd14ed5
...
@@ -11,6 +11,12 @@ from typing import Annotated, Any, ClassVar, Literal, Optional, Union
...
@@ -11,6 +11,12 @@ from typing import Annotated, Any, ClassVar, Literal, Optional, Union
import
regex
as
re
import
regex
as
re
import
torch
import
torch
from
fastapi
import
HTTPException
,
UploadFile
from
fastapi
import
HTTPException
,
UploadFile
# yapf: disable
from
openai.types.chat.chat_completion_audio
import
(
ChatCompletionAudio
as
OpenAIChatCompletionAudio
)
from
openai.types.chat.chat_completion_message
import
(
Annotation
as
OpenAIAnnotation
)
# yapf: enable
from
openai.types.responses
import
(
ResponseInputParam
,
ResponseOutputItem
,
from
openai.types.responses
import
(
ResponseInputParam
,
ResponseOutputItem
,
ResponseOutputMessage
,
ResponsePrompt
,
ResponseOutputMessage
,
ResponsePrompt
,
ResponseStatus
,
ResponseTextConfig
)
ResponseStatus
,
ResponseTextConfig
)
...
@@ -1393,11 +1399,16 @@ class CompletionResponseChoice(OpenAIBaseModel):
...
@@ -1393,11 +1399,16 @@ class CompletionResponseChoice(OpenAIBaseModel):
class
CompletionResponse
(
OpenAIBaseModel
):
class
CompletionResponse
(
OpenAIBaseModel
):
id
:
str
=
Field
(
default_factory
=
lambda
:
f
"cmpl-
{
random_uuid
()
}
"
)
id
:
str
=
Field
(
default_factory
=
lambda
:
f
"cmpl-
{
random_uuid
()
}
"
)
object
:
str
=
"text_completion"
object
:
Literal
[
"text_completion"
]
=
"text_completion"
created
:
int
=
Field
(
default_factory
=
lambda
:
int
(
time
.
time
()))
created
:
int
=
Field
(
default_factory
=
lambda
:
int
(
time
.
time
()))
model
:
str
model
:
str
choices
:
list
[
CompletionResponseChoice
]
choices
:
list
[
CompletionResponseChoice
]
service_tier
:
Optional
[
Literal
[
"auto"
,
"default"
,
"flex"
,
"scale"
,
"priority"
]]
=
None
system_fingerprint
:
Optional
[
str
]
=
None
usage
:
UsageInfo
usage
:
UsageInfo
# vLLM-specific fields that are not in OpenAI spec
kv_transfer_params
:
Optional
[
dict
[
str
,
Any
]]
=
Field
(
kv_transfer_params
:
Optional
[
dict
[
str
,
Any
]]
=
Field
(
default
=
None
,
description
=
"KVTransfer parameters."
)
default
=
None
,
description
=
"KVTransfer parameters."
)
...
@@ -1549,10 +1560,16 @@ class ExtractedToolCallInformation(BaseModel):
...
@@ -1549,10 +1560,16 @@ class ExtractedToolCallInformation(BaseModel):
class
ChatMessage
(
OpenAIBaseModel
):
class
ChatMessage
(
OpenAIBaseModel
):
role
:
str
role
:
str
reasoning_content
:
Optional
[
str
]
=
None
content
:
Optional
[
str
]
=
None
content
:
Optional
[
str
]
=
None
refusal
:
Optional
[
str
]
=
None
annotations
:
Optional
[
OpenAIAnnotation
]
=
None
audio
:
Optional
[
OpenAIChatCompletionAudio
]
=
None
function_call
:
Optional
[
FunctionCall
]
=
None
tool_calls
:
list
[
ToolCall
]
=
Field
(
default_factory
=
list
)
tool_calls
:
list
[
ToolCall
]
=
Field
(
default_factory
=
list
)
# vLLM-specific fields that are not in OpenAI spec
reasoning_content
:
Optional
[
str
]
=
None
class
ChatCompletionLogProb
(
OpenAIBaseModel
):
class
ChatCompletionLogProb
(
OpenAIBaseModel
):
token
:
str
token
:
str
...
@@ -1587,7 +1604,12 @@ class ChatCompletionResponse(OpenAIBaseModel):
...
@@ -1587,7 +1604,12 @@ class ChatCompletionResponse(OpenAIBaseModel):
created
:
int
=
Field
(
default_factory
=
lambda
:
int
(
time
.
time
()))
created
:
int
=
Field
(
default_factory
=
lambda
:
int
(
time
.
time
()))
model
:
str
model
:
str
choices
:
list
[
ChatCompletionResponseChoice
]
choices
:
list
[
ChatCompletionResponseChoice
]
service_tier
:
Optional
[
Literal
[
"auto"
,
"default"
,
"flex"
,
"scale"
,
"priority"
]]
=
None
system_fingerprint
:
Optional
[
str
]
=
None
usage
:
UsageInfo
usage
:
UsageInfo
# vLLM-specific fields that are not in OpenAI spec
prompt_logprobs
:
Optional
[
list
[
Optional
[
dict
[
int
,
Logprob
]]]]
=
None
prompt_logprobs
:
Optional
[
list
[
Optional
[
dict
[
int
,
Logprob
]]]]
=
None
kv_transfer_params
:
Optional
[
dict
[
str
,
Any
]]
=
Field
(
kv_transfer_params
:
Optional
[
dict
[
str
,
Any
]]
=
Field
(
default
=
None
,
description
=
"KVTransfer parameters."
)
default
=
None
,
description
=
"KVTransfer parameters."
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment