Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
cf069aa8
Unverified
Commit
cf069aa8
authored
Mar 03, 2025
by
Harry Mellor
Committed by
GitHub
Mar 02, 2025
Browse files
Update deprecated Python 3.8 typing (#13971)
parent
bf33700e
Changes
300
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
225 additions
and
214 deletions
+225
-214
vllm/entrypoints/logger.py
vllm/entrypoints/logger.py
+2
-2
vllm/entrypoints/openai/api_server.py
vllm/entrypoints/openai/api_server.py
+5
-4
vllm/entrypoints/openai/cli_args.py
vllm/entrypoints/openai/cli_args.py
+4
-3
vllm/entrypoints/openai/logits_processors.py
vllm/entrypoints/openai/logits_processors.py
+12
-11
vllm/entrypoints/openai/protocol.py
vllm/entrypoints/openai/protocol.py
+64
-64
vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py
...ypoints/openai/reasoning_parsers/abs_reasoning_parsers.py
+11
-10
vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py
.../openai/reasoning_parsers/deepseek_r1_reasoning_parser.py
+3
-2
vllm/entrypoints/openai/run_batch.py
vllm/entrypoints/openai/run_batch.py
+5
-4
vllm/entrypoints/openai/serving_chat.py
vllm/entrypoints/openai/serving_chat.py
+15
-16
vllm/entrypoints/openai/serving_completion.py
vllm/entrypoints/openai/serving_completion.py
+16
-16
vllm/entrypoints/openai/serving_embedding.py
vllm/entrypoints/openai/serving_embedding.py
+8
-7
vllm/entrypoints/openai/serving_engine.py
vllm/entrypoints/openai/serving_engine.py
+21
-22
vllm/entrypoints/openai/serving_models.py
vllm/entrypoints/openai/serving_models.py
+5
-5
vllm/entrypoints/openai/serving_pooling.py
vllm/entrypoints/openai/serving_pooling.py
+8
-7
vllm/entrypoints/openai/serving_score.py
vllm/entrypoints/openai/serving_score.py
+25
-24
vllm/entrypoints/openai/serving_tokenization.py
vllm/entrypoints/openai/serving_tokenization.py
+2
-2
vllm/entrypoints/openai/serving_transcription.py
vllm/entrypoints/openai/serving_transcription.py
+2
-1
vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
+11
-10
vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
...ypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
+3
-2
vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
+3
-2
No files found.
vllm/entrypoints/logger.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
from
typing
import
List
,
Optional
,
Union
from
typing
import
Optional
,
Union
from
vllm.logger
import
init_logger
from
vllm.lora.request
import
LoRARequest
...
...
@@ -22,7 +22,7 @@ class RequestLogger:
self
,
request_id
:
str
,
prompt
:
Optional
[
str
],
prompt_token_ids
:
Optional
[
L
ist
[
int
]],
prompt_token_ids
:
Optional
[
l
ist
[
int
]],
params
:
Optional
[
Union
[
SamplingParams
,
PoolingParams
,
BeamSearchParams
]],
lora_request
:
Optional
[
LoRARequest
],
...
...
vllm/entrypoints/openai/api_server.py
View file @
cf069aa8
...
...
@@ -13,10 +13,11 @@ import socket
import
tempfile
import
uuid
from
argparse
import
Namespace
from
collections.abc
import
AsyncIterator
from
contextlib
import
asynccontextmanager
from
functools
import
partial
from
http
import
HTTPStatus
from
typing
import
Annotated
,
AsyncIterator
,
Dict
,
Optional
,
Set
,
Tuple
,
Union
from
typing
import
Annotated
,
Optional
,
Union
import
uvloop
from
fastapi
import
APIRouter
,
Depends
,
FastAPI
,
Form
,
HTTPException
,
Request
...
...
@@ -93,7 +94,7 @@ prometheus_multiproc_dir: tempfile.TemporaryDirectory
# Cannot use __name__ (https://github.com/vllm-project/vllm/pull/4765)
logger
=
init_logger
(
'vllm.entrypoints.openai.api_server'
)
_running_tasks
:
S
et
[
asyncio
.
Task
]
=
set
()
_running_tasks
:
s
et
[
asyncio
.
Task
]
=
set
()
@
asynccontextmanager
...
...
@@ -587,7 +588,7 @@ async def do_rerank_v2(request: RerankRequest, raw_request: Request):
return
await
do_rerank
(
request
,
raw_request
)
TASK_HANDLERS
:
D
ict
[
str
,
D
ict
[
str
,
tuple
]]
=
{
TASK_HANDLERS
:
d
ict
[
str
,
d
ict
[
str
,
tuple
]]
=
{
"generate"
:
{
"messages"
:
(
ChatCompletionRequest
,
create_chat_completion
),
"default"
:
(
CompletionRequest
,
create_completion
),
...
...
@@ -894,7 +895,7 @@ async def init_app_state(
state
.
task
=
model_config
.
task
def
create_server_socket
(
addr
:
T
uple
[
str
,
int
])
->
socket
.
socket
:
def
create_server_socket
(
addr
:
t
uple
[
str
,
int
])
->
socket
.
socket
:
family
=
socket
.
AF_INET
if
is_valid_ipv6_address
(
addr
[
0
]):
family
=
socket
.
AF_INET6
...
...
vllm/entrypoints/openai/cli_args.py
View file @
cf069aa8
...
...
@@ -8,7 +8,8 @@ purposes.
import
argparse
import
json
import
ssl
from
typing
import
List
,
Optional
,
Sequence
,
Union
,
get_args
from
collections.abc
import
Sequence
from
typing
import
Optional
,
Union
,
get_args
from
vllm.engine.arg_utils
import
AsyncEngineArgs
,
nullable_str
from
vllm.entrypoints.chat_utils
import
(
ChatTemplateContentFormatOption
,
...
...
@@ -33,7 +34,7 @@ class LoRAParserAction(argparse.Action):
if
isinstance
(
values
,
str
):
raise
TypeError
(
"Expected values to be a list"
)
lora_list
:
L
ist
[
LoRAModulePath
]
=
[]
lora_list
:
l
ist
[
LoRAModulePath
]
=
[]
for
item
in
values
:
if
item
in
[
None
,
''
]:
# Skip if item is None or empty string
continue
...
...
@@ -69,7 +70,7 @@ class PromptAdapterParserAction(argparse.Action):
if
isinstance
(
values
,
str
):
raise
TypeError
(
"Expected values to be a list"
)
adapter_list
:
L
ist
[
PromptAdapterPath
]
=
[]
adapter_list
:
l
ist
[
PromptAdapterPath
]
=
[]
for
item
in
values
:
name
,
path
=
item
.
split
(
'='
)
adapter_list
.
append
(
PromptAdapterPath
(
name
,
path
))
...
...
vllm/entrypoints/openai/logits_processors.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
from
collections.abc
import
Iterable
from
functools
import
lru_cache
,
partial
from
typing
import
Dict
,
FrozenSet
,
Iterable
,
List
,
Optional
,
Union
from
typing
import
Optional
,
Union
import
torch
...
...
@@ -14,10 +15,10 @@ class AllowedTokenIdsLogitsProcessor:
specific set of token ids."""
def
__init__
(
self
,
allowed_ids
:
Iterable
[
int
]):
self
.
allowed_ids
:
Optional
[
L
ist
[
int
]]
=
list
(
allowed_ids
)
self
.
allowed_ids
:
Optional
[
l
ist
[
int
]]
=
list
(
allowed_ids
)
self
.
mask
:
Optional
[
torch
.
Tensor
]
=
None
def
__call__
(
self
,
token_ids
:
L
ist
[
int
],
def
__call__
(
self
,
token_ids
:
l
ist
[
int
],
logits
:
torch
.
Tensor
)
->
torch
.
Tensor
:
if
self
.
mask
is
None
:
self
.
mask
=
torch
.
ones
((
logits
.
shape
[
-
1
],
),
...
...
@@ -31,7 +32,7 @@ class AllowedTokenIdsLogitsProcessor:
@
lru_cache
(
maxsize
=
32
)
def
_get_allowed_token_ids_logits_processor
(
allowed_token_ids
:
F
rozen
S
et
[
int
],
allowed_token_ids
:
f
rozen
s
et
[
int
],
vocab_size
:
int
,
)
->
LogitsProcessor
:
if
not
allowed_token_ids
:
...
...
@@ -43,8 +44,8 @@ def _get_allowed_token_ids_logits_processor(
def
logit_bias_logits_processor
(
logit_bias
:
D
ict
[
int
,
float
],
token_ids
:
L
ist
[
int
],
logit_bias
:
d
ict
[
int
,
float
],
token_ids
:
l
ist
[
int
],
logits
:
torch
.
Tensor
,
)
->
torch
.
Tensor
:
for
token_id
,
bias
in
logit_bias
.
items
():
...
...
@@ -53,16 +54,16 @@ def logit_bias_logits_processor(
def
get_logits_processors
(
logit_bias
:
Optional
[
Union
[
D
ict
[
int
,
float
],
D
ict
[
str
,
float
]]],
allowed_token_ids
:
Optional
[
L
ist
[
int
]],
logit_bias
:
Optional
[
Union
[
d
ict
[
int
,
float
],
d
ict
[
str
,
float
]]],
allowed_token_ids
:
Optional
[
l
ist
[
int
]],
tokenizer
:
AnyTokenizer
,
)
->
L
ist
[
LogitsProcessor
]:
logits_processors
:
L
ist
[
LogitsProcessor
]
=
[]
)
->
l
ist
[
LogitsProcessor
]:
logits_processors
:
l
ist
[
LogitsProcessor
]
=
[]
if
logit_bias
:
try
:
# Convert token_id to integer
# Clamp the bias between -100 and 100 per OpenAI API spec
clamped_logit_bias
:
D
ict
[
int
,
float
]
=
{
clamped_logit_bias
:
d
ict
[
int
,
float
]
=
{
int
(
token_id
):
min
(
100.0
,
max
(
-
100.0
,
bias
))
for
token_id
,
bias
in
logit_bias
.
items
()
}
...
...
vllm/entrypoints/openai/protocol.py
View file @
cf069aa8
...
...
@@ -5,13 +5,13 @@
import
re
import
time
from
argparse
import
Namespace
from
typing
import
Any
,
ClassVar
,
Dict
,
List
,
Literal
,
Optional
,
Set
,
Union
from
typing
import
Annotated
,
Any
,
ClassVar
,
Literal
,
Optional
,
Union
import
torch
from
fastapi
import
UploadFile
from
pydantic
import
(
BaseModel
,
ConfigDict
,
Field
,
TypeAdapter
,
ValidationInfo
,
field_validator
,
model_validator
)
from
typing_extensions
import
Annotated
,
TypeAlias
from
typing_extensions
import
TypeAlias
from
vllm.entrypoints.chat_utils
import
ChatCompletionMessageParam
from
vllm.logger
import
init_logger
...
...
@@ -47,7 +47,7 @@ class OpenAIBaseModel(BaseModel):
model_config
=
ConfigDict
(
extra
=
"allow"
)
# Cache class field names
field_names
:
ClassVar
[
Optional
[
S
et
[
str
]]]
=
None
field_names
:
ClassVar
[
Optional
[
s
et
[
str
]]]
=
None
@
model_validator
(
mode
=
"wrap"
)
@
classmethod
...
...
@@ -105,12 +105,12 @@ class ModelCard(OpenAIBaseModel):
root
:
Optional
[
str
]
=
None
parent
:
Optional
[
str
]
=
None
max_model_len
:
Optional
[
int
]
=
None
permission
:
L
ist
[
ModelPermission
]
=
Field
(
default_factory
=
list
)
permission
:
l
ist
[
ModelPermission
]
=
Field
(
default_factory
=
list
)
class
ModelList
(
OpenAIBaseModel
):
object
:
str
=
"list"
data
:
L
ist
[
ModelCard
]
=
Field
(
default_factory
=
list
)
data
:
l
ist
[
ModelCard
]
=
Field
(
default_factory
=
list
)
class
PromptTokenUsageInfo
(
OpenAIBaseModel
):
...
...
@@ -134,7 +134,7 @@ class JsonSchemaResponseFormat(OpenAIBaseModel):
description
:
Optional
[
str
]
=
None
# schema is the field in openai but that causes conflicts with pydantic so
# instead use json_schema with an alias
json_schema
:
Optional
[
D
ict
[
str
,
Any
]]
=
Field
(
default
=
None
,
alias
=
'schema'
)
json_schema
:
Optional
[
d
ict
[
str
,
Any
]]
=
Field
(
default
=
None
,
alias
=
'schema'
)
strict
:
Optional
[
bool
]
=
None
...
...
@@ -152,7 +152,7 @@ class StreamOptions(OpenAIBaseModel):
class
FunctionDefinition
(
OpenAIBaseModel
):
name
:
str
description
:
Optional
[
str
]
=
None
parameters
:
Optional
[
D
ict
[
str
,
Any
]]
=
None
parameters
:
Optional
[
d
ict
[
str
,
Any
]]
=
None
class
ChatCompletionToolsParam
(
OpenAIBaseModel
):
...
...
@@ -171,15 +171,15 @@ class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel):
class
LogitsProcessorConstructor
(
BaseModel
):
qualname
:
str
args
:
Optional
[
L
ist
[
Any
]]
=
None
kwargs
:
Optional
[
D
ict
[
str
,
Any
]]
=
None
args
:
Optional
[
l
ist
[
Any
]]
=
None
kwargs
:
Optional
[
d
ict
[
str
,
Any
]]
=
None
LogitsProcessors
=
L
ist
[
Union
[
str
,
LogitsProcessorConstructor
]]
LogitsProcessors
=
l
ist
[
Union
[
str
,
LogitsProcessorConstructor
]]
def
get_logits_processors
(
processors
:
Optional
[
LogitsProcessors
],
pattern
:
Optional
[
str
])
->
Optional
[
L
ist
[
Any
]]:
pattern
:
Optional
[
str
])
->
Optional
[
l
ist
[
Any
]]:
if
processors
and
pattern
:
logits_processors
=
[]
for
processor
in
processors
:
...
...
@@ -212,10 +212,10 @@ def get_logits_processors(processors: Optional[LogitsProcessors],
class
ChatCompletionRequest
(
OpenAIBaseModel
):
# Ordered by official OpenAI API documentation
# https://platform.openai.com/docs/api-reference/chat/create
messages
:
L
ist
[
ChatCompletionMessageParam
]
messages
:
l
ist
[
ChatCompletionMessageParam
]
model
:
Optional
[
str
]
=
None
frequency_penalty
:
Optional
[
float
]
=
0.0
logit_bias
:
Optional
[
D
ict
[
str
,
float
]]
=
None
logit_bias
:
Optional
[
d
ict
[
str
,
float
]]
=
None
logprobs
:
Optional
[
bool
]
=
False
top_logprobs
:
Optional
[
int
]
=
0
# TODO(#9845): remove max_tokens when field is removed from OpenAI API
...
...
@@ -228,12 +228,12 @@ class ChatCompletionRequest(OpenAIBaseModel):
presence_penalty
:
Optional
[
float
]
=
0.0
response_format
:
Optional
[
ResponseFormat
]
=
None
seed
:
Optional
[
int
]
=
Field
(
None
,
ge
=
_LONG_INFO
.
min
,
le
=
_LONG_INFO
.
max
)
stop
:
Optional
[
Union
[
str
,
L
ist
[
str
]]]
=
Field
(
default_factory
=
list
)
stop
:
Optional
[
Union
[
str
,
l
ist
[
str
]]]
=
Field
(
default_factory
=
list
)
stream
:
Optional
[
bool
]
=
False
stream_options
:
Optional
[
StreamOptions
]
=
None
temperature
:
Optional
[
float
]
=
None
top_p
:
Optional
[
float
]
=
None
tools
:
Optional
[
L
ist
[
ChatCompletionToolsParam
]]
=
None
tools
:
Optional
[
l
ist
[
ChatCompletionToolsParam
]]
=
None
tool_choice
:
Optional
[
Union
[
Literal
[
"none"
],
Literal
[
"auto"
],
ChatCompletionNamedToolChoiceParam
]]
=
"none"
...
...
@@ -248,7 +248,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
min_p
:
Optional
[
float
]
=
None
repetition_penalty
:
Optional
[
float
]
=
None
length_penalty
:
float
=
1.0
stop_token_ids
:
Optional
[
L
ist
[
int
]]
=
Field
(
default_factory
=
list
)
stop_token_ids
:
Optional
[
l
ist
[
int
]]
=
Field
(
default_factory
=
list
)
include_stop_str_in_output
:
bool
=
False
ignore_eos
:
bool
=
False
min_tokens
:
int
=
0
...
...
@@ -290,7 +290,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
"special tokens so this should be set to false (as is the "
"default)."
),
)
documents
:
Optional
[
L
ist
[
D
ict
[
str
,
str
]]]
=
Field
(
documents
:
Optional
[
l
ist
[
d
ict
[
str
,
str
]]]
=
Field
(
default
=
None
,
description
=
(
"A list of dicts representing documents that will be accessible to "
...
...
@@ -307,12 +307,12 @@ class ChatCompletionRequest(OpenAIBaseModel):
"allowed, so you must provide a chat template if the tokenizer "
"does not define one."
),
)
chat_template_kwargs
:
Optional
[
D
ict
[
str
,
Any
]]
=
Field
(
chat_template_kwargs
:
Optional
[
d
ict
[
str
,
Any
]]
=
Field
(
default
=
None
,
description
=
(
"Additional kwargs to pass to the template renderer. "
"Will be accessible by the chat template."
),
)
mm_processor_kwargs
:
Optional
[
D
ict
[
str
,
Any
]]
=
Field
(
mm_processor_kwargs
:
Optional
[
d
ict
[
str
,
Any
]]
=
Field
(
default
=
None
,
description
=
(
"Additional kwargs to pass to the HF processor."
),
)
...
...
@@ -325,7 +325,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
description
=
(
"If specified, the output will follow the regex pattern."
),
)
guided_choice
:
Optional
[
L
ist
[
str
]]
=
Field
(
guided_choice
:
Optional
[
l
ist
[
str
]]
=
Field
(
default
=
None
,
description
=
(
"If specified, the output will be exactly one of the choices."
),
...
...
@@ -643,17 +643,17 @@ class CompletionRequest(OpenAIBaseModel):
# Ordered by official OpenAI API documentation
# https://platform.openai.com/docs/api-reference/completions/create
model
:
Optional
[
str
]
=
None
prompt
:
Union
[
L
ist
[
int
],
L
ist
[
L
ist
[
int
]],
str
,
L
ist
[
str
]]
prompt
:
Union
[
l
ist
[
int
],
l
ist
[
l
ist
[
int
]],
str
,
l
ist
[
str
]]
best_of
:
Optional
[
int
]
=
None
echo
:
Optional
[
bool
]
=
False
frequency_penalty
:
Optional
[
float
]
=
0.0
logit_bias
:
Optional
[
D
ict
[
str
,
float
]]
=
None
logit_bias
:
Optional
[
d
ict
[
str
,
float
]]
=
None
logprobs
:
Optional
[
int
]
=
None
max_tokens
:
Optional
[
int
]
=
16
n
:
int
=
1
presence_penalty
:
Optional
[
float
]
=
0.0
seed
:
Optional
[
int
]
=
Field
(
None
,
ge
=
_LONG_INFO
.
min
,
le
=
_LONG_INFO
.
max
)
stop
:
Optional
[
Union
[
str
,
L
ist
[
str
]]]
=
Field
(
default_factory
=
list
)
stop
:
Optional
[
Union
[
str
,
l
ist
[
str
]]]
=
Field
(
default_factory
=
list
)
stream
:
Optional
[
bool
]
=
False
stream_options
:
Optional
[
StreamOptions
]
=
None
suffix
:
Optional
[
str
]
=
None
...
...
@@ -667,14 +667,14 @@ class CompletionRequest(OpenAIBaseModel):
min_p
:
Optional
[
float
]
=
None
repetition_penalty
:
Optional
[
float
]
=
None
length_penalty
:
float
=
1.0
stop_token_ids
:
Optional
[
L
ist
[
int
]]
=
Field
(
default_factory
=
list
)
stop_token_ids
:
Optional
[
l
ist
[
int
]]
=
Field
(
default_factory
=
list
)
include_stop_str_in_output
:
bool
=
False
ignore_eos
:
bool
=
False
min_tokens
:
int
=
0
skip_special_tokens
:
bool
=
True
spaces_between_special_tokens
:
bool
=
True
truncate_prompt_tokens
:
Optional
[
Annotated
[
int
,
Field
(
ge
=
1
)]]
=
None
allowed_token_ids
:
Optional
[
L
ist
[
int
]]
=
None
allowed_token_ids
:
Optional
[
l
ist
[
int
]]
=
None
prompt_logprobs
:
Optional
[
int
]
=
None
# doc: end-completion-sampling-params
...
...
@@ -701,7 +701,7 @@ class CompletionRequest(OpenAIBaseModel):
description
=
(
"If specified, the output will follow the regex pattern."
),
)
guided_choice
:
Optional
[
L
ist
[
str
]]
=
Field
(
guided_choice
:
Optional
[
l
ist
[
str
]]
=
Field
(
default
=
None
,
description
=
(
"If specified, the output will be exactly one of the choices."
),
...
...
@@ -908,7 +908,7 @@ class EmbeddingCompletionRequest(OpenAIBaseModel):
# Ordered by official OpenAI API documentation
# https://platform.openai.com/docs/api-reference/embeddings
model
:
Optional
[
str
]
=
None
input
:
Union
[
L
ist
[
int
],
L
ist
[
L
ist
[
int
]],
str
,
L
ist
[
str
]]
input
:
Union
[
l
ist
[
int
],
l
ist
[
l
ist
[
int
]],
str
,
l
ist
[
str
]]
encoding_format
:
Literal
[
"float"
,
"base64"
]
=
"float"
dimensions
:
Optional
[
int
]
=
None
user
:
Optional
[
str
]
=
None
...
...
@@ -940,7 +940,7 @@ class EmbeddingCompletionRequest(OpenAIBaseModel):
class
EmbeddingChatRequest
(
OpenAIBaseModel
):
model
:
Optional
[
str
]
=
None
messages
:
L
ist
[
ChatCompletionMessageParam
]
messages
:
l
ist
[
ChatCompletionMessageParam
]
encoding_format
:
Literal
[
"float"
,
"base64"
]
=
"float"
dimensions
:
Optional
[
int
]
=
None
...
...
@@ -969,12 +969,12 @@ class EmbeddingChatRequest(OpenAIBaseModel):
"allowed, so you must provide a chat template if the tokenizer "
"does not define one."
),
)
chat_template_kwargs
:
Optional
[
D
ict
[
str
,
Any
]]
=
Field
(
chat_template_kwargs
:
Optional
[
d
ict
[
str
,
Any
]]
=
Field
(
default
=
None
,
description
=
(
"Additional kwargs to pass to the template renderer. "
"Will be accessible by the chat template."
),
)
mm_processor_kwargs
:
Optional
[
D
ict
[
str
,
Any
]]
=
Field
(
mm_processor_kwargs
:
Optional
[
d
ict
[
str
,
Any
]]
=
Field
(
default
=
None
,
description
=
(
"Additional kwargs to pass to the HF processor."
),
)
...
...
@@ -1008,8 +1008,8 @@ PoolingRequest = Union[PoolingCompletionRequest, PoolingChatRequest]
class
ScoreRequest
(
OpenAIBaseModel
):
model
:
Optional
[
str
]
=
None
text_1
:
Union
[
L
ist
[
str
],
str
]
text_2
:
Union
[
L
ist
[
str
],
str
]
text_1
:
Union
[
l
ist
[
str
],
str
]
text_2
:
Union
[
l
ist
[
str
],
str
]
truncate_prompt_tokens
:
Optional
[
Annotated
[
int
,
Field
(
ge
=
1
)]]
=
None
# doc: begin-score-pooling-params
...
...
@@ -1033,7 +1033,7 @@ class ScoreRequest(OpenAIBaseModel):
class
RerankRequest
(
OpenAIBaseModel
):
model
:
Optional
[
str
]
=
None
query
:
str
documents
:
L
ist
[
str
]
documents
:
l
ist
[
str
]
top_n
:
int
=
Field
(
default_factory
=
lambda
:
0
)
truncate_prompt_tokens
:
Optional
[
Annotated
[
int
,
Field
(
ge
=
1
)]]
=
None
...
...
@@ -1073,14 +1073,14 @@ class RerankResponse(OpenAIBaseModel):
id
:
str
model
:
str
usage
:
RerankUsage
results
:
L
ist
[
RerankResult
]
results
:
l
ist
[
RerankResult
]
class
CompletionLogProbs
(
OpenAIBaseModel
):
text_offset
:
L
ist
[
int
]
=
Field
(
default_factory
=
list
)
token_logprobs
:
L
ist
[
Optional
[
float
]]
=
Field
(
default_factory
=
list
)
tokens
:
L
ist
[
str
]
=
Field
(
default_factory
=
list
)
top_logprobs
:
L
ist
[
Optional
[
D
ict
[
str
,
text_offset
:
l
ist
[
int
]
=
Field
(
default_factory
=
list
)
token_logprobs
:
l
ist
[
Optional
[
float
]]
=
Field
(
default_factory
=
list
)
tokens
:
l
ist
[
str
]
=
Field
(
default_factory
=
list
)
top_logprobs
:
l
ist
[
Optional
[
d
ict
[
str
,
float
]]]
=
Field
(
default_factory
=
list
)
...
...
@@ -1096,7 +1096,7 @@ class CompletionResponseChoice(OpenAIBaseModel):
"to stop, None if the completion finished for some other reason "
"including encountering the EOS token"
),
)
prompt_logprobs
:
Optional
[
L
ist
[
Optional
[
D
ict
[
int
,
Logprob
]]]]
=
None
prompt_logprobs
:
Optional
[
l
ist
[
Optional
[
d
ict
[
int
,
Logprob
]]]]
=
None
class
CompletionResponse
(
OpenAIBaseModel
):
...
...
@@ -1104,7 +1104,7 @@ class CompletionResponse(OpenAIBaseModel):
object
:
str
=
"text_completion"
created
:
int
=
Field
(
default_factory
=
lambda
:
int
(
time
.
time
()))
model
:
str
choices
:
L
ist
[
CompletionResponseChoice
]
choices
:
l
ist
[
CompletionResponseChoice
]
usage
:
UsageInfo
...
...
@@ -1127,14 +1127,14 @@ class CompletionStreamResponse(OpenAIBaseModel):
object
:
str
=
"text_completion"
created
:
int
=
Field
(
default_factory
=
lambda
:
int
(
time
.
time
()))
model
:
str
choices
:
L
ist
[
CompletionResponseStreamChoice
]
choices
:
l
ist
[
CompletionResponseStreamChoice
]
usage
:
Optional
[
UsageInfo
]
=
Field
(
default
=
None
)
class
EmbeddingResponseData
(
OpenAIBaseModel
):
index
:
int
object
:
str
=
"embedding"
embedding
:
Union
[
L
ist
[
float
],
str
]
embedding
:
Union
[
l
ist
[
float
],
str
]
class
EmbeddingResponse
(
OpenAIBaseModel
):
...
...
@@ -1142,14 +1142,14 @@ class EmbeddingResponse(OpenAIBaseModel):
object
:
str
=
"list"
created
:
int
=
Field
(
default_factory
=
lambda
:
int
(
time
.
time
()))
model
:
str
data
:
L
ist
[
EmbeddingResponseData
]
data
:
l
ist
[
EmbeddingResponseData
]
usage
:
UsageInfo
class
PoolingResponseData
(
OpenAIBaseModel
):
index
:
int
object
:
str
=
"pooling"
data
:
Union
[
L
ist
[
L
ist
[
float
]],
L
ist
[
float
],
str
]
data
:
Union
[
l
ist
[
l
ist
[
float
]],
l
ist
[
float
],
str
]
class
PoolingResponse
(
OpenAIBaseModel
):
...
...
@@ -1157,7 +1157,7 @@ class PoolingResponse(OpenAIBaseModel):
object
:
str
=
"list"
created
:
int
=
Field
(
default_factory
=
lambda
:
int
(
time
.
time
()))
model
:
str
data
:
L
ist
[
PoolingResponseData
]
data
:
l
ist
[
PoolingResponseData
]
usage
:
UsageInfo
...
...
@@ -1172,7 +1172,7 @@ class ScoreResponse(OpenAIBaseModel):
object
:
str
=
"list"
created
:
int
=
Field
(
default_factory
=
lambda
:
int
(
time
.
time
()))
model
:
str
data
:
L
ist
[
ScoreResponseData
]
data
:
l
ist
[
ScoreResponseData
]
usage
:
UsageInfo
...
...
@@ -1205,7 +1205,7 @@ class ExtractedToolCallInformation(BaseModel):
tools_called
:
bool
# extracted tool calls
tool_calls
:
L
ist
[
ToolCall
]
tool_calls
:
l
ist
[
ToolCall
]
# content - per OpenAI spec, content AND tool calls can be returned rarely
# But some models will do this intentionally
...
...
@@ -1216,21 +1216,21 @@ class ChatMessage(OpenAIBaseModel):
role
:
str
reasoning_content
:
Optional
[
str
]
=
None
content
:
Optional
[
str
]
=
None
tool_calls
:
L
ist
[
ToolCall
]
=
Field
(
default_factory
=
list
)
tool_calls
:
l
ist
[
ToolCall
]
=
Field
(
default_factory
=
list
)
class
ChatCompletionLogProb
(
OpenAIBaseModel
):
token
:
str
logprob
:
float
=
-
9999.0
bytes
:
Optional
[
L
ist
[
int
]]
=
None
bytes
:
Optional
[
l
ist
[
int
]]
=
None
class
ChatCompletionLogProbsContent
(
ChatCompletionLogProb
):
top_logprobs
:
L
ist
[
ChatCompletionLogProb
]
=
Field
(
default_factory
=
list
)
top_logprobs
:
l
ist
[
ChatCompletionLogProb
]
=
Field
(
default_factory
=
list
)
class
ChatCompletionLogProbs
(
OpenAIBaseModel
):
content
:
Optional
[
L
ist
[
ChatCompletionLogProbsContent
]]
=
None
content
:
Optional
[
l
ist
[
ChatCompletionLogProbsContent
]]
=
None
class
ChatCompletionResponseChoice
(
OpenAIBaseModel
):
...
...
@@ -1248,16 +1248,16 @@ class ChatCompletionResponse(OpenAIBaseModel):
object
:
Literal
[
"chat.completion"
]
=
"chat.completion"
created
:
int
=
Field
(
default_factory
=
lambda
:
int
(
time
.
time
()))
model
:
str
choices
:
L
ist
[
ChatCompletionResponseChoice
]
choices
:
l
ist
[
ChatCompletionResponseChoice
]
usage
:
UsageInfo
prompt_logprobs
:
Optional
[
L
ist
[
Optional
[
D
ict
[
int
,
Logprob
]]]]
=
None
prompt_logprobs
:
Optional
[
l
ist
[
Optional
[
d
ict
[
int
,
Logprob
]]]]
=
None
class
DeltaMessage
(
OpenAIBaseModel
):
role
:
Optional
[
str
]
=
None
content
:
Optional
[
str
]
=
None
reasoning_content
:
Optional
[
str
]
=
None
tool_calls
:
L
ist
[
DeltaToolCall
]
=
Field
(
default_factory
=
list
)
tool_calls
:
l
ist
[
DeltaToolCall
]
=
Field
(
default_factory
=
list
)
class
ChatCompletionResponseStreamChoice
(
OpenAIBaseModel
):
...
...
@@ -1273,7 +1273,7 @@ class ChatCompletionStreamResponse(OpenAIBaseModel):
object
:
Literal
[
"chat.completion.chunk"
]
=
"chat.completion.chunk"
created
:
int
=
Field
(
default_factory
=
lambda
:
int
(
time
.
time
()))
model
:
str
choices
:
L
ist
[
ChatCompletionResponseStreamChoice
]
choices
:
l
ist
[
ChatCompletionResponseStreamChoice
]
usage
:
Optional
[
UsageInfo
]
=
Field
(
default
=
None
)
...
...
@@ -1358,7 +1358,7 @@ class TokenizeCompletionRequest(OpenAIBaseModel):
class
TokenizeChatRequest
(
OpenAIBaseModel
):
model
:
Optional
[
str
]
=
None
messages
:
L
ist
[
ChatCompletionMessageParam
]
messages
:
l
ist
[
ChatCompletionMessageParam
]
add_generation_prompt
:
bool
=
Field
(
default
=
True
,
...
...
@@ -1393,12 +1393,12 @@ class TokenizeChatRequest(OpenAIBaseModel):
"allowed, so you must provide a chat template if the tokenizer "
"does not define one."
),
)
chat_template_kwargs
:
Optional
[
D
ict
[
str
,
Any
]]
=
Field
(
chat_template_kwargs
:
Optional
[
d
ict
[
str
,
Any
]]
=
Field
(
default
=
None
,
description
=
(
"Additional kwargs to pass to the template renderer. "
"Will be accessible by the chat template."
),
)
mm_processor_kwargs
:
Optional
[
D
ict
[
str
,
Any
]]
=
Field
(
mm_processor_kwargs
:
Optional
[
d
ict
[
str
,
Any
]]
=
Field
(
default
=
None
,
description
=
(
"Additional kwargs to pass to the HF processor."
),
)
...
...
@@ -1419,12 +1419,12 @@ TokenizeRequest = Union[TokenizeCompletionRequest, TokenizeChatRequest]
class
TokenizeResponse
(
OpenAIBaseModel
):
count
:
int
max_model_len
:
int
tokens
:
L
ist
[
int
]
tokens
:
l
ist
[
int
]
class
DetokenizeRequest
(
OpenAIBaseModel
):
model
:
Optional
[
str
]
=
None
tokens
:
L
ist
[
int
]
tokens
:
l
ist
[
int
]
class
DetokenizeResponse
(
OpenAIBaseModel
):
...
...
@@ -1492,7 +1492,7 @@ class TranscriptionRequest(OpenAIBaseModel):
to automatically increase the temperature until certain thresholds are hit.
"""
timestamp_granularities
:
L
ist
[
Literal
[
"word"
,
"segment"
]]
=
Field
(
timestamp_granularities
:
l
ist
[
Literal
[
"word"
,
"segment"
]]
=
Field
(
alias
=
"timestamp_granularities[]"
,
default
=
[])
"""The timestamp granularities to populate for this transcription.
...
...
@@ -1580,7 +1580,7 @@ class TranscriptionSegment(OpenAIBaseModel):
text
:
str
"""Text content of the segment."""
tokens
:
L
ist
[
int
]
tokens
:
l
ist
[
int
]
"""Array of token IDs for the text content."""
...
...
@@ -1594,8 +1594,8 @@ class TranscriptionResponseVerbose(OpenAIBaseModel):
text
:
str
"""The transcribed text."""
segments
:
Optional
[
L
ist
[
TranscriptionSegment
]]
=
None
segments
:
Optional
[
l
ist
[
TranscriptionSegment
]]
=
None
"""Segments of the transcribed text and their corresponding details."""
words
:
Optional
[
L
ist
[
TranscriptionWord
]]
=
None
words
:
Optional
[
l
ist
[
TranscriptionWord
]]
=
None
"""Extracted words and their corresponding timestamps."""
vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
import
os
from
collections.abc
import
Sequence
from
functools
import
cached_property
from
typing
import
Callable
,
Dict
,
List
,
Optional
,
Sequence
,
Tuple
,
Type
,
Union
from
typing
import
Callable
,
Optional
,
Union
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
DeltaMessage
)
...
...
@@ -25,14 +26,14 @@ class ReasoningParser:
self
.
model_tokenizer
=
tokenizer
@
cached_property
def
vocab
(
self
)
->
D
ict
[
str
,
int
]:
def
vocab
(
self
)
->
d
ict
[
str
,
int
]:
# NOTE: Only PreTrainedTokenizerFast is guaranteed to have .vocab
# whereas all tokenizers have .get_vocab()
return
self
.
model_tokenizer
.
get_vocab
()
def
extract_reasoning_content
(
self
,
model_output
:
str
,
request
:
ChatCompletionRequest
)
->
T
uple
[
Optional
[
str
],
Optional
[
str
]]:
)
->
t
uple
[
Optional
[
str
],
Optional
[
str
]]:
"""
Extract reasoning content from a complete model-generated string.
...
...
@@ -47,7 +48,7 @@ class ReasoningParser:
The request object that was used to generate the model_output.
Returns:
T
uple[Optional[str], Optional[str]]
t
uple[Optional[str], Optional[str]]
A tuple containing the reasoning content and the content.
"""
...
...
@@ -77,10 +78,10 @@ class ReasoningParser:
class
ReasoningParserManager
:
reasoning_parsers
:
D
ict
[
str
,
T
ype
]
=
{}
reasoning_parsers
:
d
ict
[
str
,
t
ype
]
=
{}
@
classmethod
def
get_reasoning_parser
(
cls
,
name
)
->
T
ype
:
def
get_reasoning_parser
(
cls
,
name
)
->
t
ype
:
"""
Get reasoning parser by name which is registered by `register_module`.
...
...
@@ -94,8 +95,8 @@ class ReasoningParserManager:
@
classmethod
def
_register_module
(
cls
,
module
:
T
ype
,
module_name
:
Optional
[
Union
[
str
,
L
ist
[
str
]]]
=
None
,
module
:
t
ype
,
module_name
:
Optional
[
Union
[
str
,
l
ist
[
str
]]]
=
None
,
force
:
bool
=
True
)
->
None
:
if
not
issubclass
(
module
,
ReasoningParser
):
raise
TypeError
(
"module must be subclass of ReasoningParser, "
...
...
@@ -114,9 +115,9 @@ class ReasoningParserManager:
@
classmethod
def
register_module
(
cls
,
name
:
Optional
[
Union
[
str
,
L
ist
[
str
]]]
=
None
,
name
:
Optional
[
Union
[
str
,
l
ist
[
str
]]]
=
None
,
force
:
bool
=
True
,
module
:
Union
[
T
ype
,
None
]
=
None
)
->
Union
[
type
,
Callable
]:
module
:
Union
[
t
ype
,
None
]
=
None
)
->
Union
[
type
,
Callable
]:
"""
Register module with the given name or name list. it can be used as a
decoder(with module as None) or normal function(with module as not
...
...
vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
import
re
from
typing
import
Optional
,
Sequence
,
Tuple
,
Union
from
collections.abc
import
Sequence
from
typing
import
Optional
,
Union
from
transformers
import
PreTrainedTokenizerBase
...
...
@@ -122,7 +123,7 @@ class DeepSeekR1ReasoningParser(ReasoningParser):
def
extract_reasoning_content
(
self
,
model_output
:
str
,
request
:
ChatCompletionRequest
)
->
T
uple
[
Optional
[
str
],
Optional
[
str
]]:
)
->
t
uple
[
Optional
[
str
],
Optional
[
str
]]:
# DeepSeek R1 doesn't generate <think> now.
# Thus we assume the reasoning content is always at the start.
...
...
vllm/entrypoints/openai/run_batch.py
View file @
cf069aa8
...
...
@@ -2,9 +2,10 @@
import
asyncio
import
tempfile
from
collections.abc
import
Awaitable
from
http
import
HTTPStatus
from
io
import
StringIO
from
typing
import
Awaitable
,
Callable
,
List
,
Optional
from
typing
import
Callable
,
Optional
import
aiohttp
import
torch
...
...
@@ -143,7 +144,7 @@ async def read_file(path_or_url: str) -> str:
async
def
write_local_file
(
output_path
:
str
,
batch_outputs
:
L
ist
[
BatchRequestOutput
])
->
None
:
batch_outputs
:
l
ist
[
BatchRequestOutput
])
->
None
:
"""
Write the responses to a local file.
output_path: The path to write the responses to.
...
...
@@ -204,7 +205,7 @@ async def upload_data(output_url: str, data_or_file: str,
f
"Error message:
{
str
(
e
)
}
."
)
from
e
async
def
write_file
(
path_or_url
:
str
,
batch_outputs
:
L
ist
[
BatchRequestOutput
],
async
def
write_file
(
path_or_url
:
str
,
batch_outputs
:
l
ist
[
BatchRequestOutput
],
output_tmp_dir
:
str
)
->
None
:
"""
Write batch_outputs to a file or upload to a URL.
...
...
@@ -353,7 +354,7 @@ async def main(args):
logger
.
info
(
"Reading batch from %s..."
,
args
.
input_file
)
# Submit all requests in the file to the engine "concurrently".
response_futures
:
L
ist
[
Awaitable
[
BatchRequestOutput
]]
=
[]
response_futures
:
l
ist
[
Awaitable
[
BatchRequestOutput
]]
=
[]
for
request_json
in
(
await
read_file
(
args
.
input_file
)).
strip
().
split
(
"
\n
"
):
# Skip empty lines.
request_json
=
request_json
.
strip
()
...
...
vllm/entrypoints/openai/serving_chat.py
View file @
cf069aa8
...
...
@@ -3,10 +3,9 @@
import
asyncio
import
json
import
time
from
typing
import
(
AsyncGenerator
,
AsyncIterator
,
Callable
,
Dict
,
Final
,
List
,
Optional
)
from
typing
import
Sequence
as
GenericSequence
from
typing
import
Union
from
collections.abc
import
AsyncGenerator
,
AsyncIterator
from
collections.abc
import
Sequence
as
GenericSequence
from
typing
import
Callable
,
Final
,
Optional
,
Union
from
fastapi
import
Request
...
...
@@ -205,7 +204,7 @@ class OpenAIServingChat(OpenAIServing):
raw_request
.
state
.
request_metadata
=
request_metadata
# Schedule the request and get the result generator.
generators
:
L
ist
[
AsyncGenerator
[
RequestOutput
,
None
]]
=
[]
generators
:
l
ist
[
AsyncGenerator
[
RequestOutput
,
None
]]
=
[]
try
:
for
i
,
engine_prompt
in
enumerate
(
engine_prompts
):
sampling_params
:
Union
[
SamplingParams
,
BeamSearchParams
]
...
...
@@ -282,7 +281,7 @@ class OpenAIServingChat(OpenAIServing):
result_generator
:
AsyncIterator
[
RequestOutput
],
request_id
:
str
,
model_name
:
str
,
conversation
:
L
ist
[
ConversationMessage
],
conversation
:
l
ist
[
ConversationMessage
],
tokenizer
:
AnyTokenizer
,
request_metadata
:
RequestResponseMetadata
,
)
->
AsyncGenerator
[
str
,
None
]:
...
...
@@ -310,7 +309,7 @@ class OpenAIServingChat(OpenAIServing):
should_stream_with_reasoning_parsing
=
(
self
.
_should_stream_with_reasoning_parsing
(
request
))
all_previous_token_ids
:
Optional
[
L
ist
[
L
ist
[
int
]]]
all_previous_token_ids
:
Optional
[
l
ist
[
l
ist
[
int
]]]
# Only one of these will be used, thus previous_texts and
# all_previous_token_ids will not be used twice in the same iteration.
...
...
@@ -339,7 +338,7 @@ class OpenAIServingChat(OpenAIServing):
# Prepare the tool parser if it's needed
try
:
if
tool_choice_auto
and
self
.
tool_parser
:
tool_parsers
:
L
ist
[
Optional
[
ToolParser
]]
=
[
tool_parsers
:
l
ist
[
Optional
[
ToolParser
]]
=
[
self
.
tool_parser
(
tokenizer
)
]
*
num_choices
else
:
...
...
@@ -406,7 +405,7 @@ class OpenAIServingChat(OpenAIServing):
# Send response to echo the input portion of the
# last message
if
request
.
echo
:
last_msg_content
:
Union
[
str
,
L
ist
[
D
ict
[
str
,
str
]]]
=
""
last_msg_content
:
Union
[
str
,
l
ist
[
d
ict
[
str
,
str
]]]
=
""
if
conversation
and
"content"
in
conversation
[
-
1
]
and
conversation
[
-
1
].
get
(
"role"
)
==
role
:
last_msg_content
=
conversation
[
-
1
][
"content"
]
or
""
...
...
@@ -674,7 +673,7 @@ class OpenAIServingChat(OpenAIServing):
result_generator
:
AsyncIterator
[
RequestOutput
],
request_id
:
str
,
model_name
:
str
,
conversation
:
L
ist
[
ConversationMessage
],
conversation
:
l
ist
[
ConversationMessage
],
tokenizer
:
AnyTokenizer
,
request_metadata
:
RequestResponseMetadata
,
)
->
Union
[
ErrorResponse
,
ChatCompletionResponse
]:
...
...
@@ -693,7 +692,7 @@ class OpenAIServingChat(OpenAIServing):
assert
final_res
is
not
None
choices
:
L
ist
[
ChatCompletionResponseChoice
]
=
[]
choices
:
l
ist
[
ChatCompletionResponseChoice
]
=
[]
role
=
self
.
get_chat_request_role
(
request
)
for
output
in
final_res
.
outputs
:
...
...
@@ -812,7 +811,7 @@ class OpenAIServingChat(OpenAIServing):
choices
.
append
(
choice_data
)
if
request
.
echo
:
last_msg_content
:
Union
[
str
,
L
ist
[
D
ict
[
str
,
str
]]]
=
""
last_msg_content
:
Union
[
str
,
l
ist
[
d
ict
[
str
,
str
]]]
=
""
if
conversation
and
"content"
in
conversation
[
-
1
]
and
conversation
[
-
1
].
get
(
"role"
)
==
role
:
last_msg_content
=
conversation
[
-
1
][
"content"
]
or
""
...
...
@@ -853,8 +852,8 @@ class OpenAIServingChat(OpenAIServing):
return
response
def
_get_top_logprobs
(
self
,
logprobs
:
D
ict
[
int
,
Logprob
],
top_logprobs
:
Optional
[
int
],
tokenizer
:
AnyTokenizer
)
->
L
ist
[
ChatCompletionLogProb
]:
self
,
logprobs
:
d
ict
[
int
,
Logprob
],
top_logprobs
:
Optional
[
int
],
tokenizer
:
AnyTokenizer
)
->
l
ist
[
ChatCompletionLogProb
]:
return
[
ChatCompletionLogProb
(
token
=
(
token
:
=
self
.
_get_decoded_token
(
p
[
1
],
...
...
@@ -871,12 +870,12 @@ class OpenAIServingChat(OpenAIServing):
def
_create_chat_logprobs
(
self
,
token_ids
:
GenericSequence
[
int
],
top_logprobs
:
GenericSequence
[
Optional
[
D
ict
[
int
,
Logprob
]]],
top_logprobs
:
GenericSequence
[
Optional
[
d
ict
[
int
,
Logprob
]]],
tokenizer
:
AnyTokenizer
,
num_output_top_logprobs
:
Optional
[
int
]
=
None
,
)
->
ChatCompletionLogProbs
:
"""Create OpenAI-style logprobs."""
logprobs_content
:
L
ist
[
ChatCompletionLogProbsContent
]
=
[]
logprobs_content
:
l
ist
[
ChatCompletionLogProbsContent
]
=
[]
for
i
,
token_id
in
enumerate
(
token_ids
):
step_top_logprobs
=
top_logprobs
[
i
]
...
...
vllm/entrypoints/openai/serving_completion.py
View file @
cf069aa8
...
...
@@ -2,9 +2,9 @@
import
asyncio
import
time
from
typing
import
AsyncGenerator
,
AsyncIterator
,
Dict
,
List
,
Optional
from
typing
import
Sequence
as
GenericSequence
from
typing
import
Tuple
,
Union
,
cast
from
collections.abc
import
AsyncGenerator
,
AsyncIterator
from
collections.abc
import
Sequence
as
GenericSequence
from
typing
import
Optional
,
Union
,
cast
from
fastapi
import
Request
...
...
@@ -113,7 +113,7 @@ class OpenAIServingCompletion(OpenAIServing):
return
self
.
create_error_response
(
str
(
e
))
# Schedule the request and get the result generator.
generators
:
L
ist
[
AsyncGenerator
[
RequestOutput
,
None
]]
=
[]
generators
:
l
ist
[
AsyncGenerator
[
RequestOutput
,
None
]]
=
[]
try
:
for
i
,
engine_prompt
in
enumerate
(
engine_prompts
):
sampling_params
:
Union
[
SamplingParams
,
BeamSearchParams
]
...
...
@@ -189,7 +189,7 @@ class OpenAIServingCompletion(OpenAIServing):
request_metadata
=
request_metadata
)
# Non-streaming response
final_res_batch
:
L
ist
[
Optional
[
RequestOutput
]]
=
[
None
]
*
num_prompts
final_res_batch
:
l
ist
[
Optional
[
RequestOutput
]]
=
[
None
]
*
num_prompts
try
:
async
for
i
,
res
in
result_generator
:
final_res_batch
[
i
]
=
res
...
...
@@ -203,7 +203,7 @@ class OpenAIServingCompletion(OpenAIServing):
if
final_res
.
prompt
is
None
:
final_res
.
prompt
=
request_prompts
[
i
][
"prompt"
]
final_res_batch_checked
=
cast
(
L
ist
[
RequestOutput
],
final_res_batch_checked
=
cast
(
l
ist
[
RequestOutput
],
final_res_batch
)
response
=
self
.
request_output_to_completion_response
(
...
...
@@ -237,7 +237,7 @@ class OpenAIServingCompletion(OpenAIServing):
async
def
completion_stream_generator
(
self
,
request
:
CompletionRequest
,
result_generator
:
AsyncIterator
[
T
uple
[
int
,
RequestOutput
]],
result_generator
:
AsyncIterator
[
t
uple
[
int
,
RequestOutput
]],
request_id
:
str
,
created_time
:
int
,
model_name
:
str
,
...
...
@@ -270,7 +270,7 @@ class OpenAIServingCompletion(OpenAIServing):
num_prompt_tokens
[
prompt_idx
]
=
len
(
res
.
prompt_token_ids
)
delta_token_ids
:
GenericSequence
[
int
]
out_logprobs
:
Optional
[
GenericSequence
[
Optional
[
D
ict
[
out_logprobs
:
Optional
[
GenericSequence
[
Optional
[
d
ict
[
int
,
Logprob
]]]]
for
output
in
res
.
outputs
:
...
...
@@ -381,7 +381,7 @@ class OpenAIServingCompletion(OpenAIServing):
def
request_output_to_completion_response
(
self
,
final_res_batch
:
L
ist
[
RequestOutput
],
final_res_batch
:
l
ist
[
RequestOutput
],
request
:
CompletionRequest
,
request_id
:
str
,
created_time
:
int
,
...
...
@@ -389,7 +389,7 @@ class OpenAIServingCompletion(OpenAIServing):
tokenizer
:
AnyTokenizer
,
request_metadata
:
RequestResponseMetadata
,
)
->
CompletionResponse
:
choices
:
L
ist
[
CompletionResponseChoice
]
=
[]
choices
:
l
ist
[
CompletionResponseChoice
]
=
[]
num_prompt_tokens
=
0
num_generated_tokens
=
0
...
...
@@ -406,7 +406,7 @@ class OpenAIServingCompletion(OpenAIServing):
prompt_text
=
final_res
.
prompt
token_ids
:
GenericSequence
[
int
]
out_logprobs
:
Optional
[
GenericSequence
[
Optional
[
D
ict
[
int
,
out_logprobs
:
Optional
[
GenericSequence
[
Optional
[
d
ict
[
int
,
Logprob
]]]]
for
output
in
final_res
.
outputs
:
...
...
@@ -480,16 +480,16 @@ class OpenAIServingCompletion(OpenAIServing):
def
_create_completion_logprobs
(
self
,
token_ids
:
GenericSequence
[
int
],
top_logprobs
:
GenericSequence
[
Optional
[
D
ict
[
int
,
Logprob
]]],
top_logprobs
:
GenericSequence
[
Optional
[
d
ict
[
int
,
Logprob
]]],
num_output_top_logprobs
:
int
,
tokenizer
:
AnyTokenizer
,
initial_text_offset
:
int
=
0
,
)
->
CompletionLogProbs
:
"""Create logprobs for OpenAI Completion API."""
out_text_offset
:
L
ist
[
int
]
=
[]
out_token_logprobs
:
L
ist
[
Optional
[
float
]]
=
[]
out_tokens
:
L
ist
[
str
]
=
[]
out_top_logprobs
:
L
ist
[
Optional
[
D
ict
[
str
,
float
]]]
=
[]
out_text_offset
:
l
ist
[
int
]
=
[]
out_token_logprobs
:
l
ist
[
Optional
[
float
]]
=
[]
out_tokens
:
l
ist
[
str
]
=
[]
out_top_logprobs
:
l
ist
[
Optional
[
d
ict
[
str
,
float
]]]
=
[]
last_token_len
=
0
...
...
vllm/entrypoints/openai/serving_embedding.py
View file @
cf069aa8
...
...
@@ -3,7 +3,8 @@
import
asyncio
import
base64
import
time
from
typing
import
AsyncGenerator
,
Final
,
List
,
Literal
,
Optional
,
Union
,
cast
from
collections.abc
import
AsyncGenerator
from
typing
import
Final
,
Literal
,
Optional
,
Union
,
cast
import
numpy
as
np
from
fastapi
import
Request
...
...
@@ -31,7 +32,7 @@ logger = init_logger(__name__)
def
_get_embedding
(
output
:
EmbeddingOutput
,
encoding_format
:
Literal
[
"float"
,
"base64"
],
)
->
Union
[
L
ist
[
float
],
str
]:
)
->
Union
[
l
ist
[
float
],
str
]:
if
encoding_format
==
"float"
:
return
output
.
embedding
elif
encoding_format
==
"base64"
:
...
...
@@ -143,7 +144,7 @@ class OpenAIServingEmbedding(OpenAIServing):
return
self
.
create_error_response
(
str
(
e
))
# Schedule the request and get the result generator.
generators
:
L
ist
[
AsyncGenerator
[
PoolingRequestOutput
,
None
]]
=
[]
generators
:
l
ist
[
AsyncGenerator
[
PoolingRequestOutput
,
None
]]
=
[]
try
:
pooling_params
=
request
.
to_pooling_params
()
...
...
@@ -178,7 +179,7 @@ class OpenAIServingEmbedding(OpenAIServing):
num_prompts
=
len
(
engine_prompts
)
# Non-streaming response
final_res_batch
:
L
ist
[
Optional
[
PoolingRequestOutput
]]
final_res_batch
:
l
ist
[
Optional
[
PoolingRequestOutput
]]
final_res_batch
=
[
None
]
*
num_prompts
try
:
async
for
i
,
res
in
result_generator
:
...
...
@@ -186,7 +187,7 @@ class OpenAIServingEmbedding(OpenAIServing):
assert
all
(
final_res
is
not
None
for
final_res
in
final_res_batch
)
final_res_batch_checked
=
cast
(
L
ist
[
PoolingRequestOutput
],
final_res_batch_checked
=
cast
(
l
ist
[
PoolingRequestOutput
],
final_res_batch
)
response
=
self
.
request_output_to_embedding_response
(
...
...
@@ -206,13 +207,13 @@ class OpenAIServingEmbedding(OpenAIServing):
def
request_output_to_embedding_response
(
self
,
final_res_batch
:
L
ist
[
PoolingRequestOutput
],
final_res_batch
:
l
ist
[
PoolingRequestOutput
],
request_id
:
str
,
created_time
:
int
,
model_name
:
str
,
encoding_format
:
Literal
[
"float"
,
"base64"
],
)
->
EmbeddingResponse
:
items
:
L
ist
[
EmbeddingResponseData
]
=
[]
items
:
l
ist
[
EmbeddingResponseData
]
=
[]
num_prompt_tokens
=
0
for
idx
,
final_res
in
enumerate
(
final_res_batch
):
...
...
vllm/entrypoints/openai/serving_engine.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
import
json
from
collections.abc
import
Iterable
,
Iterator
,
Mapping
,
Sequence
from
concurrent.futures.thread
import
ThreadPoolExecutor
from
http
import
HTTPStatus
from
typing
import
(
Any
,
Callable
,
Dict
,
Iterable
,
Iterator
,
List
,
Mapping
,
Optional
,
Sequence
,
Tuple
,
TypedDict
,
Union
)
from
typing
import
Annotated
,
Any
,
Callable
,
Optional
,
TypedDict
,
Union
from
fastapi
import
Request
from
pydantic
import
Field
from
starlette.datastructures
import
Headers
from
typing_extensions
import
Annotated
from
vllm.config
import
ModelConfig
from
vllm.engine.protocol
import
EngineClient
...
...
@@ -64,10 +63,10 @@ AnyRequest = Union[CompletionLikeRequest, ChatLikeRequest,
class
TextTokensPrompt
(
TypedDict
):
prompt
:
str
prompt_token_ids
:
L
ist
[
int
]
prompt_token_ids
:
l
ist
[
int
]
RequestPrompt
=
Union
[
L
ist
[
int
],
str
,
TextTokensPrompt
]
RequestPrompt
=
Union
[
l
ist
[
int
],
str
,
TextTokensPrompt
]
class
OpenAIServing
:
...
...
@@ -144,7 +143,7 @@ class OpenAIServing:
def
_maybe_get_adapters
(
self
,
request
:
AnyRequest
)
->
Union
[
T
uple
[
None
,
None
],
T
uple
[
LoRARequest
,
None
],
T
uple
[
)
->
Union
[
t
uple
[
None
,
None
],
t
uple
[
LoRARequest
,
None
],
t
uple
[
None
,
PromptAdapterRequest
]]:
if
self
.
_is_model_supported
(
request
.
model
):
return
None
,
None
...
...
@@ -188,7 +187,7 @@ class OpenAIServing:
self
,
request
:
AnyRequest
,
tokenizer
:
AnyTokenizer
,
prompt_ids
:
L
ist
[
int
],
prompt_ids
:
l
ist
[
int
],
truncate_prompt_tokens
:
Optional
[
Annotated
[
int
,
Field
(
ge
=
1
)]],
)
->
TextTokensPrompt
:
if
truncate_prompt_tokens
is
None
:
...
...
@@ -203,7 +202,7 @@ class OpenAIServing:
def
_validate_input
(
self
,
request
:
AnyRequest
,
input_ids
:
L
ist
[
int
],
input_ids
:
l
ist
[
int
],
input_text
:
str
,
)
->
TextTokensPrompt
:
token_num
=
len
(
input_ids
)
...
...
@@ -259,7 +258,7 @@ class OpenAIServing:
self
,
request
:
AnyRequest
,
tokenizer
:
AnyTokenizer
,
prompt_input
:
Union
[
str
,
L
ist
[
int
]],
prompt_input
:
Union
[
str
,
l
ist
[
int
]],
truncate_prompt_tokens
:
Optional
[
Annotated
[
int
,
Field
(
ge
=
1
)]]
=
None
,
add_special_tokens
:
bool
=
True
,
)
->
TextTokensPrompt
:
...
...
@@ -280,7 +279,7 @@ class OpenAIServing:
self
,
request
:
AnyRequest
,
tokenizer
:
AnyTokenizer
,
prompt_inputs
:
Iterable
[
Union
[
str
,
L
ist
[
int
]]],
prompt_inputs
:
Iterable
[
Union
[
str
,
l
ist
[
int
]]],
truncate_prompt_tokens
:
Optional
[
Annotated
[
int
,
Field
(
ge
=
1
)]]
=
None
,
add_special_tokens
:
bool
=
True
,
)
->
Iterator
[
TextTokensPrompt
]:
...
...
@@ -309,10 +308,10 @@ class OpenAIServing:
self
,
request
:
AnyRequest
,
tokenizer
:
AnyTokenizer
,
input_or_inputs
:
Union
[
str
,
L
ist
[
str
],
L
ist
[
int
],
L
ist
[
L
ist
[
int
]]],
input_or_inputs
:
Union
[
str
,
l
ist
[
str
],
l
ist
[
int
],
l
ist
[
l
ist
[
int
]]],
truncate_prompt_tokens
:
Optional
[
Annotated
[
int
,
Field
(
ge
=
1
)]]
=
None
,
add_special_tokens
:
bool
=
True
,
)
->
L
ist
[
TextTokensPrompt
]:
)
->
l
ist
[
TextTokensPrompt
]:
"""
Tokenize/detokenize depending on the input format.
...
...
@@ -344,10 +343,10 @@ class OpenAIServing:
self
,
request
:
CompletionLikeRequest
,
tokenizer
:
AnyTokenizer
,
input_or_inputs
:
Union
[
str
,
L
ist
[
str
],
L
ist
[
int
],
L
ist
[
L
ist
[
int
]]],
input_or_inputs
:
Union
[
str
,
l
ist
[
str
],
l
ist
[
int
],
l
ist
[
l
ist
[
int
]]],
truncate_prompt_tokens
:
Optional
[
Annotated
[
int
,
Field
(
ge
=
1
)]]
=
None
,
add_special_tokens
:
bool
=
True
,
)
->
T
uple
[
L
ist
[
TextTokensPrompt
],
L
ist
[
TokensPrompt
]]:
)
->
t
uple
[
l
ist
[
TextTokensPrompt
],
l
ist
[
TokensPrompt
]]:
request_prompts
=
await
self
.
_tokenize_prompt_input_or_inputs_async
(
request
,
tokenizer
,
...
...
@@ -367,19 +366,19 @@ class OpenAIServing:
self
,
request
:
ChatLikeRequest
,
tokenizer
:
AnyTokenizer
,
messages
:
L
ist
[
ChatCompletionMessageParam
],
messages
:
l
ist
[
ChatCompletionMessageParam
],
chat_template
:
Optional
[
str
],
chat_template_content_format
:
ChatTemplateContentFormatOption
,
add_generation_prompt
:
bool
=
True
,
continue_final_message
:
bool
=
False
,
tool_dicts
:
Optional
[
L
ist
[
D
ict
[
str
,
Any
]]]
=
None
,
documents
:
Optional
[
L
ist
[
D
ict
[
str
,
str
]]]
=
None
,
chat_template_kwargs
:
Optional
[
D
ict
[
str
,
Any
]]
=
None
,
tool_dicts
:
Optional
[
l
ist
[
d
ict
[
str
,
Any
]]]
=
None
,
documents
:
Optional
[
l
ist
[
d
ict
[
str
,
str
]]]
=
None
,
chat_template_kwargs
:
Optional
[
d
ict
[
str
,
Any
]]
=
None
,
tool_parser
:
Optional
[
Callable
[[
AnyTokenizer
],
ToolParser
]]
=
None
,
truncate_prompt_tokens
:
Optional
[
Annotated
[
int
,
Field
(
ge
=
1
)]]
=
None
,
add_special_tokens
:
bool
=
False
,
)
->
T
uple
[
L
ist
[
ConversationMessage
],
Sequence
[
RequestPrompt
],
L
ist
[
TokensPrompt
]]:
)
->
t
uple
[
l
ist
[
ConversationMessage
],
Sequence
[
RequestPrompt
],
l
ist
[
TokensPrompt
]]:
resolved_content_format
=
resolve_chat_template_content_format
(
chat_template
,
chat_template_content_format
,
...
...
@@ -392,7 +391,7 @@ class OpenAIServing:
content_format
=
resolved_content_format
,
)
_chat_template_kwargs
:
D
ict
[
str
,
Any
]
=
dict
(
_chat_template_kwargs
:
d
ict
[
str
,
Any
]
=
dict
(
chat_template
=
chat_template
,
add_generation_prompt
=
add_generation_prompt
,
continue_final_message
=
continue_final_message
,
...
...
@@ -401,7 +400,7 @@ class OpenAIServing:
)
_chat_template_kwargs
.
update
(
chat_template_kwargs
or
{})
request_prompt
:
Union
[
str
,
L
ist
[
int
]]
request_prompt
:
Union
[
str
,
l
ist
[
int
]]
if
isinstance
(
tokenizer
,
MistralTokenizer
):
request_prompt
=
apply_mistral_chat_template
(
tokenizer
,
...
...
vllm/entrypoints/openai/serving_models.py
View file @
cf069aa8
...
...
@@ -4,7 +4,7 @@ import json
import
pathlib
from
dataclasses
import
dataclass
from
http
import
HTTPStatus
from
typing
import
List
,
Optional
,
Union
from
typing
import
Optional
,
Union
from
vllm.config
import
ModelConfig
from
vllm.engine.protocol
import
EngineClient
...
...
@@ -53,10 +53,10 @@ class OpenAIServingModels:
self
,
engine_client
:
EngineClient
,
model_config
:
ModelConfig
,
base_model_paths
:
L
ist
[
BaseModelPath
],
base_model_paths
:
l
ist
[
BaseModelPath
],
*
,
lora_modules
:
Optional
[
L
ist
[
LoRAModulePath
]]
=
None
,
prompt_adapters
:
Optional
[
L
ist
[
PromptAdapterPath
]]
=
None
,
lora_modules
:
Optional
[
l
ist
[
LoRAModulePath
]]
=
None
,
prompt_adapters
:
Optional
[
l
ist
[
PromptAdapterPath
]]
=
None
,
):
super
().
__init__
()
...
...
@@ -65,7 +65,7 @@ class OpenAIServingModels:
self
.
engine_client
=
engine_client
self
.
static_lora_modules
=
lora_modules
self
.
lora_requests
:
L
ist
[
LoRARequest
]
=
[]
self
.
lora_requests
:
l
ist
[
LoRARequest
]
=
[]
self
.
lora_id_counter
=
AtomicCounter
(
0
)
self
.
prompt_adapter_requests
=
[]
...
...
vllm/entrypoints/openai/serving_pooling.py
View file @
cf069aa8
...
...
@@ -3,7 +3,8 @@
import
asyncio
import
base64
import
time
from
typing
import
AsyncGenerator
,
Final
,
List
,
Literal
,
Optional
,
Union
,
cast
from
collections.abc
import
AsyncGenerator
from
typing
import
Final
,
Literal
,
Optional
,
Union
,
cast
import
numpy
as
np
from
fastapi
import
Request
...
...
@@ -29,7 +30,7 @@ logger = init_logger(__name__)
def
_get_data
(
output
:
PoolingOutput
,
encoding_format
:
Literal
[
"float"
,
"base64"
],
)
->
Union
[
L
ist
[
float
],
str
]:
)
->
Union
[
l
ist
[
float
],
str
]:
if
encoding_format
==
"float"
:
return
output
.
data
.
tolist
()
elif
encoding_format
==
"base64"
:
...
...
@@ -139,7 +140,7 @@ class OpenAIServingPooling(OpenAIServing):
return
self
.
create_error_response
(
str
(
e
))
# Schedule the request and get the result generator.
generators
:
L
ist
[
AsyncGenerator
[
PoolingRequestOutput
,
None
]]
=
[]
generators
:
l
ist
[
AsyncGenerator
[
PoolingRequestOutput
,
None
]]
=
[]
try
:
pooling_params
=
request
.
to_pooling_params
()
...
...
@@ -174,7 +175,7 @@ class OpenAIServingPooling(OpenAIServing):
num_prompts
=
len
(
engine_prompts
)
# Non-streaming response
final_res_batch
:
L
ist
[
Optional
[
PoolingRequestOutput
]]
final_res_batch
:
l
ist
[
Optional
[
PoolingRequestOutput
]]
final_res_batch
=
[
None
]
*
num_prompts
try
:
async
for
i
,
res
in
result_generator
:
...
...
@@ -182,7 +183,7 @@ class OpenAIServingPooling(OpenAIServing):
assert
all
(
final_res
is
not
None
for
final_res
in
final_res_batch
)
final_res_batch_checked
=
cast
(
L
ist
[
PoolingRequestOutput
],
final_res_batch_checked
=
cast
(
l
ist
[
PoolingRequestOutput
],
final_res_batch
)
response
=
self
.
request_output_to_pooling_response
(
...
...
@@ -202,13 +203,13 @@ class OpenAIServingPooling(OpenAIServing):
def
request_output_to_pooling_response
(
self
,
final_res_batch
:
L
ist
[
PoolingRequestOutput
],
final_res_batch
:
l
ist
[
PoolingRequestOutput
],
request_id
:
str
,
created_time
:
int
,
model_name
:
str
,
encoding_format
:
Literal
[
"float"
,
"base64"
],
)
->
PoolingResponse
:
items
:
L
ist
[
PoolingResponseData
]
=
[]
items
:
l
ist
[
PoolingResponseData
]
=
[]
num_prompt_tokens
=
0
for
idx
,
final_res
in
enumerate
(
final_res_batch
):
...
...
vllm/entrypoints/openai/serving_score.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
import
asyncio
import
time
from
typing
import
Any
,
AsyncGenerator
,
Dict
,
List
,
Mapping
,
Optional
,
Union
from
collections.abc
import
AsyncGenerator
,
Mapping
from
typing
import
Any
,
Optional
,
Union
from
fastapi
import
Request
...
...
@@ -48,8 +49,8 @@ class ServingScores(OpenAIServing):
async
def
_embedding_score
(
self
,
tokenizer
:
Union
[
PreTrainedTokenizer
,
PreTrainedTokenizerFast
],
texts_1
:
L
ist
[
str
],
texts_2
:
L
ist
[
str
],
texts_1
:
l
ist
[
str
],
texts_2
:
l
ist
[
str
],
request
:
Union
[
RerankRequest
,
ScoreRequest
],
request_id
=
str
,
tokenization_kwargs
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
...
...
@@ -57,11 +58,11 @@ class ServingScores(OpenAIServing):
prompt_adapter_request
:
Optional
[
Union
[
PromptAdapterRequest
,
None
]]
=
None
,
trace_headers
:
Optional
[
Mapping
[
str
,
str
]]
=
None
,
)
->
L
ist
[
PoolingRequestOutput
]:
)
->
l
ist
[
PoolingRequestOutput
]:
input_texts
=
texts_1
+
texts_2
engine_prompts
:
L
ist
[
TokensPrompt
]
=
[]
engine_prompts
:
l
ist
[
TokensPrompt
]
=
[]
tokenize_async
=
make_async
(
tokenizer
.
__call__
,
executor
=
self
.
_tokenizer_executor
)
...
...
@@ -82,7 +83,7 @@ class ServingScores(OpenAIServing):
prompt_token_ids
=
text_token_prompt
[
"prompt_token_ids"
]))
# Schedule the request and get the result generator.
generators
:
L
ist
[
AsyncGenerator
[
PoolingRequestOutput
,
None
]]
=
[]
generators
:
l
ist
[
AsyncGenerator
[
PoolingRequestOutput
,
None
]]
=
[]
pooling_params
=
request
.
to_pooling_params
()
for
i
,
engine_prompt
in
enumerate
(
engine_prompts
):
...
...
@@ -108,16 +109,16 @@ class ServingScores(OpenAIServing):
result_generator
=
merge_async_iterators
(
*
generators
)
# Non-streaming response
final_res_batch
:
L
ist
[
PoolingRequestOutput
]
=
[]
final_res_batch
:
l
ist
[
PoolingRequestOutput
]
=
[]
embeddings
:
L
ist
[
Optional
[
PoolingRequestOutput
]]
=
\
embeddings
:
l
ist
[
Optional
[
PoolingRequestOutput
]]
=
\
[
None
]
*
len
(
engine_prompts
)
async
for
i
,
res
in
result_generator
:
embeddings
[
i
]
=
res
emb_texts_1
:
L
ist
[
PoolingRequestOutput
]
=
[]
emb_texts_2
:
L
ist
[
PoolingRequestOutput
]
=
[]
emb_texts_1
:
l
ist
[
PoolingRequestOutput
]
=
[]
emb_texts_2
:
l
ist
[
PoolingRequestOutput
]
=
[]
for
i
in
range
(
0
,
len
(
texts_1
)):
assert
(
emb
:
=
embeddings
[
i
])
is
not
None
...
...
@@ -139,8 +140,8 @@ class ServingScores(OpenAIServing):
async
def
_cross_encoding_score
(
self
,
tokenizer
:
Union
[
AnyTokenizer
],
texts_1
:
L
ist
[
str
],
texts_2
:
L
ist
[
str
],
texts_1
:
l
ist
[
str
],
texts_2
:
l
ist
[
str
],
request
:
Union
[
RerankRequest
,
ScoreRequest
],
request_id
=
str
,
tokenization_kwargs
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
...
...
@@ -148,10 +149,10 @@ class ServingScores(OpenAIServing):
prompt_adapter_request
:
Optional
[
Union
[
PromptAdapterRequest
,
None
]]
=
None
,
trace_headers
:
Optional
[
Mapping
[
str
,
str
]]
=
None
,
)
->
L
ist
[
PoolingRequestOutput
]:
)
->
l
ist
[
PoolingRequestOutput
]:
request_prompts
:
L
ist
[
str
]
=
[]
engine_prompts
:
L
ist
[
TokensPrompt
]
=
[]
request_prompts
:
l
ist
[
str
]
=
[]
engine_prompts
:
l
ist
[
TokensPrompt
]
=
[]
if
len
(
texts_1
)
==
1
:
texts_1
=
texts_1
*
len
(
texts_2
)
...
...
@@ -185,7 +186,7 @@ class ServingScores(OpenAIServing):
engine_prompts
.
append
(
engine_prompt
)
# Schedule the request and get the result generator.
generators
:
L
ist
[
AsyncGenerator
[
PoolingRequestOutput
,
None
]]
=
[]
generators
:
l
ist
[
AsyncGenerator
[
PoolingRequestOutput
,
None
]]
=
[]
pooling_params
=
request
.
to_pooling_params
()
...
...
@@ -212,7 +213,7 @@ class ServingScores(OpenAIServing):
result_generator
=
merge_async_iterators
(
*
generators
)
# Non-streaming response
final_res_batch
:
L
ist
[
final_res_batch
:
l
ist
[
Optional
[
PoolingRequestOutput
]]
=
[
None
]
*
len
(
engine_prompts
)
async
for
i
,
res
in
result_generator
:
...
...
@@ -228,9 +229,9 @@ class ServingScores(OpenAIServing):
request_id
:
str
,
raw_request
:
Optional
[
Request
]
=
None
,
truncate_prompt_tokens
:
Optional
[
int
]
=
None
,
)
->
L
ist
[
PoolingRequestOutput
]:
)
->
l
ist
[
PoolingRequestOutput
]:
tokenization_kwargs
:
D
ict
[
str
,
Any
]
=
{}
tokenization_kwargs
:
d
ict
[
str
,
Any
]
=
{}
if
truncate_prompt_tokens
is
not
None
:
tokenization_kwargs
[
"truncation"
]
=
True
tokenization_kwargs
[
"max_length"
]
=
truncate_prompt_tokens
...
...
@@ -372,12 +373,12 @@ class ServingScores(OpenAIServing):
def
request_output_to_score_response
(
self
,
final_res_batch
:
L
ist
[
PoolingRequestOutput
],
final_res_batch
:
l
ist
[
PoolingRequestOutput
],
request_id
:
str
,
created_time
:
int
,
model_name
:
str
,
)
->
ScoreResponse
:
items
:
L
ist
[
ScoreResponseData
]
=
[]
items
:
l
ist
[
ScoreResponseData
]
=
[]
num_prompt_tokens
=
0
for
idx
,
final_res
in
enumerate
(
final_res_batch
):
...
...
@@ -406,13 +407,13 @@ class ServingScores(OpenAIServing):
)
def
request_output_to_rerank_response
(
self
,
final_res_batch
:
L
ist
[
PoolingRequestOutput
],
request_id
:
str
,
model_name
:
str
,
documents
:
L
ist
[
str
],
self
,
final_res_batch
:
l
ist
[
PoolingRequestOutput
],
request_id
:
str
,
model_name
:
str
,
documents
:
l
ist
[
str
],
top_n
:
int
)
->
RerankResponse
:
"""
Convert the output of do_rank to a RerankResponse
"""
results
:
L
ist
[
RerankResult
]
=
[]
results
:
l
ist
[
RerankResult
]
=
[]
num_prompt_tokens
=
0
for
idx
,
final_res
in
enumerate
(
final_res_batch
):
classify_res
=
ScoringRequestOutput
.
from_base
(
final_res
)
...
...
vllm/entrypoints/openai/serving_tokenization.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Final
,
List
,
Optional
,
Union
from
typing
import
Final
,
Optional
,
Union
from
fastapi
import
Request
...
...
@@ -92,7 +92,7 @@ class OpenAIServingTokenization(OpenAIServing):
logger
.
exception
(
"Error in preprocessing prompt inputs"
)
return
self
.
create_error_response
(
str
(
e
))
input_ids
:
L
ist
[
int
]
=
[]
input_ids
:
l
ist
[
int
]
=
[]
for
i
,
engine_prompt
in
enumerate
(
engine_prompts
):
self
.
_log_inputs
(
request_id
,
request_prompts
[
i
],
...
...
vllm/entrypoints/openai/serving_transcription.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
import
asyncio
import
io
from
typing
import
AsyncGenerator
,
Optional
,
Union
,
cast
from
collections.abc
import
AsyncGenerator
from
typing
import
Optional
,
Union
,
cast
from
fastapi
import
Request
...
...
vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
import
os
from
collections.abc
import
Sequence
from
functools
import
cached_property
from
typing
import
Callable
,
Dict
,
List
,
Optional
,
Sequence
,
Type
,
Union
from
typing
import
Callable
,
Optional
,
Union
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
DeltaMessage
,
...
...
@@ -22,16 +23,16 @@ class ToolParser:
"""
def
__init__
(
self
,
tokenizer
:
AnyTokenizer
):
self
.
prev_tool_call_arr
:
L
ist
[
D
ict
]
=
[]
self
.
prev_tool_call_arr
:
l
ist
[
d
ict
]
=
[]
# the index of the tool call that is currently being parsed
self
.
current_tool_id
:
int
=
-
1
self
.
current_tool_name_sent
:
bool
=
False
self
.
streamed_args_for_tool
:
L
ist
[
str
]
=
[]
self
.
streamed_args_for_tool
:
l
ist
[
str
]
=
[]
self
.
model_tokenizer
=
tokenizer
@
cached_property
def
vocab
(
self
)
->
D
ict
[
str
,
int
]:
def
vocab
(
self
)
->
d
ict
[
str
,
int
]:
# NOTE: Only PreTrainedTokenizerFast is guaranteed to have .vocab
# whereas all tokenizers have .get_vocab()
return
self
.
model_tokenizer
.
get_vocab
()
...
...
@@ -79,10 +80,10 @@ class ToolParser:
class
ToolParserManager
:
tool_parsers
:
D
ict
[
str
,
T
ype
]
=
{}
tool_parsers
:
d
ict
[
str
,
t
ype
]
=
{}
@
classmethod
def
get_tool_parser
(
cls
,
name
)
->
T
ype
:
def
get_tool_parser
(
cls
,
name
)
->
t
ype
:
"""
Get tool parser by name which is registered by `register_module`.
...
...
@@ -95,8 +96,8 @@ class ToolParserManager:
@
classmethod
def
_register_module
(
cls
,
module
:
T
ype
,
module_name
:
Optional
[
Union
[
str
,
L
ist
[
str
]]]
=
None
,
module
:
t
ype
,
module_name
:
Optional
[
Union
[
str
,
l
ist
[
str
]]]
=
None
,
force
:
bool
=
True
)
->
None
:
if
not
issubclass
(
module
,
ToolParser
):
raise
TypeError
(
...
...
@@ -116,9 +117,9 @@ class ToolParserManager:
@
classmethod
def
register_module
(
cls
,
name
:
Optional
[
Union
[
str
,
L
ist
[
str
]]]
=
None
,
name
:
Optional
[
Union
[
str
,
l
ist
[
str
]]]
=
None
,
force
:
bool
=
True
,
module
:
Union
[
T
ype
,
None
]
=
None
)
->
Union
[
type
,
Callable
]:
module
:
Union
[
t
ype
,
None
]
=
None
)
->
Union
[
type
,
Callable
]:
"""
Register module with the given name or name list. it can be used as a
decoder(with module as None) or normal function(with module as not
...
...
vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
View file @
cf069aa8
...
...
@@ -2,8 +2,9 @@
import
json
import
re
from
collections.abc
import
Sequence
from
json
import
JSONDecoder
from
typing
import
Dict
,
Sequence
,
Union
from
typing
import
Union
import
partial_json_parser
from
partial_json_parser.core.options
import
Allow
...
...
@@ -145,7 +146,7 @@ class Granite20bFCToolParser(ToolParser):
return
None
# select as the current tool call the one we're on the state at
current_tool_call
:
D
ict
=
tool_call_arr
[
self
.
current_tool_id
]
\
current_tool_call
:
d
ict
=
tool_call_arr
[
self
.
current_tool_id
]
\
if
len
(
tool_call_arr
)
>
0
else
{}
# case -- if no tokens have been streamed for the tool, e.g.
...
...
vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
import
json
from
typing
import
Dict
,
Sequence
,
Union
from
collections.abc
import
Sequence
from
typing
import
Union
import
partial_json_parser
from
partial_json_parser.core.options
import
Allow
...
...
@@ -136,7 +137,7 @@ class GraniteToolParser(ToolParser):
return
None
# select as the current tool call the one we're on the state at
current_tool_call
:
D
ict
=
tool_call_arr
[
self
.
current_tool_id
]
current_tool_call
:
d
ict
=
tool_call_arr
[
self
.
current_tool_id
]
delta
=
None
# case: we are starting a new tool in the array
...
...
Prev
1
…
8
9
10
11
12
13
14
15
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment