Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
cf069aa8
Unverified
Commit
cf069aa8
authored
Mar 03, 2025
by
Harry Mellor
Committed by
GitHub
Mar 02, 2025
Browse files
Update deprecated Python 3.8 typing (#13971)
parent
bf33700e
Changes
300
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
225 additions
and
214 deletions
+225
-214
vllm/entrypoints/logger.py
vllm/entrypoints/logger.py
+2
-2
vllm/entrypoints/openai/api_server.py
vllm/entrypoints/openai/api_server.py
+5
-4
vllm/entrypoints/openai/cli_args.py
vllm/entrypoints/openai/cli_args.py
+4
-3
vllm/entrypoints/openai/logits_processors.py
vllm/entrypoints/openai/logits_processors.py
+12
-11
vllm/entrypoints/openai/protocol.py
vllm/entrypoints/openai/protocol.py
+64
-64
vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py
...ypoints/openai/reasoning_parsers/abs_reasoning_parsers.py
+11
-10
vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py
.../openai/reasoning_parsers/deepseek_r1_reasoning_parser.py
+3
-2
vllm/entrypoints/openai/run_batch.py
vllm/entrypoints/openai/run_batch.py
+5
-4
vllm/entrypoints/openai/serving_chat.py
vllm/entrypoints/openai/serving_chat.py
+15
-16
vllm/entrypoints/openai/serving_completion.py
vllm/entrypoints/openai/serving_completion.py
+16
-16
vllm/entrypoints/openai/serving_embedding.py
vllm/entrypoints/openai/serving_embedding.py
+8
-7
vllm/entrypoints/openai/serving_engine.py
vllm/entrypoints/openai/serving_engine.py
+21
-22
vllm/entrypoints/openai/serving_models.py
vllm/entrypoints/openai/serving_models.py
+5
-5
vllm/entrypoints/openai/serving_pooling.py
vllm/entrypoints/openai/serving_pooling.py
+8
-7
vllm/entrypoints/openai/serving_score.py
vllm/entrypoints/openai/serving_score.py
+25
-24
vllm/entrypoints/openai/serving_tokenization.py
vllm/entrypoints/openai/serving_tokenization.py
+2
-2
vllm/entrypoints/openai/serving_transcription.py
vllm/entrypoints/openai/serving_transcription.py
+2
-1
vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
+11
-10
vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
...ypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
+3
-2
vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
+3
-2
No files found.
vllm/entrypoints/logger.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
from
typing
import
List
,
Optional
,
Union
from
typing
import
Optional
,
Union
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
...
@@ -22,7 +22,7 @@ class RequestLogger:
...
@@ -22,7 +22,7 @@ class RequestLogger:
self
,
self
,
request_id
:
str
,
request_id
:
str
,
prompt
:
Optional
[
str
],
prompt
:
Optional
[
str
],
prompt_token_ids
:
Optional
[
L
ist
[
int
]],
prompt_token_ids
:
Optional
[
l
ist
[
int
]],
params
:
Optional
[
Union
[
SamplingParams
,
PoolingParams
,
params
:
Optional
[
Union
[
SamplingParams
,
PoolingParams
,
BeamSearchParams
]],
BeamSearchParams
]],
lora_request
:
Optional
[
LoRARequest
],
lora_request
:
Optional
[
LoRARequest
],
...
...
vllm/entrypoints/openai/api_server.py
View file @
cf069aa8
...
@@ -13,10 +13,11 @@ import socket
...
@@ -13,10 +13,11 @@ import socket
import
tempfile
import
tempfile
import
uuid
import
uuid
from
argparse
import
Namespace
from
argparse
import
Namespace
from
collections.abc
import
AsyncIterator
from
contextlib
import
asynccontextmanager
from
contextlib
import
asynccontextmanager
from
functools
import
partial
from
functools
import
partial
from
http
import
HTTPStatus
from
http
import
HTTPStatus
from
typing
import
Annotated
,
AsyncIterator
,
Dict
,
Optional
,
Set
,
Tuple
,
Union
from
typing
import
Annotated
,
Optional
,
Union
import
uvloop
import
uvloop
from
fastapi
import
APIRouter
,
Depends
,
FastAPI
,
Form
,
HTTPException
,
Request
from
fastapi
import
APIRouter
,
Depends
,
FastAPI
,
Form
,
HTTPException
,
Request
...
@@ -93,7 +94,7 @@ prometheus_multiproc_dir: tempfile.TemporaryDirectory
...
@@ -93,7 +94,7 @@ prometheus_multiproc_dir: tempfile.TemporaryDirectory
# Cannot use __name__ (https://github.com/vllm-project/vllm/pull/4765)
# Cannot use __name__ (https://github.com/vllm-project/vllm/pull/4765)
logger
=
init_logger
(
'vllm.entrypoints.openai.api_server'
)
logger
=
init_logger
(
'vllm.entrypoints.openai.api_server'
)
_running_tasks
:
S
et
[
asyncio
.
Task
]
=
set
()
_running_tasks
:
s
et
[
asyncio
.
Task
]
=
set
()
@
asynccontextmanager
@
asynccontextmanager
...
@@ -587,7 +588,7 @@ async def do_rerank_v2(request: RerankRequest, raw_request: Request):
...
@@ -587,7 +588,7 @@ async def do_rerank_v2(request: RerankRequest, raw_request: Request):
return
await
do_rerank
(
request
,
raw_request
)
return
await
do_rerank
(
request
,
raw_request
)
TASK_HANDLERS
:
D
ict
[
str
,
D
ict
[
str
,
tuple
]]
=
{
TASK_HANDLERS
:
d
ict
[
str
,
d
ict
[
str
,
tuple
]]
=
{
"generate"
:
{
"generate"
:
{
"messages"
:
(
ChatCompletionRequest
,
create_chat_completion
),
"messages"
:
(
ChatCompletionRequest
,
create_chat_completion
),
"default"
:
(
CompletionRequest
,
create_completion
),
"default"
:
(
CompletionRequest
,
create_completion
),
...
@@ -894,7 +895,7 @@ async def init_app_state(
...
@@ -894,7 +895,7 @@ async def init_app_state(
state
.
task
=
model_config
.
task
state
.
task
=
model_config
.
task
def
create_server_socket
(
addr
:
T
uple
[
str
,
int
])
->
socket
.
socket
:
def
create_server_socket
(
addr
:
t
uple
[
str
,
int
])
->
socket
.
socket
:
family
=
socket
.
AF_INET
family
=
socket
.
AF_INET
if
is_valid_ipv6_address
(
addr
[
0
]):
if
is_valid_ipv6_address
(
addr
[
0
]):
family
=
socket
.
AF_INET6
family
=
socket
.
AF_INET6
...
...
vllm/entrypoints/openai/cli_args.py
View file @
cf069aa8
...
@@ -8,7 +8,8 @@ purposes.
...
@@ -8,7 +8,8 @@ purposes.
import
argparse
import
argparse
import
json
import
json
import
ssl
import
ssl
from
typing
import
List
,
Optional
,
Sequence
,
Union
,
get_args
from
collections.abc
import
Sequence
from
typing
import
Optional
,
Union
,
get_args
from
vllm.engine.arg_utils
import
AsyncEngineArgs
,
nullable_str
from
vllm.engine.arg_utils
import
AsyncEngineArgs
,
nullable_str
from
vllm.entrypoints.chat_utils
import
(
ChatTemplateContentFormatOption
,
from
vllm.entrypoints.chat_utils
import
(
ChatTemplateContentFormatOption
,
...
@@ -33,7 +34,7 @@ class LoRAParserAction(argparse.Action):
...
@@ -33,7 +34,7 @@ class LoRAParserAction(argparse.Action):
if
isinstance
(
values
,
str
):
if
isinstance
(
values
,
str
):
raise
TypeError
(
"Expected values to be a list"
)
raise
TypeError
(
"Expected values to be a list"
)
lora_list
:
L
ist
[
LoRAModulePath
]
=
[]
lora_list
:
l
ist
[
LoRAModulePath
]
=
[]
for
item
in
values
:
for
item
in
values
:
if
item
in
[
None
,
''
]:
# Skip if item is None or empty string
if
item
in
[
None
,
''
]:
# Skip if item is None or empty string
continue
continue
...
@@ -69,7 +70,7 @@ class PromptAdapterParserAction(argparse.Action):
...
@@ -69,7 +70,7 @@ class PromptAdapterParserAction(argparse.Action):
if
isinstance
(
values
,
str
):
if
isinstance
(
values
,
str
):
raise
TypeError
(
"Expected values to be a list"
)
raise
TypeError
(
"Expected values to be a list"
)
adapter_list
:
L
ist
[
PromptAdapterPath
]
=
[]
adapter_list
:
l
ist
[
PromptAdapterPath
]
=
[]
for
item
in
values
:
for
item
in
values
:
name
,
path
=
item
.
split
(
'='
)
name
,
path
=
item
.
split
(
'='
)
adapter_list
.
append
(
PromptAdapterPath
(
name
,
path
))
adapter_list
.
append
(
PromptAdapterPath
(
name
,
path
))
...
...
vllm/entrypoints/openai/logits_processors.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
from
collections.abc
import
Iterable
from
functools
import
lru_cache
,
partial
from
functools
import
lru_cache
,
partial
from
typing
import
Dict
,
FrozenSet
,
Iterable
,
List
,
Optional
,
Union
from
typing
import
Optional
,
Union
import
torch
import
torch
...
@@ -14,10 +15,10 @@ class AllowedTokenIdsLogitsProcessor:
...
@@ -14,10 +15,10 @@ class AllowedTokenIdsLogitsProcessor:
specific set of token ids."""
specific set of token ids."""
def
__init__
(
self
,
allowed_ids
:
Iterable
[
int
]):
def
__init__
(
self
,
allowed_ids
:
Iterable
[
int
]):
self
.
allowed_ids
:
Optional
[
L
ist
[
int
]]
=
list
(
allowed_ids
)
self
.
allowed_ids
:
Optional
[
l
ist
[
int
]]
=
list
(
allowed_ids
)
self
.
mask
:
Optional
[
torch
.
Tensor
]
=
None
self
.
mask
:
Optional
[
torch
.
Tensor
]
=
None
def
__call__
(
self
,
token_ids
:
L
ist
[
int
],
def
__call__
(
self
,
token_ids
:
l
ist
[
int
],
logits
:
torch
.
Tensor
)
->
torch
.
Tensor
:
logits
:
torch
.
Tensor
)
->
torch
.
Tensor
:
if
self
.
mask
is
None
:
if
self
.
mask
is
None
:
self
.
mask
=
torch
.
ones
((
logits
.
shape
[
-
1
],
),
self
.
mask
=
torch
.
ones
((
logits
.
shape
[
-
1
],
),
...
@@ -31,7 +32,7 @@ class AllowedTokenIdsLogitsProcessor:
...
@@ -31,7 +32,7 @@ class AllowedTokenIdsLogitsProcessor:
@
lru_cache
(
maxsize
=
32
)
@
lru_cache
(
maxsize
=
32
)
def
_get_allowed_token_ids_logits_processor
(
def
_get_allowed_token_ids_logits_processor
(
allowed_token_ids
:
F
rozen
S
et
[
int
],
allowed_token_ids
:
f
rozen
s
et
[
int
],
vocab_size
:
int
,
vocab_size
:
int
,
)
->
LogitsProcessor
:
)
->
LogitsProcessor
:
if
not
allowed_token_ids
:
if
not
allowed_token_ids
:
...
@@ -43,8 +44,8 @@ def _get_allowed_token_ids_logits_processor(
...
@@ -43,8 +44,8 @@ def _get_allowed_token_ids_logits_processor(
def
logit_bias_logits_processor
(
def
logit_bias_logits_processor
(
logit_bias
:
D
ict
[
int
,
float
],
logit_bias
:
d
ict
[
int
,
float
],
token_ids
:
L
ist
[
int
],
token_ids
:
l
ist
[
int
],
logits
:
torch
.
Tensor
,
logits
:
torch
.
Tensor
,
)
->
torch
.
Tensor
:
)
->
torch
.
Tensor
:
for
token_id
,
bias
in
logit_bias
.
items
():
for
token_id
,
bias
in
logit_bias
.
items
():
...
@@ -53,16 +54,16 @@ def logit_bias_logits_processor(
...
@@ -53,16 +54,16 @@ def logit_bias_logits_processor(
def
get_logits_processors
(
def
get_logits_processors
(
logit_bias
:
Optional
[
Union
[
D
ict
[
int
,
float
],
D
ict
[
str
,
float
]]],
logit_bias
:
Optional
[
Union
[
d
ict
[
int
,
float
],
d
ict
[
str
,
float
]]],
allowed_token_ids
:
Optional
[
L
ist
[
int
]],
allowed_token_ids
:
Optional
[
l
ist
[
int
]],
tokenizer
:
AnyTokenizer
,
tokenizer
:
AnyTokenizer
,
)
->
L
ist
[
LogitsProcessor
]:
)
->
l
ist
[
LogitsProcessor
]:
logits_processors
:
L
ist
[
LogitsProcessor
]
=
[]
logits_processors
:
l
ist
[
LogitsProcessor
]
=
[]
if
logit_bias
:
if
logit_bias
:
try
:
try
:
# Convert token_id to integer
# Convert token_id to integer
# Clamp the bias between -100 and 100 per OpenAI API spec
# Clamp the bias between -100 and 100 per OpenAI API spec
clamped_logit_bias
:
D
ict
[
int
,
float
]
=
{
clamped_logit_bias
:
d
ict
[
int
,
float
]
=
{
int
(
token_id
):
min
(
100.0
,
max
(
-
100.0
,
bias
))
int
(
token_id
):
min
(
100.0
,
max
(
-
100.0
,
bias
))
for
token_id
,
bias
in
logit_bias
.
items
()
for
token_id
,
bias
in
logit_bias
.
items
()
}
}
...
...
vllm/entrypoints/openai/protocol.py
View file @
cf069aa8
...
@@ -5,13 +5,13 @@
...
@@ -5,13 +5,13 @@
import
re
import
re
import
time
import
time
from
argparse
import
Namespace
from
argparse
import
Namespace
from
typing
import
Any
,
ClassVar
,
Dict
,
List
,
Literal
,
Optional
,
Set
,
Union
from
typing
import
Annotated
,
Any
,
ClassVar
,
Literal
,
Optional
,
Union
import
torch
import
torch
from
fastapi
import
UploadFile
from
fastapi
import
UploadFile
from
pydantic
import
(
BaseModel
,
ConfigDict
,
Field
,
TypeAdapter
,
from
pydantic
import
(
BaseModel
,
ConfigDict
,
Field
,
TypeAdapter
,
ValidationInfo
,
field_validator
,
model_validator
)
ValidationInfo
,
field_validator
,
model_validator
)
from
typing_extensions
import
Annotated
,
TypeAlias
from
typing_extensions
import
TypeAlias
from
vllm.entrypoints.chat_utils
import
ChatCompletionMessageParam
from
vllm.entrypoints.chat_utils
import
ChatCompletionMessageParam
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
...
@@ -47,7 +47,7 @@ class OpenAIBaseModel(BaseModel):
...
@@ -47,7 +47,7 @@ class OpenAIBaseModel(BaseModel):
model_config
=
ConfigDict
(
extra
=
"allow"
)
model_config
=
ConfigDict
(
extra
=
"allow"
)
# Cache class field names
# Cache class field names
field_names
:
ClassVar
[
Optional
[
S
et
[
str
]]]
=
None
field_names
:
ClassVar
[
Optional
[
s
et
[
str
]]]
=
None
@
model_validator
(
mode
=
"wrap"
)
@
model_validator
(
mode
=
"wrap"
)
@
classmethod
@
classmethod
...
@@ -105,12 +105,12 @@ class ModelCard(OpenAIBaseModel):
...
@@ -105,12 +105,12 @@ class ModelCard(OpenAIBaseModel):
root
:
Optional
[
str
]
=
None
root
:
Optional
[
str
]
=
None
parent
:
Optional
[
str
]
=
None
parent
:
Optional
[
str
]
=
None
max_model_len
:
Optional
[
int
]
=
None
max_model_len
:
Optional
[
int
]
=
None
permission
:
L
ist
[
ModelPermission
]
=
Field
(
default_factory
=
list
)
permission
:
l
ist
[
ModelPermission
]
=
Field
(
default_factory
=
list
)
class
ModelList
(
OpenAIBaseModel
):
class
ModelList
(
OpenAIBaseModel
):
object
:
str
=
"list"
object
:
str
=
"list"
data
:
L
ist
[
ModelCard
]
=
Field
(
default_factory
=
list
)
data
:
l
ist
[
ModelCard
]
=
Field
(
default_factory
=
list
)
class
PromptTokenUsageInfo
(
OpenAIBaseModel
):
class
PromptTokenUsageInfo
(
OpenAIBaseModel
):
...
@@ -134,7 +134,7 @@ class JsonSchemaResponseFormat(OpenAIBaseModel):
...
@@ -134,7 +134,7 @@ class JsonSchemaResponseFormat(OpenAIBaseModel):
description
:
Optional
[
str
]
=
None
description
:
Optional
[
str
]
=
None
# schema is the field in openai but that causes conflicts with pydantic so
# schema is the field in openai but that causes conflicts with pydantic so
# instead use json_schema with an alias
# instead use json_schema with an alias
json_schema
:
Optional
[
D
ict
[
str
,
Any
]]
=
Field
(
default
=
None
,
alias
=
'schema'
)
json_schema
:
Optional
[
d
ict
[
str
,
Any
]]
=
Field
(
default
=
None
,
alias
=
'schema'
)
strict
:
Optional
[
bool
]
=
None
strict
:
Optional
[
bool
]
=
None
...
@@ -152,7 +152,7 @@ class StreamOptions(OpenAIBaseModel):
...
@@ -152,7 +152,7 @@ class StreamOptions(OpenAIBaseModel):
class
FunctionDefinition
(
OpenAIBaseModel
):
class
FunctionDefinition
(
OpenAIBaseModel
):
name
:
str
name
:
str
description
:
Optional
[
str
]
=
None
description
:
Optional
[
str
]
=
None
parameters
:
Optional
[
D
ict
[
str
,
Any
]]
=
None
parameters
:
Optional
[
d
ict
[
str
,
Any
]]
=
None
class
ChatCompletionToolsParam
(
OpenAIBaseModel
):
class
ChatCompletionToolsParam
(
OpenAIBaseModel
):
...
@@ -171,15 +171,15 @@ class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel):
...
@@ -171,15 +171,15 @@ class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel):
class
LogitsProcessorConstructor
(
BaseModel
):
class
LogitsProcessorConstructor
(
BaseModel
):
qualname
:
str
qualname
:
str
args
:
Optional
[
L
ist
[
Any
]]
=
None
args
:
Optional
[
l
ist
[
Any
]]
=
None
kwargs
:
Optional
[
D
ict
[
str
,
Any
]]
=
None
kwargs
:
Optional
[
d
ict
[
str
,
Any
]]
=
None
LogitsProcessors
=
L
ist
[
Union
[
str
,
LogitsProcessorConstructor
]]
LogitsProcessors
=
l
ist
[
Union
[
str
,
LogitsProcessorConstructor
]]
def
get_logits_processors
(
processors
:
Optional
[
LogitsProcessors
],
def
get_logits_processors
(
processors
:
Optional
[
LogitsProcessors
],
pattern
:
Optional
[
str
])
->
Optional
[
L
ist
[
Any
]]:
pattern
:
Optional
[
str
])
->
Optional
[
l
ist
[
Any
]]:
if
processors
and
pattern
:
if
processors
and
pattern
:
logits_processors
=
[]
logits_processors
=
[]
for
processor
in
processors
:
for
processor
in
processors
:
...
@@ -212,10 +212,10 @@ def get_logits_processors(processors: Optional[LogitsProcessors],
...
@@ -212,10 +212,10 @@ def get_logits_processors(processors: Optional[LogitsProcessors],
class
ChatCompletionRequest
(
OpenAIBaseModel
):
class
ChatCompletionRequest
(
OpenAIBaseModel
):
# Ordered by official OpenAI API documentation
# Ordered by official OpenAI API documentation
# https://platform.openai.com/docs/api-reference/chat/create
# https://platform.openai.com/docs/api-reference/chat/create
messages
:
L
ist
[
ChatCompletionMessageParam
]
messages
:
l
ist
[
ChatCompletionMessageParam
]
model
:
Optional
[
str
]
=
None
model
:
Optional
[
str
]
=
None
frequency_penalty
:
Optional
[
float
]
=
0.0
frequency_penalty
:
Optional
[
float
]
=
0.0
logit_bias
:
Optional
[
D
ict
[
str
,
float
]]
=
None
logit_bias
:
Optional
[
d
ict
[
str
,
float
]]
=
None
logprobs
:
Optional
[
bool
]
=
False
logprobs
:
Optional
[
bool
]
=
False
top_logprobs
:
Optional
[
int
]
=
0
top_logprobs
:
Optional
[
int
]
=
0
# TODO(#9845): remove max_tokens when field is removed from OpenAI API
# TODO(#9845): remove max_tokens when field is removed from OpenAI API
...
@@ -228,12 +228,12 @@ class ChatCompletionRequest(OpenAIBaseModel):
...
@@ -228,12 +228,12 @@ class ChatCompletionRequest(OpenAIBaseModel):
presence_penalty
:
Optional
[
float
]
=
0.0
presence_penalty
:
Optional
[
float
]
=
0.0
response_format
:
Optional
[
ResponseFormat
]
=
None
response_format
:
Optional
[
ResponseFormat
]
=
None
seed
:
Optional
[
int
]
=
Field
(
None
,
ge
=
_LONG_INFO
.
min
,
le
=
_LONG_INFO
.
max
)
seed
:
Optional
[
int
]
=
Field
(
None
,
ge
=
_LONG_INFO
.
min
,
le
=
_LONG_INFO
.
max
)
stop
:
Optional
[
Union
[
str
,
L
ist
[
str
]]]
=
Field
(
default_factory
=
list
)
stop
:
Optional
[
Union
[
str
,
l
ist
[
str
]]]
=
Field
(
default_factory
=
list
)
stream
:
Optional
[
bool
]
=
False
stream
:
Optional
[
bool
]
=
False
stream_options
:
Optional
[
StreamOptions
]
=
None
stream_options
:
Optional
[
StreamOptions
]
=
None
temperature
:
Optional
[
float
]
=
None
temperature
:
Optional
[
float
]
=
None
top_p
:
Optional
[
float
]
=
None
top_p
:
Optional
[
float
]
=
None
tools
:
Optional
[
L
ist
[
ChatCompletionToolsParam
]]
=
None
tools
:
Optional
[
l
ist
[
ChatCompletionToolsParam
]]
=
None
tool_choice
:
Optional
[
Union
[
Literal
[
"none"
],
Literal
[
"auto"
],
tool_choice
:
Optional
[
Union
[
Literal
[
"none"
],
Literal
[
"auto"
],
ChatCompletionNamedToolChoiceParam
]]
=
"none"
ChatCompletionNamedToolChoiceParam
]]
=
"none"
...
@@ -248,7 +248,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
...
@@ -248,7 +248,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
min_p
:
Optional
[
float
]
=
None
min_p
:
Optional
[
float
]
=
None
repetition_penalty
:
Optional
[
float
]
=
None
repetition_penalty
:
Optional
[
float
]
=
None
length_penalty
:
float
=
1.0
length_penalty
:
float
=
1.0
stop_token_ids
:
Optional
[
L
ist
[
int
]]
=
Field
(
default_factory
=
list
)
stop_token_ids
:
Optional
[
l
ist
[
int
]]
=
Field
(
default_factory
=
list
)
include_stop_str_in_output
:
bool
=
False
include_stop_str_in_output
:
bool
=
False
ignore_eos
:
bool
=
False
ignore_eos
:
bool
=
False
min_tokens
:
int
=
0
min_tokens
:
int
=
0
...
@@ -290,7 +290,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
...
@@ -290,7 +290,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
"special tokens so this should be set to false (as is the "
"special tokens so this should be set to false (as is the "
"default)."
),
"default)."
),
)
)
documents
:
Optional
[
L
ist
[
D
ict
[
str
,
str
]]]
=
Field
(
documents
:
Optional
[
l
ist
[
d
ict
[
str
,
str
]]]
=
Field
(
default
=
None
,
default
=
None
,
description
=
description
=
(
"A list of dicts representing documents that will be accessible to "
(
"A list of dicts representing documents that will be accessible to "
...
@@ -307,12 +307,12 @@ class ChatCompletionRequest(OpenAIBaseModel):
...
@@ -307,12 +307,12 @@ class ChatCompletionRequest(OpenAIBaseModel):
"allowed, so you must provide a chat template if the tokenizer "
"allowed, so you must provide a chat template if the tokenizer "
"does not define one."
),
"does not define one."
),
)
)
chat_template_kwargs
:
Optional
[
D
ict
[
str
,
Any
]]
=
Field
(
chat_template_kwargs
:
Optional
[
d
ict
[
str
,
Any
]]
=
Field
(
default
=
None
,
default
=
None
,
description
=
(
"Additional kwargs to pass to the template renderer. "
description
=
(
"Additional kwargs to pass to the template renderer. "
"Will be accessible by the chat template."
),
"Will be accessible by the chat template."
),
)
)
mm_processor_kwargs
:
Optional
[
D
ict
[
str
,
Any
]]
=
Field
(
mm_processor_kwargs
:
Optional
[
d
ict
[
str
,
Any
]]
=
Field
(
default
=
None
,
default
=
None
,
description
=
(
"Additional kwargs to pass to the HF processor."
),
description
=
(
"Additional kwargs to pass to the HF processor."
),
)
)
...
@@ -325,7 +325,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
...
@@ -325,7 +325,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
description
=
(
description
=
(
"If specified, the output will follow the regex pattern."
),
"If specified, the output will follow the regex pattern."
),
)
)
guided_choice
:
Optional
[
L
ist
[
str
]]
=
Field
(
guided_choice
:
Optional
[
l
ist
[
str
]]
=
Field
(
default
=
None
,
default
=
None
,
description
=
(
description
=
(
"If specified, the output will be exactly one of the choices."
),
"If specified, the output will be exactly one of the choices."
),
...
@@ -643,17 +643,17 @@ class CompletionRequest(OpenAIBaseModel):
...
@@ -643,17 +643,17 @@ class CompletionRequest(OpenAIBaseModel):
# Ordered by official OpenAI API documentation
# Ordered by official OpenAI API documentation
# https://platform.openai.com/docs/api-reference/completions/create
# https://platform.openai.com/docs/api-reference/completions/create
model
:
Optional
[
str
]
=
None
model
:
Optional
[
str
]
=
None
prompt
:
Union
[
L
ist
[
int
],
L
ist
[
L
ist
[
int
]],
str
,
L
ist
[
str
]]
prompt
:
Union
[
l
ist
[
int
],
l
ist
[
l
ist
[
int
]],
str
,
l
ist
[
str
]]
best_of
:
Optional
[
int
]
=
None
best_of
:
Optional
[
int
]
=
None
echo
:
Optional
[
bool
]
=
False
echo
:
Optional
[
bool
]
=
False
frequency_penalty
:
Optional
[
float
]
=
0.0
frequency_penalty
:
Optional
[
float
]
=
0.0
logit_bias
:
Optional
[
D
ict
[
str
,
float
]]
=
None
logit_bias
:
Optional
[
d
ict
[
str
,
float
]]
=
None
logprobs
:
Optional
[
int
]
=
None
logprobs
:
Optional
[
int
]
=
None
max_tokens
:
Optional
[
int
]
=
16
max_tokens
:
Optional
[
int
]
=
16
n
:
int
=
1
n
:
int
=
1
presence_penalty
:
Optional
[
float
]
=
0.0
presence_penalty
:
Optional
[
float
]
=
0.0
seed
:
Optional
[
int
]
=
Field
(
None
,
ge
=
_LONG_INFO
.
min
,
le
=
_LONG_INFO
.
max
)
seed
:
Optional
[
int
]
=
Field
(
None
,
ge
=
_LONG_INFO
.
min
,
le
=
_LONG_INFO
.
max
)
stop
:
Optional
[
Union
[
str
,
L
ist
[
str
]]]
=
Field
(
default_factory
=
list
)
stop
:
Optional
[
Union
[
str
,
l
ist
[
str
]]]
=
Field
(
default_factory
=
list
)
stream
:
Optional
[
bool
]
=
False
stream
:
Optional
[
bool
]
=
False
stream_options
:
Optional
[
StreamOptions
]
=
None
stream_options
:
Optional
[
StreamOptions
]
=
None
suffix
:
Optional
[
str
]
=
None
suffix
:
Optional
[
str
]
=
None
...
@@ -667,14 +667,14 @@ class CompletionRequest(OpenAIBaseModel):
...
@@ -667,14 +667,14 @@ class CompletionRequest(OpenAIBaseModel):
min_p
:
Optional
[
float
]
=
None
min_p
:
Optional
[
float
]
=
None
repetition_penalty
:
Optional
[
float
]
=
None
repetition_penalty
:
Optional
[
float
]
=
None
length_penalty
:
float
=
1.0
length_penalty
:
float
=
1.0
stop_token_ids
:
Optional
[
L
ist
[
int
]]
=
Field
(
default_factory
=
list
)
stop_token_ids
:
Optional
[
l
ist
[
int
]]
=
Field
(
default_factory
=
list
)
include_stop_str_in_output
:
bool
=
False
include_stop_str_in_output
:
bool
=
False
ignore_eos
:
bool
=
False
ignore_eos
:
bool
=
False
min_tokens
:
int
=
0
min_tokens
:
int
=
0
skip_special_tokens
:
bool
=
True
skip_special_tokens
:
bool
=
True
spaces_between_special_tokens
:
bool
=
True
spaces_between_special_tokens
:
bool
=
True
truncate_prompt_tokens
:
Optional
[
Annotated
[
int
,
Field
(
ge
=
1
)]]
=
None
truncate_prompt_tokens
:
Optional
[
Annotated
[
int
,
Field
(
ge
=
1
)]]
=
None
allowed_token_ids
:
Optional
[
L
ist
[
int
]]
=
None
allowed_token_ids
:
Optional
[
l
ist
[
int
]]
=
None
prompt_logprobs
:
Optional
[
int
]
=
None
prompt_logprobs
:
Optional
[
int
]
=
None
# doc: end-completion-sampling-params
# doc: end-completion-sampling-params
...
@@ -701,7 +701,7 @@ class CompletionRequest(OpenAIBaseModel):
...
@@ -701,7 +701,7 @@ class CompletionRequest(OpenAIBaseModel):
description
=
(
description
=
(
"If specified, the output will follow the regex pattern."
),
"If specified, the output will follow the regex pattern."
),
)
)
guided_choice
:
Optional
[
L
ist
[
str
]]
=
Field
(
guided_choice
:
Optional
[
l
ist
[
str
]]
=
Field
(
default
=
None
,
default
=
None
,
description
=
(
description
=
(
"If specified, the output will be exactly one of the choices."
),
"If specified, the output will be exactly one of the choices."
),
...
@@ -908,7 +908,7 @@ class EmbeddingCompletionRequest(OpenAIBaseModel):
...
@@ -908,7 +908,7 @@ class EmbeddingCompletionRequest(OpenAIBaseModel):
# Ordered by official OpenAI API documentation
# Ordered by official OpenAI API documentation
# https://platform.openai.com/docs/api-reference/embeddings
# https://platform.openai.com/docs/api-reference/embeddings
model
:
Optional
[
str
]
=
None
model
:
Optional
[
str
]
=
None
input
:
Union
[
L
ist
[
int
],
L
ist
[
L
ist
[
int
]],
str
,
L
ist
[
str
]]
input
:
Union
[
l
ist
[
int
],
l
ist
[
l
ist
[
int
]],
str
,
l
ist
[
str
]]
encoding_format
:
Literal
[
"float"
,
"base64"
]
=
"float"
encoding_format
:
Literal
[
"float"
,
"base64"
]
=
"float"
dimensions
:
Optional
[
int
]
=
None
dimensions
:
Optional
[
int
]
=
None
user
:
Optional
[
str
]
=
None
user
:
Optional
[
str
]
=
None
...
@@ -940,7 +940,7 @@ class EmbeddingCompletionRequest(OpenAIBaseModel):
...
@@ -940,7 +940,7 @@ class EmbeddingCompletionRequest(OpenAIBaseModel):
class
EmbeddingChatRequest
(
OpenAIBaseModel
):
class
EmbeddingChatRequest
(
OpenAIBaseModel
):
model
:
Optional
[
str
]
=
None
model
:
Optional
[
str
]
=
None
messages
:
L
ist
[
ChatCompletionMessageParam
]
messages
:
l
ist
[
ChatCompletionMessageParam
]
encoding_format
:
Literal
[
"float"
,
"base64"
]
=
"float"
encoding_format
:
Literal
[
"float"
,
"base64"
]
=
"float"
dimensions
:
Optional
[
int
]
=
None
dimensions
:
Optional
[
int
]
=
None
...
@@ -969,12 +969,12 @@ class EmbeddingChatRequest(OpenAIBaseModel):
...
@@ -969,12 +969,12 @@ class EmbeddingChatRequest(OpenAIBaseModel):
"allowed, so you must provide a chat template if the tokenizer "
"allowed, so you must provide a chat template if the tokenizer "
"does not define one."
),
"does not define one."
),
)
)
chat_template_kwargs
:
Optional
[
D
ict
[
str
,
Any
]]
=
Field
(
chat_template_kwargs
:
Optional
[
d
ict
[
str
,
Any
]]
=
Field
(
default
=
None
,
default
=
None
,
description
=
(
"Additional kwargs to pass to the template renderer. "
description
=
(
"Additional kwargs to pass to the template renderer. "
"Will be accessible by the chat template."
),
"Will be accessible by the chat template."
),
)
)
mm_processor_kwargs
:
Optional
[
D
ict
[
str
,
Any
]]
=
Field
(
mm_processor_kwargs
:
Optional
[
d
ict
[
str
,
Any
]]
=
Field
(
default
=
None
,
default
=
None
,
description
=
(
"Additional kwargs to pass to the HF processor."
),
description
=
(
"Additional kwargs to pass to the HF processor."
),
)
)
...
@@ -1008,8 +1008,8 @@ PoolingRequest = Union[PoolingCompletionRequest, PoolingChatRequest]
...
@@ -1008,8 +1008,8 @@ PoolingRequest = Union[PoolingCompletionRequest, PoolingChatRequest]
class
ScoreRequest
(
OpenAIBaseModel
):
class
ScoreRequest
(
OpenAIBaseModel
):
model
:
Optional
[
str
]
=
None
model
:
Optional
[
str
]
=
None
text_1
:
Union
[
L
ist
[
str
],
str
]
text_1
:
Union
[
l
ist
[
str
],
str
]
text_2
:
Union
[
L
ist
[
str
],
str
]
text_2
:
Union
[
l
ist
[
str
],
str
]
truncate_prompt_tokens
:
Optional
[
Annotated
[
int
,
Field
(
ge
=
1
)]]
=
None
truncate_prompt_tokens
:
Optional
[
Annotated
[
int
,
Field
(
ge
=
1
)]]
=
None
# doc: begin-score-pooling-params
# doc: begin-score-pooling-params
...
@@ -1033,7 +1033,7 @@ class ScoreRequest(OpenAIBaseModel):
...
@@ -1033,7 +1033,7 @@ class ScoreRequest(OpenAIBaseModel):
class
RerankRequest
(
OpenAIBaseModel
):
class
RerankRequest
(
OpenAIBaseModel
):
model
:
Optional
[
str
]
=
None
model
:
Optional
[
str
]
=
None
query
:
str
query
:
str
documents
:
L
ist
[
str
]
documents
:
l
ist
[
str
]
top_n
:
int
=
Field
(
default_factory
=
lambda
:
0
)
top_n
:
int
=
Field
(
default_factory
=
lambda
:
0
)
truncate_prompt_tokens
:
Optional
[
Annotated
[
int
,
Field
(
ge
=
1
)]]
=
None
truncate_prompt_tokens
:
Optional
[
Annotated
[
int
,
Field
(
ge
=
1
)]]
=
None
...
@@ -1073,14 +1073,14 @@ class RerankResponse(OpenAIBaseModel):
...
@@ -1073,14 +1073,14 @@ class RerankResponse(OpenAIBaseModel):
id
:
str
id
:
str
model
:
str
model
:
str
usage
:
RerankUsage
usage
:
RerankUsage
results
:
L
ist
[
RerankResult
]
results
:
l
ist
[
RerankResult
]
class
CompletionLogProbs
(
OpenAIBaseModel
):
class
CompletionLogProbs
(
OpenAIBaseModel
):
text_offset
:
L
ist
[
int
]
=
Field
(
default_factory
=
list
)
text_offset
:
l
ist
[
int
]
=
Field
(
default_factory
=
list
)
token_logprobs
:
L
ist
[
Optional
[
float
]]
=
Field
(
default_factory
=
list
)
token_logprobs
:
l
ist
[
Optional
[
float
]]
=
Field
(
default_factory
=
list
)
tokens
:
L
ist
[
str
]
=
Field
(
default_factory
=
list
)
tokens
:
l
ist
[
str
]
=
Field
(
default_factory
=
list
)
top_logprobs
:
L
ist
[
Optional
[
D
ict
[
str
,
top_logprobs
:
l
ist
[
Optional
[
d
ict
[
str
,
float
]]]
=
Field
(
default_factory
=
list
)
float
]]]
=
Field
(
default_factory
=
list
)
...
@@ -1096,7 +1096,7 @@ class CompletionResponseChoice(OpenAIBaseModel):
...
@@ -1096,7 +1096,7 @@ class CompletionResponseChoice(OpenAIBaseModel):
"to stop, None if the completion finished for some other reason "
"to stop, None if the completion finished for some other reason "
"including encountering the EOS token"
),
"including encountering the EOS token"
),
)
)
prompt_logprobs
:
Optional
[
L
ist
[
Optional
[
D
ict
[
int
,
Logprob
]]]]
=
None
prompt_logprobs
:
Optional
[
l
ist
[
Optional
[
d
ict
[
int
,
Logprob
]]]]
=
None
class
CompletionResponse
(
OpenAIBaseModel
):
class
CompletionResponse
(
OpenAIBaseModel
):
...
@@ -1104,7 +1104,7 @@ class CompletionResponse(OpenAIBaseModel):
...
@@ -1104,7 +1104,7 @@ class CompletionResponse(OpenAIBaseModel):
object
:
str
=
"text_completion"
object
:
str
=
"text_completion"
created
:
int
=
Field
(
default_factory
=
lambda
:
int
(
time
.
time
()))
created
:
int
=
Field
(
default_factory
=
lambda
:
int
(
time
.
time
()))
model
:
str
model
:
str
choices
:
L
ist
[
CompletionResponseChoice
]
choices
:
l
ist
[
CompletionResponseChoice
]
usage
:
UsageInfo
usage
:
UsageInfo
...
@@ -1127,14 +1127,14 @@ class CompletionStreamResponse(OpenAIBaseModel):
...
@@ -1127,14 +1127,14 @@ class CompletionStreamResponse(OpenAIBaseModel):
object
:
str
=
"text_completion"
object
:
str
=
"text_completion"
created
:
int
=
Field
(
default_factory
=
lambda
:
int
(
time
.
time
()))
created
:
int
=
Field
(
default_factory
=
lambda
:
int
(
time
.
time
()))
model
:
str
model
:
str
choices
:
L
ist
[
CompletionResponseStreamChoice
]
choices
:
l
ist
[
CompletionResponseStreamChoice
]
usage
:
Optional
[
UsageInfo
]
=
Field
(
default
=
None
)
usage
:
Optional
[
UsageInfo
]
=
Field
(
default
=
None
)
class
EmbeddingResponseData
(
OpenAIBaseModel
):
class
EmbeddingResponseData
(
OpenAIBaseModel
):
index
:
int
index
:
int
object
:
str
=
"embedding"
object
:
str
=
"embedding"
embedding
:
Union
[
L
ist
[
float
],
str
]
embedding
:
Union
[
l
ist
[
float
],
str
]
class
EmbeddingResponse
(
OpenAIBaseModel
):
class
EmbeddingResponse
(
OpenAIBaseModel
):
...
@@ -1142,14 +1142,14 @@ class EmbeddingResponse(OpenAIBaseModel):
...
@@ -1142,14 +1142,14 @@ class EmbeddingResponse(OpenAIBaseModel):
object
:
str
=
"list"
object
:
str
=
"list"
created
:
int
=
Field
(
default_factory
=
lambda
:
int
(
time
.
time
()))
created
:
int
=
Field
(
default_factory
=
lambda
:
int
(
time
.
time
()))
model
:
str
model
:
str
data
:
L
ist
[
EmbeddingResponseData
]
data
:
l
ist
[
EmbeddingResponseData
]
usage
:
UsageInfo
usage
:
UsageInfo
class
PoolingResponseData
(
OpenAIBaseModel
):
class
PoolingResponseData
(
OpenAIBaseModel
):
index
:
int
index
:
int
object
:
str
=
"pooling"
object
:
str
=
"pooling"
data
:
Union
[
L
ist
[
L
ist
[
float
]],
L
ist
[
float
],
str
]
data
:
Union
[
l
ist
[
l
ist
[
float
]],
l
ist
[
float
],
str
]
class
PoolingResponse
(
OpenAIBaseModel
):
class
PoolingResponse
(
OpenAIBaseModel
):
...
@@ -1157,7 +1157,7 @@ class PoolingResponse(OpenAIBaseModel):
...
@@ -1157,7 +1157,7 @@ class PoolingResponse(OpenAIBaseModel):
object
:
str
=
"list"
object
:
str
=
"list"
created
:
int
=
Field
(
default_factory
=
lambda
:
int
(
time
.
time
()))
created
:
int
=
Field
(
default_factory
=
lambda
:
int
(
time
.
time
()))
model
:
str
model
:
str
data
:
L
ist
[
PoolingResponseData
]
data
:
l
ist
[
PoolingResponseData
]
usage
:
UsageInfo
usage
:
UsageInfo
...
@@ -1172,7 +1172,7 @@ class ScoreResponse(OpenAIBaseModel):
...
@@ -1172,7 +1172,7 @@ class ScoreResponse(OpenAIBaseModel):
object
:
str
=
"list"
object
:
str
=
"list"
created
:
int
=
Field
(
default_factory
=
lambda
:
int
(
time
.
time
()))
created
:
int
=
Field
(
default_factory
=
lambda
:
int
(
time
.
time
()))
model
:
str
model
:
str
data
:
L
ist
[
ScoreResponseData
]
data
:
l
ist
[
ScoreResponseData
]
usage
:
UsageInfo
usage
:
UsageInfo
...
@@ -1205,7 +1205,7 @@ class ExtractedToolCallInformation(BaseModel):
...
@@ -1205,7 +1205,7 @@ class ExtractedToolCallInformation(BaseModel):
tools_called
:
bool
tools_called
:
bool
# extracted tool calls
# extracted tool calls
tool_calls
:
L
ist
[
ToolCall
]
tool_calls
:
l
ist
[
ToolCall
]
# content - per OpenAI spec, content AND tool calls can be returned rarely
# content - per OpenAI spec, content AND tool calls can be returned rarely
# But some models will do this intentionally
# But some models will do this intentionally
...
@@ -1216,21 +1216,21 @@ class ChatMessage(OpenAIBaseModel):
...
@@ -1216,21 +1216,21 @@ class ChatMessage(OpenAIBaseModel):
role
:
str
role
:
str
reasoning_content
:
Optional
[
str
]
=
None
reasoning_content
:
Optional
[
str
]
=
None
content
:
Optional
[
str
]
=
None
content
:
Optional
[
str
]
=
None
tool_calls
:
L
ist
[
ToolCall
]
=
Field
(
default_factory
=
list
)
tool_calls
:
l
ist
[
ToolCall
]
=
Field
(
default_factory
=
list
)
class
ChatCompletionLogProb
(
OpenAIBaseModel
):
class
ChatCompletionLogProb
(
OpenAIBaseModel
):
token
:
str
token
:
str
logprob
:
float
=
-
9999.0
logprob
:
float
=
-
9999.0
bytes
:
Optional
[
L
ist
[
int
]]
=
None
bytes
:
Optional
[
l
ist
[
int
]]
=
None
class
ChatCompletionLogProbsContent
(
ChatCompletionLogProb
):
class
ChatCompletionLogProbsContent
(
ChatCompletionLogProb
):
top_logprobs
:
L
ist
[
ChatCompletionLogProb
]
=
Field
(
default_factory
=
list
)
top_logprobs
:
l
ist
[
ChatCompletionLogProb
]
=
Field
(
default_factory
=
list
)
class
ChatCompletionLogProbs
(
OpenAIBaseModel
):
class
ChatCompletionLogProbs
(
OpenAIBaseModel
):
content
:
Optional
[
L
ist
[
ChatCompletionLogProbsContent
]]
=
None
content
:
Optional
[
l
ist
[
ChatCompletionLogProbsContent
]]
=
None
class
ChatCompletionResponseChoice
(
OpenAIBaseModel
):
class
ChatCompletionResponseChoice
(
OpenAIBaseModel
):
...
@@ -1248,16 +1248,16 @@ class ChatCompletionResponse(OpenAIBaseModel):
...
@@ -1248,16 +1248,16 @@ class ChatCompletionResponse(OpenAIBaseModel):
object
:
Literal
[
"chat.completion"
]
=
"chat.completion"
object
:
Literal
[
"chat.completion"
]
=
"chat.completion"
created
:
int
=
Field
(
default_factory
=
lambda
:
int
(
time
.
time
()))
created
:
int
=
Field
(
default_factory
=
lambda
:
int
(
time
.
time
()))
model
:
str
model
:
str
choices
:
L
ist
[
ChatCompletionResponseChoice
]
choices
:
l
ist
[
ChatCompletionResponseChoice
]
usage
:
UsageInfo
usage
:
UsageInfo
prompt_logprobs
:
Optional
[
L
ist
[
Optional
[
D
ict
[
int
,
Logprob
]]]]
=
None
prompt_logprobs
:
Optional
[
l
ist
[
Optional
[
d
ict
[
int
,
Logprob
]]]]
=
None
class
DeltaMessage
(
OpenAIBaseModel
):
class
DeltaMessage
(
OpenAIBaseModel
):
role
:
Optional
[
str
]
=
None
role
:
Optional
[
str
]
=
None
content
:
Optional
[
str
]
=
None
content
:
Optional
[
str
]
=
None
reasoning_content
:
Optional
[
str
]
=
None
reasoning_content
:
Optional
[
str
]
=
None
tool_calls
:
L
ist
[
DeltaToolCall
]
=
Field
(
default_factory
=
list
)
tool_calls
:
l
ist
[
DeltaToolCall
]
=
Field
(
default_factory
=
list
)
class
ChatCompletionResponseStreamChoice
(
OpenAIBaseModel
):
class
ChatCompletionResponseStreamChoice
(
OpenAIBaseModel
):
...
@@ -1273,7 +1273,7 @@ class ChatCompletionStreamResponse(OpenAIBaseModel):
...
@@ -1273,7 +1273,7 @@ class ChatCompletionStreamResponse(OpenAIBaseModel):
object
:
Literal
[
"chat.completion.chunk"
]
=
"chat.completion.chunk"
object
:
Literal
[
"chat.completion.chunk"
]
=
"chat.completion.chunk"
created
:
int
=
Field
(
default_factory
=
lambda
:
int
(
time
.
time
()))
created
:
int
=
Field
(
default_factory
=
lambda
:
int
(
time
.
time
()))
model
:
str
model
:
str
choices
:
L
ist
[
ChatCompletionResponseStreamChoice
]
choices
:
l
ist
[
ChatCompletionResponseStreamChoice
]
usage
:
Optional
[
UsageInfo
]
=
Field
(
default
=
None
)
usage
:
Optional
[
UsageInfo
]
=
Field
(
default
=
None
)
...
@@ -1358,7 +1358,7 @@ class TokenizeCompletionRequest(OpenAIBaseModel):
...
@@ -1358,7 +1358,7 @@ class TokenizeCompletionRequest(OpenAIBaseModel):
class
TokenizeChatRequest
(
OpenAIBaseModel
):
class
TokenizeChatRequest
(
OpenAIBaseModel
):
model
:
Optional
[
str
]
=
None
model
:
Optional
[
str
]
=
None
messages
:
L
ist
[
ChatCompletionMessageParam
]
messages
:
l
ist
[
ChatCompletionMessageParam
]
add_generation_prompt
:
bool
=
Field
(
add_generation_prompt
:
bool
=
Field
(
default
=
True
,
default
=
True
,
...
@@ -1393,12 +1393,12 @@ class TokenizeChatRequest(OpenAIBaseModel):
...
@@ -1393,12 +1393,12 @@ class TokenizeChatRequest(OpenAIBaseModel):
"allowed, so you must provide a chat template if the tokenizer "
"allowed, so you must provide a chat template if the tokenizer "
"does not define one."
),
"does not define one."
),
)
)
chat_template_kwargs
:
Optional
[
D
ict
[
str
,
Any
]]
=
Field
(
chat_template_kwargs
:
Optional
[
d
ict
[
str
,
Any
]]
=
Field
(
default
=
None
,
default
=
None
,
description
=
(
"Additional kwargs to pass to the template renderer. "
description
=
(
"Additional kwargs to pass to the template renderer. "
"Will be accessible by the chat template."
),
"Will be accessible by the chat template."
),
)
)
mm_processor_kwargs
:
Optional
[
D
ict
[
str
,
Any
]]
=
Field
(
mm_processor_kwargs
:
Optional
[
d
ict
[
str
,
Any
]]
=
Field
(
default
=
None
,
default
=
None
,
description
=
(
"Additional kwargs to pass to the HF processor."
),
description
=
(
"Additional kwargs to pass to the HF processor."
),
)
)
...
@@ -1419,12 +1419,12 @@ TokenizeRequest = Union[TokenizeCompletionRequest, TokenizeChatRequest]
...
@@ -1419,12 +1419,12 @@ TokenizeRequest = Union[TokenizeCompletionRequest, TokenizeChatRequest]
class
TokenizeResponse
(
OpenAIBaseModel
):
class
TokenizeResponse
(
OpenAIBaseModel
):
count
:
int
count
:
int
max_model_len
:
int
max_model_len
:
int
tokens
:
L
ist
[
int
]
tokens
:
l
ist
[
int
]
class
DetokenizeRequest
(
OpenAIBaseModel
):
class
DetokenizeRequest
(
OpenAIBaseModel
):
model
:
Optional
[
str
]
=
None
model
:
Optional
[
str
]
=
None
tokens
:
L
ist
[
int
]
tokens
:
l
ist
[
int
]
class
DetokenizeResponse
(
OpenAIBaseModel
):
class
DetokenizeResponse
(
OpenAIBaseModel
):
...
@@ -1492,7 +1492,7 @@ class TranscriptionRequest(OpenAIBaseModel):
...
@@ -1492,7 +1492,7 @@ class TranscriptionRequest(OpenAIBaseModel):
to automatically increase the temperature until certain thresholds are hit.
to automatically increase the temperature until certain thresholds are hit.
"""
"""
timestamp_granularities
:
L
ist
[
Literal
[
"word"
,
"segment"
]]
=
Field
(
timestamp_granularities
:
l
ist
[
Literal
[
"word"
,
"segment"
]]
=
Field
(
alias
=
"timestamp_granularities[]"
,
default
=
[])
alias
=
"timestamp_granularities[]"
,
default
=
[])
"""The timestamp granularities to populate for this transcription.
"""The timestamp granularities to populate for this transcription.
...
@@ -1580,7 +1580,7 @@ class TranscriptionSegment(OpenAIBaseModel):
...
@@ -1580,7 +1580,7 @@ class TranscriptionSegment(OpenAIBaseModel):
text
:
str
text
:
str
"""Text content of the segment."""
"""Text content of the segment."""
tokens
:
L
ist
[
int
]
tokens
:
l
ist
[
int
]
"""Array of token IDs for the text content."""
"""Array of token IDs for the text content."""
...
@@ -1594,8 +1594,8 @@ class TranscriptionResponseVerbose(OpenAIBaseModel):
...
@@ -1594,8 +1594,8 @@ class TranscriptionResponseVerbose(OpenAIBaseModel):
text
:
str
text
:
str
"""The transcribed text."""
"""The transcribed text."""
segments
:
Optional
[
L
ist
[
TranscriptionSegment
]]
=
None
segments
:
Optional
[
l
ist
[
TranscriptionSegment
]]
=
None
"""Segments of the transcribed text and their corresponding details."""
"""Segments of the transcribed text and their corresponding details."""
words
:
Optional
[
L
ist
[
TranscriptionWord
]]
=
None
words
:
Optional
[
l
ist
[
TranscriptionWord
]]
=
None
"""Extracted words and their corresponding timestamps."""
"""Extracted words and their corresponding timestamps."""
vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
os
import
os
from
collections.abc
import
Sequence
from
functools
import
cached_property
from
functools
import
cached_property
from
typing
import
Callable
,
Dict
,
List
,
Optional
,
Sequence
,
Tuple
,
Type
,
Union
from
typing
import
Callable
,
Optional
,
Union
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
DeltaMessage
)
DeltaMessage
)
...
@@ -25,14 +26,14 @@ class ReasoningParser:
...
@@ -25,14 +26,14 @@ class ReasoningParser:
self
.
model_tokenizer
=
tokenizer
self
.
model_tokenizer
=
tokenizer
@
cached_property
@
cached_property
def
vocab
(
self
)
->
D
ict
[
str
,
int
]:
def
vocab
(
self
)
->
d
ict
[
str
,
int
]:
# NOTE: Only PreTrainedTokenizerFast is guaranteed to have .vocab
# NOTE: Only PreTrainedTokenizerFast is guaranteed to have .vocab
# whereas all tokenizers have .get_vocab()
# whereas all tokenizers have .get_vocab()
return
self
.
model_tokenizer
.
get_vocab
()
return
self
.
model_tokenizer
.
get_vocab
()
def
extract_reasoning_content
(
def
extract_reasoning_content
(
self
,
model_output
:
str
,
request
:
ChatCompletionRequest
self
,
model_output
:
str
,
request
:
ChatCompletionRequest
)
->
T
uple
[
Optional
[
str
],
Optional
[
str
]]:
)
->
t
uple
[
Optional
[
str
],
Optional
[
str
]]:
"""
"""
Extract reasoning content from a complete model-generated string.
Extract reasoning content from a complete model-generated string.
...
@@ -47,7 +48,7 @@ class ReasoningParser:
...
@@ -47,7 +48,7 @@ class ReasoningParser:
The request object that was used to generate the model_output.
The request object that was used to generate the model_output.
Returns:
Returns:
T
uple[Optional[str], Optional[str]]
t
uple[Optional[str], Optional[str]]
A tuple containing the reasoning content and the content.
A tuple containing the reasoning content and the content.
"""
"""
...
@@ -77,10 +78,10 @@ class ReasoningParser:
...
@@ -77,10 +78,10 @@ class ReasoningParser:
class
ReasoningParserManager
:
class
ReasoningParserManager
:
reasoning_parsers
:
D
ict
[
str
,
T
ype
]
=
{}
reasoning_parsers
:
d
ict
[
str
,
t
ype
]
=
{}
@
classmethod
@
classmethod
def
get_reasoning_parser
(
cls
,
name
)
->
T
ype
:
def
get_reasoning_parser
(
cls
,
name
)
->
t
ype
:
"""
"""
Get reasoning parser by name which is registered by `register_module`.
Get reasoning parser by name which is registered by `register_module`.
...
@@ -94,8 +95,8 @@ class ReasoningParserManager:
...
@@ -94,8 +95,8 @@ class ReasoningParserManager:
@
classmethod
@
classmethod
def
_register_module
(
cls
,
def
_register_module
(
cls
,
module
:
T
ype
,
module
:
t
ype
,
module_name
:
Optional
[
Union
[
str
,
L
ist
[
str
]]]
=
None
,
module_name
:
Optional
[
Union
[
str
,
l
ist
[
str
]]]
=
None
,
force
:
bool
=
True
)
->
None
:
force
:
bool
=
True
)
->
None
:
if
not
issubclass
(
module
,
ReasoningParser
):
if
not
issubclass
(
module
,
ReasoningParser
):
raise
TypeError
(
"module must be subclass of ReasoningParser, "
raise
TypeError
(
"module must be subclass of ReasoningParser, "
...
@@ -114,9 +115,9 @@ class ReasoningParserManager:
...
@@ -114,9 +115,9 @@ class ReasoningParserManager:
@
classmethod
@
classmethod
def
register_module
(
def
register_module
(
cls
,
cls
,
name
:
Optional
[
Union
[
str
,
L
ist
[
str
]]]
=
None
,
name
:
Optional
[
Union
[
str
,
l
ist
[
str
]]]
=
None
,
force
:
bool
=
True
,
force
:
bool
=
True
,
module
:
Union
[
T
ype
,
None
]
=
None
)
->
Union
[
type
,
Callable
]:
module
:
Union
[
t
ype
,
None
]
=
None
)
->
Union
[
type
,
Callable
]:
"""
"""
Register module with the given name or name list. it can be used as a
Register module with the given name or name list. it can be used as a
decoder(with module as None) or normal function(with module as not
decoder(with module as None) or normal function(with module as not
...
...
vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
re
import
re
from
typing
import
Optional
,
Sequence
,
Tuple
,
Union
from
collections.abc
import
Sequence
from
typing
import
Optional
,
Union
from
transformers
import
PreTrainedTokenizerBase
from
transformers
import
PreTrainedTokenizerBase
...
@@ -122,7 +123,7 @@ class DeepSeekR1ReasoningParser(ReasoningParser):
...
@@ -122,7 +123,7 @@ class DeepSeekR1ReasoningParser(ReasoningParser):
def
extract_reasoning_content
(
def
extract_reasoning_content
(
self
,
model_output
:
str
,
request
:
ChatCompletionRequest
self
,
model_output
:
str
,
request
:
ChatCompletionRequest
)
->
T
uple
[
Optional
[
str
],
Optional
[
str
]]:
)
->
t
uple
[
Optional
[
str
],
Optional
[
str
]]:
# DeepSeek R1 doesn't generate <think> now.
# DeepSeek R1 doesn't generate <think> now.
# Thus we assume the reasoning content is always at the start.
# Thus we assume the reasoning content is always at the start.
...
...
vllm/entrypoints/openai/run_batch.py
View file @
cf069aa8
...
@@ -2,9 +2,10 @@
...
@@ -2,9 +2,10 @@
import
asyncio
import
asyncio
import
tempfile
import
tempfile
from
collections.abc
import
Awaitable
from
http
import
HTTPStatus
from
http
import
HTTPStatus
from
io
import
StringIO
from
io
import
StringIO
from
typing
import
Awaitable
,
Callable
,
List
,
Optional
from
typing
import
Callable
,
Optional
import
aiohttp
import
aiohttp
import
torch
import
torch
...
@@ -143,7 +144,7 @@ async def read_file(path_or_url: str) -> str:
...
@@ -143,7 +144,7 @@ async def read_file(path_or_url: str) -> str:
async
def
write_local_file
(
output_path
:
str
,
async
def
write_local_file
(
output_path
:
str
,
batch_outputs
:
L
ist
[
BatchRequestOutput
])
->
None
:
batch_outputs
:
l
ist
[
BatchRequestOutput
])
->
None
:
"""
"""
Write the responses to a local file.
Write the responses to a local file.
output_path: The path to write the responses to.
output_path: The path to write the responses to.
...
@@ -204,7 +205,7 @@ async def upload_data(output_url: str, data_or_file: str,
...
@@ -204,7 +205,7 @@ async def upload_data(output_url: str, data_or_file: str,
f
"Error message:
{
str
(
e
)
}
."
)
from
e
f
"Error message:
{
str
(
e
)
}
."
)
from
e
async
def
write_file
(
path_or_url
:
str
,
batch_outputs
:
L
ist
[
BatchRequestOutput
],
async
def
write_file
(
path_or_url
:
str
,
batch_outputs
:
l
ist
[
BatchRequestOutput
],
output_tmp_dir
:
str
)
->
None
:
output_tmp_dir
:
str
)
->
None
:
"""
"""
Write batch_outputs to a file or upload to a URL.
Write batch_outputs to a file or upload to a URL.
...
@@ -353,7 +354,7 @@ async def main(args):
...
@@ -353,7 +354,7 @@ async def main(args):
logger
.
info
(
"Reading batch from %s..."
,
args
.
input_file
)
logger
.
info
(
"Reading batch from %s..."
,
args
.
input_file
)
# Submit all requests in the file to the engine "concurrently".
# Submit all requests in the file to the engine "concurrently".
response_futures
:
L
ist
[
Awaitable
[
BatchRequestOutput
]]
=
[]
response_futures
:
l
ist
[
Awaitable
[
BatchRequestOutput
]]
=
[]
for
request_json
in
(
await
read_file
(
args
.
input_file
)).
strip
().
split
(
"
\n
"
):
for
request_json
in
(
await
read_file
(
args
.
input_file
)).
strip
().
split
(
"
\n
"
):
# Skip empty lines.
# Skip empty lines.
request_json
=
request_json
.
strip
()
request_json
=
request_json
.
strip
()
...
...
vllm/entrypoints/openai/serving_chat.py
View file @
cf069aa8
...
@@ -3,10 +3,9 @@
...
@@ -3,10 +3,9 @@
import
asyncio
import
asyncio
import
json
import
json
import
time
import
time
from
typing
import
(
AsyncGenerator
,
AsyncIterator
,
Callable
,
Dict
,
Final
,
List
,
from
collections.abc
import
AsyncGenerator
,
AsyncIterator
Optional
)
from
collections.abc
import
Sequence
as
GenericSequence
from
typing
import
Sequence
as
GenericSequence
from
typing
import
Callable
,
Final
,
Optional
,
Union
from
typing
import
Union
from
fastapi
import
Request
from
fastapi
import
Request
...
@@ -205,7 +204,7 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -205,7 +204,7 @@ class OpenAIServingChat(OpenAIServing):
raw_request
.
state
.
request_metadata
=
request_metadata
raw_request
.
state
.
request_metadata
=
request_metadata
# Schedule the request and get the result generator.
# Schedule the request and get the result generator.
generators
:
L
ist
[
AsyncGenerator
[
RequestOutput
,
None
]]
=
[]
generators
:
l
ist
[
AsyncGenerator
[
RequestOutput
,
None
]]
=
[]
try
:
try
:
for
i
,
engine_prompt
in
enumerate
(
engine_prompts
):
for
i
,
engine_prompt
in
enumerate
(
engine_prompts
):
sampling_params
:
Union
[
SamplingParams
,
BeamSearchParams
]
sampling_params
:
Union
[
SamplingParams
,
BeamSearchParams
]
...
@@ -282,7 +281,7 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -282,7 +281,7 @@ class OpenAIServingChat(OpenAIServing):
result_generator
:
AsyncIterator
[
RequestOutput
],
result_generator
:
AsyncIterator
[
RequestOutput
],
request_id
:
str
,
request_id
:
str
,
model_name
:
str
,
model_name
:
str
,
conversation
:
L
ist
[
ConversationMessage
],
conversation
:
l
ist
[
ConversationMessage
],
tokenizer
:
AnyTokenizer
,
tokenizer
:
AnyTokenizer
,
request_metadata
:
RequestResponseMetadata
,
request_metadata
:
RequestResponseMetadata
,
)
->
AsyncGenerator
[
str
,
None
]:
)
->
AsyncGenerator
[
str
,
None
]:
...
@@ -310,7 +309,7 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -310,7 +309,7 @@ class OpenAIServingChat(OpenAIServing):
should_stream_with_reasoning_parsing
=
(
should_stream_with_reasoning_parsing
=
(
self
.
_should_stream_with_reasoning_parsing
(
request
))
self
.
_should_stream_with_reasoning_parsing
(
request
))
all_previous_token_ids
:
Optional
[
L
ist
[
L
ist
[
int
]]]
all_previous_token_ids
:
Optional
[
l
ist
[
l
ist
[
int
]]]
# Only one of these will be used, thus previous_texts and
# Only one of these will be used, thus previous_texts and
# all_previous_token_ids will not be used twice in the same iteration.
# all_previous_token_ids will not be used twice in the same iteration.
...
@@ -339,7 +338,7 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -339,7 +338,7 @@ class OpenAIServingChat(OpenAIServing):
# Prepare the tool parser if it's needed
# Prepare the tool parser if it's needed
try
:
try
:
if
tool_choice_auto
and
self
.
tool_parser
:
if
tool_choice_auto
and
self
.
tool_parser
:
tool_parsers
:
L
ist
[
Optional
[
ToolParser
]]
=
[
tool_parsers
:
l
ist
[
Optional
[
ToolParser
]]
=
[
self
.
tool_parser
(
tokenizer
)
self
.
tool_parser
(
tokenizer
)
]
*
num_choices
]
*
num_choices
else
:
else
:
...
@@ -406,7 +405,7 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -406,7 +405,7 @@ class OpenAIServingChat(OpenAIServing):
# Send response to echo the input portion of the
# Send response to echo the input portion of the
# last message
# last message
if
request
.
echo
:
if
request
.
echo
:
last_msg_content
:
Union
[
str
,
L
ist
[
D
ict
[
str
,
str
]]]
=
""
last_msg_content
:
Union
[
str
,
l
ist
[
d
ict
[
str
,
str
]]]
=
""
if
conversation
and
"content"
in
conversation
[
if
conversation
and
"content"
in
conversation
[
-
1
]
and
conversation
[
-
1
].
get
(
"role"
)
==
role
:
-
1
]
and
conversation
[
-
1
].
get
(
"role"
)
==
role
:
last_msg_content
=
conversation
[
-
1
][
"content"
]
or
""
last_msg_content
=
conversation
[
-
1
][
"content"
]
or
""
...
@@ -674,7 +673,7 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -674,7 +673,7 @@ class OpenAIServingChat(OpenAIServing):
result_generator
:
AsyncIterator
[
RequestOutput
],
result_generator
:
AsyncIterator
[
RequestOutput
],
request_id
:
str
,
request_id
:
str
,
model_name
:
str
,
model_name
:
str
,
conversation
:
L
ist
[
ConversationMessage
],
conversation
:
l
ist
[
ConversationMessage
],
tokenizer
:
AnyTokenizer
,
tokenizer
:
AnyTokenizer
,
request_metadata
:
RequestResponseMetadata
,
request_metadata
:
RequestResponseMetadata
,
)
->
Union
[
ErrorResponse
,
ChatCompletionResponse
]:
)
->
Union
[
ErrorResponse
,
ChatCompletionResponse
]:
...
@@ -693,7 +692,7 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -693,7 +692,7 @@ class OpenAIServingChat(OpenAIServing):
assert
final_res
is
not
None
assert
final_res
is
not
None
choices
:
L
ist
[
ChatCompletionResponseChoice
]
=
[]
choices
:
l
ist
[
ChatCompletionResponseChoice
]
=
[]
role
=
self
.
get_chat_request_role
(
request
)
role
=
self
.
get_chat_request_role
(
request
)
for
output
in
final_res
.
outputs
:
for
output
in
final_res
.
outputs
:
...
@@ -812,7 +811,7 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -812,7 +811,7 @@ class OpenAIServingChat(OpenAIServing):
choices
.
append
(
choice_data
)
choices
.
append
(
choice_data
)
if
request
.
echo
:
if
request
.
echo
:
last_msg_content
:
Union
[
str
,
L
ist
[
D
ict
[
str
,
str
]]]
=
""
last_msg_content
:
Union
[
str
,
l
ist
[
d
ict
[
str
,
str
]]]
=
""
if
conversation
and
"content"
in
conversation
[
-
1
]
and
conversation
[
if
conversation
and
"content"
in
conversation
[
-
1
]
and
conversation
[
-
1
].
get
(
"role"
)
==
role
:
-
1
].
get
(
"role"
)
==
role
:
last_msg_content
=
conversation
[
-
1
][
"content"
]
or
""
last_msg_content
=
conversation
[
-
1
][
"content"
]
or
""
...
@@ -853,8 +852,8 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -853,8 +852,8 @@ class OpenAIServingChat(OpenAIServing):
return
response
return
response
def
_get_top_logprobs
(
def
_get_top_logprobs
(
self
,
logprobs
:
D
ict
[
int
,
Logprob
],
top_logprobs
:
Optional
[
int
],
self
,
logprobs
:
d
ict
[
int
,
Logprob
],
top_logprobs
:
Optional
[
int
],
tokenizer
:
AnyTokenizer
)
->
L
ist
[
ChatCompletionLogProb
]:
tokenizer
:
AnyTokenizer
)
->
l
ist
[
ChatCompletionLogProb
]:
return
[
return
[
ChatCompletionLogProb
(
token
=
(
token
:
=
self
.
_get_decoded_token
(
ChatCompletionLogProb
(
token
=
(
token
:
=
self
.
_get_decoded_token
(
p
[
1
],
p
[
1
],
...
@@ -871,12 +870,12 @@ class OpenAIServingChat(OpenAIServing):
...
@@ -871,12 +870,12 @@ class OpenAIServingChat(OpenAIServing):
def
_create_chat_logprobs
(
def
_create_chat_logprobs
(
self
,
self
,
token_ids
:
GenericSequence
[
int
],
token_ids
:
GenericSequence
[
int
],
top_logprobs
:
GenericSequence
[
Optional
[
D
ict
[
int
,
Logprob
]]],
top_logprobs
:
GenericSequence
[
Optional
[
d
ict
[
int
,
Logprob
]]],
tokenizer
:
AnyTokenizer
,
tokenizer
:
AnyTokenizer
,
num_output_top_logprobs
:
Optional
[
int
]
=
None
,
num_output_top_logprobs
:
Optional
[
int
]
=
None
,
)
->
ChatCompletionLogProbs
:
)
->
ChatCompletionLogProbs
:
"""Create OpenAI-style logprobs."""
"""Create OpenAI-style logprobs."""
logprobs_content
:
L
ist
[
ChatCompletionLogProbsContent
]
=
[]
logprobs_content
:
l
ist
[
ChatCompletionLogProbsContent
]
=
[]
for
i
,
token_id
in
enumerate
(
token_ids
):
for
i
,
token_id
in
enumerate
(
token_ids
):
step_top_logprobs
=
top_logprobs
[
i
]
step_top_logprobs
=
top_logprobs
[
i
]
...
...
vllm/entrypoints/openai/serving_completion.py
View file @
cf069aa8
...
@@ -2,9 +2,9 @@
...
@@ -2,9 +2,9 @@
import
asyncio
import
asyncio
import
time
import
time
from
typing
import
AsyncGenerator
,
AsyncIterator
,
Dict
,
List
,
Optional
from
collections.abc
import
AsyncGenerator
,
AsyncIterator
from
typing
import
Sequence
as
GenericSequence
from
collections.abc
import
Sequence
as
GenericSequence
from
typing
import
Tuple
,
Union
,
cast
from
typing
import
Optional
,
Union
,
cast
from
fastapi
import
Request
from
fastapi
import
Request
...
@@ -113,7 +113,7 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -113,7 +113,7 @@ class OpenAIServingCompletion(OpenAIServing):
return
self
.
create_error_response
(
str
(
e
))
return
self
.
create_error_response
(
str
(
e
))
# Schedule the request and get the result generator.
# Schedule the request and get the result generator.
generators
:
L
ist
[
AsyncGenerator
[
RequestOutput
,
None
]]
=
[]
generators
:
l
ist
[
AsyncGenerator
[
RequestOutput
,
None
]]
=
[]
try
:
try
:
for
i
,
engine_prompt
in
enumerate
(
engine_prompts
):
for
i
,
engine_prompt
in
enumerate
(
engine_prompts
):
sampling_params
:
Union
[
SamplingParams
,
BeamSearchParams
]
sampling_params
:
Union
[
SamplingParams
,
BeamSearchParams
]
...
@@ -189,7 +189,7 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -189,7 +189,7 @@ class OpenAIServingCompletion(OpenAIServing):
request_metadata
=
request_metadata
)
request_metadata
=
request_metadata
)
# Non-streaming response
# Non-streaming response
final_res_batch
:
L
ist
[
Optional
[
RequestOutput
]]
=
[
None
]
*
num_prompts
final_res_batch
:
l
ist
[
Optional
[
RequestOutput
]]
=
[
None
]
*
num_prompts
try
:
try
:
async
for
i
,
res
in
result_generator
:
async
for
i
,
res
in
result_generator
:
final_res_batch
[
i
]
=
res
final_res_batch
[
i
]
=
res
...
@@ -203,7 +203,7 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -203,7 +203,7 @@ class OpenAIServingCompletion(OpenAIServing):
if
final_res
.
prompt
is
None
:
if
final_res
.
prompt
is
None
:
final_res
.
prompt
=
request_prompts
[
i
][
"prompt"
]
final_res
.
prompt
=
request_prompts
[
i
][
"prompt"
]
final_res_batch_checked
=
cast
(
L
ist
[
RequestOutput
],
final_res_batch_checked
=
cast
(
l
ist
[
RequestOutput
],
final_res_batch
)
final_res_batch
)
response
=
self
.
request_output_to_completion_response
(
response
=
self
.
request_output_to_completion_response
(
...
@@ -237,7 +237,7 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -237,7 +237,7 @@ class OpenAIServingCompletion(OpenAIServing):
async
def
completion_stream_generator
(
async
def
completion_stream_generator
(
self
,
self
,
request
:
CompletionRequest
,
request
:
CompletionRequest
,
result_generator
:
AsyncIterator
[
T
uple
[
int
,
RequestOutput
]],
result_generator
:
AsyncIterator
[
t
uple
[
int
,
RequestOutput
]],
request_id
:
str
,
request_id
:
str
,
created_time
:
int
,
created_time
:
int
,
model_name
:
str
,
model_name
:
str
,
...
@@ -270,7 +270,7 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -270,7 +270,7 @@ class OpenAIServingCompletion(OpenAIServing):
num_prompt_tokens
[
prompt_idx
]
=
len
(
res
.
prompt_token_ids
)
num_prompt_tokens
[
prompt_idx
]
=
len
(
res
.
prompt_token_ids
)
delta_token_ids
:
GenericSequence
[
int
]
delta_token_ids
:
GenericSequence
[
int
]
out_logprobs
:
Optional
[
GenericSequence
[
Optional
[
D
ict
[
out_logprobs
:
Optional
[
GenericSequence
[
Optional
[
d
ict
[
int
,
Logprob
]]]]
int
,
Logprob
]]]]
for
output
in
res
.
outputs
:
for
output
in
res
.
outputs
:
...
@@ -381,7 +381,7 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -381,7 +381,7 @@ class OpenAIServingCompletion(OpenAIServing):
def
request_output_to_completion_response
(
def
request_output_to_completion_response
(
self
,
self
,
final_res_batch
:
L
ist
[
RequestOutput
],
final_res_batch
:
l
ist
[
RequestOutput
],
request
:
CompletionRequest
,
request
:
CompletionRequest
,
request_id
:
str
,
request_id
:
str
,
created_time
:
int
,
created_time
:
int
,
...
@@ -389,7 +389,7 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -389,7 +389,7 @@ class OpenAIServingCompletion(OpenAIServing):
tokenizer
:
AnyTokenizer
,
tokenizer
:
AnyTokenizer
,
request_metadata
:
RequestResponseMetadata
,
request_metadata
:
RequestResponseMetadata
,
)
->
CompletionResponse
:
)
->
CompletionResponse
:
choices
:
L
ist
[
CompletionResponseChoice
]
=
[]
choices
:
l
ist
[
CompletionResponseChoice
]
=
[]
num_prompt_tokens
=
0
num_prompt_tokens
=
0
num_generated_tokens
=
0
num_generated_tokens
=
0
...
@@ -406,7 +406,7 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -406,7 +406,7 @@ class OpenAIServingCompletion(OpenAIServing):
prompt_text
=
final_res
.
prompt
prompt_text
=
final_res
.
prompt
token_ids
:
GenericSequence
[
int
]
token_ids
:
GenericSequence
[
int
]
out_logprobs
:
Optional
[
GenericSequence
[
Optional
[
D
ict
[
int
,
out_logprobs
:
Optional
[
GenericSequence
[
Optional
[
d
ict
[
int
,
Logprob
]]]]
Logprob
]]]]
for
output
in
final_res
.
outputs
:
for
output
in
final_res
.
outputs
:
...
@@ -480,16 +480,16 @@ class OpenAIServingCompletion(OpenAIServing):
...
@@ -480,16 +480,16 @@ class OpenAIServingCompletion(OpenAIServing):
def
_create_completion_logprobs
(
def
_create_completion_logprobs
(
self
,
self
,
token_ids
:
GenericSequence
[
int
],
token_ids
:
GenericSequence
[
int
],
top_logprobs
:
GenericSequence
[
Optional
[
D
ict
[
int
,
Logprob
]]],
top_logprobs
:
GenericSequence
[
Optional
[
d
ict
[
int
,
Logprob
]]],
num_output_top_logprobs
:
int
,
num_output_top_logprobs
:
int
,
tokenizer
:
AnyTokenizer
,
tokenizer
:
AnyTokenizer
,
initial_text_offset
:
int
=
0
,
initial_text_offset
:
int
=
0
,
)
->
CompletionLogProbs
:
)
->
CompletionLogProbs
:
"""Create logprobs for OpenAI Completion API."""
"""Create logprobs for OpenAI Completion API."""
out_text_offset
:
L
ist
[
int
]
=
[]
out_text_offset
:
l
ist
[
int
]
=
[]
out_token_logprobs
:
L
ist
[
Optional
[
float
]]
=
[]
out_token_logprobs
:
l
ist
[
Optional
[
float
]]
=
[]
out_tokens
:
L
ist
[
str
]
=
[]
out_tokens
:
l
ist
[
str
]
=
[]
out_top_logprobs
:
L
ist
[
Optional
[
D
ict
[
str
,
float
]]]
=
[]
out_top_logprobs
:
l
ist
[
Optional
[
d
ict
[
str
,
float
]]]
=
[]
last_token_len
=
0
last_token_len
=
0
...
...
vllm/entrypoints/openai/serving_embedding.py
View file @
cf069aa8
...
@@ -3,7 +3,8 @@
...
@@ -3,7 +3,8 @@
import
asyncio
import
asyncio
import
base64
import
base64
import
time
import
time
from
typing
import
AsyncGenerator
,
Final
,
List
,
Literal
,
Optional
,
Union
,
cast
from
collections.abc
import
AsyncGenerator
from
typing
import
Final
,
Literal
,
Optional
,
Union
,
cast
import
numpy
as
np
import
numpy
as
np
from
fastapi
import
Request
from
fastapi
import
Request
...
@@ -31,7 +32,7 @@ logger = init_logger(__name__)
...
@@ -31,7 +32,7 @@ logger = init_logger(__name__)
def
_get_embedding
(
def
_get_embedding
(
output
:
EmbeddingOutput
,
output
:
EmbeddingOutput
,
encoding_format
:
Literal
[
"float"
,
"base64"
],
encoding_format
:
Literal
[
"float"
,
"base64"
],
)
->
Union
[
L
ist
[
float
],
str
]:
)
->
Union
[
l
ist
[
float
],
str
]:
if
encoding_format
==
"float"
:
if
encoding_format
==
"float"
:
return
output
.
embedding
return
output
.
embedding
elif
encoding_format
==
"base64"
:
elif
encoding_format
==
"base64"
:
...
@@ -143,7 +144,7 @@ class OpenAIServingEmbedding(OpenAIServing):
...
@@ -143,7 +144,7 @@ class OpenAIServingEmbedding(OpenAIServing):
return
self
.
create_error_response
(
str
(
e
))
return
self
.
create_error_response
(
str
(
e
))
# Schedule the request and get the result generator.
# Schedule the request and get the result generator.
generators
:
L
ist
[
AsyncGenerator
[
PoolingRequestOutput
,
None
]]
=
[]
generators
:
l
ist
[
AsyncGenerator
[
PoolingRequestOutput
,
None
]]
=
[]
try
:
try
:
pooling_params
=
request
.
to_pooling_params
()
pooling_params
=
request
.
to_pooling_params
()
...
@@ -178,7 +179,7 @@ class OpenAIServingEmbedding(OpenAIServing):
...
@@ -178,7 +179,7 @@ class OpenAIServingEmbedding(OpenAIServing):
num_prompts
=
len
(
engine_prompts
)
num_prompts
=
len
(
engine_prompts
)
# Non-streaming response
# Non-streaming response
final_res_batch
:
L
ist
[
Optional
[
PoolingRequestOutput
]]
final_res_batch
:
l
ist
[
Optional
[
PoolingRequestOutput
]]
final_res_batch
=
[
None
]
*
num_prompts
final_res_batch
=
[
None
]
*
num_prompts
try
:
try
:
async
for
i
,
res
in
result_generator
:
async
for
i
,
res
in
result_generator
:
...
@@ -186,7 +187,7 @@ class OpenAIServingEmbedding(OpenAIServing):
...
@@ -186,7 +187,7 @@ class OpenAIServingEmbedding(OpenAIServing):
assert
all
(
final_res
is
not
None
for
final_res
in
final_res_batch
)
assert
all
(
final_res
is
not
None
for
final_res
in
final_res_batch
)
final_res_batch_checked
=
cast
(
L
ist
[
PoolingRequestOutput
],
final_res_batch_checked
=
cast
(
l
ist
[
PoolingRequestOutput
],
final_res_batch
)
final_res_batch
)
response
=
self
.
request_output_to_embedding_response
(
response
=
self
.
request_output_to_embedding_response
(
...
@@ -206,13 +207,13 @@ class OpenAIServingEmbedding(OpenAIServing):
...
@@ -206,13 +207,13 @@ class OpenAIServingEmbedding(OpenAIServing):
def
request_output_to_embedding_response
(
def
request_output_to_embedding_response
(
self
,
self
,
final_res_batch
:
L
ist
[
PoolingRequestOutput
],
final_res_batch
:
l
ist
[
PoolingRequestOutput
],
request_id
:
str
,
request_id
:
str
,
created_time
:
int
,
created_time
:
int
,
model_name
:
str
,
model_name
:
str
,
encoding_format
:
Literal
[
"float"
,
"base64"
],
encoding_format
:
Literal
[
"float"
,
"base64"
],
)
->
EmbeddingResponse
:
)
->
EmbeddingResponse
:
items
:
L
ist
[
EmbeddingResponseData
]
=
[]
items
:
l
ist
[
EmbeddingResponseData
]
=
[]
num_prompt_tokens
=
0
num_prompt_tokens
=
0
for
idx
,
final_res
in
enumerate
(
final_res_batch
):
for
idx
,
final_res
in
enumerate
(
final_res_batch
):
...
...
vllm/entrypoints/openai/serving_engine.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
json
import
json
from
collections.abc
import
Iterable
,
Iterator
,
Mapping
,
Sequence
from
concurrent.futures.thread
import
ThreadPoolExecutor
from
concurrent.futures.thread
import
ThreadPoolExecutor
from
http
import
HTTPStatus
from
http
import
HTTPStatus
from
typing
import
(
Any
,
Callable
,
Dict
,
Iterable
,
Iterator
,
List
,
Mapping
,
from
typing
import
Annotated
,
Any
,
Callable
,
Optional
,
TypedDict
,
Union
Optional
,
Sequence
,
Tuple
,
TypedDict
,
Union
)
from
fastapi
import
Request
from
fastapi
import
Request
from
pydantic
import
Field
from
pydantic
import
Field
from
starlette.datastructures
import
Headers
from
starlette.datastructures
import
Headers
from
typing_extensions
import
Annotated
from
vllm.config
import
ModelConfig
from
vllm.config
import
ModelConfig
from
vllm.engine.protocol
import
EngineClient
from
vllm.engine.protocol
import
EngineClient
...
@@ -64,10 +63,10 @@ AnyRequest = Union[CompletionLikeRequest, ChatLikeRequest,
...
@@ -64,10 +63,10 @@ AnyRequest = Union[CompletionLikeRequest, ChatLikeRequest,
class
TextTokensPrompt
(
TypedDict
):
class
TextTokensPrompt
(
TypedDict
):
prompt
:
str
prompt
:
str
prompt_token_ids
:
L
ist
[
int
]
prompt_token_ids
:
l
ist
[
int
]
RequestPrompt
=
Union
[
L
ist
[
int
],
str
,
TextTokensPrompt
]
RequestPrompt
=
Union
[
l
ist
[
int
],
str
,
TextTokensPrompt
]
class
OpenAIServing
:
class
OpenAIServing
:
...
@@ -144,7 +143,7 @@ class OpenAIServing:
...
@@ -144,7 +143,7 @@ class OpenAIServing:
def
_maybe_get_adapters
(
def
_maybe_get_adapters
(
self
,
request
:
AnyRequest
self
,
request
:
AnyRequest
)
->
Union
[
T
uple
[
None
,
None
],
T
uple
[
LoRARequest
,
None
],
T
uple
[
)
->
Union
[
t
uple
[
None
,
None
],
t
uple
[
LoRARequest
,
None
],
t
uple
[
None
,
PromptAdapterRequest
]]:
None
,
PromptAdapterRequest
]]:
if
self
.
_is_model_supported
(
request
.
model
):
if
self
.
_is_model_supported
(
request
.
model
):
return
None
,
None
return
None
,
None
...
@@ -188,7 +187,7 @@ class OpenAIServing:
...
@@ -188,7 +187,7 @@ class OpenAIServing:
self
,
self
,
request
:
AnyRequest
,
request
:
AnyRequest
,
tokenizer
:
AnyTokenizer
,
tokenizer
:
AnyTokenizer
,
prompt_ids
:
L
ist
[
int
],
prompt_ids
:
l
ist
[
int
],
truncate_prompt_tokens
:
Optional
[
Annotated
[
int
,
Field
(
ge
=
1
)]],
truncate_prompt_tokens
:
Optional
[
Annotated
[
int
,
Field
(
ge
=
1
)]],
)
->
TextTokensPrompt
:
)
->
TextTokensPrompt
:
if
truncate_prompt_tokens
is
None
:
if
truncate_prompt_tokens
is
None
:
...
@@ -203,7 +202,7 @@ class OpenAIServing:
...
@@ -203,7 +202,7 @@ class OpenAIServing:
def
_validate_input
(
def
_validate_input
(
self
,
self
,
request
:
AnyRequest
,
request
:
AnyRequest
,
input_ids
:
L
ist
[
int
],
input_ids
:
l
ist
[
int
],
input_text
:
str
,
input_text
:
str
,
)
->
TextTokensPrompt
:
)
->
TextTokensPrompt
:
token_num
=
len
(
input_ids
)
token_num
=
len
(
input_ids
)
...
@@ -259,7 +258,7 @@ class OpenAIServing:
...
@@ -259,7 +258,7 @@ class OpenAIServing:
self
,
self
,
request
:
AnyRequest
,
request
:
AnyRequest
,
tokenizer
:
AnyTokenizer
,
tokenizer
:
AnyTokenizer
,
prompt_input
:
Union
[
str
,
L
ist
[
int
]],
prompt_input
:
Union
[
str
,
l
ist
[
int
]],
truncate_prompt_tokens
:
Optional
[
Annotated
[
int
,
Field
(
ge
=
1
)]]
=
None
,
truncate_prompt_tokens
:
Optional
[
Annotated
[
int
,
Field
(
ge
=
1
)]]
=
None
,
add_special_tokens
:
bool
=
True
,
add_special_tokens
:
bool
=
True
,
)
->
TextTokensPrompt
:
)
->
TextTokensPrompt
:
...
@@ -280,7 +279,7 @@ class OpenAIServing:
...
@@ -280,7 +279,7 @@ class OpenAIServing:
self
,
self
,
request
:
AnyRequest
,
request
:
AnyRequest
,
tokenizer
:
AnyTokenizer
,
tokenizer
:
AnyTokenizer
,
prompt_inputs
:
Iterable
[
Union
[
str
,
L
ist
[
int
]]],
prompt_inputs
:
Iterable
[
Union
[
str
,
l
ist
[
int
]]],
truncate_prompt_tokens
:
Optional
[
Annotated
[
int
,
Field
(
ge
=
1
)]]
=
None
,
truncate_prompt_tokens
:
Optional
[
Annotated
[
int
,
Field
(
ge
=
1
)]]
=
None
,
add_special_tokens
:
bool
=
True
,
add_special_tokens
:
bool
=
True
,
)
->
Iterator
[
TextTokensPrompt
]:
)
->
Iterator
[
TextTokensPrompt
]:
...
@@ -309,10 +308,10 @@ class OpenAIServing:
...
@@ -309,10 +308,10 @@ class OpenAIServing:
self
,
self
,
request
:
AnyRequest
,
request
:
AnyRequest
,
tokenizer
:
AnyTokenizer
,
tokenizer
:
AnyTokenizer
,
input_or_inputs
:
Union
[
str
,
L
ist
[
str
],
L
ist
[
int
],
L
ist
[
L
ist
[
int
]]],
input_or_inputs
:
Union
[
str
,
l
ist
[
str
],
l
ist
[
int
],
l
ist
[
l
ist
[
int
]]],
truncate_prompt_tokens
:
Optional
[
Annotated
[
int
,
Field
(
ge
=
1
)]]
=
None
,
truncate_prompt_tokens
:
Optional
[
Annotated
[
int
,
Field
(
ge
=
1
)]]
=
None
,
add_special_tokens
:
bool
=
True
,
add_special_tokens
:
bool
=
True
,
)
->
L
ist
[
TextTokensPrompt
]:
)
->
l
ist
[
TextTokensPrompt
]:
"""
"""
Tokenize/detokenize depending on the input format.
Tokenize/detokenize depending on the input format.
...
@@ -344,10 +343,10 @@ class OpenAIServing:
...
@@ -344,10 +343,10 @@ class OpenAIServing:
self
,
self
,
request
:
CompletionLikeRequest
,
request
:
CompletionLikeRequest
,
tokenizer
:
AnyTokenizer
,
tokenizer
:
AnyTokenizer
,
input_or_inputs
:
Union
[
str
,
L
ist
[
str
],
L
ist
[
int
],
L
ist
[
L
ist
[
int
]]],
input_or_inputs
:
Union
[
str
,
l
ist
[
str
],
l
ist
[
int
],
l
ist
[
l
ist
[
int
]]],
truncate_prompt_tokens
:
Optional
[
Annotated
[
int
,
Field
(
ge
=
1
)]]
=
None
,
truncate_prompt_tokens
:
Optional
[
Annotated
[
int
,
Field
(
ge
=
1
)]]
=
None
,
add_special_tokens
:
bool
=
True
,
add_special_tokens
:
bool
=
True
,
)
->
T
uple
[
L
ist
[
TextTokensPrompt
],
L
ist
[
TokensPrompt
]]:
)
->
t
uple
[
l
ist
[
TextTokensPrompt
],
l
ist
[
TokensPrompt
]]:
request_prompts
=
await
self
.
_tokenize_prompt_input_or_inputs_async
(
request_prompts
=
await
self
.
_tokenize_prompt_input_or_inputs_async
(
request
,
request
,
tokenizer
,
tokenizer
,
...
@@ -367,19 +366,19 @@ class OpenAIServing:
...
@@ -367,19 +366,19 @@ class OpenAIServing:
self
,
self
,
request
:
ChatLikeRequest
,
request
:
ChatLikeRequest
,
tokenizer
:
AnyTokenizer
,
tokenizer
:
AnyTokenizer
,
messages
:
L
ist
[
ChatCompletionMessageParam
],
messages
:
l
ist
[
ChatCompletionMessageParam
],
chat_template
:
Optional
[
str
],
chat_template
:
Optional
[
str
],
chat_template_content_format
:
ChatTemplateContentFormatOption
,
chat_template_content_format
:
ChatTemplateContentFormatOption
,
add_generation_prompt
:
bool
=
True
,
add_generation_prompt
:
bool
=
True
,
continue_final_message
:
bool
=
False
,
continue_final_message
:
bool
=
False
,
tool_dicts
:
Optional
[
L
ist
[
D
ict
[
str
,
Any
]]]
=
None
,
tool_dicts
:
Optional
[
l
ist
[
d
ict
[
str
,
Any
]]]
=
None
,
documents
:
Optional
[
L
ist
[
D
ict
[
str
,
str
]]]
=
None
,
documents
:
Optional
[
l
ist
[
d
ict
[
str
,
str
]]]
=
None
,
chat_template_kwargs
:
Optional
[
D
ict
[
str
,
Any
]]
=
None
,
chat_template_kwargs
:
Optional
[
d
ict
[
str
,
Any
]]
=
None
,
tool_parser
:
Optional
[
Callable
[[
AnyTokenizer
],
ToolParser
]]
=
None
,
tool_parser
:
Optional
[
Callable
[[
AnyTokenizer
],
ToolParser
]]
=
None
,
truncate_prompt_tokens
:
Optional
[
Annotated
[
int
,
Field
(
ge
=
1
)]]
=
None
,
truncate_prompt_tokens
:
Optional
[
Annotated
[
int
,
Field
(
ge
=
1
)]]
=
None
,
add_special_tokens
:
bool
=
False
,
add_special_tokens
:
bool
=
False
,
)
->
T
uple
[
L
ist
[
ConversationMessage
],
Sequence
[
RequestPrompt
],
)
->
t
uple
[
l
ist
[
ConversationMessage
],
Sequence
[
RequestPrompt
],
L
ist
[
TokensPrompt
]]:
l
ist
[
TokensPrompt
]]:
resolved_content_format
=
resolve_chat_template_content_format
(
resolved_content_format
=
resolve_chat_template_content_format
(
chat_template
,
chat_template
,
chat_template_content_format
,
chat_template_content_format
,
...
@@ -392,7 +391,7 @@ class OpenAIServing:
...
@@ -392,7 +391,7 @@ class OpenAIServing:
content_format
=
resolved_content_format
,
content_format
=
resolved_content_format
,
)
)
_chat_template_kwargs
:
D
ict
[
str
,
Any
]
=
dict
(
_chat_template_kwargs
:
d
ict
[
str
,
Any
]
=
dict
(
chat_template
=
chat_template
,
chat_template
=
chat_template
,
add_generation_prompt
=
add_generation_prompt
,
add_generation_prompt
=
add_generation_prompt
,
continue_final_message
=
continue_final_message
,
continue_final_message
=
continue_final_message
,
...
@@ -401,7 +400,7 @@ class OpenAIServing:
...
@@ -401,7 +400,7 @@ class OpenAIServing:
)
)
_chat_template_kwargs
.
update
(
chat_template_kwargs
or
{})
_chat_template_kwargs
.
update
(
chat_template_kwargs
or
{})
request_prompt
:
Union
[
str
,
L
ist
[
int
]]
request_prompt
:
Union
[
str
,
l
ist
[
int
]]
if
isinstance
(
tokenizer
,
MistralTokenizer
):
if
isinstance
(
tokenizer
,
MistralTokenizer
):
request_prompt
=
apply_mistral_chat_template
(
request_prompt
=
apply_mistral_chat_template
(
tokenizer
,
tokenizer
,
...
...
vllm/entrypoints/openai/serving_models.py
View file @
cf069aa8
...
@@ -4,7 +4,7 @@ import json
...
@@ -4,7 +4,7 @@ import json
import
pathlib
import
pathlib
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
http
import
HTTPStatus
from
http
import
HTTPStatus
from
typing
import
List
,
Optional
,
Union
from
typing
import
Optional
,
Union
from
vllm.config
import
ModelConfig
from
vllm.config
import
ModelConfig
from
vllm.engine.protocol
import
EngineClient
from
vllm.engine.protocol
import
EngineClient
...
@@ -53,10 +53,10 @@ class OpenAIServingModels:
...
@@ -53,10 +53,10 @@ class OpenAIServingModels:
self
,
self
,
engine_client
:
EngineClient
,
engine_client
:
EngineClient
,
model_config
:
ModelConfig
,
model_config
:
ModelConfig
,
base_model_paths
:
L
ist
[
BaseModelPath
],
base_model_paths
:
l
ist
[
BaseModelPath
],
*
,
*
,
lora_modules
:
Optional
[
L
ist
[
LoRAModulePath
]]
=
None
,
lora_modules
:
Optional
[
l
ist
[
LoRAModulePath
]]
=
None
,
prompt_adapters
:
Optional
[
L
ist
[
PromptAdapterPath
]]
=
None
,
prompt_adapters
:
Optional
[
l
ist
[
PromptAdapterPath
]]
=
None
,
):
):
super
().
__init__
()
super
().
__init__
()
...
@@ -65,7 +65,7 @@ class OpenAIServingModels:
...
@@ -65,7 +65,7 @@ class OpenAIServingModels:
self
.
engine_client
=
engine_client
self
.
engine_client
=
engine_client
self
.
static_lora_modules
=
lora_modules
self
.
static_lora_modules
=
lora_modules
self
.
lora_requests
:
L
ist
[
LoRARequest
]
=
[]
self
.
lora_requests
:
l
ist
[
LoRARequest
]
=
[]
self
.
lora_id_counter
=
AtomicCounter
(
0
)
self
.
lora_id_counter
=
AtomicCounter
(
0
)
self
.
prompt_adapter_requests
=
[]
self
.
prompt_adapter_requests
=
[]
...
...
vllm/entrypoints/openai/serving_pooling.py
View file @
cf069aa8
...
@@ -3,7 +3,8 @@
...
@@ -3,7 +3,8 @@
import
asyncio
import
asyncio
import
base64
import
base64
import
time
import
time
from
typing
import
AsyncGenerator
,
Final
,
List
,
Literal
,
Optional
,
Union
,
cast
from
collections.abc
import
AsyncGenerator
from
typing
import
Final
,
Literal
,
Optional
,
Union
,
cast
import
numpy
as
np
import
numpy
as
np
from
fastapi
import
Request
from
fastapi
import
Request
...
@@ -29,7 +30,7 @@ logger = init_logger(__name__)
...
@@ -29,7 +30,7 @@ logger = init_logger(__name__)
def
_get_data
(
def
_get_data
(
output
:
PoolingOutput
,
output
:
PoolingOutput
,
encoding_format
:
Literal
[
"float"
,
"base64"
],
encoding_format
:
Literal
[
"float"
,
"base64"
],
)
->
Union
[
L
ist
[
float
],
str
]:
)
->
Union
[
l
ist
[
float
],
str
]:
if
encoding_format
==
"float"
:
if
encoding_format
==
"float"
:
return
output
.
data
.
tolist
()
return
output
.
data
.
tolist
()
elif
encoding_format
==
"base64"
:
elif
encoding_format
==
"base64"
:
...
@@ -139,7 +140,7 @@ class OpenAIServingPooling(OpenAIServing):
...
@@ -139,7 +140,7 @@ class OpenAIServingPooling(OpenAIServing):
return
self
.
create_error_response
(
str
(
e
))
return
self
.
create_error_response
(
str
(
e
))
# Schedule the request and get the result generator.
# Schedule the request and get the result generator.
generators
:
L
ist
[
AsyncGenerator
[
PoolingRequestOutput
,
None
]]
=
[]
generators
:
l
ist
[
AsyncGenerator
[
PoolingRequestOutput
,
None
]]
=
[]
try
:
try
:
pooling_params
=
request
.
to_pooling_params
()
pooling_params
=
request
.
to_pooling_params
()
...
@@ -174,7 +175,7 @@ class OpenAIServingPooling(OpenAIServing):
...
@@ -174,7 +175,7 @@ class OpenAIServingPooling(OpenAIServing):
num_prompts
=
len
(
engine_prompts
)
num_prompts
=
len
(
engine_prompts
)
# Non-streaming response
# Non-streaming response
final_res_batch
:
L
ist
[
Optional
[
PoolingRequestOutput
]]
final_res_batch
:
l
ist
[
Optional
[
PoolingRequestOutput
]]
final_res_batch
=
[
None
]
*
num_prompts
final_res_batch
=
[
None
]
*
num_prompts
try
:
try
:
async
for
i
,
res
in
result_generator
:
async
for
i
,
res
in
result_generator
:
...
@@ -182,7 +183,7 @@ class OpenAIServingPooling(OpenAIServing):
...
@@ -182,7 +183,7 @@ class OpenAIServingPooling(OpenAIServing):
assert
all
(
final_res
is
not
None
for
final_res
in
final_res_batch
)
assert
all
(
final_res
is
not
None
for
final_res
in
final_res_batch
)
final_res_batch_checked
=
cast
(
L
ist
[
PoolingRequestOutput
],
final_res_batch_checked
=
cast
(
l
ist
[
PoolingRequestOutput
],
final_res_batch
)
final_res_batch
)
response
=
self
.
request_output_to_pooling_response
(
response
=
self
.
request_output_to_pooling_response
(
...
@@ -202,13 +203,13 @@ class OpenAIServingPooling(OpenAIServing):
...
@@ -202,13 +203,13 @@ class OpenAIServingPooling(OpenAIServing):
def
request_output_to_pooling_response
(
def
request_output_to_pooling_response
(
self
,
self
,
final_res_batch
:
L
ist
[
PoolingRequestOutput
],
final_res_batch
:
l
ist
[
PoolingRequestOutput
],
request_id
:
str
,
request_id
:
str
,
created_time
:
int
,
created_time
:
int
,
model_name
:
str
,
model_name
:
str
,
encoding_format
:
Literal
[
"float"
,
"base64"
],
encoding_format
:
Literal
[
"float"
,
"base64"
],
)
->
PoolingResponse
:
)
->
PoolingResponse
:
items
:
L
ist
[
PoolingResponseData
]
=
[]
items
:
l
ist
[
PoolingResponseData
]
=
[]
num_prompt_tokens
=
0
num_prompt_tokens
=
0
for
idx
,
final_res
in
enumerate
(
final_res_batch
):
for
idx
,
final_res
in
enumerate
(
final_res_batch
):
...
...
vllm/entrypoints/openai/serving_score.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
asyncio
import
asyncio
import
time
import
time
from
typing
import
Any
,
AsyncGenerator
,
Dict
,
List
,
Mapping
,
Optional
,
Union
from
collections.abc
import
AsyncGenerator
,
Mapping
from
typing
import
Any
,
Optional
,
Union
from
fastapi
import
Request
from
fastapi
import
Request
...
@@ -48,8 +49,8 @@ class ServingScores(OpenAIServing):
...
@@ -48,8 +49,8 @@ class ServingScores(OpenAIServing):
async
def
_embedding_score
(
async
def
_embedding_score
(
self
,
self
,
tokenizer
:
Union
[
PreTrainedTokenizer
,
PreTrainedTokenizerFast
],
tokenizer
:
Union
[
PreTrainedTokenizer
,
PreTrainedTokenizerFast
],
texts_1
:
L
ist
[
str
],
texts_1
:
l
ist
[
str
],
texts_2
:
L
ist
[
str
],
texts_2
:
l
ist
[
str
],
request
:
Union
[
RerankRequest
,
ScoreRequest
],
request
:
Union
[
RerankRequest
,
ScoreRequest
],
request_id
=
str
,
request_id
=
str
,
tokenization_kwargs
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
tokenization_kwargs
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
...
@@ -57,11 +58,11 @@ class ServingScores(OpenAIServing):
...
@@ -57,11 +58,11 @@ class ServingScores(OpenAIServing):
prompt_adapter_request
:
Optional
[
Union
[
PromptAdapterRequest
,
prompt_adapter_request
:
Optional
[
Union
[
PromptAdapterRequest
,
None
]]
=
None
,
None
]]
=
None
,
trace_headers
:
Optional
[
Mapping
[
str
,
str
]]
=
None
,
trace_headers
:
Optional
[
Mapping
[
str
,
str
]]
=
None
,
)
->
L
ist
[
PoolingRequestOutput
]:
)
->
l
ist
[
PoolingRequestOutput
]:
input_texts
=
texts_1
+
texts_2
input_texts
=
texts_1
+
texts_2
engine_prompts
:
L
ist
[
TokensPrompt
]
=
[]
engine_prompts
:
l
ist
[
TokensPrompt
]
=
[]
tokenize_async
=
make_async
(
tokenizer
.
__call__
,
tokenize_async
=
make_async
(
tokenizer
.
__call__
,
executor
=
self
.
_tokenizer_executor
)
executor
=
self
.
_tokenizer_executor
)
...
@@ -82,7 +83,7 @@ class ServingScores(OpenAIServing):
...
@@ -82,7 +83,7 @@ class ServingScores(OpenAIServing):
prompt_token_ids
=
text_token_prompt
[
"prompt_token_ids"
]))
prompt_token_ids
=
text_token_prompt
[
"prompt_token_ids"
]))
# Schedule the request and get the result generator.
# Schedule the request and get the result generator.
generators
:
L
ist
[
AsyncGenerator
[
PoolingRequestOutput
,
None
]]
=
[]
generators
:
l
ist
[
AsyncGenerator
[
PoolingRequestOutput
,
None
]]
=
[]
pooling_params
=
request
.
to_pooling_params
()
pooling_params
=
request
.
to_pooling_params
()
for
i
,
engine_prompt
in
enumerate
(
engine_prompts
):
for
i
,
engine_prompt
in
enumerate
(
engine_prompts
):
...
@@ -108,16 +109,16 @@ class ServingScores(OpenAIServing):
...
@@ -108,16 +109,16 @@ class ServingScores(OpenAIServing):
result_generator
=
merge_async_iterators
(
*
generators
)
result_generator
=
merge_async_iterators
(
*
generators
)
# Non-streaming response
# Non-streaming response
final_res_batch
:
L
ist
[
PoolingRequestOutput
]
=
[]
final_res_batch
:
l
ist
[
PoolingRequestOutput
]
=
[]
embeddings
:
L
ist
[
Optional
[
PoolingRequestOutput
]]
=
\
embeddings
:
l
ist
[
Optional
[
PoolingRequestOutput
]]
=
\
[
None
]
*
len
(
engine_prompts
)
[
None
]
*
len
(
engine_prompts
)
async
for
i
,
res
in
result_generator
:
async
for
i
,
res
in
result_generator
:
embeddings
[
i
]
=
res
embeddings
[
i
]
=
res
emb_texts_1
:
L
ist
[
PoolingRequestOutput
]
=
[]
emb_texts_1
:
l
ist
[
PoolingRequestOutput
]
=
[]
emb_texts_2
:
L
ist
[
PoolingRequestOutput
]
=
[]
emb_texts_2
:
l
ist
[
PoolingRequestOutput
]
=
[]
for
i
in
range
(
0
,
len
(
texts_1
)):
for
i
in
range
(
0
,
len
(
texts_1
)):
assert
(
emb
:
=
embeddings
[
i
])
is
not
None
assert
(
emb
:
=
embeddings
[
i
])
is
not
None
...
@@ -139,8 +140,8 @@ class ServingScores(OpenAIServing):
...
@@ -139,8 +140,8 @@ class ServingScores(OpenAIServing):
async
def
_cross_encoding_score
(
async
def
_cross_encoding_score
(
self
,
self
,
tokenizer
:
Union
[
AnyTokenizer
],
tokenizer
:
Union
[
AnyTokenizer
],
texts_1
:
L
ist
[
str
],
texts_1
:
l
ist
[
str
],
texts_2
:
L
ist
[
str
],
texts_2
:
l
ist
[
str
],
request
:
Union
[
RerankRequest
,
ScoreRequest
],
request
:
Union
[
RerankRequest
,
ScoreRequest
],
request_id
=
str
,
request_id
=
str
,
tokenization_kwargs
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
tokenization_kwargs
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
...
@@ -148,10 +149,10 @@ class ServingScores(OpenAIServing):
...
@@ -148,10 +149,10 @@ class ServingScores(OpenAIServing):
prompt_adapter_request
:
Optional
[
Union
[
PromptAdapterRequest
,
prompt_adapter_request
:
Optional
[
Union
[
PromptAdapterRequest
,
None
]]
=
None
,
None
]]
=
None
,
trace_headers
:
Optional
[
Mapping
[
str
,
str
]]
=
None
,
trace_headers
:
Optional
[
Mapping
[
str
,
str
]]
=
None
,
)
->
L
ist
[
PoolingRequestOutput
]:
)
->
l
ist
[
PoolingRequestOutput
]:
request_prompts
:
L
ist
[
str
]
=
[]
request_prompts
:
l
ist
[
str
]
=
[]
engine_prompts
:
L
ist
[
TokensPrompt
]
=
[]
engine_prompts
:
l
ist
[
TokensPrompt
]
=
[]
if
len
(
texts_1
)
==
1
:
if
len
(
texts_1
)
==
1
:
texts_1
=
texts_1
*
len
(
texts_2
)
texts_1
=
texts_1
*
len
(
texts_2
)
...
@@ -185,7 +186,7 @@ class ServingScores(OpenAIServing):
...
@@ -185,7 +186,7 @@ class ServingScores(OpenAIServing):
engine_prompts
.
append
(
engine_prompt
)
engine_prompts
.
append
(
engine_prompt
)
# Schedule the request and get the result generator.
# Schedule the request and get the result generator.
generators
:
L
ist
[
AsyncGenerator
[
PoolingRequestOutput
,
None
]]
=
[]
generators
:
l
ist
[
AsyncGenerator
[
PoolingRequestOutput
,
None
]]
=
[]
pooling_params
=
request
.
to_pooling_params
()
pooling_params
=
request
.
to_pooling_params
()
...
@@ -212,7 +213,7 @@ class ServingScores(OpenAIServing):
...
@@ -212,7 +213,7 @@ class ServingScores(OpenAIServing):
result_generator
=
merge_async_iterators
(
*
generators
)
result_generator
=
merge_async_iterators
(
*
generators
)
# Non-streaming response
# Non-streaming response
final_res_batch
:
L
ist
[
final_res_batch
:
l
ist
[
Optional
[
PoolingRequestOutput
]]
=
[
None
]
*
len
(
engine_prompts
)
Optional
[
PoolingRequestOutput
]]
=
[
None
]
*
len
(
engine_prompts
)
async
for
i
,
res
in
result_generator
:
async
for
i
,
res
in
result_generator
:
...
@@ -228,9 +229,9 @@ class ServingScores(OpenAIServing):
...
@@ -228,9 +229,9 @@ class ServingScores(OpenAIServing):
request_id
:
str
,
request_id
:
str
,
raw_request
:
Optional
[
Request
]
=
None
,
raw_request
:
Optional
[
Request
]
=
None
,
truncate_prompt_tokens
:
Optional
[
int
]
=
None
,
truncate_prompt_tokens
:
Optional
[
int
]
=
None
,
)
->
L
ist
[
PoolingRequestOutput
]:
)
->
l
ist
[
PoolingRequestOutput
]:
tokenization_kwargs
:
D
ict
[
str
,
Any
]
=
{}
tokenization_kwargs
:
d
ict
[
str
,
Any
]
=
{}
if
truncate_prompt_tokens
is
not
None
:
if
truncate_prompt_tokens
is
not
None
:
tokenization_kwargs
[
"truncation"
]
=
True
tokenization_kwargs
[
"truncation"
]
=
True
tokenization_kwargs
[
"max_length"
]
=
truncate_prompt_tokens
tokenization_kwargs
[
"max_length"
]
=
truncate_prompt_tokens
...
@@ -372,12 +373,12 @@ class ServingScores(OpenAIServing):
...
@@ -372,12 +373,12 @@ class ServingScores(OpenAIServing):
def
request_output_to_score_response
(
def
request_output_to_score_response
(
self
,
self
,
final_res_batch
:
L
ist
[
PoolingRequestOutput
],
final_res_batch
:
l
ist
[
PoolingRequestOutput
],
request_id
:
str
,
request_id
:
str
,
created_time
:
int
,
created_time
:
int
,
model_name
:
str
,
model_name
:
str
,
)
->
ScoreResponse
:
)
->
ScoreResponse
:
items
:
L
ist
[
ScoreResponseData
]
=
[]
items
:
l
ist
[
ScoreResponseData
]
=
[]
num_prompt_tokens
=
0
num_prompt_tokens
=
0
for
idx
,
final_res
in
enumerate
(
final_res_batch
):
for
idx
,
final_res
in
enumerate
(
final_res_batch
):
...
@@ -406,13 +407,13 @@ class ServingScores(OpenAIServing):
...
@@ -406,13 +407,13 @@ class ServingScores(OpenAIServing):
)
)
def
request_output_to_rerank_response
(
def
request_output_to_rerank_response
(
self
,
final_res_batch
:
L
ist
[
PoolingRequestOutput
],
request_id
:
str
,
self
,
final_res_batch
:
l
ist
[
PoolingRequestOutput
],
request_id
:
str
,
model_name
:
str
,
documents
:
L
ist
[
str
],
model_name
:
str
,
documents
:
l
ist
[
str
],
top_n
:
int
)
->
RerankResponse
:
top_n
:
int
)
->
RerankResponse
:
"""
"""
Convert the output of do_rank to a RerankResponse
Convert the output of do_rank to a RerankResponse
"""
"""
results
:
L
ist
[
RerankResult
]
=
[]
results
:
l
ist
[
RerankResult
]
=
[]
num_prompt_tokens
=
0
num_prompt_tokens
=
0
for
idx
,
final_res
in
enumerate
(
final_res_batch
):
for
idx
,
final_res
in
enumerate
(
final_res_batch
):
classify_res
=
ScoringRequestOutput
.
from_base
(
final_res
)
classify_res
=
ScoringRequestOutput
.
from_base
(
final_res
)
...
...
vllm/entrypoints/openai/serving_tokenization.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Final
,
List
,
Optional
,
Union
from
typing
import
Final
,
Optional
,
Union
from
fastapi
import
Request
from
fastapi
import
Request
...
@@ -92,7 +92,7 @@ class OpenAIServingTokenization(OpenAIServing):
...
@@ -92,7 +92,7 @@ class OpenAIServingTokenization(OpenAIServing):
logger
.
exception
(
"Error in preprocessing prompt inputs"
)
logger
.
exception
(
"Error in preprocessing prompt inputs"
)
return
self
.
create_error_response
(
str
(
e
))
return
self
.
create_error_response
(
str
(
e
))
input_ids
:
L
ist
[
int
]
=
[]
input_ids
:
l
ist
[
int
]
=
[]
for
i
,
engine_prompt
in
enumerate
(
engine_prompts
):
for
i
,
engine_prompt
in
enumerate
(
engine_prompts
):
self
.
_log_inputs
(
request_id
,
self
.
_log_inputs
(
request_id
,
request_prompts
[
i
],
request_prompts
[
i
],
...
...
vllm/entrypoints/openai/serving_transcription.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
asyncio
import
asyncio
import
io
import
io
from
typing
import
AsyncGenerator
,
Optional
,
Union
,
cast
from
collections.abc
import
AsyncGenerator
from
typing
import
Optional
,
Union
,
cast
from
fastapi
import
Request
from
fastapi
import
Request
...
...
vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
os
import
os
from
collections.abc
import
Sequence
from
functools
import
cached_property
from
functools
import
cached_property
from
typing
import
Callable
,
Dict
,
List
,
Optional
,
Sequence
,
Type
,
Union
from
typing
import
Callable
,
Optional
,
Union
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
DeltaMessage
,
DeltaMessage
,
...
@@ -22,16 +23,16 @@ class ToolParser:
...
@@ -22,16 +23,16 @@ class ToolParser:
"""
"""
def
__init__
(
self
,
tokenizer
:
AnyTokenizer
):
def
__init__
(
self
,
tokenizer
:
AnyTokenizer
):
self
.
prev_tool_call_arr
:
L
ist
[
D
ict
]
=
[]
self
.
prev_tool_call_arr
:
l
ist
[
d
ict
]
=
[]
# the index of the tool call that is currently being parsed
# the index of the tool call that is currently being parsed
self
.
current_tool_id
:
int
=
-
1
self
.
current_tool_id
:
int
=
-
1
self
.
current_tool_name_sent
:
bool
=
False
self
.
current_tool_name_sent
:
bool
=
False
self
.
streamed_args_for_tool
:
L
ist
[
str
]
=
[]
self
.
streamed_args_for_tool
:
l
ist
[
str
]
=
[]
self
.
model_tokenizer
=
tokenizer
self
.
model_tokenizer
=
tokenizer
@
cached_property
@
cached_property
def
vocab
(
self
)
->
D
ict
[
str
,
int
]:
def
vocab
(
self
)
->
d
ict
[
str
,
int
]:
# NOTE: Only PreTrainedTokenizerFast is guaranteed to have .vocab
# NOTE: Only PreTrainedTokenizerFast is guaranteed to have .vocab
# whereas all tokenizers have .get_vocab()
# whereas all tokenizers have .get_vocab()
return
self
.
model_tokenizer
.
get_vocab
()
return
self
.
model_tokenizer
.
get_vocab
()
...
@@ -79,10 +80,10 @@ class ToolParser:
...
@@ -79,10 +80,10 @@ class ToolParser:
class
ToolParserManager
:
class
ToolParserManager
:
tool_parsers
:
D
ict
[
str
,
T
ype
]
=
{}
tool_parsers
:
d
ict
[
str
,
t
ype
]
=
{}
@
classmethod
@
classmethod
def
get_tool_parser
(
cls
,
name
)
->
T
ype
:
def
get_tool_parser
(
cls
,
name
)
->
t
ype
:
"""
"""
Get tool parser by name which is registered by `register_module`.
Get tool parser by name which is registered by `register_module`.
...
@@ -95,8 +96,8 @@ class ToolParserManager:
...
@@ -95,8 +96,8 @@ class ToolParserManager:
@
classmethod
@
classmethod
def
_register_module
(
cls
,
def
_register_module
(
cls
,
module
:
T
ype
,
module
:
t
ype
,
module_name
:
Optional
[
Union
[
str
,
L
ist
[
str
]]]
=
None
,
module_name
:
Optional
[
Union
[
str
,
l
ist
[
str
]]]
=
None
,
force
:
bool
=
True
)
->
None
:
force
:
bool
=
True
)
->
None
:
if
not
issubclass
(
module
,
ToolParser
):
if
not
issubclass
(
module
,
ToolParser
):
raise
TypeError
(
raise
TypeError
(
...
@@ -116,9 +117,9 @@ class ToolParserManager:
...
@@ -116,9 +117,9 @@ class ToolParserManager:
@
classmethod
@
classmethod
def
register_module
(
def
register_module
(
cls
,
cls
,
name
:
Optional
[
Union
[
str
,
L
ist
[
str
]]]
=
None
,
name
:
Optional
[
Union
[
str
,
l
ist
[
str
]]]
=
None
,
force
:
bool
=
True
,
force
:
bool
=
True
,
module
:
Union
[
T
ype
,
None
]
=
None
)
->
Union
[
type
,
Callable
]:
module
:
Union
[
t
ype
,
None
]
=
None
)
->
Union
[
type
,
Callable
]:
"""
"""
Register module with the given name or name list. it can be used as a
Register module with the given name or name list. it can be used as a
decoder(with module as None) or normal function(with module as not
decoder(with module as None) or normal function(with module as not
...
...
vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
View file @
cf069aa8
...
@@ -2,8 +2,9 @@
...
@@ -2,8 +2,9 @@
import
json
import
json
import
re
import
re
from
collections.abc
import
Sequence
from
json
import
JSONDecoder
from
json
import
JSONDecoder
from
typing
import
Dict
,
Sequence
,
Union
from
typing
import
Union
import
partial_json_parser
import
partial_json_parser
from
partial_json_parser.core.options
import
Allow
from
partial_json_parser.core.options
import
Allow
...
@@ -145,7 +146,7 @@ class Granite20bFCToolParser(ToolParser):
...
@@ -145,7 +146,7 @@ class Granite20bFCToolParser(ToolParser):
return
None
return
None
# select as the current tool call the one we're on the state at
# select as the current tool call the one we're on the state at
current_tool_call
:
D
ict
=
tool_call_arr
[
self
.
current_tool_id
]
\
current_tool_call
:
d
ict
=
tool_call_arr
[
self
.
current_tool_id
]
\
if
len
(
tool_call_arr
)
>
0
else
{}
if
len
(
tool_call_arr
)
>
0
else
{}
# case -- if no tokens have been streamed for the tool, e.g.
# case -- if no tokens have been streamed for the tool, e.g.
...
...
vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
json
import
json
from
typing
import
Dict
,
Sequence
,
Union
from
collections.abc
import
Sequence
from
typing
import
Union
import
partial_json_parser
import
partial_json_parser
from
partial_json_parser.core.options
import
Allow
from
partial_json_parser.core.options
import
Allow
...
@@ -136,7 +137,7 @@ class GraniteToolParser(ToolParser):
...
@@ -136,7 +137,7 @@ class GraniteToolParser(ToolParser):
return
None
return
None
# select as the current tool call the one we're on the state at
# select as the current tool call the one we're on the state at
current_tool_call
:
D
ict
=
tool_call_arr
[
self
.
current_tool_id
]
current_tool_call
:
d
ict
=
tool_call_arr
[
self
.
current_tool_id
]
delta
=
None
delta
=
None
# case: we are starting a new tool in the array
# case: we are starting a new tool in the array
...
...
Prev
1
…
8
9
10
11
12
13
14
15
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment