Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f59fc60f
Unverified
Commit
f59fc60f
authored Jun 25, 2025 by Max Wittig
Committed by GitHub Jun 25, 2025
Browse files
[Feat][CLI] enforce-include-usage (#19695)
Signed-off-by: Max Wittig <max.wittig@siemens.com>
parent
879f69be
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing 5 changed files with 34 additions and 9 deletions
+34
-9
vllm/entrypoints/openai/api_server.py
vllm/entrypoints/openai/api_server.py
+2
-0
vllm/entrypoints/openai/cli_args.py
vllm/entrypoints/openai/cli_args.py
+5
-0
vllm/entrypoints/openai/serving_chat.py
vllm/entrypoints/openai/serving_chat.py
+15
-4
vllm/entrypoints/openai/serving_completion.py
vllm/entrypoints/openai/serving_completion.py
+8
-3
vllm/entrypoints/openai/serving_engine.py
vllm/entrypoints/openai/serving_engine.py
+4
-2
No files found.
vllm/entrypoints/openai/api_server.py
View file @
f59fc60f
...
...
@@ -1190,6 +1190,7 @@ async def init_app_state(
tool_parser
=
args
.
tool_call_parser
,
reasoning_parser
=
args
.
reasoning_parser
,
enable_prompt_tokens_details
=
args
.
enable_prompt_tokens_details
,
enable_force_include_usage
=
args
.
enable_force_include_usage
,
)
if
model_config
.
runner_type
==
"generate"
else
None
state
.
openai_serving_completion
=
OpenAIServingCompletion
(
engine_client
,
...
...
@@ -1197,6 +1198,7 @@ async def init_app_state(
state
.
openai_serving_models
,
request_logger
=
request_logger
,
return_tokens_as_token_ids
=
args
.
return_tokens_as_token_ids
,
enable_force_include_usage
=
args
.
enable_force_include_usage
,
)
if
model_config
.
runner_type
==
"generate"
else
None
state
.
openai_serving_pooling
=
OpenAIServingPooling
(
engine_client
,
...
...
vllm/entrypoints/openai/cli_args.py
View file @
f59fc60f
...
...
@@ -272,6 +272,11 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
action
=
'store_true'
,
default
=
False
,
help
=
"If set to True, enable prompt_tokens_details in usage."
)
parser
.
add_argument
(
"--enable-force-include-usage"
,
action
=
'store_true'
,
default
=
False
,
help
=
"If set to True, including usage on every request."
)
parser
.
add_argument
(
"--enable-server-load-tracking"
,
action
=
'store_true'
,
...
...
vllm/entrypoints/openai/serving_chat.py
View file @
f59fc60f
...
...
@@ -64,12 +64,14 @@ class OpenAIServingChat(OpenAIServing):
enable_auto_tools
:
bool
=
False
,
tool_parser
:
Optional
[
str
]
=
None
,
enable_prompt_tokens_details
:
bool
=
False
,
enable_force_include_usage
:
bool
=
False
,
)
->
None
:
super
().
__init__
(
engine_client
=
engine_client
,
model_config
=
model_config
,
models
=
models
,
request_logger
=
request_logger
,
return_tokens_as_token_ids
=
return_tokens_as_token_ids
)
return_tokens_as_token_ids
=
return_tokens_as_token_ids
,
enable_force_include_usage
=
enable_force_include_usage
)
self
.
response_role
=
response_role
self
.
chat_template
=
chat_template
...
...
@@ -110,6 +112,7 @@ class OpenAIServingChat(OpenAIServing):
"been registered"
)
from
e
self
.
enable_prompt_tokens_details
=
enable_prompt_tokens_details
self
.
enable_force_include_usage
=
enable_force_include_usage
self
.
default_sampling_params
=
(
self
.
model_config
.
get_diff_sampling_param
())
if
self
.
default_sampling_params
:
...
...
@@ -261,8 +264,14 @@ class OpenAIServingChat(OpenAIServing):
# Streaming response
if
request
.
stream
:
return
self
.
chat_completion_stream_generator
(
request
,
result_generator
,
request_id
,
model_name
,
conversation
,
tokenizer
,
request_metadata
)
request
,
result_generator
,
request_id
,
model_name
,
conversation
,
tokenizer
,
request_metadata
,
enable_force_include_usage
=
self
.
enable_force_include_usage
)
try
:
return
await
self
.
chat_completion_full_generator
(
...
...
@@ -405,6 +414,7 @@ class OpenAIServingChat(OpenAIServing):
conversation
:
list
[
ConversationMessage
],
tokenizer
:
AnyTokenizer
,
request_metadata
:
RequestResponseMetadata
,
enable_force_include_usage
:
bool
,
)
->
AsyncGenerator
[
str
,
None
]:
created_time
=
int
(
time
.
time
())
chunk_object_type
:
Final
=
"chat.completion.chunk"
...
...
@@ -471,7 +481,8 @@ class OpenAIServingChat(OpenAIServing):
stream_options
=
request
.
stream_options
if
stream_options
:
include_usage
=
stream_options
.
include_usage
include_usage
=
stream_options
.
include_usage
\
or
enable_force_include_usage
include_continuous_usage
=
include_usage
and
\
stream_options
.
continuous_usage_stats
else
:
...
...
vllm/entrypoints/openai/serving_completion.py
View file @
f59fc60f
...
...
@@ -52,12 +52,14 @@ class OpenAIServingCompletion(OpenAIServing):
*
,
request_logger
:
Optional
[
RequestLogger
],
return_tokens_as_token_ids
:
bool
=
False
,
enable_force_include_usage
:
bool
=
False
,
):
super
().
__init__
(
engine_client
=
engine_client
,
model_config
=
model_config
,
models
=
models
,
request_logger
=
request_logger
,
return_tokens_as_token_ids
=
return_tokens_as_token_ids
)
return_tokens_as_token_ids
=
return_tokens_as_token_ids
,
enable_force_include_usage
=
enable_force_include_usage
)
self
.
default_sampling_params
=
(
self
.
model_config
.
get_diff_sampling_param
())
if
self
.
default_sampling_params
:
...
...
@@ -227,7 +229,8 @@ class OpenAIServingCompletion(OpenAIServing):
model_name
,
num_prompts
=
num_prompts
,
tokenizer
=
tokenizer
,
request_metadata
=
request_metadata
)
request_metadata
=
request_metadata
,
enable_force_include_usage
=
self
.
enable_force_include_usage
)
# Non-streaming response
final_res_batch
:
list
[
Optional
[
RequestOutput
]]
=
[
None
]
*
num_prompts
...
...
@@ -289,6 +292,7 @@ class OpenAIServingCompletion(OpenAIServing):
num_prompts
:
int
,
tokenizer
:
AnyTokenizer
,
request_metadata
:
RequestResponseMetadata
,
enable_force_include_usage
:
bool
,
)
->
AsyncGenerator
[
str
,
None
]:
num_choices
=
1
if
request
.
n
is
None
else
request
.
n
previous_text_lens
=
[
0
]
*
num_choices
*
num_prompts
...
...
@@ -298,7 +302,8 @@ class OpenAIServingCompletion(OpenAIServing):
stream_options
=
request
.
stream_options
if
stream_options
:
include_usage
=
stream_options
.
include_usage
include_usage
=
stream_options
.
include_usage
or
\
enable_force_include_usage
include_continuous_usage
=
include_usage
and
\
stream_options
.
continuous_usage_stats
else
:
...
...
vllm/entrypoints/openai/serving_engine.py
View file @
f59fc60f
...
...
@@ -132,7 +132,7 @@ RequestT = TypeVar("RequestT", bound=AnyRequest)
class
RequestProcessingMixin
(
BaseModel
):
"""
Mixin for request processing,
Mixin for request processing,
handling prompt preparation and engine input.
"""
request_prompts
:
Optional
[
Sequence
[
RequestPrompt
]]
=
[]
...
...
@@ -144,7 +144,7 @@ class RequestProcessingMixin(BaseModel):
class
ResponseGenerationMixin
(
BaseModel
):
"""
Mixin for response generation,
Mixin for response generation,
managing result generators and final batch results.
"""
result_generator
:
Optional
[
AsyncGenerator
[
tuple
[
int
,
Union
[
...
...
@@ -208,6 +208,7 @@ class OpenAIServing:
*
,
request_logger
:
Optional
[
RequestLogger
],
return_tokens_as_token_ids
:
bool
=
False
,
enable_force_include_usage
:
bool
=
False
,
):
super
().
__init__
()
...
...
@@ -219,6 +220,7 @@ class OpenAIServing:
self
.
request_logger
=
request_logger
self
.
return_tokens_as_token_ids
=
return_tokens_as_token_ids
self
.
enable_force_include_usage
=
enable_force_include_usage
self
.
_tokenizer_executor
=
ThreadPoolExecutor
(
max_workers
=
1
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment