Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
3b312fb7
Unverified
Commit
3b312fb7
authored
Dec 29, 2025
by
Nick Hill
Committed by
GitHub
Dec 29, 2025
Browse files
[Minor] Various small code cleanups/simplifications (#31508)
Signed-off-by:
njhill
<
nickhill123@gmail.com
>
parent
f84bf7d7
Changes
14
Show whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
29 additions
and
55 deletions
+29
-55
vllm/config/model.py
vllm/config/model.py
+4
-4
vllm/entrypoints/openai/serving_responses.py
vllm/entrypoints/openai/serving_responses.py
+0
-1
vllm/entrypoints/renderer.py
vllm/entrypoints/renderer.py
+2
-5
vllm/inputs/preprocess.py
vllm/inputs/preprocess.py
+1
-5
vllm/multimodal/inputs.py
vllm/multimodal/inputs.py
+2
-11
vllm/v1/engine/core.py
vllm/v1/engine/core.py
+2
-4
vllm/v1/engine/input_processor.py
vllm/v1/engine/input_processor.py
+1
-1
vllm/v1/engine/output_processor.py
vllm/v1/engine/output_processor.py
+2
-7
vllm/v1/executor/multiproc_executor.py
vllm/v1/executor/multiproc_executor.py
+1
-1
vllm/v1/request.py
vllm/v1/request.py
+1
-2
vllm/v1/structured_output/__init__.py
vllm/v1/structured_output/__init__.py
+8
-6
vllm/v1/structured_output/request.py
vllm/v1/structured_output/request.py
+3
-6
vllm/v1/worker/gpu_input_batch.py
vllm/v1/worker/gpu_input_batch.py
+1
-1
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+1
-1
No files found.
vllm/config/model.py
View file @
3b312fb7
...
@@ -1579,14 +1579,14 @@ class ModelConfig:
...
@@ -1579,14 +1579,14 @@ class ModelConfig:
@
property
@
property
def
is_hybrid
(
self
)
->
bool
:
def
is_hybrid
(
self
)
->
bool
:
if
not
self
.
_model_info
.
is_hybrid
:
return
False
# Handle granite-4.0-micro case which uses hybrid config but does not
# Handle granite-4.0-micro case which uses hybrid config but does not
# actually contain any non-attention layers.
# actually contain any non-attention layers.
layer_types
=
getattr
(
self
.
hf_config
,
"layer_types"
,
None
)
layer_types
=
getattr
(
self
.
hf_config
,
"layer_types"
,
None
)
if
layer_types
is
not
None
and
all
(
return
layer_types
is
None
or
not
all
(
layer
==
"attention"
for
layer
in
layer_types
layer
==
"attention"
for
layer
in
layer_types
):
)
return
False
return
self
.
_model_info
.
is_hybrid
@
property
@
property
def
has_noops
(
self
)
->
bool
:
def
has_noops
(
self
)
->
bool
:
...
...
vllm/entrypoints/openai/serving_responses.py
View file @
3b312fb7
...
@@ -2005,7 +2005,6 @@ class OpenAIServingResponses(OpenAIServing):
...
@@ -2005,7 +2005,6 @@ class OpenAIServingResponses(OpenAIServing):
return
event
return
event
async
with
AsyncExitStack
()
as
exit_stack
:
async
with
AsyncExitStack
()
as
exit_stack
:
processer
=
None
if
self
.
use_harmony
:
if
self
.
use_harmony
:
# TODO: in streaming, we noticed this bug:
# TODO: in streaming, we noticed this bug:
# https://github.com/vllm-project/vllm/issues/25697
# https://github.com/vllm-project/vllm/issues/25697
...
...
vllm/entrypoints/renderer.py
View file @
3b312fb7
...
@@ -44,11 +44,8 @@ class RenderConfig:
...
@@ -44,11 +44,8 @@ class RenderConfig:
def
verify_truncate_prompt_tokens
(
self
,
model_config
:
ModelConfig
)
->
int
|
None
:
def
verify_truncate_prompt_tokens
(
self
,
model_config
:
ModelConfig
)
->
int
|
None
:
"""Validate and normalize `truncate_prompt_tokens` parameter."""
"""Validate and normalize `truncate_prompt_tokens` parameter."""
truncate_prompt_tokens
=
self
.
truncate_prompt_tokens
truncate_prompt_tokens
=
self
.
truncate_prompt_tokens
if
truncate_prompt_tokens
is
None
:
if
truncate_prompt_tokens
is
None
or
truncate_prompt_tokens
==
0
:
return
None
return
truncate_prompt_tokens
if
truncate_prompt_tokens
==
0
:
return
0
if
truncate_prompt_tokens
<
0
:
if
truncate_prompt_tokens
<
0
:
truncate_prompt_tokens
=
model_config
.
max_model_len
truncate_prompt_tokens
=
model_config
.
max_model_len
...
...
vllm/inputs/preprocess.py
View file @
3b312fb7
...
@@ -686,11 +686,7 @@ class InputPreprocessor:
...
@@ -686,11 +686,7 @@ class InputPreprocessor:
mm_uuids
:
MultiModalUUIDDict
|
None
=
None
,
mm_uuids
:
MultiModalUUIDDict
|
None
=
None
,
)
->
ProcessorInputs
:
)
->
ProcessorInputs
:
"""Preprocess the input prompt."""
"""Preprocess the input prompt."""
res
=
self
.
_preprocess
(
res
=
self
.
_preprocess
(
prompt
,
tokenization_kwargs
,
mm_uuids
=
mm_uuids
)
prompt
,
tokenization_kwargs
,
mm_uuids
=
mm_uuids
,
)
if
self
.
mm_processor_cache
and
self
.
mm_cache_stats
is
not
None
:
if
self
.
mm_processor_cache
and
self
.
mm_cache_stats
is
not
None
:
delta
=
self
.
mm_processor_cache
.
make_stats
(
delta
=
True
)
delta
=
self
.
mm_processor_cache
.
make_stats
(
delta
=
True
)
...
...
vllm/multimodal/inputs.py
View file @
3b312fb7
...
@@ -171,10 +171,7 @@ class PlaceholderRange:
...
@@ -171,10 +171,7 @@ class PlaceholderRange:
@
cached_property
@
cached_property
def
embeds_cumsum
(
self
)
->
torch
.
Tensor
|
None
:
def
embeds_cumsum
(
self
)
->
torch
.
Tensor
|
None
:
if
self
.
is_embed
is
None
:
return
None
if
self
.
is_embed
is
None
else
self
.
is_embed
.
cumsum
(
dim
=
0
)
return
None
return
self
.
is_embed
.
cumsum
(
dim
=
0
)
@
cached_property
@
cached_property
def
get_num_embeds
(
self
)
->
int
:
def
get_num_embeds
(
self
)
->
int
:
...
@@ -308,13 +305,7 @@ def batched_tensors_equal(a: BatchedTensorInputs, b: BatchedTensorInputs) -> boo
...
@@ -308,13 +305,7 @@ def batched_tensors_equal(a: BatchedTensorInputs, b: BatchedTensorInputs) -> boo
Equality check between
Equality check between
[`BatchedTensorInputs`][vllm.multimodal.inputs.BatchedTensorInputs] objects.
[`BatchedTensorInputs`][vllm.multimodal.inputs.BatchedTensorInputs] objects.
"""
"""
for
k
in
a
:
return
all
(
k
in
b
and
nested_tensors_equal
(
a
[
k
],
b
[
k
])
for
k
in
a
)
if
k
not
in
b
:
return
False
if
not
nested_tensors_equal
(
a
[
k
],
b
[
k
]):
return
False
return
True
@
dataclass
@
dataclass
...
...
vllm/v1/engine/core.py
View file @
3b312fb7
...
@@ -487,10 +487,8 @@ class EngineCore:
...
@@ -487,10 +487,8 @@ class EngineCore:
request_ids
=
[]
request_ids
=
[]
while
not
self
.
aborts_queue
.
empty
():
while
not
self
.
aborts_queue
.
empty
():
ids
=
self
.
aborts_queue
.
get_nowait
()
ids
=
self
.
aborts_queue
.
get_nowait
()
if
isinstance
(
ids
,
str
):
# Should be a list here, but also handle string just in case.
# Should be a list here, but also handle string just in case.
ids
=
(
ids
,)
request_ids
.
extend
((
ids
,)
if
isinstance
(
ids
,
str
)
else
ids
)
request_ids
.
extend
(
ids
)
# More efficient to abort all as a single batch.
# More efficient to abort all as a single batch.
self
.
abort_requests
(
request_ids
)
self
.
abort_requests
(
request_ids
)
...
...
vllm/v1/engine/input_processor.py
View file @
3b312fb7
...
@@ -618,7 +618,7 @@ class InputProcessor:
...
@@ -618,7 +618,7 @@ class InputProcessor:
tokenizer
=
self
.
tokenizer
tokenizer
=
self
.
tokenizer
if
tokenizer
is
not
None
:
if
tokenizer
is
not
None
:
max_input_id
=
max
(
prompt_ids
or
[]
,
default
=
0
)
max_input_id
=
max
(
prompt_ids
or
()
,
default
=
0
)
# NOTE: tokenizer.max_token_id is the tokenizer’s vocab size while
# NOTE: tokenizer.max_token_id is the tokenizer’s vocab size while
# self.model_config.get_vocab_size() is the model’s vocab size.
# self.model_config.get_vocab_size() is the model’s vocab size.
...
...
vllm/v1/engine/output_processor.py
View file @
3b312fb7
...
@@ -339,10 +339,7 @@ class RequestState:
...
@@ -339,10 +339,7 @@ class RequestState:
stop_reason
=
stop_reason
if
finished
else
None
,
stop_reason
=
stop_reason
if
finished
else
None
,
)
)
def
_new_pooling_output
(
def
_new_pooling_output
(
self
,
pooling_output
:
torch
.
Tensor
)
->
PoolingOutput
:
self
,
pooling_output
:
torch
.
Tensor
,
)
->
PoolingOutput
:
return
PoolingOutput
(
data
=
pooling_output
)
return
PoolingOutput
(
data
=
pooling_output
)
...
@@ -695,9 +692,7 @@ class OutputProcessor:
...
@@ -695,9 +692,7 @@ class OutputProcessor:
assert
req_state
.
stats
is
not
None
assert
req_state
.
stats
is
not
None
iteration_stats
.
update_from_finished_request
(
iteration_stats
.
update_from_finished_request
(
finish_reason
=
finish_reason
,
finish_reason
=
finish_reason
,
num_prompt_tokens
=
length_from_prompt_token_ids_or_embeds
(
num_prompt_tokens
=
req_state
.
prompt_len
,
req_state
.
prompt_token_ids
,
req_state
.
prompt_embeds
),
max_tokens_param
=
req_state
.
max_tokens_param
,
max_tokens_param
=
req_state
.
max_tokens_param
,
req_stats
=
req_state
.
stats
,
req_stats
=
req_state
.
stats
,
num_cached_tokens
=
req_state
.
num_cached_tokens
,
num_cached_tokens
=
req_state
.
num_cached_tokens
,
...
...
vllm/v1/executor/multiproc_executor.py
View file @
3b312fb7
...
@@ -695,7 +695,7 @@ class WorkerProc:
...
@@ -695,7 +695,7 @@ class WorkerProc:
worker
=
None
worker
=
None
# tuple[Connection, Connection]
# tuple[Connection, Connection]
reader
,
ready_writer
=
kwargs
.
pop
(
"ready_pipe"
)
reader
,
ready_writer
=
kwargs
.
pop
(
"ready_pipe"
)
death_pipe
=
kwargs
.
pop
(
"death_pipe"
,
None
)
death_pipe
:
Connection
|
None
=
kwargs
.
pop
(
"death_pipe"
,
None
)
shutdown_event
=
threading
.
Event
()
shutdown_event
=
threading
.
Event
()
# Start death monitoring thread if death_pipe is provided
# Start death monitoring thread if death_pipe is provided
if
death_pipe
is
not
None
:
if
death_pipe
is
not
None
:
...
...
vllm/v1/request.py
View file @
3b312fb7
...
@@ -211,8 +211,7 @@ class Request:
...
@@ -211,8 +211,7 @@ class Request:
def
get_num_encoder_embeds
(
self
,
input_id
:
int
)
->
int
:
def
get_num_encoder_embeds
(
self
,
input_id
:
int
)
->
int
:
assert
input_id
<
len
(
self
.
mm_features
)
assert
input_id
<
len
(
self
.
mm_features
)
num_embeds
=
self
.
mm_features
[
input_id
].
mm_position
.
get_num_embeds
return
self
.
mm_features
[
input_id
].
mm_position
.
get_num_embeds
return
num_embeds
def
record_event
(
def
record_event
(
self
,
self
,
...
...
vllm/v1/structured_output/__init__.py
View file @
3b312fb7
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
itertools
import
multiprocessing
import
multiprocessing
from
collections.abc
import
Iterable
from
concurrent.futures
import
Future
,
ThreadPoolExecutor
from
concurrent.futures
import
Future
,
ThreadPoolExecutor
from
typing
import
TYPE_CHECKING
from
typing
import
TYPE_CHECKING
...
@@ -172,7 +174,7 @@ class StructuredOutputManager:
...
@@ -172,7 +174,7 @@ class StructuredOutputManager:
def
_fill_bitmasks
(
def
_fill_bitmasks
(
self
,
self
,
batch
:
list
[
tuple
[
StructuredOutputGrammar
,
int
,
bool
]],
batch
:
Iterable
[
tuple
[
StructuredOutputGrammar
,
int
,
bool
]],
)
->
None
:
)
->
None
:
assert
self
.
_grammar_bitmask
is
not
None
assert
self
.
_grammar_bitmask
is
not
None
for
grammar
,
index
,
apply_bitmask
in
batch
:
for
grammar
,
index
,
apply_bitmask
in
batch
:
...
@@ -265,16 +267,16 @@ class StructuredOutputManager:
...
@@ -265,16 +267,16 @@ class StructuredOutputManager:
apply_bitmask
=
self
.
should_fill_bitmask
(
request
)
apply_bitmask
=
self
.
should_fill_bitmask
(
request
)
state_advancements
=
0
state_advancements
=
0
req_tokens
=
scheduled_spec_decode_tokens
.
get
(
req_id
,
[]
)
req_tokens
=
scheduled_spec_decode_tokens
.
get
(
req_id
,
()
)
for
i
,
token
in
enumerate
(
req_tokens
+
[
None
]
):
for
token
in
itertools
.
chain
(
req_tokens
,
(
None
,)
):
self
.
_fill_bitmasks
(
self
.
_fill_bitmasks
(
[
(
(
(
structured_output_request
.
grammar
,
structured_output_request
.
grammar
,
cumulative_index
,
cumulative_index
,
apply_bitmask
,
apply_bitmask
,
),
)
)
]
)
)
if
(
if
(
...
...
vllm/v1/structured_output/request.py
View file @
3b312fb7
...
@@ -28,12 +28,9 @@ class StructuredOutputRequest:
...
@@ -28,12 +28,9 @@ class StructuredOutputRequest:
if
sampling_params
is
None
:
if
sampling_params
is
None
:
return
None
return
None
params
=
sampling_params
.
structured_outputs
params
=
sampling_params
.
structured_outputs
if
params
:
if
not
params
or
params
.
all_constraints_none
():
if
params
.
all_constraints_none
():
return
None
return
None
else
:
return
StructuredOutputRequest
(
params
=
params
)
return
StructuredOutputRequest
(
params
=
params
)
return
None
def
_check_grammar_completion
(
self
)
->
bool
:
def
_check_grammar_completion
(
self
)
->
bool
:
# NOTE: We have to lazy import to gate circular imports
# NOTE: We have to lazy import to gate circular imports
...
...
vllm/v1/worker/gpu_input_batch.py
View file @
3b312fb7
...
@@ -829,7 +829,7 @@ class InputBatch:
...
@@ -829,7 +829,7 @@ class InputBatch:
presence_penalties
=
self
.
presence_penalties
[:
num_reqs
],
presence_penalties
=
self
.
presence_penalties
[:
num_reqs
],
repetition_penalties
=
self
.
repetition_penalties
[:
num_reqs
],
repetition_penalties
=
self
.
repetition_penalties
[:
num_reqs
],
output_token_ids
=
output_token_ids
,
output_token_ids
=
output_token_ids
,
spec_token_ids
=
cast
(
list
[
list
[
int
]],
self
.
spec_token_ids
)
,
spec_token_ids
=
self
.
spec_token_ids
,
no_penalties
=
self
.
no_penalties
,
no_penalties
=
self
.
no_penalties
,
allowed_token_ids_mask
=
allowed_token_ids_mask
,
allowed_token_ids_mask
=
allowed_token_ids_mask
,
bad_words_token_ids
=
self
.
bad_words_token_ids
,
bad_words_token_ids
=
self
.
bad_words_token_ids
,
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
3b312fb7
...
@@ -1026,7 +1026,7 @@ class GPUModelRunner(
...
@@ -1026,7 +1026,7 @@ class GPUModelRunner(
each sequence, and a shifting is done during the next iteration
each sequence, and a shifting is done during the next iteration
based on the number of accepted tokens.
based on the number of accepted tokens.
"""
"""
if
not
self
.
model_config
.
is_hybrid
or
not
self
.
speculative_config
:
if
not
self
.
speculative_config
or
not
self
.
model_config
.
is_hybrid
:
return
return
# Find the number of accepted tokens for each sequence.
# Find the number of accepted tokens for each sequence.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment