Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
d7e1e599
Unverified
Commit
d7e1e599
authored
Sep 03, 2025
by
Didier Durand
Committed by
GitHub
Sep 02, 2025
Browse files
[Doc]: fix typos in Python comments (#24093)
Signed-off-by:
Didier Durand
<
durand.didier@gmail.com
>
parent
c4ed78b1
Changes
15
Show whitespace changes
Inline
Side-by-side
Showing
15 changed files
with
23 additions
and
23 deletions
+23
-23
tests/core/test_scheduler.py
tests/core/test_scheduler.py
+1
-1
tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
.../openai/correctness/test_transcription_api_correctness.py
+1
-1
tests/entrypoints/openai/test_return_token_ids.py
tests/entrypoints/openai/test_return_token_ids.py
+1
-1
tests/entrypoints/openai/test_serving_chat.py
tests/entrypoints/openai/test_serving_chat.py
+1
-1
tests/kernels/utils.py
tests/kernels/utils.py
+1
-1
tests/multimodal/test_utils.py
tests/multimodal/test_utils.py
+2
-2
tests/v1/e2e/test_spec_decode.py
tests/v1/e2e/test_spec_decode.py
+1
-1
tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py
tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py
+2
-2
tests/v1/spec_decode/test_tree_attention.py
tests/v1/spec_decode/test_tree_attention.py
+2
-2
vllm/lora/utils.py
vllm/lora/utils.py
+1
-1
vllm/model_executor/layers/quantization/compressed_tensors/utils.py
..._executor/layers/quantization/compressed_tensors/utils.py
+1
-1
vllm/multimodal/utils.py
vllm/multimodal/utils.py
+5
-5
vllm/v1/attention/backends/utils.py
vllm/v1/attention/backends/utils.py
+1
-1
vllm/v1/structured_output/utils.py
vllm/v1/structured_output/utils.py
+2
-2
vllm/v1/worker/tpu_worker.py
vllm/v1/worker/tpu_worker.py
+1
-1
No files found.
tests/core/test_scheduler.py
View file @
d7e1e599
...
@@ -641,7 +641,7 @@ def test_schedule_decode_blocks_to_copy_update():
...
@@ -641,7 +641,7 @@ def test_schedule_decode_blocks_to_copy_update():
# Nothing is preempted.
# Nothing is preempted.
assert
output
.
blocks_to_swap_out
==
[]
assert
output
.
blocks_to_swap_out
==
[]
# Since append_slot returns the source -> dist mapping, it should
# Since append_slot returns the source -> dist mapping, it should
# applied.
# be applied.
assert
output
.
blocks_to_copy
==
[(
2
,
3
)]
assert
output
.
blocks_to_copy
==
[(
2
,
3
)]
...
...
tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
View file @
d7e1e599
...
@@ -32,7 +32,7 @@ def to_bytes(y, sr):
...
@@ -32,7 +32,7 @@ def to_bytes(y, sr):
async
def
transcribe_audio
(
client
,
tokenizer
,
y
,
sr
):
async
def
transcribe_audio
(
client
,
tokenizer
,
y
,
sr
):
# Send loaded audio directly instead of loading from disk,
# Send loaded audio directly instead of loading from disk,
# dont account for that time though
# don't account for that time though
with
to_bytes
(
y
,
sr
)
as
f
:
with
to_bytes
(
y
,
sr
)
as
f
:
start_time
=
time
.
perf_counter
()
start_time
=
time
.
perf_counter
()
transcription
=
await
client
.
audio
.
transcriptions
.
create
(
transcription
=
await
client
.
audio
.
transcriptions
.
create
(
...
...
tests/entrypoints/openai/test_return_token_ids.py
View file @
d7e1e599
...
@@ -224,7 +224,7 @@ async def test_comparison_with_prompt_logprobs_and_logprobs(server):
...
@@ -224,7 +224,7 @@ async def test_comparison_with_prompt_logprobs_and_logprobs(server):
logprobs_token_ids
.
append
(
token_id
)
logprobs_token_ids
.
append
(
token_id
)
# When echo=True, the logprobs include both prompt and response tokens
# When echo=True, the logprobs include both prompt and response tokens
# The token_ids field should match the
the
suffix of response portion
# The token_ids field should match the suffix of response portion
# The prompt_token_ids should match the prompt portion
# The prompt_token_ids should match the prompt portion
assert
len
(
completion
.
choices
[
0
].
token_ids
)
<
len
(
logprobs_token_ids
)
assert
len
(
completion
.
choices
[
0
].
token_ids
)
<
len
(
logprobs_token_ids
)
response_token_ids_length
=
len
(
completion
.
choices
[
0
].
token_ids
)
response_token_ids_length
=
len
(
completion
.
choices
[
0
].
token_ids
)
...
...
tests/entrypoints/openai/test_serving_chat.py
View file @
d7e1e599
...
@@ -313,7 +313,7 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
...
@@ -313,7 +313,7 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
}],
}],
)
)
# By default cache_salt in the engine prompt is not set
# By default, cache_salt in the engine prompt is not set
with
suppress
(
Exception
):
with
suppress
(
Exception
):
await
serving_chat
.
create_chat_completion
(
req
)
await
serving_chat
.
create_chat_completion
(
req
)
assert
"cache_salt"
not
in
mock_engine
.
generate
.
call_args
.
args
[
0
]
assert
"cache_salt"
not
in
mock_engine
.
generate
.
call_args
.
args
[
0
]
...
...
tests/kernels/utils.py
View file @
d7e1e599
...
@@ -1236,7 +1236,7 @@ def baseline_scaled_mm(a: torch.Tensor,
...
@@ -1236,7 +1236,7 @@ def baseline_scaled_mm(a: torch.Tensor,
bias
:
Optional
[
torch
.
Tensor
]
=
None
)
->
torch
.
Tensor
:
bias
:
Optional
[
torch
.
Tensor
]
=
None
)
->
torch
.
Tensor
:
# We treat N-dimensional group scaling as extended numpy-style broadcasting
# We treat N-dimensional group scaling as extended numpy-style broadcasting
# in numpy simply stretches dimensions with an extent of 1 to match
the
# in numpy simply stretches dimensions with an extent of 1 to match
# the target shape by repeating the data along that dimension (broadcasting)
# the target shape by repeating the data along that dimension (broadcasting)
# , we extend these semantics to say if the extent of a dimension in the
# , we extend these semantics to say if the extent of a dimension in the
# source shape is not 1 and does not match the target shape we repeat each
# source shape is not 1 and does not match the target shape we repeat each
...
...
tests/multimodal/test_utils.py
View file @
d7e1e599
...
@@ -458,7 +458,7 @@ def run_dp_sharded_vision_model_vs_direct(local_rank: int, world_size: int,
...
@@ -458,7 +458,7 @@ def run_dp_sharded_vision_model_vs_direct(local_rank: int, world_size: int,
with
torch
.
inference_mode
():
with
torch
.
inference_mode
():
sharded_output
=
run_dp_sharded_vision_model
(
image_input
,
vision_model
)
sharded_output
=
run_dp_sharded_vision_model
(
image_input
,
vision_model
)
# Check that the world size is setup correctly
# Check that the world size is set up correctly
assert
get_tensor_model_parallel_world_size
()
==
world_size
assert
get_tensor_model_parallel_world_size
()
==
world_size
# Check that the outputs have the same shape
# Check that the outputs have the same shape
...
@@ -642,7 +642,7 @@ def run_dp_sharded_mrope_vision_model_vs_direct(local_rank: int,
...
@@ -642,7 +642,7 @@ def run_dp_sharded_mrope_vision_model_vs_direct(local_rank: int,
rope_type
=
"rope_3d"
)
rope_type
=
"rope_3d"
)
sharded_output
=
torch
.
cat
(
sharded_output
,
dim
=
0
)
sharded_output
=
torch
.
cat
(
sharded_output
,
dim
=
0
)
# Check that the world size is setup correctly
# Check that the world size is set up correctly
assert
get_tensor_model_parallel_world_size
()
==
world_size
assert
get_tensor_model_parallel_world_size
()
==
world_size
# Compare outputs (only on rank 0)
# Compare outputs (only on rank 0)
...
...
tests/v1/e2e/test_spec_decode.py
View file @
d7e1e599
...
@@ -83,7 +83,7 @@ def test_ngram_correctness(
...
@@ -83,7 +83,7 @@ def test_ngram_correctness(
model_name
:
str
,
model_name
:
str
,
):
):
'''
'''
Compare the outputs of a original LLM and a speculative LLM
Compare the outputs of an original LLM and a speculative LLM
should be the same when using ngram speculative decoding.
should be the same when using ngram speculative decoding.
'''
'''
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
...
...
tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py
View file @
d7e1e599
...
@@ -42,7 +42,7 @@ def test_basic_lifecycle():
...
@@ -42,7 +42,7 @@ def test_basic_lifecycle():
engine_core_outputs
=
scheduler
.
update_from_output
(
scheduler_output
,
engine_core_outputs
=
scheduler
.
update_from_output
(
scheduler_output
,
model_runner_output
)
model_runner_output
)
# Ensure the request is finished after 1 token
s
.
# Ensure the request is finished after 1 token.
assert
request
.
is_finished
()
assert
request
.
is_finished
()
assert
request
.
status
==
RequestStatus
.
FINISHED_LENGTH_CAPPED
assert
request
.
status
==
RequestStatus
.
FINISHED_LENGTH_CAPPED
output
=
engine_core_outputs
[
0
].
outputs
[
0
]
output
=
engine_core_outputs
[
0
].
outputs
[
0
]
...
@@ -141,7 +141,7 @@ def test_short_prompt_lifecycle():
...
@@ -141,7 +141,7 @@ def test_short_prompt_lifecycle():
def
test_prefix_cache_lifecycle
():
def
test_prefix_cache_lifecycle
():
"""Test that remote decode params still work
s
with a prefix cache hit."""
"""Test that remote decode params still work with a prefix cache hit."""
vllm_config
=
create_vllm_config
()
vllm_config
=
create_vllm_config
()
scheduler
=
create_scheduler
(
vllm_config
)
scheduler
=
create_scheduler
(
vllm_config
)
...
...
tests/v1/spec_decode/test_tree_attention.py
View file @
d7e1e599
...
@@ -187,7 +187,7 @@ def test_tree_attn_correctness() -> None:
...
@@ -187,7 +187,7 @@ def test_tree_attn_correctness() -> None:
dtype
=
torch
.
bfloat16
,
dtype
=
torch
.
bfloat16
,
)
)
# Setup the block table and KV cache for paged KV.
# Set up the block table and KV cache for paged KV.
assert
max_sequence_length
%
block_size
==
0
assert
max_sequence_length
%
block_size
==
0
max_blocks_per_batch
=
max_sequence_length
//
block_size
max_blocks_per_batch
=
max_sequence_length
//
block_size
kv_cache
=
torch
.
randn
(
kv_cache
=
torch
.
randn
(
...
@@ -222,7 +222,7 @@ def test_tree_attn_correctness() -> None:
...
@@ -222,7 +222,7 @@ def test_tree_attn_correctness() -> None:
num_alloc_blocks_per_batch
]
=
block_ids
.
view
(
num_alloc_blocks_per_batch
]
=
block_ids
.
view
(
-
1
,
num_alloc_blocks_per_batch
)
-
1
,
num_alloc_blocks_per_batch
)
# Setup the slot mapping for the input KVs.
# Set up the slot mapping for the input KVs.
tree_positions
=
sequence_position
+
torch
.
arange
(
tree_positions
=
sequence_position
+
torch
.
arange
(
0
,
0
,
tree_size_q
,
tree_size_q
,
...
...
vllm/lora/utils.py
View file @
d7e1e599
...
@@ -239,7 +239,7 @@ def get_adapter_absolute_path(lora_path: str) -> str:
...
@@ -239,7 +239,7 @@ def get_adapter_absolute_path(lora_path: str) -> str:
except
(
HfHubHTTPError
,
RepositoryNotFoundError
,
EntryNotFoundError
,
except
(
HfHubHTTPError
,
RepositoryNotFoundError
,
EntryNotFoundError
,
HFValidationError
):
HFValidationError
):
# Handle errors that may occur during the download
# Handle errors that may occur during the download
# Return original path instead
instead
of throwing error here
# Return original path instead of throwing error here
logger
.
exception
(
"Error downloading the HuggingFace model"
)
logger
.
exception
(
"Error downloading the HuggingFace model"
)
return
lora_path
return
lora_path
...
...
vllm/model_executor/layers/quantization/compressed_tensors/utils.py
View file @
d7e1e599
...
@@ -94,7 +94,7 @@ def find_matched_target(
...
@@ -94,7 +94,7 @@ def find_matched_target(
config that a layer corresponds to.
config that a layer corresponds to.
Recall that a compressed-tensors configs has a concept of
Recall that a compressed-tensors configs has a concept of
config_groups, where each layer can be quantized with
with
a different
config_groups, where each layer can be quantized with a different
scheme.
scheme.
targets in each config_group will be a list of either layer names
targets in each config_group will be a list of either layer names
...
...
vllm/multimodal/utils.py
View file @
d7e1e599
...
@@ -213,7 +213,7 @@ class MediaConnector:
...
@@ -213,7 +213,7 @@ class MediaConnector:
image_mode
:
str
=
"RGB"
,
image_mode
:
str
=
"RGB"
,
)
->
Image
.
Image
:
)
->
Image
.
Image
:
"""
"""
Load a PIL image from a HTTP or base64 data URL.
Load a PIL image from an HTTP or base64 data URL.
By default, the image is converted into RGB format.
By default, the image is converted into RGB format.
"""
"""
...
@@ -237,7 +237,7 @@ class MediaConnector:
...
@@ -237,7 +237,7 @@ class MediaConnector:
image_mode
:
str
=
"RGB"
,
image_mode
:
str
=
"RGB"
,
)
->
Image
.
Image
:
)
->
Image
.
Image
:
"""
"""
Asynchronously load a PIL image from a HTTP or base64 data URL.
Asynchronously load a PIL image from an HTTP or base64 data URL.
By default, the image is converted into RGB format.
By default, the image is converted into RGB format.
"""
"""
...
@@ -261,7 +261,7 @@ class MediaConnector:
...
@@ -261,7 +261,7 @@ class MediaConnector:
image_mode
:
str
=
"RGB"
,
image_mode
:
str
=
"RGB"
,
)
->
tuple
[
npt
.
NDArray
,
dict
[
str
,
Any
]]:
)
->
tuple
[
npt
.
NDArray
,
dict
[
str
,
Any
]]:
"""
"""
Load video from a HTTP or base64 data URL.
Load video from an HTTP or base64 data URL.
"""
"""
image_io
=
ImageMediaIO
(
image_mode
=
image_mode
,
image_io
=
ImageMediaIO
(
image_mode
=
image_mode
,
**
self
.
media_io_kwargs
.
get
(
"image"
,
{}))
**
self
.
media_io_kwargs
.
get
(
"image"
,
{}))
...
@@ -281,7 +281,7 @@ class MediaConnector:
...
@@ -281,7 +281,7 @@ class MediaConnector:
image_mode
:
str
=
"RGB"
,
image_mode
:
str
=
"RGB"
,
)
->
tuple
[
npt
.
NDArray
,
dict
[
str
,
Any
]]:
)
->
tuple
[
npt
.
NDArray
,
dict
[
str
,
Any
]]:
"""
"""
Asynchronously load video from a HTTP or base64 data URL.
Asynchronously load video from an HTTP or base64 data URL.
By default, the image is converted into RGB format.
By default, the image is converted into RGB format.
"""
"""
...
@@ -370,7 +370,7 @@ def group_mm_inputs_by_modality(
...
@@ -370,7 +370,7 @@ def group_mm_inputs_by_modality(
def
modality_group_func
(
def
modality_group_func
(
mm_input
:
MultiModalKwargsItems
)
->
Union
[
str
,
int
]:
mm_input
:
MultiModalKwargsItems
)
->
Union
[
str
,
int
]:
# If the input has multiple modalities, return a id as the unique key
# If the input has multiple modalities, return an id as the unique key
# for the mm_input input.
# for the mm_input input.
if
len
(
mm_input
)
>
1
:
if
len
(
mm_input
)
>
1
:
return
id
(
mm_input
)
return
id
(
mm_input
)
...
...
vllm/v1/attention/backends/utils.py
View file @
d7e1e599
...
@@ -709,7 +709,7 @@ def reorder_batch_to_split_decodes_and_prefills(
...
@@ -709,7 +709,7 @@ def reorder_batch_to_split_decodes_and_prefills(
for
i
,
req_id
in
enumerate
(
input_batch
.
req_ids
):
for
i
,
req_id
in
enumerate
(
input_batch
.
req_ids
):
num_tokens
=
scheduler_output
.
num_scheduled_tokens
[
req_id
]
num_tokens
=
scheduler_output
.
num_scheduled_tokens
[
req_id
]
# for now treat 1 scheduled token as "decode" even if its not,
# for now treat 1 scheduled token as "decode" even if it's not,
# we should update this to something like < 8 in the future but
# we should update this to something like < 8 in the future but
# currently the TritonMLA._forward_decode only supports
# currently the TritonMLA._forward_decode only supports
# num_tokens = 1
# num_tokens = 1
...
...
vllm/v1/structured_output/utils.py
View file @
d7e1e599
...
@@ -65,9 +65,9 @@ def get_outlines_cache_path() -> str:
...
@@ -65,9 +65,9 @@ def get_outlines_cache_path() -> str:
elif
xdg_cache_home
:
elif
xdg_cache_home
:
return
os
.
path
.
join
(
xdg_cache_home
,
".cache"
,
"outlines"
)
return
os
.
path
.
join
(
xdg_cache_home
,
".cache"
,
"outlines"
)
# If homedir is "/", we may be inside a container, and thus writing to
# If homedir is "/", we may be inside a container, and thus writing to
# root would be problematic, so we fallback to using a tempfile.
# root would be problematic, so we fall back to using a tempfile.
# Also validate the path exists, since os.path.expanduser does
# Also validate the path exists, since os.path.expanduser does
# not gar
u
ntee existence.
# not guarantee existence.
elif
os
.
path
.
isdir
(
home_dir
)
and
home_dir
!=
"/"
:
elif
os
.
path
.
isdir
(
home_dir
)
and
home_dir
!=
"/"
:
# Default Unix fallback: ~/.cache/outlines
# Default Unix fallback: ~/.cache/outlines
return
os
.
path
.
join
(
home_dir
,
".cache"
,
"outlines"
)
return
os
.
path
.
join
(
home_dir
,
".cache"
,
"outlines"
)
...
...
vllm/v1/worker/tpu_worker.py
View file @
d7e1e599
...
@@ -250,7 +250,7 @@ class TPUWorker:
...
@@ -250,7 +250,7 @@ class TPUWorker:
scheduler_output
:
"SchedulerOutput"
,
scheduler_output
:
"SchedulerOutput"
,
)
->
Optional
[
ModelRunnerOutput
]:
)
->
Optional
[
ModelRunnerOutput
]:
output
=
self
.
model_runner
.
execute_model
(
scheduler_output
)
output
=
self
.
model_runner
.
execute_model
(
scheduler_output
)
# every worker's output is needed when kv_transfer_group is setup
# every worker's output is needed when kv_transfer_group is set up
return
output
if
self
.
is_driver_worker
or
has_kv_transfer_group
(
return
output
if
self
.
is_driver_worker
or
has_kv_transfer_group
(
)
else
None
)
else
None
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment