Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f4962a6d
Unverified
Commit
f4962a6d
authored
Sep 08, 2025
by
Didier Durand
Committed by
GitHub
Sep 08, 2025
Browse files
[Doc]: fix typos in Python comments (#24417)
Signed-off-by:
Didier Durand
<
durand.didier@gmail.com
>
parent
2f0b833a
Changes
12
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
12 additions
and
12 deletions
+12
-12
examples/offline_inference/chat_with_tools.py
examples/offline_inference/chat_with_tools.py
+1
-1
vllm/attention/backends/mla/common.py
vllm/attention/backends/mla/common.py
+1
-1
vllm/config/__init__.py
vllm/config/__init__.py
+1
-1
vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
...distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+1
-1
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+1
-1
vllm/engine/multiprocessing/client.py
vllm/engine/multiprocessing/client.py
+1
-1
vllm/entrypoints/openai/cli_args.py
vllm/entrypoints/openai/cli_args.py
+1
-1
vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py
...points/openai/tool_parsers/llama4_pythonic_tool_parser.py
+1
-1
vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
+1
-1
vllm/model_executor/layers/fused_moe/modular_kernel.py
vllm/model_executor/layers/fused_moe/modular_kernel.py
+1
-1
vllm/model_executor/layers/quantization/utils/marlin_utils.py
.../model_executor/layers/quantization/utils/marlin_utils.py
+1
-1
vllm/v1/worker/block_table.py
vllm/v1/worker/block_table.py
+1
-1
No files found.
examples/offline_inference/chat_with_tools.py
View file @
f4962a6d
...
...
@@ -143,5 +143,5 @@ outputs = llm.chat(messages, sampling_params, tools=tools)
print
(
outputs
[
0
].
outputs
[
0
].
text
.
strip
())
# yields
# 'The weather in Dallas, TX is 85 degrees
f
ahrenheit. '
# 'The weather in Dallas, TX is 85 degrees
F
ahrenheit. '
# 'It is partly cloudy, with highs in the 90's.'
vllm/attention/backends/mla/common.py
View file @
f4962a6d
...
...
@@ -1052,7 +1052,7 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
return
layer
.
weight
# we currently do not have quantized bmm's which are needed for
# `W_UV` and `W_UK_T`, we
we
just store fp16/bf16 copies and perform
# `W_UV` and `W_UK_T`, we just store fp16/bf16 copies and perform
# the bmm's in 16-bit, the extra memory overhead of this is fairly low
kv_b_proj_weight
=
get_and_maybe_dequant_weights
(
self
.
kv_b_proj
).
T
assert
kv_b_proj_weight
.
shape
==
(
...
...
vllm/config/__init__.py
View file @
f4962a6d
...
...
@@ -1169,7 +1169,7 @@ class ModelConfig:
]
# Any custom overrides will be in quantization_methods so we place
# them at the start of the list so custom overrides have preference
# over the built
in ones.
# over the built
-
in ones.
quantization_methods
=
quantization_methods
+
overrides
# Detect which checkpoint it is
...
...
vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
View file @
f4962a6d
...
...
@@ -770,7 +770,7 @@ class NixlConnectorWorker:
# with joint KV for each block. This minimizes the overhead in
# registerMem allowing faster descs queries. In order to be able to
# split on kv_heads dim as required by heterogeneous TP, one must
# be able to index K/V separately. Hence
the
we double the number
# be able to index K/V separately. Hence we double the number
# of 'virtual' regions here and halve `block_len` below.
self
.
num_regions
*=
2
...
...
vllm/engine/arg_utils.py
View file @
f4962a6d
...
...
@@ -1159,7 +1159,7 @@ class EngineArgs:
# Note(hc): In the current implementation of decode context
# parallel(DCP), tp_size needs to be divisible by dcp_size,
# because the world size does not change by dcp, it simply
# reuse the GPUs of TP group, and split one TP group into
# reuse
s
the GPUs of TP group, and split one TP group into
# tp_size//dcp_size DCP groups.
assert
self
.
tensor_parallel_size
%
self
.
decode_context_parallel_size
\
==
0
,
(
...
...
vllm/engine/multiprocessing/client.py
View file @
f4962a6d
...
...
@@ -235,7 +235,7 @@ class MQLLMEngineClient(EngineClient):
# therefore we have to inform that the current
# processed requests failed as well. Send back a dead
# engine error to give this feedback and also give a
# 'hint' to the server to shutdown next.
# 'hint' to the server to shut
down next.
exception
=
self
.
dead_error
if
request_id
is
None
:
...
...
vllm/entrypoints/openai/cli_args.py
View file @
f4962a6d
...
...
@@ -204,7 +204,7 @@ schema. Example: `[{"type": "text", "text": "Hello world!"}]`"""
frontend_kwargs
[
"lora_modules"
][
"type"
]
=
optional_type
(
str
)
frontend_kwargs
[
"lora_modules"
][
"action"
]
=
LoRAParserAction
# Special case: Middleware needs append action
# Special case: Middleware needs
to
append action
frontend_kwargs
[
"middleware"
][
"action"
]
=
"append"
frontend_kwargs
[
"middleware"
][
"type"
]
=
str
if
"nargs"
in
frontend_kwargs
[
"middleware"
]:
...
...
vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py
View file @
f4962a6d
...
...
@@ -176,7 +176,7 @@ class Llama4PythonicToolParser(ToolParser):
index
]
+=
delta
.
function
.
arguments
# HACK: serving_chat.py inspects the internal state of tool parsers
# when determining it
'
s final streaming delta, automatically
# when determining its final streaming delta, automatically
# adding autocompleted JSON.
# These two lines avoid that nonsense while ensuring finish_reason
# is set to tool_calls when at least one tool is called.
...
...
vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
View file @
f4962a6d
...
...
@@ -143,7 +143,7 @@ class MistralToolParser(ToolParser):
except
json
.
JSONDecodeError
:
# use a regex to find the part corresponding to the tool call.
# NOTE: This use case should not happen if the model is trained
# correctly. It's a easy possible fix so it's included, but
# correctly. It's a
n
easy possible fix so it's included, but
# can be brittle for very complex / highly nested tool calls
raw_tool_call
=
self
.
tool_call_regex
.
findall
(
tool_content
)[
0
]
function_call_arr
=
json
.
loads
(
raw_tool_call
)
...
...
vllm/model_executor/layers/fused_moe/modular_kernel.py
View file @
f4962a6d
...
...
@@ -302,7 +302,7 @@ class FusedMoEPrepareAndFinalize(ABC):
def
max_num_tokens_per_rank
(
self
)
->
Optional
[
int
]:
"""
Some PrepareFinalize All2All implementations are batched. Meaning,
they can process
es
only as set of tokens at a time. This
they can process only a set of tokens at a time. This
function returns the batch size, i.e. the maximum number of tokens
the implementation can process at a time.
Return None if there are no such restrictions.
...
...
vllm/model_executor/layers/quantization/utils/marlin_utils.py
View file @
f4962a6d
...
...
@@ -201,7 +201,7 @@ def marlin_make_workspace(output_size_per_partition: int,
def
marlin_make_workspace_new
(
device
:
torch
.
device
,
max_blocks_per_sm
:
int
=
1
)
->
torch
.
Tensor
:
# In the new marlin kernel, we use the num of threadblocks as workspace
# size. The num of threadblocks is
is
sms_count * max_blocks_per_sm.
# size. The num of threadblocks is sms_count * max_blocks_per_sm.
sms
=
torch
.
cuda
.
get_device_properties
(
device
).
multi_processor_count
return
torch
.
zeros
(
sms
*
max_blocks_per_sm
,
dtype
=
torch
.
int
,
...
...
vllm/v1/worker/block_table.py
View file @
f4962a6d
...
...
@@ -98,7 +98,7 @@ class BlockTable:
# here because M (max_model_len) is not necessarily divisible by
# block_size.
if
self
.
dcp_world_size
>
1
:
# Note(hc): The DCP implement store kvcache with a interleave
# Note(hc): The DCP implement store kvcache with a
n
interleave
# style, the kvcache for the token whose token_idx is i is
# always stored on the GPU whose dcp_rank equals i % cp_world_size:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment