Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
d3da2eea
Unverified
Commit
d3da2eea
authored
Aug 28, 2025
by
Didier Durand
Committed by
GitHub
Aug 28, 2025
Browse files
[Doc]: fix typos in Python scripts (#23828)
Signed-off-by:
Didier Durand
<
durand.didier@gmail.com
>
parent
bfab2196
Changes
10
Show whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
12 additions
and
12 deletions
+12
-12
vllm/compilation/backends.py
vllm/compilation/backends.py
+2
-2
vllm/config/cache.py
vllm/config/cache.py
+1
-1
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+1
-1
vllm/entrypoints/chat_utils.py
vllm/entrypoints/chat_utils.py
+1
-1
vllm/entrypoints/openai/api_server.py
vllm/entrypoints/openai/api_server.py
+1
-1
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
...ers/quantization/compressed_tensors/compressed_tensors.py
+1
-1
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
...quantization/compressed_tensors/compressed_tensors_moe.py
+1
-1
vllm/v1/cudagraph_dispatcher.py
vllm/v1/cudagraph_dispatcher.py
+2
-2
vllm/v1/worker/block_table.py
vllm/v1/worker/block_table.py
+1
-1
vllm/v1/worker/cpu_model_runner.py
vllm/v1/worker/cpu_model_runner.py
+1
-1
No files found.
vllm/compilation/backends.py
View file @
d3da2eea
...
@@ -271,7 +271,7 @@ def split_graph(graph: fx.GraphModule,
...
@@ -271,7 +271,7 @@ def split_graph(graph: fx.GraphModule,
outputs
.
append
(
outputs
.
append
(
SplitItem
(
name
,
graph_id
,
(
graph_id
in
split_op_graphs
),
module
))
SplitItem
(
name
,
graph_id
,
(
graph_id
in
split_op_graphs
),
module
))
# sort by inte
t
ger graph_id, rather than string name
# sort by integer graph_id, rather than string name
outputs
.
sort
(
key
=
lambda
x
:
x
.
graph_id
)
outputs
.
sort
(
key
=
lambda
x
:
x
.
graph_id
)
return
split_gm
,
outputs
return
split_gm
,
outputs
...
@@ -424,7 +424,7 @@ class VllmBackend:
...
@@ -424,7 +424,7 @@ class VllmBackend:
# if the model is initialized with a non-empty prefix,
# if the model is initialized with a non-empty prefix,
# then usually it's enough to use that prefix,
# then usually it's enough to use that prefix,
# e.g. la
u
nguage_model, vision_model, etc.
# e.g. language_model, vision_model, etc.
# when multiple parts are initialized as independent
# when multiple parts are initialized as independent
# models, we need to use the model_tag to distinguish
# models, we need to use the model_tag to distinguish
# them, e.g. backbone (default), eagle_head, etc.
# them, e.g. backbone (default), eagle_head, etc.
...
...
vllm/config/cache.py
View file @
d3da2eea
...
@@ -115,7 +115,7 @@ class CacheConfig:
...
@@ -115,7 +115,7 @@ class CacheConfig:
In some KV sharing setups, e.g. YOCO (https://arxiv.org/abs/2405.05254),
In some KV sharing setups, e.g. YOCO (https://arxiv.org/abs/2405.05254),
some layers can skip tokens corresponding to prefill. This flag enables
some layers can skip tokens corresponding to prefill. This flag enables
attention metadata for eligible layers to be overriden with metadata
attention metadata for eligible layers to be overrid
d
en with metadata
necessary for implementing this optimization in some models (e.g. Gemma3n)
necessary for implementing this optimization in some models (e.g. Gemma3n)
"""
"""
...
...
vllm/engine/arg_utils.py
View file @
d3da2eea
...
@@ -1053,7 +1053,7 @@ class EngineArgs:
...
@@ -1053,7 +1053,7 @@ class EngineArgs:
self
.
trust_remote_code
,
self
.
revision
,
self
.
trust_remote_code
,
self
.
revision
,
self
.
code_revision
,
self
.
config_format
)
self
.
code_revision
,
self
.
config_format
)
# if loading a SpeculatorsConfig, load the specu
a
ltive_config
# if loading a SpeculatorsConfig, load the specul
a
tive_config
# details from the config directly
# details from the config directly
# no user input required / expected
# no user input required / expected
if
isinstance
(
hf_config
,
SpeculatorsConfig
):
if
isinstance
(
hf_config
,
SpeculatorsConfig
):
...
...
vllm/entrypoints/chat_utils.py
View file @
d3da2eea
...
@@ -640,7 +640,7 @@ class BaseMultiModalContentParser(ABC):
...
@@ -640,7 +640,7 @@ class BaseMultiModalContentParser(ABC):
def
__init__
(
self
)
->
None
:
def
__init__
(
self
)
->
None
:
super
().
__init__
()
super
().
__init__
()
# stores model placeho
d
lers list with corresponding
# stores model placehol
d
ers list with corresponding
# general MM placeholder:
# general MM placeholder:
# {
# {
# "<##IMAGE##>": ["<image>", "<image>", "<image>"],
# "<##IMAGE##>": ["<image>", "<image>", "<image>"],
...
...
vllm/entrypoints/openai/api_server.py
View file @
d3da2eea
...
@@ -1096,7 +1096,7 @@ if envs.VLLM_SERVER_DEV_MODE:
...
@@ -1096,7 +1096,7 @@ if envs.VLLM_SERVER_DEV_MODE:
raise
HTTPException
(
status_code
=
HTTPStatus
.
BAD_REQUEST
.
value
,
raise
HTTPException
(
status_code
=
HTTPStatus
.
BAD_REQUEST
.
value
,
detail
=
"Missing 'method' in request body"
)
detail
=
"Missing 'method' in request body"
)
# For security reason, only serialized string args/kwargs are passed.
# For security reason, only serialized string args/kwargs are passed.
# User-defined `method` is responsible for deseralization if needed.
# User-defined `method` is responsible for deser
i
alization if needed.
args
:
list
[
str
]
=
body
.
get
(
"args"
,
[])
args
:
list
[
str
]
=
body
.
get
(
"args"
,
[])
kwargs
:
dict
[
str
,
str
]
=
body
.
get
(
"kwargs"
,
{})
kwargs
:
dict
[
str
,
str
]
=
body
.
get
(
"kwargs"
,
{})
timeout
:
Optional
[
float
]
=
body
.
get
(
"timeout"
)
timeout
:
Optional
[
float
]
=
body
.
get
(
"timeout"
)
...
...
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
View file @
d3da2eea
...
@@ -579,7 +579,7 @@ class CompressedTensorsConfig(QuantizationConfig):
...
@@ -579,7 +579,7 @@ class CompressedTensorsConfig(QuantizationConfig):
format
=
scheme_dict
.
get
(
"format"
)
format
=
scheme_dict
.
get
(
"format"
)
# Find the sparsity scheme of the layer
# Find the sparsity scheme of the layer
# assume that fused layers iner
h
it first component's sparsity scheme
# assume that fused layers in
h
erit first component's sparsity scheme
sparsity_targets
=
(
self
.
sparsity_scheme_map
.
keys
()
-
sparsity_targets
=
(
self
.
sparsity_scheme_map
.
keys
()
-
set
(
self
.
sparsity_ignore_list
))
set
(
self
.
sparsity_ignore_list
))
sparsity_scheme
:
Optional
[
SparsityCompressionConfig
]
=
None
sparsity_scheme
:
Optional
[
SparsityCompressionConfig
]
=
None
...
...
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
View file @
d3da2eea
...
@@ -71,7 +71,7 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase):
...
@@ -71,7 +71,7 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase):
)
->
"CompressedTensorsMoEMethod"
:
)
->
"CompressedTensorsMoEMethod"
:
# TODO: @dsikka: refactor this to use schemes as other kernels
# TODO: @dsikka: refactor this to use schemes as other kernels
# are supported + check if the layer is being ignored.
# are supported + check if the layer is being ignored.
# Check if a using "Linear" to select sche
e
ms
# Check if a using "Linear" to select schem
e
s
if
"Linear"
in
quant_config
.
target_scheme_map
:
if
"Linear"
in
quant_config
.
target_scheme_map
:
matched_target
=
"Linear"
matched_target
=
"Linear"
else
:
else
:
...
...
vllm/v1/cudagraph_dispatcher.py
View file @
d3da2eea
...
@@ -11,7 +11,7 @@ logger = init_logger(__name__)
...
@@ -11,7 +11,7 @@ logger = init_logger(__name__)
class
CudagraphDispatcher
:
class
CudagraphDispatcher
:
"""
"""
Runtime cudagraph dispatcher to dispach keys for multiple set of cudagraphs.
Runtime cudagraph dispatcher to dispa
t
ch keys for multiple set of cudagraphs.
The dispatcher stores two sets of dispatch keys, one for PIECEWISE and one
The dispatcher stores two sets of dispatch keys, one for PIECEWISE and one
for FULL cudagraph runtime mode. The keys are initialized depending on
for FULL cudagraph runtime mode. The keys are initialized depending on
...
@@ -21,7 +21,7 @@ class CudagraphDispatcher:
...
@@ -21,7 +21,7 @@ class CudagraphDispatcher:
At runtime, the dispatch method generates the runtime cudagraph mode (FULL,
At runtime, the dispatch method generates the runtime cudagraph mode (FULL,
PIECEWISE, or NONE for no cudagraph) and the valid key (batch descriptor)
PIECEWISE, or NONE for no cudagraph) and the valid key (batch descriptor)
based on the input key. After dispatching (commuicate via forward context),
based on the input key. After dispatching (commu
n
icate via forward context),
the cudagraph wrappers will trust the dispatch key to do either capturing
the cudagraph wrappers will trust the dispatch key to do either capturing
or replaying (if mode matched), or pass through to the underlying runnable
or replaying (if mode matched), or pass through to the underlying runnable
without cudagraph (if mode no match or mode is NONE).
without cudagraph (if mode no match or mode is NONE).
...
...
vllm/v1/worker/block_table.py
View file @
d3da2eea
...
@@ -110,7 +110,7 @@ class BlockTable:
...
@@ -110,7 +110,7 @@ class BlockTable:
self
.
block_table_cpu
.
fill_
(
0
)
self
.
block_table_cpu
.
fill_
(
0
)
def
get_device_tensor
(
self
)
->
torch
.
Tensor
:
def
get_device_tensor
(
self
)
->
torch
.
Tensor
:
"""R
u
turns the device tensor of the block table."""
"""R
e
turns the device tensor of the block table."""
return
self
.
block_table
return
self
.
block_table
def
get_cpu_tensor
(
self
)
->
torch
.
Tensor
:
def
get_cpu_tensor
(
self
)
->
torch
.
Tensor
:
...
...
vllm/v1/worker/cpu_model_runner.py
View file @
d3da2eea
...
@@ -43,7 +43,7 @@ class CPUModelRunner(GPUModelRunner):
...
@@ -43,7 +43,7 @@ class CPUModelRunner(GPUModelRunner):
Args:
Args:
scheduler_output: The scheduler output.
scheduler_output: The scheduler output.
"""
"""
# Attention free models have zero kv_cache_goups, however models
# Attention free models have zero kv_cache_g
r
oups, however models
# like Mamba are also attention free but use the kv_cache for
# like Mamba are also attention free but use the kv_cache for
# keeping its internal state. This is why we check the number
# keeping its internal state. This is why we check the number
# of kv_cache groups instead of solely checking
# of kv_cache groups instead of solely checking
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment