Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
66d3d542
Unverified
Commit
66d3d542
authored
Nov 27, 2025
by
Didier Durand
Committed by
GitHub
Nov 27, 2025
Browse files
[Doc]: fixing typos in diverse files (#29492)
Signed-off-by:
Didier Durand
<
durand.didier@gmail.com
>
parent
bab438ff
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
12 additions
and
10 deletions
+12
-10
vllm/benchmarks/serve.py
vllm/benchmarks/serve.py
+2
-2
vllm/config/parallel.py
vllm/config/parallel.py
+2
-2
vllm/lora/punica_wrapper/punica_base.py
vllm/lora/punica_wrapper/punica_base.py
+1
-1
vllm/model_executor/models/adapters.py
vllm/model_executor/models/adapters.py
+2
-2
vllm/v1/sample/tpu/sampler.py
vllm/v1/sample/tpu/sampler.py
+1
-1
vllm/v1/worker/dp_utils.py
vllm/v1/worker/dp_utils.py
+4
-2
No files found.
vllm/benchmarks/serve.py
View file @
66d3d542
...
...
@@ -1005,7 +1005,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
help
=
"Key-value pairs (e.g, --header x-additional-info=0.3.3) "
"for headers to be passed with each request. These headers override "
"per backend constants and values set via environment variable, and "
"will be overriden by other arguments (such as request ids)."
,
"will be overridden by other arguments (such as request ids)."
,
)
parser
.
add_argument
(
"--max-concurrency"
,
...
...
@@ -1138,7 +1138,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
"--percentile-metrics"
,
type
=
str
,
default
=
None
,
help
=
"Comma-separated list of selected metrics to report percentils. "
help
=
"Comma-separated list of selected metrics to report percentiles. "
"This argument specifies the metrics to report percentiles. "
'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
'If not specified, defaults to "ttft,tpot,itl" for generative models '
...
...
vllm/config/parallel.py
View file @
66d3d542
...
...
@@ -238,9 +238,9 @@ class ParallelConfig:
cp_kv_cache_interleave_size
:
int
=
1
"""Interleave size of kv_cache storage while using DCP or PCP.
For `total_cp_rank = pcp_rank * dcp_world_size + dcp_rank`,
and `total_cp_world_size = pcp_world_size * dcp_world_szie`.
and `total_cp_world_size = pcp_world_size * dcp_world_size`.
store interleave_size tokens on total_cp_rank i,
then store next interleave_size tokens on taotal_cp_rank i+1.
then store next interleave_size tokens on total_cp_rank i+1.
Interleave_size=1: token-level alignment, where token `i` is stored on
total_cp_rank `i % total_cp_world_size`.
Interleave_size=block_size: block-level alignment, where tokens are
...
...
vllm/lora/punica_wrapper/punica_base.py
View file @
66d3d542
...
...
@@ -173,7 +173,7 @@ class PunicaWrapperBase(PunicaWrapperABC):
vocab_size
:
int
,
):
# NOTE We have remove lora extra vocab support for now. So we set
# extra_vocab_size alwayzs to 0, and extra_vocab_size will be removed.
# extra_vocab_size always to 0, and extra_vocab_size will be removed.
extra_vocab_size
=
0
(
...
...
vllm/model_executor/models/adapters.py
View file @
66d3d542
...
...
@@ -428,7 +428,7 @@ def load_weights_using_from_2_way_softmax(
)
if
text_config
.
tie_word_embeddings
:
# embed_tokens is the assumed name for input embeddings. If the model does not
# have this attribute, we fallback to get_input_embeddings(), which is used by
# have this attribute, we fall back to get_input_embeddings(), which is used by
# the Transformers modeling backend.
embed_tokens
=
(
model
.
model
.
embed_tokens
...
...
@@ -486,7 +486,7 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te
)
if
text_config
.
tie_word_embeddings
:
# embed_tokens is the assumed name for input embeddings. If the model does not
# have this attribute, we fallback to get_input_embeddings(), which is used by
# have this attribute, we fall back to get_input_embeddings(), which is used by
# the Transformers modeling backend.
embed_tokens
=
(
model
.
model
.
embed_tokens
...
...
vllm/v1/sample/tpu/sampler.py
View file @
66d3d542
...
...
@@ -181,7 +181,7 @@ def apply_top_k_top_p(
after thresholding the logit using this cut-off, the remaining elements
shall constitute the top-p set.
Note: in the case of tie (i.e. multipple cut-off elements present in the
Note: in the case of tie (i.e. multiple cut-off elements present in the
logit), all tie elements are included in the top-p set. In other words,
this function does not break ties. Instead, these tie tokens have equal
chance of being chosen during final sampling, so we can consider the tie
...
...
vllm/v1/worker/dp_utils.py
View file @
66d3d542
...
...
@@ -24,12 +24,14 @@ def _get_device_and_group(parallel_config: ParallelConfig):
device
=
get_dp_group
().
device
group
=
get_dp_group
().
device_group
# Transfering this tensor from GPU to CPU will introduce a GPU sync
# Transferring this tensor from GPU to CPU will introduce a GPU sync
# point that could adversely affect performance of vllm with asynch
# scheduling. This environment variable exists to quickly disable
# this optimization if we run into this case.
if
parallel_config
.
disable_nccl_for_dp_synchronization
:
logger
.
info_once
(
"Using CPU all reduce to syncronize DP padding between ranks."
)
logger
.
info_once
(
"Using CPU all reduce to synchronize DP padding between ranks."
)
device
=
"cpu"
group
=
get_dp_group
().
cpu_group
return
device
,
group
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment