Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
bcb06d7b
Unverified
Commit
bcb06d7b
authored
Sep 12, 2025
by
Didier Durand
Committed by
GitHub
Sep 12, 2025
Browse files
[Doc]: fix typos in various files (#24726)
Signed-off-by:
Didier Durand
<
durand.didier@gmail.com
>
parent
0377802c
Changes
11
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
11 additions
and
11 deletions
+11
-11
benchmarks/kernels/benchmark_w8a8_block_fp8.py
benchmarks/kernels/benchmark_w8a8_block_fp8.py
+1
-1
csrc/cpu/cpu_types_vxe.hpp
csrc/cpu/cpu_types_vxe.hpp
+1
-1
csrc/cpu/sgl-kernels/moe.cpp
csrc/cpu/sgl-kernels/moe.cpp
+1
-1
docs/design/multiprocessing.md
docs/design/multiprocessing.md
+1
-1
vllm/attention/backends/flash_attn.py
vllm/attention/backends/flash_attn.py
+1
-1
vllm/benchmarks/datasets.py
vllm/benchmarks/datasets.py
+1
-1
vllm/entrypoints/openai/protocol.py
vllm/entrypoints/openai/protocol.py
+1
-1
vllm/model_executor/layers/mamba/mamba_mixer2.py
vllm/model_executor/layers/mamba/mamba_mixer2.py
+1
-1
vllm/model_executor/models/minicpmv.py
vllm/model_executor/models/minicpmv.py
+1
-1
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+1
-1
vllm/v1/worker/tpu_input_batch.py
vllm/v1/worker/tpu_input_batch.py
+1
-1
No files found.
benchmarks/kernels/benchmark_w8a8_block_fp8.py
View file @
bcb06d7b
...
@@ -56,7 +56,7 @@ def w8a8_block_matmul(
...
@@ -56,7 +56,7 @@ def w8a8_block_matmul(
Bs: The per-block quantization scale for `B`.
Bs: The per-block quantization scale for `B`.
block_size: The block size for per-block quantization.
block_size: The block size for per-block quantization.
It should be 2-dim, e.g., [128, 128].
It should be 2-dim, e.g., [128, 128].
output_dytpe: The dtype of the returned tensor.
output_dtype: The dtype of the returned tensor.
Returns:
Returns:
torch.Tensor: The result of matmul.
torch.Tensor: The result of matmul.
...
...
csrc/cpu/cpu_types_vxe.hpp
View file @
bcb06d7b
...
@@ -12,7 +12,7 @@ namespace vec_op {
...
@@ -12,7 +12,7 @@ namespace vec_op {
#define vec_sub(a, b) ((a) - (b))
#define vec_sub(a, b) ((a) - (b))
#define vec_mul(a, b) ((a) * (b))
#define vec_mul(a, b) ((a) * (b))
#define vec_div(a, b) ((a) / (b))
#define vec_div(a, b) ((a) / (b))
#define vec_sr(a, b) ((a) >> (b)) // Vector Shift Right Algebaic
#define vec_sr(a, b) ((a) >> (b)) // Vector Shift Right Algebraic
#define vec_sl(a, b) ((a) << (b)) // Vector Shift Left
#define vec_sl(a, b) ((a) << (b)) // Vector Shift Left
// FIXME: FP16 is not fully supported in Torch-CPU
// FIXME: FP16 is not fully supported in Torch-CPU
...
...
csrc/cpu/sgl-kernels/moe.cpp
View file @
bcb06d7b
...
@@ -215,7 +215,7 @@ int moe_align_block_size(
...
@@ -215,7 +215,7 @@ int moe_align_block_size(
offsets
[
mb
+
1
]
=
sorted_id_size
(
sorted_ids
+
mb
*
BLOCK_M
);
offsets
[
mb
+
1
]
=
sorted_id_size
(
sorted_ids
+
mb
*
BLOCK_M
);
}
}
});
});
// TODO: do we need to vecterize this ?
// TODO: do we need to vectorize this ?
for
(
int
mb
=
0
;
mb
<
num_token_blocks
;
++
mb
)
{
for
(
int
mb
=
0
;
mb
<
num_token_blocks
;
++
mb
)
{
offsets
[
mb
+
1
]
+=
offsets
[
mb
];
offsets
[
mb
+
1
]
+=
offsets
[
mb
];
}
}
...
...
docs/design/multiprocessing.md
View file @
bcb06d7b
...
@@ -8,7 +8,7 @@ page for information on known issues and how to solve them.
...
@@ -8,7 +8,7 @@ page for information on known issues and how to solve them.
## Introduction
## Introduction
!!! important
!!! important
The source code references are to the state of the code at the time of writing in December, 2024.
The source code references are to the state of the code at the time of writing in December 2024.
The use of Python multiprocessing in vLLM is complicated by:
The use of Python multiprocessing in vLLM is complicated by:
...
...
vllm/attention/backends/flash_attn.py
View file @
bcb06d7b
...
@@ -901,7 +901,7 @@ def _get_query_key_seq_metadata(
...
@@ -901,7 +901,7 @@ def _get_query_key_seq_metadata(
attn_metadata
.
encoder_seq_start_loc
,
attn_metadata
.
encoder_seq_start_loc
,
attn_metadata
.
max_encoder_seq_len
)
attn_metadata
.
max_encoder_seq_len
)
elif
attn_type
==
AttentionType
.
ENCODER
:
elif
attn_type
==
AttentionType
.
ENCODER
:
# For encoder attention both the query and the key are same i.e the
# For encoder attention both the query and the key are same i.e. the
# encoder sequence.
# encoder sequence.
return
(
attn_metadata
.
encoder_seq_start_loc
,
return
(
attn_metadata
.
encoder_seq_start_loc
,
attn_metadata
.
max_encoder_seq_len
,
attn_metadata
.
max_encoder_seq_len
,
...
...
vllm/benchmarks/datasets.py
View file @
bcb06d7b
...
@@ -551,7 +551,7 @@ class RandomDataset(BenchmarkDataset):
...
@@ -551,7 +551,7 @@ class RandomDataset(BenchmarkDataset):
[6880, 6881] -> ['Ġcalls', 'here'] ->
[6880, 6881] -> ['Ġcalls', 'here'] ->
[1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
[1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
To avoid uncontrolled change of the prompt length,
To avoid uncontrolled change of the prompt length,
the encoded sequence is truncated before being decode again.
the encoded sequence is truncated before being decoded again.
"""
"""
# Build the inner sequence by sampling sequentially from the vocab
# Build the inner sequence by sampling sequentially from the vocab
inner_seq
=
((
offset
+
index
+
np
.
arange
(
input_len
))
inner_seq
=
((
offset
+
index
+
np
.
arange
(
input_len
))
...
...
vllm/entrypoints/openai/protocol.py
View file @
bcb06d7b
...
@@ -242,7 +242,7 @@ def get_logits_processors(processors: Optional[LogitsProcessors],
...
@@ -242,7 +242,7 @@ def get_logits_processors(processors: Optional[LogitsProcessors],
elif
processors
:
elif
processors
:
raise
ValueError
(
raise
ValueError
(
"The `logits_processors` argument is not supported by this "
"The `logits_processors` argument is not supported by this "
"server. See --logits-processor-pattern engine argugment "
"server. See --logits-processor-pattern engine argument "
"for more information."
)
"for more information."
)
return
None
return
None
...
...
vllm/model_executor/layers/mamba/mamba_mixer2.py
View file @
bcb06d7b
...
@@ -324,7 +324,7 @@ class MambaMixer2(MambaBase, CustomOp):
...
@@ -324,7 +324,7 @@ class MambaMixer2(MambaBase, CustomOp):
# - the weight already has a "weight_loader" attribute
# - the weight already has a "weight_loader" attribute
# which set_weight_attrs will raise if we do not
# which set_weight_attrs will raise if we do not
# delete before trying to override it
# delete before trying to override it
# - ditto for the otther two weights below
# - ditto for the other two weights below
delattr
(
self
.
conv1d
.
bias
,
"weight_loader"
)
delattr
(
self
.
conv1d
.
bias
,
"weight_loader"
)
set_weight_attrs
(
set_weight_attrs
(
self
.
conv1d
.
bias
,
self
.
conv1d
.
bias
,
...
...
vllm/model_executor/models/minicpmv.py
View file @
bcb06d7b
...
@@ -1117,7 +1117,7 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -1117,7 +1117,7 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
def
_process_multimodal_inputs
(
self
,
modalities
:
dict
):
def
_process_multimodal_inputs
(
self
,
modalities
:
dict
):
# The result multimodal_embeddings is tuple of tensors, with each
# The result multimodal_embeddings is tuple of tensors, with each
# tensor correspoending to a multimodal data item (image or video).
# tensor corresponding to a multimodal data item (image or video).
multimodal_embeddings
:
tuple
[
torch
.
Tensor
,
...]
=
()
multimodal_embeddings
:
tuple
[
torch
.
Tensor
,
...]
=
()
# NOTE: It is important to iterate over the keys in this dictionary
# NOTE: It is important to iterate over the keys in this dictionary
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
bcb06d7b
...
@@ -2659,7 +2659,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -2659,7 +2659,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
num_tokens
+=
num_pad
num_tokens
+=
num_pad
# If cudagraph_mode.decode_mode() == FULL and
# If cudagraph_mode.decode_mode() == FULL and
# cudagraph_mode.seperate_routine(). This means that we are using
# cudagraph_mode.separate_routine(). This means that we are using
# different graphs and/or modes for mixed prefill-decode batches vs.
# different graphs and/or modes for mixed prefill-decode batches vs.
# uniform decode batches. A uniform decode batch means that all
# uniform decode batches. A uniform decode batch means that all
# requests have identical query length, except a potential virtual
# requests have identical query length, except a potential virtual
...
...
vllm/v1/worker/tpu_input_batch.py
View file @
bcb06d7b
...
@@ -392,7 +392,7 @@ class InputBatch:
...
@@ -392,7 +392,7 @@ class InputBatch:
# NOTE: the following is unsafe
# NOTE: the following is unsafe
# self.token_ids_cpu[i1, ...], self.token_ids_cpu[i2, ...], =\
# self.token_ids_cpu[i1, ...], self.token_ids_cpu[i2, ...], =\
# self.token_ids_cpu[i2, ...], self.token_ids_cpu[i1, ...]
# self.token_ids_cpu[i2, ...], self.token_ids_cpu[i1, ...]
# instead, we need to temporiarily copy the data for one of the indices
# instead, we need to temporarily copy the data for one of the indices
# TODO(lucas): optimize this by only copying valid indices
# TODO(lucas): optimize this by only copying valid indices
tmp
=
self
.
token_ids_cpu
[
i1
,
...].
copy
()
tmp
=
self
.
token_ids_cpu
[
i1
,
...].
copy
()
self
.
token_ids_cpu
[
i1
,
...]
=
self
.
token_ids_cpu
[
i2
,
...]
self
.
token_ids_cpu
[
i1
,
...]
=
self
.
token_ids_cpu
[
i2
,
...]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment