Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
04149cce
Unverified
Commit
04149cce
authored
Apr 09, 2025
by
yihong
Committed by
GitHub
Apr 09, 2025
Browse files
[BugFix] fix some typos found by typos. (#16314)
Signed-off-by:
yihong0618
<
zouzou0208@gmail.com
>
parent
24834f48
Changes
21
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
32 additions
and
32 deletions
+32
-32
benchmarks/benchmark_serving.py
benchmarks/benchmark_serving.py
+2
-2
benchmarks/benchmark_serving_structured_output.py
benchmarks/benchmark_serving_structured_output.py
+2
-2
csrc/mamba/causal_conv1d/causal_conv1d.cu
csrc/mamba/causal_conv1d/causal_conv1d.cu
+1
-1
vllm/attention/backends/flash_attn.py
vllm/attention/backends/flash_attn.py
+1
-1
vllm/attention/backends/hpu_attn.py
vllm/attention/backends/hpu_attn.py
+3
-3
vllm/attention/backends/mla/common.py
vllm/attention/backends/mla/common.py
+3
-3
vllm/attention/backends/xformers.py
vllm/attention/backends/xformers.py
+3
-3
vllm/attention/ops/nki_flash_attn.py
vllm/attention/ops/nki_flash_attn.py
+1
-1
vllm/benchmarks/serve.py
vllm/benchmarks/serve.py
+2
-2
vllm/engine/output_processor/multi_step.py
vllm/engine/output_processor/multi_step.py
+1
-1
vllm/entrypoints/openai/tool_parsers/utils.py
vllm/entrypoints/openai/tool_parsers/utils.py
+1
-1
vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
...xecutor/layers/quantization/kernels/scaled_mm/__init__.py
+1
-1
vllm/platforms/cpu.py
vllm/platforms/cpu.py
+3
-3
vllm/platforms/interface.py
vllm/platforms/interface.py
+1
-1
vllm/reasoning/granite_reasoning_parser.py
vllm/reasoning/granite_reasoning_parser.py
+1
-1
vllm/sampling_params.py
vllm/sampling_params.py
+1
-1
vllm/third_party/pynvml.py
vllm/third_party/pynvml.py
+1
-1
vllm/v1/attention/backends/mla/common.py
vllm/v1/attention/backends/mla/common.py
+2
-2
vllm/v1/executor/multiproc_executor.py
vllm/v1/executor/multiproc_executor.py
+1
-1
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+1
-1
No files found.
benchmarks/benchmark_serving.py
View file @
04149cce
...
...
@@ -921,7 +921,7 @@ if __name__ == "__main__":
"--percentile-metrics"
,
type
=
str
,
default
=
"ttft,tpot,itl"
,
help
=
"Comma-sep
e
rated list of selected metrics to report percentils. "
help
=
"Comma-sep
a
rated list of selected metrics to report percentils. "
"This argument specifies the metrics to report percentiles. "
"Allowed metric names are
\"
ttft
\"
,
\"
tpot
\"
,
\"
itl
\"
,
\"
e2el
\"
. "
"Default value is
\"
ttft,tpot,itl
\"
."
)
...
...
@@ -929,7 +929,7 @@ if __name__ == "__main__":
"--metric-percentiles"
,
type
=
str
,
default
=
"99"
,
help
=
"Comma-sep
e
rated list of percentiles for selected metrics. "
help
=
"Comma-sep
a
rated list of percentiles for selected metrics. "
"To report 25-th, 50-th, and 75-th percentiles, use
\"
25,50,75
\"
. "
"Default value is
\"
99
\"
. "
"Use
\"
--percentile-metrics
\"
to select metrics."
,
...
...
benchmarks/benchmark_serving_structured_output.py
View file @
04149cce
...
...
@@ -963,7 +963,7 @@ if __name__ == "__main__":
"--percentile-metrics"
,
type
=
str
,
default
=
"ttft,tpot,itl"
,
help
=
"Comma-sep
e
rated list of selected metrics to report percentils. "
help
=
"Comma-sep
a
rated list of selected metrics to report percentils. "
"This argument specifies the metrics to report percentiles. "
"Allowed metric names are
\"
ttft
\"
,
\"
tpot
\"
,
\"
itl
\"
,
\"
e2el
\"
. "
"Default value is
\"
ttft,tpot,itl
\"
."
)
...
...
@@ -971,7 +971,7 @@ if __name__ == "__main__":
"--metric-percentiles"
,
type
=
str
,
default
=
"99"
,
help
=
"Comma-sep
e
rated list of percentiles for selected metrics. "
help
=
"Comma-sep
a
rated list of percentiles for selected metrics. "
"To report 25-th, 50-th, and 75-th percentiles, use
\"
25,50,75
\"
. "
"Default value is
\"
99
\"
. "
"Use
\"
--percentile-metrics
\"
to select metrics."
,
...
...
csrc/mamba/causal_conv1d/causal_conv1d.cu
View file @
04149cce
...
...
@@ -422,7 +422,7 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
int
final_state_position
=
((
seqlen
-
(
kWidth
-
1
))
-
(
n_chunks
-
1
)
*
kChunkSize
);
// in case the final state is separated between the last "smem_exchange" and
// and the one before it (chunk = n_chunks - 1 and chunk = n_chunks - 2),
// (which occurs when `final_state_position` is a non-positiv
i
e index)
// (which occurs when `final_state_position` is a non-positive index)
// we load the correct data from smem_exchange from both chunks, the last chunk iteration and the one before it
if
(
conv_states
!=
nullptr
&&
final_state_position
<
0
&&
seqlen
>
kWidth
){
input_t
vals_load
[
kNElts
]
=
{
0
};
...
...
vllm/attention/backends/flash_attn.py
View file @
04149cce
...
...
@@ -326,7 +326,7 @@ class FlashAttentionMetadata(AttentionMetadata):
assert
self
.
use_cuda_graph
if
turn_prefills_into_decodes
:
# When Mu
t
li-Step is enabled with Chunked-Prefill, prefills and
# When Mul
t
i-Step is enabled with Chunked-Prefill, prefills and
# decodes are scheduled together. In the first step, all the
# prefills turn into decodes. This update reflects that
# conversion.
...
...
vllm/attention/backends/hpu_attn.py
View file @
04149cce
...
...
@@ -152,11 +152,11 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
logger
.
warning
(
"Could not import HPU FusedSDPA kernel. "
"vLLM will use native implementation."
)
suppored_head_sizes
=
HPUPagedAttention
.
get_supported_head_sizes
()
if
head_size
not
in
suppored_head_sizes
:
suppor
t
ed_head_sizes
=
HPUPagedAttention
.
get_supported_head_sizes
()
if
head_size
not
in
suppor
t
ed_head_sizes
:
raise
ValueError
(
f
"Head size
{
head_size
}
is not supported by PagedAttention. "
f
"Supported head sizes are:
{
suppored_head_sizes
}
."
)
f
"Supported head sizes are:
{
suppor
t
ed_head_sizes
}
."
)
if
attn_type
!=
AttentionType
.
DECODER
:
raise
NotImplementedError
(
"Encoder self-attention and "
...
...
vllm/attention/backends/mla/common.py
View file @
04149cce
...
...
@@ -83,8 +83,8 @@ spda_o = scaled_dot_product_attention(
return spda_o @ W_O
NOTE: in the actual code,
`kv_b_proj` is [W_UK; W_UV] concatnated per head
`q_b_proj` is [W_UQ; W_QR] concatnated per head
`kv_b_proj` is [W_UK; W_UV] concat
e
nated per head
`q_b_proj` is [W_UQ; W_QR] concat
e
nated per head
`out_proj` is W_O
...
...
@@ -667,7 +667,7 @@ class MLACommonMetadata(AttentionMetadata):
assert
num_seqs
>
num_queries
if
turn_prefills_into_decodes
:
# When Mu
t
li-Step is enabled with Chunked-Prefill, prefills and
# When Mul
t
i-Step is enabled with Chunked-Prefill, prefills and
# decodes are scheduled together. In the first step, all the
# prefills turn into decodes. This update reflects that
# conversion.
...
...
vllm/attention/backends/xformers.py
View file @
04149cce
...
...
@@ -414,11 +414,11 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
assert
self
.
num_heads
%
self
.
num_kv_heads
==
0
self
.
num_queries_per_kv
=
self
.
num_heads
//
self
.
num_kv_heads
suppored_head_sizes
=
PagedAttention
.
get_supported_head_sizes
()
if
head_size
not
in
suppored_head_sizes
:
suppor
t
ed_head_sizes
=
PagedAttention
.
get_supported_head_sizes
()
if
head_size
not
in
suppor
t
ed_head_sizes
:
raise
ValueError
(
f
"Head size
{
head_size
}
is not supported by PagedAttention. "
f
"Supported head sizes are:
{
suppored_head_sizes
}
."
)
f
"Supported head sizes are:
{
suppor
t
ed_head_sizes
}
."
)
self
.
attn_type
=
attn_type
...
...
vllm/attention/ops/nki_flash_attn.py
View file @
04149cce
...
...
@@ -446,7 +446,7 @@ def flash_paged_attention(
IO tensor dtypes:
- This kernel assumes all IO tensors have the same dtype except for
block_tables (int32) and mask (int32)
- If mixed_p
e
rcision is True, then all Tensor Engine operation will be
- If mixed_pr
e
cision is True, then all Tensor Engine operation will be
performed in bfloat16 and accumulation will be performed in float32.
Otherwise the intermediates will be in the same type as the inputs.
...
...
vllm/benchmarks/serve.py
View file @
04149cce
...
...
@@ -724,14 +724,14 @@ def add_cli_args(parser: argparse.ArgumentParser):
"--percentile-metrics"
,
type
=
str
,
default
=
"ttft,tpot,itl"
,
help
=
"Comma-sep
e
rated list of selected metrics to report percentils. "
help
=
"Comma-sep
a
rated list of selected metrics to report percentils. "
"This argument specifies the metrics to report percentiles. "
"Allowed metric names are
\"
ttft
\"
,
\"
tpot
\"
,
\"
itl
\"
,
\"
e2el
\"
. "
)
parser
.
add_argument
(
"--metric-percentiles"
,
type
=
str
,
default
=
"99"
,
help
=
"Comma-sep
e
rated list of percentiles for selected metrics. "
help
=
"Comma-sep
a
rated list of percentiles for selected metrics. "
"To report 25-th, 50-th, and 75-th percentiles, use
\"
25,50,75
\"
. "
"Use
\"
--percentile-metrics
\"
to select metrics."
,
)
...
...
vllm/engine/output_processor/multi_step.py
View file @
04149cce
...
...
@@ -93,7 +93,7 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor):
externally (before the next schedule() call)
"""
# Sequences can be in RUNNING or FINISHED_ABORTED state
# once scheduled, as a sequence is moved to FIN
S
IHED_ABORTED
# once scheduled, as a sequence is moved to FINI
S
HED_ABORTED
# if a client disconnects from the api server.
seqs
=
sequence_group
.
get_seqs
(
status
=
SequenceStatus
.
RUNNING
)
if
seqs
is
None
:
...
...
vllm/entrypoints/openai/tool_parsers/utils.py
View file @
04149cce
...
...
@@ -98,7 +98,7 @@ def find_all_indices(string: str, substring: str) -> list[int]:
# partial_json_parser doesn't support extra data and
# JSONDeco
r
der.raw_decode doesn't support partial JSON
# JSONDecoder.raw_decode doesn't support partial JSON
def
partial_json_loads
(
input_str
:
str
,
flags
:
Allow
)
->
tuple
[
Any
,
int
]:
try
:
return
(
partial_json_parser
.
loads
(
input_str
,
flags
),
len
(
input_str
))
...
...
vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
View file @
04149cce
...
...
@@ -29,7 +29,7 @@ def choose_scaled_mm_linear_kernel(
compute_capability
:
Optional
[
int
]
=
None
)
->
Type
[
ScaledMMLinearKernel
]:
"""
Choose an Scal
l
edMMLinearKernel that can implement the given config for the
Choose an ScaledMMLinearKernel that can implement the given config for the
given compute capability. Attempts to choose the best kernel in terms of
performance.
...
...
vllm/platforms/cpu.py
View file @
04149cce
...
...
@@ -69,12 +69,12 @@ class CpuPlatform(Platform):
cache_config
=
vllm_config
.
cache_config
ipex_ava
l
iable
=
find_spec
(
"intel_extension_for_pytorch"
)
is
not
None
ipex_avai
l
able
=
find_spec
(
"intel_extension_for_pytorch"
)
is
not
None
if
cache_config
and
cache_config
.
block_size
is
None
:
cache_config
.
block_size
=
128
if
ipex_ava
l
iable
else
16
cache_config
.
block_size
=
128
if
ipex_avai
l
able
else
16
if
not
ipex_ava
l
iable
and
cache_config
.
block_size
!=
16
:
if
not
ipex_avai
l
able
and
cache_config
.
block_size
!=
16
:
raise
RuntimeError
(
f
"--block-size=
{
cache_config
.
block_size
}
requires"
" intel_extension_for_pytorch"
)
...
...
vllm/platforms/interface.py
View file @
04149cce
...
...
@@ -231,7 +231,7 @@ class Platform:
parser
:
Optional
[
FlexibleArgumentParser
]
=
None
)
->
None
:
"""
Do some pre-regist
e
ration or update action for the current platform.
Do some pre-registration or update action for the current platform.
This function is called before global VllmConfig is initialized or cli
arguments are parsed. It's used for out-of-tree platforms to register or
...
...
vllm/reasoning/granite_reasoning_parser.py
View file @
04149cce
...
...
@@ -60,7 +60,7 @@ class GraniteReasoningParser(ReasoningParser):
Args:
model_output (str): Output of the model to be parsed.
request (ChatCompletionReqest): Request being processed.
request (ChatCompletionReq
u
est): Request being processed.
Returns:
tuple[Optional[str], Optional[str]]: Tuple pair containing the
...
...
vllm/sampling_params.py
View file @
04149cce
...
...
@@ -101,7 +101,7 @@ class RequestOutputKind(Enum):
CUMULATIVE
=
0
# Return only deltas in each RequestOutput
DELTA
=
1
# Do not return intermediate RequestOuput
s
# Do not return intermediate RequestOu
t
put
FINAL_ONLY
=
2
...
...
vllm/third_party/pynvml.py
View file @
04149cce
...
...
@@ -1119,7 +1119,7 @@ class _PrintableStructure(Structure):
e.g. class that has _field_ 'hex_value', c_uint could be formatted with
_fmt_ = {"hex_value" : "%08X"}
to produce nicer output.
Default fo
m
ratting string for all fields can be set with key "<default>" like:
Default for
m
atting string for all fields can be set with key "<default>" like:
_fmt_ = {"<default>" : "%d MHz"} # e.g all values are numbers in MHz.
If not set it's assumed to be just "%s"
...
...
vllm/v1/attention/backends/mla/common.py
View file @
04149cce
...
...
@@ -83,8 +83,8 @@ spda_o = scaled_dot_product_attention(
return spda_o @ W_O
NOTE: in the actual code,
`kv_b_proj` is [W_UK; W_UV] concatnated per head
`q_b_proj` is [W_UQ; W_QR] concatnated per head
`kv_b_proj` is [W_UK; W_UV] concat
e
nated per head
`q_b_proj` is [W_UQ; W_QR] concat
e
nated per head
`out_proj` is W_O
...
...
vllm/v1/executor/multiproc_executor.py
View file @
04149cce
...
...
@@ -326,7 +326,7 @@ class WorkerProc:
logger
.
debug
(
"Worker interrupted."
)
except
Exception
:
# worker_busy_loop sends exceptions
exceptons
to Executor
# worker_busy_loop sends exceptions to Executor
# for shutdown, but if there is an error in startup or an
# error with IPC itself, we need to alert the parent.
psutil
.
Process
().
parent
().
send_signal
(
signal
.
SIGUSR1
)
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
04149cce
...
...
@@ -998,7 +998,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
)
->
Union
[
ModelRunnerOutput
,
torch
.
Tensor
]:
self
.
_update_states
(
scheduler_output
)
if
not
scheduler_output
.
total_num_scheduled_tokens
:
# Return empty ModelRunnerOu
p
tut if there's no work to do.
# Return empty ModelRunnerOut
p
ut if there's no work to do.
return
EMPTY_MODEL_RUNNER_OUTPUT
if
self
.
is_multimodal_model
:
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment