Unverified commit 02d411fd, authored by Didier Durand, committed by GitHub
Browse files

[Doc]: fix typos in Python comments (#24115)


Signed-off-by: Didier Durand <durand.didier@gmail.com>
parent d7e1e599
......@@ -218,7 +218,7 @@ if __name__ == "__main__":
"--xaxis",
type=str,
default="# of max concurrency.",
help="column name to use as X Axis in comparision graph",
help="column name to use as X Axis in comparison graph",
)
args = parser.parse_args()
......
......@@ -1104,7 +1104,7 @@ def create_argument_parser():
"--percentile-metrics",
type=str,
default="ttft,tpot,itl",
help="Comma-separated list of selected metrics to report percentils. "
help="Comma-separated list of selected metrics to report percentiles. "
"This argument specifies the metrics to report percentiles. "
'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
'Default value is "ttft,tpot,itl".',
......
......@@ -998,7 +998,7 @@ def create_argument_parser():
"--percentile-metrics",
type=str,
default="ttft,tpot,itl",
help="Comma-separated list of selected metrics to report percentils. "
help="Comma-separated list of selected metrics to report percentiles. "
"This argument specifies the metrics to report percentiles. "
'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
'Default value is "ttft,tpot,itl".',
......
......@@ -719,7 +719,7 @@ def create_argument_parser():
"[length * (1 - range_ratio), length * (1 + range_ratio)].",
)
# hf dtaset
# hf dataset
parser.add_argument(
"--hf-subset", type=str, default=None, help="Subset of the HF dataset."
)
......
......@@ -119,7 +119,7 @@ def attempt_to_make_names_unique(entries_and_traces):
if not all_the_same(trace_eles)), None)
if first_trace_difference is None:
# can't create a unique name, leave them names as the
# can't create a unique name, leave the names as they
# are they will get aggregated by the pivot_table call
continue
......
......@@ -513,7 +513,7 @@ if flashinfer_comm is not None:
torch.ops._C.static_scaled_fp8_quant(
quant_out, norm_out, scale_factor)
if scale_factor is None or norm_out is not None:
# we need to return allreduce outpput
# we need to return allreduce output
# in cases of non quant fused AR + RMS norm
# and fused AR + RMS norm + quant without fused add
allreduce_in.copy_(allreduce_out)
......
......@@ -49,7 +49,7 @@ class MQLLMEngine:
This class is used to wrap the
[`LLMEngine`][vllm.engine.llm_engine.LLMEngine] class to enable use
in concurrnet manner. It runs a background loop and uses zeromq to
in concurrent manner. It runs a background loop and uses zeromq to
receive new requests and stream outputs incrementally via ipc.
The [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] generate or encode
......
......@@ -23,7 +23,7 @@ TORCH_DEVICE_IDENTITY = None
# The condition to determine if it is on a platform that supports
# torch._scaled_mm rowwise feature.
# The condition is determined once as the operations
# are time consuming.
# are time-consuming.
USE_ROWWISE_TORCH_SCALED_MM = (current_platform.is_rocm() and version.parse(
torch.__version__) >= version.parse("2.7")
and current_platform.has_device_capability(94))
......
......@@ -211,7 +211,7 @@ class DefaultModelLoader(BaseModelLoader):
if not USE_TPU_COMMONS:
# In PyTorch XLA, we should call `xm.mark_step`
# requently so that not too many ops are accumulated
# frequently so that not too many ops are accumulated
# in the XLA program. import torch_xla.core.xla_model
# as xm
import torch_xla.core.xla_model as xm
......
......@@ -84,7 +84,7 @@ class XPUWorker(Worker):
"""Profiles the peak memory usage of the model to determine how many
KV blocks may be allocated without OOMs.
The engine will first conduct a profiling of the existing memory usage.
Then, it calculate the maximum possible number of GPU and CPU blocks
Then, it calculates the maximum possible number of GPU and CPU blocks
that can be allocated with the remaining free memory.
.. tip::
You may limit the usage of GPU memory
......
......@@ -234,7 +234,7 @@ class Worker(LocalOrDistributedWorkerBase):
KV blocks may be allocated without OOMs.
The engine will first conduct a profiling of the existing memory usage.
Then, it calculate the maximum possible number of GPU and CPU blocks
Then, it calculates the maximum possible number of GPU and CPU blocks
that can be allocated with the remaining free memory.
Tip:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment