Unverified commit 66d3d542, authored by Didier Durand and committed by GitHub
Browse files

[Doc]: fixing typos in diverse files (#29492)


Signed-off-by: Didier Durand <durand.didier@gmail.com>
parent bab438ff
...@@ -1005,7 +1005,7 @@ def add_cli_args(parser: argparse.ArgumentParser): ...@@ -1005,7 +1005,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
help="Key-value pairs (e.g, --header x-additional-info=0.3.3) " help="Key-value pairs (e.g, --header x-additional-info=0.3.3) "
"for headers to be passed with each request. These headers override " "for headers to be passed with each request. These headers override "
"per backend constants and values set via environment variable, and " "per backend constants and values set via environment variable, and "
"will be overriden by other arguments (such as request ids).", "will be overridden by other arguments (such as request ids).",
) )
parser.add_argument( parser.add_argument(
"--max-concurrency", "--max-concurrency",
...@@ -1138,7 +1138,7 @@ def add_cli_args(parser: argparse.ArgumentParser): ...@@ -1138,7 +1138,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
"--percentile-metrics", "--percentile-metrics",
type=str, type=str,
default=None, default=None,
help="Comma-separated list of selected metrics to report percentils. " help="Comma-separated list of selected metrics to report percentiles. "
"This argument specifies the metrics to report percentiles. " "This argument specifies the metrics to report percentiles. "
'Allowed metric names are "ttft", "tpot", "itl", "e2el". ' 'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
'If not specified, defaults to "ttft,tpot,itl" for generative models ' 'If not specified, defaults to "ttft,tpot,itl" for generative models '
......
...@@ -238,9 +238,9 @@ class ParallelConfig: ...@@ -238,9 +238,9 @@ class ParallelConfig:
cp_kv_cache_interleave_size: int = 1 cp_kv_cache_interleave_size: int = 1
"""Interleave size of kv_cache storage while using DCP or PCP. """Interleave size of kv_cache storage while using DCP or PCP.
For `total_cp_rank = pcp_rank * dcp_world_size + dcp_rank`, For `total_cp_rank = pcp_rank * dcp_world_size + dcp_rank`,
and `total_cp_world_size = pcp_world_size * dcp_world_szie`. and `total_cp_world_size = pcp_world_size * dcp_world_size`.
store interleave_size tokens on total_cp_rank i, store interleave_size tokens on total_cp_rank i,
then store next interleave_size tokens on taotal_cp_rank i+1. then store next interleave_size tokens on total_cp_rank i+1.
Interleave_size=1: token-level alignment, where token `i` is stored on Interleave_size=1: token-level alignment, where token `i` is stored on
total_cp_rank `i % total_cp_world_size`. total_cp_rank `i % total_cp_world_size`.
Interleave_size=block_size: block-level alignment, where tokens are Interleave_size=block_size: block-level alignment, where tokens are
......
...@@ -173,7 +173,7 @@ class PunicaWrapperBase(PunicaWrapperABC): ...@@ -173,7 +173,7 @@ class PunicaWrapperBase(PunicaWrapperABC):
vocab_size: int, vocab_size: int,
): ):
# NOTE We have remove lora extra vocab support for now. So we set # NOTE We have remove lora extra vocab support for now. So we set
# extra_vocab_size alwayzs to 0, and extra_vocab_size will be removed. # extra_vocab_size always to 0, and extra_vocab_size will be removed.
extra_vocab_size = 0 extra_vocab_size = 0
( (
......
...@@ -428,7 +428,7 @@ def load_weights_using_from_2_way_softmax( ...@@ -428,7 +428,7 @@ def load_weights_using_from_2_way_softmax(
) )
if text_config.tie_word_embeddings: if text_config.tie_word_embeddings:
# embed_tokens is the assumed name for input embeddings. If the model does not # embed_tokens is the assumed name for input embeddings. If the model does not
# have this attribute, we fallback to get_input_embeddings(), which is used by # have this attribute, we fall back to get_input_embeddings(), which is used by
# the Transformers modeling backend. # the Transformers modeling backend.
embed_tokens = ( embed_tokens = (
model.model.embed_tokens model.model.embed_tokens
...@@ -486,7 +486,7 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te ...@@ -486,7 +486,7 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te
) )
if text_config.tie_word_embeddings: if text_config.tie_word_embeddings:
# embed_tokens is the assumed name for input embeddings. If the model does not # embed_tokens is the assumed name for input embeddings. If the model does not
# have this attribute, we fallback to get_input_embeddings(), which is used by # have this attribute, we fall back to get_input_embeddings(), which is used by
# the Transformers modeling backend. # the Transformers modeling backend.
embed_tokens = ( embed_tokens = (
model.model.embed_tokens model.model.embed_tokens
......
...@@ -181,7 +181,7 @@ def apply_top_k_top_p( ...@@ -181,7 +181,7 @@ def apply_top_k_top_p(
after thresholding the logit using this cut-off, the remaining elements after thresholding the logit using this cut-off, the remaining elements
shall constitute the top-p set. shall constitute the top-p set.
Note: in the case of tie (i.e. multipple cut-off elements present in the Note: in the case of tie (i.e. multiple cut-off elements present in the
logit), all tie elements are included in the top-p set. In other words, logit), all tie elements are included in the top-p set. In other words,
this function does not break ties. Instead, these tie tokens have equal this function does not break ties. Instead, these tie tokens have equal
chance of being chosen during final sampling, so we can consider the tie chance of being chosen during final sampling, so we can consider the tie
......
...@@ -24,12 +24,14 @@ def _get_device_and_group(parallel_config: ParallelConfig): ...@@ -24,12 +24,14 @@ def _get_device_and_group(parallel_config: ParallelConfig):
device = get_dp_group().device device = get_dp_group().device
group = get_dp_group().device_group group = get_dp_group().device_group
# Transfering this tensor from GPU to CPU will introduce a GPU sync # Transferring this tensor from GPU to CPU will introduce a GPU sync
# point that could adversely affect performance of vllm with asynch # point that could adversely affect performance of vllm with asynch
# scheduling. This environment variable exists to quickly disable # scheduling. This environment variable exists to quickly disable
# this optimization if we run into this case. # this optimization if we run into this case.
if parallel_config.disable_nccl_for_dp_synchronization: if parallel_config.disable_nccl_for_dp_synchronization:
logger.info_once("Using CPU all reduce to syncronize DP padding between ranks.") logger.info_once(
"Using CPU all reduce to synchronize DP padding between ranks."
)
device = "cpu" device = "cpu"
group = get_dp_group().cpu_group group = get_dp_group().cpu_group
return device, group return device, group
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment