Unverified commit 21d2b53f, authored by Harry Mellor and committed by GitHub
Browse files

Remove need for explicit `\n` in docstring lists for `--help` formatting (#38350)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
parent 98e7f223
...@@ -66,20 +66,19 @@ class CacheConfig: ...@@ -66,20 +66,19 @@ class CacheConfig:
enable_prefix_caching: bool = True enable_prefix_caching: bool = True
"""Whether to enable prefix caching.""" """Whether to enable prefix caching."""
prefix_caching_hash_algo: PrefixCachingHashAlgo = "sha256" prefix_caching_hash_algo: PrefixCachingHashAlgo = "sha256"
"""Set the hash algorithm for prefix caching:\n """Set the hash algorithm for prefix caching:
- "sha256" uses Pickle for object serialization before hashing. This is the
current default, as SHA256 is the most secure choice to avoid potential - "sha256" uses Pickle for object serialization before hashing. This is the current
hash collisions.\n default, as SHA256 is the most secure choice to avoid potential hash collisions.
- "sha256_cbor" provides a reproducible, cross-language compatible hash. It - "sha256_cbor" provides a reproducible, cross-language compatible hash. It
serializes objects using canonical CBOR and hashes them with SHA-256.\n serializes objects using canonical CBOR and hashes them with SHA-256.
- "xxhash" uses Pickle serialization with xxHash (128-bit) for faster, - "xxhash" uses Pickle serialization with xxHash (128-bit) for faster,
non-cryptographic hashing. Requires the optional ``xxhash`` package. non-cryptographic hashing. Requires the optional ``xxhash`` package.
IMPORTANT: Use of a hashing algorithm that is not considered IMPORTANT: Use of a hashing algorithm that is not considered cryptographically
cryptographically secure theoretically increases the risk of hash collisions, secure theoretically increases the risk of hash collisions, which can cause
which can cause undefined behavior or even leak private information in undefined behavior or even leak private information in multi-tenant environments.
multi-tenant environments. Even if collisions are still very unlikely, it is Even if collisions are still very unlikely, it is important to consider your
important to consider your security risk tolerance against the performance security risk tolerance against the performance benefits before turning this on.
benefits before turning this on.\n
- "xxhash_cbor" combines canonical CBOR serialization with xxHash for - "xxhash_cbor" combines canonical CBOR serialization with xxHash for
reproducible hashing. Requires the optional ``xxhash`` package.""" reproducible hashing. Requires the optional ``xxhash`` package."""
calculate_kv_scales: bool = False calculate_kv_scales: bool = False
......
...@@ -32,14 +32,14 @@ class KernelConfig: ...@@ -32,14 +32,14 @@ class KernelConfig:
moe_backend: MoEBackend = "auto" moe_backend: MoEBackend = "auto"
"""Backend for MoE expert computation kernels. Available options: """Backend for MoE expert computation kernels. Available options:
- "auto": Automatically select the best backend based on model and hardware\n - "auto": Automatically select the best backend based on model and hardware
- "triton": Use Triton-based fused MoE kernels\n - "triton": Use Triton-based fused MoE kernels
- "deep_gemm": Use DeepGEMM kernels (FP8 block-quantized only)\n - "deep_gemm": Use DeepGEMM kernels (FP8 block-quantized only)
- "cutlass": Use vLLM CUTLASS kernels\n - "cutlass": Use vLLM CUTLASS kernels
- "flashinfer_trtllm": Use FlashInfer with TRTLLM-GEN kernels\n - "flashinfer_trtllm": Use FlashInfer with TRTLLM-GEN kernels
- "flashinfer_cutlass": Use FlashInfer with CUTLASS kernels\n - "flashinfer_cutlass": Use FlashInfer with CUTLASS kernels
- "flashinfer_cutedsl": Use FlashInfer with CuteDSL kernels (FP4 only)\n - "flashinfer_cutedsl": Use FlashInfer with CuteDSL kernels (FP4 only)
- "marlin": Use Marlin kernels (weight-only quantization)\n - "marlin": Use Marlin kernels (weight-only quantization)
- "aiter": Use AMD AITer kernels (ROCm only)""" - "aiter": Use AMD AITer kernels (ROCm only)"""
@field_validator("moe_backend", mode="before") @field_validator("moe_backend", mode="before")
......
...@@ -51,7 +51,7 @@ class LoadConfig: ...@@ -51,7 +51,7 @@ class LoadConfig:
- "gguf" will load weights from GGUF format files (details specified in - "gguf" will load weights from GGUF format files (details specified in
https://github.com/ggml-org/ggml/blob/master/docs/gguf.md). https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).
- "mistral" will load weights from consolidated safetensors files used by - "mistral" will load weights from consolidated safetensors files used by
Mistral models.\n Mistral models.
- Other custom values can be supported via plugins. - Other custom values can be supported via plugins.
""" """
download_dir: str | None = None download_dir: str | None = None
......
...@@ -125,26 +125,28 @@ class ModelConfig: ...@@ -125,26 +125,28 @@ class ModelConfig:
"""Name or path of the Hugging Face tokenizer to use. If unspecified, model """Name or path of the Hugging Face tokenizer to use. If unspecified, model
name or path will be used.""" name or path will be used."""
tokenizer_mode: TokenizerMode | str = "auto" tokenizer_mode: TokenizerMode | str = "auto"
"""Tokenizer mode:\n """Tokenizer mode:
- "auto" will use the tokenizer from `mistral_common` for Mistral models - "auto" will use the tokenizer from `mistral_common` for Mistral models
if available, otherwise it will use the "hf" tokenizer.\n if available, otherwise it will use the "hf" tokenizer.
- "hf" will use the fast tokenizer if available.\n - "hf" will use the fast tokenizer if available.
- "slow" will always use the slow tokenizer.\n - "slow" will always use the slow tokenizer.
- "mistral" will always use the tokenizer from `mistral_common`.\n - "mistral" will always use the tokenizer from `mistral_common`.
- "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n - "deepseek_v32" will always use the tokenizer from `deepseek_v32`.
- "qwen_vl" will always use the tokenizer from `qwen_vl`.\n - "qwen_vl" will always use the tokenizer from `qwen_vl`.
- Other custom values can be supported via plugins.""" - Other custom values can be supported via plugins."""
trust_remote_code: bool = False trust_remote_code: bool = False
"""Trust remote code (e.g., from HuggingFace) when downloading the model """Trust remote code (e.g., from HuggingFace) when downloading the model
and tokenizer.""" and tokenizer."""
dtype: ModelDType | torch.dtype = "auto" dtype: ModelDType | torch.dtype = "auto"
"""Data type for model weights and activations:\n """Data type for model weights and activations:
- "auto" will use FP16 precision for FP32 and FP16 models, and BF16 - "auto" will use FP16 precision for FP32 and FP16 models, and BF16
precision for BF16 models.\n precision for BF16 models.
- "half" for FP16. Recommended for AWQ quantization.\n - "half" for FP16. Recommended for AWQ quantization.
- "float16" is the same as "half".\n - "float16" is the same as "half".
- "bfloat16" for a balance between precision and range.\n - "bfloat16" for a balance between precision and range.
- "float" is shorthand for FP32 precision.\n - "float" is shorthand for FP32 precision.
- "float32" for FP32 precision.""" - "float32" for FP32 precision."""
seed: int = 0 seed: int = 0
"""Random seed for reproducibility. """Random seed for reproducibility.
...@@ -182,10 +184,11 @@ class ModelConfig: ...@@ -182,10 +184,11 @@ class ModelConfig:
automatically derived from the model config. automatically derived from the model config.
When passing via `--max-model-len`, supports k/m/g/K/M/G in human-readable When passing via `--max-model-len`, supports k/m/g/K/M/G in human-readable
format. Examples:\n format. Examples:
- 1k -> 1000\n
- 1K -> 1024\n - 1k -> 1000
- 25.6k -> 25,600\n - 1K -> 1024
- 25.6k -> 25,600
- -1 or 'auto' -> Automatically choose the maximum model length that fits in - -1 or 'auto' -> Automatically choose the maximum model length that fits in
GPU memory. This will use the model's maximum context length if it fits, GPU memory. This will use the model's maximum context length if it fits,
otherwise it will find the largest length that can be accommodated.""" otherwise it will find the largest length that can be accommodated."""
...@@ -248,10 +251,11 @@ class ModelConfig: ...@@ -248,10 +251,11 @@ class ModelConfig:
prometheus metrics, if multiple names provided, metrics tag will take the prometheus metrics, if multiple names provided, metrics tag will take the
first one.""" first one."""
config_format: str | ConfigFormat = "auto" config_format: str | ConfigFormat = "auto"
"""The format of the model config to load:\n """The format of the model config to load:
- "auto" will try to load the config in hf format if available after trying - "auto" will try to load the config in hf format if available after trying
to load in mistral format.\n to load in mistral format.
- "hf" will load the config in hf format.\n - "hf" will load the config in hf format.
- "mistral" will load the config in mistral format.""" - "mistral" will load the config in mistral format."""
hf_token: bool | str | None = None hf_token: bool | str | None = None
"""The token to use as HTTP bearer authorization for remote files . If """The token to use as HTTP bearer authorization for remote files . If
...@@ -276,12 +280,12 @@ class ModelConfig: ...@@ -276,12 +280,12 @@ class ModelConfig:
"""Enable sleep mode for the engine (only cuda and """Enable sleep mode for the engine (only cuda and
hip platforms are supported).""" hip platforms are supported)."""
model_impl: str | ModelImpl = "auto" model_impl: str | ModelImpl = "auto"
"""Which implementation of the model to use:\n """Which implementation of the model to use:
- "auto" will try to use the vLLM implementation, if it exists, and fall
back to the Transformers implementation if no vLLM implementation is - "auto" will try to use the vLLM implementation, if it exists, and fall back to the
available.\n Transformers implementation if no vLLM implementation is available.
- "vllm" will use the vLLM model implementation.\n - "vllm" will use the vLLM model implementation.
- "transformers" will use the Transformers model implementation.\n - "transformers" will use the Transformers model implementation.
- "terratorch" will use the TerraTorch model implementation. - "terratorch" will use the TerraTorch model implementation.
""" """
override_attention_dtype: str | None = None override_attention_dtype: str | None = None
...@@ -1512,10 +1516,11 @@ class ModelConfig: ...@@ -1512,10 +1516,11 @@ class ModelConfig:
@property @property
def score_type(self) -> ScoreType: def score_type(self) -> ScoreType:
""" """
Scoring API handles score/rerank for:\n Scoring API handles score/rerank for:
- "classify" task (score_type: cross-encoder models)\n
- "embed" task (score_type: bi-encoder models)\n - "classify" task (score_type: cross-encoder models)
- "token_embed" task (score_type: late interaction models)\n - "embed" task (score_type: bi-encoder models)
- "token_embed" task (score_type: late interaction models)
""" """
# fixme: self._model_info.score_type is the score type before # fixme: self._model_info.score_type is the score type before
# as_seq_cls_model, which is "bi-encoder", rather than the # as_seq_cls_model, which is "bi-encoder", rather than the
...@@ -1593,9 +1598,10 @@ class ModelConfig: ...@@ -1593,9 +1598,10 @@ class ModelConfig:
such as the lm_head in a generation model, such as the lm_head in a generation model,
or the score or classifier in a classification model. or the score or classifier in a classification model.
`head_dtype` currently only supports pooling models.\n `head_dtype` currently only supports pooling models.
- The pooling model defaults to using fp32 head,
you can use --hf-overrides '{"head_dtype": "model"}' to disable it. - The pooling model defaults to using fp32 head, you can use
--hf-overrides '{"head_dtype": "model"}' to disable it.
""" """
head_dtype = _get_head_dtype( head_dtype = _get_head_dtype(
......
...@@ -146,7 +146,7 @@ class MultiModalConfig: ...@@ -146,7 +146,7 @@ class MultiModalConfig:
parallelism (TP). parallelism (TP).
- `"weights"`: Within the same vLLM engine, split the weights of - `"weights"`: Within the same vLLM engine, split the weights of
each layer across TP ranks. (default TP behavior)\n each layer across TP ranks. (default TP behavior)
- `"data"`: Within the same vLLM engine, split the batched input data - `"data"`: Within the same vLLM engine, split the batched input data
across TP ranks to process the data in parallel, while hosting across TP ranks to process the data in parallel, while hosting
the full weights on each TP rank. the full weights on each TP rank.
......
...@@ -148,10 +148,11 @@ class ParallelConfig: ...@@ -148,10 +148,11 @@ class ParallelConfig:
eplb_config: EPLBConfig = Field(default_factory=EPLBConfig) eplb_config: EPLBConfig = Field(default_factory=EPLBConfig)
"""Expert parallelism configuration.""" """Expert parallelism configuration."""
expert_placement_strategy: ExpertPlacementStrategy = "linear" expert_placement_strategy: ExpertPlacementStrategy = "linear"
"""The expert placement strategy for MoE layers:\n """The expert placement strategy for MoE layers:
- "linear": Experts are placed in a contiguous manner. For example, with 4 - "linear": Experts are placed in a contiguous manner. For example, with 4
experts and 2 ranks, rank 0 will have experts [0, 1] and rank 1 will have experts and 2 ranks, rank 0 will have experts [0, 1] and rank 1 will have
experts [2, 3].\n experts [2, 3].
- "round_robin": Experts are placed in a round-robin manner. For example, - "round_robin": Experts are placed in a round-robin manner. For example,
with 4 experts and 2 ranks, rank 0 will have experts [0, 2] and rank 1 with 4 experts and 2 ranks, rank 0 will have experts [0, 2] and rank 1
will have experts [1, 3]. This strategy can help improve load balancing will have experts [1, 3]. This strategy can help improve load balancing
...@@ -159,11 +160,11 @@ class ParallelConfig: ...@@ -159,11 +160,11 @@ class ParallelConfig:
all2all_backend: All2AllBackend = "allgather_reducescatter" all2all_backend: All2AllBackend = "allgather_reducescatter"
"""All2All backend for MoE expert parallel communication. Available options: """All2All backend for MoE expert parallel communication. Available options:
- "allgather_reducescatter": All2all based on allgather and reducescatter\n - "allgather_reducescatter": All2all based on allgather and reducescatter
- "deepep_high_throughput": Use deepep high-throughput kernels\n - "deepep_high_throughput": Use deepep high-throughput kernels
- "deepep_low_latency": Use deepep low-latency kernels\n - "deepep_low_latency": Use deepep low-latency kernels
- "mori": Use mori kernels\n - "mori": Use mori kernels
- "nixl_ep": Use nixl-ep kernels\n - "nixl_ep": Use nixl-ep kernels
- "flashinfer_nvlink_two_sided": Use flashinfer two-sided kernels for mnnvl - "flashinfer_nvlink_two_sided": Use flashinfer two-sided kernels for mnnvl
- "flashinfer_nvlink_one_sided": Use flashinfer high-throughput a2a kernels""" - "flashinfer_nvlink_one_sided": Use flashinfer high-throughput a2a kernels"""
......
...@@ -37,7 +37,7 @@ class ProfilerConfig: ...@@ -37,7 +37,7 @@ class ProfilerConfig:
profiler: ProfilerKind | None = None profiler: ProfilerKind | None = None
"""Which profiler to use. Defaults to None. Options are: """Which profiler to use. Defaults to None. Options are:
- 'torch': Use PyTorch profiler.\n - 'torch': Use PyTorch profiler.
- 'cuda': Use CUDA profiler.""" - 'cuda': Use CUDA profiler."""
torch_profiler_dir: str = "" torch_profiler_dir: str = ""
......
...@@ -106,9 +106,10 @@ class SchedulerConfig: ...@@ -106,9 +106,10 @@ class SchedulerConfig:
max_num_batched_tokens in case max multimodal embedding size is larger.""" max_num_batched_tokens in case max multimodal embedding size is larger."""
policy: SchedulerPolicy = "fcfs" policy: SchedulerPolicy = "fcfs"
"""The scheduling policy to use:\n """The scheduling policy to use:
- "fcfs" means first come first served, i.e. requests are handled in order - "fcfs" means first come first served, i.e. requests are handled in order
of arrival.\n of arrival.
- "priority" means requests are handled based on given priority (lower - "priority" means requests are handled based on given priority (lower
value means earlier handling) and time of arrival deciding any ties).""" value means earlier handling) and time of arrival deciding any ties)."""
......
...@@ -37,10 +37,12 @@ ConfigT = TypeVar("ConfigT", bound=DataclassInstance) ...@@ -37,10 +37,12 @@ ConfigT = TypeVar("ConfigT", bound=DataclassInstance)
@overload @overload
@dataclass_transform(field_specifiers=(PydanticField,))
def config(cls: type[ConfigT]) -> type[ConfigT]: ... def config(cls: type[ConfigT]) -> type[ConfigT]: ...
@overload @overload
@dataclass_transform(field_specifiers=(PydanticField,))
def config( def config(
*, config: ConfigDict | None = None, **kwargs: Any *, config: ConfigDict | None = None, **kwargs: Any
) -> Callable[[type[ConfigT]], type[ConfigT]]: ... ) -> Callable[[type[ConfigT]], type[ConfigT]]: ...
......
...@@ -31,14 +31,12 @@ class SortedHelpFormatter(ArgumentDefaultsHelpFormatter, RawDescriptionHelpForma ...@@ -31,14 +31,12 @@ class SortedHelpFormatter(ArgumentDefaultsHelpFormatter, RawDescriptionHelpForma
def _split_lines(self, text, width): def _split_lines(self, text, width):
""" """
1. Sentences split across lines have their single newlines removed. 1. Sentences split across lines have their single newlines removed.
2. Paragraphs and explicit newlines are split into separate lines. 2. Paragraphs and lists are split into separate lines.
3. Each line is wrapped to the specified width (width of terminal). 3. Each line is wrapped to the specified width (width of terminal).
""" """
# The patterns also include whitespace after the newline # The pattern also includes whitespace after the newline
single_newline = re.compile(r"(?<!\n)\n(?!\n)\s*") newlines_to_remove = re.compile(r"(?<!\n)\n(?!\n)(?!\s*(-|\*|\+|\d+\.))\s*")
multiple_newlines = re.compile(r"\n{2,}\s*") lines = newlines_to_remove.sub(" ", text).splitlines()
text = single_newline.sub(" ", text)
lines = re.split(multiple_newlines, text)
return sum([textwrap.wrap(line, width) for line in lines], []) return sum([textwrap.wrap(line, width) for line in lines], [])
def add_arguments(self, actions): def add_arguments(self, actions):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment