Remove need for explicit `\n` in docstring lists for `--help` formatting (#38350)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

Remove need for explicit `\n` in docstring lists for `--help` formatting (#38350)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
21d2b53f · Harry Mellor · GitHub · 98e7f223 · 21d2b53f · 21d2b53f
Unverified Commit 21d2b53f authored Mar 27, 2026 by Harry Mellor Committed by GitHub Mar 27, 2026
10 changed files
--- a/vllm/config/cache.py
+++ b/vllm/config/cache.py
@@ -66,20 +66,19 @@ class CacheConfig:
    enable_prefix_caching: bool = True
    """Whether to enable prefix caching."""
    prefix_caching_hash_algo: PrefixCachingHashAlgo = "sha256"
-    """Set the hash algorithm for prefix caching:\n
+    """Set the hash algorithm for prefix caching:
-    - "sha256" uses Pickle for object serialization before hashing. This is the
-    current default, as SHA256 is the most secure choice to avoid potential
+    - "sha256" uses Pickle for object serialization before hashing. This is the current
-    hash collisions.\n
+      default, as SHA256 is the most secure choice to avoid potential hash collisions.
    - "sha256_cbor" provides a reproducible, cross-language compatible hash. It
-    serializes objects using canonical CBOR and hashes them with SHA-256.\n
+      serializes objects using canonical CBOR and hashes them with SHA-256.
    - "xxhash" uses Pickle serialization with xxHash (128-bit) for faster,
      non-cryptographic hashing. Requires the optional ``xxhash`` package.
-    IMPORTANT: Use of a hashing algorithm that is not considered 
+      IMPORTANT: Use of a hashing algorithm that is not considered cryptographically
-    cryptographically secure theoretically increases the risk of hash collisions,
+      secure theoretically increases the risk of hash collisions, which can cause
-    which can cause undefined behavior or even leak private information in
+      undefined behavior or even leak private information in multi-tenant environments.
-    multi-tenant environments. Even if collisions are still very unlikely, it is
+      Even if collisions are still very unlikely, it is important to consider your
-    important to consider your security risk tolerance against the performance
+      security risk tolerance against the performance benefits before turning this on.
-    benefits before turning this on.\n
    - "xxhash_cbor" combines canonical CBOR serialization with xxHash for
      reproducible hashing. Requires the optional ``xxhash`` package."""
    calculate_kv_scales: bool = False

--- a/vllm/config/kernel.py
+++ b/vllm/config/kernel.py
@@ -32,14 +32,14 @@ class KernelConfig:
    moe_backend: MoEBackend = "auto"
    """Backend for MoE expert computation kernels. Available options:
-    - "auto": Automatically select the best backend based on model and hardware\n
+    - "auto": Automatically select the best backend based on model and hardware
-    - "triton": Use Triton-based fused MoE kernels\n
+    - "triton": Use Triton-based fused MoE kernels
-    - "deep_gemm": Use DeepGEMM kernels (FP8 block-quantized only)\n
+    - "deep_gemm": Use DeepGEMM kernels (FP8 block-quantized only)
-    - "cutlass": Use vLLM CUTLASS kernels\n
+    - "cutlass": Use vLLM CUTLASS kernels
-    - "flashinfer_trtllm": Use FlashInfer with TRTLLM-GEN kernels\n
+    - "flashinfer_trtllm": Use FlashInfer with TRTLLM-GEN kernels
-    - "flashinfer_cutlass": Use FlashInfer with CUTLASS kernels\n
+    - "flashinfer_cutlass": Use FlashInfer with CUTLASS kernels
-    - "flashinfer_cutedsl": Use FlashInfer with CuteDSL kernels (FP4 only)\n
+    - "flashinfer_cutedsl": Use FlashInfer with CuteDSL kernels (FP4 only)
-    - "marlin": Use Marlin kernels (weight-only quantization)\n
+    - "marlin": Use Marlin kernels (weight-only quantization)
    - "aiter": Use AMD AITer kernels (ROCm only)"""
    @field_validator("moe_backend", mode="before")

--- a/vllm/config/load.py
+++ b/vllm/config/load.py
@@ -51,7 +51,7 @@ class LoadConfig:
    - "gguf" will load weights from GGUF format files (details specified in
      https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).
    - "mistral" will load weights from consolidated safetensors files used by
-      Mistral models.\n
+      Mistral models.
    - Other custom values can be supported via plugins.
    """
    download_dir: str | None = None

--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -125,26 +125,28 @@ class ModelConfig:
    """Name or path of the Hugging Face tokenizer to use. If unspecified, model
    name or path will be used."""
    tokenizer_mode: TokenizerMode | str = "auto"
-    """Tokenizer mode:\n
+    """Tokenizer mode:
    - "auto" will use the tokenizer from `mistral_common` for Mistral models
-    if available, otherwise it will use the "hf" tokenizer.\n
+      if available, otherwise it will use the "hf" tokenizer.
-    - "hf" will use the fast tokenizer if available.\n
+    - "hf" will use the fast tokenizer if available.
-    - "slow" will always use the slow tokenizer.\n
+    - "slow" will always use the slow tokenizer.
-    - "mistral" will always use the tokenizer from `mistral_common`.\n
+    - "mistral" will always use the tokenizer from `mistral_common`.
-    - "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
+    - "deepseek_v32" will always use the tokenizer from `deepseek_v32`.
-    - "qwen_vl" will always use the tokenizer from `qwen_vl`.\n
+    - "qwen_vl" will always use the tokenizer from `qwen_vl`.
    - Other custom values can be supported via plugins."""
    trust_remote_code: bool = False
    """Trust remote code (e.g., from HuggingFace) when downloading the model
    and tokenizer."""
    dtype: ModelDType | torch.dtype = "auto"
-    """Data type for model weights and activations:\n
+    """Data type for model weights and activations:
    - "auto" will use FP16 precision for FP32 and FP16 models, and BF16
-    precision for BF16 models.\n
+      precision for BF16 models.
-    - "half" for FP16. Recommended for AWQ quantization.\n
+    - "half" for FP16. Recommended for AWQ quantization.
-    - "float16" is the same as "half".\n
+    - "float16" is the same as "half".
-    - "bfloat16" for a balance between precision and range.\n
+    - "bfloat16" for a balance between precision and range.
-    - "float" is shorthand for FP32 precision.\n
+    - "float" is shorthand for FP32 precision.
    - "float32" for FP32 precision."""
    seed: int = 0
    """Random seed for reproducibility.
@@ -182,10 +184,11 @@ class ModelConfig:
    automatically derived from the model config.
    When passing via `--max-model-len`, supports k/m/g/K/M/G in human-readable
-    format. Examples:\n
+    format. Examples:
-    - 1k -> 1000\n
-    - 1K -> 1024\n
+    - 1k -> 1000
-    - 25.6k -> 25,600\n
+    - 1K -> 1024
+    - 25.6k -> 25,600
    - -1 or 'auto' -> Automatically choose the maximum model length that fits in
      GPU memory. This will use the model's maximum context length if it fits,
      otherwise it will find the largest length that can be accommodated."""
@@ -248,10 +251,11 @@ class ModelConfig:
    prometheus metrics, if multiple names provided, metrics tag will take the
    first one."""
    config_format: str | ConfigFormat = "auto"
-    """The format of the model config to load:\n
+    """The format of the model config to load:
    - "auto" will try to load the config in hf format if available after trying
-    to load in mistral format.\n
+      to load in mistral format.
-    - "hf" will load the config in hf format.\n
+    - "hf" will load the config in hf format.
    - "mistral" will load the config in mistral format."""
    hf_token: bool | str | None = None
    """The token to use as HTTP bearer authorization for remote files. If
@@ -276,12 +280,12 @@ class ModelConfig:
    """Enable sleep mode for the engine (only cuda and
    hip platforms are supported)."""
    model_impl: str | ModelImpl = "auto"
-    """Which implementation of the model to use:\n
+    """Which implementation of the model to use:
-    - "auto" will try to use the vLLM implementation, if it exists, and fall
-    back to the Transformers implementation if no vLLM implementation is
+    - "auto" will try to use the vLLM implementation, if it exists, and fall back to the
-    available.\n
+      Transformers implementation if no vLLM implementation is available.
-    - "vllm" will use the vLLM model implementation.\n
+    - "vllm" will use the vLLM model implementation.
-    - "transformers" will use the Transformers model implementation.\n
+    - "transformers" will use the Transformers model implementation.
    - "terratorch" will use the TerraTorch model implementation.
    """
    override_attention_dtype: str | None = None
@@ -1512,10 +1516,11 @@ class ModelConfig:
    @property
    def score_type(self) -> ScoreType:
        """
-        Scoring API handles score/rerank for:\n
+        Scoring API handles score/rerank for:
-        - "classify" task (score_type: cross-encoder models)\n
-        - "embed" task (score_type: bi-encoder models)\n
+        - "classify" task (score_type: cross-encoder models)
-        - "token_embed" task (score_type: late interaction models)\n
+        - "embed" task (score_type: bi-encoder models)
+        - "token_embed" task (score_type: late interaction models)
        """
        # fixme: self._model_info.score_type is the score type before
        #  as_seq_cls_model, which is "bi-encoder", rather than the
@@ -1593,9 +1598,10 @@ class ModelConfig:
        such as the lm_head in a generation model,
        or the score or classifier in a classification model.
-        `head_dtype` currently only supports pooling models.\n
+        `head_dtype` currently only supports pooling models.
-        - The pooling model defaults to using fp32 head,
-        you can use --hf-overrides '{"head_dtype": "model"}' to disable it.
+        - The pooling model defaults to using fp32 head, you can use
+          --hf-overrides '{"head_dtype": "model"}' to disable it.
        """
        head_dtype = _get_head_dtype(

--- a/vllm/config/multimodal.py
+++ b/vllm/config/multimodal.py
@@ -146,7 +146,7 @@ class MultiModalConfig:
    parallelism (TP).
    - `"weights"`: Within the same vLLM engine, split the weights of
-        each layer across TP ranks. (default TP behavior)\n
+      each layer across TP ranks. (default TP behavior)
    - `"data"`: Within the same vLLM engine, split the batched input data
      across TP ranks to process the data in parallel, while hosting
      the full weights on each TP rank.

--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -148,10 +148,11 @@ class ParallelConfig:
    eplb_config: EPLBConfig = Field(default_factory=EPLBConfig)
    """Expert parallelism configuration."""
    expert_placement_strategy: ExpertPlacementStrategy = "linear"
-    """The expert placement strategy for MoE layers:\n
+    """The expert placement strategy for MoE layers:
    - "linear": Experts are placed in a contiguous manner. For example, with 4
      experts and 2 ranks, rank 0 will have experts [0, 1] and rank 1 will have
-      experts [2, 3].\n
+      experts [2, 3].
    - "round_robin": Experts are placed in a round-robin manner. For example,
      with 4 experts and 2 ranks, rank 0 will have experts [0, 2] and rank 1
      will have experts [1, 3]. This strategy can help improve load balancing
@@ -159,11 +160,11 @@ class ParallelConfig:
    all2all_backend: All2AllBackend = "allgather_reducescatter"
    """All2All backend for MoE expert parallel communication. Available options:
-    - "allgather_reducescatter": All2all based on allgather and reducescatter\n
+    - "allgather_reducescatter": All2all based on allgather and reducescatter
-    - "deepep_high_throughput": Use deepep high-throughput kernels\n
+    - "deepep_high_throughput": Use deepep high-throughput kernels
-    - "deepep_low_latency": Use deepep low-latency kernels\n
+    - "deepep_low_latency": Use deepep low-latency kernels
-    - "mori": Use mori kernels\n
+    - "mori": Use mori kernels
-    - "nixl_ep": Use nixl-ep kernels\n
+    - "nixl_ep": Use nixl-ep kernels
    - "flashinfer_nvlink_two_sided": Use flashinfer two-sided kernels for mnnvl
    - "flashinfer_nvlink_one_sided": Use flashinfer high-throughput a2a kernels"""

--- a/vllm/config/profiler.py
+++ b/vllm/config/profiler.py
@@ -37,7 +37,7 @@ class ProfilerConfig:
    profiler: ProfilerKind | None = None
    """Which profiler to use. Defaults to None. Options are:
-    - 'torch': Use PyTorch profiler.\n
+    - 'torch': Use PyTorch profiler.
    - 'cuda': Use CUDA profiler."""
    torch_profiler_dir: str = ""

--- a/vllm/config/scheduler.py
+++ b/vllm/config/scheduler.py
@@ -106,9 +106,10 @@ class SchedulerConfig:
    max_num_batched_tokens in case max multimodal embedding size is larger."""
    policy: SchedulerPolicy = "fcfs"
-    """The scheduling policy to use:\n
+    """The scheduling policy to use:
    - "fcfs" means first come first served, i.e. requests are handled in order 
-    of arrival.\n
+      of arrival.
    - "priority" means requests are handled based on given priority (lower
      value means earlier handling) and time of arrival deciding any ties."""

--- a/vllm/config/utils.py
+++ b/vllm/config/utils.py
@@ -37,10 +37,12 @@ ConfigT = TypeVar("ConfigT", bound=DataclassInstance)
 @overload
+@dataclass_transform(field_specifiers=(PydanticField,))
 def config(cls: type[ConfigT]) -> type[ConfigT]: ...
 @overload
+@dataclass_transform(field_specifiers=(PydanticField,))
 def config(
    *, config: ConfigDict | None = None, **kwargs: Any
 ) -> Callable[[type[ConfigT]], type[ConfigT]]: ...

--- a/vllm/utils/argparse_utils.py
+++ b/vllm/utils/argparse_utils.py
@@ -31,14 +31,12 @@ class SortedHelpFormatter(ArgumentDefaultsHelpFormatter, RawDescriptionHelpForma
    def _split_lines(self, text, width):
        """
        1. Sentences split across lines have their single newlines removed.
-        2. Paragraphs and explicit newlines are split into separate lines.
+        2. Paragraphs and lists are split into separate lines.
        3. Each line is wrapped to the specified width (width of terminal).
        """
-        # The patterns also include whitespace after the newline
+        # The pattern also includes whitespace after the newline
-        single_newline = re.compile(r"(?<!\n)\n(?!\n)\s*")
+        newlines_to_remove = re.compile(r"(?<!\n)\n(?!\n)(?!\s*(-|\*|\+|\d+\.))\s*")
-        multiple_newlines = re.compile(r"\n{2,}\s*")
+        lines = newlines_to_remove.sub(" ", text).splitlines()
-        text = single_newline.sub(" ", text)
-        lines = re.split(multiple_newlines, text)
        return sum([textwrap.wrap(line, width) for line in lines], [])
    def add_arguments(self, actions):