DOC: Documentation pages fixes (#38125)

Signed-off-by: Mateusz Sokół <mat646@gmail.com>

DOC: Documentation pages fixes (#38125)
Signed-off-by: Mateusz Sokół <mat646@gmail.com>
b1cb1d3d · Mateusz Sokół · GitHub · 6ae8bbd0 · b1cb1d3d
Unverified Commit b1cb1d3d authored Mar 26, 2026 by Mateusz Sokół Committed by GitHub Mar 26, 2026
Hide whitespace changes
Inline Side-by-side

Showing with 31 additions and 26 deletions

vllm/config/load.py vllm/config/load.py +31 -26

No files found.
--- a/vllm/config/load.py
+++ b/vllm/config/load.py
@@ -24,38 +24,43 @@ class LoadConfig:
    """Configuration for loading the model weights."""

    load_format: str | LoadFormats = "auto"
-    """The format of the model weights to load:\n
+    """
+    The format of the model weights to load.
+
    - "auto" will try to load the weights in the safetensors format and fall
-    back to the pytorch bin format if safetensors format is not available.\n
-    - "pt" will load the weights in the pytorch bin format.\n
-    - "safetensors" will load the weights in the safetensors format.\n
+      back to the pytorch bin format if safetensors format is not available.
+    - "pt" will load the weights in the pytorch bin format.
+    - "safetensors" will load the weights in the safetensors format.
    - "instanttensor" will load the Safetensors weights on CUDA devices using
-    InstantTensor, which enables distributed loading with pipelined prefetching
-    and fast direct I/O.\n
+      InstantTensor, which enables distributed loading with pipelined prefetching
+      and fast direct I/O.
    - "npcache" will load the weights in pytorch format and store a numpy cache
-    to speed up the loading.\n
+      to speed up the loading.
    - "dummy" will initialize the weights with random values, which is mainly
-    for profiling.\n
+      for profiling.
    - "tensorizer" will use CoreWeave's tensorizer library for fast weight
-    loading. See the Tensorize vLLM Model script in the Examples section for
-    more information.\n
+      loading. See the Tensorize vLLM Model script in the Examples section for
+      more information.
    - "runai_streamer" will load the Safetensors weights using Run:ai Model
-    Streamer.\n
+      Streamer.
    - "runai_streamer_sharded" will load weights from pre-sharded checkpoint
-    files using Run:ai Model Streamer.\n
-    - "bitsandbytes" will load the weights using bitsandbytes quantization.\n
+      files using Run:ai Model Streamer.
+    - "bitsandbytes" will load the weights using bitsandbytes quantization.
    - "sharded_state" will load weights from pre-sharded checkpoint files,
-    supporting efficient loading of tensor-parallel models.\n
+      supporting efficient loading of tensor-parallel models.
    - "gguf" will load weights from GGUF format files (details specified in
-    https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).\n
+      https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).
    - "mistral" will load weights from consolidated safetensors files used by
-    Mistral models.\n
-    - Other custom values can be supported via plugins."""
+      Mistral models.
+    - Other custom values can be supported via plugins.
+    """
    download_dir: str | None = None
    """Directory to download and load the weights, default to the default
    cache directory of Hugging Face."""
    safetensors_load_strategy: str | None = None
-    """Specifies the loading strategy for safetensors weights.
+    """
+    Specifies the loading strategy for safetensors weights.
+
    - None (default): Uses memory-mapped (lazy) loading. When an NFS
      filesystem is detected and the total checkpoint size fits within 90%%
      of available RAM, prefetching is enabled automatically.
@@ -72,7 +77,7 @@ class LoadConfig:
    - "torchao": Weights are loaded in upfront and then reconstructed
      into torchao tensor subclasses. This is used when the checkpoint
      was quantized using torchao and saved using safetensors.
-      Needs torchao >= 0.14.0
+      Needs `torchao >= 0.14.0`.
    """
    model_loader_extra_config: dict | TensorizerConfig = Field(default_factory=dict)
    """Extra config for model loader. This will be passed to the model loader
@@ -88,13 +93,13 @@ class LoadConfig:
    weights."""
    pt_load_map_location: str | dict[str, str] = "cpu"
    """
-    pt_load_map_location: the map location for loading pytorch checkpoint, to
-    support loading checkpoints can only be loaded on certain devices like
-    "cuda", this is equivalent to {"": "cuda"}. Another supported format is
-    mapping from different devices like from GPU 1 to GPU 0:
-    {"cuda:1": "cuda:0"}. Note that when passed from command line, the strings
-    in dictionary needs to be double quoted for json parsing. For more details,
-    see original doc for `map_location` in https://pytorch.org/docs/stable/generated/torch.load.html
+    The map location for loading pytorch checkpoint, to support loading
+    checkpoints that can only be loaded on certain devices like "cuda", this
+    is equivalent to `{"": "cuda"}`. Another supported format is mapping
+    from different devices like from GPU 1 to GPU 0: `{"cuda:1": "cuda:0"}`.
+    Note that when passed from command line, the strings in dictionary
+    need to be double quoted for json parsing. For more details, see
+    the original doc for the `map_location` parameter in [`torch.load`][].
    """

    def compute_hash(self) -> str: