norm / vllm · Commits

Unverified commit a921d8be, authored Nov 22, 2023 by Casper; committed by GitHub, Nov 22, 2023.
[DOCS] Add engine args documentation (#1741)
Parent: 094f716b
Showing 3 changed files with 119 additions and 0 deletions.
docs/source/index.rst                 +1    −0
docs/source/models/engine_args.rst    +114  −0
vllm/engine/arg_utils.py              +4    −0
docs/source/index.rst
@@ -73,6 +73,7 @@ Documentation
    ...
    models/supported_models
    models/adding_model
+   models/engine_args

 .. toctree::
    :maxdepth: 1
    ...
docs/source/models/engine_args.rst
new file (0 → 100644)
.. _engine_args:

Engine Arguments
================

Below, you can find an explanation of every engine argument for vLLM:

.. option:: --model <model_name_or_path>

    Name or path of the Hugging Face model to use.
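
    For example, a minimal sketch of picking a model through the Python API (assuming, as vLLM's examples suggest, that engine arguments map onto ``LLM`` keyword arguments; the model name is illustrative):

    .. code-block:: python

        from vllm import LLM

        # Equivalent to --model facebook/opt-125m on the CLI.
        llm = LLM(model="facebook/opt-125m")
        outputs = llm.generate(["Hello, my name is"])
        print(outputs[0].outputs[0].text)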

.. option:: --tokenizer <tokenizer_name_or_path>

    Name or path of the Hugging Face tokenizer to use.

.. option:: --revision <revision>

    The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, the default version is used.

.. option:: --tokenizer-revision <revision>

    The specific tokenizer version to use. It can be a branch name, a tag name, or a commit id. If unspecified, the default version is used.

.. option:: --tokenizer-mode {auto,slow}

    The tokenizer mode.

    * "auto" will use the fast tokenizer if available.
    * "slow" will always use the slow tokenizer.

.. option:: --trust-remote-code

    Trust remote code from Hugging Face.

.. option:: --download-dir <directory>

    Directory in which to download and load the weights; defaults to the default Hugging Face cache directory.

.. option:: --load-format {auto,pt,safetensors,npcache,dummy}

    The format of the model weights to load.

    * "auto" will try to load the weights in the safetensors format and fall back to the pytorch bin format if safetensors is not available.
    * "pt" will load the weights in the pytorch bin format.
    * "safetensors" will load the weights in the safetensors format.
    * "npcache" will load the weights in pytorch format and store a numpy cache to speed up loading.
    * "dummy" will initialize the weights with random values, which is mainly useful for profiling.

.. option:: --dtype {auto,half,float16,bfloat16,float,float32}

    Data type for model weights and activations.

    * "auto" will use FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 models.
    * "half" for FP16. Recommended for AWQ quantization.
    * "float16" is the same as "half".
    * "bfloat16" for a balance between precision and range.
    * "float" is shorthand for FP32 precision.
    * "float32" for FP32 precision.

.. option:: --max-model-len <length>

    Model context length. If unspecified, it is derived automatically from the model config.

.. option:: --worker-use-ray

    Use Ray for distributed serving; this is set automatically when more than one GPU is used.

.. option:: --pipeline-parallel-size (-pp) <size>

    Number of pipeline stages.

.. option:: --tensor-parallel-size (-tp) <size>

    Number of tensor parallel replicas.
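
    For example, a sketch of sharding a model across four GPUs (equivalent to ``-tp 4``; it assumes a host with at least four GPUs):

    .. code-block:: python

        from vllm import LLM

        # Split each layer's weight matrices across 4 GPUs (tensor parallelism).
        llm = LLM(model="facebook/opt-13b", tensor_parallel_size=4)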

.. option:: --max-parallel-loading-workers <workers>

    Load the model sequentially in multiple batches, to avoid CPU RAM OOM when using tensor parallelism with large models.

.. option:: --block-size {8,16,32}

    Token block size for contiguous chunks of tokens.

.. option:: --seed <seed>

    Random seed for operations.

.. option:: --swap-space <size>

    CPU swap space size (GiB) per GPU.

.. option:: --gpu-memory-utilization <fraction>

    The fraction of GPU memory to be used for the model executor, as a value between 0 and 1 (e.g. 0.9 means 90%).
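
    As a sketch, capping the engine so co-located processes keep some GPU memory (equivalent to ``--gpu-memory-utilization 0.8``):

    .. code-block:: python

        from vllm import LLM

        # Use at most 80% of GPU memory for weights, activations, and KV cache.
        llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.8)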

.. option:: --max-num-batched-tokens <tokens>

    Maximum number of batched tokens per iteration.

.. option:: --max-num-seqs <sequences>

    Maximum number of sequences per iteration.

.. option:: --max-paddings <paddings>

    Maximum number of paddings in a batch.

.. option:: --disable-log-stats

    Disable logging statistics.

.. option:: --quantization (-q) {awq,squeezellm,None}

    Method used to quantize the weights.
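
    For example, a hedged sketch of serving an AWQ-quantized checkpoint (the model id is illustrative and must already contain AWQ-quantized weights):

    .. code-block:: python

        from vllm import LLM

        # Pair --quantization awq with FP16, per the --dtype recommendation above.
        llm = LLM(model="TheBloke/Llama-2-7B-AWQ", quantization="awq", dtype="half")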
vllm/engine/arg_utils.py
...
@@ -42,6 +42,10 @@ class EngineArgs:
     def add_cli_args(
             parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
         """Shared CLI arguments for vLLM engine."""
+
+        # NOTE: If you update any of the arguments below, please also
+        # make sure to update docs/source/models/engine_args.rst
+
         # Model arguments
         parser.add_argument(
             '--model',
...
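
For context, a short sketch of how this shared helper is typically consumed. EngineArgs.add_cli_args and EngineArgs.from_cli_args are existing vLLM APIs, but the wiring below is a minimal illustration under that assumption, not vLLM's actual entrypoint code:

    import argparse

    from vllm.engine.arg_utils import EngineArgs

    # Let the engine register its shared CLI flags on an ordinary parser.
    parser = argparse.ArgumentParser(description="vLLM engine args demo")
    parser = EngineArgs.add_cli_args(parser)

    # Parse a sample command line and rebuild an EngineArgs instance from it.
    args = parser.parse_args(["--model", "facebook/opt-125m", "--dtype", "half"])
    engine_args = EngineArgs.from_cli_args(args)
    print(engine_args.model, engine_args.dtype)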