[Docs] add the parallel sampling usage in LLMEngine and AsyncLLM (#24222)

c9ff9e6f · William Song · GitHub · eaffe448 · c9ff9e6f
Unverified commit c9ff9e6f, authored Sep 18, 2025 by William Song; committed by GitHub on Sep 18, 2025.
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 7 additions and 1 deletion

vllm/sampling_params.py (+7 additions, -1 deletion)

No files found.
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -81,7 +81,13 @@ class SamplingParams(
    """

    n: int = 1
-    """Number of output sequences to return for the given prompt."""
+    """Number of outputs to return for the given prompt request.
+
+    NOTE:
+        `AsyncLLM` streams outputs by default. When `n > 1`, all `n` outputs
+        are generated and streamed cumulatively per request. To see all `n`
+        outputs upon completion, use `output_kind=RequestOutputKind.FINAL_ONLY`
+        in `SamplingParams`."""
    best_of: Optional[int] = None
    """Number of output sequences that are generated from the prompt. From
    these `best_of` sequences, the top `n` sequences are returned. `best_of`