[Frontend] Make beam search emulator temperature modifiable (#8928)

Co-authored-by: Eduard Balzin <nfunctor@yahoo.fr>

[Frontend] Make beam search emulator temperature modifiable (#8928)
Co-authored-by: Eduard Balzin <nfunctor@yahoo.fr>
090e945e · Edouard B. · GitHub · e1a3f5e8 · 090e945e
Unverified commit 090e945e, authored Sep 28, 2024 by Edouard B.; committed by GitHub on Sep 28, 2024
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 3 additions and 1 deletion

vllm/entrypoints/llm.py vllm/entrypoints/llm.py +3 -1

No files found.
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -396,6 +396,7 @@ class LLM:
        beam_width: int,
        max_tokens: int,
        ignore_eos: bool = False,
+        temperature: float = 0.0,
    ) -> List[BeamSearchOutput]:
        """
        Generate sequences using beam search.
@@ -405,6 +406,7 @@ class LLM:
                of token IDs.
            beam_width: The number of beams to keep at each step.
            max_tokens: The max number of tokens to generate for each prompt.
+            temperature: The temperature to use for generation.
        
        TODO: how does beam search work together with length penalty, frequency
        penalty, and stopping criteria, etc.?
@@ -416,7 +418,7 @@ class LLM:
        # at https://github.com/huggingface/transformers/blob/e15687fffe5c9d20598a19aeab721ae0a7580f8a/src/transformers/generation/beam_search.py#L534 # noqa
        beam_search_params = SamplingParams(logprobs=2 * beam_width,
                                            max_tokens=1,
-                                            temperature=0.0)
+                                            temperature=temperature)
        instances: List[BeamSearchInstance] = []

        for prompt in prompts: