beam_search.py 3.14 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
from dataclasses import dataclass
5

6
from vllm.inputs import TokenInputs, token_inputs
7
from vllm.logprobs import Logprob
8
from vllm.lora.request import LoRARequest
9
from vllm.multimodal.inputs import MultiModalInputs, mm_inputs
10

11
12
13
14
15
16
17
18

@dataclass
class BeamSearchSequence:
    """One candidate sequence tracked during beam search.

    Holds the sequence's tokens together with its log probability
    information. ``text`` is optional and is only populated when the
    sequence is about to be handed back to the user.
    """

    # The prompt this beam was started from; `get_prompt` reads its
    # "type", "prompt", "cache_salt" and (for non-token prompts) the
    # multimodal entries.
    orig_prompt: TokenInputs | MultiModalInputs

    # The tokens include the prompt.
    tokens: list[int]
    logprobs: list[dict[int, Logprob]]
    lora_request: LoRARequest | None = None
    cum_logprob: float = 0.0
    text: str | None = None
    finish_reason: str | None = None
    stop_reason: int | str | None = None

    def get_prompt(self):
        """Build engine inputs for this beam from its current tokens.

        The prompt text and cache salt are carried over from
        ``orig_prompt``. A prompt whose ``"type"`` is ``"token"`` yields
        ``token_inputs(...)``; any other prompt yields ``mm_inputs(...)``
        with the original multimodal kwargs, hashes and placeholders.
        """
        source = self.orig_prompt

        # Arguments common to both kinds of inputs.
        shared = {
            "prompt": source.get("prompt"),
            "cache_salt": source.get("cache_salt"),
        }

        if source["type"] != "token":
            return mm_inputs(
                prompt_token_ids=self.tokens,
                mm_kwargs=source["mm_kwargs"],
                mm_hashes=source["mm_hashes"],
                mm_placeholders=source["mm_placeholders"],
                **shared,
            )

        return token_inputs(self.tokens, **shared)
52
53
54
55
56
57
58
59


@dataclass
class BeamSearchOutput:
    """Result of a beam search.

    Wraps the list of best beam search sequences; the list has as many
    entries as the beam width.
    """

    # Best sequences found by the search.
    sequences: list[BeamSearchSequence]
62
63
64


class BeamSearchInstance:
    """Beam search state for a single prompt.

    ``beams`` starts with one sequence seeded from the prompt's token
    ids, and ``completed`` starts empty.
    """

    def __init__(
        self,
        prompt: TokenInputs | MultiModalInputs,
        lora_request: LoRARequest | None = None,
        logprobs: list[dict[int, Logprob]] | None = None,
        **kwargs,
    ):
        """Create the initial beam for ``prompt``.

        Args:
            prompt: Inputs providing ``"prompt_token_ids"``; also stored
                on the seed beam as its original prompt.
            lora_request: Optional LoRA request attached to the seed beam.
            logprobs: Optional initial logprobs; a shallow copy is
                stored, or an empty list when omitted.
            **kwargs: Extra fields forwarded to ``BeamSearchSequence``.
        """
        seed_tokens = prompt["prompt_token_ids"]
        # Shallow-copy so the seed beam does not share the caller's list.
        seed_logprobs = list(logprobs) if logprobs is not None else []

        seed_beam = BeamSearchSequence(
            orig_prompt=prompt,
            tokens=seed_tokens,
            logprobs=seed_logprobs,
            lora_request=lora_request,
            **kwargs,
        )

        self.beams: list[BeamSearchSequence] = [seed_beam]
        self.completed: list[BeamSearchSequence] = []
82
83
84


def get_beam_search_score(
    tokens: list[int],
    cumulative_logprob: float,
    eos_token_id: int,
    length_penalty: float = 1.0,
) -> float:
    """Calculate the beam search score with length penalty.

    The score is ``cumulative_logprob / effective_len ** length_penalty``,
    where ``effective_len`` is ``len(tokens)`` minus one when the final
    token is ``eos_token_id``, and is never taken to be less than 1.

    Adapted from

    https://github.com/huggingface/transformers/blob/ccb92be23def445f2afdea94c31286f84b89eb5b/src/transformers/generation/beam_search.py#L938

    Args:
        tokens: Token ids of the sequence being scored.
        cumulative_logprob: Cumulative log probability of the sequence.
        eos_token_id: Token id that, when last, is excluded from the
            length.
        length_penalty: Exponent applied to the effective length.

    Returns:
        The length-normalised score.
    """
    seq_len = len(tokens)
    # Check for emptiness first so `tokens[-1]` cannot raise IndexError.
    if tokens and tokens[-1] == eos_token_id:
        seq_len -= 1

    # An empty or EOS-only sequence has effective length 0, which would
    # raise ZeroDivisionError for any non-zero penalty. Treat it as
    # length 1 so the score is the raw cumulative logprob.
    seq_len = max(seq_len, 1)

    return cumulative_logprob / (seq_len**length_penalty)


def create_sort_beams_key_function(eos_token_id: int, length_penalty: float):
    """Return a key function that scores a beam for sorting.

    The returned callable takes a ``BeamSearchSequence`` and returns
    ``get_beam_search_score`` of its tokens and cumulative logprob,
    using the ``eos_token_id`` and ``length_penalty`` given here.
    """

    def sort_beams_key(x: BeamSearchSequence) -> float:
        # The EOS id and penalty come from the enclosing call, so the
        # key only needs the beam itself.
        score = get_beam_search_score(
            tokens=x.tokens,
            cumulative_logprob=x.cum_logprob,
            eos_token_id=eos_token_id,
            length_penalty=length_penalty,
        )
        return score

    return sort_beams_key