backend_guidance.py 8.57 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
5
from __future__ import annotations

6
7
import copy
import json
8
9
import os
from dataclasses import dataclass
10
from typing import TYPE_CHECKING, Any, Union
11
12
13
14

import torch

from vllm.logger import init_logger
15
from vllm.sampling_params import SamplingParams
16
from vllm.utils import LazyLoader
17
18
19
20
21
from vllm.v1.structured_output.backend_types import (
    StructuredOutputBackend,
    StructuredOutputGrammar,
    StructuredOutputOptions,
)
22
23
24
25
26
27
28
29
30
from vllm.v1.structured_output.request import get_structured_output_key

if TYPE_CHECKING:
    import llguidance
    import llguidance.hf as llguidance_hf
    import llguidance.torch as llguidance_torch
else:
    llguidance = LazyLoader("llguidance", globals(), "llguidance")
    llguidance_hf = LazyLoader("llguidance.hf", globals(), "llguidance.hf")
31
    llguidance_torch = LazyLoader("llguidance.torch", globals(), "llguidance.torch")
32
33
34
35

logger = init_logger(__name__)


36
37
38
39
def _walk_json_for_additional_properties(data: object):
    if isinstance(data, dict):
        for value in data.values():
            _walk_json_for_additional_properties(value)
40
41
42
43
        if "additionalProperties" not in data and (
            "properties" in data or "patternProperties" in data
        ):
            data["additionalProperties"] = False
44
45
46
47
48
49
    elif isinstance(data, list):
        for item in data:
            _walk_json_for_additional_properties(item)


def process_for_additional_properties(
    guide_json: Union[str, dict[str, Any]],
) -> dict[str, Any]:
    """Return ``guide_json`` as an object with additional properties disabled.

    A string input is parsed as JSON; any other input is deep-copied so the
    caller's object is never modified. The resulting object is then rewritten
    in place by ``_walk_json_for_additional_properties`` and returned.
    """
    schema = (
        json.loads(guide_json)
        if isinstance(guide_json, str)
        else copy.deepcopy(guide_json)
    )
    _walk_json_for_additional_properties(schema)
    return schema


61
@dataclass
class GuidanceBackend(StructuredOutputBackend):
    """Structured-output backend that compiles grammars with llguidance."""

    def __post_init__(self):
        """Cache the structured-output flags and build the llguidance tokenizer."""
        so_config = self.vllm_config.structured_outputs_config
        self.disable_any_whitespace = so_config.disable_any_whitespace
        self.disable_additional_properties = so_config.disable_additional_properties

        self.ll_tokenizer = llguidance_hf.from_tokenizer(
            self.tokenizer, self.vocab_size
        )

    def compile_grammar(
        self, request_type: StructuredOutputOptions, grammar_spec: str
    ) -> StructuredOutputGrammar:
        """Serialize ``grammar_spec`` and wrap a new matcher in a grammar object.

        The serialized grammar is also stored on ``self.serialized_grammar``.
        The matcher's log level is read from the ``LLGUIDANCE_LOG_LEVEL``
        environment variable (default ``"1"``).
        """
        self.serialized_grammar = serialize_guidance_grammar(
            request_type,
            grammar_spec,
            self.disable_any_whitespace,
            self.disable_additional_properties,
        )

        matcher = llguidance.LLMatcher(
            self.ll_tokenizer,
            self.serialized_grammar,
            log_level=int(os.environ.get("LLGUIDANCE_LOG_LEVEL", "1")),
        )
        grammar = GuidanceGrammar(
            ll_matcher=matcher,
            ll_tokenizer=self.ll_tokenizer,
            vocab_size=self.vocab_size,
        )

        # Surface any error the freshly built matcher already reports.
        grammar.check_error()
        return grammar

    def allocate_token_bitmask(self, max_num_seqs: int):
        """Allocate a token bitmask for ``max_num_seqs`` sequences.

        The mask width is taken from the llguidance tokenizer's vocab size.
        """
        return llguidance_torch.allocate_token_bitmask(
            max_num_seqs, self.ll_tokenizer.vocab_size
        )

    def destroy(self):
        """No-op: this backend has nothing to release."""
        pass

108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149

@dataclass
class GuidanceGrammar(StructuredOutputGrammar):
    """Grammar state for one request, backed by an llguidance ``LLMatcher``.

    Matcher errors are logged as a warning at most once per instance (see
    ``check_error``).
    """

    ll_matcher: llguidance.LLMatcher
    ll_tokenizer: llguidance.LLTokenizer
    vocab_size: int
    # Set once a matcher error has been logged, so it is not logged again.
    printed_error: bool = False
    # Set once the tokenizer's EOS token appears among accepted tokens.
    terminated: bool = False

    def check_error(self):
        """Log the matcher's current error, if any, the first time it is seen."""
        if not self.printed_error:
            err = self.ll_matcher.get_error()
            if err:
                self.printed_error = True
                logger.warning("LLMatcher error: %s", err)

    def accept_tokens(self, request_id: str, tokens: list[int]) -> bool:
        """Accepts a list of tokens and advances the parser.

        Returns True if the parser was advanced successfully.
        Returns False if the parser failed to advance.
        """

        # Record termination before the stopped check below, so EOS is
        # noticed even when the matcher has already stopped.
        if self.ll_tokenizer.eos_token in tokens:
            self.terminated = True

        if self.ll_matcher.is_stopped():
            return True

        # TODO - Add jump decoding support in the future:
        # self.ll_matcher.compute_ff_bytes() - this should always work
        # self.ll_matcher.compute_ff_tokens() - this only works for
        #   "canonical" tokenizers
        # For conversion between the two, see
        # https://github.com/guidance-ai/llguidance/blob/main/docs/fast_forward.md

        r = self.ll_matcher.consume_tokens(tokens)

        self.check_error()

        return r

    def validate_tokens(self, tokens: list[int]) -> list[int]:
        """Checks if the list of tokens are accepted by the parser in sequence.
        Will not advance the parser.

        Returns the prefix list of tokens that are accepted by the parser.
        """
        # Nothing to validate, or the matcher accepts nothing further.
        if not tokens or self.ll_matcher.is_stopped():
            return []

        num_tokens = self.ll_matcher.validate_tokens(tokens)

        self.check_error()

        return tokens[:num_tokens]

    def rollback(self, num_tokens: int) -> None:
        """Roll the matcher back by ``num_tokens`` tokens."""
        self.ll_matcher.rollback(num_tokens)
        self.check_error()

    def fill_bitmask(self, bitmask: torch.Tensor, idx: int) -> None:
        """Fill entry ``idx`` of ``bitmask`` with the next-token mask."""
        # this will automatically return [EOS] mask if the matcher is stopped
        # or otherwise in an error state
        llguidance_torch.fill_next_token_bitmask(self.ll_matcher, bitmask, idx)
        self.check_error()

    def is_terminated(self) -> bool:
        """Return True once EOS has been seen by ``accept_tokens``."""
        return self.terminated

    def reset(self):
        """Reset the matcher and clear the termination flag."""
        # This method may be not needed anymore? TODO
        self.ll_matcher.reset()
        # A reset matcher has consumed no tokens, so a previously seen EOS
        # must not keep reporting this grammar as terminated.
        self.terminated = False


185
186
187
188
def serialize_guidance_grammar(
    request_type: StructuredOutputOptions,
    grammar_spec: Union[str, dict[str, Any]],
    disable_any_whitespace: bool = False,
    disable_additional_properties: bool = False,
) -> str:
    """Turn a structured-output request into an llguidance grammar.

    Handling by ``request_type``:

    * ``JSON``: ``grammar_spec`` is a JSON schema.
    * ``JSON_OBJECT``: the fixed schema ``{"type": "object"}`` is used.
    * ``REGEX`` / ``GRAMMAR`` / ``CHOICE``: ``grammar_spec`` is passed to
      ``llguidance.grammar_from`` with the matching kind name.
    * ``STRUCTURAL_TAG``: ``grammar_spec`` (JSON string or dict) must provide
      ``"triggers"`` and ``"structures"``; each structure provides
      ``"begin"``, ``"schema"`` and ``"end"``.

    Args:
        request_type: Kind of structured output requested.
        grammar_spec: The specification, interpreted per ``request_type``.
        disable_any_whitespace: When True, JSON grammars are built with
            ``whitespace_flexible`` set to False.
        disable_additional_properties: When True, JSON schemas are first
            rewritten by ``process_for_additional_properties``.

    Raises:
        ValueError: If a structure's ``begin`` starts with none of the
            triggers, if no structures are given, or if ``request_type`` is
            not one of the supported kinds.
    """
    whitespace_flexible = not disable_any_whitespace

    def _schema_to_grammar(schema: Union[str, dict[str, Any]]) -> str:
        # Optionally tighten the schema before handing it to llguidance.
        if disable_additional_properties:
            schema = process_for_additional_properties(schema)
        return llguidance.LLMatcher.grammar_from_json_schema(
            schema,
            defaults={"whitespace_flexible": whitespace_flexible},
        )

    if request_type == StructuredOutputOptions.JSON:
        return _schema_to_grammar(grammar_spec)

    if request_type == StructuredOutputOptions.JSON_OBJECT:
        return llguidance.LLMatcher.grammar_from_json_schema(
            '{"type": "object"}',
            defaults={"whitespace_flexible": whitespace_flexible},
        )

    # Kinds that map directly onto an llguidance grammar type name.
    direct_kinds = (
        (StructuredOutputOptions.REGEX, "regex"),
        (StructuredOutputOptions.GRAMMAR, "grammar"),
        (StructuredOutputOptions.CHOICE, "choice"),
    )
    for option, kind in direct_kinds:
        if request_type == option:
            return llguidance.grammar_from(kind, grammar_spec)

    if request_type == StructuredOutputOptions.STRUCTURAL_TAG:
        if isinstance(grammar_spec, str):
            tag_spec = json.loads(grammar_spec)
        else:
            tag_spec = grammar_spec
        triggers: list[str] = tag_spec["triggers"]
        tags: list[llguidance.StructTag] = []
        for structure in tag_spec["structures"]:
            begin: str = structure["begin"]
            # Use the first trigger that is a prefix of this structure's begin.
            for trigger in triggers:
                if begin.startswith(trigger):
                    break
            else:
                raise ValueError(
                    f"Trigger {begin} not found in triggers {triggers}"
                )
            tags.append(
                llguidance.StructTag(
                    trigger=trigger,
                    begin=begin,
                    grammar=_schema_to_grammar(structure["schema"]),
                    end=structure["end"],
                )
            )
        if not tags:
            raise ValueError("No structural tags found in the grammar spec.")
        return llguidance.StructTag.to_grammar(tags)

    logger.error(
        "Validation should have already occurred. Please file an issue."
    )
    raise ValueError(
        f"grammar is not of valid supported types. ({request_type!s})"
    )


def validate_guidance_grammar(
    sampling_params: SamplingParams, tokenizer: llguidance.LLTokenizer | None = None
) -> None:
    """Check that the structured-output request in ``sampling_params`` is valid.

    The request is serialized with ``serialize_guidance_grammar`` using its
    default options, then passed to ``llguidance.LLMatcher.validate_grammar``
    together with ``tokenizer``.

    Raises:
        ValueError: If llguidance reports an error for the grammar.
    """
    request_type, grammar_spec = get_structured_output_key(sampling_params)
    serialized = serialize_guidance_grammar(request_type, grammar_spec)
    problem = llguidance.LLMatcher.validate_grammar(serialized, tokenizer)
    if problem:
        raise ValueError(f"Grammar error: {problem}")