Merge remote-tracking branch 'mirror/v0.8.2' into v0.8.2-ori

31f6b24f · zhuwenwen · 89d1dd57 · 25f560a6 · 31f6b24f · 31f6b24f
Commit 31f6b24f authored Mar 26, 2025 by zhuwenwen
8 changed files
--- a/vllm/v1/structured_output/__init__.py
+++ b/vllm/v1/structured_output/__init__.py
@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Optional
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
+from vllm.v1.structured_output.backend_guidance import GuidanceBackend
 from vllm.v1.structured_output.backend_types import (StructuredOutputBackend,
                                                     StructuredOutputGrammar)
@@ -50,6 +51,8 @@ class StructuredOutputManager:
                    XgrammarBackend)
                self.backend = XgrammarBackend(self.vllm_config)
+            elif backend_name == "guidance":
+                self.backend = GuidanceBackend(self.vllm_config)
            else:
                raise ValueError(
                    f"Unsupported structured output backend: {backend_name}")

--- a/vllm/v1/structured_output/backend_guidance.py
+++ b/vllm/v1/structured_output/backend_guidance.py
+# SPDX-License-Identifier: Apache-2.0
+import os
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Optional
+import torch
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+from vllm.sampling_params import SamplingParams
+from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
+from vllm.utils import LazyLoader
+from vllm.v1.structured_output.backend_types import (StructuredOutputBackend,
+                                                     StructuredOutputGrammar,
+                                                     StructuredOutputOptions)
+from vllm.v1.structured_output.request import get_structured_output_key
+if TYPE_CHECKING:
+    import llguidance
+    import llguidance.hf as llguidance_hf
+    import llguidance.torch as llguidance_torch
+else:
+    llguidance = LazyLoader("llguidance", globals(), "llguidance")
+    llguidance_hf = LazyLoader("llguidance.hf", globals(), "llguidance.hf")
+    llguidance_torch = LazyLoader("llguidance.torch", globals(),
+                                  "llguidance.torch")
+logger = init_logger(__name__)
+class GuidanceBackend(StructuredOutputBackend):
+    """Structured-output backend that compiles grammars with llguidance.
+
+    An llguidance tokenizer is built once in ``__init__`` and reused for
+    every grammar compiled through this backend.
+    """
+    def __init__(self, vllm_config: VllmConfig):
+        """Build the llguidance tokenizer from the engine configuration.
+
+        Args:
+            vllm_config: Engine configuration. Its model, scheduler, parallel
+                and LoRA sub-configs are passed to
+                ``init_tokenizer_from_configs``.
+        """
+        self.vllm_config = vllm_config
+        tokenizer_group = init_tokenizer_from_configs(
+            model_config=vllm_config.model_config,
+            scheduler_config=vllm_config.scheduler_config,
+            parallel_config=vllm_config.parallel_config,
+            lora_config=vllm_config.lora_config)  # type: ignore[arg-type]
+        tokenizer_group.ping()
+        self.vocab_size = vllm_config.model_config.get_vocab_size()
+        # Passing None selects the tokenizer without a LoRA request.
+        tokenizer = tokenizer_group.get_lora_tokenizer(None)
+        self.ll_tokenizer = llguidance_hf.from_tokenizer(tokenizer, None)
+    def compile_grammar(self, request_type: StructuredOutputOptions,
+                        grammar_spec: str) -> StructuredOutputGrammar:
+        """Compile a structured-output spec into a ``GuidanceGrammar``.
+
+        Args:
+            request_type: Kind of constraint described by ``grammar_spec``.
+            grammar_spec: The constraint itself, as a string.
+
+        Returns:
+            A ``GuidanceGrammar`` wrapping a fresh ``LLMatcher``. A matcher
+            error is logged as a warning by ``check_error`` rather than
+            raised here.
+
+        Raises:
+            ValueError: If ``request_type`` is not a supported option
+                (raised by ``serialize_guidance_grammar``).
+        """
+        # Build the matcher from a local value. Reading the grammar back from
+        # the instance attribute would let an overlapping compile_grammar()
+        # call on this shared backend substitute another request's grammar.
+        serialized_grammar = serialize_guidance_grammar(
+            request_type, grammar_spec)
+        # Still published on the instance for any code that reads it.
+        self.serialized_grammar = serialized_grammar
+        ll_matcher = llguidance.LLMatcher(
+            self.ll_tokenizer,
+            serialized_grammar,
+            log_level=int(os.environ.get("LLGUIDANCE_LOG_LEVEL", "1")),
+        )
+        r = GuidanceGrammar(
+            ll_matcher=ll_matcher,
+            ll_tokenizer=self.ll_tokenizer,
+            vocab_size=self.vocab_size,
+        )
+        r.check_error()
+        return r
+    def allocate_token_bitmask(self, max_num_seqs: int):
+        """Allocate a token bitmask for ``max_num_seqs`` sequences.
+
+        The mask is sized with the llguidance tokenizer's ``vocab_size``
+        (not ``self.vocab_size`` from the model config).
+        """
+        return llguidance_torch.allocate_token_bitmask(
+            max_num_seqs, self.ll_tokenizer.vocab_size)
+@dataclass
+class GuidanceGrammar(StructuredOutputGrammar):
+    """Per-request grammar state backed by an llguidance ``LLMatcher``."""
+    # Matcher that is advanced with the tokens accepted for this request.
+    ll_matcher: llguidance.LLMatcher
+    # Tokenizer wrapper; read here only for its EOS token id.
+    ll_tokenizer: llguidance.LLTokenizer
+    vocab_size: int
+    # True once a matcher error has been logged, so it is logged only once.
+    printed_error: bool = False
+    # True once an EOS token has been seen by accept_tokens().
+    terminated: bool = False
+    def check_error(self):
+        """Log the matcher's error, if any, the first time it is observed."""
+        if not self.printed_error:
+            err = self.ll_matcher.get_error()
+            if err:
+                self.printed_error = True
+                logger.warning("LLMatcher error: %s", err)
+    def accept_tokens(self, request_id: str, tokens: list[int]) -> bool:
+        """Accepts a list of tokens and advances the parser.
+
+        Marks the grammar as terminated when the EOS token is among
+        ``tokens``. If the matcher has already stopped, the tokens are not
+        consumed and True is returned.
+
+        Returns True if the parser was advanced successfully.
+        Returns False if the parser failed to advance.
+        """
+        if self.ll_tokenizer.eos_token in tokens:
+            self.terminated = True
+        if self.ll_matcher.is_stopped():
+            return True
+        # TODO - Add jump decoding support in the future:
+        # self.ll_matcher.compute_ff_bytes() - this should always work
+        # self.ll_matcher.compute_ff_tokens() - this only works for
+        #   "canonical" tokenizers
+        # For conversion between the two, see
+        # https://github.com/guidance-ai/llguidance/blob/main/docs/fast_forward.md
+        r = self.ll_matcher.consume_tokens(tokens)
+        self.check_error()
+        return r
+    def fill_bitmask(self, bitmask: torch.Tensor, idx: int) -> None:
+        """Write the next-token mask for this grammar into ``bitmask[idx]``."""
+        # this will automatically return [EOS] mask if the matcher is stopped
+        # or otherwise in an error state
+        llguidance_torch.fill_next_token_bitmask(self.ll_matcher, bitmask, idx)
+        self.check_error()
+    def is_terminated(self) -> bool:
+        """Return True once an EOS token has been accepted."""
+        return self.terminated
+    def reset(self):
+        """Rewind the matcher and clear the terminated flag."""
+        # TODO: this method may no longer be needed.
+        self.ll_matcher.reset()
+        # Clear the EOS flag as well; otherwise is_terminated() would keep
+        # reporting True for a matcher that has just been rewound.
+        self.terminated = False
+def serialize_guidance_grammar(request_type: StructuredOutputOptions,
+                               grammar_spec: str) -> str:
+    """Turn a structured-output request into an llguidance grammar string.
+
+    Args:
+        request_type: Which kind of constraint ``grammar_spec`` describes.
+        grammar_spec: The constraint text. Ignored for ``JSON_OBJECT``,
+            which always uses a generic object schema.
+
+    Returns:
+        The grammar produced by llguidance for the request.
+
+    Raises:
+        ValueError: If ``request_type`` is none of the handled options.
+    """
+    wants_schema = request_type == StructuredOutputOptions.JSON
+    if wants_schema or request_type == StructuredOutputOptions.JSON_OBJECT:
+        schema = grammar_spec if wants_schema else '{"type": "object"}'
+        # TODO: make whitespace_flexible configurable
+        return llguidance.LLMatcher.grammar_from_json_schema(
+            schema, defaults={
+                "whitespace_flexible": True,
+            })
+    # Remaining options map one-to-one onto llguidance grammar kinds.
+    kind_by_option = (
+        (StructuredOutputOptions.REGEX, "regex"),
+        (StructuredOutputOptions.GRAMMAR, "grammar"),
+        (StructuredOutputOptions.CHOICE, "choice"),
+    )
+    for option, kind in kind_by_option:
+        if request_type == option:
+            return llguidance.grammar_from(kind, grammar_spec)
+    logger.error("Validation should have already occurred. "
+                 "Please file an issue.")
+    raise ValueError("grammar is not of valid supported types. "
+                     f"({request_type!s})")
+def validate_guidance_grammar(
+        sampling_params: SamplingParams,
+        tokenizer: Optional[llguidance.LLTokenizer] = None) -> None:
+    """Check that the request's grammar is accepted by llguidance.
+
+    Args:
+        sampling_params: Sampling parameters carrying the guided-decoding
+            settings to validate.
+        tokenizer: Optional llguidance tokenizer forwarded to the validator.
+
+    Raises:
+        ValueError: If llguidance reports an error for the grammar.
+    """
+    request_type, grammar_spec = get_structured_output_key(sampling_params)
+    serialized = serialize_guidance_grammar(request_type, grammar_spec)
+    problem = llguidance.LLMatcher.validate_grammar(serialized,
+                                                    tokenizer=tokenizer)
+    if not problem:
+        return
+    raise ValueError(f"Grammar error: {problem}")
--- a/vllm/v1/structured_output/request.py
+++ b/vllm/v1/structured_output/request.py
@@ -53,25 +53,30 @@ class StructuredOutputRequest:
    @functools.cached_property
    def structured_output_key(self) -> StructuredOutputKey:
+        """Key for this request, computed from its sampling params."""
-        params = self.sampling_params.guided_decoding
+        return get_structured_output_key(self.sampling_params)
-        assert params is not None, "params can't be None."
-        if params.json is not None:
-            if not isinstance(params.json, str):
+def get_structured_output_key(
-                json_str = json.dumps(params.json)
+        sampling_params: SamplingParams) -> StructuredOutputKey:
+    """Build the (option, spec) key from ``sampling_params.guided_decoding``.
+
+    The first populated field wins, checked in this order: ``json``,
+    ``json_object``, ``regex``, ``choice``, ``grammar``. Non-string ``json``
+    and ``choice`` values are serialized with ``json.dumps``; ``json_object``
+    yields an empty spec string.
+
+    Raises:
+        AssertionError: If ``guided_decoding`` is None.
+        ValueError: If none of the fields above is set.
+    """
-            else:
+    params = sampling_params.guided_decoding
-                json_str = params.json
+    assert params is not None, "params can't be None."
-            return (StructuredOutputOptions.JSON, json_str)
+    if params.json is not None:
-        elif params.json_object:
+        if not isinstance(params.json, str):
-            return (StructuredOutputOptions.JSON_OBJECT, "")
+            json_str = json.dumps(params.json)
-        elif params.regex is not None:
+        else:
-            return (StructuredOutputOptions.REGEX, params.regex)
+            json_str = params.json
-        elif params.choice is not None:
+        return (StructuredOutputOptions.JSON, json_str)
-            if not isinstance(params.choice, str):
+    elif params.json_object:
-                json_str = json.dumps(params.choice)
+        return (StructuredOutputOptions.JSON_OBJECT, "")
-            else:
+    elif params.regex is not None:
-                json_str = params.choice
+        return (StructuredOutputOptions.REGEX, params.regex)
-            return (StructuredOutputOptions.CHOICE, json_str)
+    elif params.choice is not None:
-        elif params.grammar is not None:
+        if not isinstance(params.choice, str):
-            return (StructuredOutputOptions.GRAMMAR, params.grammar)
+            json_str = json.dumps(params.choice)
        else:
-            raise ValueError("No valid structured output parameter found")
+            json_str = params.choice
+        return (StructuredOutputOptions.CHOICE, json_str)
+    elif params.grammar is not None:
+        return (StructuredOutputOptions.GRAMMAR, params.grammar)
+    else:
+        raise ValueError("No valid structured output parameter found")
--- a/vllm/v1/structured_output/utils.py
+++ b/vllm/v1/structured_output/utils.py
@@ -239,7 +239,7 @@ def choice_as_grammar(choice: list[str]) -> str:
    return grammar
-def validate_structured_output_request(
+def validate_structured_output_request_xgrammar(
        sampling_params: SamplingParams) -> None:
    """Validate that the request is supported by structured output.

--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -11,6 +11,7 @@ from vllm.lora.request import LoRARequest
 from vllm.multimodal import MultiModalKwargs
 from vllm.sampling_params import SamplingParams, SamplingType
 from vllm.utils import swap_dict_values
+from vllm.v1.outputs import LogprobsTensors
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.utils import copy_slice
 from vllm.v1.worker.block_table import BlockTable
@@ -197,6 +198,9 @@ class InputBatch:
        # that are currently in the prefill phase.
        self.num_prompt_logprobs: dict[str, int] = {}
+        # To accumulate prompt logprobs tensor chunks across prefill steps.
+        self.in_progress_prompt_logprobs_cpu: dict[str, LogprobsTensors] = {}
        self.logit_bias: list[Optional[dict[int,
                                            float]]] = [None] * max_num_reqs
        self.has_allowed_token_ids: set[str] = set()
@@ -362,6 +366,7 @@ class InputBatch:
        self.generators.pop(req_index, None)
        self.num_logprobs.pop(req_id, None)
        self.num_prompt_logprobs.pop(req_id, None)
+        self.in_progress_prompt_logprobs_cpu.pop(req_id, None)
        # LoRA
        lora_id = self.request_lora_mapping[req_index]

--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1059,7 +1059,10 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                sampling_metadata=sampling_metadata,
            )
        else:
-            # TODO(woosuk): Optimize the memory usage.
+            # When indexing with a tensor (bonus_logits_indices), PyTorch
+            # creates a new tensor with separate storage from the original
+            # logits tensor. This means any in-place operations on bonus_logits
+            # won't affect the original logits tensor.
            bonus_logits = logits[spec_decode_metadata.bonus_logits_indices]
            sampler_output = self.model.sample(
                logits=bonus_logits,
@@ -1067,7 +1070,9 @@ class GPUModelRunner(LoRAModelRunnerMixin):
            )
            bonus_token_ids = sampler_output.sampled_token_ids
-            # TODO(woosuk): Optimize the memory usage.
+            # Just like `bonus_logits`, `target_logits` is a new tensor with
+            # separate storage from the original `logits` tensor. Therefore,
+            # it is safe to update `target_logits` in place.
            target_logits = logits[spec_decode_metadata.target_logits_indices]
            output_token_ids = self.rejection_sampler(
                spec_decode_metadata,
@@ -1191,6 +1196,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
        if not num_prompt_logprobs_dict:
            return {}
+        in_progress_dict = self.input_batch.in_progress_prompt_logprobs_cpu
        prompt_logprobs_dict: dict[str, Optional[LogprobsTensors]] = {}
        # Since prompt logprobs are a rare feature, prioritize simple,
@@ -1206,16 +1212,36 @@ class GPUModelRunner(LoRAModelRunnerMixin):
            prompt_token_ids = torch.tensor(request.prompt_token_ids).to(
                self.device, non_blocking=True)
+            # Set up target LogprobsTensors object.
+            logprobs_tensors = in_progress_dict.get(req_id)
+            if not logprobs_tensors:
+                # Create empty logprobs CPU tensors for the entire prompt.
+                # If chunked, we'll copy in slice by slice.
+                logprobs_tensors = LogprobsTensors.empty_cpu(
+                    num_prompt_tokens - 1, num_prompt_logprobs + 1)
+                in_progress_dict[req_id] = logprobs_tensors
            # Determine number of logits to retrieve.
-            start_tok = request.num_computed_tokens + 1
+            start_idx = request.num_computed_tokens
+            start_tok = start_idx + 1
            num_remaining_tokens = num_prompt_tokens - start_tok
-            if num_tokens < num_remaining_tokens:
+            if num_tokens <= num_remaining_tokens:
                # This is a chunk, more tokens remain.
+                # In the == case, there are no more prompt logprobs to produce
+                # but we want to defer returning them to the next step where we
+                # have new generated tokens to return.
                num_logits = num_tokens
            else:
                # This is the last chunk of prompt tokens to return.
                num_logits = num_remaining_tokens
                completed_prefill_reqs.append(req_id)
+                prompt_logprobs_dict[req_id] = logprobs_tensors
+            if num_logits <= 0:
+                # This can happen for the final chunk if we prefilled exactly
+                # (num_prompt_tokens - 1) tokens for this request in the prior
+                # step. There are no more prompt logprobs to produce.
+                continue
            # Get the logits corresponding to this req's prompt tokens.
            # If this is a partial request (i.e. chunked prefill),
@@ -1236,19 +1262,23 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                logprobs, num_prompt_logprobs, tgt_token_ids)
            # Transfer GPU->CPU async.
-            prompt_logprobs_dict[req_id] = LogprobsTensors(
+            chunk_slice = slice(start_idx, start_idx + num_logits)
-                token_ids.to("cpu", non_blocking=True),
+            logprobs_tensors.logprob_token_ids[chunk_slice].copy_(
-                logprobs.to("cpu", non_blocking=True),
+                token_ids, non_blocking=True)
-                ranks.to("cpu", non_blocking=True),
+            logprobs_tensors.logprobs[chunk_slice].copy_(logprobs,
-            )
+                                                         non_blocking=True)
+            logprobs_tensors.selected_token_ranks[chunk_slice].copy_(
+                ranks, non_blocking=True)
        # Remove requests that have completed prefill from the batch
        # num_prompt_logprobs_dict.
        for req_id in completed_prefill_reqs:
            del num_prompt_logprobs_dict[req_id]
+            del in_progress_dict[req_id]
        # Must synchronize the non-blocking GPU->CPU transfers.
-        torch.cuda.synchronize()
+        if prompt_logprobs_dict:
+            torch.cuda.synchronize()
        return prompt_logprobs_dict

--- a/vllm/fa_utils.py
+++ b/vllm/fa_utils.py
@@ -46,3 +46,9 @@ def get_flash_attn_version(requires_alibi: bool = False) -> Optional[int]:
        return fa_version
    except (ImportError, AssertionError):
        return None
+def flash_attn_supports_fp8() -> bool:
+    """Report whether the detected FlashAttention setup supports FP8.
+
+    True only when ``get_flash_attn_version()`` returns 3 and the current
+    platform's device capability has major version 9.
+    """
+    from vllm.platforms import current_platform
+    # Check the version first so the capability is queried only when needed.
+    if get_flash_attn_version() != 3:
+        return False
+    return current_platform.get_device_capability().major == 9
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -376,8 +376,22 @@ class HpuModelAdapter:
        mask = mask >= metadata.block_usage.unsqueeze(-1)
        attn_bias = (torch.zeros_like(mask, dtype=dtype).masked_fill_(
            mask, -math.inf))
-        block_mapping = torch.nn.functional.one_hot(metadata.block_groups,
+        if os.environ.get('VLLM_USE_FAKE_HPU',
-                                                    num_classes=batch_size)
+                          '0') == '0' and htorch.utils.internal.is_lazy():
+            block_mapping = torch.nn.functional.one_hot(metadata.block_groups,
+                                                        num_classes=batch_size)
+        else:
+            # Unfortunately one_hot on CPU/torch.compile mode/eager mode
+            # doesn't handle out of bounds classes so we need to convert
+            # all negative values to 0 (block_mapping) or bs (block_groups)
+            block_groups = metadata.block_groups.to(torch.long)
+            block_mapping = torch.nn.functional.relu(block_groups)
+            block_mapping = torch.nn.functional.one_hot(block_mapping,
+                                                        num_classes=batch_size)
+            oob_values = block_groups.lt(0)
+            block_mapping.masked_fill_(oob_values.unsqueeze(-1), 0)
+            block_groups.masked_fill_(oob_values, batch_size)
+            metadata = metadata._replace(block_groups=block_groups)
        block_mapping = block_mapping.to(dtype)
        metadata = metadata._replace(block_mapping=block_mapping,
                                     attn_bias=attn_bias)