Commit 31f6b24f authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge remote-tracking branch 'mirror/v0.8.2' into v0.8.2-ori

parents 89d1dd57 25f560a6
...@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Optional ...@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Optional
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.v1.structured_output.backend_guidance import GuidanceBackend
from vllm.v1.structured_output.backend_types import (StructuredOutputBackend, from vllm.v1.structured_output.backend_types import (StructuredOutputBackend,
StructuredOutputGrammar) StructuredOutputGrammar)
...@@ -50,6 +51,8 @@ class StructuredOutputManager: ...@@ -50,6 +51,8 @@ class StructuredOutputManager:
XgrammarBackend) XgrammarBackend)
self.backend = XgrammarBackend(self.vllm_config) self.backend = XgrammarBackend(self.vllm_config)
elif backend_name == "guidance":
self.backend = GuidanceBackend(self.vllm_config)
else: else:
raise ValueError( raise ValueError(
f"Unsupported structured output backend: {backend_name}") f"Unsupported structured output backend: {backend_name}")
......
# SPDX-License-Identifier: Apache-2.0
import os
from dataclasses import dataclass
from typing import TYPE_CHECKING, Optional
import torch
from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
from vllm.utils import LazyLoader
from vllm.v1.structured_output.backend_types import (StructuredOutputBackend,
StructuredOutputGrammar,
StructuredOutputOptions)
from vllm.v1.structured_output.request import get_structured_output_key
# Static type checkers see the real llguidance modules. At runtime the same
# names are bound to LazyLoader objects instead, presumably so the package
# is only imported on first attribute access (confirm against
# vllm.utils.LazyLoader).
if TYPE_CHECKING:
    import llguidance
    import llguidance.hf as llguidance_hf
    import llguidance.torch as llguidance_torch
else:
    llguidance = LazyLoader("llguidance", globals(), "llguidance")
    llguidance_hf = LazyLoader("llguidance.hf", globals(), "llguidance.hf")
    llguidance_torch = LazyLoader("llguidance.torch", globals(),
                                  "llguidance.torch")

# Module-level logger shared by the backend, the grammar and the helpers.
logger = init_logger(__name__)
class GuidanceBackend(StructuredOutputBackend):
    """Structured-output backend that builds its grammars with llguidance.

    The backend owns a single llguidance tokenizer (derived from the model's
    tokenizer) and hands out one ``GuidanceGrammar`` per compiled request.
    """

    def __init__(self, vllm_config: VllmConfig):
        """Initialise the tokenizer used by every grammar of this backend.

        Args:
            vllm_config: Engine configuration. Its model, scheduler, parallel
                and LoRA sub-configs are forwarded to
                ``init_tokenizer_from_configs``.
        """
        self.vllm_config = vllm_config
        tokenizer_group = init_tokenizer_from_configs(
            model_config=vllm_config.model_config,
            scheduler_config=vllm_config.scheduler_config,
            parallel_config=vllm_config.parallel_config,
            lora_config=vllm_config.lora_config)  # type: ignore[arg-type]
        tokenizer_group.ping()
        self.vocab_size = vllm_config.model_config.get_vocab_size()

        # No LoRA request is passed here; presumably this yields the base
        # model tokenizer (verify against the tokenizer group).
        tokenizer = tokenizer_group.get_lora_tokenizer(None)
        self.ll_tokenizer = llguidance_hf.from_tokenizer(tokenizer, None)

    def compile_grammar(self, request_type: StructuredOutputOptions,
                        grammar_spec: str) -> StructuredOutputGrammar:
        """Compile a structured-output request into a ``GuidanceGrammar``.

        Args:
            request_type: Kind of constraint (JSON schema, regex, ...).
            grammar_spec: Textual specification matching ``request_type``.

        Returns:
            A grammar wrapping a fresh ``LLMatcher``. A matcher error found
            right after construction is logged, not raised.
        """
        # The serialized form is kept on the instance, so it always reflects
        # the most recent compile_grammar() call.
        self.serialized_grammar = serialize_guidance_grammar(
            request_type, grammar_spec)

        ll_matcher = llguidance.LLMatcher(
            self.ll_tokenizer,
            self.serialized_grammar,
            log_level=int(os.environ.get("LLGUIDANCE_LOG_LEVEL", "1")),
        )

        r = GuidanceGrammar(
            ll_matcher=ll_matcher,
            ll_tokenizer=self.ll_tokenizer,
            vocab_size=self.vocab_size,
        )

        r.check_error()
        return r

    def allocate_token_bitmask(self, max_num_seqs: int):
        """Allocate a token bitmask with one row per sequence.

        Args:
            max_num_seqs: Number of rows to allocate.

        Returns:
            Whatever ``llguidance.torch.allocate_token_bitmask`` produces.
        """
        # NOTE(review): the width comes from the llguidance tokenizer, not
        # from self.vocab_size (the model config value) - confirm the two
        # agree for models whose vocabulary is padded.
        return llguidance_torch.allocate_token_bitmask(
            max_num_seqs, self.ll_tokenizer.vocab_size)
@dataclass
class GuidanceGrammar(StructuredOutputGrammar):
    """Per-request grammar state backed by an llguidance ``LLMatcher``."""

    ll_matcher: llguidance.LLMatcher
    ll_tokenizer: llguidance.LLTokenizer
    vocab_size: int
    # True once a matcher error has been logged, so it is reported only once.
    printed_error: bool = False
    # True once accept_tokens() has seen the tokenizer's EOS token.
    terminated: bool = False

    def check_error(self):
        """Log the matcher's current error, at most once per grammar."""
        if not self.printed_error:
            err = self.ll_matcher.get_error()
            if err:
                self.printed_error = True
                logger.warning("LLMatcher error: %s", err)

    def accept_tokens(self, request_id: str, tokens: list[int]) -> bool:
        """Accepts a list of tokens and advances the parser.

        Returns True if the parser was advanced successfully.
        Returns False if the parser failed to advance.
        """

        # EOS is recorded before the matcher is consulted, so the grammar is
        # marked terminated even when the matcher has already stopped.
        if self.ll_tokenizer.eos_token in tokens:
            self.terminated = True

        if self.ll_matcher.is_stopped():
            return True

        # TODO - Add jump decoding support in the future:
        # self.ll_matcher.compute_ff_bytes() - this should always work
        # self.ll_matcher.compute_ff_tokens() - this only works for
        #   "canonical" tokenizers
        # For conversion between the two, see
        # https://github.com/guidance-ai/llguidance/blob/main/docs/fast_forward.md

        r = self.ll_matcher.consume_tokens(tokens)

        self.check_error()

        return r

    def fill_bitmask(self, bitmask: torch.Tensor, idx: int) -> None:
        """Write the allowed-next-token mask into row ``idx`` of ``bitmask``."""
        # this will automatically return [EOS] mask if the matcher is stopped
        # or otherwise in an error state
        llguidance_torch.fill_next_token_bitmask(self.ll_matcher, bitmask, idx)
        self.check_error()

    def is_terminated(self) -> bool:
        """Return True once an EOS token has been accepted."""
        return self.terminated

    def reset(self):
        """Return the grammar to its initial, non-terminated state."""
        # This method may be not needed anymore? TODO
        self.ll_matcher.reset()
        # Clear the EOS flag together with the matcher; otherwise a reset
        # grammar would keep reporting is_terminated() == True.
        self.terminated = False
def serialize_guidance_grammar(request_type: StructuredOutputOptions,
                               grammar_spec: str) -> str:
    """Turn a structured-output request into a serialized llguidance grammar.

    Args:
        request_type: Kind of constraint being requested.
        grammar_spec: Specification text; ignored for ``JSON_OBJECT``, which
            always uses the generic ``{"type": "object"}`` schema.

    Returns:
        The grammar string produced by llguidance.

    Raises:
        ValueError: If ``request_type`` is not one of the supported options.
    """
    # TODO: make whitespace_flexible configurable
    json_defaults = {
        "whitespace_flexible": True,
    }

    # Both JSON flavours go through the JSON-schema entry point.
    if request_type == StructuredOutputOptions.JSON:
        return llguidance.LLMatcher.grammar_from_json_schema(
            grammar_spec, defaults=json_defaults)
    if request_type == StructuredOutputOptions.JSON_OBJECT:
        return llguidance.LLMatcher.grammar_from_json_schema(
            '{"type": "object"}', defaults=json_defaults)

    # Remaining options map one-to-one onto llguidance grammar kinds.
    kind_by_option = (
        (StructuredOutputOptions.REGEX, "regex"),
        (StructuredOutputOptions.GRAMMAR, "grammar"),
        (StructuredOutputOptions.CHOICE, "choice"),
    )
    grammar_kind = next(
        (kind for option, kind in kind_by_option if request_type == option),
        None)

    if grammar_kind is None:
        logger.error("Validation should have already occurred. "
                     "Please file an issue.")
        raise ValueError("grammar is not of valid supported types. "
                         f"({request_type!s})")

    return llguidance.grammar_from(grammar_kind, grammar_spec)
def validate_guidance_grammar(
        sampling_params: SamplingParams,
        tokenizer: Optional[llguidance.LLTokenizer] = None) -> None:
    """Check that a request's guided-decoding grammar is accepted by llguidance.

    Args:
        sampling_params: Sampling parameters carrying the guided-decoding
            request to validate.
        tokenizer: Optional llguidance tokenizer forwarded to the validator.

    Raises:
        ValueError: If llguidance reports a problem with the grammar.
    """
    request_type, grammar_spec = get_structured_output_key(sampling_params)
    serialized = serialize_guidance_grammar(request_type, grammar_spec)

    err = llguidance.LLMatcher.validate_grammar(serialized,
                                                tokenizer=tokenizer)
    if not err:
        return
    raise ValueError(f"Grammar error: {err}")
...@@ -53,25 +53,30 @@ class StructuredOutputRequest: ...@@ -53,25 +53,30 @@ class StructuredOutputRequest:
@functools.cached_property @functools.cached_property
def structured_output_key(self) -> StructuredOutputKey: def structured_output_key(self) -> StructuredOutputKey:
params = self.sampling_params.guided_decoding return get_structured_output_key(self.sampling_params)
assert params is not None, "params can't be None."
if params.json is not None:
if not isinstance(params.json, str): def get_structured_output_key(
json_str = json.dumps(params.json) sampling_params: SamplingParams) -> StructuredOutputKey:
else: params = sampling_params.guided_decoding
json_str = params.json assert params is not None, "params can't be None."
return (StructuredOutputOptions.JSON, json_str) if params.json is not None:
elif params.json_object: if not isinstance(params.json, str):
return (StructuredOutputOptions.JSON_OBJECT, "") json_str = json.dumps(params.json)
elif params.regex is not None: else:
return (StructuredOutputOptions.REGEX, params.regex) json_str = params.json
elif params.choice is not None: return (StructuredOutputOptions.JSON, json_str)
if not isinstance(params.choice, str): elif params.json_object:
json_str = json.dumps(params.choice) return (StructuredOutputOptions.JSON_OBJECT, "")
else: elif params.regex is not None:
json_str = params.choice return (StructuredOutputOptions.REGEX, params.regex)
return (StructuredOutputOptions.CHOICE, json_str) elif params.choice is not None:
elif params.grammar is not None: if not isinstance(params.choice, str):
return (StructuredOutputOptions.GRAMMAR, params.grammar) json_str = json.dumps(params.choice)
else: else:
raise ValueError("No valid structured output parameter found") json_str = params.choice
return (StructuredOutputOptions.CHOICE, json_str)
elif params.grammar is not None:
return (StructuredOutputOptions.GRAMMAR, params.grammar)
else:
raise ValueError("No valid structured output parameter found")
...@@ -239,7 +239,7 @@ def choice_as_grammar(choice: list[str]) -> str: ...@@ -239,7 +239,7 @@ def choice_as_grammar(choice: list[str]) -> str:
return grammar return grammar
def validate_structured_output_request( def validate_structured_output_request_xgrammar(
sampling_params: SamplingParams) -> None: sampling_params: SamplingParams) -> None:
"""Validate that the request is supported by structured output. """Validate that the request is supported by structured output.
......
...@@ -11,6 +11,7 @@ from vllm.lora.request import LoRARequest ...@@ -11,6 +11,7 @@ from vllm.lora.request import LoRARequest
from vllm.multimodal import MultiModalKwargs from vllm.multimodal import MultiModalKwargs
from vllm.sampling_params import SamplingParams, SamplingType from vllm.sampling_params import SamplingParams, SamplingType
from vllm.utils import swap_dict_values from vllm.utils import swap_dict_values
from vllm.v1.outputs import LogprobsTensors
from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.utils import copy_slice from vllm.v1.utils import copy_slice
from vllm.v1.worker.block_table import BlockTable from vllm.v1.worker.block_table import BlockTable
...@@ -197,6 +198,9 @@ class InputBatch: ...@@ -197,6 +198,9 @@ class InputBatch:
# that are currently in the prefill phase. # that are currently in the prefill phase.
self.num_prompt_logprobs: dict[str, int] = {} self.num_prompt_logprobs: dict[str, int] = {}
# To accumulate prompt logprobs tensor chunks across prefill steps.
self.in_progress_prompt_logprobs_cpu: dict[str, LogprobsTensors] = {}
self.logit_bias: list[Optional[dict[int, self.logit_bias: list[Optional[dict[int,
float]]] = [None] * max_num_reqs float]]] = [None] * max_num_reqs
self.has_allowed_token_ids: set[str] = set() self.has_allowed_token_ids: set[str] = set()
...@@ -362,6 +366,7 @@ class InputBatch: ...@@ -362,6 +366,7 @@ class InputBatch:
self.generators.pop(req_index, None) self.generators.pop(req_index, None)
self.num_logprobs.pop(req_id, None) self.num_logprobs.pop(req_id, None)
self.num_prompt_logprobs.pop(req_id, None) self.num_prompt_logprobs.pop(req_id, None)
self.in_progress_prompt_logprobs_cpu.pop(req_id, None)
# LoRA # LoRA
lora_id = self.request_lora_mapping[req_index] lora_id = self.request_lora_mapping[req_index]
......
...@@ -1059,7 +1059,10 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -1059,7 +1059,10 @@ class GPUModelRunner(LoRAModelRunnerMixin):
sampling_metadata=sampling_metadata, sampling_metadata=sampling_metadata,
) )
else: else:
# TODO(woosuk): Optimize the memory usage. # When indexing with a tensor (bonus_logits_indices), PyTorch
# creates a new tensor with separate storage from the original
# logits tensor. This means any in-place operations on bonus_logits
# won't affect the original logits tensor.
bonus_logits = logits[spec_decode_metadata.bonus_logits_indices] bonus_logits = logits[spec_decode_metadata.bonus_logits_indices]
sampler_output = self.model.sample( sampler_output = self.model.sample(
logits=bonus_logits, logits=bonus_logits,
...@@ -1067,7 +1070,9 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -1067,7 +1070,9 @@ class GPUModelRunner(LoRAModelRunnerMixin):
) )
bonus_token_ids = sampler_output.sampled_token_ids bonus_token_ids = sampler_output.sampled_token_ids
# TODO(woosuk): Optimize the memory usage. # Just like `bonus_logits`, `target_logits` is a new tensor with
# separate storage from the original `logits` tensor. Therefore,
# it is safe to update `target_logits` in place.
target_logits = logits[spec_decode_metadata.target_logits_indices] target_logits = logits[spec_decode_metadata.target_logits_indices]
output_token_ids = self.rejection_sampler( output_token_ids = self.rejection_sampler(
spec_decode_metadata, spec_decode_metadata,
...@@ -1191,6 +1196,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -1191,6 +1196,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
if not num_prompt_logprobs_dict: if not num_prompt_logprobs_dict:
return {} return {}
in_progress_dict = self.input_batch.in_progress_prompt_logprobs_cpu
prompt_logprobs_dict: dict[str, Optional[LogprobsTensors]] = {} prompt_logprobs_dict: dict[str, Optional[LogprobsTensors]] = {}
# Since prompt logprobs are a rare feature, prioritize simple, # Since prompt logprobs are a rare feature, prioritize simple,
...@@ -1206,16 +1212,36 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -1206,16 +1212,36 @@ class GPUModelRunner(LoRAModelRunnerMixin):
prompt_token_ids = torch.tensor(request.prompt_token_ids).to( prompt_token_ids = torch.tensor(request.prompt_token_ids).to(
self.device, non_blocking=True) self.device, non_blocking=True)
# Set up target LogprobsTensors object.
logprobs_tensors = in_progress_dict.get(req_id)
if not logprobs_tensors:
# Create empty logprobs CPU tensors for the entire prompt.
# If chunked, we'll copy in slice by slice.
logprobs_tensors = LogprobsTensors.empty_cpu(
num_prompt_tokens - 1, num_prompt_logprobs + 1)
in_progress_dict[req_id] = logprobs_tensors
# Determine number of logits to retrieve. # Determine number of logits to retrieve.
start_tok = request.num_computed_tokens + 1 start_idx = request.num_computed_tokens
start_tok = start_idx + 1
num_remaining_tokens = num_prompt_tokens - start_tok num_remaining_tokens = num_prompt_tokens - start_tok
if num_tokens < num_remaining_tokens: if num_tokens <= num_remaining_tokens:
# This is a chunk, more tokens remain. # This is a chunk, more tokens remain.
# In the == case, there are no more prompt logprobs to produce
# but we want to defer returning them to the next step where we
# have new generated tokens to return.
num_logits = num_tokens num_logits = num_tokens
else: else:
# This is the last chunk of prompt tokens to return. # This is the last chunk of prompt tokens to return.
num_logits = num_remaining_tokens num_logits = num_remaining_tokens
completed_prefill_reqs.append(req_id) completed_prefill_reqs.append(req_id)
prompt_logprobs_dict[req_id] = logprobs_tensors
if num_logits <= 0:
# This can happen for the final chunk if we prefilled exactly
# (num_prompt_tokens - 1) tokens for this request in the prior
# step. There are no more prompt logprobs to produce.
continue
# Get the logits corresponding to this req's prompt tokens. # Get the logits corresponding to this req's prompt tokens.
# If this is a partial request (i.e. chunked prefill), # If this is a partial request (i.e. chunked prefill),
...@@ -1236,19 +1262,23 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -1236,19 +1262,23 @@ class GPUModelRunner(LoRAModelRunnerMixin):
logprobs, num_prompt_logprobs, tgt_token_ids) logprobs, num_prompt_logprobs, tgt_token_ids)
# Transfer GPU->CPU async. # Transfer GPU->CPU async.
prompt_logprobs_dict[req_id] = LogprobsTensors( chunk_slice = slice(start_idx, start_idx + num_logits)
token_ids.to("cpu", non_blocking=True), logprobs_tensors.logprob_token_ids[chunk_slice].copy_(
logprobs.to("cpu", non_blocking=True), token_ids, non_blocking=True)
ranks.to("cpu", non_blocking=True), logprobs_tensors.logprobs[chunk_slice].copy_(logprobs,
) non_blocking=True)
logprobs_tensors.selected_token_ranks[chunk_slice].copy_(
ranks, non_blocking=True)
# Remove requests that have completed prefill from the batch # Remove requests that have completed prefill from the batch
# num_prompt_logprobs_dict. # num_prompt_logprobs_dict.
for req_id in completed_prefill_reqs: for req_id in completed_prefill_reqs:
del num_prompt_logprobs_dict[req_id] del num_prompt_logprobs_dict[req_id]
del in_progress_dict[req_id]
# Must synchronize the non-blocking GPU->CPU transfers. # Must synchronize the non-blocking GPU->CPU transfers.
torch.cuda.synchronize() if prompt_logprobs_dict:
torch.cuda.synchronize()
return prompt_logprobs_dict return prompt_logprobs_dict
......
...@@ -46,3 +46,9 @@ def get_flash_attn_version(requires_alibi: bool = False) -> Optional[int]: ...@@ -46,3 +46,9 @@ def get_flash_attn_version(requires_alibi: bool = False) -> Optional[int]:
return fa_version return fa_version
except (ImportError, AssertionError): except (ImportError, AssertionError):
return None return None
def flash_attn_supports_fp8() -> bool:
from vllm.platforms import current_platform
return get_flash_attn_version() == 3 and \
current_platform.get_device_capability().major == 9
...@@ -376,8 +376,22 @@ class HpuModelAdapter: ...@@ -376,8 +376,22 @@ class HpuModelAdapter:
mask = mask >= metadata.block_usage.unsqueeze(-1) mask = mask >= metadata.block_usage.unsqueeze(-1)
attn_bias = (torch.zeros_like(mask, dtype=dtype).masked_fill_( attn_bias = (torch.zeros_like(mask, dtype=dtype).masked_fill_(
mask, -math.inf)) mask, -math.inf))
block_mapping = torch.nn.functional.one_hot(metadata.block_groups, if os.environ.get('VLLM_USE_FAKE_HPU',
num_classes=batch_size) '0') == '0' and htorch.utils.internal.is_lazy():
block_mapping = torch.nn.functional.one_hot(metadata.block_groups,
num_classes=batch_size)
else:
# Unfortunately one_hot on CPU/torch.compile mode/eager mode
# doesn't handle out of bounds classes so we need to convert
# all negative values to 0 (block_mapping) or bs (block_groups)
block_groups = metadata.block_groups.to(torch.long)
block_mapping = torch.nn.functional.relu(block_groups)
block_mapping = torch.nn.functional.one_hot(block_mapping,
num_classes=batch_size)
oob_values = block_groups.lt(0)
block_mapping.masked_fill_(oob_values.unsqueeze(-1), 0)
block_groups.masked_fill_(oob_values, batch_size)
metadata = metadata._replace(block_groups=block_groups)
block_mapping = block_mapping.to(dtype) block_mapping = block_mapping.to(dtype)
metadata = metadata._replace(block_mapping=block_mapping, metadata = metadata._replace(block_mapping=block_mapping,
attn_bias=attn_bias) attn_bias=attn_bias)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment