Unverified commit 26e673fe, authored by Woosuk Kwon, committed by GitHub
Browse files

[V0 Deprecation] Remove V0 Sequence class & Sampler (#25332)


Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
parent 65a5910c
......@@ -2,18 +2,15 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable
from typing import Optional
import torch
import torch.nn as nn
from vllm.config import VllmConfig
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
from .utils import maybe_prefix
......@@ -105,8 +102,10 @@ class Medusa(nn.Module):
return [block(hidden_states) for block in self.blocks]
def compute_logits(
self, hidden_states: list[torch.Tensor],
sampling_metadata: SamplingMetadata) -> list[torch.Tensor]:
self,
hidden_states: list[torch.Tensor],
sampling_metadata,
) -> list[torch.Tensor]:
logits_lst: list[torch.Tensor] = []
for hs, lm_head in zip(hidden_states, self.lm_heads):
......@@ -130,57 +129,6 @@ class Medusa(nn.Module):
return logits_lst
def sample(
    self,
    logits: list[torch.Tensor],
    sampling_metadata: SamplingMetadata,
) -> list[SamplerOutput]:
    """Greedily pick one token per head for every sequence group.

    Args:
        logits: One logits tensor per head. They are stacked along a new
            leading dimension and cast to float before further processing.
        sampling_metadata: Supplies `seq_groups`; each group's
            `sample_indices` selects that group's entries along dim 1 of
            the stacked tensors.

    Returns:
        One `SamplerOutput` per sequence group carrying the selected
        probabilities, log-probabilities and argmax token ids, each with
        dim 1 squeezed.
    """
    stacked = torch.stack(logits, dim=0).float()
    log_probabilities = torch.log_softmax(stacked, dim=-1)
    # Only top-1 (argmax) selection is supported for now.
    top1_token_ids = stacked.argmax(-1)
    probabilities = torch.softmax(stacked, dim=-1)

    results: list[Optional[SamplerOutput]] = []
    for group in sampling_metadata.seq_groups:
        picked = group.sample_indices
        results.append(
            SamplerOutput(
                outputs=None,
                sampled_token_probs=probabilities[:, picked].squeeze(1),
                logprobs=log_probabilities[:, picked].squeeze(1),
                sampled_token_ids=top1_token_ids[:, picked].squeeze(1),
            ))
    return results
def generate_proposals(
    self,
    previous_hidden_states: torch.Tensor,
    sampling_metadata: SamplingMetadata,
) -> Optional[list[SamplerOutput]]:
    """Run forward -> compute_logits -> sample on the given hidden states.

    Returns:
        The result of `self.sample(...)`, or None when
        `previous_hidden_states` has size 0 along its first dimension.
    """
    # During preemption an empty tensor (batch_size=0) may be received.
    # None signals the Top1Proposer that no proposals were generated for
    # this batch so it can handle that special case itself.
    if previous_hidden_states.size(0) == 0:
        return None

    head_states = self.forward(previous_hidden_states)
    head_logits = self.compute_logits(
        hidden_states=head_states,
        sampling_metadata=sampling_metadata,
    )
    return self.sample(
        logits=head_logits,
        sampling_metadata=sampling_metadata,
    )
def load_weights(self, weights: Iterable[tuple[str,
torch.Tensor]]) -> set[str]:
params_dict = dict(self.named_parameters())
......
......@@ -8,9 +8,7 @@ import torch
import torch.nn as nn
from vllm.config import VllmConfig
from vllm.model_executor import SamplingMetadata
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
......@@ -141,55 +139,57 @@ class MLPSpeculator(nn.Module):
self.config = config
self.logits_processor = LogitsProcessor(config.vocab_size,
config.vocab_size, 1.0)
self.sampler = get_sampler()
def generate_proposals(
self,
input_ids: torch.Tensor,
previous_hidden_states: torch.Tensor,
num_predict_tokens: int,
sampling_metadata: SamplingMetadata,
) -> list[SamplerOutput]:
if num_predict_tokens > self.max_speculative_tokens:
raise ValueError(f"Max speculative tokens for model is "
f"{self.max_speculative_tokens}, but "
f"{num_predict_tokens} were requested")
# b x 1 x d
previous_hidden_states = previous_hidden_states.unsqueeze(1)
# NOTE(woosuk): This method is commented out because it is old code
# using V0. We should either port it to V1 or remove it.
if self.scale_input:
previous_hidden_states = self.ln0(previous_hidden_states) / SQRT2
# def generate_proposals(
# self,
# input_ids: torch.Tensor,
# previous_hidden_states: torch.Tensor,
# num_predict_tokens: int,
# sampling_metadata: SamplingMetadata,
# ) -> list[SamplerOutput]:
# if num_predict_tokens > self.max_speculative_tokens:
# raise ValueError(f"Max speculative tokens for model is "
# f"{self.max_speculative_tokens}, but "
# f"{num_predict_tokens} were requested")
# # b x 1 x d
# previous_hidden_states = previous_hidden_states.unsqueeze(1)
# if self.scale_input:
# previous_hidden_states = self.ln0(previous_hidden_states) / SQRT2
# b x 1
last_tokens = input_ids.unsqueeze(1)
# # b x 1
# last_tokens = input_ids.unsqueeze(1)
next_tokens = []
# next_tokens = []
for head_index in range(num_predict_tokens):
# for head_index in range(num_predict_tokens):
# Project and predict
z = self.emb[head_index](last_tokens) # b k d
states = self.proj[head_index](previous_hidden_states)
# # Project and predict
# z = self.emb[head_index](last_tokens) # b k d
# states = self.proj[head_index](previous_hidden_states)
# Weighted add of state_weight*state and emb_weight*z
# Let subsequent LN take care of denominator
# state_weight is close to 1, so shouldn't be any precision issues
states.add_(z, alpha=self.emb_weight / self.state_weight)
# # Weighted add of state_weight*state and emb_weight*z
# # Let subsequent LN take care of denominator
# # state_weight is close to 1, so shouldn't be any precision issues
# states.add_(z, alpha=self.emb_weight / self.state_weight)
states = self.activation(self.ln[head_index](states)) # b k d
previous_hidden_states = states
# TODO: not yet supporting top_k_tokens_per_head
states = states.flatten(0, 1)
# states = self.activation(self.ln[head_index](states)) # b k d
# previous_hidden_states = states
# # TODO: not yet supporting top_k_tokens_per_head
# states = states.flatten(0, 1)
logits = self.logits_processor(self.head[head_index], states,
sampling_metadata)
# logits = self.logits_processor(self.head[head_index], states,
# sampling_metadata)
output = self.sampler(logits, sampling_metadata)
last_tokens = output.sampled_token_ids
next_tokens.append(output)
# output = self.sampler(logits, sampling_metadata)
# last_tokens = output.sampled_token_ids
# next_tokens.append(output)
return next_tokens
# return next_tokens
def load_weights(self, weights: Iterable[tuple[str,
torch.Tensor]]) -> set[str]:
......
......@@ -697,16 +697,12 @@ class Phi4FlashForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsV0Only):
hidden_states: torch.Tensor,
sampling_metadata: SamplingMetadata,
) -> Optional[torch.Tensor]:
# If the shape is the same, it means that we have already
# prune hidden states manually.
prune_hidden_states = hidden_states.size(
0) != sampling_metadata.selected_token_indices.size(0)
processed_logits = self.logits_processor(
self.lm_head,
hidden_states,
sampling_metadata,
self.embedding_bias,
prune_hidden_states=prune_hidden_states)
)
return processed_logits
def load_weights(
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from array import array
from dataclasses import dataclass
from typing import Optional
import torch
from vllm.sampling_params import SamplingParams, SamplingType
from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData,
SequenceGroupMetadata)
from vllm.utils import (PyObjectCache, async_tensor_h2d,
is_pin_memory_available, make_tensor_with_pad)
_SAMPLING_EPS = 1e-5
@dataclass
class SequenceGroupToSample:
    """Per-sequence-group record consumed by the sampler.

    Relationship between the length fields:

    |---------- N-1 iteration --------|
    |---------------- N iteration ---------------------|
    |- tokenA -|......................|-- newTokens ---|
    |---------- context_len ----------|
    |-------------------- seq_len ----------------------|
                                      |-- query_len ---|
    """

    # Sequence ids of the sequence group in a previous step.
    seq_ids: list[int]
    sampling_params: SamplingParams
    # Maps seq_id -> sequence data.
    seq_data: dict[int, SequenceData]
    # Length of the sequence (all past tokens plus the new tokens attention
    # is computed for). None when the group is in a decode stage.
    seq_len: Optional[int]
    # Number of new query tokens computed in the current step. None when the
    # group is in a decode stage. query_len <= seq_len when chunked prefill
    # is enabled.
    query_len: Optional[int]
    # Random number generator used for sampling.
    generator: Optional[torch.Generator]
    # True for the prefill stage, False for the decode stage.
    is_prompt: bool
    # Indices into the logits used for prompt logprobs. Empty when prompt
    # logprobs are not required.
    prompt_logprob_indices: list[int]
    # Indices into the logits used for sampling. Empty when sampling is not
    # required.
    sample_indices: list[int]

    @property
    def do_sample(self):
        """Whether there is at least one index to sample."""
        return len(self.sample_indices) != 0

    def __post_init__(self):
        """Check that the fields are mutually consistent."""
        wants_prompt_logprobs = len(self.prompt_logprob_indices) > 0
        if wants_prompt_logprobs:
            assert self.sampling_params.prompt_logprobs is not None
        if not self.is_prompt:
            return
        # Prefill groups must carry both lengths.
        assert self.seq_len is not None
        assert self.query_len is not None
def gen_seq_group_to_sample_builder(num_seqs: int):
    """Return a zero-argument factory for blank `SequenceGroupToSample`s.

    Every object the factory produces has `num_seqs` placeholder sequence
    ids (all 0), zero lengths, `is_prompt=True`, and fresh empty index lists.
    """

    def _build_blank():
        return SequenceGroupToSample(
            seq_ids=[0] * num_seqs,
            sampling_params=None,
            seq_data=None,  # type: ignore
            seq_len=0,
            query_len=0,
            generator=None,
            is_prompt=True,
            prompt_logprob_indices=[],
            sample_indices=[],
        )

    return _build_blank
class SamplingMetadataCache:
    """Used to cache SamplingMetadata objects between scheduler iterations"""

    def __init__(self):
        # One object cache per distinct number of sequences.
        self._seq_group_to_sample_cache: dict[int, PyObjectCache] = {}

    def get_cached_seq_group_to_sample(self, num_seqs):
        """Fetch an object from the cache keyed by `num_seqs`.

        The per-`num_seqs` cache is created on first use.
        """
        pools = self._seq_group_to_sample_cache
        if num_seqs not in pools:
            builder = gen_seq_group_to_sample_builder(num_seqs)
            pools[num_seqs] = PyObjectCache(builder)
        return pools[num_seqs].get_object()

    def reset(self):
        """Call `reset()` on every per-`num_seqs` cache."""
        for pool in self._seq_group_to_sample_cache.values():
            pool.reset()
class SamplingMetadata:
    """Metadata for input sequences. Used in sampler.

    Typical usage:

    ```
    hidden_states = execute_model(...)
    logits = hidden_states[sampling_metadata.selected_token_indices]
    sample(logits)

    def sample(logits):
        # Use categorized_sample_indices for sampling....
    ```

    Args:
        seq_groups: List of batched sequence groups.
        selected_token_indices: (num_query_tokens_to_logprob). Indices to find
            logits from the initial model output hidden states.
        categorized_sample_indices: SamplingType -> token indices to sample.
            Each token indices is 2D tensor of (num_indices, num_indices) where
            the first item means the sample index within the returned logit
            (before pruning padding), and the second item means the sample
            index after pruning using selected_token_indices.
            For example, if the returned logit is [1, 2, 3], and we select
            [1, 2] for sampling, the pruned logit will be [2, 3]. In this case,
            The first tuple is [1, 2] (sampled index within original logit),
            and the second tuple is [0, 1] (sampled index within pruned logit).
        num_prompts: Number of prompt sequence groups in seq_groups.
        skip_sampler_cpu_output: Indicates if we want to skip the GPU=>CPU
            serialization of token outputs.
        reuse_sampling_tensors: Indicates if we want to reuse sampling
            tensors that are part of the sampler forward pass. Currently,
            it is mainly used for multi-step decode.
    """

    def __init__(
        self,
        seq_groups: list[SequenceGroupToSample],
        selected_token_indices: torch.Tensor,
        categorized_sample_indices: dict[SamplingType, torch.Tensor],
        num_prompts: int,
        skip_sampler_cpu_output: bool = False,
        reuse_sampling_tensors: bool = False,
    ) -> None:
        self.seq_groups = seq_groups
        self.selected_token_indices = selected_token_indices
        self.categorized_sample_indices = categorized_sample_indices
        self.num_prompts = num_prompts
        self.skip_sampler_cpu_output = skip_sampler_cpu_output
        self.reuse_sampling_tensors = reuse_sampling_tensors

    @staticmethod
    def prepare(
        seq_group_metadata_list: list[SequenceGroupMetadata],
        seq_lens: list[int],
        query_lens: list[int],
        device: str,
        pin_memory: bool,
        generators: Optional[dict[str, torch.Generator]] = None,
        cache: Optional[SamplingMetadataCache] = None,
    ) -> "SamplingMetadata":
        """Build a `SamplingMetadata` from scheduler-level metadata.

        The index lists produced by `_prepare_seq_groups` are converted with
        `async_tensor_h2d` (long for the selected token indices, int for the
        per-sampling-type indices) targeting `device`.
        """
        groups, selected, categorized, prompt_count = _prepare_seq_groups(
            seq_group_metadata_list, seq_lens, query_lens, device, generators,
            cache)

        selected_tensor = async_tensor_h2d(
            selected,
            dtype=torch.long,
            target_device=device,
            pin_memory=pin_memory,
        )

        categorized_tensors = {}
        for sampling_type, indices in categorized.items():
            categorized_tensors[sampling_type] = async_tensor_h2d(
                indices,
                dtype=torch.int,
                target_device=device,
                pin_memory=pin_memory,
            )

        return SamplingMetadata(
            seq_groups=groups,
            selected_token_indices=selected_tensor,
            categorized_sample_indices=categorized_tensors,
            num_prompts=prompt_count,
        )

    def __repr__(self) -> str:
        fields = (
            f"seq_groups={self.seq_groups}, "
            f"selected_token_indices={self.selected_token_indices}, "
            f"categorized_sample_indices={self.categorized_sample_indices}")
        return "SamplingMetadata(" + fields + ")"
def _prepare_seq_groups(
seq_group_metadata_list: list[SequenceGroupMetadata],
seq_lens: list[int],
query_lens: list[int],
device: str,
generators: Optional[dict[str, torch.Generator]] = None,
cache: Optional[SamplingMetadataCache] = None,
) -> tuple[
list[SequenceGroupToSample],
list[int],
dict[SamplingType, list[int]],
int,
]:
"""Prepare sequence groups and indices for sampling.
Args:
seq_group_metadata_list: A list of sequence group to batch.
seq_lens: A list of sequence lens per sequence group.
Index of prompt len should match with seq_group_metadata_list.
query_lens: A list of query lengths. Prompt lens include the length
of entire prompt tokens, and it could be shorter.
device: A device to use for random number generators,
`SequenceGroupToSample.generator`.
generators: A store of per-request random number generators used
for seeded requests.
cache: Optional `SamplingMetadataCache`. When given, the
`SequenceGroupToSample` objects are fetched from it and filled in
place instead of being newly constructed, and `cache.reset()` is
called by this function.
Returns:
seq_groups: A list of sequence group to sample.
selected_token_indices: See the definition from `SamplingMetadata`.
categorized_sample_indices: See the definition from `SamplingMetadata`.
num_prompts: Total number of prompts from `seq_group_metadata_list`.
"""
# Batched sequence groups for the current model forward step.
seq_groups: list[SequenceGroupToSample] = []
# A list of token indices to sample/compute logprob. It is used to
# prune the outcome logits from the model for the performance.
selected_token_indices: list[int] = []
# Used for selected_token_indices.
model_output_idx = 0
# Sampling type -> indices (within the pruned logits) of the tokens to
# sample with that sampling type.
categorized_sample_indices: dict[SamplingType, list[int]] = {
t: []
for t in SamplingType
}
# Index of logits to compute logprob. Logits include both prompt logprob
# and sample logprob indices.
logit_idx = 0
# Total number of prompts from given sequence groups.
num_prompts = 0
for i, seq_group_metadata in enumerate(seq_group_metadata_list):
seq_ids = seq_group_metadata.seq_data.keys()
if cache is not None:
# Reuse a cached object: overwrite its seq ids and empty its index
# lists so they can be refilled below.
sample_obj = cache.get_cached_seq_group_to_sample(len(seq_ids))
for j, seq_id in enumerate(seq_ids):
sample_obj.seq_ids[j] = seq_id
sample_obj.prompt_logprob_indices.clear()
sample_obj.sample_indices.clear()
sampling_params = seq_group_metadata.sampling_params
is_prompt = seq_group_metadata.is_prompt
generator: Optional[torch.Generator] = None
# If the current seq group is in decode stage, it is None.
seq_len: Optional[int] = None
query_len: Optional[int] = None
# With a cache, these alias the cached object's own lists, so extending
# them below fills the cached object directly.
prompt_logprob_indices: list[int] = (sample_obj.prompt_logprob_indices
if cache is not None else [])
sample_indices: list[int] = (sample_obj.sample_indices
if cache is not None else [])
do_sample = seq_group_metadata.do_sample
if seq_group_metadata.is_prompt:
if sampling_params.seed is not None:
generator = torch.Generator(device=device).manual_seed(
sampling_params.seed)
if generators is not None:
generators[seq_group_metadata.request_id] = generator
num_prompts += 1
num_prefill_sample = len(seq_ids)
assert num_prefill_sample == 1
assert query_lens is not None and seq_lens is not None
query_len, seq_len = query_lens[i], seq_lens[i]
# If we need sampling, exclude num_prefill_sample tokens from
# prompt logprob.
prompt_logprob_len = (query_len - num_prefill_sample
if do_sample else query_len)
sample_len = num_prefill_sample if do_sample else 0
else:
# Decode
prompt_logprob_len = 0
query_len = query_lens[i] if query_lens is not None and len(
query_lens) > 0 else 1
sample_len = len(seq_ids) * query_len if do_sample else 0
if sampling_params.seed is not None and generators is not None:
generator = generators.get(seq_group_metadata.request_id)
# Update indices to select from the model output.
"""
This blocks computes selected_token_indices which is used in the
following way.
hidden_states = model(...)
logits = hidden_states[selected_token_indices]
"""
if sampling_params.prompt_logprobs is not None:
selected_token_indices.extend(
range(model_output_idx, model_output_idx + prompt_logprob_len))
model_output_idx += prompt_logprob_len
if do_sample:
selected_token_indices.extend(
range(model_output_idx, model_output_idx + sample_len))
model_output_idx += sample_len
# We now find indices for logprob computation and sampling.
"""
This block computes categorized_sample_indices which is used in the
following way.
hidden_states = model(...)
logits = hidden_states[selected_token_indices]
def sample(logits):
# Use categorized_sample_indices for sampling.
# prompt_logprob_indices to find prompt logprob indices.
# sample_indices to find sample indices.
"""
if sampling_params.prompt_logprobs is not None:
prompt_logprob_indices.extend(
range(logit_idx, logit_idx + prompt_logprob_len))
logit_idx += prompt_logprob_len
if do_sample:
sample_indices.extend(range(logit_idx, logit_idx + sample_len))
categorized_sample_indices[sampling_params.sampling_type].extend(
list(range(logit_idx, logit_idx + sample_len)))
logit_idx += sample_len
if cache is not None:
sample_obj.sampling_params = sampling_params
sample_obj.seq_data = seq_group_metadata.seq_data
sample_obj.seq_len = seq_len
sample_obj.query_len = query_len
sample_obj.generator = generator
sample_obj.is_prompt = is_prompt
else:
sample_obj = SequenceGroupToSample(
seq_ids=list(seq_ids),
sampling_params=sampling_params,
seq_data=seq_group_metadata.seq_data,
seq_len=seq_len,
query_len=query_len,
generator=generator,
is_prompt=is_prompt,
prompt_logprob_indices=list(prompt_logprob_indices),
sample_indices=list(sample_indices),
)
seq_groups.append(sample_obj)
if cache is not None:
cache.reset()
return (seq_groups, selected_token_indices, categorized_sample_indices,
num_prompts)
@dataclass
class SamplingTensors:
    """Tensors for sampling."""

    temperatures: torch.Tensor
    top_ps: torch.Tensor
    top_ks: torch.Tensor
    min_ps: torch.Tensor
    presence_penalties: torch.Tensor
    frequency_penalties: torch.Tensor
    repetition_penalties: torch.Tensor
    prompt_tokens: torch.Tensor
    output_tokens: torch.Tensor

    @classmethod
    def from_sampling_metadata(
        cls,
        sampling_metadata: "SamplingMetadata",
        vocab_size: int,
        device: torch.device,
        dtype: torch.dtype,
    ) -> tuple["SamplingTensors", bool, bool, bool]:
        """Expand per-group sampling params into per-row lists and tensors.

        Returns:
            `(sampling_tensors, do_penalties, do_top_p_top_k, do_min_p)`.
            Each flag is True when at least one sequence group uses a
            non-neutral value for the corresponding parameter(s).
        """
        prompt_tokens: list[array] = []
        output_tokens: list[array] = []
        top_ks: list[int] = []
        temperatures: list[float] = []
        top_ps: list[float] = []
        min_ps: list[float] = []
        presence_penalties: list[float] = []
        frequency_penalties: list[float] = []
        repetition_penalties: list[float] = []
        do_penalties = False
        do_top_p_top_k = False
        do_min_p = False

        groups = sampling_metadata.seq_groups
        assert groups is not None
        for group in groups:
            params = group.sampling_params
            temperature = params.temperature
            presence = params.presence_penalty
            frequency = params.frequency_penalty
            repetition = params.repetition_penalty
            top_p = params.top_p
            min_p = params.min_p

            # k should not be greater than the vocab size; values below 1
            # are mapped to the vocab size.
            top_k = min(params.top_k, vocab_size)
            if top_k < 1:
                top_k = vocab_size

            if temperature < _SAMPLING_EPS:
                # NOTE: Zero temperature means deterministic sampling
                # (i.e., greedy sampling or beam search).
                # Set the temperature to 1 to avoid division by zero.
                temperature = 1.0

            if not do_top_p_top_k and (top_p < 1.0 - _SAMPLING_EPS
                                       or top_k != vocab_size):
                do_top_p_top_k = True
            if not do_min_p and min_p > _SAMPLING_EPS:
                do_min_p = True
            if not do_penalties and (abs(presence) >= _SAMPLING_EPS
                                     or abs(frequency) >= _SAMPLING_EPS
                                     or abs(repetition - 1.0)
                                     >= _SAMPLING_EPS):
                do_penalties = True

            if group.is_prompt and params.prompt_logprobs is not None:
                # Rows for prompt tokens that only need logprobs: penalties
                # are filled with neutral values.
                query_len = group.query_len
                assert query_len is not None
                prefill_len = len(group.prompt_logprob_indices)
                for bucket, value in (
                    (temperatures, temperature),
                    (top_ps, top_p),
                    (top_ks, top_k),
                    (min_ps, min_p),
                    (presence_penalties, 0),
                    (frequency_penalties, 0),
                    (repetition_penalties, 1),
                ):
                    bucket += [value] * prefill_len

            if group.do_sample:
                sample_lens = len(group.sample_indices)
                assert sample_lens >= len(group.seq_ids)
                for bucket, value in (
                    (temperatures, temperature),
                    (top_ps, top_p),
                    (top_ks, top_k),
                    (min_ps, min_p),
                    (presence_penalties, presence),
                    (frequency_penalties, frequency),
                    (repetition_penalties, repetition),
                ):
                    bucket += [value] * sample_lens

        if do_penalties:
            # Token histories are only collected when some penalty is active.
            for group in groups:
                params = group.sampling_params
                if group.is_prompt and params.prompt_logprobs is not None:
                    prefill_len = len(group.prompt_logprob_indices)
                    prompt_tokens.extend(
                        array(VLLM_TOKEN_ID_ARRAY_TYPE)
                        for _ in range(prefill_len))
                    output_tokens.extend(
                        array(VLLM_TOKEN_ID_ARRAY_TYPE)
                        for _ in range(prefill_len))
                if group.do_sample:
                    for seq_id in group.seq_ids:
                        seq_data = group.seq_data[seq_id]
                        prompt_tokens.append(seq_data.prompt_token_ids_array)
                        output_tokens.append(seq_data.output_token_ids_array)

        sampling_tensors = SamplingTensors.from_lists(
            temperatures,
            top_ps,
            top_ks,
            min_ps,
            presence_penalties,
            frequency_penalties,
            repetition_penalties,
            prompt_tokens,
            output_tokens,
            vocab_size,
            device,
            dtype,
        )
        return (sampling_tensors, do_penalties, do_top_p_top_k, do_min_p)

    @classmethod
    def from_lists(
        cls,
        temperatures: list[float],
        top_ps: list[float],
        top_ks: list[int],
        min_ps: list[float],
        presence_penalties: list[float],
        frequency_penalties: list[float],
        repetition_penalties: list[float],
        prompt_tokens: list[array],
        output_tokens: list[array],
        vocab_size: int,
        device: torch.device,
        dtype: torch.dtype,
    ) -> "SamplingTensors":
        """Create the tensors on the CPU, then move them to `device`.

        `top_ks` is built as `torch.int`; the other parameter lists use
        `dtype`. The token tensors are padded with `vocab_size` via
        `make_tensor_with_pad` when either token list is non-empty, and are
        a shared empty long tensor otherwise.
        """
        # Note that the performance will be very bad without
        # pinned memory.
        pin_memory = is_pin_memory_available()

        if prompt_tokens or output_tokens:
            prompt_t = make_tensor_with_pad(
                prompt_tokens,
                vocab_size,
                device="cpu",
                dtype=torch.int64,
                pin_memory=pin_memory,
            )
            output_t = make_tensor_with_pad(
                output_tokens,
                vocab_size,
                device="cpu",
                dtype=torch.int64,
                pin_memory=pin_memory,
            )
        else:
            prompt_t = output_t = torch.empty(0,
                                              device=device,
                                              dtype=torch.long)

        def _host_tensor(values, tensor_dtype):
            return torch.tensor(
                values,
                device="cpu",
                dtype=tensor_dtype,
                pin_memory=pin_memory,
            )

        def _to_device(tensor):
            return tensor.to(device=device, non_blocking=True)

        temperatures_t = _host_tensor(temperatures, dtype)
        top_ps_t = _host_tensor(top_ps, dtype)
        min_ps_t = _host_tensor(min_ps, dtype)
        presence_penalties_t = _host_tensor(presence_penalties, dtype)
        frequency_penalties_t = _host_tensor(frequency_penalties, dtype)
        repetition_penalties_t = _host_tensor(repetition_penalties, dtype)
        top_ks_t = _host_tensor(top_ks, torch.int)

        # Because the memory is pinned, we can do non-blocking
        # transfer to device.
        return cls(
            temperatures=_to_device(temperatures_t),
            top_ps=_to_device(top_ps_t),
            top_ks=_to_device(top_ks_t),
            min_ps=_to_device(min_ps_t),
            presence_penalties=_to_device(presence_penalties_t),
            frequency_penalties=_to_device(frequency_penalties_t),
            repetition_penalties=_to_device(repetition_penalties_t),
            prompt_tokens=_to_device(prompt_t),
            output_tokens=_to_device(output_t),
        )
# Placeholder until it can be safely removed.
pass
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Sequence and its related classes."""
import copy
import enum
from abc import ABC, abstractmethod
from array import array
from collections import defaultdict
from collections.abc import Mapping
from collections.abc import Sequence as GenericSequence
from dataclasses import dataclass, field
from functools import reduce
from typing import TYPE_CHECKING, Any, Callable, Optional, Union
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Optional, Union
import msgspec
import torch
from vllm.inputs import SingletonInputs
from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import RequestOutputKind, SamplingParams
if TYPE_CHECKING:
from vllm.lora.request import LoRARequest
from vllm.v1.worker.kv_connector_model_runner_mixin import (
KVConnectorOutput)
else:
......@@ -34,50 +19,6 @@ VLLM_TOKEN_ID_ARRAY_TYPE = "l"
VLLM_INVALID_TOKEN_ID = -1
def array_full(token_id: int, count: int):
    """[`array`][] equivalent of [numpy.full][]."""
    single = array(VLLM_TOKEN_ID_ARRAY_TYPE, [token_id])
    # Repeating a one-element array yields `count` copies of `token_id`.
    return single * count
class SequenceStatus(enum.IntEnum):
    """Status of a sequence."""
    WAITING = 0
    RUNNING = 1
    SWAPPED = 2
    # Note: anything after SWAPPED (2) will be considered
    # as a finished status.
    FINISHED_STOPPED = 3
    FINISHED_LENGTH_CAPPED = 4
    FINISHED_ABORTED = 5
    FINISHED_IGNORED = 6

    @staticmethod
    def is_finished(status: "SequenceStatus") -> bool:
        """Return True when `status` compares greater than SWAPPED."""
        return status > SequenceStatus.SWAPPED

    @staticmethod
    def get_finished_reason(status: "SequenceStatus") -> Union[str, None]:
        """Map a finished status to its reason string, else None."""
        if status == SequenceStatus.FINISHED_STOPPED:
            return "stop"
        if status == SequenceStatus.FINISHED_ABORTED:
            return "abort"
        # The ignored sequences are the sequences whose prompt lengths
        # are longer than the model's length cap. Therefore, the stop
        # reason should also be "length" as in OpenAI API.
        if status in (SequenceStatus.FINISHED_LENGTH_CAPPED,
                      SequenceStatus.FINISHED_IGNORED):
            return "length"
        return None
class SequenceStage(enum.Enum):
    """Stage of a sequence: prefill or decode."""

    PREFILL = enum.auto()
    DECODE = enum.auto()
@dataclass
class RequestMetrics:
"""Metrics associated with a request.
......@@ -107,971 +48,12 @@ class RequestMetrics:
model_execute_time: Optional[float] = None
class SequenceDataDelta(
msgspec.Struct,
array_like=True, # type: ignore[call-arg]
omit_defaults=True): # type: ignore[call-arg]
"""Delta SequenceData to send to workers per step."""
# New output token ids to be appended to the existing SequenceData.
new_output_token_ids: list[int]
# Value that overwrites the existing `cumulative_logprob`.
new_cumulative_logprob: float
# Value that overwrites the existing `num_computed_tokens`.
new_num_computed_tokens: int
# Value that overwrites the existing `stage`.
new_stage: SequenceStage
class SequenceData(msgspec.Struct,
omit_defaults=True): # type: ignore[call-arg]
"""Data associated with a sequence."""
# NOTE: we cannot use Union[list, array] because msgspec cannot support
# union of 2 list types.
_prompt_token_ids: array
_output_token_ids: array = msgspec.field(
default_factory=lambda: array(VLLM_TOKEN_ID_ARRAY_TYPE, []))
_prompt_embeds: Optional[torch.Tensor] = None
_output_embeds: Optional[torch.Tensor] = None
### The below fields should not be passed as an argument ###
_cumulative_logprob: float = 0.0
_prompt_token_ids_tuple: tuple[int,
...] = msgspec.field(default_factory=tuple)
# The number of tokens that are computed (that run against the model).
_num_computed_tokens: int = 0
# The number of tokens with prefix cache hit.
_num_cached_tokens: int = 0
_stage: SequenceStage = SequenceStage.PREFILL
_cached_all_token_ids: list[int] = msgspec.field(default_factory=list)
_cached_all_token_embeds: Optional[torch.Tensor] = None
# It is used to get delta input. It is reset when `get_delta_and_reset`
# is called.
_new_appended_tokens: list[int] = msgspec.field(default_factory=list)
# It is used to compute mrope_position_ids.
_mrope_position_delta: Optional[int] = None
@staticmethod
def from_prompt_token_counts(
        *token_counts: tuple[int, int]) -> "SequenceData":
    """
    Construct a [`SequenceData`][vllm.sequence.SequenceData] instance
    by concatenating prompt token sequences.

    Each tuple represents one token sequence, expressed in the form
    `(token_id, count)`.
    """
    if not token_counts:
        return SequenceData.from_seqs([])

    runs = (array_full(token_id, count) for token_id, count in token_counts)
    # The first run doubles as the accumulator; later runs are appended to
    # it in place.
    prompt_token_ids_arr = next(runs)
    for run in runs:
        prompt_token_ids_arr += run
    return SequenceData(prompt_token_ids_arr)
@staticmethod
def from_seqs(
    prompt_token_ids: GenericSequence[int],
    output_token_ids: Optional[GenericSequence[int]] = None,
    *,
    prompt_embeds: Optional[torch.Tensor] = None,
) -> "SequenceData":
    """
    Construct a [`SequenceData`][vllm.sequence.SequenceData] instance
    from prompt and output token sequences.

    When `output_token_ids` is None, the output field is left at its
    default.
    """
    prompt_arr = array(VLLM_TOKEN_ID_ARRAY_TYPE, prompt_token_ids)
    if output_token_ids is not None:
        output_arr = array(VLLM_TOKEN_ID_ARRAY_TYPE, output_token_ids)
        return SequenceData(prompt_arr,
                            _output_token_ids=output_arr,
                            _prompt_embeds=prompt_embeds)
    return SequenceData(prompt_arr, _prompt_embeds=prompt_embeds)
def __post_init__(self) -> None:
    """Validate the token arrays and build the derived caches."""
    for token_array in (self._prompt_token_ids, self._output_token_ids):
        assert token_array.typecode == "l"

    # Immutable snapshot of the prompt, returned by `prompt_token_ids`.
    self._prompt_token_ids_tuple: tuple[int, ...] = tuple(
        self._prompt_token_ids)
    self._update_cached_all_tokens()
    # The embedding cache is only built when prompt embeddings exist.
    if self._prompt_embeds is not None:
        self._update_cached_all_token_embeds()
def _update_cached_all_tokens(self):
    """Rebuild the cached list of prompt token ids followed by output ids."""
    prompt_ids = self._prompt_token_ids
    output_ids = self._output_token_ids
    assert isinstance(prompt_ids, array)
    assert isinstance(output_ids, array)
    self._cached_all_token_ids: list[int] = list(prompt_ids + output_ids)
def _update_cached_all_token_embeds(self):
    """Rebuild the cached embeddings: prompt embeds, then output embeds."""
    assert isinstance(self._prompt_embeds, torch.Tensor)
    self._cached_all_token_embeds: torch.Tensor = self._prompt_embeds
    if self._output_embeds is None:
        return
    # Output embeddings are concatenated after the prompt along dim 0.
    self._cached_all_token_embeds = torch.cat(
        (self._cached_all_token_embeds, self._output_embeds), dim=0)
@property
def cumulative_logprob(self) -> float:
    """The cumulative log probability of the output."""
    total = self._cumulative_logprob
    return total
@property
def prompt_token_ids(self) -> tuple[int, ...]:
    """The token IDs of the prompt, as the tuple cached on the instance."""
    prompt_ids = self._prompt_token_ids_tuple
    return prompt_ids

@prompt_token_ids.setter
def prompt_token_ids(self, new_prompt_token_ids) -> None:
    """Assignment is not supported; always raises NotImplementedError."""
    raise NotImplementedError
@property
def prompt_token_ids_array(self) -> array:
"""Return the prompt token ids in array type.
The array is asserted to have typecode "l" in `__post_init__`.
NOTE(review): the item size of "l" is platform dependent, so it may
not match torch.long; verify before reinterpreting the buffer.
"""
return self._prompt_token_ids
@property
def output_token_ids(self) -> tuple[int, ...]:
    """The token IDs of the output, as a new tuple on every access."""
    return tuple(self._output_token_ids)

@output_token_ids.setter
def output_token_ids(self,
                     new_output_token_ids: GenericSequence[int]) -> None:
    """Replace the output token ids and refresh the all-token cache."""
    replacement = array(VLLM_TOKEN_ID_ARRAY_TYPE, new_output_token_ids)
    self._output_token_ids = replacement
    self._update_cached_all_tokens()
@property
def output_embeds(self) -> Optional[torch.Tensor]:
    """The embeddings of the output tokens, or None if there are none."""
    return self._output_embeds


@output_embeds.setter
def output_embeds(self, new_output_token_embeds: torch.Tensor) -> None:
    """Replace the output embeddings and rebuild the embedding cache.

    The value is stored in ``_output_embeds``, the attribute that the
    getter and ``_update_cached_all_token_embeds`` read. The previous
    code assigned to ``_output_token_embeds``, which nothing else reads,
    so the new embeddings were silently dropped.
    """
    self._output_embeds = new_output_token_embeds
    self._update_cached_all_token_embeds()
@property
def output_token_ids_array(self) -> array:
    """Return the output token ids in array type.

    The underlying array is returned directly, not a copy. Its typecode
    is "l" (asserted in ``__post_init__``), not "I".
    NOTE(review): the element width of "l" is platform dependent, so
    confirm compatibility before reinterpreting the buffer as torch.long.
    """
    assert isinstance(self._output_token_ids, array)
    return self._output_token_ids
@property
def prompt_embeds(self) -> Optional[torch.Tensor]:
    """The prompt embeddings, or None when the prompt has none."""
    return self._prompt_embeds


@prompt_embeds.setter
def prompt_embeds(self, prompt_embeds: torch.Tensor) -> None:
    # Store the new prompt embeddings, then rebuild the combined
    # (prompt + output) embedding cache so it reflects the change.
    self._prompt_embeds = prompt_embeds
    self._update_cached_all_token_embeds()
@property
def mrope_position_delta(self) -> Optional[int]:
    """The stored M-RoPE position delta, or None if unset."""
    return self._mrope_position_delta


@mrope_position_delta.setter
def mrope_position_delta(self, new_mrope_position_delta):
    # Plain assignment; no cache depends on this value in this class's
    # visible code.
    self._mrope_position_delta = new_mrope_position_delta
def append_token_id(self,
                    token_id: int,
                    logprob: float,
                    token_embed: Optional[torch.Tensor] = None) -> None:
    """Append one generated token (and optionally its embedding).

    Args:
        token_id: The new output token ID.
        logprob: Log probability of the token; added to the cumulative
            log probability.
        token_embed: Optional 1-D embedding of the token. It is detached,
            moved to CPU and appended as a new row to both the output
            embeddings and the cached all-token embeddings.
    """
    self._output_token_ids.append(token_id)
    self._new_appended_tokens.append(token_id)
    self._cached_all_token_ids.append(token_id)
    self._cumulative_logprob += logprob
    if token_embed is None:
        return

    # Do not pass in with batch or sequence dimensions
    assert token_embed.ndim == 1
    row = token_embed.detach().cpu().unsqueeze(0)

    previous = self._output_embeds
    if previous is None:
        self._output_embeds = row
    else:
        self._output_embeds = torch.cat((previous, row), dim=0)

    cached = self._cached_all_token_embeds
    assert cached is not None
    # The cache may live on another device; move the new row there.
    self._cached_all_token_embeds = torch.cat(
        (cached, row.to(device=cached.device)), dim=0)
def get_len(self) -> int:
    """Return the total number of tokens (output plus prompt)."""
    num_output = len(self._output_token_ids)
    num_prompt = len(self._prompt_token_ids)
    return num_output + num_prompt
def get_prompt_len(self) -> int:
    """Return the number of prompt tokens."""
    return len(self._prompt_token_ids)
def get_output_len(self) -> int:
    """Return the number of output tokens generated so far."""
    return len(self._output_token_ids)
def get_token_ids(self) -> list[int]:
    """Return the cached list of all token IDs.

    The cached list object itself is returned, not a copy.
    """
    return self._cached_all_token_ids
def get_token_embeddings(self) -> Optional[torch.Tensor]:
    """Return the cached embeddings for all tokens.

    NOTE(review): the cache is only assigned when prompt embeddings are
    present; confirm it has a None default on the class for other cases.
    """
    return self._cached_all_token_embeds
def get_prefix_token_ids(
    self, num_tokens: int
) -> tuple[tuple[int, ...], Optional[tuple[int, ...]]]:
    """Get prefix tokens, and make the return value hashable.

    Returns a pair ``(prompt_part, output_part)``. When the prefix fits
    inside the prompt, ``output_part`` is None; otherwise the whole
    prompt is returned together with the leading output tokens needed to
    reach ``num_tokens``.
    """
    prompt_length = self.get_prompt_len()
    if num_tokens <= prompt_length:
        return (self._prompt_token_ids_tuple[:num_tokens], None)
    num_from_output = num_tokens - prompt_length
    output_part = tuple(self._output_token_ids[:num_from_output])
    return (self._prompt_token_ids_tuple, output_part)
def get_num_computed_tokens(self) -> int:
    """Return the number of prefill tokens that are already computed."""
    # Plain read of the counter maintained by update_num_computed_tokens,
    # reset_state_for_recompute and apply_delta.
    return self._num_computed_tokens
def update_num_computed_tokens(self, num_new_computed_tokens: int):
    """Advance the computed-token counter by ``num_new_computed_tokens``.

    Asserts that the counter never exceeds the total sequence length and
    switches the stage to DECODE once no uncomputed tokens remain.
    """
    self._num_computed_tokens += num_new_computed_tokens
    assert self._num_computed_tokens <= self.get_len(), (
        self._num_computed_tokens, self.get_len())
    # If all tokens are computed, it means it is in decoding phase.
    all_computed = self.get_num_uncomputed_tokens() == 0
    if all_computed:
        self._stage = SequenceStage.DECODE
def get_num_cached_tokens(self) -> int:
    """Return the number of tokens with prefix cache hit."""
    return self._num_cached_tokens
def update_num_cached_tokens(self, num_cached_tokens: int):
    """Update the number of tokens with prefix cache hit."""
    # Overwrites the stored count (it is a replacement, not an increment).
    self._num_cached_tokens = num_cached_tokens
def reset_state_for_recompute(self) -> None:
    """Reset the number of computed tokens from this sequence. It is
    supposed to be called when a sequence needs to be started from
    the beginning again (e.g., sequence is preempted).
    """
    # Nothing counts as computed any more, so the sequence is back in
    # the prefill stage.
    self._num_computed_tokens = 0
    self._stage = SequenceStage.PREFILL
    # Drop tokens recorded for the next delta. The output tokens
    # themselves are left untouched.
    self._new_appended_tokens = []
def get_num_uncomputed_tokens(self) -> int:
    """Return the number of prefill tokens that are not computed."""
    # The total comes from `get_len()` (prompt_len + output_len) rather
    # than prompt_len alone, because a recompute has to prefill both the
    # prompt and the already generated output.
    total_tokens = self.get_len()
    computed_tokens = self.get_num_computed_tokens()
    return total_tokens - computed_tokens
def get_last_token_id(self) -> int:
    """Return the most recent token ID.

    This is the last output token when any output exists, otherwise the
    last prompt token.
    """
    tokens = self._output_token_ids or self._prompt_token_ids
    return tokens[-1]
def get_prompt_token_ids(self) -> tuple[int, ...]:
    """Return the prompt token IDs (method form of `prompt_token_ids`)."""
    return self.prompt_token_ids
def get_output_token_ids(self) -> tuple[int, ...]:
    """Return the output token IDs (method form of `output_token_ids`)."""
    return self.output_token_ids
def get_delta_and_reset(self) -> SequenceDataDelta:
    """Build a delta from the pending state and clear the pending tokens.

    The delta carries the tokens appended since the last call, the
    cumulative log probability, the computed-token count and the stage.
    """
    pending_tokens = self._new_appended_tokens
    snapshot = SequenceDataDelta(pending_tokens, self._cumulative_logprob,
                                 self.get_num_computed_tokens(), self.stage)
    # Reset delta state. A fresh list is bound so the one handed to the
    # delta is not shared with later appends.
    self._new_appended_tokens = []
    return snapshot
def apply_delta(self, delta: SequenceDataDelta):
    """Apply a delta: overwrite the scalar state, append the new tokens."""
    self._num_computed_tokens = delta.new_num_computed_tokens
    self._cumulative_logprob = delta.new_cumulative_logprob
    self._stage = delta.new_stage
    # The same new tokens go to the output array and the combined cache.
    appended = delta.new_output_token_ids
    self._output_token_ids.extend(appended)
    self._cached_all_token_ids.extend(appended)
@property
def stage(self) -> SequenceStage:
    """The current stage of the sequence, as stored in ``_stage``."""
    return self._stage
def __repr__(self) -> str:
    """Return a debug string with the tokens, embedding shape and progress."""
    # Only the shape of the prompt embeddings is shown (None if absent).
    embeds_shape = getattr(self._prompt_embeds, 'shape', None)
    num_computed = self.get_num_computed_tokens()
    return (f"SequenceData("
            f"prompt_token_ids={self._prompt_token_ids}, "
            f"prompt_embeds.shape="
            f"{embeds_shape}, "
            f"output_token_ids={self.output_token_ids}, "
            f"cumulative_logprob={self.cumulative_logprob}, "
            f"get_num_computed_tokens={num_computed})")
class Sequence:
    """Stores the data, status, and block information of a sequence.

    The sequence is constructed from the
    [`DecoderOnlyInputs`][vllm.inputs.data.DecoderOnlyInputs] (for decoder-only)
    or [`EncoderDecoderInputs`][vllm.inputs.data.EncoderDecoderInputs]
    (for encoder-decoder) instance passed in through the `inputs`
    constructor argument.

    Args:
        seq_id: The ID of the sequence.
        inputs: The inputs of the sequence.
        block_size: The block size of the sequence. Should be the same as the
            block size used by the block manager and cache engine.
        eos_token_id: The end-of-sequence (EOS) token id recognized by this LLM.
        lora_request: LoRA request.
    """

    def __init__(
        self,
        seq_id: int,
        inputs: SingletonInputs,
        block_size: int,
        eos_token_id: Optional[int] = None,
        lora_request: Optional[LoRARequest] = None,
    ) -> None:
        self.seq_id = seq_id
        self.inputs = inputs
        self.block_size = block_size
        self.eos_token_id = eos_token_id
        self.lora_request = lora_request

        # Embedding-only prompts pass their tensor through; every other
        # input type has no prompt embeddings.
        token_ids = self.prompt_token_ids
        if self.inputs["type"] == "embeds":
            embeds = self.inputs["prompt_embeds"]
        else:
            embeds = None
        self.data = SequenceData.from_seqs(token_ids, prompt_embeds=embeds)

        self.output_logprobs: SampleLogprobs = []
        self.output_text = ""

        self.status = SequenceStatus.WAITING
        self.stop_reason: Union[int, str, None] = None

        # These are used to keep track of delta outputs
        self._last_output_token_ids_offset: int = 0
        self._last_output_text_offset: int = 0

        # Used for incremental detokenization
        self.prefix_offset = 0
        self.read_offset = 0
        # Input + output tokens
        self.tokens: Optional[list[str]] = None

    @property
    def n_blocks(self) -> int:
        """Number of blocks needed to hold all tokens (rounded up)."""
        total_tokens = self.get_len()
        return (total_tokens + self.block_size - 1) // self.block_size

    @property
    def prompt(self) -> Optional[str]:
        """The prompt text, or None for embedding-only inputs."""
        is_embeds = self.inputs["type"] == "embeds"
        return None if is_embeds else self.inputs.get("prompt")

    @property
    def prompt_token_ids(self) -> list[int]:
        """The prompt token IDs.

        Embedding-only inputs have no token IDs, so a list of zeros with
        one entry per prompt embedding is returned instead.
        """
        if self.inputs["type"] != "embeds":
            return self.inputs["prompt_token_ids"]
        return [0] * len(self.inputs["prompt_embeds"])

    @property
    def multi_modal_data(self) -> MultiModalKwargs:
        """Multi-modal kwargs for multimodal inputs, else an empty set."""
        if self.inputs["type"] != "multimodal":
            return MultiModalKwargs()
        return self.inputs["mm_kwargs"].get_data()

    @property
    def multi_modal_placeholders(self) -> MultiModalPlaceholderDict:
        """Multi-modal placeholders for multimodal inputs, else ``{}``."""
        if self.inputs["type"] != "multimodal":
            return {}
        return self.inputs["mm_placeholders"]

    @property
    def lora_int_id(self) -> int:
        """The LoRA integer id, or 0 when no LoRA request is attached."""
        if self.lora_request:
            return self.lora_request.lora_int_id
        return 0

    def get_output_text_to_return(self, buffer_length: int,
                                  delta: bool) -> str:
        """If delta is True, only new text since the last call to
        this method is returned"""
        # We return the full output text if the sequence is finished;
        # otherwise the last `buffer_length` characters are held back.
        hold_back = buffer_length and not self.is_finished()
        text = self.output_text
        if not delta:
            if hold_back:
                return text[:-buffer_length]
            return text

        visible_end = len(text)
        if hold_back:
            visible_end -= buffer_length
        start = self._last_output_text_offset
        if start >= visible_end:
            return ""
        self._last_output_text_offset = visible_end
        return text[start:visible_end]

    def get_output_token_ids_to_return(
            self, delta: bool) -> Union[GenericSequence[int], int]:
        """If delta is True, only new tokens since the last call to
        this method are returned"""
        if not delta:
            return self.get_output_token_ids()

        current_len = self.get_output_len()
        # Get the number of new tokens and remember where we stopped.
        fresh = current_len - self._last_output_token_ids_offset
        self._last_output_token_ids_offset = current_len

        if fresh == 1:
            # Optimization for single decode token case
            # (which is what we have most of the time): a bare int is
            # returned rather than a one-element list.
            return self.data._cached_all_token_ids[-1]
        if fresh == 0:
            return []
        return self.data._cached_all_token_ids[-fresh:]

    def hash_of_block(self, logical_idx: int) -> int:
        """Hash the token prefix ending at block `logical_idx` plus LoRA id."""
        # TODO This can produce incorrect hash when block size > prompt size
        # TODO: The current hashing function is O(L^2). We should optimize
        # this in the future.
        prefix_len = self.num_hashed_tokens_of_block(logical_idx)
        prefix_tokens = self.data.get_prefix_token_ids(prefix_len)
        return hash((prefix_tokens, self.lora_int_id))

    def extra_hash(self) -> Optional[int]:
        """
        This function computes an extra hash for a sequence, specifically
        designed for prefix caching mode. The final sequence hash is determined
        by applying token_ids from the sequence's blocks.
        """
        lora_id = self.lora_int_id
        if lora_id == 0:
            return None
        # NOTE: If there are additional factors influencing the block aside from
        # token_ids, include them as input parameters to the hash.
        return hash(lora_id)

    def num_hashed_tokens_of_block(self, logical_idx: int):
        """Number of tokens covered up to and including block `logical_idx`."""
        return (logical_idx + 1) * self.block_size

    def reset_state_for_recompute(self):
        """Reset the sequence states for recomputation."""
        self.data.reset_state_for_recompute()

    def append_token_id(self,
                        token_id: int,
                        logprobs: dict[int, Logprob],
                        token_embed: Optional[torch.Tensor] = None) -> None:
        """Record a sampled token, its logprobs and optional embedding."""
        assert token_id in logprobs
        self.output_logprobs.append(logprobs)
        chosen_logprob = logprobs[token_id].logprob
        self.data.append_token_id(token_id, chosen_logprob, token_embed)

    # ---- thin delegations to the underlying sequence data ----

    def get_len(self) -> int:
        return self.data.get_len()

    def get_prompt_len(self) -> int:
        return self.data.get_prompt_len()

    def get_output_len(self) -> int:
        return self.data.get_output_len()

    def get_token_ids(self) -> list[int]:
        return self.data.get_token_ids()

    def get_prompt_token_ids(self) -> tuple[int, ...]:
        return self.data.get_prompt_token_ids()

    def get_last_token_id(self) -> int:
        return self.data.get_last_token_id()

    def get_output_token_ids(self) -> tuple[int, ...]:
        return self.data.get_output_token_ids()

    def get_cumulative_logprob(self) -> float:
        return self.data.cumulative_logprob

    def is_finished(self) -> bool:
        """Whether the sequence status counts as finished."""
        return SequenceStatus.is_finished(self.status)

    def fork(self, new_seq_id: int) -> "Sequence":
        """Return a deep copy of this sequence carrying `new_seq_id`."""
        child = copy.deepcopy(self)
        child.seq_id = new_seq_id
        return child

    def get_num_new_tokens(self) -> int:
        """Get the number of new tokens to be computed.

        Returns:
            The new number of tokens to be computed. I.e., 1 for decode, or
            the remaining prompt size for prefill.
        """
        in_decode = self.data.stage == SequenceStage.DECODE
        if in_decode:
            return 1
        return self.data.get_num_uncomputed_tokens()

    def get_num_computed_tokens(self) -> int:
        return self.data.get_num_computed_tokens()

    def is_prefill(self) -> bool:
        """Whether the sequence data is in the PREFILL stage."""
        return self.data.stage == SequenceStage.PREFILL

    def __repr__(self) -> str:
        return (f"Sequence(seq_id={self.seq_id}, "
                f"status={self.status.name}, "
                f"num_blocks={self.n_blocks})")
class SequenceGroupState(msgspec.Struct,
                         omit_defaults=True):  # type: ignore[call-arg]
    """Mutable state tied to a specific sequence group"""

    # for multi-step decoding
    # Total number of steps planned (defaults to a single step).
    num_steps: int = 1
    # Number of steps recorded as done so far.
    current_step: int = 0

    @property
    def remaining_steps(self) -> int:
        """Steps left to run: ``num_steps - current_step``."""
        return self.num_steps - self.current_step
class SequenceGroup:
    """A group of sequences that are generated from the same prompt.

    Args:
        request_id: The ID of the request.
        seqs: The list of sequences.
        sampling_params: The sampling parameters used to generate the outputs.
        arrival_time: The arrival time of the request.
        lora_request: LoRA request.
        pooling_params: The parameters used to generate the pooler
            for a pooling model.
        pooled_data: The extracted hidden states from a pooling model.
        encoder_seq: Optional, the single encoder sequence. Should be None
                     unless you are working with an encoder/decoder model.
        trace_headers: OpenTelemetry trace headers.
        priority: User-defined priority of the request.
        draft_size: The number of speculative tokens plus one from the target
                    model; equal to max number of tokens a step can generate
                    for single-draft speculative decoding but larger than
                    that for multi-draft SD (currently not supported).
    """

    def __init__(self,
                 request_id: str,
                 seqs: list[Sequence],
                 arrival_time: float,
                 sampling_params: Optional[SamplingParams] = None,
                 lora_request: Optional[LoRARequest] = None,
                 pooling_params: Optional[PoolingParams] = None,
                 pooled_data: Optional[torch.Tensor] = None,
                 encoder_seq: Optional[Sequence] = None,
                 trace_headers: Optional[Mapping[str, str]] = None,
                 priority: int = 0,
                 draft_size: int = 1) -> None:
        # NOTE(review): `draft_size` is accepted but not stored or used
        # anywhere in this class.
        self.request_id = request_id
        self.seqs = seqs
        self.first_seq = seqs[0]
        self.arrival_time = arrival_time
        # Many accessors below take a fast path for one-sequence groups.
        self.is_single_seq = len(seqs) == 1
        self.seqs_dict = {seq.seq_id: seq for seq in seqs}

        self.sampling_params = sampling_params
        self.metrics = RequestMetrics(arrival_time=arrival_time,
                                      last_token_time=arrival_time,
                                      first_scheduled_time=None,
                                      first_token_time=None,
                                      time_in_queue=None)
        self.last_token_latency = 0.0
        self.lora_request = lora_request
        self.prompt_logprobs: Optional[PromptLogprobs] = None
        self.state = SequenceGroupState()
        self.pooling_params = pooling_params
        self.pooled_data = pooled_data
        self.encoder_seq = encoder_seq
        self.trace_headers = trace_headers
        self.priority = priority

        self.cached_request_output = None

    @property
    def prompt(self) -> Optional[str]:
        """The prompt text of the first sequence."""
        return self.first_seq.prompt

    @property
    def prompt_token_ids(self) -> list[int]:
        """The prompt token IDs of the first sequence."""
        return self.first_seq.prompt_token_ids

    @property
    def encoder_prompt(self) -> Optional[str]:
        # There are either 0 or 1 encoder sequences
        # If one is present, its prompt is distinct
        # from the decoder's.
        if self.encoder_seq is None:
            return None
        return self.encoder_seq.prompt

    @property
    def encoder_prompt_token_ids(self) -> Optional[list[int]]:
        # There are either 0 or 1 encoder sequences
        # If one is present, its prompt token ids are
        # distinct from the decoder's.
        if self.encoder_seq is None:
            return None
        return self.encoder_seq.prompt_token_ids

    @property
    def multi_modal_data(self) -> MultiModalKwargs:
        """Multi-modal data of the first sequence, else of the encoder."""
        if self.first_seq.multi_modal_data:
            return self.first_seq.multi_modal_data
        if self.encoder_seq is not None:
            return self.encoder_seq.multi_modal_data
        return MultiModalKwargs()

    @property
    def multi_modal_placeholders(self) -> MultiModalPlaceholderDict:
        """Placeholders from whichever sequence supplies multi-modal data."""
        # The choice is keyed on the first sequence's *data*, mirroring
        # `multi_modal_data` above.
        if self.first_seq.multi_modal_data:
            return self.first_seq.multi_modal_placeholders
        if self.encoder_seq is not None:
            return self.encoder_seq.multi_modal_placeholders
        return {}

    @property
    def lora_int_id(self) -> int:
        """The LoRA integer id, or 0 when no LoRA request is attached."""
        if self.lora_request:
            return self.lora_request.lora_int_id
        return 0

    def set_last_token_time(self, now: float) -> None:
        """Sets the last token time for Request level timings."""
        # If still in prefill phase, assertion fails.
        assert not self.is_prefill(), (
            "seq_group.set_last_token_time() should not be called "
            "if the seq_group is in prefill phase.")
        previous = self.metrics.last_token_time
        self.last_token_latency = now - previous
        self.metrics.last_token_time = now

    def get_last_token_latency(self) -> float:
        """Returns the latency of the last token."""
        assert not self.is_prefill(), (
            "seq_group.get_last_token_latency() should not be called "
            "if the seq_group is in prefill phase.")
        return self.last_token_latency

    def maybe_set_first_token_time(self, time: float) -> None:
        """Sets the first token time for Request level timings."""
        # Note: in a case where a sequence_group is swapped and
        # recomputed, the time between iterations is counted
        # in TPOT, rather than recalculating TTFT (since from the
        # POV of the user, there is simply a long generation delay).
        if self.metrics.first_token_time is not None:
            return
        if self.first_seq.get_output_len() == 1:
            self.metrics.first_token_time = time

    def maybe_set_first_scheduled_time(self, time: float) -> None:
        """Sets the first scheduled time and time in queue for Request
        level timings."""
        if self.metrics.first_scheduled_time is not None:
            return
        self.metrics.first_scheduled_time = time
        self.metrics.time_in_queue = time - self.metrics.arrival_time

    def set_finished_time(self, time: Optional[float]) -> None:
        """Sets the finished time for Request level timings."""
        self.metrics.finished_time = time

    def get_max_num_running_seqs(self) -> int:
        """The maximum number of sequences running in parallel in the remaining
        lifetime of the request."""
        if not self.is_single_seq:
            return self.num_seqs() - self.num_finished_seqs()
        return 0 if self.first_seq.is_finished() else 1

    def get_seqs(
        self,
        status: Optional[SequenceStatus] = None,
    ) -> list[Sequence]:
        """Return all sequences, or only those with the given status."""
        if status is None:
            return self.seqs
        if not self.is_single_seq:
            return [seq for seq in self.seqs if seq.status == status]
        return self.seqs if self.first_seq.status == status else []

    def is_encoder_decoder(self) -> bool:
        """Whether an encoder sequence is attached."""
        return self.encoder_seq is not None

    def get_encoder_seq(self) -> Optional[Sequence]:
        return self.encoder_seq

    def get_finished_seqs(self) -> list[Sequence]:
        """Return the sequences that have finished."""
        if not self.is_single_seq:
            return [seq for seq in self.seqs if seq.is_finished()]
        return self.seqs if self.first_seq.is_finished() else []

    def update_num_computed_tokens(self, num_new_computed_tokens: int):
        """Update number of tokens computed so far."""
        for seq in self.seqs:
            if seq.is_finished():
                continue
            seq.data.update_num_computed_tokens(num_new_computed_tokens)

    def get_num_uncomputed_tokens(self) -> int:
        """Total uncomputed tokens across all unfinished sequences."""
        return sum(seq.data.get_num_uncomputed_tokens() for seq in self.seqs
                   if not seq.is_finished())

    def num_seqs(self, status: Optional[SequenceStatus] = None) -> int:
        """Count all sequences, or only those with the given status."""
        # Optimization. We don't need to call get_seqs if we don't need to
        # filter by states.
        if status is None:
            return len(self.seqs)
        if not self.is_single_seq:
            return len(self.get_seqs(status))
        return 1 if self.seqs[0].status == status else 0

    def num_finished_seqs(self) -> int:
        """Count the finished sequences."""
        if not self.is_single_seq:
            return len(self.get_finished_seqs())
        return 1 if self.seqs[0].is_finished() else 0

    def is_finished(self) -> bool:
        """Whether every sequence in the group has finished."""
        if not self.is_single_seq:
            return all(seq.is_finished() for seq in self.seqs)
        return self.first_seq.is_finished()

    def is_prefill(self) -> bool:
        """Whether the first sequence is still in prefill."""
        return self.first_seq.is_prefill()

    def __repr__(self) -> str:
        return (f"SequenceGroup(request_id={self.request_id}, "
                f"sampling_params={self.sampling_params}, "
                f"num_seqs={len(self.seqs)})")

    def uses_prompt_embeds(self) -> bool:
        """Returns True if the sequence group uses input embeds."""
        for seq in self.seqs:
            if seq.data.prompt_embeds is not None:
                return True
        return False
class SequenceGroupMetadataDelta(
        msgspec.Struct,
        tag=True,  # type: ignore[call-arg]
        array_like=True,  # type: ignore[call-arg]
        omit_defaults=True):  # type: ignore[call-arg]
    """Delta of SequenceGroupMetadata.

    After sending the first SequenceGroupMetadata, vLLM scheduler
    only sends delta to reduce the data payload size.
    """
    # NOTE(review): with array_like=True the encoded layout presumably
    # follows field order, so do not reorder these fields without
    # checking both ends of the wire.

    # Per-sequence data deltas; SequenceGroupMetadata.apply_delta looks
    # each key up in its own seq_data mapping.
    seq_data_delta: dict[int, SequenceDataDelta]
    # Must equal the request_id of the metadata it is applied to
    # (asserted in SequenceGroupMetadata.apply_delta).
    request_id: str
    # Replaces the block tables of the target metadata.
    block_tables: dict[int, list[int]]
    # Replaces the is_prompt flag of the target metadata.
    is_prompt: bool
    # Replaces the do_sample flag of the target metadata.
    do_sample: bool = True
    # Replaces the token_chunk_size of the target metadata.
    token_chunk_size: Optional[int] = None
    computed_block_nums: Optional[list[int]] = None
    # A fresh state object is created per instance.
    state: Optional[SequenceGroupState] = msgspec.field(
        default_factory=lambda: SequenceGroupState())
class SequenceGroupMetadata(
        msgspec.Struct,
        tag=True,  # type: ignore[call-arg]
        array_like=True,  # type: ignore[call-arg]
        omit_defaults=True):  # type: ignore[call-arg]
    """Metadata for a sequence group. Used to create `AttentionMetadata`.

    Attributes:
        request_id: The ID of the request.
        is_prompt: Whether the request is at prompt stage.
        seq_data: The sequence data. (Seq id -> sequence data)
        sampling_params: The sampling parameters used to generate the outputs.
        block_tables: The block tables. (Seq id -> list of physical block
            numbers)
        do_sample: True if sampling is required. Sampling is not required when
            e.g., prefill is chunked, and the current iteration only computes
            query tokens for prefill, we don't need sampling.
        pooling_params: Pooling parameters.
        lora_request: LoRA request.
        computed_block_nums: The block numbers that are already computed,
            used in prefix caching.
        state: Internal state tied to this sequence group.
        multi_modal_data: Multi modal data.
        multi_modal_placeholders: Multi modal placeholders.
        encoder_seq_data: Optional sequence data for encoder prompt
                          (SequenceGroup.encoder_seq). Should be None
                          unless you are working with an encoder/decoder
                          model.
        cross_block_table: Optional cross-attention block table associated
                           with the encoder prompt
                           (SequenceGroup.encoder_seq). Should be None
                           unless you are working with an encoder/decoder
                           model.
        token_chunk_size: Number of tokens handled in this step. When left
            as None it is filled in by `__post_init__`.
        num_speculative_tokens: Number of speculative tokens adopted in
            this request (see the field comment below).
    """

    request_id: str
    is_prompt: bool
    seq_data: dict[int, SequenceData]
    sampling_params: Optional[SamplingParams]
    block_tables: dict[int, list[int]]
    do_sample: bool = True
    pooling_params: Optional[PoolingParams] = None
    lora_request: Optional[LoRARequest] = None
    computed_block_nums: Optional[list[int]] = None
    state: Optional[SequenceGroupState] = msgspec.field(
        default_factory=lambda: SequenceGroupState())
    multi_modal_data: Optional[MultiModalKwargs] = None
    multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None
    encoder_seq_data: Optional[SequenceData] = None
    cross_block_table: Optional[list[int]] = None
    token_chunk_size: Optional[int] = None

    ### Stateful fields that are lazily defined. ###
    # The number of speculative tokens adopted in this request.
    # None means speculative decoding is not used.
    # Zero means speculative decoding is disabled for some reasons.
    # TODO: We should maintain this states out of the sequence group.
    num_speculative_tokens: Optional[int] = None

    def __post_init__(self):
        """Fill in `token_chunk_size` when the caller did not provide it."""
        if self.seq_data is None or self.token_chunk_size is not None:
            return
        if not self.is_prompt:
            # A non-prompt (decode) step handles a single token.
            self.token_chunk_size = 1
            return
        # A prompt step defaults to the full length of the first sequence.
        first_seq_data = next(iter(self.seq_data.values()))
        self.token_chunk_size = first_seq_data.get_len()

    @property
    def lora_int_id(self) -> int:
        """The LoRA integer id, or 0 when no LoRA request is attached."""
        if self.lora_request:
            return self.lora_request.lora_int_id
        return 0

    # Multi-Step Chunked-Prefill property
    @property
    def is_single_step_prompt(self) -> bool:
        # do_sample is true, only when the token_chunk_size matches the
        # num_uncomputed_tokens of the sequence. This indicates that
        # the prompt will finish processing in a single `execute_model`
        # step.
        return self.is_prompt and self.do_sample

    def get_first_seq_id(self) -> int:
        """Return the first key of `seq_data`."""
        # This is an efficient way of fetching the seq_id when
        # we know this SequenceGroup has only one sequence.
        return next(iter(self.seq_data))

    def apply_delta(self,
                    sequence_group_metadata_delta: SequenceGroupMetadataDelta):
        """Update this metadata in place from a delta.

        Per-sequence deltas are applied first, then the request id is
        checked, then the group-level fields are overwritten.
        """
        incoming = sequence_group_metadata_delta
        for seq_id, seq_delta in incoming.seq_data_delta.items():
            self.seq_data[seq_id].apply_delta(seq_delta)
        assert self.request_id == incoming.request_id
        self.block_tables = incoming.block_tables
        self.token_chunk_size = incoming.token_chunk_size
        self.do_sample = incoming.do_sample
        self.is_prompt = incoming.is_prompt

    def finish_step(self) -> None:
        """Advance `state.current_step` by one; it must be below num_steps."""
        assert self.state is not None
        assert self.state.current_step < self.state.num_steps, \
            f"current step {self.state.current_step}, num_steps {self.state.num_steps}"  # noqa
        self.state.current_step += 1
class SequenceOutput(
        msgspec.Struct,
        omit_defaults=True,  # type: ignore[call-arg]
        array_like=True):  # type: ignore[call-arg]
    """The model output associated with a sequence.

    Attributes:
        parent_seq_id: The ID of the parent sequence (for forking in beam
            search).
        output_token: The output token ID.
        logprobs: The logprobs of the output token.
            (Token id -> logP(x_i+1 | x_0, ..., x_i))
        output_embed: Optional output embedding tensor.
    """
    parent_seq_id: int
    output_token: int
    logprobs: dict[int, Logprob]
    output_embed: Optional[torch.Tensor] = None

    def __repr__(self) -> str:
        # Only the shape of the embedding is printed, never its contents.
        if self.output_embed is not None:
            output_embed_shape = self.output_embed.shape
        else:
            output_embed_shape = None
        return (f"SequenceOutput(parent_seq_id={self.parent_seq_id}, "
                f"output_token={self.output_token}, "
                f"output_embed.shape={output_embed_shape}, "
                f"logprobs={self.logprobs})")

    def __eq__(self, other: object) -> bool:
        """Compare parent id, output token and logprobs.

        `output_embed` does not take part in the comparison.
        NOTE(review): comparing against a non-SequenceOutput raises
        NotImplementedError instead of returning NotImplemented, so
        `==` with foreign objects raises.
        """
        if not isinstance(other, SequenceOutput):
            raise NotImplementedError()
        ids_match = (self.parent_seq_id == other.parent_seq_id
                     and self.output_token == other.output_token)
        logprobs_match = other.logprobs == self.logprobs
        return ids_match and logprobs_match
class SequenceGroupOutput(ABC):
    """The base class for model outputs associated with a sequence group."""

    @abstractmethod
    def __repr__(self) -> str:
        """Return a debug representation; subclasses must implement it."""

    @abstractmethod
    def __eq__(self, other: object) -> bool:
        """Compare with another output; subclasses must implement it."""
class CompletionSequenceGroupOutput(
        msgspec.Struct,
        omit_defaults=True,  # type: ignore[call-arg]
        array_like=True):  # type: ignore[call-arg]
    """The model output associated with a completion sequence group."""
    # NOTE(review): this is an ordinary class attribute; it does not make
    # SequenceGroupOutput a base class or metaclass of this struct.
    __metaclass__ = SequenceGroupOutput
    samples: list[SequenceOutput]
    # Prompt logprob for each prompt query token.
    prompt_logprobs: Optional[PromptLogprobs]
    step_index: Optional[int] = 0

    def __repr__(self) -> str:
        # `step_index` is left out of the representation.
        return (f"CompletionSequenceGroupOutput(samples={self.samples}, "
                f"prompt_logprobs={self.prompt_logprobs})")

    def __eq__(self, other: object) -> bool:
        """Compare samples and prompt logprobs (`step_index` is ignored).

        Raises NotImplementedError when `other` is of a different type.
        """
        if not isinstance(other, CompletionSequenceGroupOutput):
            raise NotImplementedError()
        if self.samples == other.samples:
            return self.prompt_logprobs == other.prompt_logprobs
        return False
class PoolingSequenceGroupOutput(
msgspec.Struct,
omit_defaults=True, # type: ignore[call-arg]
array_like=True, # type: ignore[call-arg]
):
"""The model output associated with a pooling sequence group."""
__metaclass__ = SequenceGroupOutput
# Annotated as Any to be compatible with msgspec
# The actual type is in SequenceGroup.pooled_data
data: Any
......@@ -1161,305 +143,9 @@ class PoolerOutput(
self.__class__) and self.outputs == other.outputs
def get_all_seq_ids(
        seq_group_metadata_list: list[SequenceGroupMetadata]) -> list[int]:
    """Collect every sequence id found in the given metadata list.

    Ids are returned in list order, and within a group in the iteration
    order of its `seq_data` mapping.
    """
    all_ids: list[int] = []
    for group_metadata in seq_group_metadata_list:
        # Iterating the mapping yields its keys, i.e. the sequence ids.
        all_ids.extend(group_metadata.seq_data)
    return all_ids
def get_all_seq_ids_and_request_ids(
    seq_group_metadata_list: list[SequenceGroupMetadata]
) -> tuple[list[int], dict[str, set[int]]]:
    """Collect all sequence ids plus a request-id -> sequence-ids mapping.

    Returns:
        A pair of (flat list of every sequence id, mapping from each
        request id to the set of its sequence ids). A group with no
        sequences adds no entry to the mapping.
    """
    all_ids: list[int] = []
    ids_by_request: defaultdict[str, set[int]] = defaultdict(set)
    for group_metadata in seq_group_metadata_list:
        group_ids = list(group_metadata.seq_data)
        if not group_ids:
            # Avoid creating an empty set for a group without sequences.
            continue
        all_ids.extend(group_ids)
        ids_by_request[group_metadata.request_id].update(group_ids)
    return all_ids, ids_by_request
class HiddenStates(msgspec.Struct, array_like=True,
                   omit_defaults=True):  # type: ignore[call-arg]
    """Hidden states corresponding to in-progress sequences.
    Used in speculative decoding to pass hidden states from
    the target model to the proposer model.

    seq_ids are the sequence ids of each entry of the batch
    dimension of the hidden_states tensor"""
    # Scorer hidden states. For prefill step, it is used for hidden states of
    # all tokens, whereas for decode step, it is used for last accepted tokens.
    hidden_states: torch.Tensor
    # The sequence group metadata list. Only needed for decode step.
    seq_group_metadata_list: Optional[list[SequenceGroupMetadata]] = None
    # Scorer hidden states of the 2nd last token proposed by the proposer (
    # irrespective of whether it was accepted or not). Only used for cases when
    # last proposed token is accepted (i.e., in case of bonus tokens). For the
    # case of no bonus tokens, these are ignored.
    second_last_token_hidden_states: Optional[torch.Tensor] = None

    _seq_ids: list[int] = msgspec.field(default_factory=list)

    def __post_init__(self):
        # When metadata is supplied, there must be one metadata entry per
        # row of hidden_states; the sequence ids are derived from it.
        if self.seq_group_metadata_list is not None:
            assert len(self.seq_group_metadata_list) == len(self.hidden_states)
            self._seq_ids = get_all_seq_ids(self.seq_group_metadata_list)

    @property
    def seq_ids(self) -> list[int]:
        """The sequence ids aligned with the rows of `hidden_states`."""
        return self._seq_ids

    def _first_index_by_seq_id(self) -> dict[int, int]:
        """Map each tracked sequence id to its first position in `_seq_ids`.

        Built in a single pass so callers avoid repeated `list.index` /
        `in list` scans. `setdefault` keeps the first occurrence, which
        is exactly what `list.index` would return.
        """
        positions: dict[int, int] = {}
        for position, seq_id in enumerate(self._seq_ids):
            positions.setdefault(seq_id, position)
        return positions

    def update(self,
               hidden_states: torch.Tensor,
               seq_group_metadata_list: list[SequenceGroupMetadata],
               second_last_token_hidden_states: Optional[torch.Tensor] = None):
        """Update hidden states from target model invocation. Only used for
        decode steps"""
        assert len(seq_group_metadata_list) == len(hidden_states)
        self._seq_ids.extend(get_all_seq_ids(seq_group_metadata_list))
        self.hidden_states = torch.cat([self.hidden_states, hidden_states])

        if self.second_last_token_hidden_states is not None:
            # Adding dummy hidden_states to this to maintain same shape
            self.second_last_token_hidden_states = torch.cat([
                self.second_last_token_hidden_states,
                torch.zeros_like(hidden_states)
                if second_last_token_hidden_states is None else
                second_last_token_hidden_states
            ])

    def prune(self,
              seq_group_metadata_list: list[SequenceGroupMetadata]) -> None:
        """Prune to provided list of sequence ids. Only used for decode steps.
        """
        # Currently this prunes all seq_ids not present in
        # seq_group_metadata_list which might cause problems where a sequence
        # may be "paused" then "resumed" later. This should only prune sequences
        # which are confirmed to be aborted.
        positions = self._first_index_by_seq_id()
        # Only keep sequence IDs that exist in self._seq_ids
        seq_ids = [
            seq_id for seq_id in get_all_seq_ids(seq_group_metadata_list)
            if seq_id in positions
        ]
        if seq_ids != self._seq_ids:
            # Batch contents changed - prune removed sequences.
            index = [positions[seq_id] for seq_id in seq_ids]
            self.hidden_states = self.hidden_states[index]
            if self.second_last_token_hidden_states is not None:
                self.second_last_token_hidden_states = self\
                    .second_last_token_hidden_states[index]
            self._seq_ids = seq_ids

    def expand_with_bonus_tokens(
            self, seq_with_bonus_token_in_last_step: set) -> None:
        """Expand hidden states for sequences with bonus tokens. This is in
        alignment with `MultiStepWorker._expand_execute_model_request`."""
        if self.second_last_token_hidden_states is None \
            or not seq_with_bonus_token_in_last_step:
            return

        positions = self._first_index_by_seq_id()
        num_seqs = len(self._seq_ids)
        index = []
        for seq_id in self._seq_ids:
            i = positions[seq_id]
            if seq_id in seq_with_bonus_token_in_last_step:
                # Rows at offset `num_seqs` address the second-last-token
                # states in the concatenated tensor built below; they are
                # placed just before the sequence's own row.
                index.append(i + num_seqs)
            index.append(i)

        self.hidden_states = torch.cat(
            [self.hidden_states, self.second_last_token_hidden_states])[index]
class ExecuteModelRequest(
        msgspec.Struct,
        array_like=True,  # type: ignore[call-arg]
        omit_defaults=True):  # type: ignore[call-arg]
    """The model execution request, containing CPU metadata only. The LLM
    engine should create an instance of this class for each request batch."""
    # The sequence group metadata list.
    seq_group_metadata_list: list[Union[SequenceGroupMetadata,
                                        SequenceGroupMetadataDelta]]
    # Blocks to swap in. List of CPU -> GPU block number.
    blocks_to_swap_in: list[tuple[int,
                                  int]] = msgspec.field(default_factory=list)
    # Blocks to swap out. List of GPU -> CPU block number.
    blocks_to_swap_out: list[tuple[int,
                                   int]] = msgspec.field(default_factory=list)
    # Blocks to copy. Source to dest block.
    blocks_to_copy: list[tuple[int, int]] = msgspec.field(default_factory=list)
    # Virtual engine ID for pipeline parallel.
    virtual_engine: int = 0
    # The number of slots for lookahead decoding.
    num_lookahead_slots: int = 0
    # The number of requests in the running queue.
    running_queue_size: int = 0
    # Optional hidden states from prior step.
    previous_hidden_states: Optional[HiddenStates] = None
    # The number of forward steps to run.
    num_steps: int = 1
    # Finished request ids since last step.
    finished_requests_ids: list[str] = msgspec.field(default_factory=list)
    # The last sampled token ids for multi step decoding.
    last_sampled_token_ids: Optional[torch.Tensor] = None
    # Async callback
    async_callback: Optional[Callable] = None

    @property
    def is_last_step(self) -> bool:
        """Whether the first sequence group has exactly one step remaining."""
        # TODO(will) make this be able to handle batches with variable number of
        # steps
        assert len(self.seq_group_metadata_list) > 0
        first_state = self.seq_group_metadata_list[0].state
        assert first_state is not None
        return first_state.remaining_steps == 1

    @property
    def current_step(self) -> int:
        """The current step of the first sequence group."""
        # TODO(will) make this be able to handle batches with variable number of
        # steps
        assert len(self.seq_group_metadata_list) > 0
        first_state = self.seq_group_metadata_list[0].state
        assert first_state is not None
        return first_state.current_step

    def clone(
        self, seq_group_metadata_list: list[Union[SequenceGroupMetadata,
                                                  SequenceGroupMetadataDelta]]
    ) -> "ExecuteModelRequest":
        """Clone the request with a new sequence group metadata list."""
        # The three block lists are shallow-copied and the sampled-token
        # tensor is cloned; everything else is passed through by reference.
        sampled = self.last_sampled_token_ids
        if sampled is not None:
            sampled = sampled.clone()
        return ExecuteModelRequest(
            seq_group_metadata_list=seq_group_metadata_list,
            blocks_to_swap_in=self.blocks_to_swap_in.copy(),
            blocks_to_swap_out=self.blocks_to_swap_out.copy(),
            blocks_to_copy=self.blocks_to_copy.copy(),
            virtual_engine=self.virtual_engine,
            num_lookahead_slots=self.num_lookahead_slots,
            running_queue_size=self.running_queue_size,
            previous_hidden_states=self.previous_hidden_states,
            num_steps=self.num_steps,
            finished_requests_ids=self.finished_requests_ids,
            last_sampled_token_ids=sampled,
            async_callback=self.async_callback)
@dataclass
class SequenceGroupBase:
    """Bookkeeping shared by requests that are split into sub-requests.

    Subclasses implement `add_request` and `maybe_assemble_group`; this
    base class only tracks which sub-requests are pending or finished.
    """

    # The original request id, i.e. the id before the request was split.
    group_id: str

    # The assembled sequence group, once one is available.
    assembled_seq_group: Optional[SequenceGroup] = None

    # Maps each seq id to its unique index inside this group.
    seq_id_to_index: dict[str, int] = field(default_factory=dict)

    # Sub-requests that have not finished yet, keyed by seq id.
    to_be_finished: dict[str, SequenceGroup] = field(default_factory=dict)

    # Sub-requests that have already finished, keyed by seq id.
    finished_reqs: dict[str, SequenceGroup] = field(default_factory=dict)

    streaming: bool = False

    output_produced: bool = False

    @staticmethod
    def add_request(request_id: str, engine, params, *args, **kwargs):
        """Hook for splitting a request into several engine requests.

        Called when the request identified by `request_id` with `params`
        is ready to be added to the engine.

        Raises:
            NotImplementedError: Always; subclasses must override this.
        """
        raise NotImplementedError

    def finish_seq(self, seq: SequenceGroup):
        """Move the finished `seq` from the pending map to the finished map.

        Raises:
            KeyError: If `seq.request_id` is not currently pending.
        """
        finished_id = seq.request_id
        del self.to_be_finished[finished_id]
        self.finished_reqs[finished_id] = seq

    def maybe_assemble_group(
            self, seq_group: SequenceGroup) -> Optional[SequenceGroup]:
        """Hook for assembling the group's sequence group.

        The assembled group is used for producing the final output, or for
        adding the request to the engine again.

        Raises:
            NotImplementedError: Always; subclasses must override this.
        """
        raise NotImplementedError
class ParallelSampleSequenceGroup(SequenceGroupBase):
    """Group tracking the `n` single-sample requests of one parallel
    sampling request."""

    @staticmethod
    def add_request(request_id: str, engine, params, **kwargs):
        """Split a request with `params.n` samples into `n` engine requests.

        Each sub-request gets the id ``f"{request_id}_parallel_sample_{i}"``,
        a clone of `params` with ``n = 1`` and, when a seed is set, the seed
        offset by ``i``. The group is registered in
        `engine.seq_id_to_seq_group` under every sub-request id.

        Args:
            request_id: Id of the original (unsplit) request.
            engine: Object providing `_add_processed_request` and the
                `seq_id_to_seq_group` mapping.
            params: Sampling parameters of the original request.
            **kwargs: Forwarded unchanged to `_add_processed_request`.
        """
        base_params = params
        group = ParallelSampleSequenceGroup(request_id)
        first_seqs = []
        for sample_idx in range(base_params.n):
            child_id = f"{request_id}_parallel_sample_{sample_idx}"
            group.seq_id_to_index[child_id] = sample_idx

            # Every child samples exactly once, with its own derived seed.
            child_params = base_params.clone()
            child_params.n = 1
            if child_params.seed is not None:
                child_params.seed += sample_idx

            child_group = engine._add_processed_request(
                child_id,
                params=child_params,
                **kwargs,
            )  # type: ignore
            assert child_group is not None

            engine.seq_id_to_seq_group[child_id] = group
            group.to_be_finished[child_id] = child_group
            first_seqs.append(child_group.seqs[0])

        # For parallel sampling the assembled group can be built right away:
        # all sequences are already known and will not change. The shared
        # attributes are taken from the last child created above.
        group.assembled_seq_group = SequenceGroup(
            request_id=request_id,
            seqs=first_seqs,
            arrival_time=child_group.arrival_time,
            sampling_params=base_params,
            lora_request=child_group.lora_request,
            pooling_params=child_group.pooling_params,
            pooled_data=child_group.pooled_data,
            encoder_seq=child_group.encoder_seq,
            trace_headers=child_group.trace_headers,
            priority=child_group.priority,
        )

        group.streaming = child_params.output_kind == RequestOutputKind.DELTA
        group.output_produced = False

    def maybe_assemble_group(
            self, seq_group: SequenceGroup) -> Optional[SequenceGroup]:
        """Return the assembled group when `seq_group` should emit output.

        Streaming mode: the assembled group is returned only for the first
        sub-request still pending; every other sub-request yields None.

        Non-streaming mode: the assembled group is returned once, when the
        last pending sub-request is finished. If `_real_n` is set on the
        sampling params, the group's sequences are first cut down to the top
        entries by cumulative logprob. Every other call yields None.
        """
        if self.streaming:
            head_id = next(iter(self.to_be_finished))
            return (self.assembled_seq_group
                    if seq_group.request_id == head_id else None)

        is_last_to_finish = (len(self.to_be_finished) == 1
                             and seq_group.request_id in self.to_be_finished
                             and seq_group.is_finished())
        if not is_last_to_finish:
            return None

        assembled = self.assembled_seq_group
        assert assembled is not None
        sampling = assembled.sampling_params
        assert isinstance(sampling, SamplingParams)

        # The final output is produced at most once.
        if self.output_produced:
            return None
        self.output_produced = True

        if sampling._real_n is not None:
            # Keep only the top-n sequences.
            limit = sampling._real_n or sampling.n
            ranked = sorted(assembled.seqs,
                            key=lambda s: s.get_cumulative_logprob(),
                            reverse=True)
            assembled.seqs = ranked[:limit]
        return assembled
# Placeholder no-op statement; marked for removal.
pass
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Optional
from vllm.logprobs import Logprob
from vllm.sequence import (VLLM_INVALID_TOKEN_ID, SamplingParams, Sequence,
SequenceGroup)
from .detokenizer_utils import (convert_prompt_ids_to_tokens,
detokenize_incrementally)
from .tokenizer import AnyTokenizer
class Detokenizer:
    """Provides methods to decode the output of a model into text."""

    def __init__(self, tokenizer: AnyTokenizer):
        """Store the tokenizer used by all decode methods."""
        self.tokenizer = tokenizer

    def decode_prompt_logprobs_inplace(self, seq_group: SequenceGroup,
                                       prompt_logprobs: list[Optional[dict[
                                           int, Logprob]]],
                                       position_offset: int) -> None:
        """Decodes the logprobs for the prompt of a sequence group.

        In-place operation: every `Logprob` whose `decoded_token` is still
        None (and whose token id is not `VLLM_INVALID_TOKEN_ID`) gets its
        `decoded_token` filled in.

        Args:
            seq_group: The sequence group to decode.
            prompt_logprobs: The logprobs to decode.
            position_offset: Offset of the first index of the logprobs
                relative to the start of the sequence (for chunked prefill).

        Returns:
            None. The entries of `prompt_logprobs` are updated in place.
        """
        prms = seq_group.sampling_params
        assert prms is not None

        # We can pick any sequence for the prompt.
        seq = seq_group.get_seqs()[0]
        # Only prompt, without the generated token.
        all_token_ids = seq.get_token_ids()
        prompt_token_ids = all_token_ids[:-1]
        # Detokenizer state used for the current position ...
        prefix_offset = 0
        read_offset = 0
        # ... and the state to switch to once the position is done.
        next_iter_prefix_offset = 0
        next_iter_read_offset = 0
        next_iter_tokens: list[str] = []
        prev_tokens = None

        for token_position_in_logprob, prompt_logprobs_for_token in enumerate(
                prompt_logprobs):

            # Absolute token position equals the index in the logprobs
            # list plus the offset of the entire logprobs list relative
            # to the start of the sequence.
            token_position = token_position_in_logprob + position_offset
            # Nothing to decode at this position (None or empty dict).
            if not prompt_logprobs_for_token:
                continue
            for token_id, sample_logprob in prompt_logprobs_for_token.items():
                if (sample_logprob.decoded_token is None
                        and token_id != VLLM_INVALID_TOKEN_ID):
                    # Decode the candidate as if it followed the prompt
                    # prefix that ends right before this position.
                    prompt_token_ids_with_token = (
                        prompt_token_ids[:token_position] + [token_id])
                    (new_tokens, new_text, new_prefix_offset,
                     new_read_offset) = detokenize_incrementally(
                         tokenizer=self.tokenizer,
                         all_input_ids=prompt_token_ids_with_token,
                         prev_tokens=prev_tokens,
                         prefix_offset=prefix_offset,
                         read_offset=read_offset,
                         skip_special_tokens=prms.skip_special_tokens,
                         spaces_between_special_tokens=prms.
                         spaces_between_special_tokens,
                     )

                    sample_logprob.decoded_token = new_text

                    # Use the offsets & prev tokens corresponding to the
                    # real tokens to ensure detokenization is consistent
                    # with the actual prompt.
                    if token_id == all_token_ids[token_position]:
                        next_iter_prefix_offset = new_prefix_offset
                        next_iter_read_offset = new_read_offset
                        next_iter_tokens = new_tokens

            # Advance to the next token position.
            prefix_offset = next_iter_prefix_offset
            read_offset = next_iter_read_offset
            if prev_tokens is None:
                prev_tokens = next_iter_tokens.copy()
            else:
                prev_tokens.extend(next_iter_tokens)

    def decode_sequence_inplace(self, seq: Sequence,
                                prms: SamplingParams) -> int:
        """Decodes the new token for a sequence. In-place operation.

        Updates `seq.tokens`, `seq.prefix_offset`, `seq.read_offset` and
        `seq.output_text`, and fills in `decoded_token` on the entries of
        the sequence's latest output logprobs.

        Args:
            seq: The sequence to decode.
            prms: The sampling parameters used to generate the sequence.

        Returns:
            The number of characters added to the output text.
        """
        all_input_ids = seq.get_token_ids()
        token_id_generated_this_iteration = all_input_ids[-1]

        # Convert prompt token IDs to tokens if necessary.
        # Do it here so that we don't have to repeat this
        # computation for each logprob.
        if seq.tokens is None:
            (seq.tokens, seq.prefix_offset,
             seq.read_offset) = convert_prompt_ids_to_tokens(
                 tokenizer=self.tokenizer,
                 prompt_ids=all_input_ids[:-1],
                 skip_special_tokens=prms.skip_special_tokens,
             )

        # Decode the newly generated token. The sequence state is only
        # updated at the end, so the logprob decoding below still sees the
        # state from before this token.
        (new_tokens, new_decoded_token_text, prefix_offset,
         read_offset) = detokenize_incrementally(
             tokenizer=self.tokenizer,
             all_input_ids=all_input_ids,
             prev_tokens=seq.tokens,
             prefix_offset=seq.prefix_offset,
             read_offset=seq.read_offset,
             skip_special_tokens=prms.skip_special_tokens,
             spaces_between_special_tokens=prms.spaces_between_special_tokens,
         )

        # Decode logprobs
        logprobs = seq.output_logprobs[-1]
        if logprobs:
            previous_tokens = all_input_ids[:-1]
            for token_id, sample_logprob in logprobs.items():
                # If the token was generated this iteration,
                # use the provided text.
                if token_id == token_id_generated_this_iteration:
                    sample_logprob.decoded_token = new_decoded_token_text
                    continue

                if (sample_logprob.decoded_token is None
                        and token_id != VLLM_INVALID_TOKEN_ID):
                    # Decode the alternative token as if it had been
                    # appended instead of the generated one.
                    all_input_ids_with_logprob = previous_tokens + [token_id]
                    (_, new_text, _, _) = detokenize_incrementally(
                        tokenizer=self.tokenizer,
                        all_input_ids=all_input_ids_with_logprob,
                        prev_tokens=seq.tokens,
                        prefix_offset=seq.prefix_offset,
                        read_offset=seq.read_offset,
                        skip_special_tokens=prms.skip_special_tokens,
                        spaces_between_special_tokens=prms.
                        spaces_between_special_tokens,
                    )
                    sample_logprob.decoded_token = new_text

        # Commit the state produced by decoding the generated token.
        seq.tokens.extend(new_tokens)
        seq.prefix_offset = prefix_offset
        seq.read_offset = read_offset
        seq.output_text += new_decoded_token_text

        return len(new_decoded_token_text)
......@@ -11,12 +11,12 @@ import torch.nn as nn
from vllm.config import VllmConfig, set_current_vllm_config
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.sequence import ExecuteModelRequest
from vllm.utils import (enable_trace_function_call_for_thread,
resolve_obj_by_qualname, run_method,
update_environment_variables,
warn_for_unimplemented_methods)
from vllm.v1.outputs import SamplerOutput
logger = init_logger(__name__)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment