utils.py 3 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
王敏's avatar
王敏 committed
3
4
5
6
import msgspec
from abc import ABC
import torch

7
from vllm.sampling_params import SamplingParams
Jiayi Yao's avatar
Jiayi Yao committed
8
from vllm.triton_utils import tl, triton
王敏's avatar
王敏 committed
9
from vllm.utils import async_tensor_h2d
10
11
12
13
14
15
16
17
18
19
20

_SAMPLING_EPS = 1e-5


def is_spec_decode_unsupported(sampling_params: SamplingParams) -> bool:
    """True if request is incompatible with speculative decoding"""
    return (sampling_params.frequency_penalty != 0.0
            or sampling_params.presence_penalty != 0.0
            or sampling_params.repetition_penalty != 1.0
            or sampling_params.min_p > _SAMPLING_EPS
            or sampling_params.logprobs is not None)
Jiayi Yao's avatar
Jiayi Yao committed
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46


@triton.jit
def prepare_eagle_input_kernel(
    out_ptr,
    cu_query_lens_ptr,
    cu_num_tokens_ptr,
    BLOCK_SIZE: tl.constexpr,
):
    pid = tl.program_id(0)

    # [start_pos, end_pos)
    start_pos = tl.load(cu_num_tokens_ptr + pid)
    end_pos = tl.load(cu_num_tokens_ptr + pid + 1)
    num_tokens = end_pos - start_pos

    index_start = tl.load(cu_query_lens_ptr + pid)

    num_blocks = tl.cdiv(num_tokens, BLOCK_SIZE)
    for i in tl.range(num_blocks):
        offset = i * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
        tl.store(
            out_ptr + start_pos + offset,
            index_start + offset,
            mask=offset < num_tokens,
        )
王敏's avatar
王敏 committed
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83

class DraftProbs(ABC):  # type: ignore[call-arg]
    """Draft probs corresponding to in-progress sequences."""

    # spec tokens probs. 
    draft_probs: torch.Tensor

    # The request id list.
    _req_ids: list[str]

    def __init__(self, draft_probs, req_ids):
        assert len(req_ids) == len(draft_probs)
        self.draft_probs = draft_probs
        self._req_ids = req_ids

    def update(self,
               draft_probs: torch.Tensor,
               tmp_req_ids: list[str]):
        diff_req_ids = [item for item in self._req_ids if item not in tmp_req_ids]
        index = [self._req_ids.index(req_id) for req_id in diff_req_ids]
        self._req_ids = diff_req_ids
        self.draft_probs = self.draft_probs[index]
        self.draft_probs = torch.cat([self.draft_probs, draft_probs])
        
        self._req_ids.extend(tmp_req_ids)
        assert len(self._req_ids) == len(self.draft_probs)

    def prune(self, req_ids: list[str]):
        new_req_ids = [req_id for req_id in self._req_ids if req_id not in req_ids]
        if new_req_ids != self._req_ids:
            # Batch contents changed - prune removed sequences.
            index = [self._req_ids.index(req_id) for req_id in new_req_ids]
            self.draft_probs = self.draft_probs[index]
            self._req_ids = new_req_ids

    def get_probs(self, req_ids: list[str]):
        index = [self._req_ids.index(req_id) for req_id in req_ids]
王敏's avatar
王敏 committed
84
85
86
87
88
89
        index_tensor = async_tensor_h2d(
                    index,
                    dtype=torch.int32,
                    target_device=self.draft_probs.device,
                    pin_memory=True)
        return self.draft_probs[index_tensor]
王敏's avatar
王敏 committed
90