"csrc/vscode:/vscode.git/clone" did not exist on "18511aeda64b473314bb7727a97a220565e0af41"
utils.py 2.75 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4
5
6
import msgspec
from abc import ABC
import torch

7
from vllm.sampling_params import SamplingParams
Jiayi Yao's avatar
Jiayi Yao committed
8
from vllm.triton_utils import tl, triton
9
10
11
12
13
14
15
16
17
18
19

_SAMPLING_EPS = 1e-5


def is_spec_decode_unsupported(sampling_params: SamplingParams) -> bool:
    """Tell whether a request's sampling settings rule out spec decoding.

    A request is reported as unsupported as soon as one of these holds,
    checked in this order:

    * ``frequency_penalty`` differs from ``0.0``
    * ``presence_penalty`` differs from ``0.0``
    * ``repetition_penalty`` differs from ``1.0``
    * ``min_p`` is greater than ``_SAMPLING_EPS``
    * ``logprobs`` is not ``None``

    Args:
        sampling_params: Sampling settings of the request to inspect.

    Returns:
        ``True`` if any condition above holds, ``False`` otherwise.
    """
    params = sampling_params

    # Penalties left at their neutral values are the only ones accepted.
    if params.frequency_penalty != 0.0:
        return True
    if params.presence_penalty != 0.0:
        return True
    if params.repetition_penalty != 1.0:
        return True

    # min_p is compared against a small epsilon rather than exact zero.
    if params.min_p > _SAMPLING_EPS:
        return True

    return params.logprobs is not None
Jiayi Yao's avatar
Jiayi Yao committed
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45


@triton.jit
def prepare_eagle_input_kernel(
    out_ptr,
    cu_query_lens_ptr,
    cu_num_tokens_ptr,
    BLOCK_SIZE: tl.constexpr,
):
    # The program id along axis 0 selects which entries of
    # cu_num_tokens_ptr / cu_query_lens_ptr this instance reads.
    # NOTE(review): the "cu_" prefix suggests cumulative sums - confirm
    # against the caller.
    seg_id = tl.program_id(0)

    # Half-open output range [seg_begin, seg_end) filled by this instance.
    seg_begin = tl.load(cu_num_tokens_ptr + seg_id)
    seg_end = tl.load(cu_num_tokens_ptr + seg_id + 1)
    seg_len = seg_end - seg_begin

    # Value stored in the first slot of the range; each following slot
    # receives the previous value plus one.
    first_value = tl.load(cu_query_lens_ptr + seg_id)

    # Cover the range in BLOCK_SIZE-wide tiles. The mask disables lanes
    # that fall at or beyond seg_len, so the last tile never writes past
    # the end of the range.
    tile_count = tl.cdiv(seg_len, BLOCK_SIZE)
    for tile in tl.range(tile_count):
        lane = tile * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
        tl.store(
            out_ptr + seg_begin + lane,
            first_value + lane,
            mask=lane < seg_len,
        )
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83

class DraftProbs(ABC):  # type: ignore[call-arg]
    """Draft probs corresponding to in-progress sequences.

    Entry ``i`` along the first dimension of ``draft_probs`` belongs to
    the request ``_req_ids[i]``. Every method keeps the id list and the
    tensor in the same order and with the same length.
    """

    # Spec token probs; first dimension is indexed by request.
    draft_probs: torch.Tensor

    # The request id list, aligned with the first dim of draft_probs.
    _req_ids: list[str]

    def __init__(self, draft_probs: torch.Tensor,
                 req_ids: list[str]) -> None:
        """Start tracking ``draft_probs`` for the given requests.

        Args:
            draft_probs: Probabilities, one first-dimension entry per id.
            req_ids: Request ids, in the same order as ``draft_probs``.
        """
        assert len(req_ids) == len(draft_probs)
        self.draft_probs = draft_probs
        # Copy the ids: if the caller mutated its own list afterwards,
        # the ids would no longer line up with the tensor rows.
        self._req_ids = list(req_ids)

    def _row_lookup(self) -> dict[str, int]:
        """Map each tracked id to its row (first occurrence wins)."""
        lookup: dict[str, int] = {}
        for row, req_id in enumerate(self._req_ids):
            lookup.setdefault(req_id, row)
        return lookup

    def update(self,
               draft_probs: torch.Tensor,
               tmp_req_ids: list[str]) -> None:
        """Replace or add the probs of ``tmp_req_ids``.

        Tracked requests that also appear in ``tmp_req_ids`` are dropped,
        then ``draft_probs`` / ``tmp_req_ids`` are appended after the
        remaining ones, so refreshed requests move to the end.

        Args:
            draft_probs: New probabilities, one entry per ``tmp_req_ids``.
            tmp_req_ids: Ids the rows of ``draft_probs`` belong to.
        """
        incoming = set(tmp_req_ids)
        kept_rows = [
            row for row, req_id in enumerate(self._req_ids)
            if req_id not in incoming
        ]
        # Build the new state completely before assigning, so a failing
        # tensor operation cannot leave ids and rows out of step.
        merged_ids = [self._req_ids[row] for row in kept_rows]
        merged_ids.extend(tmp_req_ids)
        merged_probs = torch.cat([self.draft_probs[kept_rows], draft_probs])

        self._req_ids = merged_ids
        self.draft_probs = merged_probs
        assert len(self._req_ids) == len(self.draft_probs)

    def prune(self, req_ids: list[str]) -> None:
        """Stop tracking the given requests; unknown ids are ignored.

        Args:
            req_ids: Ids whose rows should be removed.
        """
        dropped = set(req_ids)
        kept_rows = [
            row for row, req_id in enumerate(self._req_ids)
            if req_id not in dropped
        ]
        if len(kept_rows) != len(self._req_ids):
            # Batch contents changed - prune removed sequences.
            self.draft_probs = self.draft_probs[kept_rows]
            self._req_ids = [self._req_ids[row] for row in kept_rows]

    def get_probs(self, req_ids: list[str]) -> torch.Tensor:
        """Return the probs of ``req_ids``, in the order requested.

        Args:
            req_ids: Ids to look up.

        Returns:
            ``draft_probs`` indexed by the rows of ``req_ids``.

        Raises:
            ValueError: If an id is not currently tracked.
        """
        row_of = self._row_lookup()
        try:
            rows = [row_of[req_id] for req_id in req_ids]
        except KeyError as err:
            # Keep the exception type that list.index() used to raise.
            raise ValueError(f"{err.args[0]!r} is not in list") from None
        return self.draft_probs[rows]