utils.py 1.95 KB
Newer Older
lizhigong's avatar
lizhigong committed
1
2


lizhigong's avatar
lizhigong committed
3
from enum import Enum
lizhigong's avatar
lizhigong committed
4
import os
5
import torch
6
import vllm.envs as envs
lizhigong's avatar
lizhigong committed
7
8
9
10

# Read once at import time: true only when the VLLM_ZERO_NO_THREAD
# environment variable is set to exactly the string '1'.
zero_no_thread = os.getenv('VLLM_ZERO_NO_THREAD') == '1'

def is_zero_no_thread():
    """Tell whether both the no-thread switch and zero-overhead mode are on.

    Returns:
        ``False`` when the module-level ``zero_no_thread`` flag (derived from
        the ``VLLM_ZERO_NO_THREAD`` environment variable) is off; otherwise
        the value of ``envs.VLLM_ZERO_OVERHEAD``.
    """
    return zero_no_thread and envs.VLLM_ZERO_OVERHEAD
lizhigong's avatar
lizhigong committed
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62

class SpecStepKind(Enum):
    """Kinds of step that can be stored in the spec context.

    ``KIND_DEFAULT`` is the value a fresh ``ZeroOverheadSpecContext`` starts
    with for both its current and previous step.
    """

    # NOTE(review): the meaning of each member is only implied by its name
    # here; confirm against the call sites that set and read the step kind.
    KIND_DEFAULT = 0
    PREFILL = 1
    FIRST_PROPOSAL = 2
    OTHER_PROPOSAL = 3
    SCORE_DECODE = 4

class ZeroOverheadSpecContext:
    """Mutable holder for the step kind and the latest recorded spec data.

    Attributes are plain fields: the current and previous ``SpecStepKind``,
    plus the most recently recorded proposal lengths, proposal token ids,
    accepted token ids and accepted sequence ids.
    """

    def __init__(self):
        # Current and previous step both start at the default kind.
        self.step_kind = self.last_step = SpecStepKind.KIND_DEFAULT
        # Nothing has been recorded yet.
        self.proposal_lens_list = self.proposal_token_ids = None
        self.accepted_token_ids = self.accepted_seq_ids = None

# Module-level context instance that the accessor functions in this module
# read from and write to.
spec_context = ZeroOverheadSpecContext()

def set_spec_step(_step):
    """Make ``_step`` the current step kind and remember the previous one.

    The step kind that was current before this call is moved into
    ``spec_context.last_step``.
    """
    # Only attributes are mutated, so no ``global`` declaration is required.
    spec_context.last_step, spec_context.step_kind = (
        spec_context.step_kind,
        _step,
    )

def get_spec_step():
    """Return the step kind currently stored on ``spec_context``."""
    return spec_context.step_kind

def get_spec_last_step():
    """Return the step kind that was current before the latest ``set_spec_step``."""
    return spec_context.last_step

def record_proposal_lens_list(list):
    """Store ``list`` as the proposal lengths on ``spec_context``.

    The parameter name shadows the ``list`` builtin; it is kept as-is so
    callers passing it by keyword keep working.
    """
    # Attribute assignment on the shared object; no ``global`` needed.
    spec_context.proposal_lens_list = list

def get_proposal_lens_list():
    """Return the proposal lengths last stored on ``spec_context``."""
    return spec_context.proposal_lens_list

def record_proposal_token_ids(tensor):
    """Store ``tensor`` as the proposal token ids on ``spec_context``."""
    # Attribute assignment on the shared object; no ``global`` needed.
    spec_context.proposal_token_ids = tensor

def get_proposal_token_ids():
    """Return the proposal token ids last stored on ``spec_context``."""
    return spec_context.proposal_token_ids

def record_accepted_token_ids(tensor, seq_ids):
    """Store the accepted token ids together with their sequence ids.

    Args:
        tensor: Saved as ``spec_context.accepted_token_ids``.
        seq_ids: Saved as ``spec_context.accepted_seq_ids``.
    """
    # Attribute assignment on the shared object; no ``global`` needed.
    ctx = spec_context
    ctx.accepted_token_ids = tensor
    ctx.accepted_seq_ids = seq_ids

def get_accepted_token_ids():
    """Return the stored ``(accepted_token_ids, accepted_seq_ids)`` pair.

    Both elements are read from ``spec_context``; a freshly constructed
    context holds ``None`` for each.
    """
    ctx = spec_context
    return ctx.accepted_token_ids, ctx.accepted_seq_ids
63
64
65
66
67
68
69
70
71

# Zero-overhead scheduling does not run inference on the default stream, in
# order to avoid the stream-synchronization problems around memory allocation
# that the runtime introduces.
# Maps a target device to the stream created for it by zero_overhead_stream().
alloc_stream = {}

def zero_overhead_stream(target_device):
    """Return the CUDA stream cached for ``target_device``, creating it on first use.

    Streams are stored in the module-level ``alloc_stream`` dict keyed by
    ``target_device``, so repeated calls with the same device return the same
    ``torch.cuda.Stream`` object.

    Args:
        target_device: Device passed to ``torch.cuda.Stream(device=...)`` and
            used as the cache key (so it must be hashable).

    Returns:
        The ``torch.cuda.Stream`` associated with ``target_device``.
    """
    stream = alloc_stream.get(target_device)
    if stream is None:
        # First request for this device: create the stream and cache it.
        stream = torch.cuda.Stream(device=target_device)
        alloc_stream[target_device] = stream
    return stream