"vllm/multimodal/processing/processor.py" did not exist on "6173682b6e98ae62f612db7af6813831097b23dc"
utils.py 7.73 KB
Newer Older
1
from array import array
2
from itertools import count
3
4
5
from typing import Callable, Dict, List, Optional
from typing import Sequence as GenericSequence
from typing import TypeVar, Union
6
from unittest.mock import MagicMock
7

8
9
import torch

10
from vllm.engine.arg_utils import EngineArgs
11
from vllm.model_executor.layers.sampler import SamplerOutput
12
from vllm.model_executor.utils import set_random_seed
13
from vllm.sampling_params import SamplingParams
14
15
from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE,
                           CompletionSequenceGroupOutput, Logprob,
16
                           SequenceData, SequenceGroupMetadata, SequenceOutput)
17
from vllm.utils import get_distributed_init_method, get_ip, get_open_port
18
from vllm.worker.cache_engine import CacheEngine
19
from vllm.worker.model_runner import ModelRunner
20
from vllm.worker.worker import Worker
21

22
23
# Generic worker type: bound to Worker, so it can only stand for Worker or
# one of its subclasses.
T = TypeVar("T", bound=Worker)

24
25
26
27
28

def round_up_to_next_block(seq_len: int, block_size: int) -> int:
    """Return how many blocks of ``block_size`` are needed for ``seq_len``.

    Despite the name, the result is a block *count* (ceiling division), not
    a length rounded up to a multiple of ``block_size``.
    """
    # Adding (block_size - 1) before flooring turns floor division into
    # ceiling division.
    padded_len = seq_len + (block_size - 1)
    return padded_len // block_size


29
30
31
def mock_worker(cls=None,
                vocab_size: int = 30_000,
                max_model_len: int = 2048,
                rank: int = 0,
                use_spec: bool = True) -> MagicMock:
    """Create a ``MagicMock`` that stands in for a worker.

    Args:
        cls: Class used as the mock's spec. Defaults to ``Worker`` when
            ``None``.
        vocab_size: Value assigned to the mock's ``vocab_size`` attribute.
        max_model_len: Value assigned to the mock's ``max_model_len``
            attribute.
        rank: Value assigned to the mock's ``rank`` attribute.
        use_spec: When true the mock is created with ``spec=cls``; when
            false it is created with ``spec=None``.

    Returns:
        The configured mock, with ``device`` fixed to ``'cuda:0'``.
    """
    if cls is None:
        cls = Worker

    spec = cls if use_spec else None

    worker = MagicMock(spec=spec)
    worker.vocab_size = vocab_size
    worker.max_model_len = max_model_len
    worker.rank = rank
    worker.device = 'cuda:0'
    return worker


47
48
49
50
51
52
53
54
55
56
57
58
def patch_execute_model_with_seeds(worker: Worker, rand_seeds: List[int]):
    """Wrap ``worker.execute_model`` so each call is followed by a reseed.

    The worker itself is not modified; the wrapper is returned and the caller
    decides where to install it.

    Args:
        worker: Object whose current ``execute_model`` is wrapped.
        rand_seeds: Seeds consumed one per call, in order. Calling the
            wrapper more times than there are seeds raises ``StopIteration``
            from ``next``.

    Returns:
        A callable that forwards all arguments to the original
        ``execute_model`` and returns its result.
    """
    pending_seeds = iter(rand_seeds)
    wrapped_execute_model = worker.execute_model

    def new_execute_model(*args, **kwargs):
        # Run the real step first; the next seed is applied only after the
        # step has produced its output.
        output = wrapped_execute_model(*args, **kwargs)
        next_seed = next(pending_seeds)
        set_random_seed(next_seed)
        return output

    return new_execute_model


59
60
61
def zero_kv_cache(cache_engine: List[CacheEngine]):
    """Zero, in place, every key/value pair held by the first cache engine.

    Only ``cache_engine[0]`` is touched; any further entries in the list are
    left as they are.

    Args:
        cache_engine: List whose first element exposes a non-empty
            ``gpu_cache`` iterable. Each item of ``gpu_cache`` must unpack
            into two objects that support ``zero_()``.
    """
    assert cache_engine[0].gpu_cache
    for key_blocks, value_blocks in cache_engine[0].gpu_cache:
        key_blocks.zero_()
        value_blocks.zero_()


66
def create_worker(cls: Callable[..., T],
                  model_name: str,
                  block_size: int,
                  num_gpu_blocks: int,
                  seed: int,
                  is_driver_worker: bool = True,
                  enforce_eager: bool = True,
                  model_runner_cls: Optional[ModelRunner] = None) -> T:
    """Construct a worker, load its model and initialize its cache.

    Args:
        cls: Callable that builds the worker; it receives the individual
            configs from the engine config plus rank/init-method arguments.
        model_name: Passed to ``EngineArgs`` as ``model``.
        block_size: Passed to ``EngineArgs`` as ``block_size``.
        num_gpu_blocks: Number of GPU blocks the cache is initialized with.
        seed: Passed to ``EngineArgs`` as ``seed``.
        is_driver_worker: Forwarded to ``cls``.
        enforce_eager: Passed to ``EngineArgs`` as ``enforce_eager``.
        model_runner_cls: Forwarded to ``cls`` as ``model_runner_cls``.

    Returns:
        The worker after ``init_device``, ``load_model`` and
        ``initialize_cache`` have been called on it.
    """
    engine_args = EngineArgs(
        model=model_name,
        seed=seed,
        block_size=block_size,
        enforce_eager=enforce_eager,
    )
    engine_config = engine_args.create_engine_config()

    distributed_init_method = get_distributed_init_method(
        get_ip(), get_open_port())

    worker = cls(
        model_config=engine_config.model_config,
        parallel_config=engine_config.parallel_config,
        scheduler_config=engine_config.scheduler_config,
        device_config=engine_config.device_config,
        cache_config=engine_config.cache_config,
        load_config=engine_config.load_config,
        local_rank=0,
        rank=0,
        distributed_init_method=distributed_init_method,
        is_driver_worker=is_driver_worker,
        model_runner_cls=model_runner_cls,
    )

    worker.init_device()
    worker.load_model()

    # Overwrite the cache sizes on the shared config with the caller's GPU
    # block count and no CPU blocks, then size the worker's cache from them.
    engine_config.cache_config.num_gpu_blocks = num_gpu_blocks
    engine_config.cache_config.num_cpu_blocks = 0
    worker.initialize_cache(
        num_gpu_blocks=engine_config.cache_config.num_gpu_blocks,
        num_cpu_blocks=engine_config.cache_config.num_cpu_blocks)

    return worker


def create_seq_group_metadata_from_prompts(
    prompts: List[List[int]],
    num_gpu_blocks: int,
    block_size: int,
    final_prompt_lens: List[int],
    continuations: Optional[List[List[int]]] = None,
    seq_ids: Optional[List[int]] = None,
) -> List[SequenceGroupMetadata]:
    """Build one ``SequenceGroupMetadata`` per prompt.

    Each prompt at position ``i`` becomes a group with request id ``str(i)``
    holding a single sequence keyed by ``i``. Block ids are handed out from
    the top of ``range(num_gpu_blocks)`` downwards.

    Args:
        prompts: Prompt token ids, one list per sequence.
        num_gpu_blocks: Size of the pool that block ids are drawn from.
        block_size: Block size used to work out how many blocks each
            sequence needs.
        final_prompt_lens: Per-sequence length used to size the block
            allocation.
        continuations: Already-generated token ids per sequence. Defaults to
            an empty continuation for every prompt; a group is marked as a
            prompt only when its continuation is empty.
        seq_ids: Accepted for interface compatibility. NOTE: this function
            does not read it; sequences are always keyed by their position
            in ``prompts``.

    Returns:
        The metadata list, in the same order as ``prompts``.

    Raises:
        IndexError: If the allocations need more than ``num_gpu_blocks``
            blocks in total (the free pool runs dry).
    """
    if continuations is None:
        continuations = [[] for _ in prompts]

    free_gpu_blocks = list(range(num_gpu_blocks))

    # Every sequence gets its own disjoint set of block ids, popped from the
    # end of the free pool.
    block_allocations = {
        i: [
            free_gpu_blocks.pop()
            for _ in range(round_up_to_next_block(final_len, block_size))
        ]
        for i, final_len in enumerate(final_prompt_lens)
    }

    return [
        SequenceGroupMetadata(
            request_id=str(i),
            is_prompt=not cont_token_ids,
            seq_data={
                i:
                SequenceData(
                    array(VLLM_TOKEN_ID_ARRAY_TYPE, prompt_token_ids[:]),
                    _output_token_ids=array(VLLM_TOKEN_ID_ARRAY_TYPE,
                                            cont_token_ids[:]),
                ),
            },
            sampling_params=SamplingParams(temperature=0.0, ),
            block_tables={i: block_allocations[i][:]},
        ) for i, (prompt_token_ids,
                  cont_token_ids) in enumerate(zip(prompts, continuations))
    ]


def assert_logprobs_dict_allclose(
        actual_logprobs: List[Dict[int, Logprob]],
        expected_logprobs: List[Dict[int, Logprob]]) -> None:
    """Assert that two per-step logprob lists agree.

    For each pair of steps, both dicts must have the same token ids, and
    the ``logprob`` of every token must match under
    ``torch.testing.assert_close``.

    NOTE: the steps are paired with ``zip``, so if the lists differ in
    length the extra trailing steps of the longer one are not compared.

    Raises:
        AssertionError: On a key-set mismatch or a logprob mismatch.
    """
    for single_step_actual_logprobs, single_step_expected_logprobs in zip(
            actual_logprobs, expected_logprobs):
        assert set(single_step_actual_logprobs.keys()) == set(
            single_step_expected_logprobs.keys())
        for token_id in single_step_actual_logprobs:
            actual = torch.tensor(
                single_step_actual_logprobs[token_id].logprob)
            expected = torch.tensor(
                single_step_expected_logprobs[token_id].logprob)
            torch.testing.assert_close(actual, expected)
168
169
170
171


def create_sampler_output_list(
        token_ids: torch.Tensor,
        probs: GenericSequence[Optional[torch.Tensor]],
        logprobs: GenericSequence[Optional[torch.Tensor]],
        seq_ids: Optional[List[int]] = None) -> List[SamplerOutput]:
    """Build one ``SamplerOutput`` per step from a tensor of token ids.

    Args:
        token_ids: 2-D tensor whose shape is unpacked as
            ``(num_steps, batch_size)``.
        probs: Indexed by step; ``probs[step]`` becomes that step's
            ``sampled_token_probs``.
        logprobs: Indexed by step; ``logprobs[step]`` becomes that step's
            ``logprobs``.
        seq_ids: Parent sequence id for each batch position. Defaults to
            ``0 .. batch_size - 1``.

    Returns:
        A list of ``num_steps`` sampler outputs. Each holds one
        single-sample group per batch position, and every sample's logprobs
        dict maps its token id to ``Logprob(0)``.
    """
    num_steps, batch_size = token_ids.shape
    token_ids_by_step = token_ids.tolist()

    if seq_ids is None:
        seq_ids = list(range(batch_size))

    return [
        SamplerOutput(outputs=[
            CompletionSequenceGroupOutput(
                samples=[
                    SequenceOutput(
                        output_token=token_id,
                        parent_seq_id=seq_ids[seq_index],
                        logprobs={token_id: Logprob(0)},
                    )
                ],
                prompt_logprobs=None,
            ) for seq_index, token_id in enumerate(token_ids_by_step[step])
        ],
                      sampled_token_probs=probs[step],
                      logprobs=logprobs[step],
                      sampled_token_ids=token_ids[step])
        for step in range(num_steps)
    ]


def create_batch(batch_size,
                 k,
                 prompt_len: Union[int, List[int]] = 10,
                 prev_output_token_len: int = 10,
                 seq_ids: Optional[List[int]] = None,
                 num_gpu_blocks: Optional[int] = None,
                 block_size: Optional[int] = None):
    """Create a batch of sequence-group metadata with synthetic token ids.

    Token ids are consecutive integers from a single counter: all prompts are
    filled first, then all previous-output lists.

    Args:
        batch_size: Number of previous-output lists, and number of prompts
            when ``prompt_len`` is an int.
        k: Extra length reserved per sequence; each sequence's final length
            is ``len(prompt) + len(prev_output) + k + 1``.
        prompt_len: One length for every prompt, or an explicit list of
            per-prompt lengths.
        prev_output_token_len: Length of every previous-output list.
        seq_ids: Forwarded unchanged to
            ``create_seq_group_metadata_from_prompts``.
        num_gpu_blocks: Defaults to ``2048 // block_size``.
        block_size: Defaults to 8.

    Returns:
        A tuple ``(seq_group_metadata_list, prompts, prev_output_tokens)``.
    """
    if block_size is None:
        block_size = 8

    if num_gpu_blocks is None:
        num_gpu_blocks = 2048 // block_size

    iterator = count()

    if isinstance(prompt_len, int):
        prompt_lens = [prompt_len] * batch_size
    else:
        prompt_lens = prompt_len

    prompts = [[next(iterator) for _ in range(p_len)] for p_len in prompt_lens]
    prev_output_tokens = [[
        next(iterator) for _ in range(prev_output_token_len)
    ] for _ in range(batch_size)]
    final_prompt_lens = [
        len(prompt) + len(prev_output_token) + k + 1
        for prompt, prev_output_token in zip(prompts, prev_output_tokens)
    ]

    seq_group_metadata_list = create_seq_group_metadata_from_prompts(
        prompts, num_gpu_blocks, block_size, final_prompt_lens,
        prev_output_tokens, seq_ids)
    return seq_group_metadata_list, prompts, prev_output_tokens