gpu_input_batch.py 24.1 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
3
4
# Datastructures defining an input batch

from dataclasses import dataclass
5
from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, cast
6
7
8
9

import numpy as np
import torch

10
from vllm.lora.request import LoRARequest
11
12
13
from vllm.multimodal import MultiModalKwargs
from vllm.sampling_params import SamplingParams, SamplingType
from vllm.v1.sample.metadata import SamplingMetadata
14
from vllm.v1.utils import copy_slice
15
from vllm.v1.worker.block_table import BlockTable
16

17
18
_SAMPLING_EPS = 1e-5

19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
if TYPE_CHECKING:
    from vllm.multimodal.inputs import PlaceholderRange


@dataclass
class CachedRequestState:
    """Per-request state kept alongside the batch.

    Holds the request's prompt, its sampling configuration, the block ids
    assigned to it, and the output tokens produced so far.
    """

    # Identity and prompt.
    req_id: str
    prompt_token_ids: List[int]
    prompt: Optional[str]

    # Multi-modal inputs and the placeholder ranges that go with them.
    mm_inputs: List[MultiModalKwargs]
    mm_positions: List["PlaceholderRange"]

    # Sampling configuration; `generator` is None when the request has no
    # generator of its own.
    sampling_params: SamplingParams
    generator: Optional[torch.Generator]

    # Progress bookkeeping.
    block_ids: List[int]
    num_computed_tokens: int
    output_token_ids: List[int]

    # M-RoPE state; both default to None.
    mrope_positions: Optional[torch.Tensor] = None
    mrope_position_delta: Optional[int] = None

    # LoRA adapter request, if any.
    lora_request: Optional[LoRARequest] = None

    @property
    def num_tokens(self) -> int:
        """Total token count: prompt tokens plus output tokens."""
        prompt_len = len(self.prompt_token_ids)
        output_len = len(self.output_token_ids)
        return prompt_len + output_len


class InputBatch:

    def __init__(
        self,
        max_num_reqs: int,
        max_model_len: int,
        max_num_blocks_per_req: int,
        device: torch.device,
        pin_memory: bool,
        vocab_size: int,
    ):
        """Allocate the persistent, slot-indexed buffers of the batch.

        Args:
            max_num_reqs: Number of request slots; first dimension of every
                per-request buffer.
            max_model_len: Second dimension of the CPU token-id buffer.
            max_num_blocks_per_req: Forwarded to the block table.
            device: Device on which the device-side tensors are created.
            pin_memory: Forwarded to the CPU staging tensors and the block
                table.
            vocab_size: Stored on the instance; used as the width of the
                lazily allocated allowed-token mask and as the pad value
                for prompt token ids.
        """
        self.max_num_reqs = max_num_reqs
        self.max_model_len = max_model_len
        self.max_num_blocks_per_req = max_num_blocks_per_req
        self.device = device
        self.pin_memory = pin_memory
        self.vocab_size = vocab_size

        def paired_buffers(dtype):
            """Return (device tensor, CPU staging tensor, NumPy view)."""
            on_device = torch.empty((max_num_reqs, ),
                                    dtype=dtype,
                                    device=device)
            staging = torch.empty((max_num_reqs, ),
                                  dtype=dtype,
                                  device="cpu",
                                  pin_memory=pin_memory)
            return on_device, staging, staging.numpy()

        self._req_ids: List[Optional[str]] = []
        self.req_id_to_index: Dict[str, int] = {}

        # TODO(woosuk): This buffer could be too large if max_model_len is big.
        # Find a way to reduce the CPU memory usage.
        # This buffer is not directly transferred to the GPU, so it does not
        # need to be pinned.
        self.token_ids_cpu_tensor = torch.zeros(
            (max_num_reqs, max_model_len),
            device="cpu",
            dtype=torch.int32,
            pin_memory=False,
        )
        self.token_ids_cpu = self.token_ids_cpu_tensor.numpy()

        # Per-slot token counters.
        self.num_tokens = np.zeros(max_num_reqs, dtype=np.int32)
        self.num_tokens_no_spec = np.zeros(max_num_reqs, dtype=np.int32)
        self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32)
        self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32)

        # Block table.
        self.block_table = BlockTable(
            max_num_reqs=max_num_reqs,
            max_model_len=max_model_len,
            max_num_blocks_per_req=max_num_blocks_per_req,
            pin_memory=pin_memory,
            device=device,
        )

        # Sampling-related. Each parameter has a device tensor, a CPU staging
        # tensor, a NumPy view of the staging tensor, and a set of the ids
        # of the requests that use it.
        (self.temperature, self.temperature_cpu_tensor,
         self.temperature_cpu) = paired_buffers(torch.float32)
        self.greedy_reqs: Set[str] = set()
        self.random_reqs: Set[str] = set()

        (self.top_p, self.top_p_cpu_tensor,
         self.top_p_cpu) = paired_buffers(torch.float32)
        self.top_p_reqs: Set[str] = set()

        (self.top_k, self.top_k_cpu_tensor,
         self.top_k_cpu) = paired_buffers(torch.int32)
        self.top_k_reqs: Set[str] = set()

        (self.min_p, self.min_p_cpu_tensor,
         self.min_p_cpu) = paired_buffers(torch.float32)
        self.min_p_reqs: Set[str] = set()

        # Frequency penalty related data structures
        (self.frequency_penalties, self.frequency_penalties_cpu_tensor,
         self.frequency_penalties_cpu) = paired_buffers(torch.float)
        self.frequency_penalties_reqs: Set[str] = set()

        # Presence penalty related data structures
        (self.presence_penalties, self.presence_penalties_cpu_tensor,
         self.presence_penalties_cpu) = paired_buffers(torch.float)
        self.presence_penalties_reqs: Set[str] = set()

        # Repetition penalty related data structures
        (self.repetition_penalties, self.repetition_penalties_cpu_tensor,
         self.repetition_penalties_cpu) = paired_buffers(torch.float)
        self.repetition_penalties_reqs: Set[str] = set()

        # req_index -> (min_tokens, stop_token_ids)
        self.min_tokens: Dict[int, Tuple[int, Set[int]]] = {}

        # lora related
        self.request_lora_mapping = np.zeros((self.max_num_reqs, ),
                                             dtype=np.int32)
        self.lora_id_to_request_ids: Dict[int, Set[str]] = {}
        self.lora_id_to_lora_request: Dict[int, LoRARequest] = {}

        # req_index -> generator
        # NOTE(woosuk): The indices of the requests that do not have their own
        # generator should not be included in the dictionary.
        self.generators: Dict[int, torch.Generator] = {}

        self.num_logprobs: Dict[str, int] = {}
        # NOTE(rob): num_prompt_logprobs only includes reqs
        # that are currently in the prefill phase.
        self.num_prompt_logprobs: Dict[str, int] = {}

        # One optional logit-bias dict per slot.
        self.logit_bias: List[Optional[Dict[int, float]]] = (
            [None] * max_num_reqs)

        # The allowed-token masks stay None here; add_request() allocates
        # them the first time a request supplies allowed_token_ids.
        self.has_allowed_token_ids: Set[str] = set()
        self.allowed_token_ids_mask: Optional[torch.Tensor] = None
        self.allowed_token_ids_mask_cpu_tensor: Optional[torch.Tensor] = None

        self.req_output_token_ids: List[Optional[List[int]]] = []

        # This is updated each time the batch constituents change.
        self.sampling_metadata = self._make_sampling_metadata()

    @property
    def req_ids(self) -> List[str]:
        # None elements should only be present transiently
        # while performing state updates to the batch.
        return cast(List[str], self._req_ids)

210
211
212
213
214
215
216
217
218
219
    def add_request(
        self,
        request: "CachedRequestState",
        req_index: Optional[int] = None,
    ) -> None:
        """Write a request's state into one slot of the batch.

        Args:
            request: The request to insert.
            req_index: Slot to write into. When None, the request is placed
                at index ``self.num_reqs``.

        Every piece of per-slot state is overwritten here, including the
        logit bias and the allowed-token mask row, so that nothing left
        behind by a previous occupant of the slot carries over.
        """
        if req_index is None:
            req_index = self.num_reqs
        assert req_index < self.max_num_reqs

        req_id = request.req_id
        if req_index == len(self._req_ids):
            self._req_ids.append(req_id)
            self.req_output_token_ids.append(request.output_token_ids)
        else:
            self._req_ids[req_index] = req_id
            self.req_output_token_ids[req_index] = request.output_token_ids

        self.req_id_to_index[req_id] = req_index

        # Copy the prompt token ids and output token ids.
        num_prompt_tokens = len(request.prompt_token_ids)
        self.num_prompt_tokens[req_index] = num_prompt_tokens
        self.token_ids_cpu[
            req_index, :num_prompt_tokens] = request.prompt_token_ids
        start_idx = num_prompt_tokens
        end_idx = start_idx + len(request.output_token_ids)
        self.token_ids_cpu[req_index,
                           start_idx:end_idx] = request.output_token_ids
        # Number of token ids in token_ids_cpu.
        # NOTE(woosuk): This may include spec decode tokens.
        self.num_tokens[req_index] = request.num_tokens
        # Number of tokens without spec decode tokens.
        self.num_tokens_no_spec[req_index] = request.num_tokens

        self.num_computed_tokens_cpu[req_index] = request.num_computed_tokens
        self.block_table.add_row(req_index, request.block_ids)

        sampling_params = request.sampling_params
        if sampling_params.sampling_type == SamplingType.GREEDY:
            # Avoid later division by zero.
            self.temperature_cpu[req_index] = -1.0
            self.greedy_reqs.add(req_id)
        else:
            self.temperature_cpu[req_index] = sampling_params.temperature
            self.random_reqs.add(req_id)

        self.top_p_cpu[req_index] = sampling_params.top_p
        if sampling_params.top_p < 1:
            self.top_p_reqs.add(req_id)
        self.top_k_cpu[req_index] = sampling_params.top_k
        if sampling_params.top_k > 0:
            self.top_k_reqs.add(req_id)
        self.min_p_cpu[req_index] = sampling_params.min_p
        self.frequency_penalties_cpu[
            req_index] = sampling_params.frequency_penalty
        if sampling_params.min_p > _SAMPLING_EPS:
            self.min_p_reqs.add(req_id)
        if sampling_params.frequency_penalty != 0.0:
            self.frequency_penalties_reqs.add(req_id)
        self.presence_penalties_cpu[
            req_index] = sampling_params.presence_penalty
        if sampling_params.presence_penalty != 0.0:
            self.presence_penalties_reqs.add(req_id)
        self.repetition_penalties_cpu[
            req_index] = sampling_params.repetition_penalty
        if sampling_params.repetition_penalty != 1.0:
            self.repetition_penalties_reqs.add(req_id)
        if sampling_params.min_tokens:
            self.min_tokens[req_index] = (sampling_params.min_tokens,
                                          sampling_params.all_stop_token_ids)

        # NOTE(woosuk): self.generators should not include the requests that
        # do not have their own generator.
        if request.generator is not None:
            self.generators[req_index] = request.generator

        if sampling_params.logprobs is not None:
            self.num_logprobs[req_id] = sampling_params.logprobs
        if sampling_params.prompt_logprobs is not None:
            self.num_prompt_logprobs[req_id] = sampling_params.prompt_logprobs

        # Always overwrite the slot, even with None. condense() copies the
        # bias out of a vacated tail slot without resetting it, so a
        # conditional write would let a request without a bias inherit the
        # previous occupant's.
        self.logit_bias[req_index] = sampling_params.logit_bias

        # Clear the slot's mask row before (optionally) populating it, for
        # the same reason: otherwise the row would hold the union of the
        # previous occupant's allowed tokens and this request's.
        if self.allowed_token_ids_mask_cpu_tensor is not None:
            self.allowed_token_ids_mask_cpu_tensor[req_index].fill_(False)
        if sampling_params.allowed_token_ids:
            self.has_allowed_token_ids.add(req_id)
            if self.allowed_token_ids_mask_cpu_tensor is None:
                # Lazy allocation for this tensor, which can be large.
                self.allowed_token_ids_mask = torch.zeros(self.max_num_reqs,
                                                          self.vocab_size,
                                                          dtype=torch.bool,
                                                          device=self.device)
                self.allowed_token_ids_mask_cpu_tensor = torch.zeros(
                    self.max_num_reqs,
                    self.vocab_size,
                    dtype=torch.bool,
                    device="cpu")
            self.allowed_token_ids_mask_cpu_tensor[req_index][
                sampling_params.allowed_token_ids] = True

        # Add request lora ID
        if request.lora_request:
            lora_id = request.lora_request.lora_int_id
            if lora_id not in self.lora_id_to_request_ids:
                self.lora_id_to_request_ids[lora_id] = set()

            self.request_lora_mapping[req_index] = lora_id
            self.lora_id_to_request_ids[lora_id].add(request.req_id)
            self.lora_id_to_lora_request[lora_id] = request.lora_request
        else:
            # No LoRA
            self.request_lora_mapping[req_index] = 0

322
    def remove_request(self, req_id: str) -> Optional[int]:
323
324
        """This method must always be followed by a call to condense()."""

325
326
327
        req_index = self.req_id_to_index.pop(req_id, None)
        if req_index is None:
            return None
328
329
        self._req_ids[req_index] = None
        self.req_output_token_ids[req_index] = None
330
331
332
333
334

        self.greedy_reqs.discard(req_id)
        self.random_reqs.discard(req_id)
        self.top_p_reqs.discard(req_id)
        self.top_k_reqs.discard(req_id)
335
        self.min_p_reqs.discard(req_id)
336
        self.min_tokens.pop(req_index, None)
337
338
339
        self.frequency_penalties_reqs.discard(req_id)
        self.presence_penalties_reqs.discard(req_id)
        self.repetition_penalties_reqs.discard(req_id)
340
341
        self.generators.pop(req_index, None)
        self.num_logprobs.pop(req_id, None)
342
        self.num_prompt_logprobs.pop(req_id, None)
343
344
345
346
347
348
349
350
351
352

        # LoRA
        lora_id = self.request_lora_mapping[req_index]
        if lora_id != 0:
            self.lora_id_to_request_ids[lora_id].discard(req_id)
            if len(self.lora_id_to_request_ids[lora_id]) == 0:
                self.lora_id_to_request_ids.pop(lora_id)
                self.lora_id_to_lora_request.pop(lora_id)
            self.request_lora_mapping[req_index] = 0

353
        self.logit_bias[req_index] = None
354
355
356
        self.has_allowed_token_ids.discard(req_id)
        if self.allowed_token_ids_mask_cpu_tensor is not None:
            self.allowed_token_ids_mask_cpu_tensor[req_index].fill_(False)
357
358
359
        return req_index

    def condense(self, empty_req_indices: List[int]) -> None:
360
361
        num_reqs = self.num_reqs
        if num_reqs == 0:
362
            # The batched states are empty.
363
364
            self._req_ids.clear()
            self.req_output_token_ids.clear()
365
366
367
368
            return

        # NOTE(woosuk): This function assumes that the empty_req_indices
        # is sorted in descending order.
369
        last_req_index = num_reqs + len(empty_req_indices) - 1
370
371
372
373
374
375
376
377
378
379
380
        while empty_req_indices:
            # Find the largest non-empty index.
            while last_req_index in empty_req_indices:
                last_req_index -= 1

            # Find the smallest empty index.
            empty_index = empty_req_indices.pop()
            if empty_index >= last_req_index:
                break

            # Swap the states.
381
382
            req_id = self._req_ids[last_req_index]
            output_token_ids = self.req_output_token_ids[last_req_index]
383
            assert req_id is not None
384
385
386
387
            self._req_ids[empty_index] = req_id
            self._req_ids[last_req_index] = None
            self.req_output_token_ids[empty_index] = output_token_ids
            self.req_output_token_ids[last_req_index] = None
388
389
            self.req_id_to_index[req_id] = empty_index

390
391
392
393
            num_tokens = self.num_tokens[last_req_index]
            self.token_ids_cpu[empty_index, :num_tokens] = self.token_ids_cpu[
                last_req_index, :num_tokens]
            self.num_tokens[empty_index] = num_tokens
394
395
            self.num_tokens_no_spec[empty_index] = self.num_tokens_no_spec[
                last_req_index]
396
397
            self.num_prompt_tokens[empty_index] = self.num_prompt_tokens[
                last_req_index]
398
399
            self.num_computed_tokens_cpu[
                empty_index] = self.num_computed_tokens_cpu[last_req_index]
400
            self.block_table.move_row(last_req_index, empty_index)
401
402
403
404
            self.temperature_cpu[empty_index] = self.temperature_cpu[
                last_req_index]
            self.top_p_cpu[empty_index] = self.top_p_cpu[last_req_index]
            self.top_k_cpu[empty_index] = self.top_k_cpu[last_req_index]
405
406
407
408
409
410
            self.frequency_penalties_cpu[
                empty_index] = self.frequency_penalties_cpu[last_req_index]
            self.presence_penalties_cpu[
                empty_index] = self.presence_penalties_cpu[last_req_index]
            self.repetition_penalties_cpu[
                empty_index] = self.repetition_penalties_cpu[last_req_index]
411
            self.min_p_cpu[empty_index] = self.min_p_cpu[last_req_index]
412
413
414
415
            generator = self.generators.pop(last_req_index, None)
            if generator is not None:
                self.generators[empty_index] = generator

416
417
418
419
            min_token = self.min_tokens.pop(last_req_index, None)
            if min_token is not None:
                self.min_tokens[empty_index] = min_token

420
421
422
            self.request_lora_mapping[empty_index] = self.request_lora_mapping[
                last_req_index]

423
424
            self.logit_bias[empty_index] = self.logit_bias[last_req_index]

425
426
427
428
429
            if self.allowed_token_ids_mask_cpu_tensor is not None:
                self.allowed_token_ids_mask_cpu_tensor[
                    empty_index] = self.allowed_token_ids_mask_cpu_tensor[
                        last_req_index]

430
431
432
            # Decrement last_req_index since it is now empty.
            last_req_index -= 1

433
434
435
436
437
438
439
440
441
        # Trim lists to the batch size.
        del self._req_ids[self.num_reqs:]
        del self.req_output_token_ids[self.num_reqs:]

    def refresh_sampling_metadata(self):
        self.sampling_metadata = self._make_sampling_metadata()

    def _make_sampling_metadata(self) -> SamplingMetadata:
        """Build a SamplingMetadata for the first ``num_reqs`` slots.

        CPU staging tensors are copied to their device counterparts only
        for the features that at least one request in the batch uses.
        """
        live = self.num_reqs
        greedy_only = self.all_greedy
        skip_top_p = self.no_top_p
        skip_top_k = self.no_top_k
        skip_min_p = self.no_min_p
        skip_penalties = self.no_penalties

        temperature = None
        if not greedy_only:
            temperature = copy_slice(self.temperature_cpu_tensor,
                                     self.temperature, live)

        for skipped, staging, on_device in (
            (skip_top_p, self.top_p_cpu_tensor, self.top_p),
            (skip_top_k, self.top_k_cpu_tensor, self.top_k),
            (skip_min_p, self.min_p_cpu_tensor, self.min_p),
        ):
            if not skipped:
                copy_slice(staging, on_device, live)

        prompt_token_ids: Optional[torch.Tensor] = None
        if not skip_penalties:
            # Since syncing these tensors is expensive only copy them
            # if necessary i.e. if there are requests which require
            # penalties to be applied during sampling.
            for staging, on_device in (
                (self.frequency_penalties_cpu_tensor,
                 self.frequency_penalties),
                (self.presence_penalties_cpu_tensor,
                 self.presence_penalties),
                (self.repetition_penalties_cpu_tensor,
                 self.repetition_penalties),
            ):
                copy_slice(staging, on_device, live)

            # The prompt tokens are used only for applying penalties during
            # the sampling process. Hence copy these tensors only when
            # there are requests which need penalties to be applied.
            prompt_token_ids = self._make_prompt_token_ids_tensor()

        allowed_token_ids_mask: Optional[torch.Tensor] = None
        if not self.no_allowed_token_ids:
            assert self.allowed_token_ids_mask is not None
            copy_slice(self.allowed_token_ids_mask_cpu_tensor,
                       self.allowed_token_ids_mask, live)
            allowed_token_ids_mask = self.allowed_token_ids_mask[:live]

        top_p = None if skip_top_p else self.top_p[:live]
        top_k = None if skip_top_k else self.top_k[:live]
        min_p = None if skip_min_p else self.min_p[:live]

        return SamplingMetadata(
            temperature=temperature,
            all_greedy=greedy_only,
            all_random=self.all_random,
            top_p=top_p,
            top_k=top_k,
            min_p=min_p,
            generators=self.generators,
            max_num_logprobs=self.max_num_logprobs,
            prompt_token_ids=prompt_token_ids,
            frequency_penalties=self.frequency_penalties[:live],
            presence_penalties=self.presence_penalties[:live],
            repetition_penalties=self.repetition_penalties[:live],
            output_token_ids=cast(List[List[int]], self.req_output_token_ids),
            min_tokens=self.min_tokens,
            no_penalties=skip_penalties,
            logit_bias=self.logit_bias[:live],
            allowed_token_ids_mask=allowed_token_ids_mask,
        )

499
500
501
502
503
504
    def _make_prompt_token_ids_tensor(self) -> torch.Tensor:
        max_prompt_len = self.num_prompt_tokens[:self.num_reqs].max()
        prompt_token_ids_cpu_tensor = torch.empty(
            (self.num_reqs, max_prompt_len),
            device="cpu",
            dtype=torch.int64,
505
506
            pin_memory=self.pin_memory,
        )
507
        prompt_token_ids = prompt_token_ids_cpu_tensor.numpy()
508
509
        prompt_token_ids[:] = self.token_ids_cpu[:self.
                                                 num_reqs, :max_prompt_len]
510
511
512
513
514
515
516
        # Use the value of vocab_size as a pad since we don't have a
        # token_id of this value.
        for i in range(self.num_reqs):
            prompt_token_ids[i, self.num_prompt_tokens[i]:] = self.vocab_size
        return prompt_token_ids_cpu_tensor.to(device=self.device,
                                              non_blocking=True)

517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
    def make_lora_inputs(
        self, num_scheduled_tokens: np.ndarray
    ) -> Tuple[Tuple[int, ...], Tuple[int, ...], Set[LoRARequest]]:
        """
        Given the num_scheduled_tokens for each request in the batch, return
        datastructures used to activate the current LoRAs.
        Returns:
            1. prompt_lora_mapping: A tuple of size self.num_reqs where,
               prompt_lora_mapping[i] is the LoRA id to use for the ith prompt.
            2. token_lora_mapping: A tuple of size np.sum(num_scheduled_tokens)
               where, token_lora_mapping[i] is the LoRA id to use for ith token.
            3. lora_requests: Set of relevant LoRA requests.
        """

        req_lora_mapping = self.request_lora_mapping[:self.num_reqs]
        prompt_lora_mapping = tuple(req_lora_mapping)
        token_lora_mapping = tuple(
            req_lora_mapping.repeat(num_scheduled_tokens))
        active_lora_requests: Set[LoRARequest] = set(
            self.lora_id_to_lora_request.values())

        return prompt_lora_mapping, token_lora_mapping, active_lora_requests

540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
    @property
    def num_reqs(self) -> int:
        return len(self.req_id_to_index)

    @property
    def all_greedy(self) -> bool:
        return len(self.random_reqs) == 0

    @property
    def all_random(self) -> bool:
        return len(self.greedy_reqs) == 0

    @property
    def no_top_p(self) -> bool:
        return len(self.top_p_reqs) == 0

    @property
    def no_top_k(self) -> bool:
        return len(self.top_k_reqs) == 0

560
561
562
563
    @property
    def no_min_p(self) -> bool:
        return len(self.min_p_reqs) == 0

564
565
566
567
568
569
    @property
    def no_penalties(self) -> bool:
        return (len(self.presence_penalties_reqs) == 0
                and len(self.frequency_penalties_reqs) == 0
                and len(self.repetition_penalties_reqs) == 0)

570
    @property
571
572
    def max_num_logprobs(self) -> Optional[int]:
        return max(self.num_logprobs.values()) if self.num_logprobs else None
573
574
575

    @property
    def no_prompt_logprob(self) -> bool:
576
        return not self.num_prompt_logprobs
577
578
579
580

    @property
    def no_allowed_token_ids(self) -> bool:
        return len(self.has_allowed_token_ids) == 0