sampling_metadata.py 24.8 KB
Newer Older
1
import random
2
from array import array
3
from dataclasses import dataclass
4
from typing import Dict, List, Optional, Tuple
5
6
7

import torch

8
from vllm.sampling_params import SamplingParams, SamplingType
9
from vllm.sequence import SequenceData, SequenceGroupMetadata
10
from vllm.triton_utils.sample import get_num_triton_sampler_splits
11
from vllm.utils import (async_tensor_h2d, is_pin_memory_available,
12
                        make_tensor_with_pad, maybe_expand_dim)
13
14

# Tolerance used when comparing sampling parameters (temperature, top_p,
# min_p, penalties) against their neutral values.
_SAMPLING_EPS = 1e-5
# Substituted for a generated seed that happens to be 0 when the request
# should be sampled; for the sampling kernel, seed == 0 means greedy decoding.
_SEED_0_REPLACEMENT = 3403598558
# Some triton sampler related code is guarded before it is ready.
_USE_TRITON_SAMPLER = False
18
19


20
21
@dataclass
class SequenceGroupToSample:
    """Sampler-facing description of one sequence group for a single step.

    Bundles the group's sequence ids, sampling parameters and sequence data
    with the logit indices used for prompt logprobs and for sampling.
    """
    # |---------- N-1 iteration --------|
    # |---------------- N iteration ---------------------|
    # |- tokenA -|......................|-- newTokens ---|
    # |---------- context_len ----------|
    # |-------------------- seq_len ----------------------|
    #                                   |-- query_len ---|

    # Sequence ids for the sequence group in a previous step.
    seq_ids: List[int]
    sampling_params: SamplingParams
    # seq_id -> sequence data.
    seq_data: Dict[int, SequenceData]
    # The length of the sequence (all tokens seen in the past + new token to
    # compute attention) of the sequence group. None if it is in a decode
    # stage.
    seq_len: Optional[int]
    # The length of new query tokens to compute in the current step. None if it
    # is in a decode stage. The length of query_len <= seq_len if chunked
    # prefill is enabled.
    query_len: Optional[int]
    # A random number generator for sampling.
    generator: Optional[torch.Generator]
    # True if the sequence group is in prefill stage. False if it is in a
    # decode stage.
    is_prompt: bool
    # Query token indices from logits. to compute prompt logprob. Empty if
    # prompt logprob is not required.
    prompt_logprob_indices: List[int]
    # Sample token indices from logits. Empty if sampling is not required.
    sample_indices: List[int]

    @property
    def do_sample(self) -> bool:
        """Whether this group has at least one index to sample from."""
        return len(self.sample_indices) > 0

    def __post_init__(self) -> None:
        """Check that the fields are mutually consistent."""
        # Prompt logprob indices only make sense when prompt logprobs were
        # requested in the sampling params.
        if len(self.prompt_logprob_indices) > 0:
            assert self.sampling_params.prompt_logprobs is not None
        # Both lengths must be known during prefill.
        if self.is_prompt:
            assert self.seq_len is not None
            assert self.query_len is not None
63
64


65
66
67
class SamplingMetadata:
    """Metadata for input sequences. Used in sampler.

    The usage is as follows:
    ```
    hidden_states = execute_model(...)
    logits = hidden_states[sampling_metadata.selected_token_indices]
    sample(logits)

    def sample(logits):
        # Use categorized_sample_indices for sampling....
    ```

    Args:
        seq_groups: List of batched sequence groups.
        selected_token_indices: (num_query_tokens_to_logprob). Indices to find
            logits from the initial model output hidden states.
        categorized_sample_indices: SamplingType -> token indices to sample.
            Each value is built from a list of (logit index, sample index)
            pairs, one pair per token to sample. The first item is the
            position of the token within the logits pruned by
            selected_token_indices (which also contain prompt logprob
            positions); the second item is its position counting sampled
            tokens only.
            For example, if a batch holds one group with two prompt logprob
            positions followed by one sampled token, the pruned logits have
            three rows and the single pair is (2, 0).
        num_prompts: Number of prompt sequence groups in seq_groups.
        skip_sampler_cpu_output: Indicates if we want to skip the GPU=>CPU
            serialization of token outputs.
        reuse_sampling_tensors: Indicates if we want to reuse sampling
            tensors that are part of the sampler forward pass. Currently,
            it is mainly used for multi-step decode.
    """

    def __init__(
        self,
        seq_groups: List[SequenceGroupToSample],
        selected_token_indices: torch.Tensor,
        categorized_sample_indices: Dict[SamplingType, torch.Tensor],
        num_prompts: int,
        skip_sampler_cpu_output: bool = False,
        reuse_sampling_tensors: bool = False,
    ) -> None:
        self.seq_groups = seq_groups
        self.selected_token_indices = selected_token_indices
        self.categorized_sample_indices = categorized_sample_indices
        self.num_prompts = num_prompts
        self.skip_sampler_cpu_output = skip_sampler_cpu_output
        self.reuse_sampling_tensors = reuse_sampling_tensors

    @staticmethod
    def prepare(
        seq_group_metadata_list: List[SequenceGroupMetadata],
        seq_lens: List[int],
        query_lens: Optional[List[int]],
        device: str,
        pin_memory: bool,
        generators: Optional[Dict[str, torch.Generator]] = None,
    ) -> "SamplingMetadata":
        """Build a `SamplingMetadata` for a batch of sequence groups.

        The index lists computed by `_prepare_seq_groups` are converted to
        tensors targeting `device` via `async_tensor_h2d`.

        Args:
            seq_group_metadata_list: Sequence groups in the batch.
            seq_lens: Sequence length per sequence group.
            query_lens: Query length per sequence group.
            device: Target device for the index tensors and for any
                per-request generator created for seeded requests.
            pin_memory: Forwarded to `async_tensor_h2d`.
            generators: Optional store of per-request generators, keyed by
                request id, shared across steps for seeded requests.

        Returns:
            A `SamplingMetadata` with `skip_sampler_cpu_output` and
            `reuse_sampling_tensors` left at their defaults.
        """
        (
            seq_groups,
            selected_token_indices,
            categorized_sample_indices,
            num_prompts,
        ) = _prepare_seq_groups(seq_group_metadata_list, seq_lens, query_lens,
                                device, generators)
        selected_token_indices = async_tensor_h2d(selected_token_indices,
                                                  dtype=torch.long,
                                                  target_device=device,
                                                  pin_memory=pin_memory)
        # Each value is a list of (logit index, sample index) pairs;
        # maybe_expand_dim(..., 2, 2) presumably keeps an empty list
        # 2-dimensional as well -- confirm against vllm.utils.
        categorized_sample_indices = {
            sampling_type: maybe_expand_dim(
                async_tensor_h2d(index_pairs,
                                 dtype=torch.int,
                                 target_device=device,
                                 pin_memory=pin_memory), 2, 2)
            for sampling_type, index_pairs in
            categorized_sample_indices.items()
        }

        return SamplingMetadata(
            seq_groups=seq_groups,
            selected_token_indices=selected_token_indices,
            categorized_sample_indices=categorized_sample_indices,
            num_prompts=num_prompts,
        )

    def __repr__(self) -> str:
        return (
            "SamplingMetadata("
            f"seq_groups={self.seq_groups}, "
            f"selected_token_indices={self.selected_token_indices}, "
            f"categorized_sample_indices={self.categorized_sample_indices}), ")


def _prepare_seq_groups(
    seq_group_metadata_list: List[SequenceGroupMetadata],
    seq_lens: List[int],
    query_lens: Optional[List[int]],
    device: str,
    generators: Optional[Dict[str, torch.Generator]] = None,
) -> Tuple[List[SequenceGroupToSample], List[int], Dict[
        SamplingType, List[Tuple[int, int]]], int]:
    """Prepare sequence groups and indices for sampling.

    Args:
        seq_group_metadata_list: A list of sequence group to batch.
        seq_lens: A list of sequence lens per sequence group.
            Index of prompt len should match with seq_group_metadata_list.
        query_lens: A list of query lengths. Prompt lens include the length
            of entire prompt tokens, and it could be shorter.
        device: A device to use for random number generators,
            `SequenceGroupToSample.generator`.
        generators: A store of per-request random number generators used
            for seeded requests.

    Returns:
        seq_groups: A list of sequence group to sample.
        selected_token_indices: See the definition from `SamplingMetadata`.
        categorized_sample_indices: See the definition from `SamplingMetadata`.
        num_prompts: Total number of prompts from `seq_group_metadata_list`.
    """
    # Batched sequence groups for the current model forward step.
    seq_groups: List[SequenceGroupToSample] = []
    # A list of token indices to sample/compute logprob. It is used to
    # prune the outcome logits from the model for the performance.
    selected_token_indices: List[int] = []
    # Used for selected_token_indices.
    model_output_idx = 0

    # Sampling type -> (
    # indices to sample/prompt logprob within pruned output logits,
    # indices to sample within pruned logits)
    categorized_sample_indices: Dict[SamplingType, List[Tuple[int, int]]] = {
        t: []
        for t in SamplingType
    }
    # Index of logits to compute logprob. Logits include both prompt logprob
    # and sample logprob indices.
    logit_idx = 0
    # Index to sample from a sample tensor. It is used by triton sample kernel.
    # See `_sample_with_triton_kernel` for more details.
    sample_idx = 0
    # Total number of prompts from given sequence groups.
    num_prompts = 0

    for i, seq_group_metadata in enumerate(seq_group_metadata_list):
        seq_ids = list(seq_group_metadata.seq_data.keys())
        sampling_params = seq_group_metadata.sampling_params
        is_prompt = seq_group_metadata.is_prompt
        generator: Optional[torch.Generator] = None
        # If the current seq group is in decode stage, it is None.
        seq_len: Optional[int] = None
        query_len: Optional[int] = None
        prompt_logprob_indices: List[int] = []
        sample_indices: List[int] = []
        do_sample = seq_group_metadata.do_sample

        if is_prompt:
            if sampling_params.seed is not None:
                # Seeded request: create its generator now and, when a store
                # is provided, keep it there so decode steps can look it up.
                generator = torch.Generator(device=device).manual_seed(
                    sampling_params.seed)
                if generators is not None:
                    generators[seq_group_metadata.request_id] = generator

            num_prompts += 1
            num_prefill_sample = len(seq_ids)
            assert num_prefill_sample == 1
            assert query_lens is not None and seq_lens is not None
            query_len, seq_len = query_lens[i], seq_lens[i]
            # If we need sampling, exclude num_prefill_sample tokens from
            # prompt logprob.
            prompt_logprob_len = (query_len - num_prefill_sample
                                  if do_sample else query_len)
            sample_len = num_prefill_sample if do_sample else 0
        else:
            # Decode
            prompt_logprob_len = 0
            sample_len = len(seq_ids) if do_sample else 0

            if sampling_params.seed is not None and generators is not None:
                generator = generators.get(seq_group_metadata.request_id)

        # Update indices to select from the model output.
        #
        # This block computes selected_token_indices which is used in the
        # following way.
        #
        # hidden_states = model(...)
        # logits = hidden_states[selected_token_indices]
        wants_prompt_logprobs = sampling_params.prompt_logprobs is not None

        if wants_prompt_logprobs:
            selected_token_indices.extend(
                range(model_output_idx, model_output_idx + prompt_logprob_len))
        # The model output position advances past the prompt tokens even when
        # their logprobs are not selected.
        model_output_idx += prompt_logprob_len
        if do_sample:
            selected_token_indices.extend(
                range(model_output_idx, model_output_idx + sample_len))
        model_output_idx += sample_len

        # We now find indices for logprob computation and sampling.
        #
        # This block computes categorized_sample_indices which is used in the
        # following way.
        #
        # hidden_states = model(...)
        # logits = hidden_states[selected_token_indices]
        # def sample(logits):
        #    # Use categorized_sample_indices for sampling.
        #    # prompt_logprob_indices to find prompt logprob indices.
        #    # sample_indices to find sample indices.
        if wants_prompt_logprobs:
            prompt_logprob_indices.extend(
                range(logit_idx, logit_idx + prompt_logprob_len))
            logit_idx += prompt_logprob_len
        if do_sample:
            sample_indices.extend(range(logit_idx, logit_idx + sample_len))
            categorized_sample_indices[sampling_params.sampling_type].extend(
                zip(range(logit_idx, logit_idx + sample_len),
                    range(sample_idx, sample_idx + sample_len)))
            logit_idx += sample_len
            sample_idx += sample_len

        seq_groups.append(
            SequenceGroupToSample(
                seq_ids=seq_ids,
                sampling_params=sampling_params,
                seq_data=seq_group_metadata.seq_data,
                seq_len=seq_len,
                query_len=query_len,
                generator=generator,
                is_prompt=is_prompt,
                prompt_logprob_indices=prompt_logprob_indices,
                sample_indices=sample_indices))
    return (seq_groups, selected_token_indices, categorized_sample_indices,
            num_prompts)
306
307
308
309
310
311
312
313
314
315
316
317
318


@dataclass
class SamplingTensors:
    """Tensors for sampling."""

    temperatures: torch.Tensor
    top_ps: torch.Tensor
    top_ks: torch.Tensor
    min_ps: torch.Tensor
    presence_penalties: torch.Tensor
    frequency_penalties: torch.Tensor
    repetition_penalties: torch.Tensor
    sampling_seeds: torch.Tensor
    sample_indices: torch.Tensor
    extra_seeds: Optional[torch.Tensor]
    prompt_tokens: torch.Tensor
    output_tokens: torch.Tensor

    @classmethod
    def from_sampling_metadata(
        cls,
        sampling_metadata: "SamplingMetadata",
        vocab_size: int,
        device: torch.device,
        dtype: torch.dtype,
        *,
        extra_seeds_to_generate: int = 0,
        extra_entropy: Optional[Tuple[int, ...]] = None
    ) -> Tuple["SamplingTensors", bool, bool, bool]:
        """Build sampling tensors from the per-group sampling parameters.

        One entry is emitted per prompt logprob position (only for prompt
        groups that requested prompt logprobs) and one per sequence that is
        sampled.

        Args:
            sampling_metadata: Metadata holding the sequence groups.
            vocab_size: Vocabulary size; top_k is clamped to it.
            device: Device the resulting tensors are moved to.
            dtype: dtype of the floating point parameter tensors.
            extra_seeds_to_generate: extra seeds to generate using the
                user-defined seed for each sequence.
            extra_entropy: extra entropy to use when generating seeds.

        Returns:
            A tuple `(sampling_tensors, do_penalties, do_top_p_top_k,
            do_min_p)`; each flag is True when at least one sequence group
            uses a non-neutral value for the corresponding parameters.
        """
        prompt_tokens: List[array] = []
        output_tokens: List[array] = []
        top_ks: List[int] = []
        temperatures: List[float] = []
        top_ps: List[float] = []
        min_ps: List[float] = []
        presence_penalties: List[float] = []
        frequency_penalties: List[float] = []
        repetition_penalties: List[float] = []
        # One list of seeds per sampled sequence.
        sampling_seeds: List[List[int]] = []
        sample_indices: List[int] = []
        do_penalties = False
        do_top_p_top_k = False
        do_min_p = False

        if _USE_TRITON_SAMPLER:
            prompt_best_of: List[int] = []

            # We need one base seed per Triton slice.
            seeds_to_generate = (extra_seeds_to_generate +
                                 get_num_triton_sampler_splits(vocab_size))
            # Loop-invariant: normalise once instead of per sequence.
            extra_entropy = extra_entropy or ()

        assert sampling_metadata.seq_groups is not None
        for seq_group in sampling_metadata.seq_groups:
            seq_ids = seq_group.seq_ids
            sampling_params = seq_group.sampling_params
            temperature = sampling_params.temperature
            p = sampling_params.presence_penalty
            f = sampling_params.frequency_penalty
            r = sampling_params.repetition_penalty
            top_p = sampling_params.top_p
            min_p = sampling_params.min_p

            # k should not be greater than the vocab size.
            top_k = min(sampling_params.top_k, vocab_size)
            top_k = vocab_size if top_k == -1 else top_k
            if temperature < _SAMPLING_EPS:
                # NOTE: Zero temperature means deterministic sampling
                # (i.e., greedy sampling or beam search).
                # Set the temperature to 1 to avoid division by zero.
                temperature = 1.0
            if not do_top_p_top_k and (top_p < 1.0 - _SAMPLING_EPS
                                       or top_k != vocab_size):
                do_top_p_top_k = True
            if not do_min_p and min_p > _SAMPLING_EPS:
                do_min_p = True
            if not do_penalties and (abs(p) >= _SAMPLING_EPS
                                     or abs(f) >= _SAMPLING_EPS
                                     or abs(r - 1.0) >= _SAMPLING_EPS):
                do_penalties = True

            is_prompt = seq_group.is_prompt
            if (is_prompt and sampling_params.prompt_logprobs is not None):
                # For tokens in the prompt that we only need to get
                # their logprobs
                query_len = seq_group.query_len
                assert query_len is not None
                prefill_len = len(seq_group.prompt_logprob_indices)
                temperatures += [temperature] * prefill_len
                top_ps += [top_p] * prefill_len
                top_ks += [top_k] * prefill_len
                min_ps += [min_p] * prefill_len
                # Prompt logprob positions get neutral penalties.
                presence_penalties += [0] * prefill_len
                frequency_penalties += [0] * prefill_len
                repetition_penalties += [1] * prefill_len

            if seq_group.do_sample:
                sample_lens = len(seq_group.sample_indices)
                assert sample_lens == len(seq_ids)
                temperatures += [temperature] * len(seq_ids)
                top_ps += [top_p] * len(seq_ids)
                top_ks += [top_k] * len(seq_ids)
                min_ps += [min_p] * len(seq_ids)
                presence_penalties += [p] * len(seq_ids)
                frequency_penalties += [f] * len(seq_ids)
                repetition_penalties += [r] * len(seq_ids)

            if _USE_TRITON_SAMPLER:
                if is_prompt:
                    prompt_best_of.append(sampling_params.best_of)
                    query_len = seq_group.query_len
                    assert query_len is not None

                seed = sampling_params.seed
                is_greedy = sampling_params.sampling_type == SamplingType.GREEDY

                for seq_id in seq_ids:
                    seq_data = seq_group.seq_data[seq_id]
                    seq_seeds = cls._get_sequence_seeds(
                        seed,
                        seq_data.get_len(),
                        *extra_entropy,
                        seq_id,
                        seeds_to_generate=seeds_to_generate,
                        is_greedy=is_greedy)
                    sampling_seeds.append(seq_seeds)
                sample_indices.extend(seq_group.sample_indices)

        if do_penalties:
            for seq_group in sampling_metadata.seq_groups:
                seq_ids = seq_group.seq_ids
                # Read this group's own params. Reusing the variable left
                # over from the loop above would apply the last group's
                # prompt_logprobs setting to every group and misalign the
                # token rows with the per-row parameter lists.
                sampling_params = seq_group.sampling_params
                if (seq_group.is_prompt
                        and sampling_params.prompt_logprobs is not None):
                    # Empty placeholder rows for prompt logprob positions,
                    # matching the neutral penalties emitted above.
                    prefill_len = len(seq_group.prompt_logprob_indices)
                    prompt_tokens.extend(
                        array('l') for _ in range(prefill_len))
                    output_tokens.extend(
                        array('l') for _ in range(prefill_len))
                if seq_group.do_sample:
                    for seq_id in seq_ids:
                        seq_data = seq_group.seq_data[seq_id]
                        prompt_tokens.append(seq_data.prompt_token_ids_array)
                        output_tokens.append(seq_data.output_token_ids_array)

        sampling_tensors = SamplingTensors.from_lists(
            temperatures, top_ps, top_ks, min_ps, presence_penalties,
            frequency_penalties, repetition_penalties, sampling_seeds,
            sample_indices, prompt_tokens, output_tokens, vocab_size,
            extra_seeds_to_generate, device, dtype)
        return (sampling_tensors, do_penalties, do_top_p_top_k, do_min_p)

    @classmethod
    def from_lists(cls, temperatures: List[float], top_ps: List[float],
                   top_ks: List[int], min_ps: List[float],
                   presence_penalties: List[float],
                   frequency_penalties: List[float],
                   repetition_penalties: List[float],
                   sampling_seeds: List[List[int]],
                   sample_indices: List[int],
                   prompt_tokens: List[array], output_tokens: List[array],
                   vocab_size: int, extra_seeds_to_generate: int,
                   device: torch.device,
                   dtype: torch.dtype) -> "SamplingTensors":
        """Convert per-row Python lists into tensors on `device`.

        Tensors are first created on the CPU (pinned when
        `is_pin_memory_available()` says so) and then moved to `device`
        with non-blocking copies.

        Args:
            temperatures, top_ps, top_ks, min_ps, presence_penalties,
            frequency_penalties, repetition_penalties: One value per row.
            sampling_seeds: One list of seeds per sampled sequence; the last
                `extra_seeds_to_generate` seeds of each list are split off
                into `extra_seeds`.
            sample_indices: Indices of the rows to sample.
            prompt_tokens, output_tokens: Token id arrays per row; when both
                are empty, empty tensors are used instead of padded ones.
            vocab_size: Passed to `make_tensor_with_pad` as its second
                positional argument (presumably the pad value -- confirm
                against vllm.utils).
            extra_seeds_to_generate: Number of extra seeds per sequence.
            device: Target device.
            dtype: dtype of the floating point parameter tensors.
        """
        # Note that the performance will be very bad without
        # pinned memory.
        pin_memory = is_pin_memory_available()

        def pinned(values, tensor_dtype: torch.dtype) -> torch.Tensor:
            # CPU staging tensor for a subsequent host-to-device copy.
            return torch.tensor(
                values,
                device="cpu",
                dtype=tensor_dtype,
                pin_memory=pin_memory,
            )

        def to_device(tensor: torch.Tensor) -> torch.Tensor:
            # Because the memory is pinned, we can do non-blocking
            # transfer to device.
            return tensor.to(device=device, non_blocking=True)

        do_penalties = prompt_tokens or output_tokens

        if do_penalties:
            prompt_t = make_tensor_with_pad(
                prompt_tokens,
                vocab_size,
                device="cpu",
                dtype=torch.int64,
                pin_memory=pin_memory,
            )
            output_t = make_tensor_with_pad(
                output_tokens,
                vocab_size,
                device="cpu",
                dtype=torch.int64,
                pin_memory=pin_memory,
            )
        else:
            empty_tensor = torch.empty(0, device=device, dtype=torch.long)
            prompt_t = empty_tensor
            output_t = empty_tensor

        temperatures_t = pinned(temperatures, dtype)
        top_ps_t = pinned(top_ps, dtype)
        min_ps_t = pinned(min_ps, dtype)
        presence_penalties_t = pinned(presence_penalties, dtype)
        frequency_penalties_t = pinned(frequency_penalties, dtype)
        repetition_penalties_t = pinned(repetition_penalties, dtype)
        top_ks_t = pinned(top_ks, torch.int)
        sample_indices_t = pinned(sample_indices, torch.long)
        # need to transpose and make contiguous to
        # copy the tensor correctly.
        # [batch_size, n_seeds] -> [n_seeds, batch_size]
        sampling_seeds_t = pinned(sampling_seeds, torch.long).t().contiguous()

        # How many seeds the sample operation itself will need.
        num_base_seeds = sampling_seeds_t.shape[0] - extra_seeds_to_generate
        sampling_seeds_gpu = to_device(sampling_seeds_t)
        extra_seeds_gpu = sampling_seeds_gpu[num_base_seeds:]
        if not extra_seeds_gpu.numel():
            extra_seeds_gpu = None
        sampling_seeds_gpu = sampling_seeds_gpu[:num_base_seeds]

        return cls(
            temperatures=to_device(temperatures_t),
            top_ps=to_device(top_ps_t),
            top_ks=to_device(top_ks_t),
            min_ps=to_device(min_ps_t),
            presence_penalties=to_device(presence_penalties_t),
            frequency_penalties=to_device(frequency_penalties_t),
            repetition_penalties=to_device(repetition_penalties_t),
            prompt_tokens=to_device(prompt_t),
            output_tokens=to_device(output_t),
            sampling_seeds=sampling_seeds_gpu,
            sample_indices=to_device(sample_indices_t),
            extra_seeds=extra_seeds_gpu,
        )

    @staticmethod
    def _get_sequence_seeds(
        seed: Optional[int],
        *extra_entropy: int,
        seeds_to_generate: int,
        is_greedy: bool,
    ) -> List[int]:
        """Get `seeds_to_generate` child seeds from `seed` and extra entropy.

        Args:
            seed: User-provided seed, or None to draw from the module-level
                `random` generator.
            *extra_entropy: Extra integers mixed into the seed when `seed`
                is given.
            seeds_to_generate: Number of seeds to return.
            is_greedy: When True, all returned seeds are 0.

        Returns:
            A list of `seeds_to_generate` integers.
        """
        if is_greedy:
            # For the kernel, seed == 0 means greedy decoding.
            return [0] * seeds_to_generate

        if seed is None:
            randint_fn = random.randint
        else:
            # A dedicated generator keyed on (seed, *extra_entropy) makes the
            # child seeds reproducible for the same inputs.
            generator = random.Random(str((seed, ) + extra_entropy))
            randint_fn = generator.randint
        lo, hi = torch.iinfo(torch.long).min, torch.iinfo(torch.long).max
        # If the user/random sets seed = 0 but request should
        # have sampling, we need to change it to something
        # else. We use a constant in that case.
        # This way we don't need to create and load a bool
        # matrix in the sampling kernel, which reduces CPU
        # overhead and latency.
        return [
            randint_fn(lo, hi) or _SEED_0_REPLACEMENT
            for _ in range(seeds_to_generate)
        ]