"vllm/vscode:/vscode.git/clone" did not exist on "d830656a9722bfc719426ce6bdd13b3d9d456304"
serving.py 28 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
import asyncio
5
import time
6
7
from collections.abc import AsyncGenerator, AsyncIterator
from collections.abc import Sequence as GenericSequence
8
from typing import cast
9

10
import jinja2
11
from fastapi import Request
12

13
from vllm.engine.protocol import EngineClient
14
from vllm.entrypoints.logger import RequestLogger
15
from vllm.entrypoints.openai.completion.protocol import (
16
17
18
19
20
21
    CompletionLogProbs,
    CompletionRequest,
    CompletionResponse,
    CompletionResponseChoice,
    CompletionResponseStreamChoice,
    CompletionStreamResponse,
22
23
)
from vllm.entrypoints.openai.engine.protocol import (
24
25
26
27
28
    ErrorResponse,
    PromptTokenUsageInfo,
    RequestResponseMetadata,
    UsageInfo,
)
29
from vllm.entrypoints.openai.engine.serving import (
30
31
32
33
    GenerationError,
    OpenAIServing,
    clamp_prompt_logprobs,
)
34
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
35
from vllm.entrypoints.utils import get_max_tokens, should_include_usage
36
from vllm.exceptions import VLLMValidationError
37
from vllm.inputs.data import EmbedsPrompt, TokensPrompt, is_embeds_prompt
38
from vllm.inputs.parse import get_prompt_components
39
from vllm.logger import init_logger
40
from vllm.logprobs import Logprob
41
from vllm.outputs import RequestOutput
42
from vllm.sampling_params import BeamSearchParams, SamplingParams
43
from vllm.tokenizers import TokenizerLike
44
45
from vllm.utils.async_utils import merge_async_iterators
from vllm.utils.collection_utils import as_list
46
from vllm.v1.sample.logits_processor import validate_logits_processors_parameters
47
48
49
50
51

logger = init_logger(__name__)


class OpenAIServingCompletion(OpenAIServing):
52
53
    def __init__(
        self,
        engine_client: EngineClient,
        models: OpenAIServingModels,
        *,
        request_logger: RequestLogger | None,
        return_tokens_as_token_ids: bool = False,
        enable_prompt_tokens_details: bool = False,
        enable_force_include_usage: bool = False,
        log_error_stack: bool = False,
    ):
        """Configure the completions handler on top of the shared serving base.

        Args:
            engine_client: Engine client forwarded to the base class.
            models: Model registry forwarded to the base class.
            request_logger: Optional request logger forwarded to the base
                class.
            return_tokens_as_token_ids: Server-wide default for rendering
                logprob tokens as ``token_id:<id>``; forwarded to the base
                class.
            enable_prompt_tokens_details: When true, usage blocks may carry
                the cached-prompt-token count.
            enable_force_include_usage: Passed to ``should_include_usage``
                when deciding whether streamed responses report usage.
            log_error_stack: Forwarded to the base class.
        """
        super().__init__(
            engine_client=engine_client,
            models=models,
            request_logger=request_logger,
            return_tokens_as_token_ids=return_tokens_as_token_ids,
            log_error_stack=log_error_stack,
        )

        # Usage-reporting switches consulted while building responses.
        self.enable_force_include_usage = enable_force_include_usage
        self.enable_prompt_tokens_details = enable_prompt_tokens_details

        # NOTE(review): `self.model_config` is presumably populated by the
        # base-class initializer above -- confirm against OpenAIServing.
        model_config = self.model_config
        # Logits processors checked against each request's sampling params.
        self.logits_processors = model_config.logits_processors
        # Sampling defaults applied when deriving per-request parameters.
        self.default_sampling_params = model_config.get_diff_sampling_param()
78

79
    async def render_completion_request(
        self,
        request: CompletionRequest,
    ) -> list[TokensPrompt | EmbedsPrompt] | ErrorResponse:
        """
        Validate a completion request and preprocess its prompt inputs.

        Returns:
            The list of engine prompts when every check passes, otherwise
            the ErrorResponse for the first check that failed.

        Raises:
            The engine client's ``dead_error`` when the engine has errored.
        """
        model_error = await self._check_model(request)
        if model_error is not None:
            return model_error

        # A dead engine has to surface as an exception rather than an error
        # payload: in the streaming case a success status is sent before any
        # text is generated.
        if self.engine_client.errored:
            raise self.engine_client.dead_error

        # Reject unsupported features / combinations; the first match wins.
        rejection: str | None = None
        if request.suffix is not None:
            rejection = "suffix is not currently supported"
        elif request.echo and request.prompt_embeds is not None:
            rejection = "Echo is unsupported with prompt embeds."
        elif request.prompt_logprobs is not None and request.prompt_embeds is not None:
            rejection = "prompt_logprobs is not compatible with prompt embeds."

        if rejection is not None:
            return self.create_error_response(rejection)

        try:
            return await self._preprocess_completion(
                request,
                prompt_input=request.prompt,
                prompt_embeds=request.prompt_embeds,
            )
        except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as exc:
            logger.exception("Error in preprocessing prompt inputs")
            return self.create_error_response(exc)

    async def create_completion(
        self,
        request: CompletionRequest,
        raw_request: Request | None = None,
    ) -> AsyncGenerator[str, None] | CompletionResponse | ErrorResponse:
        """Serve one request in the style of OpenAI's Completions API.

        API specification:
        https://platform.openai.com/docs/api-reference/completions/create

        The request is rendered into engine prompts, one result generator is
        scheduled per prompt, and the merged results are returned either as
        a server-sent-event stream or as a single CompletionResponse.

        NOTE: ``suffix`` is not supported (the language models currently
        supported do not support it).

        Returns:
            An async generator of SSE strings when ``request.stream`` is set,
            a CompletionResponse otherwise, or an ErrorResponse on failure.
        """
        rendered = await self.render_completion_request(request)
        if isinstance(rendered, ErrorResponse):
            return rendered
        engine_prompts = rendered

        request_id = f"cmpl-{self._base_request_id(raw_request, request.request_id)}"
        created_time = int(time.time())

        request_metadata = RequestResponseMetadata(request_id=request_id)
        if raw_request:
            # Exposed on the request state so callers holding the raw request
            # can read the metadata (usage is attached to it later).
            raw_request.state.request_metadata = request_metadata

        try:
            lora_request = self._maybe_get_adapters(request)
        except (ValueError, TypeError, RuntimeError) as exc:
            logger.exception("Error preparing request components")
            return self.create_error_response(exc)

        # Data-parallel rank taken from a header (a router can inject it).
        data_parallel_rank = self._get_data_parallel_rank(raw_request)

        # Schedule one generator per prompt.
        generators: list[AsyncGenerator[RequestOutput, None]] = []
        try:
            for prompt_index, engine_prompt in enumerate(engine_prompts):
                prompt_text, _, _ = get_prompt_components(engine_prompt)

                max_tokens = get_max_tokens(
                    max_model_len=self.max_model_len,
                    request=request,
                    prompt=engine_prompt,
                    default_sampling_params=self.default_sampling_params,
                )

                sampling_params: SamplingParams | BeamSearchParams
                if request.use_beam_search:
                    sampling_params = request.to_beam_search_params(
                        max_tokens, self.default_sampling_params
                    )
                else:
                    sampling_params = request.to_sampling_params(
                        max_tokens,
                        self.model_config.logits_processor_pattern,
                        self.default_sampling_params,
                    )
                    validate_logits_processors_parameters(
                        self.logits_processors,
                        sampling_params,
                    )

                # Every prompt of a batched request gets its own id.
                request_id_item = f"{request_id}-{prompt_index}"

                self._log_inputs(
                    request_id_item,
                    engine_prompt,
                    params=sampling_params,
                    lora_request=lora_request,
                )

                if raw_request is None:
                    trace_headers = None
                else:
                    trace_headers = await self._get_trace_headers(raw_request.headers)

                if isinstance(sampling_params, BeamSearchParams):
                    # NOTE(review): beam search receives the shared
                    # `request_id`, not `request_id_item` -- confirm intended.
                    generator = self.beam_search(
                        prompt=engine_prompt,
                        request_id=request_id,
                        params=sampling_params,
                        lora_request=lora_request,
                        trace_headers=trace_headers,
                    )
                else:
                    tok_params = request.build_tok_params(self.model_config)
                    tokenization_kwargs = tok_params.get_encode_kwargs()

                    engine_request = self.input_processor.process_inputs(
                        request_id_item,
                        engine_prompt,
                        sampling_params,
                        lora_request=lora_request,
                        tokenization_kwargs=tokenization_kwargs,
                        trace_headers=trace_headers,
                        priority=request.priority,
                        data_parallel_rank=data_parallel_rank,
                    )

                    generator = self.engine_client.generate(
                        engine_request,
                        sampling_params,
                        request_id_item,
                        lora_request=lora_request,
                        trace_headers=trace_headers,
                        priority=request.priority,
                        prompt_text=prompt_text,
                        tokenization_kwargs=tokenization_kwargs,
                        data_parallel_rank=data_parallel_rank,
                    )

                generators.append(generator)
        except ValueError as exc:
            return self.create_error_response(exc)

        result_generator = merge_async_iterators(*generators)

        model_name = self.models.model_name(lora_request)
        num_prompts = len(engine_prompts)

        # Beam search results are never streamed incrementally.
        stream = request.stream and not request.use_beam_search

        tokenizer = self.renderer.tokenizer

        # Streaming response
        if stream:
            return self.completion_stream_generator(
                request,
                engine_prompts,
                result_generator,
                request_id,
                created_time,
                model_name,
                num_prompts=num_prompts,
                tokenizer=tokenizer,
                request_metadata=request_metadata,
            )

        # Non-streaming response: keep the last output seen for each prompt.
        final_res_batch: list[RequestOutput | None] = [None] * num_prompts
        try:
            async for slot, res in result_generator:
                final_res_batch[slot] = res

            for slot, final_res in enumerate(final_res_batch):
                assert final_res is not None

                # The output should contain the input text. It was not passed
                # into the engine to avoid being redundant with the input
                # token IDs, so restore it from the engine prompt here.
                if final_res.prompt is not None:
                    continue
                source_prompt = engine_prompts[slot]
                if is_embeds_prompt(source_prompt):
                    final_res.prompt = None
                else:
                    final_res.prompt = source_prompt.get("prompt")

            final_res_batch_checked = cast(list[RequestOutput], final_res_batch)

            response = self.request_output_to_completion_response(
                final_res_batch_checked,
                request,
                request_id,
                created_time,
                model_name,
                tokenizer,
                request_metadata,
            )
        except asyncio.CancelledError:
            return self.create_error_response("Client disconnected")
        except GenerationError as exc:
            return self._convert_generation_error_to_response(exc)
        except ValueError as exc:
            return self.create_error_response(exc)

        if not request.stream:
            return response

        # The client asked for streaming but the result was not streamed
        # (beam search): wrap the full response in a single-event stream.
        response_json = response.model_dump_json()

        async def single_event_stream() -> AsyncGenerator[str, None]:
            yield f"data: {response_json}\n\n"
            yield "data: [DONE]\n\n"

        return single_event_stream()
317
318
319
320

    async def completion_stream_generator(
        self,
        request: CompletionRequest,
        engine_prompts: list[TokensPrompt | EmbedsPrompt],
        result_generator: AsyncIterator[tuple[int, RequestOutput]],
        request_id: str,
        created_time: int,
        model_name: str,
        num_prompts: int,
        tokenizer: TokenizerLike | None,
        request_metadata: RequestResponseMetadata,
    ) -> AsyncGenerator[str, None]:
        """Stream completion chunks as server-sent events.

        Each engine output becomes one ``data: <json>\\n\\n`` event per choice.
        An optional usage-only event follows, and the stream always ends with
        ``data: [DONE]\\n\\n``. Errors raised while streaming are reported as
        an error event instead of propagating.

        Args:
            request: The completion request being served.
            engine_prompts: Engine prompts, indexed by prompt position; used
                to recover prompt text missing from an engine output.
            result_generator: Yields ``(prompt_index, RequestOutput)`` pairs.
            request_id: Id reported on every chunk.
            created_time: Creation timestamp reported on every chunk.
            model_name: Model name reported on every chunk.
            num_prompts: Number of prompts in the request.
            tokenizer: Tokenizer used to decode logprob tokens, if any.
            request_metadata: Receives the aggregate usage once the stream
                has completed normally.

        Yields:
            SSE-framed strings.
        """
        num_choices = 1 if request.n is None else request.n
        # Per-choice state; a choice lives at
        # `output.index + prompt_idx * num_choices`.
        num_slots = num_choices * num_prompts
        previous_text_lens = [0] * num_slots
        previous_num_tokens = [0] * num_slots
        has_echoed = [False] * num_slots
        num_prompt_tokens = [0] * num_prompts
        num_cached_tokens = None
        first_iteration = True

        stream_options = request.stream_options
        include_usage, include_continuous_usage = should_include_usage(
            stream_options, self.enable_force_include_usage
        )

        try:
            async for prompt_idx, res in result_generator:
                prompt_token_ids = res.prompt_token_ids
                prompt_logprobs = res.prompt_logprobs

                # Cached-token count is taken from the first output only.
                if first_iteration:
                    num_cached_tokens = res.num_cached_tokens
                    first_iteration = False

                prompt_text = res.prompt
                if prompt_text is None:
                    engine_prompt = engine_prompts[prompt_idx]
                    prompt_text = (
                        None
                        if is_embeds_prompt(engine_prompt)
                        else engine_prompt.get("prompt")
                    )

                # Prompt details are excluded from later streamed outputs
                if prompt_token_ids is not None:
                    num_prompt_tokens[prompt_idx] = len(prompt_token_ids)

                delta_token_ids: GenericSequence[int]
                out_logprobs: GenericSequence[dict[int, Logprob] | None] | None

                for output in res.outputs:
                    i = output.index + prompt_idx * num_choices

                    # Useful when request.return_token_ids is True
                    # Returning prompt token IDs shares the same logic
                    # with the echo implementation.
                    prompt_token_ids_to_return: list[int] | None = None

                    assert request.max_tokens is not None
                    if request.echo and not has_echoed[i]:
                        assert prompt_token_ids is not None
                        if request.return_token_ids:
                            prompt_text = ""
                        assert prompt_text is not None
                        if request.max_tokens == 0:
                            # only return the prompt
                            delta_text = prompt_text
                            delta_token_ids = prompt_token_ids
                            out_logprobs = prompt_logprobs
                        else:
                            # echo the prompt and first token
                            delta_text = prompt_text + output.text
                            delta_token_ids = [
                                *prompt_token_ids,
                                *output.token_ids,
                            ]
                            out_logprobs = [
                                *(prompt_logprobs or []),
                                *(output.logprobs or []),
                            ]
                        prompt_token_ids_to_return = prompt_token_ids
                        has_echoed[i] = True
                    else:
                        # return just the delta
                        delta_text = output.text
                        delta_token_ids = output.token_ids
                        out_logprobs = output.logprobs

                        # has_echoed[i] is reused here to indicate whether
                        # we have already returned the prompt token IDs.
                        if not has_echoed[i] and request.return_token_ids:
                            prompt_token_ids_to_return = prompt_token_ids
                            has_echoed[i] = True

                        if (
                            not delta_text
                            and not delta_token_ids
                            and not previous_num_tokens[i]
                        ):
                            # Chunked prefill case, don't return empty chunks
                            continue

                    if request.logprobs is not None:
                        assert out_logprobs is not None, "Did not output logprobs"
                        logprobs = self._create_completion_logprobs(
                            token_ids=delta_token_ids,
                            top_logprobs=out_logprobs,
                            num_output_top_logprobs=request.logprobs,
                            tokenizer=tokenizer,
                            initial_text_offset=previous_text_lens[i],
                            return_as_token_id=request.return_tokens_as_token_ids,
                        )
                    else:
                        logprobs = None

                    # Advance the text offset by what was actually emitted in
                    # this chunk. With echo the first chunk also carries the
                    # prompt text, so counting only `output.text` would make
                    # later chunks' logprob offsets restart inside the prompt.
                    previous_text_lens[i] += len(delta_text)
                    # Token accounting stays generation-only: it feeds usage.
                    previous_num_tokens[i] += len(output.token_ids)
                    finish_reason = output.finish_reason
                    stop_reason = output.stop_reason

                    self._raise_if_error(finish_reason, request_id)

                    chunk = CompletionStreamResponse(
                        id=request_id,
                        created=created_time,
                        model=model_name,
                        choices=[
                            CompletionResponseStreamChoice(
                                index=i,
                                text=delta_text,
                                logprobs=logprobs,
                                finish_reason=finish_reason,
                                stop_reason=stop_reason,
                                prompt_token_ids=prompt_token_ids_to_return,
                                token_ids=(
                                    as_list(output.token_ids)
                                    if request.return_token_ids
                                    else None
                                ),
                            )
                        ],
                    )
                    if include_continuous_usage:
                        # Per-chunk usage covers this choice only.
                        prompt_tokens = num_prompt_tokens[prompt_idx]
                        completion_tokens = previous_num_tokens[i]
                        chunk.usage = UsageInfo(
                            prompt_tokens=prompt_tokens,
                            completion_tokens=completion_tokens,
                            total_tokens=prompt_tokens + completion_tokens,
                        )

                    response_json = chunk.model_dump_json(exclude_unset=False)
                    yield f"data: {response_json}\n\n"

            total_prompt_tokens = sum(num_prompt_tokens)
            total_completion_tokens = sum(previous_num_tokens)
            final_usage_info = UsageInfo(
                prompt_tokens=total_prompt_tokens,
                completion_tokens=total_completion_tokens,
                total_tokens=total_prompt_tokens + total_completion_tokens,
            )

            if self.enable_prompt_tokens_details and num_cached_tokens:
                final_usage_info.prompt_tokens_details = PromptTokenUsageInfo(
                    cached_tokens=num_cached_tokens
                )

            if include_usage:
                # Usage-only trailer: no choices, just the aggregate usage.
                final_usage_chunk = CompletionStreamResponse(
                    id=request_id,
                    created=created_time,
                    model=model_name,
                    choices=[],
                    usage=final_usage_info,
                )
                final_usage_data = final_usage_chunk.model_dump_json(
                    exclude_unset=False, exclude_none=True
                )
                yield f"data: {final_usage_data}\n\n"

            # report to FastAPI middleware aggregate usage across all choices
            request_metadata.final_usage_info = final_usage_info

        except GenerationError as e:
            yield f"data: {self._convert_generation_error_to_streaming_response(e)}\n\n"
        except Exception as e:
            # Stream boundary: the response has already started, so report
            # the failure in-band instead of raising.
            logger.exception("Error in completion stream generator.")
            data = self.create_streaming_error_response(e)
            yield f"data: {data}\n\n"
        yield "data: [DONE]\n\n"

    def request_output_to_completion_response(
        self,
        final_res_batch: list[RequestOutput],
        request: CompletionRequest,
        request_id: str,
        created_time: int,
        model_name: str,
        tokenizer: TokenizerLike | None,
        request_metadata: RequestResponseMetadata,
    ) -> CompletionResponse:
        """Assemble the non-streaming response from the final engine outputs.

        Args:
            final_res_batch: One final RequestOutput per prompt.
            request: The completion request being served.
            request_id: Id reported on the response.
            created_time: Creation timestamp reported on the response.
            model_name: Model name reported on the response.
            tokenizer: Tokenizer used to decode logprob tokens, if any.
            request_metadata: Receives the aggregate usage.

        Returns:
            A CompletionResponse with one choice per generated output, the
            aggregate usage, and the first result's ``kv_transfer_params``.
        """
        choices: list[CompletionResponseChoice] = []
        num_prompt_tokens = 0
        num_generated_tokens = 0

        for final_res in final_res_batch:
            prompt_token_ids = final_res.prompt_token_ids
            assert prompt_token_ids is not None
            prompt_logprobs = clamp_prompt_logprobs(final_res.prompt_logprobs)
            prompt_text = final_res.prompt

            token_ids: GenericSequence[int]
            out_logprobs: GenericSequence[dict[int, Logprob] | None] | None

            for output in final_res.outputs:
                self._raise_if_error(output.finish_reason, request_id)

                assert request.max_tokens is not None
                if request.echo:
                    if request.return_token_ids:
                        prompt_text = ""
                    assert prompt_text is not None
                    if request.max_tokens == 0:
                        # Echo only: the response is the prompt itself.
                        token_ids = prompt_token_ids
                        out_logprobs = prompt_logprobs
                        output_text = prompt_text
                    else:
                        # Echo plus generation: prompt first, then output.
                        token_ids = [*prompt_token_ids, *output.token_ids]

                        if request.logprobs is None:
                            out_logprobs = None
                        else:
                            assert prompt_logprobs is not None
                            assert output.logprobs is not None
                            out_logprobs = [
                                *prompt_logprobs,
                                *output.logprobs,
                            ]

                        output_text = prompt_text + output.text
                else:
                    token_ids = output.token_ids
                    out_logprobs = output.logprobs
                    output_text = output.text

                if request.logprobs is not None:
                    assert out_logprobs is not None, "Did not output logprobs"
                    logprobs = self._create_completion_logprobs(
                        token_ids=token_ids,
                        top_logprobs=out_logprobs,
                        tokenizer=tokenizer,
                        num_output_top_logprobs=request.logprobs,
                        return_as_token_id=request.return_tokens_as_token_ids,
                    )
                else:
                    logprobs = None

                choice_data = CompletionResponseChoice(
                    index=len(choices),
                    text=output_text,
                    logprobs=logprobs,
                    finish_reason=output.finish_reason,
                    stop_reason=output.stop_reason,
                    # Use the clamped value computed above rather than the
                    # raw attribute, so the response does not rely on
                    # clamp_prompt_logprobs mutating its argument in place.
                    prompt_logprobs=prompt_logprobs,
                    prompt_token_ids=(
                        prompt_token_ids if request.return_token_ids else None
                    ),
                    token_ids=(
                        as_list(output.token_ids) if request.return_token_ids else None
                    ),
                )
                choices.append(choice_data)

                num_generated_tokens += len(output.token_ids)

            # Prompt tokens are counted once per prompt, not once per choice.
            num_prompt_tokens += len(prompt_token_ids)

        usage = UsageInfo(
            prompt_tokens=num_prompt_tokens,
            completion_tokens=num_generated_tokens,
            total_tokens=num_prompt_tokens + num_generated_tokens,
        )

        # Cached-token details come from the last result in the batch.
        last_final_res = final_res_batch[-1] if final_res_batch else None
        if (
            self.enable_prompt_tokens_details
            and last_final_res
            and last_final_res.num_cached_tokens
        ):
            usage.prompt_tokens_details = PromptTokenUsageInfo(
                cached_tokens=last_final_res.num_cached_tokens
            )

        request_metadata.final_usage_info = usage

        # KV-transfer parameters are taken from the first result, when any.
        kv_transfer_params = (
            final_res_batch[0].kv_transfer_params if final_res_batch else None
        )

        return CompletionResponse(
            id=request_id,
            created=created_time,
            model=model_name,
            choices=choices,
            usage=usage,
            kv_transfer_params=kv_transfer_params,
        )
623
624
625
626

    def _create_completion_logprobs(
        self,
        token_ids: GenericSequence[int],
        top_logprobs: GenericSequence[dict[int, Logprob] | None],
        num_output_top_logprobs: int,
        tokenizer: TokenizerLike | None,
        initial_text_offset: int = 0,
        return_as_token_id: bool | None = None,
    ) -> CompletionLogProbs:
        """Build the OpenAI Completion API logprobs payload.

        Args:
            token_ids: Token ids to report, in order.
            top_logprobs: For each position, the candidate logprobs keyed by
                token id, or None when no logprobs exist for that position.
            num_output_top_logprobs: Requested number of top logprobs; up to
                one more than this is reported per position.
            tokenizer: Used to decode tokens; only required when a position
                has no logprobs and tokens are not rendered as ids.
            initial_text_offset: Text offset assigned to the first token.
            return_as_token_id: Per-request override for rendering tokens as
                ``token_id:<id>``; None falls back to the instance default.

        Returns:
            The parallel token, logprob, top-logprob and text-offset lists.

        Raises:
            VLLMValidationError: A token must be decoded but no tokenizer is
                available.
        """
        if return_as_token_id is None:
            render_as_token_id = self.return_tokens_as_token_ids
        else:
            render_as_token_id = return_as_token_id

        tokens: list[str] = []
        token_logprobs: list[float | None] = []
        top_logprob_maps: list[dict[str, float] | None] = []
        text_offsets: list[int] = []

        # Offset of the next token: the initial offset plus the lengths of
        # all tokens emitted so far.
        next_offset = initial_text_offset

        for position, token_id in enumerate(token_ids):
            candidates = top_logprobs[position]

            if candidates is None:
                # No logprob data for this position: report the token alone.
                if render_as_token_id:
                    token = f"token_id:{token_id}"
                elif tokenizer is None:
                    raise VLLMValidationError(
                        "Unable to get tokenizer because "
                        "`skip_tokenizer_init=True`",
                        parameter="skip_tokenizer_init",
                        value=True,
                    )
                else:
                    token = tokenizer.decode(token_id)

                token_logprobs.append(None)
                top_logprob_maps.append(None)
            else:
                chosen = candidates[token_id]
                token = self._get_decoded_token(
                    chosen,
                    token_id,
                    tokenizer,
                    return_as_token_id=render_as_token_id,
                )
                # float("-inf") is not JSON-serializable; floor it at the
                # value OpenAI uses.
                token_logprobs.append(max(chosen.logprob, -9999.0))

                # Keep the first num_output_top_logprobs + 1 candidates, as
                # defined in the OpenAI API
                # (cf. https://github.com/openai/openai-openapi/blob/
                # 893ba52242dbd5387a97b96444ee1c742cfce9bd/openapi.yaml#L7153)
                top_logprob_maps.append(
                    {
                        self._get_decoded_token(
                            candidate,
                            candidate_id,
                            tokenizer,
                            return_as_token_id=render_as_token_id,
                        ): max(candidate.logprob, -9999.0)
                        for rank, (candidate_id, candidate) in enumerate(
                            candidates.items()
                        )
                        if num_output_top_logprobs >= rank
                    }
                )

            tokens.append(token)
            text_offsets.append(next_offset)
            next_offset += len(token)

        return CompletionLogProbs(
            text_offset=text_offsets,
            token_logprobs=token_logprobs,
            tokens=tokens,
            top_logprobs=top_logprob_maps,
        )