serving_completion.py 28.6 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
import asyncio
5
import time
6
7
8
from collections.abc import AsyncGenerator, AsyncIterator
from collections.abc import Sequence as GenericSequence
from typing import Optional, Union, cast
9

10
import jinja2
11
from fastapi import Request
12

13
from vllm.config import ModelConfig
14
from vllm.engine.protocol import EngineClient
15
from vllm.entrypoints.logger import RequestLogger
16
17
18
19
20
21
22
23
24
25
26
27
28
from vllm.entrypoints.openai.protocol import (
    CompletionLogProbs,
    CompletionRequest,
    CompletionResponse,
    CompletionResponseChoice,
    CompletionResponseStreamChoice,
    CompletionStreamResponse,
    ErrorResponse,
    PromptTokenUsageInfo,
    RequestResponseMetadata,
    UsageInfo,
)
from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs
29
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
30
from vllm.entrypoints.renderer import RenderConfig
31
from vllm.entrypoints.utils import get_max_tokens
32
from vllm.inputs.data import EmbedsPrompt, TokensPrompt, is_embeds_prompt
33
from vllm.logger import init_logger
34
from vllm.logprobs import Logprob
35
from vllm.outputs import RequestOutput
36
from vllm.sampling_params import BeamSearchParams, SamplingParams
37
from vllm.transformers_utils.tokenizer import AnyTokenizer
38
from vllm.utils import as_list, merge_async_iterators
39
40
41
42
43

logger = init_logger(__name__)


class OpenAIServingCompletion(OpenAIServing):
44
45
    def __init__(
        self,
        engine_client: EngineClient,
        model_config: ModelConfig,
        models: OpenAIServingModels,
        *,
        request_logger: Optional[RequestLogger],
        return_tokens_as_token_ids: bool = False,
        enable_prompt_tokens_details: bool = False,
        enable_force_include_usage: bool = False,
        log_error_stack: bool = False,
    ) -> None:
        """Set up the completion serving handler.

        All arguments except ``enable_prompt_tokens_details`` are forwarded
        unchanged to ``OpenAIServing.__init__``.

        Args:
            engine_client: Engine client forwarded to the base class.
            model_config: Model configuration forwarded to the base class.
            models: Served-model registry forwarded to the base class.
            request_logger: Optional request logger forwarded to the base
                class.
            return_tokens_as_token_ids: Forwarded to the base class.
            enable_prompt_tokens_details: When true, usage reports include
                the cached-token count whenever one is available.
            enable_force_include_usage: Forwarded to the base class.
            log_error_stack: Forwarded to the base class.
        """
        super().__init__(
            engine_client=engine_client,
            model_config=model_config,
            models=models,
            request_logger=request_logger,
            return_tokens_as_token_ids=return_tokens_as_token_ids,
            enable_force_include_usage=enable_force_include_usage,
            log_error_stack=log_error_stack,
        )
        self.enable_prompt_tokens_details = enable_prompt_tokens_details

        # Sampling defaults reported by the model config; used as fallbacks
        # when building per-request sampling parameters.
        self.default_sampling_params = self.model_config.get_diff_sampling_param()
        if self.default_sampling_params:
            source = self.model_config.generation_config
            # "auto" is reported to operators as "model" in the log line.
            source = "model" if source == "auto" else source
            logger.info(
                "Using default completion sampling params from %s: %s",
                source,
                self.default_sampling_params,
            )
75

76
77
78
    async def create_completion(
        self,
        request: CompletionRequest,
        raw_request: Optional[Request] = None,
    ) -> Union[AsyncGenerator[str, None], CompletionResponse, ErrorResponse]:
        """Completion API similar to OpenAI's API.

        See https://platform.openai.com/docs/api-reference/completions/create
        for the API specification. This API mimics the OpenAI Completion API.

        NOTE: Currently we do not support the following feature:
            - suffix (the language models we currently support do not support
            suffix)

        Args:
            request: The parsed completion request.
            raw_request: The underlying HTTP request, if any. When present,
                the request metadata is attached to ``raw_request.state``
                and its headers are used to derive trace headers.

        Returns:
            One of:
            - an async generator of server-sent-event strings when the
              response is streamed (or when ``request.stream`` is set but
              the result had to be computed in full first),
            - a ``CompletionResponse`` for non-streaming requests,
            - the value of ``self.create_error_response(...)`` when the
              request is rejected or fails validation.

        Raises:
            The engine's ``dead_error`` when ``engine_client.errored`` is
            set. ``NotImplementedError`` when a rendered prompt exposes
            neither token IDs nor embeddings.
        """
        error_check_ret = await self._check_model(request)
        if error_check_ret is not None:
            return error_check_ret

        # If the engine is dead, raise the engine's DEAD_ERROR.
        # This is required for the streaming case, where we return a
        # success status before we actually start generating text :).
        if self.engine_client.errored:
            raise self.engine_client.dead_error

        # Return error for unsupported features.
        if request.suffix is not None:
            return self.create_error_response("suffix is not currently supported")

        if request.echo and request.prompt_embeds is not None:
            return self.create_error_response("Echo is unsupported with prompt embeds.")

        if request.prompt_logprobs is not None and request.prompt_embeds is not None:
            return self.create_error_response(
                "prompt_logprobs is not compatible with prompt embeds."
            )

        request_id = f"cmpl-{self._base_request_id(raw_request, request.request_id)}"
        created_time = int(time.time())

        request_metadata = RequestResponseMetadata(request_id=request_id)
        if raw_request:
            raw_request.state.request_metadata = request_metadata

        try:
            lora_request = self._maybe_get_adapters(request)

            if self.model_config.skip_tokenizer_init:
                tokenizer = None
            else:
                tokenizer = await self.engine_client.get_tokenizer()
            renderer = self._get_renderer(tokenizer)

            engine_prompts = await renderer.render_prompt_and_embeds(
                prompt_or_prompts=request.prompt,
                prompt_embeds=request.prompt_embeds,
                config=self._build_render_config(request),
            )
        except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e:
            # All preprocessing failures are reported to the client the same
            # way: log the traceback and return an error response.
            logger.exception("Error in preprocessing prompt inputs")
            return self.create_error_response(str(e))

        # Schedule the request and get the result generator.
        generators: list[AsyncGenerator[RequestOutput, None]] = []
        try:
            # Loop-invariant setup, done once instead of once per prompt.
            if self.default_sampling_params is None:
                self.default_sampling_params = {}

            trace_headers = (
                None
                if raw_request is None
                else await self._get_trace_headers(raw_request.headers)
            )

            for i, engine_prompt in enumerate(engine_prompts):
                prompt_text, prompt_token_ids, prompt_embeds = (
                    self._get_prompt_components(engine_prompt)
                )

                if prompt_token_ids is not None:
                    input_length = len(prompt_token_ids)
                elif prompt_embeds is not None:
                    input_length = len(prompt_embeds)
                else:
                    raise NotImplementedError

                max_tokens = get_max_tokens(
                    max_model_len=self.max_model_len,
                    request=request,
                    input_length=input_length,
                    default_sampling_params=self.default_sampling_params,
                )

                sampling_params: Union[SamplingParams, BeamSearchParams]
                if request.use_beam_search:
                    sampling_params = request.to_beam_search_params(
                        max_tokens, self.default_sampling_params
                    )
                else:
                    sampling_params = request.to_sampling_params(
                        max_tokens,
                        self.model_config.logits_processor_pattern,
                        self.default_sampling_params,
                    )

                # Each prompt of a batched request gets its own engine-level
                # request id, derived from the API-level one.
                request_id_item = f"{request_id}-{i}"

                self._log_inputs(
                    request_id_item,
                    engine_prompt,
                    params=sampling_params,
                    lora_request=lora_request,
                )

                # Mypy inconsistently requires this second cast in different
                # environments. It shouldn't be necessary (redundant from above)
                # but pre-commit in CI fails without it.
                engine_prompt = cast(Union[EmbedsPrompt, TokensPrompt], engine_prompt)

                if isinstance(sampling_params, BeamSearchParams):
                    # NOTE(review): beam search is given the un-suffixed
                    # request_id, unlike generate() below, which receives
                    # request_id_item. Confirm this is intentional for
                    # multi-prompt requests.
                    generator = self.engine_client.beam_search(
                        prompt=engine_prompt,
                        request_id=request_id,
                        params=sampling_params,
                        lora_request=lora_request,
                    )
                else:
                    engine_request, tokenization_kwargs = await self._process_inputs(
                        request_id_item,
                        engine_prompt,
                        sampling_params,
                        lora_request=lora_request,
                        trace_headers=trace_headers,
                        priority=request.priority,
                    )

                    generator = self.engine_client.generate(
                        engine_request,
                        sampling_params,
                        request_id_item,
                        lora_request=lora_request,
                        trace_headers=trace_headers,
                        priority=request.priority,
                        prompt_text=prompt_text,
                        tokenization_kwargs=tokenization_kwargs,
                    )

                generators.append(generator)
        except ValueError as e:
            # TODO: Use a vllm-specific Validation Error
            return self.create_error_response(str(e))

        result_generator = merge_async_iterators(*generators)

        model_name = self.models.model_name(lora_request)
        num_prompts = len(engine_prompts)

        # Similar to the OpenAI API, when n != best_of, we do not stream the
        # results. Noting that best_of is only supported in V0. In addition,
        # we do not stream the results when use beam search.
        stream = (
            request.stream
            and (request.best_of is None or request.n == request.best_of)
            and not request.use_beam_search
        )

        # Streaming response
        if stream:
            return self.completion_stream_generator(
                request,
                engine_prompts,
                result_generator,
                request_id,
                created_time,
                model_name,
                num_prompts=num_prompts,
                tokenizer=tokenizer,
                request_metadata=request_metadata,
                enable_force_include_usage=self.enable_force_include_usage,
            )

        # Non-streaming response
        final_res_batch: list[Optional[RequestOutput]] = [None] * num_prompts
        try:
            # Keep only the latest output seen for each prompt index.
            async for i, res in result_generator:
                final_res_batch[i] = res

            for i, final_res in enumerate(final_res_batch):
                assert final_res is not None

                # The output should contain the input text
                # We did not pass it into vLLM engine to avoid being redundant
                # with the inputs token IDs
                if final_res.prompt is None:
                    engine_prompt = engine_prompts[i]
                    final_res.prompt = (
                        None
                        if is_embeds_prompt(engine_prompt)
                        else engine_prompt.get("prompt")
                    )

            final_res_batch_checked = cast(list[RequestOutput], final_res_batch)

            response = self.request_output_to_completion_response(
                final_res_batch_checked,
                request,
                request_id,
                created_time,
                model_name,
                tokenizer,
                request_metadata,
            )
        except asyncio.CancelledError:
            return self.create_error_response("Client disconnected")
        except ValueError as e:
            # TODO: Use a vllm-specific Validation Error
            return self.create_error_response(str(e))

        # When user requests streaming but we don't stream, we still need to
        # return a streaming response with a single event.
        if request.stream:
            response_json = response.model_dump_json()

            async def fake_stream_generator() -> AsyncGenerator[str, None]:
                yield f"data: {response_json}\n\n"
                yield "data: [DONE]\n\n"

            return fake_stream_generator()

        return response
314
315
316
317

    async def completion_stream_generator(
        self,
        request: CompletionRequest,
        engine_prompts: list[Union[TokensPrompt, EmbedsPrompt]],
        result_generator: AsyncIterator[tuple[int, RequestOutput]],
        request_id: str,
        created_time: int,
        model_name: str,
        num_prompts: int,
        tokenizer: AnyTokenizer,
        request_metadata: RequestResponseMetadata,
        enable_force_include_usage: bool,
    ) -> AsyncGenerator[str, None]:
        """Stream completion results as server-sent-event strings.

        One ``data: <json>`` event is emitted per choice delta, optionally
        followed by a usage-only event, and always terminated by
        ``data: [DONE]``. Any exception raised while streaming is converted
        into a streamed error event instead of propagating.

        Args:
            request: The completion request being served.
            engine_prompts: The rendered prompts, indexed by prompt index;
                used to recover the prompt text when an output lacks it.
            result_generator: Yields ``(prompt_index, output)`` pairs.
            request_id: Identifier placed in every chunk.
            created_time: Creation timestamp placed in every chunk.
            model_name: Model name placed in every chunk.
            num_prompts: Number of prompts in the request.
            tokenizer: Tokenizer used when building logprob entries.
            request_metadata: Receives the aggregated usage at the end of a
                successful stream.
            enable_force_include_usage: When true, the final usage event is
                sent even if the client did not request it.

        Yields:
            SSE-formatted strings.
        """
        num_choices = 1 if request.n is None else request.n
        # Per-choice state is stored flat, indexed by
        # ``output.index + prompt_idx * num_choices``.
        previous_text_lens = [0] * num_choices * num_prompts
        previous_num_tokens = [0] * num_choices * num_prompts
        has_echoed = [False] * num_choices * num_prompts
        num_prompt_tokens = [0] * num_prompts
        num_cached_tokens = None
        first_iteration = True

        stream_options = request.stream_options
        if stream_options:
            include_usage = stream_options.include_usage or enable_force_include_usage
            include_continuous_usage = (
                include_usage and stream_options.continuous_usage_stats
            )
        else:
            # The force flag must take effect even when the client sent no
            # stream_options at all; continuous usage stays opt-in.
            include_usage, include_continuous_usage = (
                enable_force_include_usage,
                False,
            )

        try:
            async for prompt_idx, res in result_generator:
                prompt_token_ids = res.prompt_token_ids
                prompt_logprobs = res.prompt_logprobs

                if first_iteration:
                    num_cached_tokens = res.num_cached_tokens
                    first_iteration = False

                prompt_text = res.prompt
                if prompt_text is None:
                    engine_prompt = engine_prompts[prompt_idx]
                    prompt_text = (
                        None
                        if is_embeds_prompt(engine_prompt)
                        else engine_prompt.get("prompt")
                    )

                # Prompt details are excluded from later streamed outputs
                if prompt_token_ids is not None:
                    num_prompt_tokens[prompt_idx] = len(prompt_token_ids)

                delta_token_ids: GenericSequence[int]
                out_logprobs: Optional[GenericSequence[Optional[dict[int, Logprob]]]]

                for output in res.outputs:
                    i = output.index + prompt_idx * num_choices

                    # Useful when request.return_token_ids is True
                    # Returning prompt token IDs shares the same logic
                    # with the echo implementation.
                    prompt_token_ids_to_return: Optional[list[int]] = None

                    assert request.max_tokens is not None
                    if request.echo and not has_echoed[i]:
                        assert prompt_token_ids is not None
                        if request.return_token_ids:
                            prompt_text = ""
                        assert prompt_text is not None
                        if request.max_tokens == 0:
                            # only return the prompt
                            delta_text = prompt_text
                            delta_token_ids = prompt_token_ids
                            out_logprobs = prompt_logprobs
                        else:
                            # echo the prompt and first token
                            delta_text = prompt_text + output.text
                            delta_token_ids = [
                                *prompt_token_ids,
                                *output.token_ids,
                            ]
                            out_logprobs = [
                                *(prompt_logprobs or []),
                                *(output.logprobs or []),
                            ]
                        prompt_token_ids_to_return = prompt_token_ids
                        has_echoed[i] = True
                    else:
                        # return just the delta
                        delta_text = output.text
                        delta_token_ids = output.token_ids
                        out_logprobs = output.logprobs

                        # has_echoed[i] is reused here to indicate whether
                        # we have already returned the prompt token IDs.
                        if not has_echoed[i]:
                            prompt_token_ids_to_return = prompt_token_ids
                            has_echoed[i] = True

                        if (
                            not delta_text
                            and not delta_token_ids
                            and not previous_num_tokens[i]
                        ):
                            # Chunked prefill case, don't return empty chunks
                            continue

                    if request.logprobs is not None:
                        assert out_logprobs is not None, "Did not output logprobs"
                        logprobs = self._create_completion_logprobs(
                            token_ids=delta_token_ids,
                            top_logprobs=out_logprobs,
                            num_output_top_logprobs=request.logprobs,
                            tokenizer=tokenizer,
                            initial_text_offset=previous_text_lens[i],
                            return_as_token_id=request.return_tokens_as_token_ids,
                        )
                    else:
                        logprobs = None

                    # Advance by the text actually emitted in this chunk.
                    # With echo, the first chunk also contains the prompt, so
                    # counting only output.text would make the text offsets
                    # of later chunks restart too low.
                    previous_text_lens[i] += len(delta_text)
                    previous_num_tokens[i] += len(output.token_ids)
                    finish_reason = output.finish_reason
                    stop_reason = output.stop_reason

                    chunk = CompletionStreamResponse(
                        id=request_id,
                        created=created_time,
                        model=model_name,
                        choices=[
                            CompletionResponseStreamChoice(
                                index=i,
                                text=delta_text,
                                logprobs=logprobs,
                                finish_reason=finish_reason,
                                stop_reason=stop_reason,
                                prompt_token_ids=prompt_token_ids_to_return,
                                token_ids=(
                                    as_list(output.token_ids)
                                    if request.return_token_ids
                                    else None
                                ),
                            )
                        ],
                    )
                    if include_continuous_usage:
                        # Running usage for this choice only.
                        prompt_tokens = num_prompt_tokens[prompt_idx]
                        completion_tokens = previous_num_tokens[i]
                        chunk.usage = UsageInfo(
                            prompt_tokens=prompt_tokens,
                            completion_tokens=completion_tokens,
                            total_tokens=prompt_tokens + completion_tokens,
                        )

                    response_json = chunk.model_dump_json(exclude_unset=False)
                    yield f"data: {response_json}\n\n"

            total_prompt_tokens = sum(num_prompt_tokens)
            total_completion_tokens = sum(previous_num_tokens)
            final_usage_info = UsageInfo(
                prompt_tokens=total_prompt_tokens,
                completion_tokens=total_completion_tokens,
                total_tokens=total_prompt_tokens + total_completion_tokens,
            )

            if self.enable_prompt_tokens_details and num_cached_tokens:
                final_usage_info.prompt_tokens_details = PromptTokenUsageInfo(
                    cached_tokens=num_cached_tokens
                )

            if include_usage:
                final_usage_chunk = CompletionStreamResponse(
                    id=request_id,
                    created=created_time,
                    model=model_name,
                    choices=[],
                    usage=final_usage_info,
                )
                final_usage_data = final_usage_chunk.model_dump_json(
                    exclude_unset=False, exclude_none=True
                )
                yield f"data: {final_usage_data}\n\n"

            # report to FastAPI middleware aggregate usage across all choices
            request_metadata.final_usage_info = final_usage_info

        except Exception as e:
            # TODO: Use a vllm-specific Validation Error
            data = self.create_streaming_error_response(str(e))
            yield f"data: {data}\n\n"
        yield "data: [DONE]\n\n"

    def request_output_to_completion_response(
        self,
        final_res_batch: list[RequestOutput],
        request: CompletionRequest,
        request_id: str,
        created_time: int,
        model_name: str,
        tokenizer: AnyTokenizer,
        request_metadata: RequestResponseMetadata,
    ) -> CompletionResponse:
        """Build the non-streaming response from the final engine outputs.

        Every output of every entry in ``final_res_batch`` becomes one
        choice, numbered in iteration order. Token usage is summed over the
        whole batch and also stored on ``request_metadata``.

        Args:
            final_res_batch: Final output for each prompt of the request.
            request: The completion request being served.
            request_id: Identifier placed in the response.
            created_time: Creation timestamp placed in the response.
            model_name: Model name placed in the response.
            tokenizer: Tokenizer used when building logprob entries.
            request_metadata: Receives the aggregated usage.

        Returns:
            The assembled ``CompletionResponse``.
        """
        choices: list[CompletionResponseChoice] = []
        num_prompt_tokens = 0
        num_generated_tokens = 0

        for final_res in final_res_batch:
            prompt_token_ids = final_res.prompt_token_ids
            assert prompt_token_ids is not None
            prompt_logprobs = clamp_prompt_logprobs(final_res.prompt_logprobs)
            prompt_text = final_res.prompt

            token_ids: GenericSequence[int]
            out_logprobs: Optional[GenericSequence[Optional[dict[int, Logprob]]]]

            for output in final_res.outputs:
                assert request.max_tokens is not None
                if request.echo:
                    if request.return_token_ids:
                        prompt_text = ""
                    assert prompt_text is not None
                    if request.max_tokens == 0:
                        # Only the prompt is returned.
                        token_ids = prompt_token_ids
                        out_logprobs = prompt_logprobs
                        output_text = prompt_text
                    else:
                        # Prompt followed by the generated continuation.
                        token_ids = [*prompt_token_ids, *output.token_ids]

                        if request.logprobs is None:
                            out_logprobs = None
                        else:
                            assert prompt_logprobs is not None
                            assert output.logprobs is not None
                            out_logprobs = [
                                *prompt_logprobs,
                                *output.logprobs,
                            ]

                        output_text = prompt_text + output.text
                else:
                    token_ids = output.token_ids
                    out_logprobs = output.logprobs
                    output_text = output.text

                if request.logprobs is not None:
                    assert out_logprobs is not None, "Did not output logprobs"
                    logprobs = self._create_completion_logprobs(
                        token_ids=token_ids,
                        top_logprobs=out_logprobs,
                        tokenizer=tokenizer,
                        num_output_top_logprobs=request.logprobs,
                        return_as_token_id=request.return_tokens_as_token_ids,
                    )
                else:
                    logprobs = None

                choice_data = CompletionResponseChoice(
                    index=len(choices),
                    text=output_text,
                    logprobs=logprobs,
                    finish_reason=output.finish_reason,
                    stop_reason=output.stop_reason,
                    # Return the clamped value so the response never depends
                    # on clamp_prompt_logprobs having modified its argument
                    # in place.
                    prompt_logprobs=prompt_logprobs,
                    prompt_token_ids=(
                        prompt_token_ids if request.return_token_ids else None
                    ),
                    token_ids=(
                        as_list(output.token_ids) if request.return_token_ids else None
                    ),
                )
                choices.append(choice_data)

                num_generated_tokens += len(output.token_ids)

            num_prompt_tokens += len(prompt_token_ids)

        usage = UsageInfo(
            prompt_tokens=num_prompt_tokens,
            completion_tokens=num_generated_tokens,
            total_tokens=num_prompt_tokens + num_generated_tokens,
        )

        # The cached-token count is taken from the last result of the batch.
        last_final_res = final_res_batch[-1] if final_res_batch else None
        if (
            self.enable_prompt_tokens_details
            and last_final_res
            and last_final_res.num_cached_tokens
        ):
            usage.prompt_tokens_details = PromptTokenUsageInfo(
                cached_tokens=last_final_res.num_cached_tokens
            )

        request_metadata.final_usage_info = usage

        # KV-transfer parameters are taken from the first result of the batch.
        kv_transfer_params = (
            final_res_batch[0].kv_transfer_params if final_res_batch else None
        )

        return CompletionResponse(
            id=request_id,
            created=created_time,
            model=model_name,
            choices=choices,
            usage=usage,
            kv_transfer_params=kv_transfer_params,
        )
619
620
621
622

    def _create_completion_logprobs(
        self,
        token_ids: GenericSequence[int],
        top_logprobs: GenericSequence[Optional[dict[int, Logprob]]],
        num_output_top_logprobs: int,
        tokenizer: AnyTokenizer,
        initial_text_offset: int = 0,
        return_as_token_id: Optional[bool] = None,
    ) -> CompletionLogProbs:
        """Create logprobs for OpenAI Completion API.

        Args:
            token_ids: Token IDs to report, in order.
            top_logprobs: For each position in ``token_ids``, either a
                mapping from token ID to ``Logprob`` or ``None`` when no
                logprobs are available for that position.
            num_output_top_logprobs: Number of top alternatives requested;
                up to ``num_output_top_logprobs + 1`` entries are reported
                per position.
            tokenizer: Tokenizer used to turn token IDs into strings.
            initial_text_offset: Text offset assigned to the first token.
            return_as_token_id: Whether tokens are rendered as
                ``token_id:<id>``. ``None`` falls back to
                ``self.return_tokens_as_token_ids``.

        Returns:
            A ``CompletionLogProbs`` with parallel lists of tokens, token
            logprobs, top logprobs and text offsets.
        """
        out_text_offset: list[int] = []
        out_token_logprobs: list[Optional[float]] = []
        out_tokens: list[str] = []
        out_top_logprobs: list[Optional[dict[str, float]]] = []

        should_return_as_token_id = (
            return_as_token_id
            if return_as_token_id is not None
            else self.return_tokens_as_token_ids
        )

        # Offset of the next token: starts at the caller-supplied offset and
        # advances by the length of each rendered token string.
        next_text_offset = initial_text_offset

        for position, token_id in enumerate(token_ids):
            step_top_logprobs = top_logprobs[position]
            if step_top_logprobs is None:
                # No logprobs for this position: report the token alone.
                # Only decode when the decoded string is actually used.
                if should_return_as_token_id:
                    token = f"token_id:{token_id}"
                else:
                    token = tokenizer.decode(token_id)

                out_tokens.append(token)
                out_token_logprobs.append(None)
                out_top_logprobs.append(None)
            else:
                step_token = step_top_logprobs[token_id]

                token = self._get_decoded_token(
                    step_token,
                    token_id,
                    tokenizer,
                    return_as_token_id=should_return_as_token_id,
                )
                # Convert float("-inf") to the
                # JSON-serializable float that OpenAI uses
                token_logprob = max(step_token.logprob, -9999.0)

                out_tokens.append(token)
                out_token_logprobs.append(token_logprob)

                # makes sure to add the top num_output_top_logprobs + 1
                # logprobs, as defined in the openai API
                # (cf. https://github.com/openai/openai-openapi/blob/
                # 893ba52242dbd5387a97b96444ee1c742cfce9bd/openapi.yaml#L7153)
                out_top_logprobs.append(
                    {
                        self._get_decoded_token(
                            candidate,
                            candidate_id,
                            tokenizer,
                            return_as_token_id=should_return_as_token_id,
                        ): max(candidate.logprob, -9999.0)
                        for rank, (candidate_id, candidate) in enumerate(
                            step_top_logprobs.items()
                        )
                        if num_output_top_logprobs >= rank
                    }
                )

            out_text_offset.append(next_text_offset)
            next_text_offset += len(token)

        return CompletionLogProbs(
            text_offset=out_text_offset,
            token_logprobs=out_token_logprobs,
            tokens=out_tokens,
            top_logprobs=out_top_logprobs,
        )
697
698
699
700
701
702
703
704
705
706
707
708

    def _build_render_config(
        self,
        request: CompletionRequest,
        max_input_length: Optional[int] = None,
    ) -> RenderConfig:
        """Build the prompt-rendering configuration for a request.

        The maximum prompt length is the model's context length minus the
        number of tokens the request asks to generate (treated as 0 when
        ``request.max_tokens`` is unset or zero).

        Args:
            request: The completion request being served.
            max_input_length: Currently unused by this method.
                NOTE(review): confirm whether this was meant to override
                the computed maximum length.

        Returns:
            A ``RenderConfig`` populated from the request.
        """
        max_input_tokens_len = self.max_model_len - (request.max_tokens or 0)
        return RenderConfig(
            max_length=max_input_tokens_len,
            truncate_prompt_tokens=request.truncate_prompt_tokens,
            add_special_tokens=request.add_special_tokens,
            cache_salt=request.cache_salt,
            # Prompt text is only needed when it is echoed back as text
            # rather than as token IDs.
            needs_detokenization=bool(request.echo and not request.return_token_ids),
        )