# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import asyncio
import time
from collections.abc import AsyncGenerator, AsyncIterator
from collections.abc import Sequence as GenericSequence
from itertools import islice
from typing import cast

import jinja2
from fastapi import Request

from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.protocol import (
    CompletionLogProbs,
    CompletionRequest,
    CompletionResponse,
    CompletionResponseChoice,
    CompletionResponseStreamChoice,
    CompletionStreamResponse,
    ErrorResponse,
    PromptTokenUsageInfo,
    RequestResponseMetadata,
    UsageInfo,
)
from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.renderer import RenderConfig
from vllm.entrypoints.utils import get_max_tokens, should_include_usage
from vllm.inputs.data import EmbedsPrompt, TokensPrompt, is_embeds_prompt
from vllm.logger import init_logger
from vllm.logprobs import Logprob
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams, SamplingParams
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.utils.async_utils import merge_async_iterators
from vllm.utils.collection_utils import as_list

# Module-level logger, keyed by this module's import name.
logger = init_logger(__name__)


class OpenAIServingCompletion(OpenAIServing):
    """Serve OpenAI-style text completion requests through an engine client.

    The class renders the request prompt(s), schedules one generation per
    prompt on the engine (or on beam search), and converts the engine outputs
    into either a single ``CompletionResponse`` or a server-sent-event stream.
    """

    def __init__(
        self,
        engine_client: EngineClient,
        models: OpenAIServingModels,
        *,
        request_logger: RequestLogger | None,
        return_tokens_as_token_ids: bool = False,
        enable_prompt_tokens_details: bool = False,
        enable_force_include_usage: bool = False,
        log_error_stack: bool = False,
    ):
        """Set up the completion handler.

        Args:
            engine_client: Engine used to tokenize and generate; forwarded to
                the base class.
            models: Model registry; forwarded to the base class.
            request_logger: Optional request logger; forwarded to the base
                class.
            return_tokens_as_token_ids: Server-wide default for rendering
                logprob tokens in ``token_id:<id>`` form; forwarded to the
                base class.
            enable_prompt_tokens_details: When true, a non-zero cached-token
                count is attached to the reported usage.
            enable_force_include_usage: Passed to ``should_include_usage``
                when deciding whether streamed responses carry usage.
            log_error_stack: Forwarded to the base class.
        """
        super().__init__(
            engine_client=engine_client,
            models=models,
            request_logger=request_logger,
            return_tokens_as_token_ids=return_tokens_as_token_ids,
            log_error_stack=log_error_stack,
        )
        self.enable_prompt_tokens_details = enable_prompt_tokens_details
        self.default_sampling_params = self.model_config.get_diff_sampling_param()
        self.enable_force_include_usage = enable_force_include_usage
        if self.default_sampling_params:
            source = self.model_config.generation_config
            source = "model" if source == "auto" else source
            logger.info(
                "Using default completion sampling params from %s: %s",
                source,
                self.default_sampling_params,
            )

    async def create_completion(
        self,
        request: CompletionRequest,
        raw_request: Request | None = None,
    ) -> AsyncGenerator[str, None] | CompletionResponse | ErrorResponse:
        """Completion API similar to OpenAI's API.

        See https://platform.openai.com/docs/api-reference/completions/create
        for the API specification. This API mimics the OpenAI Completion API.

        NOTE: Currently we do not support the following feature:
            - suffix (the language models we currently support do not support
            suffix)

        Args:
            request: The parsed completion request.
            raw_request: The underlying HTTP request, if any. When given, the
                request metadata is attached to ``raw_request.state`` and its
                headers are used for tracing and data-parallel routing.

        Returns:
            An ``ErrorResponse`` for invalid or unsupported requests, an async
            generator of SSE strings when ``request.stream`` is set, or a
            ``CompletionResponse`` otherwise.

        Raises:
            The engine's ``dead_error`` if the engine client has errored.
        """
        error_check_ret = await self._check_model(request)
        if error_check_ret is not None:
            return error_check_ret

        # If the engine is dead, raise the engine's DEAD_ERROR.
        # This is required for the streaming case, where we return a
        # success status before we actually start generating text :).
        if self.engine_client.errored:
            raise self.engine_client.dead_error

        # Return error for unsupported features.
        if request.suffix is not None:
            return self.create_error_response("suffix is not currently supported")

        if request.echo and request.prompt_embeds is not None:
            return self.create_error_response("Echo is unsupported with prompt embeds.")

        if request.prompt_logprobs is not None and request.prompt_embeds is not None:
            return self.create_error_response(
                "prompt_logprobs is not compatible with prompt embeds."
            )

        request_id = f"cmpl-{self._base_request_id(raw_request, request.request_id)}"
        created_time = int(time.time())

        request_metadata = RequestResponseMetadata(request_id=request_id)
        if raw_request:
            raw_request.state.request_metadata = request_metadata

        try:
            lora_request = self._maybe_get_adapters(request)

            if self.model_config.skip_tokenizer_init:
                tokenizer = None
            else:
                tokenizer = await self.engine_client.get_tokenizer()
            renderer = self._get_renderer(tokenizer)

            engine_prompts = await renderer.render_prompt_and_embeds(
                prompt_or_prompts=request.prompt,
                prompt_embeds=request.prompt_embeds,
                config=self._build_render_config(request),
            )
        except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e:
            logger.exception("Error in preprocessing prompt inputs")
            return self.create_error_response(str(e))

        # Extract data_parallel_rank from header (router can inject it)
        data_parallel_rank = self._get_data_parallel_rank(raw_request)

        # Schedule the request and get the result generator.
        generators: list[AsyncGenerator[RequestOutput, None]] = []
        try:
            # Both values below are the same for every prompt of the request,
            # so they are computed once rather than per loop iteration.
            if self.default_sampling_params is None:
                self.default_sampling_params = {}

            trace_headers = (
                None
                if raw_request is None
                else await self._get_trace_headers(raw_request.headers)
            )

            for i, engine_prompt in enumerate(engine_prompts):
                prompt_text, prompt_token_ids, prompt_embeds = (
                    self._get_prompt_components(engine_prompt)
                )

                if prompt_token_ids is not None:
                    input_length = len(prompt_token_ids)
                elif prompt_embeds is not None:
                    input_length = len(prompt_embeds)
                else:
                    raise NotImplementedError

                max_tokens = get_max_tokens(
                    max_model_len=self.max_model_len,
                    request=request,
                    input_length=input_length,
                    default_sampling_params=self.default_sampling_params,
                )

                sampling_params: SamplingParams | BeamSearchParams
                if request.use_beam_search:
                    sampling_params = request.to_beam_search_params(
                        max_tokens, self.default_sampling_params
                    )
                else:
                    sampling_params = request.to_sampling_params(
                        max_tokens,
                        self.model_config.logits_processor_pattern,
                        self.default_sampling_params,
                    )

                # Each prompt of a batched request gets its own engine id.
                request_id_item = f"{request_id}-{i}"

                self._log_inputs(
                    request_id_item,
                    engine_prompt,
                    params=sampling_params,
                    lora_request=lora_request,
                )

                # Mypy inconsistently requires this second cast in different
                # environments. It shouldn't be necessary (redundant from above)
                # but pre-commit in CI fails without it.
                engine_prompt = cast(EmbedsPrompt | TokensPrompt, engine_prompt)
                if isinstance(sampling_params, BeamSearchParams):
                    generator = self.beam_search(
                        prompt=engine_prompt,
                        request_id=request_id,
                        params=sampling_params,
                        lora_request=lora_request,
                    )
                else:
                    engine_request, tokenization_kwargs = await self._process_inputs(
                        request_id_item,
                        engine_prompt,
                        sampling_params,
                        lora_request=lora_request,
                        trace_headers=trace_headers,
                        priority=request.priority,
                    )

                    generator = self.engine_client.generate(
                        engine_request,
                        sampling_params,
                        request_id_item,
                        lora_request=lora_request,
                        trace_headers=trace_headers,
                        priority=request.priority,
                        prompt_text=prompt_text,
                        tokenization_kwargs=tokenization_kwargs,
                        data_parallel_rank=data_parallel_rank,
                    )

                generators.append(generator)
        except ValueError as e:
            # TODO: Use a vllm-specific Validation Error
            return self.create_error_response(str(e))

        result_generator = merge_async_iterators(*generators)

        model_name = self.models.model_name(lora_request)
        num_prompts = len(engine_prompts)

        # Similar to the OpenAI API, when n != best_of, we do not stream the
        # results. Noting that best_of is only supported in V0. In addition,
        # we do not stream the results when use beam search.
        stream = (
            request.stream
            and (request.best_of is None or request.n == request.best_of)
            and not request.use_beam_search
        )

        # Streaming response
        if stream:
            return self.completion_stream_generator(
                request,
                engine_prompts,
                result_generator,
                request_id,
                created_time,
                model_name,
                num_prompts=num_prompts,
                tokenizer=tokenizer,
                request_metadata=request_metadata,
            )

        # Non-streaming response
        final_res_batch: list[RequestOutput | None] = [None] * num_prompts
        try:
            # Keep only the latest output seen for each prompt index.
            async for i, res in result_generator:
                final_res_batch[i] = res

            for i, final_res in enumerate(final_res_batch):
                assert final_res is not None

                # The output should contain the input text
                # We did not pass it into vLLM engine to avoid being redundant
                # with the inputs token IDs
                if final_res.prompt is None:
                    engine_prompt = engine_prompts[i]
                    final_res.prompt = (
                        None
                        if is_embeds_prompt(engine_prompt)
                        else engine_prompt.get("prompt")
                    )

            final_res_batch_checked = cast(list[RequestOutput], final_res_batch)

            response = self.request_output_to_completion_response(
                final_res_batch_checked,
                request,
                request_id,
                created_time,
                model_name,
                tokenizer,
                request_metadata,
            )
        except asyncio.CancelledError:
            return self.create_error_response("Client disconnected")
        except ValueError as e:
            # TODO: Use a vllm-specific Validation Error
            return self.create_error_response(str(e))

        # When user requests streaming but we don't stream, we still need to
        # return a streaming response with a single event.
        if request.stream:
            response_json = response.model_dump_json()

            async def fake_stream_generator() -> AsyncGenerator[str, None]:
                yield f"data: {response_json}\n\n"
                yield "data: [DONE]\n\n"

            return fake_stream_generator()

        return response

    async def completion_stream_generator(
        self,
        request: CompletionRequest,
        engine_prompts: list[TokensPrompt | EmbedsPrompt],
        result_generator: AsyncIterator[tuple[int, RequestOutput]],
        request_id: str,
        created_time: int,
        model_name: str,
        num_prompts: int,
        tokenizer: AnyTokenizer | None,
        request_metadata: RequestResponseMetadata,
    ) -> AsyncGenerator[str, None]:
        """Turn engine outputs into a server-sent-event completion stream.

        Args:
            request: The completion request being served.
            engine_prompts: Rendered prompts, indexed by prompt position; used
                to recover the prompt text when an output does not carry it.
            result_generator: Yields ``(prompt_index, output)`` pairs.
            request_id: Identifier written into every chunk.
            created_time: Creation timestamp written into every chunk.
            model_name: Model name written into every chunk.
            num_prompts: Number of prompts in the request.
            tokenizer: Tokenizer handed to the logprob formatter; ``None``
                when the caller skipped tokenizer initialization.
            request_metadata: Receives the aggregate usage once the stream
                has completed normally.

        Yields:
            ``data: ...`` SSE lines: one per choice delta, optionally a final
            usage chunk, an error payload if anything raised, and always a
            terminating ``data: [DONE]``.
        """
        num_choices = 1 if request.n is None else request.n
        # Per-choice running state, indexed by
        # ``output.index + prompt_idx * num_choices``.
        previous_text_lens = [0] * num_choices * num_prompts
        previous_num_tokens = [0] * num_choices * num_prompts
        has_echoed = [False] * num_choices * num_prompts
        num_prompt_tokens = [0] * num_prompts
        num_cached_tokens = None
        first_iteration = True

        stream_options = request.stream_options
        include_usage, include_continuous_usage = should_include_usage(
            stream_options, self.enable_force_include_usage
        )

        try:
            async for prompt_idx, res in result_generator:
                prompt_token_ids = res.prompt_token_ids
                prompt_logprobs = res.prompt_logprobs

                if first_iteration:
                    num_cached_tokens = res.num_cached_tokens
                    first_iteration = False

                prompt_text = res.prompt
                if prompt_text is None:
                    engine_prompt = engine_prompts[prompt_idx]
                    prompt_text = (
                        None
                        if is_embeds_prompt(engine_prompt)
                        else engine_prompt.get("prompt")
                    )

                # Prompt details are excluded from later streamed outputs
                if prompt_token_ids is not None:
                    num_prompt_tokens[prompt_idx] = len(prompt_token_ids)

                delta_token_ids: GenericSequence[int]
                out_logprobs: GenericSequence[dict[int, Logprob] | None] | None

                for output in res.outputs:
                    i = output.index + prompt_idx * num_choices

                    # Useful when request.return_token_ids is True
                    # Returning prompt token IDs shares the same logic
                    # with the echo implementation.
                    prompt_token_ids_to_return: list[int] | None = None

                    assert request.max_tokens is not None
                    if request.echo and not has_echoed[i]:
                        assert prompt_token_ids is not None
                        if request.return_token_ids:
                            prompt_text = ""
                        assert prompt_text is not None
                        if request.max_tokens == 0:
                            # only return the prompt
                            delta_text = prompt_text
                            delta_token_ids = prompt_token_ids
                            out_logprobs = prompt_logprobs
                        else:
                            # echo the prompt and first token
                            delta_text = prompt_text + output.text
                            delta_token_ids = [
                                *prompt_token_ids,
                                *output.token_ids,
                            ]
                            out_logprobs = [
                                *(prompt_logprobs or []),
                                *(output.logprobs or []),
                            ]
                        prompt_token_ids_to_return = prompt_token_ids
                        has_echoed[i] = True
                    else:
                        # return just the delta
                        delta_text = output.text
                        delta_token_ids = output.token_ids
                        out_logprobs = output.logprobs

                        if (
                            not delta_text
                            and not delta_token_ids
                            and not previous_num_tokens[i]
                        ):
                            # Chunked prefill case, don't return empty chunks.
                            # Decided before the flag below is touched so a
                            # skipped chunk does not consume the one-time
                            # "prompt token IDs sent" marker.
                            continue

                        # has_echoed[i] is reused here to indicate whether
                        # we have already returned the prompt token IDs.
                        if not has_echoed[i] and request.return_token_ids:
                            prompt_token_ids_to_return = prompt_token_ids
                            has_echoed[i] = True

                    if request.logprobs is not None:
                        assert out_logprobs is not None, "Did not output logprobs"
                        logprobs = self._create_completion_logprobs(
                            token_ids=delta_token_ids,
                            top_logprobs=out_logprobs,
                            num_output_top_logprobs=request.logprobs,
                            tokenizer=tokenizer,
                            initial_text_offset=previous_text_lens[i],
                            return_as_token_id=request.return_tokens_as_token_ids,
                        )
                    else:
                        logprobs = None

                    # Advance by the text actually emitted in this chunk. On
                    # the echo chunk that includes the prompt, so the next
                    # chunk's logprob offsets continue after it instead of
                    # restarting inside the prompt.
                    previous_text_lens[i] += len(delta_text)
                    # Completion-token accounting never includes the prompt.
                    previous_num_tokens[i] += len(output.token_ids)
                    finish_reason = output.finish_reason
                    stop_reason = output.stop_reason

                    chunk = CompletionStreamResponse(
                        id=request_id,
                        created=created_time,
                        model=model_name,
                        choices=[
                            CompletionResponseStreamChoice(
                                index=i,
                                text=delta_text,
                                logprobs=logprobs,
                                finish_reason=finish_reason,
                                stop_reason=stop_reason,
                                prompt_token_ids=prompt_token_ids_to_return,
                                token_ids=(
                                    as_list(output.token_ids)
                                    if request.return_token_ids
                                    else None
                                ),
                            )
                        ],
                    )
                    if include_continuous_usage:
                        prompt_tokens = num_prompt_tokens[prompt_idx]
                        completion_tokens = previous_num_tokens[i]
                        chunk.usage = UsageInfo(
                            prompt_tokens=prompt_tokens,
                            completion_tokens=completion_tokens,
                            total_tokens=prompt_tokens + completion_tokens,
                        )

                    response_json = chunk.model_dump_json(exclude_unset=False)
                    yield f"data: {response_json}\n\n"

            total_prompt_tokens = sum(num_prompt_tokens)
            total_completion_tokens = sum(previous_num_tokens)
            final_usage_info = UsageInfo(
                prompt_tokens=total_prompt_tokens,
                completion_tokens=total_completion_tokens,
                total_tokens=total_prompt_tokens + total_completion_tokens,
            )

            if self.enable_prompt_tokens_details and num_cached_tokens:
                final_usage_info.prompt_tokens_details = PromptTokenUsageInfo(
                    cached_tokens=num_cached_tokens
                )

            if include_usage:
                final_usage_chunk = CompletionStreamResponse(
                    id=request_id,
                    created=created_time,
                    model=model_name,
                    choices=[],
                    usage=final_usage_info,
                )
                final_usage_data = final_usage_chunk.model_dump_json(
                    exclude_unset=False, exclude_none=True
                )
                yield f"data: {final_usage_data}\n\n"

            # report to FastAPI middleware aggregate usage across all choices
            request_metadata.final_usage_info = final_usage_info

        except Exception as e:
            # TODO: Use a vllm-specific Validation Error
            # The HTTP status has already been sent, so the error can only be
            # reported in-band; log it so the server side keeps the traceback.
            logger.exception("Error in completion stream generator.")
            data = self.create_streaming_error_response(str(e))
            yield f"data: {data}\n\n"
        yield "data: [DONE]\n\n"

    def request_output_to_completion_response(
        self,
        final_res_batch: list[RequestOutput],
        request: CompletionRequest,
        request_id: str,
        created_time: int,
        model_name: str,
        tokenizer: AnyTokenizer | None,
        request_metadata: RequestResponseMetadata,
    ) -> CompletionResponse:
        """Build the non-streaming response from the final engine outputs.

        Args:
            final_res_batch: One final output per prompt, in prompt order.
            request: The completion request being served.
            request_id: Identifier written into the response.
            created_time: Creation timestamp written into the response.
            model_name: Model name written into the response.
            tokenizer: Tokenizer handed to the logprob formatter; ``None``
                when the caller skipped tokenizer initialization.
            request_metadata: Receives the aggregate usage as a side effect.

        Returns:
            A ``CompletionResponse`` with one choice per generated output and
            the usage summed over all prompts.
        """
        choices: list[CompletionResponseChoice] = []
        num_prompt_tokens = 0
        num_generated_tokens = 0

        for final_res in final_res_batch:
            prompt_token_ids = final_res.prompt_token_ids
            assert prompt_token_ids is not None
            prompt_logprobs = clamp_prompt_logprobs(final_res.prompt_logprobs)
            prompt_text = final_res.prompt

            token_ids: GenericSequence[int]
            out_logprobs: GenericSequence[dict[int, Logprob] | None] | None

            for output in final_res.outputs:
                assert request.max_tokens is not None
                if request.echo:
                    if request.return_token_ids:
                        prompt_text = ""
                    assert prompt_text is not None
                    if request.max_tokens == 0:
                        # Only the prompt is returned.
                        token_ids = prompt_token_ids
                        out_logprobs = prompt_logprobs
                        output_text = prompt_text
                    else:
                        # Prompt followed by the generated continuation.
                        token_ids = [*prompt_token_ids, *output.token_ids]

                        if request.logprobs is None:
                            out_logprobs = None
                        else:
                            assert prompt_logprobs is not None
                            assert output.logprobs is not None
                            out_logprobs = [
                                *prompt_logprobs,
                                *output.logprobs,
                            ]

                        output_text = prompt_text + output.text
                else:
                    token_ids = output.token_ids
                    out_logprobs = output.logprobs
                    output_text = output.text

                if request.logprobs is not None:
                    assert out_logprobs is not None, "Did not output logprobs"
                    logprobs = self._create_completion_logprobs(
                        token_ids=token_ids,
                        top_logprobs=out_logprobs,
                        tokenizer=tokenizer,
                        num_output_top_logprobs=request.logprobs,
                        return_as_token_id=request.return_tokens_as_token_ids,
                    )
                else:
                    logprobs = None

                choice_data = CompletionResponseChoice(
                    index=len(choices),
                    text=output_text,
                    logprobs=logprobs,
                    finish_reason=output.finish_reason,
                    stop_reason=output.stop_reason,
                    # Use the clamped value computed above rather than the raw
                    # attribute, so the response does not depend on
                    # clamp_prompt_logprobs modifying its argument in place.
                    prompt_logprobs=prompt_logprobs,
                    prompt_token_ids=(
                        prompt_token_ids if request.return_token_ids else None
                    ),
                    token_ids=(
                        as_list(output.token_ids) if request.return_token_ids else None
                    ),
                )
                choices.append(choice_data)

                num_generated_tokens += len(output.token_ids)

            num_prompt_tokens += len(prompt_token_ids)

        usage = UsageInfo(
            prompt_tokens=num_prompt_tokens,
            completion_tokens=num_generated_tokens,
            total_tokens=num_prompt_tokens + num_generated_tokens,
        )

        # The cached-token count is taken from the last output of the batch.
        last_final_res = final_res_batch[-1] if final_res_batch else None
        if (
            self.enable_prompt_tokens_details
            and last_final_res
            and last_final_res.num_cached_tokens
        ):
            usage.prompt_tokens_details = PromptTokenUsageInfo(
                cached_tokens=last_final_res.num_cached_tokens
            )

        request_metadata.final_usage_info = usage

        # KV-transfer parameters are taken from the first output of the batch.
        kv_transfer_params = (
            final_res_batch[0].kv_transfer_params if final_res_batch else None
        )

        return CompletionResponse(
            id=request_id,
            created=created_time,
            model=model_name,
            choices=choices,
            usage=usage,
            kv_transfer_params=kv_transfer_params,
        )

    def _create_completion_logprobs(
        self,
        token_ids: GenericSequence[int],
        top_logprobs: GenericSequence[dict[int, Logprob] | None],
        num_output_top_logprobs: int,
        tokenizer: AnyTokenizer | None,
        initial_text_offset: int = 0,
        return_as_token_id: bool | None = None,
    ) -> CompletionLogProbs:
        """Create logprobs for OpenAI Completion API.

        Args:
            token_ids: Token ids to report, in order.
            top_logprobs: Per-position candidate logprobs, indexed in step
                with ``token_ids``; an entry may be ``None`` when no logprobs
                exist for that position.
            num_output_top_logprobs: Number of alternatives requested; up to
                ``num_output_top_logprobs + 1`` candidates are reported per
                position.
            tokenizer: Used to decode a token when its position has no
                logprob entry and the ``token_id:`` form is not requested. It
                is also passed through to ``_get_decoded_token``.
            initial_text_offset: Text offset assigned to the first token.
            return_as_token_id: Per-request override for rendering tokens as
                ``token_id:<id>``; ``None`` falls back to the server-wide
                ``self.return_tokens_as_token_ids``.

        Returns:
            A ``CompletionLogProbs`` whose four lists are all aligned with
            ``token_ids``.
        """
        out_text_offset: list[int] = []
        out_token_logprobs: list[float | None] = []
        out_tokens: list[str] = []
        out_top_logprobs: list[dict[str, float] | None] = []

        should_return_as_token_id = (
            return_as_token_id
            if return_as_token_id is not None
            else self.return_tokens_as_token_ids
        )

        # makes sure to add the top num_output_top_logprobs + 1
        # logprobs, as defined in the openai API
        # (cf. https://github.com/openai/openai-openapi/blob/
        # 893ba52242dbd5387a97b96444ee1c742cfce9bd/openapi.yaml#L7153)
        # Clamped at zero because islice rejects a negative stop value.
        max_candidates = max(num_output_top_logprobs + 1, 0)

        next_text_offset = initial_text_offset
        for position, token_id in enumerate(token_ids):
            step_top_logprobs = top_logprobs[position]
            if step_top_logprobs is None:
                # Decode only when the decoded text is actually used; this
                # also keeps the token-id form working without a tokenizer.
                if should_return_as_token_id:
                    token = f"token_id:{token_id}"
                else:
                    token = tokenizer.decode(token_id)

                out_tokens.append(token)
                out_token_logprobs.append(None)
                out_top_logprobs.append(None)
            else:
                step_token = step_top_logprobs[token_id]

                token = self._get_decoded_token(
                    step_token,
                    token_id,
                    tokenizer,
                    return_as_token_id=should_return_as_token_id,
                )
                # Convert float("-inf") to the
                # JSON-serializable float that OpenAI uses
                token_logprob = max(step_token.logprob, -9999.0)

                out_tokens.append(token)
                out_token_logprobs.append(token_logprob)

                out_top_logprobs.append(
                    {
                        self._get_decoded_token(
                            candidate,
                            candidate_id,
                            tokenizer,
                            return_as_token_id=should_return_as_token_id,
                        ): max(candidate.logprob, -9999.0)
                        for candidate_id, candidate in islice(
                            step_top_logprobs.items(), max_candidates
                        )
                    }
                )

            # Each token starts where the previous token's text ended.
            out_text_offset.append(next_text_offset)
            next_text_offset += len(token)

        return CompletionLogProbs(
            text_offset=out_text_offset,
            token_logprobs=out_token_logprobs,
            tokens=out_tokens,
            top_logprobs=out_top_logprobs,
        )

    def _build_render_config(
        self,
        request: CompletionRequest,
        max_input_length: int | None = None,
    ) -> RenderConfig:
        """Derive the prompt rendering configuration for a request.

        Args:
            request: The completion request being served.
            max_input_length: Accepted for interface compatibility; this
                implementation does not read it.

        Returns:
            A ``RenderConfig`` whose ``max_length`` is the model length minus
            the requested ``max_tokens`` (treated as 0 when unset).
        """
        # Reserve room for the requested completion within the model length.
        max_input_tokens_len = self.max_model_len - (request.max_tokens or 0)
        return RenderConfig(
            max_length=max_input_tokens_len,
            truncate_prompt_tokens=request.truncate_prompt_tokens,
            add_special_tokens=request.add_special_tokens,
            cache_salt=request.cache_salt,
            # Prompt text is only needed when it is echoed back as text.
            needs_detokenization=bool(request.echo and not request.return_token_ids),
        )