serving_completion.py 28.4 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
import asyncio
5
import time
6
7
from collections.abc import AsyncGenerator, AsyncIterator
from collections.abc import Sequence as GenericSequence
8
from typing import cast
9

10
import jinja2
11
from fastapi import Request
12

13
from vllm.engine.protocol import EngineClient
14
from vllm.entrypoints.logger import RequestLogger
15
16
17
18
19
20
21
22
23
24
25
26
27
from vllm.entrypoints.openai.protocol import (
    CompletionLogProbs,
    CompletionRequest,
    CompletionResponse,
    CompletionResponseChoice,
    CompletionResponseStreamChoice,
    CompletionStreamResponse,
    ErrorResponse,
    PromptTokenUsageInfo,
    RequestResponseMetadata,
    UsageInfo,
)
from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs
28
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
29
from vllm.entrypoints.renderer import RenderConfig
30
from vllm.entrypoints.utils import get_max_tokens
31
from vllm.inputs.data import EmbedsPrompt, TokensPrompt, is_embeds_prompt
32
from vllm.logger import init_logger
33
from vllm.logprobs import Logprob
34
from vllm.outputs import RequestOutput
35
from vllm.sampling_params import BeamSearchParams, SamplingParams
36
from vllm.transformers_utils.tokenizer import AnyTokenizer
37
from vllm.utils import as_list, merge_async_iterators
38
39
40
41
42

logger = init_logger(__name__)


class OpenAIServingCompletion(OpenAIServing):
43
44
    def __init__(
        self,
        engine_client: EngineClient,
        models: OpenAIServingModels,
        *,
        request_logger: RequestLogger | None,
        return_tokens_as_token_ids: bool = False,
        enable_prompt_tokens_details: bool = False,
        enable_force_include_usage: bool = False,
        log_error_stack: bool = False,
    ):
        """Set up the completion-serving handler.

        Most arguments are forwarded unchanged to ``OpenAIServing``.

        Args:
            engine_client: Client used to reach the inference engine.
            models: Registry of served models.
            request_logger: Optional logger for incoming requests.
            return_tokens_as_token_ids: Forwarded to the base class.
            enable_prompt_tokens_details: Stored on the instance; when set,
                usage objects built by this class carry cached-token
                details.
            enable_force_include_usage: Forwarded to the base class.
            log_error_stack: Forwarded to the base class.
        """
        super().__init__(
            engine_client=engine_client,
            models=models,
            request_logger=request_logger,
            return_tokens_as_token_ids=return_tokens_as_token_ids,
            enable_force_include_usage=enable_force_include_usage,
            log_error_stack=log_error_stack,
        )
        self.enable_prompt_tokens_details = enable_prompt_tokens_details
        self.default_sampling_params = self.model_config.get_diff_sampling_param()

        # Nothing to report when there are no default sampling overrides.
        if not self.default_sampling_params:
            return

        # "auto" means the defaults came from the model's own generation
        # config, so report the origin as "model" in that case.
        config_origin = self.model_config.generation_config
        if config_origin == "auto":
            config_origin = "model"
        logger.info(
            "Using default completion sampling params from %s: %s",
            config_origin,
            self.default_sampling_params,
        )
72

73
74
75
    async def create_completion(
        self,
        request: CompletionRequest,
        raw_request: Request | None = None,
    ) -> AsyncGenerator[str, None] | CompletionResponse | ErrorResponse:
        """Completion API similar to OpenAI's API.

        See https://platform.openai.com/docs/api-reference/completions/create
        for the API specification. This API mimics the OpenAI Completion API.

        NOTE: Currently we do not support the following feature:
            - suffix (the language models we currently support do not support
            suffix)

        Args:
            request: The parsed completion request.
            raw_request: The underlying HTTP request, if any. When present,
                the request metadata is attached to ``raw_request.state`` and
                its headers are used to look up trace headers.

        Returns:
            An async generator of server-sent-event strings when the request
            asks for streaming, a ``CompletionResponse`` otherwise, or an
            ``ErrorResponse`` when validation or preprocessing fails.

        Raises:
            The engine client's ``dead_error`` when the engine has errored.
        """
        error_check_ret = await self._check_model(request)
        if error_check_ret is not None:
            return error_check_ret

        # If the engine is dead, raise the engine's DEAD_ERROR.
        # This is required for the streaming case, where we return a
        # success status before we actually start generating text :).
        if self.engine_client.errored:
            raise self.engine_client.dead_error

        # Return error for unsupported features.
        if request.suffix is not None:
            return self.create_error_response("suffix is not currently supported")

        if request.echo and request.prompt_embeds is not None:
            return self.create_error_response("Echo is unsupported with prompt embeds.")

        if request.prompt_logprobs is not None and request.prompt_embeds is not None:
            return self.create_error_response(
                "prompt_logprobs is not compatible with prompt embeds."
            )

        request_id = f"cmpl-{self._base_request_id(raw_request, request.request_id)}"
        created_time = int(time.time())

        request_metadata = RequestResponseMetadata(request_id=request_id)
        if raw_request:
            raw_request.state.request_metadata = request_metadata

        try:
            lora_request = self._maybe_get_adapters(request)

            if self.model_config.skip_tokenizer_init:
                tokenizer = None
            else:
                tokenizer = await self.engine_client.get_tokenizer()
            renderer = self._get_renderer(tokenizer)

            engine_prompts = await renderer.render_prompt_and_embeds(
                prompt_or_prompts=request.prompt,
                prompt_embeds=request.prompt_embeds,
                config=self._build_render_config(request),
            )
        except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e:
            # All preprocessing failures are reported to the client the same
            # way, so they share a single handler.
            logger.exception("Error in preprocessing prompt inputs")
            return self.create_error_response(str(e))

        # Schedule the request and get the result generator.
        generators: list[AsyncGenerator[RequestOutput, None]] = []
        try:
            # The two values below do not depend on the individual prompt,
            # so they are computed once instead of once per prompt.
            if self.default_sampling_params is None:
                self.default_sampling_params = {}

            trace_headers = (
                None
                if raw_request is None
                else await self._get_trace_headers(raw_request.headers)
            )

            for i, engine_prompt in enumerate(engine_prompts):
                prompt_text, prompt_token_ids, prompt_embeds = (
                    self._get_prompt_components(engine_prompt)
                )

                if prompt_token_ids is not None:
                    input_length = len(prompt_token_ids)
                elif prompt_embeds is not None:
                    input_length = len(prompt_embeds)
                else:
                    raise NotImplementedError

                max_tokens = get_max_tokens(
                    max_model_len=self.max_model_len,
                    request=request,
                    input_length=input_length,
                    default_sampling_params=self.default_sampling_params,
                )

                sampling_params: SamplingParams | BeamSearchParams
                if request.use_beam_search:
                    sampling_params = request.to_beam_search_params(
                        max_tokens, self.default_sampling_params
                    )
                else:
                    sampling_params = request.to_sampling_params(
                        max_tokens,
                        self.model_config.logits_processor_pattern,
                        self.default_sampling_params,
                    )

                # Each prompt of a batched request gets its own engine-level
                # request id, derived from the API-level id.
                request_id_item = f"{request_id}-{i}"

                self._log_inputs(
                    request_id_item,
                    engine_prompt,
                    params=sampling_params,
                    lora_request=lora_request,
                )

                # Mypy inconsistently requires this second cast in different
                # environments. It shouldn't be necessary (redundant from above)
                # but pre-commit in CI fails without it.
                engine_prompt = cast(EmbedsPrompt | TokensPrompt, engine_prompt)
                if isinstance(sampling_params, BeamSearchParams):
                    generator = self.beam_search(
                        prompt=engine_prompt,
                        request_id=request_id,
                        params=sampling_params,
                        lora_request=lora_request,
                    )
                else:
                    engine_request, tokenization_kwargs = await self._process_inputs(
                        request_id_item,
                        engine_prompt,
                        sampling_params,
                        lora_request=lora_request,
                        trace_headers=trace_headers,
                        priority=request.priority,
                    )

                    generator = self.engine_client.generate(
                        engine_request,
                        sampling_params,
                        request_id_item,
                        lora_request=lora_request,
                        trace_headers=trace_headers,
                        priority=request.priority,
                        prompt_text=prompt_text,
                        tokenization_kwargs=tokenization_kwargs,
                    )

                generators.append(generator)
        except ValueError as e:
            # TODO: Use a vllm-specific Validation Error
            return self.create_error_response(str(e))

        result_generator = merge_async_iterators(*generators)

        model_name = self.models.model_name(lora_request)
        num_prompts = len(engine_prompts)

        # Similar to the OpenAI API, when n != best_of, we do not stream the
        # results. Noting that best_of is only supported in V0. In addition,
        # we do not stream the results when use beam search.
        stream = (
            request.stream
            and (request.best_of is None or request.n == request.best_of)
            and not request.use_beam_search
        )

        # Streaming response
        if stream:
            return self.completion_stream_generator(
                request,
                engine_prompts,
                result_generator,
                request_id,
                created_time,
                model_name,
                num_prompts=num_prompts,
                tokenizer=tokenizer,
                request_metadata=request_metadata,
                enable_force_include_usage=self.enable_force_include_usage,
            )

        # Non-streaming response
        final_res_batch: list[RequestOutput | None] = [None] * num_prompts
        try:
            # Keep only the latest output seen for each prompt index.
            async for i, res in result_generator:
                final_res_batch[i] = res

            for i, final_res in enumerate(final_res_batch):
                assert final_res is not None

                # The output should contain the input text
                # We did not pass it into vLLM engine to avoid being redundant
                # with the inputs token IDs
                if final_res.prompt is None:
                    engine_prompt = engine_prompts[i]
                    final_res.prompt = (
                        None
                        if is_embeds_prompt(engine_prompt)
                        else engine_prompt.get("prompt")
                    )

            final_res_batch_checked = cast(list[RequestOutput], final_res_batch)

            response = self.request_output_to_completion_response(
                final_res_batch_checked,
                request,
                request_id,
                created_time,
                model_name,
                tokenizer,
                request_metadata,
            )
        except asyncio.CancelledError:
            return self.create_error_response("Client disconnected")
        except ValueError as e:
            # TODO: Use a vllm-specific Validation Error
            return self.create_error_response(str(e))

        # When user requests streaming but we don't stream, we still need to
        # return a streaming response with a single event.
        if request.stream:
            response_json = response.model_dump_json()

            async def fake_stream_generator() -> AsyncGenerator[str, None]:
                yield f"data: {response_json}\n\n"
                yield "data: [DONE]\n\n"

            return fake_stream_generator()

        return response
311
312
313
314

    async def completion_stream_generator(
        self,
        request: CompletionRequest,
        engine_prompts: list[TokensPrompt | EmbedsPrompt],
        result_generator: AsyncIterator[tuple[int, RequestOutput]],
        request_id: str,
        created_time: int,
        model_name: str,
        num_prompts: int,
        tokenizer: AnyTokenizer,
        request_metadata: RequestResponseMetadata,
        enable_force_include_usage: bool,
    ) -> AsyncGenerator[str, None]:
        """Stream completion results as server-sent-event strings.

        One ``data: ...`` event is yielded per engine output choice, followed
        by an optional usage-only event and a terminating ``data: [DONE]``
        event. Any exception raised while streaming is converted into a
        streaming error event instead of propagating.

        Args:
            request: The completion request being served.
            engine_prompts: The rendered prompts, indexed by prompt position.
            result_generator: Yields ``(prompt_index, RequestOutput)`` pairs.
            request_id: Identifier placed on every chunk.
            created_time: Creation timestamp placed on every chunk.
            model_name: Model name placed on every chunk.
            num_prompts: Number of prompts in the request.
            tokenizer: Tokenizer used when building logprobs.
            request_metadata: Receives the aggregated final usage info.
            enable_force_include_usage: When true, the final usage event is
                emitted even if the client did not send ``stream_options``.

        Yields:
            Server-sent-event formatted strings.
        """
        num_choices = 1 if request.n is None else request.n
        # Per-choice bookkeeping, flattened as
        # ``output.index + prompt_idx * num_choices``.
        previous_text_lens = [0] * num_choices * num_prompts
        previous_num_tokens = [0] * num_choices * num_prompts
        has_echoed = [False] * num_choices * num_prompts
        num_prompt_tokens = [0] * num_prompts
        num_cached_tokens = None
        first_iteration = True

        stream_options = request.stream_options
        if stream_options:
            include_usage = stream_options.include_usage or enable_force_include_usage
            include_continuous_usage = (
                include_usage and stream_options.continuous_usage_stats
            )
        else:
            # Forced usage reporting must not depend on the client sending
            # stream_options; continuous stats remain opt-in only.
            include_usage, include_continuous_usage = (
                enable_force_include_usage,
                False,
            )

        try:
            async for prompt_idx, res in result_generator:
                prompt_token_ids = res.prompt_token_ids
                prompt_logprobs = res.prompt_logprobs

                # The cached-token count is taken from the first output only.
                if first_iteration:
                    num_cached_tokens = res.num_cached_tokens
                    first_iteration = False

                prompt_text = res.prompt
                if prompt_text is None:
                    engine_prompt = engine_prompts[prompt_idx]
                    prompt_text = (
                        None
                        if is_embeds_prompt(engine_prompt)
                        else engine_prompt.get("prompt")
                    )

                # Prompt details are excluded from later streamed outputs
                if prompt_token_ids is not None:
                    num_prompt_tokens[prompt_idx] = len(prompt_token_ids)

                delta_token_ids: GenericSequence[int]
                out_logprobs: GenericSequence[dict[int, Logprob] | None] | None

                for output in res.outputs:
                    i = output.index + prompt_idx * num_choices

                    # Useful when request.return_token_ids is True
                    # Returning prompt token IDs shares the same logic
                    # with the echo implementation.
                    prompt_token_ids_to_return: list[int] | None = None

                    assert request.max_tokens is not None
                    if request.echo and not has_echoed[i]:
                        assert prompt_token_ids is not None
                        if request.return_token_ids:
                            prompt_text = ""
                        assert prompt_text is not None
                        if request.max_tokens == 0:
                            # only return the prompt
                            delta_text = prompt_text
                            delta_token_ids = prompt_token_ids
                            out_logprobs = prompt_logprobs
                        else:
                            # echo the prompt and first token
                            delta_text = prompt_text + output.text
                            delta_token_ids = [
                                *prompt_token_ids,
                                *output.token_ids,
                            ]
                            out_logprobs = [
                                *(prompt_logprobs or []),
                                *(output.logprobs or []),
                            ]
                        # Only expose prompt token IDs when the client asked
                        # for token IDs, matching the non-streaming response.
                        if request.return_token_ids:
                            prompt_token_ids_to_return = prompt_token_ids
                        has_echoed[i] = True
                    else:
                        # return just the delta
                        delta_text = output.text
                        delta_token_ids = output.token_ids
                        out_logprobs = output.logprobs

                        # has_echoed[i] is reused here to indicate whether
                        # we have already returned the prompt token IDs.
                        if not has_echoed[i] and request.return_token_ids:
                            prompt_token_ids_to_return = prompt_token_ids
                            has_echoed[i] = True

                        if (
                            not delta_text
                            and not delta_token_ids
                            and not previous_num_tokens[i]
                        ):
                            # Chunked prefill case, don't return empty chunks
                            continue

                    if request.logprobs is not None:
                        assert out_logprobs is not None, "Did not output logprobs"
                        logprobs = self._create_completion_logprobs(
                            token_ids=delta_token_ids,
                            top_logprobs=out_logprobs,
                            num_output_top_logprobs=request.logprobs,
                            tokenizer=tokenizer,
                            initial_text_offset=previous_text_lens[i],
                            return_as_token_id=request.return_tokens_as_token_ids,
                        )
                    else:
                        logprobs = None

                    previous_text_lens[i] += len(output.text)
                    previous_num_tokens[i] += len(output.token_ids)
                    finish_reason = output.finish_reason
                    stop_reason = output.stop_reason

                    chunk = CompletionStreamResponse(
                        id=request_id,
                        created=created_time,
                        model=model_name,
                        choices=[
                            CompletionResponseStreamChoice(
                                index=i,
                                text=delta_text,
                                logprobs=logprobs,
                                finish_reason=finish_reason,
                                stop_reason=stop_reason,
                                prompt_token_ids=prompt_token_ids_to_return,
                                token_ids=(
                                    as_list(output.token_ids)
                                    if request.return_token_ids
                                    else None
                                ),
                            )
                        ],
                    )
                    if include_continuous_usage:
                        prompt_tokens = num_prompt_tokens[prompt_idx]
                        completion_tokens = previous_num_tokens[i]
                        chunk.usage = UsageInfo(
                            prompt_tokens=prompt_tokens,
                            completion_tokens=completion_tokens,
                            total_tokens=prompt_tokens + completion_tokens,
                        )

                    response_json = chunk.model_dump_json(exclude_unset=False)
                    yield f"data: {response_json}\n\n"

            total_prompt_tokens = sum(num_prompt_tokens)
            total_completion_tokens = sum(previous_num_tokens)
            final_usage_info = UsageInfo(
                prompt_tokens=total_prompt_tokens,
                completion_tokens=total_completion_tokens,
                total_tokens=total_prompt_tokens + total_completion_tokens,
            )

            if self.enable_prompt_tokens_details and num_cached_tokens:
                final_usage_info.prompt_tokens_details = PromptTokenUsageInfo(
                    cached_tokens=num_cached_tokens
                )

            if include_usage:
                final_usage_chunk = CompletionStreamResponse(
                    id=request_id,
                    created=created_time,
                    model=model_name,
                    choices=[],
                    usage=final_usage_info,
                )
                final_usage_data = final_usage_chunk.model_dump_json(
                    exclude_unset=False, exclude_none=True
                )
                yield f"data: {final_usage_data}\n\n"

            # report to FastAPI middleware aggregate usage across all choices
            request_metadata.final_usage_info = final_usage_info

        except Exception as e:
            # TODO: Use a vllm-specific Validation Error
            data = self.create_streaming_error_response(str(e))
            yield f"data: {data}\n\n"
        yield "data: [DONE]\n\n"

    def request_output_to_completion_response(
        self,
        final_res_batch: list[RequestOutput],
        request: CompletionRequest,
        request_id: str,
        created_time: int,
        model_name: str,
        tokenizer: AnyTokenizer,
        request_metadata: RequestResponseMetadata,
    ) -> CompletionResponse:
        """Assemble the non-streaming response from finished engine outputs.

        Every output of every entry in ``final_res_batch`` becomes one choice,
        numbered in the order encountered. Token usage is summed over the
        whole batch and also recorded on ``request_metadata``.

        Args:
            final_res_batch: Final engine outputs, one per prompt.
            request: The completion request being answered.
            request_id: Identifier placed on the response.
            created_time: Creation timestamp placed on the response.
            model_name: Model name placed on the response.
            tokenizer: Tokenizer used when building logprobs.
            request_metadata: Receives the final usage info.

        Returns:
            The assembled ``CompletionResponse``.
        """
        choices: list[CompletionResponseChoice] = []
        prompt_token_total = 0
        completion_token_total = 0

        for final_res in final_res_batch:
            prompt_token_ids = final_res.prompt_token_ids
            assert prompt_token_ids is not None
            prompt_logprobs = clamp_prompt_logprobs(final_res.prompt_logprobs)
            prompt_text = final_res.prompt
            prompt_token_total += len(prompt_token_ids)

            token_ids: GenericSequence[int]
            out_logprobs: GenericSequence[dict[int, Logprob] | None] | None

            for output in final_res.outputs:
                assert request.max_tokens is not None
                if not request.echo:
                    # Plain completion: only the generated part is returned.
                    token_ids = output.token_ids
                    out_logprobs = output.logprobs
                    output_text = output.text
                else:
                    # When token IDs are returned, the echoed prompt text is
                    # replaced by an empty string.
                    if request.return_token_ids:
                        prompt_text = ""
                    assert prompt_text is not None
                    if request.max_tokens == 0:
                        # Echo only: the response is the prompt itself.
                        token_ids = prompt_token_ids
                        out_logprobs = prompt_logprobs
                        output_text = prompt_text
                    else:
                        # Echo plus generation: prompt followed by output.
                        token_ids = [*prompt_token_ids, *output.token_ids]

                        if request.logprobs is None:
                            out_logprobs = None
                        else:
                            assert prompt_logprobs is not None
                            assert output.logprobs is not None
                            out_logprobs = [
                                *prompt_logprobs,
                                *output.logprobs,
                            ]

                        output_text = prompt_text + output.text

                logprobs = None
                if request.logprobs is not None:
                    assert out_logprobs is not None, "Did not output logprobs"
                    logprobs = self._create_completion_logprobs(
                        token_ids=token_ids,
                        top_logprobs=out_logprobs,
                        tokenizer=tokenizer,
                        num_output_top_logprobs=request.logprobs,
                        return_as_token_id=request.return_tokens_as_token_ids,
                    )

                returned_prompt_ids = (
                    prompt_token_ids if request.return_token_ids else None
                )
                returned_output_ids = (
                    as_list(output.token_ids) if request.return_token_ids else None
                )
                choices.append(
                    CompletionResponseChoice(
                        index=len(choices),
                        text=output_text,
                        logprobs=logprobs,
                        finish_reason=output.finish_reason,
                        stop_reason=output.stop_reason,
                        prompt_logprobs=final_res.prompt_logprobs,
                        prompt_token_ids=returned_prompt_ids,
                        token_ids=returned_output_ids,
                    )
                )

                completion_token_total += len(output.token_ids)

        usage = UsageInfo(
            prompt_tokens=prompt_token_total,
            completion_tokens=completion_token_total,
            total_tokens=prompt_token_total + completion_token_total,
        )

        # Cached-token details are read from the last entry of the batch.
        last_final_res = final_res_batch[-1] if final_res_batch else None
        if (
            self.enable_prompt_tokens_details
            and last_final_res
            and last_final_res.num_cached_tokens
        ):
            usage.prompt_tokens_details = PromptTokenUsageInfo(
                cached_tokens=last_final_res.num_cached_tokens
            )

        request_metadata.final_usage_info = usage

        # KV-transfer parameters are read from the first entry of the batch.
        kv_transfer_params = (
            final_res_batch[0].kv_transfer_params if final_res_batch else None
        )

        return CompletionResponse(
            id=request_id,
            created=created_time,
            model=model_name,
            choices=choices,
            usage=usage,
            kv_transfer_params=kv_transfer_params,
        )
616
617
618
619

    def _create_completion_logprobs(
        self,
        token_ids: GenericSequence[int],
        top_logprobs: GenericSequence[dict[int, Logprob] | None],
        num_output_top_logprobs: int,
        tokenizer: AnyTokenizer,
        initial_text_offset: int = 0,
        return_as_token_id: bool | None = None,
    ) -> CompletionLogProbs:
        """Create logprobs for OpenAI Completion API.

        Args:
            token_ids: Token IDs to report, one per position.
            top_logprobs: Per-position mapping from token ID to ``Logprob``,
                or ``None`` where no logprobs are available. Indexed in step
                with ``token_ids``.
            num_output_top_logprobs: Requested number of top logprobs; up to
                ``num_output_top_logprobs + 1`` entries are kept per position.
            tokenizer: Used to decode token IDs into text. It is not touched
                for positions without logprobs when tokens are rendered as
                ``token_id:<id>``.
            initial_text_offset: Text offset assigned to the first token.
            return_as_token_id: Overrides ``self.return_tokens_as_token_ids``
                when not ``None``.

        Returns:
            A ``CompletionLogProbs`` with parallel lists of tokens, token
            logprobs, top logprobs and text offsets.
        """
        out_text_offset: list[int] = []
        out_token_logprobs: list[float | None] = []
        out_tokens: list[str] = []
        out_top_logprobs: list[dict[str, float] | None] = []

        last_token_len = 0

        should_return_as_token_id = (
            return_as_token_id
            if return_as_token_id is not None
            else self.return_tokens_as_token_ids
        )
        for i, token_id in enumerate(token_ids):
            step_top_logprobs = top_logprobs[i]
            if step_top_logprobs is None:
                # Decode only when the text form is actually needed: this
                # skips a wasted decode and keeps working when no tokenizer
                # is available but token-id output was requested.
                if should_return_as_token_id:
                    token = f"token_id:{token_id}"
                else:
                    token = tokenizer.decode(token_id)

                out_tokens.append(token)
                out_token_logprobs.append(None)
                out_top_logprobs.append(None)
            else:
                step_token = step_top_logprobs[token_id]

                token = self._get_decoded_token(
                    step_token,
                    token_id,
                    tokenizer,
                    return_as_token_id=should_return_as_token_id,
                )
                # Convert float("-inf") to the
                # JSON-serializable float that OpenAI uses
                token_logprob = max(step_token.logprob, -9999.0)

                out_tokens.append(token)
                out_token_logprobs.append(token_logprob)

                # makes sure to add the top num_output_top_logprobs + 1
                # logprobs, as defined in the openai API
                # (cf. https://github.com/openai/openai-openapi/blob/
                # 893ba52242dbd5387a97b96444ee1c742cfce9bd/openapi.yaml#L7153)
                out_top_logprobs.append(
                    {
                        self._get_decoded_token(
                            top_lp,
                            top_token_id,
                            tokenizer,
                            return_as_token_id=should_return_as_token_id,
                        ): max(top_lp.logprob, -9999.0)
                        for rank, (top_token_id, top_lp) in enumerate(
                            step_top_logprobs.items()
                        )
                        if num_output_top_logprobs >= rank
                    }
                )

            # Each offset is the previous offset plus the length of the
            # previous token's string form.
            if len(out_text_offset) == 0:
                out_text_offset.append(initial_text_offset)
            else:
                out_text_offset.append(out_text_offset[-1] + last_token_len)
            last_token_len = len(token)

        return CompletionLogProbs(
            text_offset=out_text_offset,
            token_logprobs=out_token_logprobs,
            tokens=out_tokens,
            top_logprobs=out_top_logprobs,
        )
694
695
696
697

    def _build_render_config(
        self,
        request: CompletionRequest,
        max_input_length: int | None = None,
    ) -> RenderConfig:
        """Build the prompt-rendering configuration for a completion request.

        The maximum prompt length is the model's maximum length minus the
        number of tokens the request asks to generate (zero when unset).

        Args:
            request: The completion request being rendered.
            max_input_length: Accepted but not used by this implementation.

        Returns:
            The ``RenderConfig`` for this request.
        """
        reserved_for_output = request.max_tokens or 0
        # Prompt text is only needed back when it will be echoed as text
        # rather than as token IDs.
        wants_prompt_text = bool(request.echo and not request.return_token_ids)
        return RenderConfig(
            max_length=self.max_model_len - reserved_for_output,
            truncate_prompt_tokens=request.truncate_prompt_tokens,
            add_special_tokens=request.add_special_tokens,
            cache_salt=request.cache_salt,
            needs_detokenization=wants_prompt_text,
        )