serving_completion.py 28.7 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
import asyncio
5
import time
6
7
from collections.abc import AsyncGenerator, AsyncIterator
from collections.abc import Sequence as GenericSequence
8
from typing import cast
9

10
import jinja2
11
from fastapi import Request
12

13
from vllm.engine.protocol import EngineClient
14
from vllm.entrypoints.logger import RequestLogger
15
16
17
18
19
20
21
22
23
24
25
26
27
from vllm.entrypoints.openai.protocol import (
    CompletionLogProbs,
    CompletionRequest,
    CompletionResponse,
    CompletionResponseChoice,
    CompletionResponseStreamChoice,
    CompletionStreamResponse,
    ErrorResponse,
    PromptTokenUsageInfo,
    RequestResponseMetadata,
    UsageInfo,
)
from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs
28
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
29
from vllm.entrypoints.renderer import RenderConfig
30
from vllm.entrypoints.utils import get_max_tokens, should_include_usage
31
from vllm.inputs.data import EmbedsPrompt, TokensPrompt, is_embeds_prompt
32
from vllm.logger import init_logger
33
from vllm.logprobs import Logprob
34
from vllm.outputs import RequestOutput
35
from vllm.sampling_params import BeamSearchParams, SamplingParams
36
from vllm.transformers_utils.tokenizer import AnyTokenizer
37
38
from vllm.utils.async_utils import merge_async_iterators
from vllm.utils.collection_utils import as_list
39
from vllm.v1.sample.logits_processor import validate_logits_processors_parameters
40
41
42
43
44

# Module-level logger, named after this module; used by the completion
# handler below for informational and error reporting.
logger = init_logger(__name__)


class OpenAIServingCompletion(OpenAIServing):
    """Handler for the OpenAI-style text completion API.

    Builds on ``OpenAIServing`` and turns a ``CompletionRequest`` into either
    a single ``CompletionResponse`` or a stream of server-sent-event strings.
    """
45
46
    def __init__(
        self,
        engine_client: EngineClient,
        models: OpenAIServingModels,
        *,
        request_logger: RequestLogger | None,
        return_tokens_as_token_ids: bool = False,
        enable_prompt_tokens_details: bool = False,
        enable_force_include_usage: bool = False,
        log_error_stack: bool = False,
    ):
        """Initialise the completion handler.

        Args:
            engine_client: Engine client forwarded to the base class.
            models: Model registry forwarded to the base class.
            request_logger: Optional request logger forwarded to the base
                class.
            return_tokens_as_token_ids: Forwarded to the base class.
            enable_prompt_tokens_details: When true, cached-token counts are
                attached to the usage info of responses.
            enable_force_include_usage: Passed to ``should_include_usage``
                when deciding whether streamed responses carry usage.
            log_error_stack: Forwarded to the base class.
        """
        super().__init__(
            engine_client=engine_client,
            models=models,
            request_logger=request_logger,
            return_tokens_as_token_ids=return_tokens_as_token_ids,
            log_error_stack=log_error_stack,
        )

        # Logits processors configured on the model; each request's sampling
        # params are validated against them.
        self.logits_processors = self.model_config.logits_processors

        # Usage-reporting switches read while building responses.
        self.enable_prompt_tokens_details = enable_prompt_tokens_details
        self.enable_force_include_usage = enable_force_include_usage

        self.default_sampling_params = self.model_config.get_diff_sampling_param()
        if not self.default_sampling_params:
            return

        # Report where the non-empty defaults came from; the "auto" setting
        # is shown as "model" in the log line.
        generation_config = self.model_config.generation_config
        origin = "model" if generation_config == "auto" else generation_config
        logger.info(
            "Using default completion sampling params from %s: %s",
            origin,
            self.default_sampling_params,
        )
78

79
80
81
    async def create_completion(
        self,
        request: CompletionRequest,
        raw_request: Request | None = None,
    ) -> AsyncGenerator[str, None] | CompletionResponse | ErrorResponse:
        """Completion API similar to OpenAI's API.

        See https://platform.openai.com/docs/api-reference/completions/create
        for the API specification. This API mimics the OpenAI Completion API.

        NOTE: Currently we do not support the following feature:
            - suffix (the language models we currently support do not support
            suffix)

        Args:
            request: The parsed completion request.
            raw_request: The underlying HTTP request, if any. When given, the
                request metadata is attached to ``raw_request.state`` and its
                headers are used for trace-header and data-parallel-rank
                extraction.

        Returns:
            An async generator of server-sent-event strings when
            ``request.stream`` is set, a ``CompletionResponse`` otherwise, or
            an ``ErrorResponse`` when validation or preprocessing fails.

        Raises:
            The engine's ``dead_error`` when the engine client reports that
            it has errored.
        """
        error_check_ret = await self._check_model(request)
        if error_check_ret is not None:
            return error_check_ret

        # If the engine is dead, raise the engine's DEAD_ERROR.
        # This is required for the streaming case, where we return a
        # success status before we actually start generating text :).
        if self.engine_client.errored:
            raise self.engine_client.dead_error

        # Return error for unsupported features.
        if request.suffix is not None:
            return self.create_error_response("suffix is not currently supported")

        if request.echo and request.prompt_embeds is not None:
            return self.create_error_response("Echo is unsupported with prompt embeds.")

        if request.prompt_logprobs is not None and request.prompt_embeds is not None:
            return self.create_error_response(
                "prompt_logprobs is not compatible with prompt embeds."
            )

        request_id = f"cmpl-{self._base_request_id(raw_request, request.request_id)}"
        created_time = int(time.time())

        request_metadata = RequestResponseMetadata(request_id=request_id)
        if raw_request:
            raw_request.state.request_metadata = request_metadata

        try:
            lora_request = self._maybe_get_adapters(request)

            if self.model_config.skip_tokenizer_init:
                tokenizer = None
            else:
                tokenizer = await self.engine_client.get_tokenizer()
            renderer = self._get_renderer(tokenizer)

            engine_prompts = await renderer.render_prompt_and_embeds(
                prompt_or_prompts=request.prompt,
                prompt_embeds=request.prompt_embeds,
                config=self._build_render_config(request),
            )
        except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e:
            # All four preprocessing failures are reported identically: log
            # the traceback and hand the message back as an error response.
            logger.exception("Error in preprocessing prompt inputs")
            return self.create_error_response(str(e))

        # Extract data_parallel_rank from header (router can inject it)
        data_parallel_rank = self._get_data_parallel_rank(raw_request)

        # Normalise once per request instead of once per prompt.
        if self.default_sampling_params is None:
            self.default_sampling_params = {}

        # Schedule the request and get the result generator.
        generators: list[AsyncGenerator[RequestOutput, None]] = []
        try:
            # The trace headers depend only on the HTTP request, so they are
            # resolved once and shared by every prompt in the batch.
            trace_headers = (
                None
                if raw_request is None
                else await self._get_trace_headers(raw_request.headers)
            )

            for i, engine_prompt in enumerate(engine_prompts):
                prompt_text, prompt_token_ids, prompt_embeds = (
                    self._get_prompt_components(engine_prompt)
                )

                if prompt_token_ids is not None:
                    input_length = len(prompt_token_ids)
                elif prompt_embeds is not None:
                    input_length = len(prompt_embeds)
                else:
                    raise NotImplementedError

                max_tokens = get_max_tokens(
                    max_model_len=self.max_model_len,
                    request=request,
                    input_length=input_length,
                    default_sampling_params=self.default_sampling_params,
                )

                sampling_params: SamplingParams | BeamSearchParams
                if request.use_beam_search:
                    sampling_params = request.to_beam_search_params(
                        max_tokens, self.default_sampling_params
                    )
                else:
                    sampling_params = request.to_sampling_params(
                        max_tokens,
                        self.model_config.logits_processor_pattern,
                        self.default_sampling_params,
                    )
                    validate_logits_processors_parameters(
                        self.logits_processors,
                        sampling_params,
                    )

                # Each prompt of the batch gets its own sub-request id.
                request_id_item = f"{request_id}-{i}"

                self._log_inputs(
                    request_id_item,
                    engine_prompt,
                    params=sampling_params,
                    lora_request=lora_request,
                )

                # Mypy inconsistently requires this second cast in different
                # environments. It shouldn't be necessary (redundant from above)
                # but pre-commit in CI fails without it.
                engine_prompt = cast(EmbedsPrompt | TokensPrompt, engine_prompt)
                if isinstance(sampling_params, BeamSearchParams):
                    generator = self.beam_search(
                        prompt=engine_prompt,
                        request_id=request_id,
                        params=sampling_params,
                        lora_request=lora_request,
                    )
                else:
                    engine_request, tokenization_kwargs = await self._process_inputs(
                        request_id_item,
                        engine_prompt,
                        sampling_params,
                        lora_request=lora_request,
                        trace_headers=trace_headers,
                        priority=request.priority,
                    )

                    generator = self.engine_client.generate(
                        engine_request,
                        sampling_params,
                        request_id_item,
                        lora_request=lora_request,
                        trace_headers=trace_headers,
                        priority=request.priority,
                        prompt_text=prompt_text,
                        tokenization_kwargs=tokenization_kwargs,
                        data_parallel_rank=data_parallel_rank,
                    )

                generators.append(generator)
        except ValueError as e:
            # TODO: Use a vllm-specific Validation Error
            return self.create_error_response(str(e))

        result_generator = merge_async_iterators(*generators)

        model_name = self.models.model_name(lora_request)
        num_prompts = len(engine_prompts)

        # Similar to the OpenAI API, when n != best_of, we do not stream the
        # results. Noting that best_of is only supported in V0. In addition,
        # we do not stream the results when use beam search.
        stream = (
            request.stream
            and (request.best_of is None or request.n == request.best_of)
            and not request.use_beam_search
        )

        # Streaming response
        if stream:
            return self.completion_stream_generator(
                request,
                engine_prompts,
                result_generator,
                request_id,
                created_time,
                model_name,
                num_prompts=num_prompts,
                tokenizer=tokenizer,
                request_metadata=request_metadata,
            )

        # Non-streaming response
        final_res_batch: list[RequestOutput | None] = [None] * num_prompts
        try:
            # Keep only the latest output seen for each prompt index.
            async for i, res in result_generator:
                final_res_batch[i] = res

            for i, final_res in enumerate(final_res_batch):
                assert final_res is not None

                # The output should contain the input text
                # We did not pass it into vLLM engine to avoid being redundant
                # with the inputs token IDs
                if final_res.prompt is None:
                    engine_prompt = engine_prompts[i]
                    final_res.prompt = (
                        None
                        if is_embeds_prompt(engine_prompt)
                        else engine_prompt.get("prompt")
                    )

            final_res_batch_checked = cast(list[RequestOutput], final_res_batch)

            response = self.request_output_to_completion_response(
                final_res_batch_checked,
                request,
                request_id,
                created_time,
                model_name,
                tokenizer,
                request_metadata,
            )
        except asyncio.CancelledError:
            return self.create_error_response("Client disconnected")
        except ValueError as e:
            # TODO: Use a vllm-specific Validation Error
            return self.create_error_response(str(e))

        # When user requests streaming but we don't stream, we still need to
        # return a streaming response with a single event.
        if request.stream:
            response_json = response.model_dump_json()

            async def fake_stream_generator() -> AsyncGenerator[str, None]:
                yield f"data: {response_json}\n\n"
                yield "data: [DONE]\n\n"

            return fake_stream_generator()

        return response
324
325
326
327

    async def completion_stream_generator(
        self,
        request: CompletionRequest,
        engine_prompts: list[TokensPrompt | EmbedsPrompt],
        result_generator: AsyncIterator[tuple[int, RequestOutput]],
        request_id: str,
        created_time: int,
        model_name: str,
        num_prompts: int,
        tokenizer: AnyTokenizer,
        request_metadata: RequestResponseMetadata,
    ) -> AsyncGenerator[str, None]:
        """Yield the completion as server-sent-event ``data:`` strings.

        Args:
            request: The completion request being served.
            engine_prompts: The rendered prompts, indexed by prompt position;
                used to recover the prompt text when an output lacks it.
            result_generator: Async iterator of ``(prompt_idx, output)``
                pairs.
            request_id: Id placed on every emitted chunk.
            created_time: Creation timestamp placed on every emitted chunk.
            model_name: Model name placed on every emitted chunk.
            num_prompts: Number of prompts in the request.
            tokenizer: Tokenizer handed to the logprobs builder.
            request_metadata: Receives the aggregated usage once the stream
                has been fully consumed without error.

        Yields:
            One ``data: <json>`` event per non-empty output delta, optionally
            a final usage-only event, an error event if anything raised, and
            always a closing ``data: [DONE]`` event.
        """
        num_choices = 1 if request.n is None else request.n
        # Per-choice state, flattened as prompt_idx * num_choices + index.
        previous_text_lens = [0] * num_choices * num_prompts
        previous_num_tokens = [0] * num_choices * num_prompts
        has_echoed = [False] * num_choices * num_prompts
        num_prompt_tokens = [0] * num_prompts
        num_cached_tokens = None
        first_iteration = True

        stream_options = request.stream_options
        include_usage, include_continuous_usage = should_include_usage(
            stream_options, self.enable_force_include_usage
        )

        try:
            async for prompt_idx, res in result_generator:
                prompt_token_ids = res.prompt_token_ids
                prompt_logprobs = res.prompt_logprobs

                # The cached-token count is taken from the first result only.
                if first_iteration:
                    num_cached_tokens = res.num_cached_tokens
                    first_iteration = False

                prompt_text = res.prompt
                if prompt_text is None:
                    engine_prompt = engine_prompts[prompt_idx]
                    prompt_text = (
                        None
                        if is_embeds_prompt(engine_prompt)
                        else engine_prompt.get("prompt")
                    )

                # Prompt details are excluded from later streamed outputs
                if prompt_token_ids is not None:
                    num_prompt_tokens[prompt_idx] = len(prompt_token_ids)

                delta_token_ids: GenericSequence[int]
                out_logprobs: GenericSequence[dict[int, Logprob] | None] | None

                for output in res.outputs:
                    i = output.index + prompt_idx * num_choices

                    # Useful when request.return_token_ids is True
                    # Returning prompt token IDs shares the same logic
                    # with the echo implementation.
                    prompt_token_ids_to_return: list[int] | None = None

                    assert request.max_tokens is not None
                    if request.echo and not has_echoed[i]:
                        assert prompt_token_ids is not None
                        if request.return_token_ids:
                            prompt_text = ""
                        assert prompt_text is not None
                        if request.max_tokens == 0:
                            # only return the prompt
                            delta_text = prompt_text
                            delta_token_ids = prompt_token_ids
                            out_logprobs = prompt_logprobs
                        else:
                            # echo the prompt and first token
                            delta_text = prompt_text + output.text
                            delta_token_ids = [
                                *prompt_token_ids,
                                *output.token_ids,
                            ]
                            out_logprobs = [
                                *(prompt_logprobs or []),
                                *(output.logprobs or []),
                            ]
                        prompt_token_ids_to_return = prompt_token_ids
                        has_echoed[i] = True
                    else:
                        # return just the delta
                        delta_text = output.text
                        delta_token_ids = output.token_ids
                        out_logprobs = output.logprobs

                        # has_echoed[i] is reused here to indicate whether
                        # we have already returned the prompt token IDs.
                        if not has_echoed[i] and request.return_token_ids:
                            prompt_token_ids_to_return = prompt_token_ids
                            has_echoed[i] = True

                        if (
                            not delta_text
                            and not delta_token_ids
                            and not previous_num_tokens[i]
                        ):
                            # Chunked prefill case, don't return empty chunks
                            continue

                    if request.logprobs is not None:
                        assert out_logprobs is not None, "Did not output logprobs"
                        logprobs = self._create_completion_logprobs(
                            token_ids=delta_token_ids,
                            top_logprobs=out_logprobs,
                            num_output_top_logprobs=request.logprobs,
                            tokenizer=tokenizer,
                            initial_text_offset=previous_text_lens[i],
                            return_as_token_id=request.return_tokens_as_token_ids,
                        )
                    else:
                        logprobs = None

                    # NOTE(review): only the generated text is counted here,
                    # so after an echoed prompt the next chunk's
                    # initial_text_offset does not include the prompt length
                    # -- confirm this is intended.
                    previous_text_lens[i] += len(output.text)
                    previous_num_tokens[i] += len(output.token_ids)
                    finish_reason = output.finish_reason
                    stop_reason = output.stop_reason

                    chunk = CompletionStreamResponse(
                        id=request_id,
                        created=created_time,
                        model=model_name,
                        choices=[
                            CompletionResponseStreamChoice(
                                index=i,
                                text=delta_text,
                                logprobs=logprobs,
                                finish_reason=finish_reason,
                                stop_reason=stop_reason,
                                prompt_token_ids=prompt_token_ids_to_return,
                                token_ids=(
                                    as_list(output.token_ids)
                                    if request.return_token_ids
                                    else None
                                ),
                            )
                        ],
                    )
                    if include_continuous_usage:
                        prompt_tokens = num_prompt_tokens[prompt_idx]
                        completion_tokens = previous_num_tokens[i]
                        chunk.usage = UsageInfo(
                            prompt_tokens=prompt_tokens,
                            completion_tokens=completion_tokens,
                            total_tokens=prompt_tokens + completion_tokens,
                        )

                    response_json = chunk.model_dump_json(exclude_unset=False)
                    yield f"data: {response_json}\n\n"

            total_prompt_tokens = sum(num_prompt_tokens)
            total_completion_tokens = sum(previous_num_tokens)
            final_usage_info = UsageInfo(
                prompt_tokens=total_prompt_tokens,
                completion_tokens=total_completion_tokens,
                total_tokens=total_prompt_tokens + total_completion_tokens,
            )

            if self.enable_prompt_tokens_details and num_cached_tokens:
                final_usage_info.prompt_tokens_details = PromptTokenUsageInfo(
                    cached_tokens=num_cached_tokens
                )

            if include_usage:
                final_usage_chunk = CompletionStreamResponse(
                    id=request_id,
                    created=created_time,
                    model=model_name,
                    choices=[],
                    usage=final_usage_info,
                )
                final_usage_data = final_usage_chunk.model_dump_json(
                    exclude_unset=False, exclude_none=True
                )
                yield f"data: {final_usage_data}\n\n"

            # report to FastAPI middleware aggregate usage across all choices
            request_metadata.final_usage_info = final_usage_info

        except Exception as e:
            # TODO: Use a vllm-specific Validation Error
            # The failure is reported in-band as a stream event; log it as
            # well so it is not visible to the client only.
            logger.exception("Error in completion stream generator.")
            data = self.create_streaming_error_response(str(e))
            yield f"data: {data}\n\n"
        yield "data: [DONE]\n\n"

    def request_output_to_completion_response(
        self,
        final_res_batch: list[RequestOutput],
        request: CompletionRequest,
        request_id: str,
        created_time: int,
        model_name: str,
        tokenizer: AnyTokenizer,
        request_metadata: RequestResponseMetadata,
    ) -> CompletionResponse:
        """Assemble the non-streaming response from the finished outputs.

        Args:
            final_res_batch: One finished output per prompt.
            request: The completion request being answered.
            request_id: Id placed on the response.
            created_time: Creation timestamp placed on the response.
            model_name: Model name placed on the response.
            tokenizer: Tokenizer handed to the logprobs builder.
            request_metadata: Receives the aggregated usage info.

        Returns:
            A ``CompletionResponse`` with one choice per generated sequence,
            numbered in the order they are encountered, plus total usage.
        """
        choices: list[CompletionResponseChoice] = []
        prompt_token_total = 0
        generated_token_total = 0

        for final_res in final_res_batch:
            prompt_token_ids = final_res.prompt_token_ids
            assert prompt_token_ids is not None
            prompt_logprobs = clamp_prompt_logprobs(final_res.prompt_logprobs)
            prompt_text = final_res.prompt

            token_ids: GenericSequence[int]
            out_logprobs: GenericSequence[dict[int, Logprob] | None] | None

            for output in final_res.outputs:
                assert request.max_tokens is not None
                if not request.echo:
                    # Plain case: the choice carries only the generation.
                    token_ids = output.token_ids
                    out_logprobs = output.logprobs
                    output_text = output.text
                else:
                    # When token ids are returned, the echoed prompt text is
                    # replaced by an empty string.
                    if request.return_token_ids:
                        prompt_text = ""
                    assert prompt_text is not None

                    if request.max_tokens == 0:
                        # Nothing was generated: the choice is the prompt.
                        output_text = prompt_text
                        token_ids = prompt_token_ids
                        out_logprobs = prompt_logprobs
                    else:
                        # Prompt followed by the generation.
                        token_ids = [*prompt_token_ids, *output.token_ids]

                        if request.logprobs is not None:
                            assert prompt_logprobs is not None
                            assert output.logprobs is not None
                            out_logprobs = [
                                *prompt_logprobs,
                                *output.logprobs,
                            ]
                        else:
                            out_logprobs = None

                        output_text = prompt_text + output.text

                logprobs = None
                if request.logprobs is not None:
                    assert out_logprobs is not None, "Did not output logprobs"
                    logprobs = self._create_completion_logprobs(
                        token_ids=token_ids,
                        top_logprobs=out_logprobs,
                        tokenizer=tokenizer,
                        num_output_top_logprobs=request.logprobs,
                        return_as_token_id=request.return_tokens_as_token_ids,
                    )

                # Token ids are attached to the choice only on request.
                if request.return_token_ids:
                    returned_prompt_ids = prompt_token_ids
                    returned_output_ids = as_list(output.token_ids)
                else:
                    returned_prompt_ids = None
                    returned_output_ids = None

                choices.append(
                    CompletionResponseChoice(
                        index=len(choices),
                        text=output_text,
                        logprobs=logprobs,
                        finish_reason=output.finish_reason,
                        stop_reason=output.stop_reason,
                        prompt_logprobs=final_res.prompt_logprobs,
                        prompt_token_ids=returned_prompt_ids,
                        token_ids=returned_output_ids,
                    )
                )

                generated_token_total += len(output.token_ids)

            prompt_token_total += len(prompt_token_ids)

        usage = UsageInfo(
            prompt_tokens=prompt_token_total,
            completion_tokens=generated_token_total,
            total_tokens=prompt_token_total + generated_token_total,
        )

        # The cached-token count is read from the last output of the batch.
        last_final_res = final_res_batch[-1] if final_res_batch else None
        if (
            self.enable_prompt_tokens_details
            and last_final_res
            and last_final_res.num_cached_tokens
        ):
            usage.prompt_tokens_details = PromptTokenUsageInfo(
                cached_tokens=last_final_res.num_cached_tokens
            )

        request_metadata.final_usage_info = usage

        # KV-transfer parameters are read from the first output of the batch.
        kv_transfer_params = (
            final_res_batch[0].kv_transfer_params if final_res_batch else None
        )

        return CompletionResponse(
            id=request_id,
            created=created_time,
            model=model_name,
            choices=choices,
            usage=usage,
            kv_transfer_params=kv_transfer_params,
        )
624
625
626
627

    def _create_completion_logprobs(
        self,
        token_ids: GenericSequence[int],
        top_logprobs: GenericSequence[dict[int, Logprob] | None],
        num_output_top_logprobs: int,
        tokenizer: AnyTokenizer,
        initial_text_offset: int = 0,
        return_as_token_id: bool | None = None,
    ) -> CompletionLogProbs:
        """Create logprobs for OpenAI Completion API.

        Args:
            token_ids: The token ids to report, in order.
            top_logprobs: For each position, a mapping from token id to its
                ``Logprob``, or ``None`` when no logprobs exist for that
                position. Indexed in step with ``token_ids``.
            num_output_top_logprobs: Number of top entries requested; the
                first ``num_output_top_logprobs + 1`` entries of each mapping
                are reported.
            tokenizer: Used to decode a token whose position has no
                logprobs, unless tokens are rendered as ids.
            initial_text_offset: Text offset assigned to the first token.
            return_as_token_id: Render tokens as ``token_id:<id>``. ``None``
                falls back to ``self.return_tokens_as_token_ids``.

        Returns:
            A ``CompletionLogProbs`` with parallel per-token lists.
        """
        out_text_offset: list[int] = []
        out_token_logprobs: list[float | None] = []
        out_tokens: list[str] = []
        out_top_logprobs: list[dict[str, float] | None] = []

        should_return_as_token_id = (
            return_as_token_id
            if return_as_token_id is not None
            else self.return_tokens_as_token_ids
        )

        # Offset of the token being processed; advanced by the length of
        # each rendered token.
        next_text_offset = initial_text_offset

        for idx, token_id in enumerate(token_ids):
            step_top_logprobs = top_logprobs[idx]
            if step_top_logprobs is None:
                # Decode only when the decoded text is actually used; the
                # id rendering does not need the tokenizer at all.
                if should_return_as_token_id:
                    token = f"token_id:{token_id}"
                else:
                    token = tokenizer.decode(token_id)

                out_tokens.append(token)
                out_token_logprobs.append(None)
                out_top_logprobs.append(None)
            else:
                step_token = step_top_logprobs[token_id]

                token = self._get_decoded_token(
                    step_token,
                    token_id,
                    tokenizer,
                    return_as_token_id=should_return_as_token_id,
                )
                # Convert float("-inf") to the
                # JSON-serializable float that OpenAI uses
                token_logprob = max(step_token.logprob, -9999.0)

                out_tokens.append(token)
                out_token_logprobs.append(token_logprob)

                # makes sure to add the top num_output_top_logprobs + 1
                # logprobs, as defined in the openai API
                # (cf. https://github.com/openai/openai-openapi/blob/
                # 893ba52242dbd5387a97b96444ee1c742cfce9bd/openapi.yaml#L7153)
                out_top_logprobs.append(
                    {
                        self._get_decoded_token(
                            candidate,
                            candidate_id,
                            tokenizer,
                            return_as_token_id=should_return_as_token_id,
                        ): max(candidate.logprob, -9999.0)
                        for rank, (candidate_id, candidate) in enumerate(
                            step_top_logprobs.items()
                        )
                        if num_output_top_logprobs >= rank
                    }
                )

            out_text_offset.append(next_text_offset)
            next_text_offset += len(token)

        return CompletionLogProbs(
            text_offset=out_text_offset,
            token_logprobs=out_token_logprobs,
            tokens=out_tokens,
            top_logprobs=out_top_logprobs,
        )
702
703
704
705

    def _build_render_config(
        self,
        request: CompletionRequest,
        max_input_length: int | None = None,
    ) -> RenderConfig:
        """Build the prompt-rendering configuration for a request.

        The maximum prompt length is the model's maximum length minus the
        requested ``max_tokens`` (treated as 0 when unset or zero).
        Detokenization is requested only when the prompt is echoed and token
        ids are not being returned.

        Args:
            request: The completion request supplying the rendering options.
            max_input_length: Accepted but not read by this implementation.

        Returns:
            The ``RenderConfig`` for this request.
        """
        reserved_for_output = request.max_tokens or 0
        wants_prompt_text = bool(request.echo and not request.return_token_ids)

        return RenderConfig(
            max_length=self.max_model_len - reserved_for_output,
            truncate_prompt_tokens=request.truncate_prompt_tokens,
            add_special_tokens=request.add_special_tokens,
            cache_salt=request.cache_salt,
            needs_detokenization=wants_prompt_text,
        )