"vscode:/vscode.git/clone" did not exist on "9112b443a042d8d815880b8780633882ad32b183"
serving.py 29.2 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
import asyncio
import itertools
import time
from collections.abc import AsyncGenerator, AsyncIterator
from collections.abc import Sequence as GenericSequence
from typing import cast

import jinja2
from fastapi import Request

from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.completion.protocol import (
    CompletionLogProbs,
    CompletionRequest,
    CompletionResponse,
    CompletionResponseChoice,
    CompletionResponseStreamChoice,
    CompletionStreamResponse,
)
from vllm.entrypoints.openai.engine.protocol import (
    ErrorResponse,
    PromptTokenUsageInfo,
    RequestResponseMetadata,
    UsageInfo,
)
from vllm.entrypoints.openai.engine.serving import (
    GenerationError,
    OpenAIServing,
    clamp_prompt_logprobs,
)
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.renderer import RenderConfig
from vllm.entrypoints.utils import get_max_tokens, should_include_usage
from vllm.exceptions import VLLMValidationError
from vllm.inputs.data import EmbedsPrompt, TokensPrompt, is_embeds_prompt
from vllm.inputs.parse import get_prompt_components
from vllm.logger import init_logger
from vllm.logprobs import Logprob
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams, SamplingParams
from vllm.tokenizers import TokenizerLike
from vllm.utils.async_utils import merge_async_iterators
from vllm.utils.collection_utils import as_list
from vllm.v1.sample.logits_processor import validate_logits_processors_parameters
48
49
50
51
52

# Module-level logger obtained from vLLM's `init_logger`, keyed by this
# module's name.
logger = init_logger(__name__)


class OpenAIServingCompletion(OpenAIServing):
    """Handler for the OpenAI-style text Completions API.

    Builds on `OpenAIServing`: renders the request prompts, schedules one
    generation per prompt on the engine client, and converts the engine
    outputs into either a single `CompletionResponse` or a stream of
    server-sent-event strings.
    """

    def __init__(
        self,
        engine_client: EngineClient,
        models: OpenAIServingModels,
        *,
        request_logger: RequestLogger | None,
        return_tokens_as_token_ids: bool = False,
        enable_prompt_tokens_details: bool = False,
        enable_force_include_usage: bool = False,
        log_error_stack: bool = False,
    ):
        """Initialise the handler.

        Args:
            engine_client: Forwarded to `OpenAIServing`; also used here to
                submit generation requests.
            models: Forwarded to `OpenAIServing`; also used here to resolve
                the model name reported in responses.
            request_logger: Forwarded to `OpenAIServing`.
            return_tokens_as_token_ids: Forwarded to `OpenAIServing`.
            enable_prompt_tokens_details: When true, usage info includes the
                cached-token count if the engine reports a non-zero value.
            enable_force_include_usage: Passed to `should_include_usage`
                when deciding whether streamed responses carry usage.
            log_error_stack: Forwarded to `OpenAIServing`.
        """
        super().__init__(
            engine_client=engine_client,
            models=models,
            request_logger=request_logger,
            return_tokens_as_token_ids=return_tokens_as_token_ids,
            log_error_stack=log_error_stack,
        )

        # set up logits processors
        self.logits_processors = self.model_config.logits_processors

        self.enable_prompt_tokens_details = enable_prompt_tokens_details
        self.enable_force_include_usage = enable_force_include_usage

        self.default_sampling_params = self.model_config.get_diff_sampling_param()

    async def render_completion_request(
        self,
        request: CompletionRequest,
    ) -> list[TokensPrompt | EmbedsPrompt] | ErrorResponse:
        """Render a completion request by validating and preprocessing inputs.

        Args:
            request: The incoming completion request.

        Returns:
            A list of engine_prompts on success,
            or an ErrorResponse on failure.

        Raises:
            Whatever `self.engine_client.dead_error` holds, when the engine
            client reports that it has errored.
        """
        error_check_ret = await self._check_model(request)
        if error_check_ret is not None:
            return error_check_ret

        # If the engine is dead, raise the engine's DEAD_ERROR.
        # This is required for the streaming case, where we return a
        # success status before we actually start generating text :).
        if self.engine_client.errored:
            raise self.engine_client.dead_error

        # Return error for unsupported features.
        if request.suffix is not None:
            return self.create_error_response("suffix is not currently supported")

        if request.echo and request.prompt_embeds is not None:
            return self.create_error_response("Echo is unsupported with prompt embeds.")

        if request.prompt_logprobs is not None and request.prompt_embeds is not None:
            return self.create_error_response(
                "prompt_logprobs is not compatible with prompt embeds."
            )

        try:
            renderer = self._get_completion_renderer()
            engine_prompts = await renderer.render_prompt_and_embeds(
                prompt_or_prompts=request.prompt,
                prompt_embeds=request.prompt_embeds,
                config=self._build_render_config(request),
            )
        except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e:
            logger.exception("Error in preprocessing prompt inputs")
            return self.create_error_response(e)

        return engine_prompts

    async def create_completion(
        self,
        request: CompletionRequest,
        raw_request: Request | None = None,
    ) -> AsyncGenerator[str, None] | CompletionResponse | ErrorResponse:
        """Completion API similar to OpenAI's API.

        See https://platform.openai.com/docs/api-reference/completions/create
        for the API specification. This API mimics the OpenAI Completion API.

        NOTE: Currently we do not support the following feature:
            - suffix (the language models we currently support do not support
            suffix)

        Args:
            request: The completion request to serve.
            raw_request: The underlying HTTP request, if any. When given, the
                request metadata is attached to `raw_request.state` and its
                headers are used for tracing.

        Returns:
            An `ErrorResponse` if rendering, adapter lookup, scheduling or
            generation fails; an async generator of SSE strings when
            `request.stream` is set; otherwise a `CompletionResponse`.
        """
        result = await self.render_completion_request(request)
        if isinstance(result, ErrorResponse):
            return result

        engine_prompts = result

        request_id = f"cmpl-{self._base_request_id(raw_request, request.request_id)}"
        created_time = int(time.time())

        request_metadata = RequestResponseMetadata(request_id=request_id)
        if raw_request:
            raw_request.state.request_metadata = request_metadata

        try:
            lora_request = self._maybe_get_adapters(request)
        except (ValueError, TypeError, RuntimeError) as e:
            logger.exception("Error preparing request components")
            return self.create_error_response(e)

        # Extract data_parallel_rank from header (router can inject it)
        data_parallel_rank = self._get_data_parallel_rank(raw_request)

        # Schedule the request and get the result generator.
        generators: list[AsyncGenerator[RequestOutput, None]] = []
        try:
            for i, engine_prompt in enumerate(engine_prompts):
                prompt_text, _, _ = get_prompt_components(engine_prompt)

                max_tokens = get_max_tokens(
                    max_model_len=self.max_model_len,
                    request=request,
                    prompt=engine_prompt,
                    default_sampling_params=self.default_sampling_params,
                )

                sampling_params: SamplingParams | BeamSearchParams
                if request.use_beam_search:
                    sampling_params = request.to_beam_search_params(
                        max_tokens, self.default_sampling_params
                    )
                else:
                    sampling_params = request.to_sampling_params(
                        max_tokens,
                        self.model_config.logits_processor_pattern,
                        self.default_sampling_params,
                    )
                    validate_logits_processors_parameters(
                        self.logits_processors,
                        sampling_params,
                    )

                # Each prompt of a batched request gets its own sub-request id.
                request_id_item = f"{request_id}-{i}"

                self._log_inputs(
                    request_id_item,
                    engine_prompt,
                    params=sampling_params,
                    lora_request=lora_request,
                )

                trace_headers = (
                    None
                    if raw_request is None
                    else await self._get_trace_headers(raw_request.headers)
                )

                # Mypy inconsistently requires this second cast in different
                # environments. It shouldn't be necessary (redundant from above)
                # but pre-commit in CI fails without it.
                engine_prompt = cast(EmbedsPrompt | TokensPrompt, engine_prompt)
                if isinstance(sampling_params, BeamSearchParams):
                    # NOTE(review): beam search is given the base `request_id`,
                    # not `request_id_item` - confirm this is intentional for
                    # requests that carry several prompts.
                    generator = self.beam_search(
                        prompt=engine_prompt,
                        request_id=request_id,
                        params=sampling_params,
                        lora_request=lora_request,
                        trace_headers=trace_headers,
                    )
                else:
                    engine_request, tokenization_kwargs = await self._process_inputs(
                        request_id_item,
                        engine_prompt,
                        sampling_params,
                        lora_request=lora_request,
                        trace_headers=trace_headers,
                        priority=request.priority,
                        data_parallel_rank=data_parallel_rank,
                    )

                    generator = self.engine_client.generate(
                        engine_request,
                        sampling_params,
                        request_id_item,
                        lora_request=lora_request,
                        trace_headers=trace_headers,
                        priority=request.priority,
                        prompt_text=prompt_text,
                        tokenization_kwargs=tokenization_kwargs,
                        data_parallel_rank=data_parallel_rank,
                    )

                generators.append(generator)
        except ValueError as e:
            return self.create_error_response(e)

        # Yields (prompt_index, RequestOutput) pairs from all generators.
        result_generator = merge_async_iterators(*generators)

        model_name = self.models.model_name(lora_request)
        num_prompts = len(engine_prompts)

        # We do not stream the results when using beam search.
        stream = request.stream and not request.use_beam_search

        tokenizer = self.renderer.tokenizer

        # Streaming response
        if stream:
            return self.completion_stream_generator(
                request,
                engine_prompts,
                result_generator,
                request_id,
                created_time,
                model_name,
                num_prompts=num_prompts,
                tokenizer=tokenizer,
                request_metadata=request_metadata,
            )

        # Non-streaming response
        final_res_batch: list[RequestOutput | None] = [None] * num_prompts
        try:
            # Keep only the latest output seen for each prompt index.
            async for i, res in result_generator:
                final_res_batch[i] = res

            for i, final_res in enumerate(final_res_batch):
                assert final_res is not None

                # The output should contain the input text
                # We did not pass it into vLLM engine to avoid being redundant
                # with the inputs token IDs
                if final_res.prompt is None:
                    engine_prompt = engine_prompts[i]
                    final_res.prompt = (
                        None
                        if is_embeds_prompt(engine_prompt)
                        else engine_prompt.get("prompt")
                    )

            final_res_batch_checked = cast(list[RequestOutput], final_res_batch)

            response = self.request_output_to_completion_response(
                final_res_batch_checked,
                request,
                request_id,
                created_time,
                model_name,
                tokenizer,
                request_metadata,
            )
        except asyncio.CancelledError:
            return self.create_error_response("Client disconnected")
        except GenerationError as e:
            return self._convert_generation_error_to_response(e)
        except ValueError as e:
            return self.create_error_response(e)

        # When user requests streaming but we don't stream, we still need to
        # return a streaming response with a single event.
        if request.stream:
            response_json = response.model_dump_json()

            async def fake_stream_generator() -> AsyncGenerator[str, None]:
                yield f"data: {response_json}\n\n"
                yield "data: [DONE]\n\n"

            return fake_stream_generator()

        return response

    async def completion_stream_generator(
        self,
        request: CompletionRequest,
        engine_prompts: list[TokensPrompt | EmbedsPrompt],
        result_generator: AsyncIterator[tuple[int, RequestOutput]],
        request_id: str,
        created_time: int,
        model_name: str,
        num_prompts: int,
        tokenizer: TokenizerLike | None,
        request_metadata: RequestResponseMetadata,
    ) -> AsyncGenerator[str, None]:
        """Stream completion chunks as server-sent-event strings.

        Yields one `data: {...}` event per non-empty output delta, an
        optional final usage-only event, and always a terminating
        `data: [DONE]` event. Errors raised while streaming are reported
        as a `data:` error event instead of propagating.

        Args:
            request: The completion request being served.
            engine_prompts: Rendered prompts, indexed by prompt position;
                used to recover prompt text the engine output lacks.
            result_generator: Async iterator of (prompt index, output).
            request_id: Identifier placed in every chunk.
            created_time: Timestamp placed in every chunk.
            model_name: Model name placed in every chunk.
            num_prompts: Number of prompts in the request.
            tokenizer: Tokenizer used when building logprobs, or None.
            request_metadata: Receives the aggregate usage once the stream
                finishes without error.
        """
        num_choices = 1 if request.n is None else request.n
        # Per-choice state, flattened as prompt_idx * num_choices + index.
        previous_text_lens = [0] * num_choices * num_prompts
        previous_num_tokens = [0] * num_choices * num_prompts
        has_echoed = [False] * num_choices * num_prompts
        num_prompt_tokens = [0] * num_prompts
        num_cached_tokens = None
        first_iteration = True

        stream_options = request.stream_options
        include_usage, include_continuous_usage = should_include_usage(
            stream_options, self.enable_force_include_usage
        )

        try:
            async for prompt_idx, res in result_generator:
                prompt_token_ids = res.prompt_token_ids
                prompt_logprobs = res.prompt_logprobs

                # Only the first output received is consulted for the
                # cached-token count.
                if first_iteration:
                    num_cached_tokens = res.num_cached_tokens
                    first_iteration = False

                prompt_text = res.prompt
                if prompt_text is None:
                    engine_prompt = engine_prompts[prompt_idx]
                    prompt_text = (
                        None
                        if is_embeds_prompt(engine_prompt)
                        else engine_prompt.get("prompt")
                    )

                # Prompt details are excluded from later streamed outputs
                if prompt_token_ids is not None:
                    num_prompt_tokens[prompt_idx] = len(prompt_token_ids)

                delta_token_ids: GenericSequence[int]
                out_logprobs: GenericSequence[dict[int, Logprob] | None] | None

                for output in res.outputs:
                    i = output.index + prompt_idx * num_choices

                    # Useful when request.return_token_ids is True
                    # Returning prompt token IDs shares the same logic
                    # with the echo implementation.
                    prompt_token_ids_to_return: list[int] | None = None

                    # `request.max_tokens` may be None (`_build_render_config`
                    # handles that case), so it is compared directly below
                    # rather than asserted non-None.
                    if request.echo and not has_echoed[i]:
                        assert prompt_token_ids is not None
                        if request.return_token_ids:
                            prompt_text = ""
                        assert prompt_text is not None
                        if request.max_tokens == 0:
                            # only return the prompt
                            delta_text = prompt_text
                            delta_token_ids = prompt_token_ids
                            out_logprobs = prompt_logprobs
                        else:
                            # echo the prompt and first token
                            delta_text = prompt_text + output.text
                            delta_token_ids = [
                                *prompt_token_ids,
                                *output.token_ids,
                            ]
                            out_logprobs = [
                                *(prompt_logprobs or []),
                                *(output.logprobs or []),
                            ]
                        prompt_token_ids_to_return = prompt_token_ids
                        has_echoed[i] = True
                    else:
                        # return just the delta
                        delta_text = output.text
                        delta_token_ids = output.token_ids
                        out_logprobs = output.logprobs

                        # has_echoed[i] is reused here to indicate whether
                        # we have already returned the prompt token IDs.
                        if not has_echoed[i] and request.return_token_ids:
                            prompt_token_ids_to_return = prompt_token_ids
                            has_echoed[i] = True

                        if (
                            not delta_text
                            and not delta_token_ids
                            and not previous_num_tokens[i]
                        ):
                            # Chunked prefill case, don't return empty chunks
                            continue

                    if request.logprobs is not None:
                        assert out_logprobs is not None, "Did not output logprobs"
                        logprobs = self._create_completion_logprobs(
                            token_ids=delta_token_ids,
                            top_logprobs=out_logprobs,
                            num_output_top_logprobs=request.logprobs,
                            tokenizer=tokenizer,
                            initial_text_offset=previous_text_lens[i],
                            return_as_token_id=request.return_tokens_as_token_ids,
                        )
                    else:
                        logprobs = None

                    # Advance by the text actually emitted in this chunk.
                    # With echo the first chunk also contains the prompt, so
                    # later text offsets must account for it (as the
                    # non-streaming path does).
                    previous_text_lens[i] += len(delta_text)
                    previous_num_tokens[i] += len(output.token_ids)
                    finish_reason = output.finish_reason
                    stop_reason = output.stop_reason

                    self._raise_if_error(finish_reason, request_id)

                    chunk = CompletionStreamResponse(
                        id=request_id,
                        created=created_time,
                        model=model_name,
                        choices=[
                            CompletionResponseStreamChoice(
                                index=i,
                                text=delta_text,
                                logprobs=logprobs,
                                finish_reason=finish_reason,
                                stop_reason=stop_reason,
                                prompt_token_ids=prompt_token_ids_to_return,
                                token_ids=(
                                    as_list(output.token_ids)
                                    if request.return_token_ids
                                    else None
                                ),
                            )
                        ],
                    )
                    if include_continuous_usage:
                        prompt_tokens = num_prompt_tokens[prompt_idx]
                        completion_tokens = previous_num_tokens[i]
                        chunk.usage = UsageInfo(
                            prompt_tokens=prompt_tokens,
                            completion_tokens=completion_tokens,
                            total_tokens=prompt_tokens + completion_tokens,
                        )

                    response_json = chunk.model_dump_json(exclude_unset=False)
                    yield f"data: {response_json}\n\n"

            total_prompt_tokens = sum(num_prompt_tokens)
            total_completion_tokens = sum(previous_num_tokens)
            final_usage_info = UsageInfo(
                prompt_tokens=total_prompt_tokens,
                completion_tokens=total_completion_tokens,
                total_tokens=total_prompt_tokens + total_completion_tokens,
            )

            if self.enable_prompt_tokens_details and num_cached_tokens:
                final_usage_info.prompt_tokens_details = PromptTokenUsageInfo(
                    cached_tokens=num_cached_tokens
                )

            if include_usage:
                final_usage_chunk = CompletionStreamResponse(
                    id=request_id,
                    created=created_time,
                    model=model_name,
                    choices=[],
                    usage=final_usage_info,
                )
                final_usage_data = final_usage_chunk.model_dump_json(
                    exclude_unset=False, exclude_none=True
                )
                yield f"data: {final_usage_data}\n\n"

            # report to FastAPI middleware aggregate usage across all choices
            request_metadata.final_usage_info = final_usage_info

        except GenerationError as e:
            yield f"data: {self._convert_generation_error_to_streaming_response(e)}\n\n"
        except Exception as e:
            logger.exception("Error in completion stream generator.")
            data = self.create_streaming_error_response(e)
            yield f"data: {data}\n\n"
        yield "data: [DONE]\n\n"

    def request_output_to_completion_response(
        self,
        final_res_batch: list[RequestOutput],
        request: CompletionRequest,
        request_id: str,
        created_time: int,
        model_name: str,
        tokenizer: TokenizerLike | None,
        request_metadata: RequestResponseMetadata,
    ) -> CompletionResponse:
        """Build the non-streaming response from the final engine outputs.

        Args:
            final_res_batch: One final `RequestOutput` per prompt.
            request: The completion request being served.
            request_id: Identifier placed in the response.
            created_time: Timestamp placed in the response.
            model_name: Model name placed in the response.
            tokenizer: Tokenizer used when building logprobs, or None.
            request_metadata: Receives the computed usage info.

        Returns:
            A `CompletionResponse` with one choice per generated output
            (numbered in iteration order) and aggregate token usage.
        """
        choices: list[CompletionResponseChoice] = []
        num_prompt_tokens = 0
        num_generated_tokens = 0
        kv_transfer_params = None
        last_final_res = None
        for final_res in final_res_batch:
            last_final_res = final_res
            prompt_token_ids = final_res.prompt_token_ids
            assert prompt_token_ids is not None
            prompt_logprobs = clamp_prompt_logprobs(final_res.prompt_logprobs)
            prompt_text = final_res.prompt

            token_ids: GenericSequence[int]
            out_logprobs: GenericSequence[dict[int, Logprob] | None] | None

            for output in final_res.outputs:
                self._raise_if_error(output.finish_reason, request_id)

                # `request.max_tokens` may be None (`_build_render_config`
                # handles that case), so it is compared directly below
                # rather than asserted non-None.
                if request.echo:
                    if request.return_token_ids:
                        prompt_text = ""
                    assert prompt_text is not None
                    if request.max_tokens == 0:
                        # Only the prompt is returned.
                        token_ids = prompt_token_ids
                        out_logprobs = prompt_logprobs
                        output_text = prompt_text
                    else:
                        # Prompt followed by the generated continuation.
                        token_ids = [*prompt_token_ids, *output.token_ids]

                        if request.logprobs is None:
                            out_logprobs = None
                        else:
                            assert prompt_logprobs is not None
                            assert output.logprobs is not None
                            out_logprobs = [
                                *prompt_logprobs,
                                *output.logprobs,
                            ]

                        output_text = prompt_text + output.text
                else:
                    token_ids = output.token_ids
                    out_logprobs = output.logprobs
                    output_text = output.text

                if request.logprobs is not None:
                    assert out_logprobs is not None, "Did not output logprobs"
                    logprobs = self._create_completion_logprobs(
                        token_ids=token_ids,
                        top_logprobs=out_logprobs,
                        tokenizer=tokenizer,
                        num_output_top_logprobs=request.logprobs,
                        return_as_token_id=request.return_tokens_as_token_ids,
                    )
                else:
                    logprobs = None

                choice_data = CompletionResponseChoice(
                    index=len(choices),
                    text=output_text,
                    logprobs=logprobs,
                    finish_reason=output.finish_reason,
                    stop_reason=output.stop_reason,
                    prompt_logprobs=final_res.prompt_logprobs,
                    prompt_token_ids=(
                        prompt_token_ids if request.return_token_ids else None
                    ),
                    token_ids=(
                        as_list(output.token_ids) if request.return_token_ids else None
                    ),
                )
                choices.append(choice_data)

                num_generated_tokens += len(output.token_ids)

            # Prompt tokens are counted once per prompt, not once per choice.
            num_prompt_tokens += len(prompt_token_ids)

        usage = UsageInfo(
            prompt_tokens=num_prompt_tokens,
            completion_tokens=num_generated_tokens,
            total_tokens=num_prompt_tokens + num_generated_tokens,
        )

        # The cached-token count is taken from the last output in the batch.
        if (
            self.enable_prompt_tokens_details
            and last_final_res
            and last_final_res.num_cached_tokens
        ):
            usage.prompt_tokens_details = PromptTokenUsageInfo(
                cached_tokens=last_final_res.num_cached_tokens
            )

        request_metadata.final_usage_info = usage
        # KV-transfer parameters are taken from the first output only.
        if final_res_batch:
            kv_transfer_params = final_res_batch[0].kv_transfer_params
        return CompletionResponse(
            id=request_id,
            created=created_time,
            model=model_name,
            choices=choices,
            usage=usage,
            kv_transfer_params=kv_transfer_params,
        )

    def _create_completion_logprobs(
        self,
        token_ids: GenericSequence[int],
        top_logprobs: GenericSequence[dict[int, Logprob] | None],
        num_output_top_logprobs: int,
        tokenizer: TokenizerLike | None,
        initial_text_offset: int = 0,
        return_as_token_id: bool | None = None,
    ) -> CompletionLogProbs:
        """Create logprobs for OpenAI Completion API.

        Args:
            token_ids: Token ids to report, in order.
            top_logprobs: For each position in `token_ids`, a mapping of
                candidate token id to `Logprob`, or None when no logprobs
                are available for that position.
            num_output_top_logprobs: Requested number of top logprobs; up to
                `num_output_top_logprobs + 1` candidates are reported per
                position.
            tokenizer: Used to decode tokens when they are not rendered as
                `token_id:<id>` strings.
            initial_text_offset: Text offset assigned to the first token.
            return_as_token_id: Overrides `self.return_tokens_as_token_ids`
                when not None.

        Returns:
            A `CompletionLogProbs` whose lists are parallel to `token_ids`.

        Raises:
            VLLMValidationError: If a token without logprobs must be decoded
                but no tokenizer is available.
        """
        out_text_offset: list[int] = []
        out_token_logprobs: list[float | None] = []
        out_tokens: list[str] = []
        out_top_logprobs: list[dict[str, float] | None] = []

        should_return_as_token_id = (
            return_as_token_id
            if return_as_token_id is not None
            else self.return_tokens_as_token_ids
        )

        # makes sure to add the top num_output_top_logprobs + 1
        # logprobs, as defined in the openai API
        # (cf. https://github.com/openai/openai-openapi/blob/
        # 893ba52242dbd5387a97b96444ee1c742cfce9bd/openapi.yaml#L7153)
        # Clamped at zero because `islice` rejects negative stops.
        max_top_entries = max(num_output_top_logprobs + 1, 0)

        # Offset of the next token: each token starts where the previous
        # one (measured by its rendered string length) ended.
        next_text_offset = initial_text_offset

        for i, token_id in enumerate(token_ids):
            step_top_logprobs = top_logprobs[i]
            if step_top_logprobs is None:
                if should_return_as_token_id:
                    token = f"token_id:{token_id}"
                else:
                    if tokenizer is None:
                        raise VLLMValidationError(
                            "Unable to get tokenizer because "
                            "`skip_tokenizer_init=True`",
                            parameter="skip_tokenizer_init",
                            value=True,
                        )

                    token = tokenizer.decode(token_id)

                out_tokens.append(token)
                out_token_logprobs.append(None)
                out_top_logprobs.append(None)
            else:
                step_token = step_top_logprobs[token_id]

                token = self._get_decoded_token(
                    step_token,
                    token_id,
                    tokenizer,
                    return_as_token_id=should_return_as_token_id,
                )
                token_logprob = max(step_token.logprob, -9999.0)

                out_tokens.append(token)
                out_token_logprobs.append(token_logprob)

                out_top_logprobs.append(
                    {
                        # Convert float("-inf") to the
                        # JSON-serializable float that OpenAI uses
                        self._get_decoded_token(
                            candidate_logprob,
                            candidate_id,
                            tokenizer,
                            return_as_token_id=should_return_as_token_id,
                        ): max(candidate_logprob.logprob, -9999.0)
                        for candidate_id, candidate_logprob in itertools.islice(
                            step_top_logprobs.items(), max_top_entries
                        )
                    }
                )

            out_text_offset.append(next_text_offset)
            next_text_offset += len(token)

        return CompletionLogProbs(
            text_offset=out_text_offset,
            token_logprobs=out_token_logprobs,
            tokens=out_tokens,
            top_logprobs=out_top_logprobs,
        )

    def _build_render_config(
        self,
        request: CompletionRequest,
        max_input_length: int | None = None,
    ) -> RenderConfig:
        """Build the prompt-rendering configuration for a request.

        Args:
            request: The completion request being rendered.
            max_input_length: Currently unused by this implementation; kept
                so the signature stays unchanged for existing callers.

        Returns:
            A `RenderConfig` whose `max_length` is the model context length
            minus the requested `max_tokens` (treated as 0 when unset).

        Raises:
            VLLMValidationError: If `request.max_tokens` exceeds the model's
                maximum context length.
        """
        # Validate max_tokens before using it
        if request.max_tokens is not None and request.max_tokens > self.max_model_len:
            raise VLLMValidationError(
                f"'max_tokens' ({request.max_tokens}) cannot be greater than "
                f"the model's maximum context length ({self.max_model_len}).",
                parameter="max_tokens",
                value=request.max_tokens,
            )

        max_input_tokens_len = self.max_model_len - (request.max_tokens or 0)
        return RenderConfig(
            max_length=max_input_tokens_len,
            truncate_prompt_tokens=request.truncate_prompt_tokens,
            add_special_tokens=request.add_special_tokens,
            cache_salt=request.cache_salt,
            # Prompt text is only needed when it will be echoed back as text.
            needs_detokenization=bool(request.echo and not request.return_token_ids),
        )