"csrc/vscode:/vscode.git/clone" did not exist on "8678a69ab51956031e3bb70bdf1a781a8652e67d"
serving.py 29.9 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
import asyncio
5
import time
6
7
from collections.abc import AsyncGenerator, AsyncIterator
from collections.abc import Sequence as GenericSequence
8
from typing import cast
9

10
import jinja2
11
from fastapi import Request
12

13
from vllm.engine.protocol import EngineClient
14
from vllm.entrypoints.logger import RequestLogger
15
from vllm.entrypoints.openai.completion.protocol import (
16
17
18
19
20
21
    CompletionLogProbs,
    CompletionRequest,
    CompletionResponse,
    CompletionResponseChoice,
    CompletionResponseStreamChoice,
    CompletionStreamResponse,
22
23
)
from vllm.entrypoints.openai.engine.protocol import (
24
25
26
27
28
    ErrorResponse,
    PromptTokenUsageInfo,
    RequestResponseMetadata,
    UsageInfo,
)
29
from vllm.entrypoints.openai.engine.serving import (
30
31
32
33
    GenerationError,
    OpenAIServing,
    clamp_prompt_logprobs,
)
34
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
35
from vllm.entrypoints.renderer import RenderConfig
36
from vllm.entrypoints.utils import get_max_tokens, should_include_usage
37
from vllm.exceptions import VLLMValidationError
38
from vllm.inputs.data import EmbedsPrompt, TokensPrompt, is_embeds_prompt
39
from vllm.logger import init_logger
40
from vllm.logprobs import Logprob
41
from vllm.outputs import RequestOutput
42
from vllm.sampling_params import BeamSearchParams, SamplingParams
43
from vllm.tokenizers import TokenizerLike
44
45
from vllm.utils.async_utils import merge_async_iterators
from vllm.utils.collection_utils import as_list
46
from vllm.v1.sample.logits_processor import validate_logits_processors_parameters
47
48
49
50
51

# Module-level logger for this file, created through vLLM's `init_logger`.
logger = init_logger(__name__)


class OpenAIServingCompletion(OpenAIServing):
52
53
    def __init__(
        self,
        engine_client: EngineClient,
        models: OpenAIServingModels,
        *,
        request_logger: RequestLogger | None,
        return_tokens_as_token_ids: bool = False,
        enable_prompt_tokens_details: bool = False,
        enable_force_include_usage: bool = False,
        log_error_stack: bool = False,
    ):
        """Set up the completion serving handler.

        Args:
            engine_client: Forwarded to the base class.
            models: Forwarded to the base class.
            request_logger: Forwarded to the base class; may be ``None``.
            return_tokens_as_token_ids: Forwarded to the base class.
            enable_prompt_tokens_details: Stored on the instance; consulted
                when usage information is built.
            enable_force_include_usage: Stored on the instance; passed to
                ``should_include_usage`` when streaming.
            log_error_stack: Forwarded to the base class.
        """
        super().__init__(
            engine_client=engine_client,
            models=models,
            request_logger=request_logger,
            return_tokens_as_token_ids=return_tokens_as_token_ids,
            log_error_stack=log_error_stack,
        )

        # Logits processors configured on the model.
        self.logits_processors = self.model_config.logits_processors

        self.enable_prompt_tokens_details = enable_prompt_tokens_details
        self.default_sampling_params = self.model_config.get_diff_sampling_param()
        self.enable_force_include_usage = enable_force_include_usage

        # Nothing to report when there are no default sampling params.
        if not self.default_sampling_params:
            return

        generation_config = self.model_config.generation_config
        origin = "model" if generation_config == "auto" else generation_config
        logger.info(
            "Using default completion sampling params from %s: %s",
            origin,
            self.default_sampling_params,
        )
85

86
    async def render_completion_request(
        self,
        request: CompletionRequest,
    ) -> list[TokensPrompt | EmbedsPrompt] | ErrorResponse:
        """Validate a completion request and render its prompts.

        Args:
            request: The completion request to check and render.

        Returns:
            The list of rendered engine prompts on success, or an
            ``ErrorResponse`` when the model check fails, an unsupported
            option combination is requested, or rendering raises a handled
            error.

        Raises:
            The engine client's ``dead_error`` when the engine client reports
            that it has errored.
        """
        model_error = await self._check_model(request)
        if model_error is not None:
            return model_error

        # A dead engine must surface as an exception here: in the streaming
        # case a success status is sent before any text is generated.
        if self.engine_client.errored:
            raise self.engine_client.dead_error

        # Reject feature combinations that are not supported.
        if request.suffix is not None:
            return self.create_error_response("suffix is not currently supported")

        if request.echo and request.prompt_embeds is not None:
            return self.create_error_response("Echo is unsupported with prompt embeds.")

        if request.prompt_logprobs is not None and request.prompt_embeds is not None:
            return self.create_error_response(
                "prompt_logprobs is not compatible with prompt embeds."
            )

        try:
            prompt_renderer = self._get_completion_renderer()
            rendered_prompts = await prompt_renderer.render_prompt_and_embeds(
                prompt_or_prompts=request.prompt,
                prompt_embeds=request.prompt_embeds,
                config=self._build_render_config(request),
            )
        except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as exc:
            logger.exception("Error in preprocessing prompt inputs")
            return self.create_error_response(exc)
        else:
            return rendered_prompts

    async def create_completion(
        self,
        request: CompletionRequest,
        raw_request: Request | None = None,
    ) -> AsyncGenerator[str, None] | CompletionResponse | ErrorResponse:
        """Completion API similar to OpenAI's API.

        See https://platform.openai.com/docs/api-reference/completions/create
        for the API specification. This API mimics the OpenAI Completion API.

        NOTE: Currently we do not support the following feature:
            - suffix (the language models we currently support do not support
            suffix)

        Args:
            request: The parsed completion request.
            raw_request: The underlying HTTP request, if any. When given, the
                request metadata is attached to ``raw_request.state`` and its
                headers are used to derive the trace headers.

        Returns:
            An ``ErrorResponse`` when rendering, adapter lookup, scheduling or
            generation fails with a handled error; an async generator of
            server-sent-event strings when ``request.stream`` is set;
            otherwise a ``CompletionResponse``.
        """
        result = await self.render_completion_request(request)
        if isinstance(result, ErrorResponse):
            return result

        engine_prompts = result

        request_id = f"cmpl-{self._base_request_id(raw_request, request.request_id)}"
        created_time = int(time.time())

        request_metadata = RequestResponseMetadata(request_id=request_id)
        if raw_request:
            raw_request.state.request_metadata = request_metadata

        try:
            lora_request = self._maybe_get_adapters(request)
        except (ValueError, TypeError, RuntimeError) as e:
            logger.exception("Error preparing request components")
            return self.create_error_response(e)

        # Extract data_parallel_rank from header (router can inject it)
        data_parallel_rank = self._get_data_parallel_rank(raw_request)

        # Normalise the default sampling params once per request instead of
        # re-checking them for every prompt in the batch.
        if self.default_sampling_params is None:
            self.default_sampling_params = {}
        default_sampling_params = self.default_sampling_params

        # Schedule the request and get the result generator.
        generators: list[AsyncGenerator[RequestOutput, None]] = []
        try:
            # The trace headers depend only on the incoming HTTP request, so
            # resolve them once rather than awaiting the lookup per prompt.
            trace_headers = (
                None
                if raw_request is None
                else await self._get_trace_headers(raw_request.headers)
            )

            for i, engine_prompt in enumerate(engine_prompts):
                prompt_text, prompt_token_ids, prompt_embeds = (
                    self._get_prompt_components(engine_prompt)
                )

                # The input length comes from whichever representation the
                # prompt carries: token IDs first, embeddings otherwise.
                if prompt_token_ids is not None:
                    input_length = len(prompt_token_ids)
                elif prompt_embeds is not None:
                    input_length = len(prompt_embeds)
                else:
                    raise NotImplementedError

                max_tokens = get_max_tokens(
                    max_model_len=self.max_model_len,
                    request=request,
                    input_length=input_length,
                    default_sampling_params=default_sampling_params,
                )

                sampling_params: SamplingParams | BeamSearchParams
                if request.use_beam_search:
                    sampling_params = request.to_beam_search_params(
                        max_tokens, default_sampling_params
                    )
                else:
                    sampling_params = request.to_sampling_params(
                        max_tokens,
                        self.model_config.logits_processor_pattern,
                        default_sampling_params,
                    )
                    validate_logits_processors_parameters(
                        self.logits_processors,
                        sampling_params,
                    )

                # Each prompt of the batch gets its own sub-request ID.
                request_id_item = f"{request_id}-{i}"

                self._log_inputs(
                    request_id_item,
                    engine_prompt,
                    params=sampling_params,
                    lora_request=lora_request,
                )

                # Mypy inconsistently requires this second cast in different
                # environments. It shouldn't be necessary (redundant from above)
                # but pre-commit in CI fails without it.
                engine_prompt = cast(EmbedsPrompt | TokensPrompt, engine_prompt)
                if isinstance(sampling_params, BeamSearchParams):
                    generator = self.beam_search(
                        prompt=engine_prompt,
                        request_id=request_id,
                        params=sampling_params,
                        lora_request=lora_request,
                        trace_headers=trace_headers,
                    )
                else:
                    engine_request, tokenization_kwargs = await self._process_inputs(
                        request_id_item,
                        engine_prompt,
                        sampling_params,
                        lora_request=lora_request,
                        trace_headers=trace_headers,
                        priority=request.priority,
                        data_parallel_rank=data_parallel_rank,
                    )

                    generator = self.engine_client.generate(
                        engine_request,
                        sampling_params,
                        request_id_item,
                        lora_request=lora_request,
                        trace_headers=trace_headers,
                        priority=request.priority,
                        prompt_text=prompt_text,
                        tokenization_kwargs=tokenization_kwargs,
                        data_parallel_rank=data_parallel_rank,
                    )

                generators.append(generator)
        except ValueError as e:
            return self.create_error_response(e)

        # Yields (prompt_index, RequestOutput) pairs from all generators.
        result_generator = merge_async_iterators(*generators)

        model_name = self.models.model_name(lora_request)
        num_prompts = len(engine_prompts)

        # We do not stream the results when using beam search.
        stream = request.stream and not request.use_beam_search

        tokenizer = self.renderer.tokenizer

        # Streaming response
        if stream:
            return self.completion_stream_generator(
                request,
                engine_prompts,
                result_generator,
                request_id,
                created_time,
                model_name,
                num_prompts=num_prompts,
                tokenizer=tokenizer,
                request_metadata=request_metadata,
            )

        # Non-streaming response
        final_res_batch: list[RequestOutput | None] = [None] * num_prompts
        try:
            # Keep only the latest output seen for each prompt index.
            async for i, res in result_generator:
                final_res_batch[i] = res

            for i, final_res in enumerate(final_res_batch):
                assert final_res is not None

                # The output should contain the input text
                # We did not pass it into vLLM engine to avoid being redundant
                # with the inputs token IDs
                if final_res.prompt is None:
                    engine_prompt = engine_prompts[i]
                    final_res.prompt = (
                        None
                        if is_embeds_prompt(engine_prompt)
                        else engine_prompt.get("prompt")
                    )

            final_res_batch_checked = cast(list[RequestOutput], final_res_batch)

            response = self.request_output_to_completion_response(
                final_res_batch_checked,
                request,
                request_id,
                created_time,
                model_name,
                tokenizer,
                request_metadata,
            )
        except asyncio.CancelledError:
            return self.create_error_response("Client disconnected")
        except GenerationError as e:
            return self._convert_generation_error_to_response(e)
        except ValueError as e:
            return self.create_error_response(e)

        # When user requests streaming but we don't stream, we still need to
        # return a streaming response with a single event.
        if request.stream:
            response_json = response.model_dump_json()

            async def fake_stream_generator() -> AsyncGenerator[str, None]:
                yield f"data: {response_json}\n\n"
                yield "data: [DONE]\n\n"

            return fake_stream_generator()

        return response
338
339
340
341

    async def completion_stream_generator(
        self,
        request: CompletionRequest,
        engine_prompts: list[TokensPrompt | EmbedsPrompt],
        result_generator: AsyncIterator[tuple[int, RequestOutput]],
        request_id: str,
        created_time: int,
        model_name: str,
        num_prompts: int,
        tokenizer: TokenizerLike | None,
        request_metadata: RequestResponseMetadata,
    ) -> AsyncGenerator[str, None]:
        """Stream completion chunks as server-sent-event strings.

        Every yielded string is a ``data: <payload>`` line followed by a blank
        line. One chunk is emitted per non-empty output, optionally followed
        by a usage-only chunk, and the stream always ends with a ``[DONE]``
        event. Errors raised while streaming are turned into an error event
        instead of propagating.

        Args:
            request: The completion request being served.
            engine_prompts: Rendered prompts; used to recover the prompt text
                when a result does not carry it.
            result_generator: Async iterator of ``(prompt_index, output)``.
            request_id: ID placed on every chunk.
            created_time: Creation timestamp placed on every chunk.
            model_name: Model name placed on every chunk.
            num_prompts: Number of prompts in the request.
            tokenizer: Tokenizer handed to the logprobs builder; may be None.
            request_metadata: Receives the aggregated usage information.
        """
        choices_per_prompt = request.n if request.n is not None else 1
        # Per-choice state, addressed by
        # ``output.index + prompt_idx * choices_per_prompt``.
        text_len_so_far = [0] * choices_per_prompt * num_prompts
        tokens_so_far = [0] * choices_per_prompt * num_prompts
        prompt_sent = [False] * choices_per_prompt * num_prompts
        prompt_token_counts = [0] * num_prompts
        cached_token_count = None
        awaiting_first_result = True

        include_usage, include_continuous_usage = should_include_usage(
            request.stream_options, self.enable_force_include_usage
        )

        try:
            async for prompt_idx, request_output in result_generator:
                prompt_token_ids = request_output.prompt_token_ids
                prompt_logprobs = request_output.prompt_logprobs

                # The cached-token count is taken from the first result only.
                if awaiting_first_result:
                    cached_token_count = request_output.num_cached_tokens
                    awaiting_first_result = False

                prompt_text = request_output.prompt
                if prompt_text is None:
                    # Fall back to the rendered prompt; embeds have no text.
                    source_prompt = engine_prompts[prompt_idx]
                    if is_embeds_prompt(source_prompt):
                        prompt_text = None
                    else:
                        prompt_text = source_prompt.get("prompt")

                # Prompt details are excluded from later streamed outputs
                if prompt_token_ids is not None:
                    prompt_token_counts[prompt_idx] = len(prompt_token_ids)

                delta_token_ids: GenericSequence[int]
                out_logprobs: GenericSequence[dict[int, Logprob] | None] | None

                for output in request_output.outputs:
                    slot = output.index + prompt_idx * choices_per_prompt

                    # Only filled when the prompt token IDs go out with this
                    # chunk; that happens through the same bookkeeping as echo.
                    prompt_token_ids_to_return: list[int] | None = None

                    assert request.max_tokens is not None
                    if request.echo and not prompt_sent[slot]:
                        assert prompt_token_ids is not None
                        if request.return_token_ids:
                            prompt_text = ""
                        assert prompt_text is not None
                        if request.max_tokens == 0:
                            # Nothing is generated: the chunk is the prompt.
                            delta_text = prompt_text
                            delta_token_ids = prompt_token_ids
                            out_logprobs = prompt_logprobs
                        else:
                            # Prompt plus whatever this first output carries.
                            delta_text = prompt_text + output.text
                            delta_token_ids = [
                                *prompt_token_ids,
                                *output.token_ids,
                            ]
                            out_logprobs = [
                                *(prompt_logprobs or []),
                                *(output.logprobs or []),
                            ]
                        prompt_token_ids_to_return = prompt_token_ids
                        prompt_sent[slot] = True
                    else:
                        # Plain delta of the generated text.
                        delta_text = output.text
                        delta_token_ids = output.token_ids
                        out_logprobs = output.logprobs

                        # The same flag records whether the prompt token IDs
                        # have already been returned for this choice.
                        if not prompt_sent[slot] and request.return_token_ids:
                            prompt_token_ids_to_return = prompt_token_ids
                            prompt_sent[slot] = True

                        if (
                            not delta_text
                            and not delta_token_ids
                            and not tokens_so_far[slot]
                        ):
                            # Chunked prefill case, don't return empty chunks
                            continue

                    logprobs = None
                    if request.logprobs is not None:
                        assert out_logprobs is not None, "Did not output logprobs"
                        logprobs = self._create_completion_logprobs(
                            token_ids=delta_token_ids,
                            top_logprobs=out_logprobs,
                            num_output_top_logprobs=request.logprobs,
                            tokenizer=tokenizer,
                            initial_text_offset=text_len_so_far[slot],
                            return_as_token_id=request.return_tokens_as_token_ids,
                        )

                    text_len_so_far[slot] += len(output.text)
                    tokens_so_far[slot] += len(output.token_ids)
                    finish_reason = output.finish_reason
                    stop_reason = output.stop_reason

                    self._raise_if_error(finish_reason, request_id)

                    stream_choice = CompletionResponseStreamChoice(
                        index=slot,
                        text=delta_text,
                        logprobs=logprobs,
                        finish_reason=finish_reason,
                        stop_reason=stop_reason,
                        prompt_token_ids=prompt_token_ids_to_return,
                        token_ids=(
                            as_list(output.token_ids)
                            if request.return_token_ids
                            else None
                        ),
                    )
                    chunk = CompletionStreamResponse(
                        id=request_id,
                        created=created_time,
                        model=model_name,
                        choices=[stream_choice],
                    )
                    if include_continuous_usage:
                        chunk_prompt_tokens = prompt_token_counts[prompt_idx]
                        chunk_completion_tokens = tokens_so_far[slot]
                        chunk.usage = UsageInfo(
                            prompt_tokens=chunk_prompt_tokens,
                            completion_tokens=chunk_completion_tokens,
                            total_tokens=chunk_prompt_tokens + chunk_completion_tokens,
                        )

                    chunk_json = chunk.model_dump_json(exclude_unset=False)
                    yield f"data: {chunk_json}\n\n"

            # Aggregate usage over every prompt and choice.
            all_prompt_tokens = sum(prompt_token_counts)
            all_completion_tokens = sum(tokens_so_far)
            final_usage_info = UsageInfo(
                prompt_tokens=all_prompt_tokens,
                completion_tokens=all_completion_tokens,
                total_tokens=all_prompt_tokens + all_completion_tokens,
            )

            if self.enable_prompt_tokens_details and cached_token_count:
                final_usage_info.prompt_tokens_details = PromptTokenUsageInfo(
                    cached_tokens=cached_token_count
                )

            if include_usage:
                usage_only_chunk = CompletionStreamResponse(
                    id=request_id,
                    created=created_time,
                    model=model_name,
                    choices=[],
                    usage=final_usage_info,
                )
                usage_json = usage_only_chunk.model_dump_json(
                    exclude_unset=False, exclude_none=True
                )
                yield f"data: {usage_json}\n\n"

            # report to FastAPI middleware aggregate usage across all choices
            request_metadata.final_usage_info = final_usage_info

        except GenerationError as gen_err:
            yield f"data: {self._convert_generation_error_to_streaming_response(gen_err)}\n\n"
        except Exception as exc:
            logger.exception("Error in completion stream generator.")
            error_payload = self.create_streaming_error_response(exc)
            yield f"data: {error_payload}\n\n"
        yield "data: [DONE]\n\n"

    def request_output_to_completion_response(
        self,
        final_res_batch: list[RequestOutput],
        request: CompletionRequest,
        request_id: str,
        created_time: int,
        model_name: str,
        tokenizer: TokenizerLike | None,
        request_metadata: RequestResponseMetadata,
    ) -> CompletionResponse:
        """Build the non-streaming response from the final engine outputs.

        Produces one choice per output of every result in ``final_res_batch``
        and totals the prompt and generated token counts into the usage
        information. The usage is also stored on
        ``request_metadata.final_usage_info``.

        Args:
            final_res_batch: Final result for each prompt.
            request: The completion request being served.
            request_id: ID placed on the response.
            created_time: Creation timestamp placed on the response.
            model_name: Model name placed on the response.
            tokenizer: Tokenizer handed to the logprobs builder; may be None.
            request_metadata: Receives the computed usage information.

        Returns:
            The assembled ``CompletionResponse``.
        """
        choices: list[CompletionResponseChoice] = []
        prompt_token_total = 0
        generated_token_total = 0

        for final_res in final_res_batch:
            prompt_token_ids = final_res.prompt_token_ids
            assert prompt_token_ids is not None
            prompt_logprobs = clamp_prompt_logprobs(final_res.prompt_logprobs)
            prompt_text = final_res.prompt

            token_ids: GenericSequence[int]
            out_logprobs: GenericSequence[dict[int, Logprob] | None] | None

            for output in final_res.outputs:
                self._raise_if_error(output.finish_reason, request_id)

                assert request.max_tokens is not None
                if not request.echo:
                    # No echo: the choice is just the generated output.
                    token_ids = output.token_ids
                    out_logprobs = output.logprobs
                    output_text = output.text
                else:
                    if request.return_token_ids:
                        prompt_text = ""
                    assert prompt_text is not None
                    if request.max_tokens == 0:
                        # Echo with nothing generated: prompt only.
                        token_ids = prompt_token_ids
                        out_logprobs = prompt_logprobs
                        output_text = prompt_text
                    else:
                        # Echo: prompt first, then the generated part.
                        token_ids = [*prompt_token_ids, *output.token_ids]

                        if request.logprobs is not None:
                            assert prompt_logprobs is not None
                            assert output.logprobs is not None
                            out_logprobs = [
                                *prompt_logprobs,
                                *output.logprobs,
                            ]
                        else:
                            out_logprobs = None

                        output_text = prompt_text + output.text

                logprobs = None
                if request.logprobs is not None:
                    assert out_logprobs is not None, "Did not output logprobs"
                    logprobs = self._create_completion_logprobs(
                        token_ids=token_ids,
                        top_logprobs=out_logprobs,
                        tokenizer=tokenizer,
                        num_output_top_logprobs=request.logprobs,
                        return_as_token_id=request.return_tokens_as_token_ids,
                    )

                # Choice indices are positions in the flattened choice list.
                choices.append(
                    CompletionResponseChoice(
                        index=len(choices),
                        text=output_text,
                        logprobs=logprobs,
                        finish_reason=output.finish_reason,
                        stop_reason=output.stop_reason,
                        prompt_logprobs=final_res.prompt_logprobs,
                        prompt_token_ids=(
                            prompt_token_ids if request.return_token_ids else None
                        ),
                        token_ids=(
                            as_list(output.token_ids)
                            if request.return_token_ids
                            else None
                        ),
                    )
                )

                generated_token_total += len(output.token_ids)

            prompt_token_total += len(prompt_token_ids)

        usage = UsageInfo(
            prompt_tokens=prompt_token_total,
            completion_tokens=generated_token_total,
            total_tokens=prompt_token_total + generated_token_total,
        )

        # Cached-token details are read from the last result of the batch.
        last_final_res = final_res_batch[-1] if final_res_batch else None
        if (
            self.enable_prompt_tokens_details
            and last_final_res
            and last_final_res.num_cached_tokens
        ):
            usage.prompt_tokens_details = PromptTokenUsageInfo(
                cached_tokens=last_final_res.num_cached_tokens
            )

        request_metadata.final_usage_info = usage

        # KV transfer params are read from the first result of the batch.
        kv_transfer_params = (
            final_res_batch[0].kv_transfer_params if final_res_batch else None
        )
        return CompletionResponse(
            id=request_id,
            created=created_time,
            model=model_name,
            choices=choices,
            usage=usage,
            kv_transfer_params=kv_transfer_params,
        )
644
645
646
647

    def _create_completion_logprobs(
        self,
        token_ids: GenericSequence[int],
        top_logprobs: GenericSequence[dict[int, Logprob] | None],
        num_output_top_logprobs: int,
        tokenizer: TokenizerLike | None,
        initial_text_offset: int = 0,
        return_as_token_id: bool | None = None,
    ) -> CompletionLogProbs:
        """Assemble the Completion API logprobs payload for a token sequence.

        Args:
            token_ids: Token IDs to report, in order.
            top_logprobs: Per-position logprob mapping, indexed in step with
                ``token_ids``; an entry may be ``None``.
            num_output_top_logprobs: At most this many plus one entries of
                each position's mapping are reported.
            tokenizer: Used to decode a token whose position has no logprob
                mapping, and passed to ``_get_decoded_token`` otherwise.
            initial_text_offset: Text offset assigned to the first token.
            return_as_token_id: Overrides the instance-level
                ``return_tokens_as_token_ids`` setting when not ``None``.

        Returns:
            A ``CompletionLogProbs`` with parallel lists of text offsets,
            token logprobs, tokens and top logprobs.

        Raises:
            VLLMValidationError: When a token must be decoded but no
                tokenizer is available.
        """
        text_offsets: list[int] = []
        token_logprobs: list[float | None] = []
        tokens: list[str] = []
        top_logprob_dicts: list[dict[str, float] | None] = []

        if return_as_token_id is not None:
            use_token_ids = return_as_token_id
        else:
            use_token_ids = self.return_tokens_as_token_ids

        # Offset of the token about to be recorded; advanced by the length of
        # each token's string form.
        next_offset = initial_text_offset

        for position, token_id in enumerate(token_ids):
            step_top_logprobs = top_logprobs[position]

            if step_top_logprobs is None:
                # No logprob information for this position: report the token
                # alone, with empty logprob entries.
                if use_token_ids:
                    token = f"token_id:{token_id}"
                else:
                    if tokenizer is None:
                        raise VLLMValidationError(
                            "Unable to get tokenizer because "
                            "`skip_tokenizer_init=True`",
                            parameter="skip_tokenizer_init",
                            value=True,
                        )

                    token = tokenizer.decode(token_id)

                tokens.append(token)
                token_logprobs.append(None)
                top_logprob_dicts.append(None)
            else:
                step_token = step_top_logprobs[token_id]

                token = self._get_decoded_token(
                    step_token,
                    token_id,
                    tokenizer,
                    return_as_token_id=use_token_ids,
                )

                tokens.append(token)
                # Floor at -9999.0 so the value stays JSON-serializable.
                token_logprobs.append(max(step_token.logprob, -9999.0))

                # Keep the first num_output_top_logprobs + 1 entries, as
                # defined in the openai API
                # (cf. https://github.com/openai/openai-openapi/blob/
                # 893ba52242dbd5387a97b96444ee1c742cfce9bd/openapi.yaml#L7153)
                step_top: dict[str, float] = {}
                for rank, (cand_id, cand_logprob) in enumerate(
                    step_top_logprobs.items()
                ):
                    if rank > num_output_top_logprobs:
                        break
                    cand_token = self._get_decoded_token(
                        cand_logprob,
                        cand_id,
                        tokenizer,
                        return_as_token_id=use_token_ids,
                    )
                    step_top[cand_token] = max(cand_logprob.logprob, -9999.0)
                top_logprob_dicts.append(step_top)

            text_offsets.append(next_offset)
            next_offset += len(token)

        return CompletionLogProbs(
            text_offset=text_offsets,
            token_logprobs=token_logprobs,
            tokens=tokens,
            top_logprobs=top_logprob_dicts,
        )
731
732
733
734

    def _build_render_config(
        self,
        request: CompletionRequest,
        max_input_length: int | None = None,
    ) -> RenderConfig:
        """Create the prompt render configuration for a completion request.

        The maximum prompt length is the model's context length minus the
        requested ``max_tokens`` (treated as 0 when unset).

        Args:
            request: The completion request supplying the render options.
            max_input_length: Accepted for interface compatibility; this
                method does not read it.

        Returns:
            The ``RenderConfig`` for the request.

        Raises:
            VLLMValidationError: When ``request.max_tokens`` exceeds the
                model's maximum context length.
        """
        requested_tokens = request.max_tokens

        # Reject an over-long max_tokens before it is used in the subtraction.
        if requested_tokens is not None and requested_tokens > self.max_model_len:
            raise VLLMValidationError(
                f"'max_tokens' ({request.max_tokens}) cannot be greater than "
                f"the model's maximum context length ({self.max_model_len}).",
                parameter="max_tokens",
                value=request.max_tokens,
            )

        prompt_budget = self.max_model_len - (requested_tokens or 0)
        # Prompt text is only needed when echoing as text, not as token IDs.
        wants_prompt_text = bool(request.echo and not request.return_token_ids)
        return RenderConfig(
            max_length=prompt_budget,
            truncate_prompt_tokens=request.truncate_prompt_tokens,
            add_special_tokens=request.add_special_tokens,
            cache_salt=request.cache_salt,
            needs_detokenization=wants_prompt_text,
        )