serving.py 29.7 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
import asyncio
5
import time
6
7
from collections.abc import AsyncGenerator, AsyncIterator
from collections.abc import Sequence as GenericSequence
8
from typing import cast
9

10
import jinja2
11
from fastapi import Request
12

13
from vllm.engine.protocol import EngineClient
14
from vllm.entrypoints.logger import RequestLogger
15
from vllm.entrypoints.openai.completion.protocol import (
16
17
18
19
20
21
    CompletionLogProbs,
    CompletionRequest,
    CompletionResponse,
    CompletionResponseChoice,
    CompletionResponseStreamChoice,
    CompletionStreamResponse,
22
23
)
from vllm.entrypoints.openai.engine.protocol import (
24
25
26
27
28
    ErrorResponse,
    PromptTokenUsageInfo,
    RequestResponseMetadata,
    UsageInfo,
)
29
from vllm.entrypoints.openai.engine.serving import (
30
31
32
33
    GenerationError,
    OpenAIServing,
    clamp_prompt_logprobs,
)
34
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
35
from vllm.entrypoints.renderer import RenderConfig
36
from vllm.entrypoints.utils import get_max_tokens, should_include_usage
37
from vllm.exceptions import VLLMValidationError
38
from vllm.inputs.data import EmbedsPrompt, TokensPrompt, is_embeds_prompt
39
from vllm.inputs.parse import get_prompt_components
40
from vllm.logger import init_logger
41
from vllm.logprobs import Logprob
42
from vllm.outputs import RequestOutput
43
from vllm.sampling_params import BeamSearchParams, SamplingParams
44
from vllm.tokenizers import TokenizerLike
45
46
from vllm.utils.async_utils import merge_async_iterators
from vllm.utils.collection_utils import as_list
47
from vllm.v1.sample.logits_processor import validate_logits_processors_parameters
48
49
50
51
52

# Module-level logger, created through vLLM's `init_logger` helper and keyed
# by this module's import name.
logger = init_logger(__name__)


class OpenAIServingCompletion(OpenAIServing):
53
54
    def __init__(
        self,
        engine_client: EngineClient,
        models: OpenAIServingModels,
        *,
        request_logger: RequestLogger | None,
        return_tokens_as_token_ids: bool = False,
        enable_prompt_tokens_details: bool = False,
        enable_force_include_usage: bool = False,
        log_error_stack: bool = False,
    ):
        """Initialise the completion-serving handler.

        Args:
            engine_client: Engine client forwarded to the base class.
            models: Model registry forwarded to the base class.
            request_logger: Optional request logger forwarded to the base
                class.
            return_tokens_as_token_ids: Forwarded to the base class.
            enable_prompt_tokens_details: Stored on the instance; when set,
                the response builders attach cached-token details to the
                usage information.
            enable_force_include_usage: Stored on the instance; passed to
                `should_include_usage` when streaming.
            log_error_stack: Forwarded to the base class.
        """
        super().__init__(
            engine_client=engine_client,
            models=models,
            request_logger=request_logger,
            return_tokens_as_token_ids=return_tokens_as_token_ids,
            log_error_stack=log_error_stack,
        )

        # Usage-reporting switches read by the response builders.
        self.enable_force_include_usage = enable_force_include_usage
        self.enable_prompt_tokens_details = enable_prompt_tokens_details

        # Settings derived from the model configuration: the configured
        # logits processors and the sampling-parameter defaults.
        model_config = self.model_config
        self.logits_processors = model_config.logits_processors
        self.default_sampling_params = model_config.get_diff_sampling_param()
79

80
    async def render_completion_request(
        self,
        request: CompletionRequest,
    ) -> list[TokensPrompt | EmbedsPrompt] | ErrorResponse:
        """Validate a completion request and render it into engine prompts.

        Args:
            request: The completion request to validate and render.

        Returns:
            The list of rendered engine prompts on success, or an
            ErrorResponse describing why the request was rejected.

        Raises:
            Whatever `engine_client.dead_error` holds, when the engine client
            reports that it has errored.
        """
        model_error = await self._check_model(request)
        if model_error is not None:
            return model_error

        # A dead engine must surface as an exception rather than an error
        # payload: in the streaming case a success status is sent before any
        # text is generated.
        engine_client = self.engine_client
        if engine_client.errored:
            raise engine_client.dead_error

        # Reject feature combinations that are not supported.
        if request.suffix is not None:
            return self.create_error_response("suffix is not currently supported")

        has_prompt_embeds = request.prompt_embeds is not None
        if has_prompt_embeds and request.echo:
            return self.create_error_response("Echo is unsupported with prompt embeds.")

        if has_prompt_embeds and request.prompt_logprobs is not None:
            return self.create_error_response(
                "prompt_logprobs is not compatible with prompt embeds."
            )

        try:
            renderer = self._get_completion_renderer()
            render_config = self._build_render_config(request)
            rendered_prompts = await renderer.render_prompt_and_embeds(
                prompt_or_prompts=request.prompt,
                prompt_embeds=request.prompt_embeds,
                config=render_config,
            )
        except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e:
            logger.exception("Error in preprocessing prompt inputs")
            return self.create_error_response(e)
        else:
            return rendered_prompts

    async def create_completion(
        self,
        request: CompletionRequest,
        raw_request: Request | None = None,
    ) -> AsyncGenerator[str, None] | CompletionResponse | ErrorResponse:
        """Completion API similar to OpenAI's API.

        See https://platform.openai.com/docs/api-reference/completions/create
        for the API specification. This API mimics the OpenAI Completion API.

        NOTE: Currently we do not support the following feature:
            - suffix (the language models we currently support do not support
            suffix)

        Args:
            request: The completion request.
            raw_request: The underlying HTTP request, if any. When present,
                the request metadata is attached to its state and its
                headers are used for trace-header extraction.

        Returns:
            An async generator of server-sent-event strings when the client
            asked for streaming, a CompletionResponse otherwise, or an
            ErrorResponse when validation or generation fails.
        """
        result = await self.render_completion_request(request)
        if isinstance(result, ErrorResponse):
            return result

        engine_prompts = result

        request_id = f"cmpl-{self._base_request_id(raw_request, request.request_id)}"
        created_time = int(time.time())

        request_metadata = RequestResponseMetadata(request_id=request_id)
        # Explicit None check, consistent with the trace-header handling
        # below (a request object should never be skipped for being falsy).
        if raw_request is not None:
            raw_request.state.request_metadata = request_metadata

        try:
            lora_request = self._maybe_get_adapters(request)
        except (ValueError, TypeError, RuntimeError) as e:
            logger.exception("Error preparing request components")
            return self.create_error_response(e)

        # Extract data_parallel_rank from header (router can inject it)
        data_parallel_rank = self._get_data_parallel_rank(raw_request)

        # Normalise the defaults once instead of re-checking for every prompt.
        if self.default_sampling_params is None:
            self.default_sampling_params = {}
        default_sampling_params = self.default_sampling_params

        # Schedule the request and get the result generator.
        generators: list[AsyncGenerator[RequestOutput, None]] = []
        try:
            # The trace headers depend only on the incoming HTTP request, so
            # they are resolved once and shared by every prompt.
            trace_headers = (
                None
                if raw_request is None
                else await self._get_trace_headers(raw_request.headers)
            )

            for i, engine_prompt in enumerate(engine_prompts):
                prompt_text, prompt_token_ids, prompt_embeds = (
                    self._get_prompt_components(engine_prompt)
                )

                input_length = None
                if prompt_token_ids is not None:
                    input_length = len(prompt_token_ids)
                elif prompt_embeds is not None:
                    input_length = len(prompt_embeds)
                else:
                    raise NotImplementedError

                max_tokens = get_max_tokens(
                    max_model_len=self.max_model_len,
                    request=request,
                    input_length=input_length,
                    default_sampling_params=default_sampling_params,
                )

                sampling_params: SamplingParams | BeamSearchParams
                if request.use_beam_search:
                    sampling_params = request.to_beam_search_params(
                        max_tokens, default_sampling_params
                    )
                else:
                    sampling_params = request.to_sampling_params(
                        max_tokens,
                        self.model_config.logits_processor_pattern,
                        default_sampling_params,
                    )
                    validate_logits_processors_parameters(
                        self.logits_processors,
                        sampling_params,
                    )

                # Each prompt of the batch gets its own sub-request id.
                request_id_item = f"{request_id}-{i}"

                self._log_inputs(
                    request_id_item,
                    engine_prompt,
                    params=sampling_params,
                    lora_request=lora_request,
                )

                # Mypy inconsistently requires this second cast in different
                # environments. It shouldn't be necessary (redundant from above)
                # but pre-commit in CI fails without it.
                engine_prompt = cast(EmbedsPrompt | TokensPrompt, engine_prompt)
                if isinstance(sampling_params, BeamSearchParams):
                    generator = self.beam_search(
                        prompt=engine_prompt,
                        request_id=request_id,
                        params=sampling_params,
                        lora_request=lora_request,
                        trace_headers=trace_headers,
                    )
                else:
                    engine_request, tokenization_kwargs = await self._process_inputs(
                        request_id_item,
                        engine_prompt,
                        sampling_params,
                        lora_request=lora_request,
                        trace_headers=trace_headers,
                        priority=request.priority,
                        data_parallel_rank=data_parallel_rank,
                    )

                    generator = self.engine_client.generate(
                        engine_request,
                        sampling_params,
                        request_id_item,
                        lora_request=lora_request,
                        trace_headers=trace_headers,
                        priority=request.priority,
                        prompt_text=prompt_text,
                        tokenization_kwargs=tokenization_kwargs,
                        data_parallel_rank=data_parallel_rank,
                    )

                generators.append(generator)
        except ValueError as e:
            return self.create_error_response(e)

        # Yields (prompt index, RequestOutput) pairs from all generators.
        result_generator = merge_async_iterators(*generators)

        model_name = self.models.model_name(lora_request)
        num_prompts = len(engine_prompts)

        # We do not stream the results when using beam search.
        stream = request.stream and not request.use_beam_search

        tokenizer = self.renderer.tokenizer

        # Streaming response
        if stream:
            return self.completion_stream_generator(
                request,
                engine_prompts,
                result_generator,
                request_id,
                created_time,
                model_name,
                num_prompts=num_prompts,
                tokenizer=tokenizer,
                request_metadata=request_metadata,
            )

        # Non-streaming response
        final_res_batch: list[RequestOutput | None] = [None] * num_prompts
        try:
            # Keep only the latest output seen for each prompt index.
            async for i, res in result_generator:
                final_res_batch[i] = res

            for i, final_res in enumerate(final_res_batch):
                assert final_res is not None

                # The output should contain the input text
                # We did not pass it into vLLM engine to avoid being redundant
                # with the inputs token IDs
                if final_res.prompt is None:
                    engine_prompt = engine_prompts[i]
                    final_res.prompt = (
                        None
                        if is_embeds_prompt(engine_prompt)
                        else engine_prompt.get("prompt")
                    )

            final_res_batch_checked = cast(list[RequestOutput], final_res_batch)

            response = self.request_output_to_completion_response(
                final_res_batch_checked,
                request,
                request_id,
                created_time,
                model_name,
                tokenizer,
                request_metadata,
            )
        except asyncio.CancelledError:
            return self.create_error_response("Client disconnected")
        except GenerationError as e:
            return self._convert_generation_error_to_response(e)
        except ValueError as e:
            return self.create_error_response(e)

        # When user requests streaming but we don't stream, we still need to
        # return a streaming response with a single event.
        if request.stream:
            response_json = response.model_dump_json()

            async def fake_stream_generator() -> AsyncGenerator[str, None]:
                yield f"data: {response_json}\n\n"
                yield "data: [DONE]\n\n"

            return fake_stream_generator()

        return response
332
333
334
335

    async def completion_stream_generator(
        self,
        request: CompletionRequest,
        engine_prompts: list[TokensPrompt | EmbedsPrompt],
        result_generator: AsyncIterator[tuple[int, RequestOutput]],
        request_id: str,
        created_time: int,
        model_name: str,
        num_prompts: int,
        tokenizer: TokenizerLike | None,
        request_metadata: RequestResponseMetadata,
    ) -> AsyncGenerator[str, None]:
        """Stream completion results as server-sent-event strings.

        Every engine output is turned into one `data: <json>` event per
        choice. An optional usage-only event follows, and the stream always
        ends with `data: [DONE]`. Errors raised while streaming are
        reported as a `data:` event instead of propagating.

        Args:
            request: The completion request being served.
            engine_prompts: The rendered prompts, indexed by prompt index;
                used to recover the prompt text when an output lacks it.
            result_generator: Yields `(prompt index, RequestOutput)` pairs.
            request_id: Identifier placed on every emitted chunk.
            created_time: Creation timestamp placed on every emitted chunk.
            model_name: Model name placed on every emitted chunk.
            num_prompts: Number of prompts in the request.
            tokenizer: Tokenizer passed on to the logprobs builder.
            request_metadata: Receives the aggregated usage information once
                the stream completes.

        Yields:
            Server-sent-event formatted strings.
        """
        num_choices = 1 if request.n is None else request.n
        # Per-choice state, flattened as `choice + prompt_idx * num_choices`.
        previous_text_lens = [0] * num_choices * num_prompts
        previous_num_tokens = [0] * num_choices * num_prompts
        has_echoed = [False] * num_choices * num_prompts
        num_prompt_tokens = [0] * num_prompts
        num_cached_tokens = None
        first_iteration = True

        stream_options = request.stream_options
        include_usage, include_continuous_usage = should_include_usage(
            stream_options, self.enable_force_include_usage
        )

        try:
            async for prompt_idx, res in result_generator:
                prompt_token_ids = res.prompt_token_ids
                prompt_logprobs = res.prompt_logprobs

                # The cached-token count is taken from the first output only.
                if first_iteration:
                    num_cached_tokens = res.num_cached_tokens
                    first_iteration = False

                prompt_text = res.prompt
                if prompt_text is None:
                    engine_prompt = engine_prompts[prompt_idx]
                    prompt_text = (
                        None
                        if is_embeds_prompt(engine_prompt)
                        else engine_prompt.get("prompt")
                    )

                # Prompt details are excluded from later streamed outputs
                if prompt_token_ids is not None:
                    num_prompt_tokens[prompt_idx] = len(prompt_token_ids)

                delta_token_ids: GenericSequence[int]
                out_logprobs: GenericSequence[dict[int, Logprob] | None] | None

                for output in res.outputs:
                    i = output.index + prompt_idx * num_choices

                    # Useful when request.return_token_ids is True
                    # Returning prompt token IDs shares the same logic
                    # with the echo implementation.
                    prompt_token_ids_to_return: list[int] | None = None

                    assert request.max_tokens is not None
                    if request.echo and not has_echoed[i]:
                        assert prompt_token_ids is not None
                        if request.return_token_ids:
                            prompt_text = ""
                        assert prompt_text is not None
                        if request.max_tokens == 0:
                            # only return the prompt
                            delta_text = prompt_text
                            delta_token_ids = prompt_token_ids
                            out_logprobs = prompt_logprobs
                        else:
                            # echo the prompt and first token
                            delta_text = prompt_text + output.text
                            delta_token_ids = [
                                *prompt_token_ids,
                                *output.token_ids,
                            ]
                            out_logprobs = [
                                *(prompt_logprobs or []),
                                *(output.logprobs or []),
                            ]
                        prompt_token_ids_to_return = prompt_token_ids
                        has_echoed[i] = True
                    else:
                        # return just the delta
                        delta_text = output.text
                        delta_token_ids = output.token_ids
                        out_logprobs = output.logprobs

                        # has_echoed[i] is reused here to indicate whether
                        # we have already returned the prompt token IDs.
                        if not has_echoed[i] and request.return_token_ids:
                            prompt_token_ids_to_return = prompt_token_ids
                            has_echoed[i] = True

                        if (
                            not delta_text
                            and not delta_token_ids
                            and not previous_num_tokens[i]
                        ):
                            # Chunked prefill case, don't return empty chunks
                            continue

                    if request.logprobs is not None:
                        assert out_logprobs is not None, "Did not output logprobs"
                        logprobs = self._create_completion_logprobs(
                            token_ids=delta_token_ids,
                            top_logprobs=out_logprobs,
                            num_output_top_logprobs=request.logprobs,
                            tokenizer=tokenizer,
                            initial_text_offset=previous_text_lens[i],
                            return_as_token_id=request.return_tokens_as_token_ids,
                        )
                    else:
                        logprobs = None

                    # Advance the text offset by what was actually emitted.
                    # On the echoed chunk that includes the prompt text, so
                    # later chunks' text offsets continue after the echoed
                    # prompt instead of overlapping it. For every other
                    # chunk delta_text is exactly output.text.
                    previous_text_lens[i] += len(delta_text)
                    # Token accounting covers generated tokens only; the
                    # prompt is tracked separately in num_prompt_tokens.
                    previous_num_tokens[i] += len(output.token_ids)
                    finish_reason = output.finish_reason
                    stop_reason = output.stop_reason

                    self._raise_if_error(finish_reason, request_id)

                    chunk = CompletionStreamResponse(
                        id=request_id,
                        created=created_time,
                        model=model_name,
                        choices=[
                            CompletionResponseStreamChoice(
                                index=i,
                                text=delta_text,
                                logprobs=logprobs,
                                finish_reason=finish_reason,
                                stop_reason=stop_reason,
                                prompt_token_ids=prompt_token_ids_to_return,
                                token_ids=(
                                    as_list(output.token_ids)
                                    if request.return_token_ids
                                    else None
                                ),
                            )
                        ],
                    )
                    if include_continuous_usage:
                        prompt_tokens = num_prompt_tokens[prompt_idx]
                        completion_tokens = previous_num_tokens[i]
                        chunk.usage = UsageInfo(
                            prompt_tokens=prompt_tokens,
                            completion_tokens=completion_tokens,
                            total_tokens=prompt_tokens + completion_tokens,
                        )

                    response_json = chunk.model_dump_json(exclude_unset=False)
                    yield f"data: {response_json}\n\n"

            total_prompt_tokens = sum(num_prompt_tokens)
            total_completion_tokens = sum(previous_num_tokens)
            final_usage_info = UsageInfo(
                prompt_tokens=total_prompt_tokens,
                completion_tokens=total_completion_tokens,
                total_tokens=total_prompt_tokens + total_completion_tokens,
            )

            if self.enable_prompt_tokens_details and num_cached_tokens:
                final_usage_info.prompt_tokens_details = PromptTokenUsageInfo(
                    cached_tokens=num_cached_tokens
                )

            if include_usage:
                # Usage-only event: no choices, just the aggregated totals.
                final_usage_chunk = CompletionStreamResponse(
                    id=request_id,
                    created=created_time,
                    model=model_name,
                    choices=[],
                    usage=final_usage_info,
                )
                final_usage_data = final_usage_chunk.model_dump_json(
                    exclude_unset=False, exclude_none=True
                )
                yield f"data: {final_usage_data}\n\n"

            # report to FastAPI middleware aggregate usage across all choices
            request_metadata.final_usage_info = final_usage_info

        except GenerationError as e:
            yield f"data: {self._convert_generation_error_to_streaming_response(e)}\n\n"
        except Exception as e:
            # Stream boundary: log and report the failure to the client as an
            # event rather than letting it propagate.
            logger.exception("Error in completion stream generator.")
            data = self.create_streaming_error_response(e)
            yield f"data: {data}\n\n"
        yield "data: [DONE]\n\n"

    def request_output_to_completion_response(
        self,
        final_res_batch: list[RequestOutput],
        request: CompletionRequest,
        request_id: str,
        created_time: int,
        model_name: str,
        tokenizer: TokenizerLike | None,
        request_metadata: RequestResponseMetadata,
    ) -> CompletionResponse:
        """Assemble the non-streaming completion response.

        Builds one choice per generated output across all prompts, totals
        the token usage, and records that usage on `request_metadata`.

        Args:
            final_res_batch: Final engine outputs, one per prompt.
            request: The completion request being served.
            request_id: Identifier placed on the response.
            created_time: Creation timestamp placed on the response.
            model_name: Model name placed on the response.
            tokenizer: Tokenizer passed on to the logprobs builder.
            request_metadata: Receives the aggregated usage information.

        Returns:
            The assembled CompletionResponse.
        """
        choices: list[CompletionResponseChoice] = []
        prompt_token_total = 0
        generated_token_total = 0

        for final_res in final_res_batch:
            prompt_token_ids = final_res.prompt_token_ids
            assert prompt_token_ids is not None
            prompt_logprobs = clamp_prompt_logprobs(final_res.prompt_logprobs)
            prompt_text = final_res.prompt

            token_ids: GenericSequence[int]
            out_logprobs: GenericSequence[dict[int, Logprob] | None] | None

            for output in final_res.outputs:
                self._raise_if_error(output.finish_reason, request_id)

                assert request.max_tokens is not None
                if not request.echo:
                    # Plain completion: only the generated part is returned.
                    token_ids = output.token_ids
                    out_logprobs = output.logprobs
                    output_text = output.text
                else:
                    # When token ids are returned the prompt text is echoed
                    # as an empty string.
                    if request.return_token_ids:
                        prompt_text = ""
                    assert prompt_text is not None

                    if request.max_tokens == 0:
                        # Nothing was generated: return the prompt alone.
                        token_ids = prompt_token_ids
                        out_logprobs = prompt_logprobs
                        output_text = prompt_text
                    else:
                        # Prompt followed by the generated continuation.
                        token_ids = [*prompt_token_ids, *output.token_ids]

                        if request.logprobs is not None:
                            assert prompt_logprobs is not None
                            assert output.logprobs is not None
                            out_logprobs = [
                                *prompt_logprobs,
                                *output.logprobs,
                            ]
                        else:
                            out_logprobs = None

                        output_text = prompt_text + output.text

                logprobs = None
                if request.logprobs is not None:
                    assert out_logprobs is not None, "Did not output logprobs"
                    logprobs = self._create_completion_logprobs(
                        token_ids=token_ids,
                        top_logprobs=out_logprobs,
                        tokenizer=tokenizer,
                        num_output_top_logprobs=request.logprobs,
                        return_as_token_id=request.return_tokens_as_token_ids,
                    )

                wants_token_ids = request.return_token_ids
                choices.append(
                    CompletionResponseChoice(
                        # Choices are numbered in the order they are built.
                        index=len(choices),
                        text=output_text,
                        logprobs=logprobs,
                        finish_reason=output.finish_reason,
                        stop_reason=output.stop_reason,
                        prompt_logprobs=final_res.prompt_logprobs,
                        prompt_token_ids=(
                            prompt_token_ids if wants_token_ids else None
                        ),
                        token_ids=(
                            as_list(output.token_ids) if wants_token_ids else None
                        ),
                    )
                )

                generated_token_total += len(output.token_ids)

            prompt_token_total += len(prompt_token_ids)

        usage = UsageInfo(
            prompt_tokens=prompt_token_total,
            completion_tokens=generated_token_total,
            total_tokens=prompt_token_total + generated_token_total,
        )

        # Cached-token details come from the last output of the batch.
        last_final_res = final_res_batch[-1] if final_res_batch else None
        if (
            self.enable_prompt_tokens_details
            and last_final_res
            and last_final_res.num_cached_tokens
        ):
            usage.prompt_tokens_details = PromptTokenUsageInfo(
                cached_tokens=last_final_res.num_cached_tokens
            )

        request_metadata.final_usage_info = usage

        # KV-transfer parameters are taken from the first output, if any.
        kv_transfer_params = (
            final_res_batch[0].kv_transfer_params if final_res_batch else None
        )
        return CompletionResponse(
            id=request_id,
            created=created_time,
            model=model_name,
            choices=choices,
            usage=usage,
            kv_transfer_params=kv_transfer_params,
        )
638
639
640
641

    def _create_completion_logprobs(
        self,
        token_ids: GenericSequence[int],
        top_logprobs: GenericSequence[dict[int, Logprob] | None],
        num_output_top_logprobs: int,
        tokenizer: TokenizerLike | None,
        initial_text_offset: int = 0,
        return_as_token_id: bool | None = None,
    ) -> CompletionLogProbs:
        """Create logprobs for OpenAI Completion API.

        Args:
            token_ids: Token ids to report on.
            top_logprobs: Per-position logprob dictionaries, indexed in step
                with `token_ids`; an entry may be None.
            num_output_top_logprobs: Requested number of top logprobs; up to
                this many plus one entries are reported per position.
            tokenizer: Used to decode a token whose position has no logprob
                entry; also passed to the token-decoding helper.
            initial_text_offset: Text offset assigned to the first token.
            return_as_token_id: Overrides the instance-level
                `return_tokens_as_token_ids` setting when not None.

        Returns:
            A CompletionLogProbs holding tokens, their logprobs, the top
            logprobs and the text offsets.

        Raises:
            VLLMValidationError: If a token has to be decoded but no
                tokenizer is available.
        """
        offsets: list[int] = []
        token_logprobs: list[float | None] = []
        tokens: list[str] = []
        top_entries: list[dict[str, float] | None] = []

        if return_as_token_id is not None:
            as_token_id = return_as_token_id
        else:
            as_token_id = self.return_tokens_as_token_ids

        # Offset of the token about to be recorded; advanced by the length
        # of each token's string form.
        next_offset = initial_text_offset

        for position, token_id in enumerate(token_ids):
            step_top_logprobs = top_logprobs[position]

            if step_top_logprobs is None:
                # No logprob information for this position: emit the token
                # with empty logprob slots.
                if as_token_id:
                    token = f"token_id:{token_id}"
                elif tokenizer is None:
                    raise VLLMValidationError(
                        "Unable to get tokenizer because "
                        "`skip_tokenizer_init=True`",
                        parameter="skip_tokenizer_init",
                        value=True,
                    )
                else:
                    token = tokenizer.decode(token_id)

                tokens.append(token)
                token_logprobs.append(None)
                top_entries.append(None)
            else:
                chosen = step_top_logprobs[token_id]
                token = self._get_decoded_token(
                    chosen,
                    token_id,
                    tokenizer,
                    return_as_token_id=as_token_id,
                )

                tokens.append(token)
                # Convert float("-inf") to the JSON-serializable float that
                # OpenAI uses.
                token_logprobs.append(max(chosen.logprob, -9999.0))

                # makes sure to add the top num_output_top_logprobs + 1
                # logprobs, as defined in the openai API
                # (cf. https://github.com/openai/openai-openapi/blob/
                # 893ba52242dbd5387a97b96444ee1c742cfce9bd/openapi.yaml#L7153)
                step_entries: dict[str, float] = {}
                for rank, (cand_id, cand_lp) in enumerate(step_top_logprobs.items()):
                    if rank > num_output_top_logprobs:
                        # Ranks only grow, so nothing later can qualify.
                        break
                    decoded = self._get_decoded_token(
                        cand_lp,
                        cand_id,
                        tokenizer,
                        return_as_token_id=as_token_id,
                    )
                    step_entries[decoded] = max(cand_lp.logprob, -9999.0)
                top_entries.append(step_entries)

            offsets.append(next_offset)
            next_offset += len(token)

        return CompletionLogProbs(
            text_offset=offsets,
            token_logprobs=token_logprobs,
            tokens=tokens,
            top_logprobs=top_entries,
        )
725
726
727
728

    def _build_render_config(
        self,
        request: CompletionRequest,
        max_input_length: int | None = None,
    ) -> RenderConfig:
        """Build the prompt-rendering configuration for a request.

        Args:
            request: The completion request being rendered.
            max_input_length: Accepted but not read by this implementation.

        Returns:
            A RenderConfig whose `max_length` is the model's maximum length
            minus the requested `max_tokens` (treated as 0 when unset).

        Raises:
            VLLMValidationError: If `max_tokens` exceeds the model's maximum
                length.
        """
        requested_tokens = request.max_tokens
        model_limit = self.max_model_len

        # Reject an oversized max_tokens before it is used in the subtraction.
        if requested_tokens is not None and requested_tokens > model_limit:
            raise VLLMValidationError(
                f"'max_tokens' ({request.max_tokens}) cannot be greater than "
                f"the model's maximum context length ({self.max_model_len}).",
                parameter="max_tokens",
                value=requested_tokens,
            )

        # Detokenization is only requested when the prompt is echoed as text.
        wants_prompt_text = bool(request.echo and not request.return_token_ids)

        return RenderConfig(
            max_length=model_limit - (requested_tokens or 0),
            truncate_prompt_tokens=request.truncate_prompt_tokens,
            add_special_tokens=request.add_special_tokens,
            cache_salt=request.cache_salt,
            needs_detokenization=wants_prompt_text,
        )