# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import asyncio
import time
from collections.abc import AsyncGenerator, AsyncIterator
from collections.abc import Sequence as GenericSequence
from typing import cast

import jinja2
from fastapi import Request

from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.completion.protocol import (
    CompletionLogProbs,
    CompletionRequest,
    CompletionResponse,
    CompletionResponseChoice,
    CompletionResponseStreamChoice,
    CompletionStreamResponse,
)
from vllm.entrypoints.openai.engine.protocol import (
    ErrorResponse,
    PromptTokenUsageInfo,
    RequestResponseMetadata,
    UsageInfo,
)
from vllm.entrypoints.openai.engine.serving import (
    GenerationError,
    OpenAIServing,
    clamp_prompt_logprobs,
)
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.renderer import RenderConfig
from vllm.entrypoints.utils import get_max_tokens, should_include_usage
from vllm.exceptions import VLLMValidationError
from vllm.inputs.data import EmbedsPrompt, TokensPrompt, is_embeds_prompt
from vllm.logger import init_logger
from vllm.logprobs import Logprob
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams, SamplingParams
from vllm.tokenizers import TokenizerLike
from vllm.utils.async_utils import merge_async_iterators
from vllm.utils.collection_utils import as_list
from vllm.v1.sample.logits_processor import validate_logits_processors_parameters

logger = init_logger(__name__)


class OpenAIServingCompletion(OpenAIServing):
    # Handler for the OpenAI-style Completion API; see create_completion.
    def __init__(
        self,
        engine_client: EngineClient,
        models: OpenAIServingModels,
        *,
        request_logger: RequestLogger | None,
        return_tokens_as_token_ids: bool = False,
        enable_prompt_tokens_details: bool = False,
        enable_force_include_usage: bool = False,
        log_error_stack: bool = False,
    ):
        """Initialise the completion handler.

        Args:
            engine_client: Forwarded to ``OpenAIServing.__init__``.
            models: Forwarded to ``OpenAIServing.__init__``.
            request_logger: Forwarded to ``OpenAIServing.__init__``;
                may be ``None``.
            return_tokens_as_token_ids: Forwarded to
                ``OpenAIServing.__init__``.
            enable_prompt_tokens_details: When true, usage info built by this
                class is given a ``prompt_tokens_details`` entry whenever a
                non-zero cached-token count is available.
            enable_force_include_usage: Passed to ``should_include_usage``
                when deciding whether streamed responses carry usage data.
            log_error_stack: Forwarded to ``OpenAIServing.__init__``.
        """
        super().__init__(
            engine_client=engine_client,
            models=models,
            request_logger=request_logger,
            return_tokens_as_token_ids=return_tokens_as_token_ids,
            log_error_stack=log_error_stack,
        )

        # set up logits processors
        self.logits_processors = self.model_config.logits_processors

        self.enable_prompt_tokens_details = enable_prompt_tokens_details
        self.enable_force_include_usage = enable_force_include_usage

        # May be None; create_completion replaces None with {} before use.
        self.default_sampling_params = self.model_config.get_diff_sampling_param()

    async def render_completion_request(
        self,
        request: CompletionRequest,
    ) -> list[TokensPrompt | EmbedsPrompt] | ErrorResponse:
        """Render a completion request by validating and preprocessing inputs.

        The request is rejected with an ``ErrorResponse`` when the model check
        fails, when ``suffix`` is set, or when ``echo`` / ``prompt_logprobs``
        is combined with ``prompt_embeds``. Otherwise the prompt (and/or
        prompt embeddings) is handed to the completion renderer.

        Args:
            request: The incoming completion request.

        Returns:
            A list of engine_prompts on success,
            or an ErrorResponse on failure.

        Raises:
            The engine client's ``dead_error`` if the engine client reports
            that it has errored.
        """
        error_check_ret = await self._check_model(request)
        if error_check_ret is not None:
            return error_check_ret

        # If the engine is dead, raise the engine's DEAD_ERROR.
        # This is required for the streaming case, where we return a
        # success status before we actually start generating text :).
        if self.engine_client.errored:
            raise self.engine_client.dead_error

        # Return error for unsupported features.
        if request.suffix is not None:
            return self.create_error_response("suffix is not currently supported")

        if request.echo and request.prompt_embeds is not None:
            return self.create_error_response("Echo is unsupported with prompt embeds.")

        if request.prompt_logprobs is not None and request.prompt_embeds is not None:
            return self.create_error_response(
                "prompt_logprobs is not compatible with prompt embeds."
            )

        try:
            renderer = self._get_completion_renderer()
            engine_prompts = await renderer.render_prompt_and_embeds(
                prompt_or_prompts=request.prompt,
                prompt_embeds=request.prompt_embeds,
                config=self._build_render_config(request),
            )
        except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e:
            # Rendering problems are reported to the client, not raised.
            logger.exception("Error in preprocessing prompt inputs")
            return self.create_error_response(e)

        return engine_prompts

    async def create_completion(
        self,
        request: CompletionRequest,
        raw_request: Request | None = None,
    ) -> AsyncGenerator[str, None] | CompletionResponse | ErrorResponse:
        """Completion API similar to OpenAI's API.

        See https://platform.openai.com/docs/api-reference/completions/create
        for the API specification. This API mimics the OpenAI Completion API.

        NOTE: Currently we do not support the following feature:
            - suffix (the language models we currently support do not support
            suffix)

        Args:
            request: The completion request to serve.
            raw_request: The underlying FastAPI request, if any. When given,
                request metadata is stored on ``raw_request.state`` and its
                headers are used to obtain trace headers.

        Returns:
            An ``ErrorResponse`` if rendering, adapter lookup, scheduling or
            generation fails; an async generator of server-sent-event strings
            when ``request.stream`` is set; otherwise a ``CompletionResponse``.
        """
        result = await self.render_completion_request(request)
        if isinstance(result, ErrorResponse):
            return result

        engine_prompts = result

        request_id = f"cmpl-{self._base_request_id(raw_request, request.request_id)}"
        created_time = int(time.time())

        request_metadata = RequestResponseMetadata(request_id=request_id)
        if raw_request:
            raw_request.state.request_metadata = request_metadata

        try:
            lora_request = self._maybe_get_adapters(request)
        except (ValueError, TypeError, RuntimeError) as e:
            logger.exception("Error preparing request components")
            return self.create_error_response(e)

        # Extract data_parallel_rank from header (router can inject it)
        data_parallel_rank = self._get_data_parallel_rank(raw_request)

        # Schedule the request and get the result generator.
        generators: list[AsyncGenerator[RequestOutput, None]] = []
        try:
            # Loop-invariant: normalise the defaults once per request instead
            # of re-checking for every prompt.
            if self.default_sampling_params is None:
                self.default_sampling_params = {}

            for i, engine_prompt in enumerate(engine_prompts):
                prompt_text, prompt_token_ids, prompt_embeds = (
                    self._get_prompt_components(engine_prompt)
                )

                input_length = None
                if prompt_token_ids is not None:
                    input_length = len(prompt_token_ids)
                elif prompt_embeds is not None:
                    input_length = len(prompt_embeds)
                else:
                    raise NotImplementedError

                max_tokens = get_max_tokens(
                    max_model_len=self.max_model_len,
                    request=request,
                    input_length=input_length,
                    default_sampling_params=self.default_sampling_params,
                )

                sampling_params: SamplingParams | BeamSearchParams
                if request.use_beam_search:
                    sampling_params = request.to_beam_search_params(
                        max_tokens, self.default_sampling_params
                    )
                else:
                    sampling_params = request.to_sampling_params(
                        max_tokens,
                        self.model_config.logits_processor_pattern,
                        self.default_sampling_params,
                    )
                    validate_logits_processors_parameters(
                        self.logits_processors,
                        sampling_params,
                    )

                # Each prompt in the batch gets its own sub-request id.
                request_id_item = f"{request_id}-{i}"

                self._log_inputs(
                    request_id_item,
                    engine_prompt,
                    params=sampling_params,
                    lora_request=lora_request,
                )

                trace_headers = (
                    None
                    if raw_request is None
                    else await self._get_trace_headers(raw_request.headers)
                )

                # Mypy inconsistently requires this second cast in different
                # environments. It shouldn't be necessary (redundant from above)
                # but pre-commit in CI fails without it.
                engine_prompt = cast(EmbedsPrompt | TokensPrompt, engine_prompt)
                if isinstance(sampling_params, BeamSearchParams):
                    # NOTE(review): this passes the shared request_id, not
                    # request_id_item, so every prompt in a batch uses the
                    # same id for beam search -- confirm this is intended.
                    generator = self.beam_search(
                        prompt=engine_prompt,
                        request_id=request_id,
                        params=sampling_params,
                        lora_request=lora_request,
                        trace_headers=trace_headers,
                    )
                else:
                    engine_request, tokenization_kwargs = await self._process_inputs(
                        request_id_item,
                        engine_prompt,
                        sampling_params,
                        lora_request=lora_request,
                        trace_headers=trace_headers,
                        priority=request.priority,
                        data_parallel_rank=data_parallel_rank,
                    )

                    generator = self.engine_client.generate(
                        engine_request,
                        sampling_params,
                        request_id_item,
                        lora_request=lora_request,
                        trace_headers=trace_headers,
                        priority=request.priority,
                        prompt_text=prompt_text,
                        tokenization_kwargs=tokenization_kwargs,
                        data_parallel_rank=data_parallel_rank,
                    )

                generators.append(generator)
        except ValueError as e:
            return self.create_error_response(e)

        # Yields (prompt index, RequestOutput) pairs from all generators.
        result_generator = merge_async_iterators(*generators)

        model_name = self.models.model_name(lora_request)
        num_prompts = len(engine_prompts)

        # We do not stream the results when using beam search.
        stream = request.stream and not request.use_beam_search

        # Streaming response
        tokenizer = self.renderer.tokenizer

        if stream:
            return self.completion_stream_generator(
                request,
                engine_prompts,
                result_generator,
                request_id,
                created_time,
                model_name,
                num_prompts=num_prompts,
                tokenizer=tokenizer,
                request_metadata=request_metadata,
            )

        # Non-streaming response
        final_res_batch: list[RequestOutput | None] = [None] * num_prompts
        try:
            # Keep only the most recent output seen for each prompt index.
            async for i, res in result_generator:
                final_res_batch[i] = res

            for i, final_res in enumerate(final_res_batch):
                assert final_res is not None

                # The output should contain the input text
                # We did not pass it into vLLM engine to avoid being redundant
                # with the inputs token IDs
                if final_res.prompt is None:
                    engine_prompt = engine_prompts[i]
                    final_res.prompt = (
                        None
                        if is_embeds_prompt(engine_prompt)
                        else engine_prompt.get("prompt")
                    )

            final_res_batch_checked = cast(list[RequestOutput], final_res_batch)

            response = self.request_output_to_completion_response(
                final_res_batch_checked,
                request,
                request_id,
                created_time,
                model_name,
                tokenizer,
                request_metadata,
            )
        except asyncio.CancelledError:
            return self.create_error_response("Client disconnected")
        except GenerationError as e:
            return self._convert_generation_error_to_response(e)
        except ValueError as e:
            return self.create_error_response(e)

        # When user requests streaming but we don't stream, we still need to
        # return a streaming response with a single event.
        if request.stream:
            response_json = response.model_dump_json()

            async def fake_stream_generator() -> AsyncGenerator[str, None]:
                yield f"data: {response_json}\n\n"
                yield "data: [DONE]\n\n"

            return fake_stream_generator()

        return response

    async def completion_stream_generator(
        self,
        request: CompletionRequest,
        engine_prompts: list[TokensPrompt | EmbedsPrompt],
        result_generator: AsyncIterator[tuple[int, RequestOutput]],
        request_id: str,
        created_time: int,
        model_name: str,
        num_prompts: int,
        tokenizer: TokenizerLike | None,
        request_metadata: RequestResponseMetadata,
    ) -> AsyncGenerator[str, None]:
        """Stream completion chunks as ``data: ...`` event strings.

        One ``CompletionStreamResponse`` chunk is yielded per emitted output
        of every ``RequestOutput`` received, then (if usage is requested) a
        final usage-only chunk, and always a terminating ``data: [DONE]``
        event. Errors raised while streaming are converted into an error
        event instead of propagating.

        Args:
            request: The originating completion request.
            engine_prompts: Prompts in request order; used to recover the
                prompt text when a ``RequestOutput`` does not carry it.
            result_generator: Async iterator of ``(prompt index, output)``.
            request_id: Id placed on every chunk.
            created_time: Creation timestamp placed on every chunk.
            model_name: Model name placed on every chunk.
            num_prompts: Number of prompts in the request.
            tokenizer: Tokenizer passed through to logprob construction.
            request_metadata: Receives the aggregate usage info at the end.

        Yields:
            Strings of the form ``"data: <json>\\n\\n"``, ending with
            ``"data: [DONE]\\n\\n"``.
        """
        num_choices = 1 if request.n is None else request.n
        # Per-choice state, flattened as prompt_idx * num_choices + output.index.
        previous_text_lens = [0] * num_choices * num_prompts
        previous_num_tokens = [0] * num_choices * num_prompts
        has_echoed = [False] * num_choices * num_prompts
        num_prompt_tokens = [0] * num_prompts
        num_cached_tokens = None
        first_iteration = True

        stream_options = request.stream_options
        include_usage, include_continuous_usage = should_include_usage(
            stream_options, self.enable_force_include_usage
        )

        try:
            async for prompt_idx, res in result_generator:
                prompt_token_ids = res.prompt_token_ids
                prompt_logprobs = res.prompt_logprobs

                # The cached-token count comes from the first output received.
                if first_iteration:
                    num_cached_tokens = res.num_cached_tokens
                    first_iteration = False

                prompt_text = res.prompt
                if prompt_text is None:
                    engine_prompt = engine_prompts[prompt_idx]
                    prompt_text = (
                        None
                        if is_embeds_prompt(engine_prompt)
                        else engine_prompt.get("prompt")
                    )

                # Prompt details are excluded from later streamed outputs
                if prompt_token_ids is not None:
                    num_prompt_tokens[prompt_idx] = len(prompt_token_ids)

                delta_token_ids: GenericSequence[int]
                out_logprobs: GenericSequence[dict[int, Logprob] | None] | None

                for output in res.outputs:
                    i = output.index + prompt_idx * num_choices

                    # Useful when request.return_token_ids is True
                    # Returning prompt token IDs shares the same logic
                    # with the echo implementation.
                    prompt_token_ids_to_return: list[int] | None = None

                    assert request.max_tokens is not None
                    if request.echo and not has_echoed[i]:
                        assert prompt_token_ids is not None
                        if request.return_token_ids:
                            prompt_text = ""
                        assert prompt_text is not None
                        if request.max_tokens == 0:
                            # only return the prompt
                            delta_text = prompt_text
                            delta_token_ids = prompt_token_ids
                            out_logprobs = prompt_logprobs
                        else:
                            # echo the prompt and first token
                            delta_text = prompt_text + output.text
                            delta_token_ids = [
                                *prompt_token_ids,
                                *output.token_ids,
                            ]
                            out_logprobs = [
                                *(prompt_logprobs or []),
                                *(output.logprobs or []),
                            ]
                        prompt_token_ids_to_return = prompt_token_ids
                        has_echoed[i] = True
                    else:
                        # return just the delta
                        delta_text = output.text
                        delta_token_ids = output.token_ids
                        out_logprobs = output.logprobs

                        # has_echoed[i] is reused here to indicate whether
                        # we have already returned the prompt token IDs.
                        if not has_echoed[i] and request.return_token_ids:
                            prompt_token_ids_to_return = prompt_token_ids
                            has_echoed[i] = True

                        if (
                            not delta_text
                            and not delta_token_ids
                            and not previous_num_tokens[i]
                        ):
                            # Chunked prefill case, don't return empty chunks
                            continue

                    if request.logprobs is not None:
                        assert out_logprobs is not None, "Did not output logprobs"
                        logprobs = self._create_completion_logprobs(
                            token_ids=delta_token_ids,
                            top_logprobs=out_logprobs,
                            num_output_top_logprobs=request.logprobs,
                            tokenizer=tokenizer,
                            initial_text_offset=previous_text_lens[i],
                            return_as_token_id=request.return_tokens_as_token_ids,
                        )
                    else:
                        logprobs = None

                    previous_text_lens[i] += len(output.text)
                    previous_num_tokens[i] += len(output.token_ids)
                    finish_reason = output.finish_reason
                    stop_reason = output.stop_reason

                    self._raise_if_error(finish_reason, request_id)

                    chunk = CompletionStreamResponse(
                        id=request_id,
                        created=created_time,
                        model=model_name,
                        choices=[
                            CompletionResponseStreamChoice(
                                index=i,
                                text=delta_text,
                                logprobs=logprobs,
                                finish_reason=finish_reason,
                                stop_reason=stop_reason,
                                prompt_token_ids=prompt_token_ids_to_return,
                                token_ids=(
                                    as_list(output.token_ids)
                                    if request.return_token_ids
                                    else None
                                ),
                            )
                        ],
                    )
                    if include_continuous_usage:
                        # Running usage for this choice only.
                        prompt_tokens = num_prompt_tokens[prompt_idx]
                        completion_tokens = previous_num_tokens[i]
                        chunk.usage = UsageInfo(
                            prompt_tokens=prompt_tokens,
                            completion_tokens=completion_tokens,
                            total_tokens=prompt_tokens + completion_tokens,
                        )

                    response_json = chunk.model_dump_json(exclude_unset=False)
                    yield f"data: {response_json}\n\n"

            # Aggregate usage over every prompt and choice.
            total_prompt_tokens = sum(num_prompt_tokens)
            total_completion_tokens = sum(previous_num_tokens)
            final_usage_info = UsageInfo(
                prompt_tokens=total_prompt_tokens,
                completion_tokens=total_completion_tokens,
                total_tokens=total_prompt_tokens + total_completion_tokens,
            )

            if self.enable_prompt_tokens_details and num_cached_tokens:
                final_usage_info.prompt_tokens_details = PromptTokenUsageInfo(
                    cached_tokens=num_cached_tokens
                )

            if include_usage:
                final_usage_chunk = CompletionStreamResponse(
                    id=request_id,
                    created=created_time,
                    model=model_name,
                    choices=[],
                    usage=final_usage_info,
                )
                final_usage_data = final_usage_chunk.model_dump_json(
                    exclude_unset=False, exclude_none=True
                )
                yield f"data: {final_usage_data}\n\n"

            # report to FastAPI middleware aggregate usage across all choices
            request_metadata.final_usage_info = final_usage_info

        except GenerationError as e:
            yield f"data: {self._convert_generation_error_to_streaming_response(e)}\n\n"
        except Exception as e:
            logger.exception("Error in completion stream generator.")
            data = self.create_streaming_error_response(e)
            yield f"data: {data}\n\n"
        # The terminator is sent on both the success and the error path.
        yield "data: [DONE]\n\n"

    def request_output_to_completion_response(
        self,
        final_res_batch: list[RequestOutput],
        request: CompletionRequest,
        request_id: str,
        created_time: int,
        model_name: str,
        tokenizer: TokenizerLike | None,
        request_metadata: RequestResponseMetadata,
    ) -> CompletionResponse:
        """Build the non-streaming ``CompletionResponse`` for a batch.

        Every output of every ``RequestOutput`` becomes one choice, indexed
        in encounter order. When ``request.echo`` is set the prompt text,
        token ids and logprobs are prepended to the generated ones (or
        returned alone when ``request.max_tokens == 0``).

        Args:
            final_res_batch: Final outputs, one per prompt.
            request: The originating completion request.
            request_id: Id placed on the response.
            created_time: Creation timestamp placed on the response.
            model_name: Model name placed on the response.
            tokenizer: Tokenizer passed through to logprob construction.
            request_metadata: Receives the computed usage info.

        Returns:
            The assembled ``CompletionResponse`` including usage totals.
        """
        choices: list[CompletionResponseChoice] = []
        num_prompt_tokens = 0
        num_generated_tokens = 0
        kv_transfer_params = None
        last_final_res = None
        for final_res in final_res_batch:
            last_final_res = final_res
            prompt_token_ids = final_res.prompt_token_ids
            assert prompt_token_ids is not None
            prompt_logprobs = clamp_prompt_logprobs(final_res.prompt_logprobs)
            prompt_text = final_res.prompt

            token_ids: GenericSequence[int]
            out_logprobs: GenericSequence[dict[int, Logprob] | None] | None

            for output in final_res.outputs:
                self._raise_if_error(output.finish_reason, request_id)

                assert request.max_tokens is not None
                if request.echo:
                    if request.return_token_ids:
                        prompt_text = ""
                    assert prompt_text is not None
                    if request.max_tokens == 0:
                        # Echo only: the choice is the prompt itself.
                        token_ids = prompt_token_ids
                        out_logprobs = prompt_logprobs
                        output_text = prompt_text
                    else:
                        token_ids = [*prompt_token_ids, *output.token_ids]

                        if request.logprobs is None:
                            out_logprobs = None
                        else:
                            assert prompt_logprobs is not None
                            assert output.logprobs is not None
                            out_logprobs = [
                                *prompt_logprobs,
                                *output.logprobs,
                            ]

                        output_text = prompt_text + output.text
                else:
                    token_ids = output.token_ids
                    out_logprobs = output.logprobs
                    output_text = output.text

                if request.logprobs is not None:
                    assert out_logprobs is not None, "Did not output logprobs"
                    logprobs = self._create_completion_logprobs(
                        token_ids=token_ids,
                        top_logprobs=out_logprobs,
                        tokenizer=tokenizer,
                        num_output_top_logprobs=request.logprobs,
                        return_as_token_id=request.return_tokens_as_token_ids,
                    )
                else:
                    logprobs = None

                # NOTE(review): prompt_logprobs below is the raw
                # final_res.prompt_logprobs, not the clamped local value;
                # equivalent only if clamp_prompt_logprobs works in place --
                # confirm.
                choice_data = CompletionResponseChoice(
                    index=len(choices),
                    text=output_text,
                    logprobs=logprobs,
                    finish_reason=output.finish_reason,
                    stop_reason=output.stop_reason,
                    prompt_logprobs=final_res.prompt_logprobs,
                    prompt_token_ids=(
                        prompt_token_ids if request.return_token_ids else None
                    ),
                    token_ids=(
                        as_list(output.token_ids) if request.return_token_ids else None
                    ),
                )
                choices.append(choice_data)

                num_generated_tokens += len(output.token_ids)

            # Prompt tokens are counted once per prompt, not once per choice.
            num_prompt_tokens += len(prompt_token_ids)

        usage = UsageInfo(
            prompt_tokens=num_prompt_tokens,
            completion_tokens=num_generated_tokens,
            total_tokens=num_prompt_tokens + num_generated_tokens,
        )

        # Cached-token details come from the last result in the batch.
        if (
            self.enable_prompt_tokens_details
            and last_final_res
            and last_final_res.num_cached_tokens
        ):
            usage.prompt_tokens_details = PromptTokenUsageInfo(
                cached_tokens=last_final_res.num_cached_tokens
            )

        request_metadata.final_usage_info = usage
        # kv_transfer_params is taken from the first result only.
        if final_res_batch:
            kv_transfer_params = final_res_batch[0].kv_transfer_params
        return CompletionResponse(
            id=request_id,
            created=created_time,
            model=model_name,
            choices=choices,
            usage=usage,
            kv_transfer_params=kv_transfer_params,
        )

    def _create_completion_logprobs(
        self,
        token_ids: GenericSequence[int],
        top_logprobs: GenericSequence[dict[int, Logprob] | None],
        num_output_top_logprobs: int,
        tokenizer: TokenizerLike | None,
        initial_text_offset: int = 0,
        return_as_token_id: bool | None = None,
    ) -> CompletionLogProbs:
        """Create logprobs for OpenAI Completion API.

        Args:
            token_ids: Token ids to report, one entry per position.
            top_logprobs: Per-position mapping of token id to ``Logprob``,
                indexed in step with ``token_ids``; an entry may be ``None``
                when no logprobs exist for that position.
            num_output_top_logprobs: Number of top alternatives requested;
                up to ``num_output_top_logprobs + 1`` entries are emitted
                per position.
            tokenizer: Used to decode a token whose logprob entry is
                ``None``; also passed to ``_get_decoded_token``.
            initial_text_offset: Text offset assigned to the first token.
            return_as_token_id: Overrides ``self.return_tokens_as_token_ids``
                when not ``None``.

        Returns:
            A ``CompletionLogProbs`` holding parallel lists of text offsets,
            token logprobs, token strings and top-logprob dictionaries.

        Raises:
            VLLMValidationError: If a token must be decoded but ``tokenizer``
                is ``None``.
        """
        out_text_offset: list[int] = []
        out_token_logprobs: list[float | None] = []
        out_tokens: list[str] = []
        out_top_logprobs: list[dict[str, float] | None] = []

        last_token_len = 0

        should_return_as_token_id = (
            return_as_token_id
            if return_as_token_id is not None
            else self.return_tokens_as_token_ids
        )
        for i, token_id in enumerate(token_ids):
            step_top_logprobs = top_logprobs[i]
            if step_top_logprobs is None:
                # No logprob data for this position: emit the token alone.
                if should_return_as_token_id:
                    token = f"token_id:{token_id}"
                else:
                    if tokenizer is None:
                        raise VLLMValidationError(
                            "Unable to get tokenizer because "
                            "`skip_tokenizer_init=True`",
                            parameter="skip_tokenizer_init",
                            value=True,
                        )

                    token = tokenizer.decode(token_id)

                out_tokens.append(token)
                out_token_logprobs.append(None)
                out_top_logprobs.append(None)
            else:
                step_token = step_top_logprobs[token_id]

                token = self._get_decoded_token(
                    step_token,
                    token_id,
                    tokenizer,
                    return_as_token_id=should_return_as_token_id,
                )
                # Clamp so float("-inf") stays JSON-serializable.
                token_logprob = max(step_token.logprob, -9999.0)

                out_tokens.append(token)
                out_token_logprobs.append(token_logprob)

                # makes sure to add the top num_output_top_logprobs + 1
                # logprobs, as defined in the openai API
                # (cf. https://github.com/openai/openai-openapi/blob/
                # 893ba52242dbd5387a97b96444ee1c742cfce9bd/openapi.yaml#L7153)
                out_top_logprobs.append(
                    {
                        # Convert float("-inf") to the
                        # JSON-serializable float that OpenAI uses
                        self._get_decoded_token(
                            top_lp,
                            top_token_id,
                            tokenizer,
                            return_as_token_id=should_return_as_token_id,
                        ): max(top_lp.logprob, -9999.0)
                        for rank, (top_token_id, top_lp) in enumerate(
                            step_top_logprobs.items()
                        )
                        if num_output_top_logprobs >= rank
                    }
                )

            # Each offset is the previous offset plus the previous token's
            # string length; the first one starts at initial_text_offset.
            if not out_text_offset:
                out_text_offset.append(initial_text_offset)
            else:
                out_text_offset.append(out_text_offset[-1] + last_token_len)
            last_token_len = len(token)

        return CompletionLogProbs(
            text_offset=out_text_offset,
            token_logprobs=out_token_logprobs,
            tokens=out_tokens,
            top_logprobs=out_top_logprobs,
        )

    def _build_render_config(
        self,
        request: CompletionRequest,
        max_input_length: int | None = None,
    ) -> RenderConfig:
        """Build the ``RenderConfig`` used to render a request's prompt.

        The maximum input length is the model's context length minus the
        requested ``max_tokens`` (treated as 0 when unset).

        Args:
            request: The completion request being rendered.
            max_input_length: Currently not read by this method.
                NOTE(review): confirm whether it should override the
                computed maximum input length or be dropped.

        Returns:
            A ``RenderConfig`` carrying the input-length limit plus the
            request's truncation, special-token and cache-salt settings.
            Detokenization is requested only when the request echoes the
            prompt without ``return_token_ids``.

        Raises:
            VLLMValidationError: If ``request.max_tokens`` exceeds the
                model's maximum context length.
        """
        # Validate max_tokens before using it
        if request.max_tokens is not None and request.max_tokens > self.max_model_len:
            raise VLLMValidationError(
                f"'max_tokens' ({request.max_tokens}) cannot be greater than "
                f"the model's maximum context length ({self.max_model_len}).",
                parameter="max_tokens",
                value=request.max_tokens,
            )

        max_input_tokens_len = self.max_model_len - (request.max_tokens or 0)
        return RenderConfig(
            max_length=max_input_tokens_len,
            truncate_prompt_tokens=request.truncate_prompt_tokens,
            add_special_tokens=request.add_special_tokens,
            cache_salt=request.cache_salt,
            needs_detokenization=bool(request.echo and not request.return_token_ids),
        )