# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import asyncio
import time
from collections.abc import AsyncGenerator, AsyncIterator
from collections.abc import Sequence as GenericSequence
from itertools import islice
from typing import cast

import jinja2
from fastapi import Request

from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.completion.protocol import (
    CompletionLogProbs,
    CompletionRequest,
    CompletionResponse,
    CompletionResponseChoice,
    CompletionResponseStreamChoice,
    CompletionStreamResponse,
)
from vllm.entrypoints.openai.engine.protocol import (
    ErrorResponse,
    PromptTokenUsageInfo,
    RequestResponseMetadata,
    UsageInfo,
)
from vllm.entrypoints.openai.engine.serving import (
    GenerationError,
    OpenAIServing,
    clamp_prompt_logprobs,
)
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.renderer import RenderConfig
from vllm.entrypoints.utils import get_max_tokens, should_include_usage
from vllm.exceptions import VLLMValidationError
from vllm.inputs.data import EmbedsPrompt, TokensPrompt, is_embeds_prompt
from vllm.logger import init_logger
from vllm.logprobs import Logprob
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams, SamplingParams
from vllm.tokenizers import TokenizerLike
from vllm.utils.async_utils import merge_async_iterators
from vllm.utils.collection_utils import as_list
from vllm.v1.sample.logits_processor import validate_logits_processors_parameters

# Module-level logger, created by init_logger under this module's name.
logger = init_logger(__name__)


class OpenAIServingCompletion(OpenAIServing):
52
53
    def __init__(
        self,
        engine_client: EngineClient,
        models: OpenAIServingModels,
        *,
        request_logger: RequestLogger | None,
        return_tokens_as_token_ids: bool = False,
        enable_prompt_tokens_details: bool = False,
        enable_force_include_usage: bool = False,
        log_error_stack: bool = False,
    ) -> None:
        """Initialise the completion handler.

        Args:
            engine_client: Forwarded to the base class.
            models: Forwarded to the base class.
            request_logger: Forwarded to the base class; may be None.
            return_tokens_as_token_ids: Forwarded to the base class.
            enable_prompt_tokens_details: When true, usage info is
                annotated with the number of cached prompt tokens (if the
                engine reports a non-zero count).
            enable_force_include_usage: Passed to `should_include_usage`
                when deciding whether streamed responses carry usage.
            log_error_stack: Forwarded to the base class.
        """
        super().__init__(
            engine_client=engine_client,
            models=models,
            request_logger=request_logger,
            return_tokens_as_token_ids=return_tokens_as_token_ids,
            log_error_stack=log_error_stack,
        )

        # set up logits processors
        self.logits_processors = self.model_config.logits_processors

        self.enable_prompt_tokens_details = enable_prompt_tokens_details
        self.default_sampling_params = self.model_config.get_diff_sampling_param()
        self.enable_force_include_usage = enable_force_include_usage
        if self.default_sampling_params:
            # Report where the non-empty defaults came from; the "auto"
            # generation config is logged as "model".
            source = self.model_config.generation_config
            source = "model" if source == "auto" else source
            logger.info(
                "Using default completion sampling params from %s: %s",
                source,
                self.default_sampling_params,
            )

86
    async def render_completion_request(
        self,
        request: CompletionRequest,
    ) -> list[TokensPrompt | EmbedsPrompt] | ErrorResponse:
        """
        Render a completion request by validating and preprocessing inputs.

        Args:
            request: The completion request to validate and render.

        Returns:
            A list of engine_prompts on success,
            or an ErrorResponse on failure.

        Raises:
            The engine client's `dead_error` when the engine client reports
            that it has errored.
        """
        error_check_ret = await self._check_model(request)
        if error_check_ret is not None:
            return error_check_ret

        # If the engine is dead, raise the engine's DEAD_ERROR.
        # This is required for the streaming case, where we return a
        # success status before we actually start generating text :).
        if self.engine_client.errored:
            raise self.engine_client.dead_error

        # Return error for unsupported features.
        if request.suffix is not None:
            return self.create_error_response("suffix is not currently supported")

        if request.echo and request.prompt_embeds is not None:
            return self.create_error_response("Echo is unsupported with prompt embeds.")

        if request.prompt_logprobs is not None and request.prompt_embeds is not None:
            return self.create_error_response(
                "prompt_logprobs is not compatible with prompt embeds."
            )

        try:
            if self.model_config.skip_tokenizer_init:
                tokenizer = None
            else:
                tokenizer = await self.engine_client.get_tokenizer()
            renderer = self._get_renderer(tokenizer)

            engine_prompts = await renderer.render_prompt_and_embeds(
                prompt_or_prompts=request.prompt,
                prompt_embeds=request.prompt_embeds,
                config=self._build_render_config(request),
            )
        except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e:
            logger.exception("Error in preprocessing prompt inputs")
            return self.create_error_response(e)

        return engine_prompts

    async def create_completion(
        self,
        request: CompletionRequest,
        raw_request: Request | None = None,
    ) -> AsyncGenerator[str, None] | CompletionResponse | ErrorResponse:
        """Completion API similar to OpenAI's API.

        See https://platform.openai.com/docs/api-reference/completions/create
        for the API specification. This API mimics the OpenAI Completion API.

        NOTE: Currently we do not support the following feature:
            - suffix (the language models we currently support do not support
            suffix)

        Args:
            request: The completion request.
            raw_request: The underlying FastAPI request, if any. When given,
                the request metadata is attached to `raw_request.state` and
                its headers are used for trace-header extraction.

        Returns:
            An ErrorResponse on failure; an async generator of
            server-sent-event strings when `request.stream` is set;
            otherwise a CompletionResponse.
        """
        result = await self.render_completion_request(request)
        if isinstance(result, ErrorResponse):
            return result

        engine_prompts = result

        request_id = f"cmpl-{self._base_request_id(raw_request, request.request_id)}"
        created_time = int(time.time())

        request_metadata = RequestResponseMetadata(request_id=request_id)
        if raw_request:
            raw_request.state.request_metadata = request_metadata

        try:
            lora_request = self._maybe_get_adapters(request)

            if self.model_config.skip_tokenizer_init:
                tokenizer = None
            else:
                tokenizer = await self.engine_client.get_tokenizer()
        except (ValueError, TypeError, RuntimeError) as e:
            logger.exception("Error preparing request components")
            return self.create_error_response(e)

        # Extract data_parallel_rank from header (router can inject it)
        data_parallel_rank = self._get_data_parallel_rank(raw_request)

        # Schedule the request and get the result generator.
        generators: list[AsyncGenerator[RequestOutput, None]] = []
        try:
            # Neither of the next two values depends on the individual
            # prompt, so they are computed once rather than per iteration.
            if self.default_sampling_params is None:
                self.default_sampling_params = {}

            trace_headers = (
                None
                if raw_request is None
                else await self._get_trace_headers(raw_request.headers)
            )

            for i, engine_prompt in enumerate(engine_prompts):
                prompt_text, prompt_token_ids, prompt_embeds = (
                    self._get_prompt_components(engine_prompt)
                )

                input_length = None
                if prompt_token_ids is not None:
                    input_length = len(prompt_token_ids)
                elif prompt_embeds is not None:
                    input_length = len(prompt_embeds)
                else:
                    raise NotImplementedError

                max_tokens = get_max_tokens(
                    max_model_len=self.max_model_len,
                    request=request,
                    input_length=input_length,
                    default_sampling_params=self.default_sampling_params,
                )

                sampling_params: SamplingParams | BeamSearchParams
                if request.use_beam_search:
                    sampling_params = request.to_beam_search_params(
                        max_tokens, self.default_sampling_params
                    )
                else:
                    sampling_params = request.to_sampling_params(
                        max_tokens,
                        self.model_config.logits_processor_pattern,
                        self.default_sampling_params,
                    )
                    validate_logits_processors_parameters(
                        self.logits_processors,
                        sampling_params,
                    )

                # Each prompt of the request gets its own sub-request ID.
                request_id_item = f"{request_id}-{i}"

                self._log_inputs(
                    request_id_item,
                    engine_prompt,
                    params=sampling_params,
                    lora_request=lora_request,
                )

                # Mypy inconsistently requires this second cast in different
                # environments. It shouldn't be necessary (redundant from above)
                # but pre-commit in CI fails without it.
                engine_prompt = cast(EmbedsPrompt | TokensPrompt, engine_prompt)
                if isinstance(sampling_params, BeamSearchParams):
                    generator = self.beam_search(
                        prompt=engine_prompt,
                        request_id=request_id,
                        params=sampling_params,
                        lora_request=lora_request,
                        trace_headers=trace_headers,
                    )
                else:
                    engine_request, tokenization_kwargs = await self._process_inputs(
                        request_id_item,
                        engine_prompt,
                        sampling_params,
                        lora_request=lora_request,
                        trace_headers=trace_headers,
                        priority=request.priority,
                        data_parallel_rank=data_parallel_rank,
                    )

                    generator = self.engine_client.generate(
                        engine_request,
                        sampling_params,
                        request_id_item,
                        lora_request=lora_request,
                        trace_headers=trace_headers,
                        priority=request.priority,
                        prompt_text=prompt_text,
                        tokenization_kwargs=tokenization_kwargs,
                        data_parallel_rank=data_parallel_rank,
                    )

                generators.append(generator)
        except ValueError as e:
            return self.create_error_response(e)

        # Yields (prompt index, output) pairs from all per-prompt generators.
        result_generator = merge_async_iterators(*generators)

        model_name = self.models.model_name(lora_request)
        num_prompts = len(engine_prompts)

        # We do not stream the results when using beam search.
        stream = request.stream and not request.use_beam_search

        # Streaming response
        if stream:
            return self.completion_stream_generator(
                request,
                engine_prompts,
                result_generator,
                request_id,
                created_time,
                model_name,
                num_prompts=num_prompts,
                tokenizer=tokenizer,
                request_metadata=request_metadata,
            )

        # Non-streaming response
        final_res_batch: list[RequestOutput | None] = [None] * num_prompts
        try:
            # Keep only the latest output seen for each prompt index.
            async for i, res in result_generator:
                final_res_batch[i] = res

            for i, final_res in enumerate(final_res_batch):
                assert final_res is not None

                # The output should contain the input text
                # We did not pass it into vLLM engine to avoid being redundant
                # with the inputs token IDs
                if final_res.prompt is None:
                    engine_prompt = engine_prompts[i]
                    final_res.prompt = (
                        None
                        if is_embeds_prompt(engine_prompt)
                        else engine_prompt.get("prompt")
                    )

            final_res_batch_checked = cast(list[RequestOutput], final_res_batch)

            response = self.request_output_to_completion_response(
                final_res_batch_checked,
                request,
                request_id,
                created_time,
                model_name,
                tokenizer,
                request_metadata,
            )
        except asyncio.CancelledError:
            return self.create_error_response("Client disconnected")
        except GenerationError as e:
            return self._convert_generation_error_to_response(e)
        except ValueError as e:
            return self.create_error_response(e)

        # When user requests streaming but we don't stream, we still need to
        # return a streaming response with a single event.
        if request.stream:
            response_json = response.model_dump_json()

            async def fake_stream_generator() -> AsyncGenerator[str, None]:
                yield f"data: {response_json}\n\n"
                yield "data: [DONE]\n\n"

            return fake_stream_generator()

        return response

    async def completion_stream_generator(
        self,
        request: CompletionRequest,
        engine_prompts: list[TokensPrompt | EmbedsPrompt],
        result_generator: AsyncIterator[tuple[int, RequestOutput]],
        request_id: str,
        created_time: int,
        model_name: str,
        num_prompts: int,
        tokenizer: TokenizerLike | None,
        request_metadata: RequestResponseMetadata,
    ) -> AsyncGenerator[str, None]:
        """Turn engine outputs into a stream of server-sent-event strings.

        One `data: <json>` event is yielded per (prompt, choice) delta,
        optionally followed by a usage-only event, and always terminated by
        `data: [DONE]`. Errors raised while streaming are reported as a
        `data:` event instead of propagating.

        Args:
            request: The completion request being served.
            engine_prompts: The rendered prompts; used to recover the prompt
                text when an output does not carry it.
            result_generator: Yields `(prompt_idx, output)` pairs.
            request_id: ID placed on every chunk.
            created_time: Creation timestamp placed on every chunk.
            model_name: Model name placed on every chunk.
            num_prompts: Number of prompts in the request.
            tokenizer: Tokenizer used when building logprobs; may be None.
            request_metadata: Receives the aggregate usage once the stream
                has completed without error.

        Yields:
            Server-sent-event formatted strings.
        """
        num_choices = 1 if request.n is None else request.n
        # Per-choice state, indexed by `output.index + prompt_idx * num_choices`.
        previous_text_lens = [0] * num_choices * num_prompts
        previous_num_tokens = [0] * num_choices * num_prompts
        has_echoed = [False] * num_choices * num_prompts
        num_prompt_tokens = [0] * num_prompts
        num_cached_tokens = None
        first_iteration = True

        stream_options = request.stream_options
        include_usage, include_continuous_usage = should_include_usage(
            stream_options, self.enable_force_include_usage
        )

        try:
            async for prompt_idx, res in result_generator:
                prompt_token_ids = res.prompt_token_ids
                prompt_logprobs = res.prompt_logprobs

                # Only the very first output's cached-token count is kept.
                if first_iteration:
                    num_cached_tokens = res.num_cached_tokens
                    first_iteration = False

                prompt_text = res.prompt
                if prompt_text is None:
                    engine_prompt = engine_prompts[prompt_idx]
                    prompt_text = (
                        None
                        if is_embeds_prompt(engine_prompt)
                        else engine_prompt.get("prompt")
                    )

                # Prompt details are excluded from later streamed outputs
                if prompt_token_ids is not None:
                    num_prompt_tokens[prompt_idx] = len(prompt_token_ids)

                delta_token_ids: GenericSequence[int]
                out_logprobs: GenericSequence[dict[int, Logprob] | None] | None

                for output in res.outputs:
                    i = output.index + prompt_idx * num_choices

                    # Useful when request.return_token_ids is True
                    # Returning prompt token IDs shares the same logic
                    # with the echo implementation.
                    prompt_token_ids_to_return: list[int] | None = None

                    assert request.max_tokens is not None
                    if request.echo and not has_echoed[i]:
                        assert prompt_token_ids is not None
                        if request.return_token_ids:
                            prompt_text = ""
                        assert prompt_text is not None
                        if request.max_tokens == 0:
                            # only return the prompt
                            delta_text = prompt_text
                            delta_token_ids = prompt_token_ids
                            out_logprobs = prompt_logprobs
                        else:
                            # echo the prompt and first token
                            delta_text = prompt_text + output.text
                            delta_token_ids = [
                                *prompt_token_ids,
                                *output.token_ids,
                            ]
                            out_logprobs = [
                                *(prompt_logprobs or []),
                                *(output.logprobs or []),
                            ]
                        prompt_token_ids_to_return = prompt_token_ids
                        has_echoed[i] = True
                    else:
                        # return just the delta
                        delta_text = output.text
                        delta_token_ids = output.token_ids
                        out_logprobs = output.logprobs

                        # has_echoed[i] is reused here to indicate whether
                        # we have already returned the prompt token IDs.
                        if not has_echoed[i] and request.return_token_ids:
                            prompt_token_ids_to_return = prompt_token_ids
                            has_echoed[i] = True

                        if (
                            not delta_text
                            and not delta_token_ids
                            and not previous_num_tokens[i]
                        ):
                            # Chunked prefill case, don't return empty chunks
                            continue

                    if request.logprobs is not None:
                        assert out_logprobs is not None, "Did not output logprobs"
                        logprobs = self._create_completion_logprobs(
                            token_ids=delta_token_ids,
                            top_logprobs=out_logprobs,
                            num_output_top_logprobs=request.logprobs,
                            tokenizer=tokenizer,
                            initial_text_offset=previous_text_lens[i],
                            return_as_token_id=request.return_tokens_as_token_ids,
                        )
                    else:
                        logprobs = None

                    # Advance by the text actually emitted in this chunk.
                    # On the echo chunk that includes the prompt text, so
                    # text offsets in later chunks continue after it; in
                    # every other case delta_text is exactly output.text.
                    previous_text_lens[i] += len(delta_text)
                    previous_num_tokens[i] += len(output.token_ids)
                    finish_reason = output.finish_reason
                    stop_reason = output.stop_reason

                    self._raise_if_error(finish_reason, request_id)

                    chunk = CompletionStreamResponse(
                        id=request_id,
                        created=created_time,
                        model=model_name,
                        choices=[
                            CompletionResponseStreamChoice(
                                index=i,
                                text=delta_text,
                                logprobs=logprobs,
                                finish_reason=finish_reason,
                                stop_reason=stop_reason,
                                prompt_token_ids=prompt_token_ids_to_return,
                                token_ids=(
                                    as_list(output.token_ids)
                                    if request.return_token_ids
                                    else None
                                ),
                            )
                        ],
                    )
                    if include_continuous_usage:
                        prompt_tokens = num_prompt_tokens[prompt_idx]
                        completion_tokens = previous_num_tokens[i]
                        chunk.usage = UsageInfo(
                            prompt_tokens=prompt_tokens,
                            completion_tokens=completion_tokens,
                            total_tokens=prompt_tokens + completion_tokens,
                        )

                    response_json = chunk.model_dump_json(exclude_unset=False)
                    yield f"data: {response_json}\n\n"

            total_prompt_tokens = sum(num_prompt_tokens)
            total_completion_tokens = sum(previous_num_tokens)
            final_usage_info = UsageInfo(
                prompt_tokens=total_prompt_tokens,
                completion_tokens=total_completion_tokens,
                total_tokens=total_prompt_tokens + total_completion_tokens,
            )

            if self.enable_prompt_tokens_details and num_cached_tokens:
                final_usage_info.prompt_tokens_details = PromptTokenUsageInfo(
                    cached_tokens=num_cached_tokens
                )

            if include_usage:
                final_usage_chunk = CompletionStreamResponse(
                    id=request_id,
                    created=created_time,
                    model=model_name,
                    choices=[],
                    usage=final_usage_info,
                )
                final_usage_data = final_usage_chunk.model_dump_json(
                    exclude_unset=False, exclude_none=True
                )
                yield f"data: {final_usage_data}\n\n"

            # report to FastAPI middleware aggregate usage across all choices
            request_metadata.final_usage_info = final_usage_info

        except GenerationError as e:
            yield f"data: {self._convert_generation_error_to_streaming_response(e)}\n\n"
        except Exception as e:
            logger.exception("Error in completion stream generator.")
            data = self.create_streaming_error_response(e)
            yield f"data: {data}\n\n"
        yield "data: [DONE]\n\n"

    def request_output_to_completion_response(
        self,
        final_res_batch: list[RequestOutput],
        request: CompletionRequest,
        request_id: str,
        created_time: int,
        model_name: str,
        tokenizer: TokenizerLike | None,
        request_metadata: RequestResponseMetadata,
    ) -> CompletionResponse:
        """Assemble the non-streaming response from the final engine outputs.

        Args:
            final_res_batch: One final output per prompt, in prompt order.
            request: The completion request being served.
            request_id: ID placed on the response.
            created_time: Creation timestamp placed on the response.
            model_name: Model name placed on the response.
            tokenizer: Tokenizer used when building logprobs; may be None.
            request_metadata: Receives the aggregate usage info.

        Returns:
            A CompletionResponse with one choice per generated output and
            usage totals summed over all prompts.
        """
        choices: list[CompletionResponseChoice] = []
        num_prompt_tokens = 0
        num_generated_tokens = 0

        for final_res in final_res_batch:
            prompt_token_ids = final_res.prompt_token_ids
            assert prompt_token_ids is not None
            prompt_logprobs = clamp_prompt_logprobs(final_res.prompt_logprobs)
            prompt_text = final_res.prompt

            token_ids: GenericSequence[int]
            out_logprobs: GenericSequence[dict[int, Logprob] | None] | None

            for output in final_res.outputs:
                self._raise_if_error(output.finish_reason, request_id)

                assert request.max_tokens is not None
                if request.echo:
                    if request.return_token_ids:
                        prompt_text = ""
                    assert prompt_text is not None
                    if request.max_tokens == 0:
                        # Echo with nothing generated: the prompt is the output.
                        token_ids = prompt_token_ids
                        out_logprobs = prompt_logprobs
                        output_text = prompt_text
                    else:
                        # Echo the prompt followed by the generated output.
                        token_ids = [*prompt_token_ids, *output.token_ids]

                        if request.logprobs is None:
                            out_logprobs = None
                        else:
                            assert prompt_logprobs is not None
                            assert output.logprobs is not None
                            out_logprobs = [
                                *prompt_logprobs,
                                *output.logprobs,
                            ]

                        output_text = prompt_text + output.text
                else:
                    token_ids = output.token_ids
                    out_logprobs = output.logprobs
                    output_text = output.text

                if request.logprobs is not None:
                    assert out_logprobs is not None, "Did not output logprobs"
                    logprobs = self._create_completion_logprobs(
                        token_ids=token_ids,
                        top_logprobs=out_logprobs,
                        tokenizer=tokenizer,
                        num_output_top_logprobs=request.logprobs,
                        return_as_token_id=request.return_tokens_as_token_ids,
                    )
                else:
                    logprobs = None

                choice_data = CompletionResponseChoice(
                    index=len(choices),
                    text=output_text,
                    logprobs=logprobs,
                    finish_reason=output.finish_reason,
                    stop_reason=output.stop_reason,
                    prompt_logprobs=final_res.prompt_logprobs,
                    prompt_token_ids=(
                        prompt_token_ids if request.return_token_ids else None
                    ),
                    token_ids=(
                        as_list(output.token_ids) if request.return_token_ids else None
                    ),
                )
                choices.append(choice_data)

                num_generated_tokens += len(output.token_ids)

            num_prompt_tokens += len(prompt_token_ids)

        usage = UsageInfo(
            prompt_tokens=num_prompt_tokens,
            completion_tokens=num_generated_tokens,
            total_tokens=num_prompt_tokens + num_generated_tokens,
        )

        # The cached-token count is taken from the last prompt's output.
        last_final_res = final_res_batch[-1] if final_res_batch else None
        if (
            self.enable_prompt_tokens_details
            and last_final_res
            and last_final_res.num_cached_tokens
        ):
            usage.prompt_tokens_details = PromptTokenUsageInfo(
                cached_tokens=last_final_res.num_cached_tokens
            )

        request_metadata.final_usage_info = usage

        # KV-transfer parameters are taken from the first prompt's output.
        kv_transfer_params = (
            final_res_batch[0].kv_transfer_params if final_res_batch else None
        )

        return CompletionResponse(
            id=request_id,
            created=created_time,
            model=model_name,
            choices=choices,
            usage=usage,
            kv_transfer_params=kv_transfer_params,
        )

    def _create_completion_logprobs(
        self,
        token_ids: GenericSequence[int],
        top_logprobs: GenericSequence[dict[int, Logprob] | None],
        num_output_top_logprobs: int,
        tokenizer: TokenizerLike | None,
        initial_text_offset: int = 0,
        return_as_token_id: bool | None = None,
    ) -> CompletionLogProbs:
        """Create logprobs for OpenAI Completion API.

        Args:
            token_ids: Token IDs to report, one entry per position.
            top_logprobs: For each position, a mapping from token ID to its
                Logprob, or None when no logprobs exist for that position.
                Indexed in parallel with `token_ids`.
            num_output_top_logprobs: Number of top logprobs requested; up to
                one more than this many entries are reported per position.
            tokenizer: Used to decode token IDs into text; may be None.
            initial_text_offset: Text offset assigned to the first token.
            return_as_token_id: Whether to render tokens as
                `token_id:<id>` strings. When None, the instance-level
                `return_tokens_as_token_ids` setting is used.

        Returns:
            A CompletionLogProbs with parallel per-token lists.

        Raises:
            VLLMValidationError: If a token without logprobs has to be
                decoded but no tokenizer is available.
        """
        out_text_offset: list[int] = []
        out_token_logprobs: list[float | None] = []
        out_tokens: list[str] = []
        out_top_logprobs: list[dict[str, float] | None] = []

        last_token_len = 0

        should_return_as_token_id = (
            return_as_token_id
            if return_as_token_id is not None
            else self.return_tokens_as_token_ids
        )

        # makes sure to add the top num_output_top_logprobs + 1
        # logprobs, as defined in the openai API
        # (cf. https://github.com/openai/openai-openapi/blob/
        # 893ba52242dbd5387a97b96444ee1c742cfce9bd/openapi.yaml#L7153)
        # Clamped at zero because islice rejects a negative stop.
        max_top_entries = max(num_output_top_logprobs + 1, 0)

        for i, token_id in enumerate(token_ids):
            step_top_logprobs = top_logprobs[i]
            if step_top_logprobs is None:
                if should_return_as_token_id:
                    token = f"token_id:{token_id}"
                else:
                    if tokenizer is None:
                        raise VLLMValidationError(
                            "Unable to get tokenizer because "
                            "`skip_tokenizer_init=True`",
                            parameter="skip_tokenizer_init",
                            value=True,
                        )

                    token = tokenizer.decode(token_id)

                out_tokens.append(token)
                out_token_logprobs.append(None)
                out_top_logprobs.append(None)
            else:
                step_token = step_top_logprobs[token_id]

                token = self._get_decoded_token(
                    step_token,
                    token_id,
                    tokenizer,
                    return_as_token_id=should_return_as_token_id,
                )
                token_logprob = max(step_token.logprob, -9999.0)

                out_tokens.append(token)
                out_token_logprobs.append(token_logprob)

                # Take the leading entries of the mapping in its own
                # iteration order, stopping once enough have been read.
                out_top_logprobs.append(
                    {
                        # Convert float("-inf") to the
                        # JSON-serializable float that OpenAI uses
                        self._get_decoded_token(
                            top_lp,
                            top_token_id,
                            tokenizer,
                            return_as_token_id=should_return_as_token_id,
                        ): max(top_lp.logprob, -9999.0)
                        for top_token_id, top_lp in islice(
                            step_top_logprobs.items(), max_top_entries
                        )
                    }
                )

            # Each offset is the previous offset plus the length of the
            # previous token's string; the first one is the caller's offset.
            if len(out_text_offset) == 0:
                out_text_offset.append(initial_text_offset)
            else:
                out_text_offset.append(out_text_offset[-1] + last_token_len)
            last_token_len = len(token)

        return CompletionLogProbs(
            text_offset=out_text_offset,
            token_logprobs=out_token_logprobs,
            tokens=out_tokens,
            top_logprobs=out_top_logprobs,
        )

    def _build_render_config(
        self,
        request: CompletionRequest,
        max_input_length: int | None = None,
    ) -> RenderConfig:
        """Build the prompt-rendering configuration for a request.

        Args:
            request: The completion request whose settings are used.
            max_input_length: Accepted but not used by this method.

        Returns:
            A RenderConfig whose maximum input length is the model's
            context length minus the requested `max_tokens` (treated as 0
            when unset).

        Raises:
            VLLMValidationError: If `request.max_tokens` exceeds the
                model's maximum context length.
        """
        # Validate max_tokens before using it
        if request.max_tokens is not None and request.max_tokens > self.max_model_len:
            raise VLLMValidationError(
                f"'max_tokens' ({request.max_tokens}) cannot be greater than "
                f"the model's maximum context length ({self.max_model_len}).",
                parameter="max_tokens",
                value=request.max_tokens,
            )

        # Reserve room in the context window for the tokens to be generated.
        max_input_tokens_len = self.max_model_len - (request.max_tokens or 0)
        return RenderConfig(
            max_length=max_input_tokens_len,
            truncate_prompt_tokens=request.truncate_prompt_tokens,
            add_special_tokens=request.add_special_tokens,
            cache_salt=request.cache_salt,
            # Prompt text is only needed back when it is echoed as text.
            needs_detokenization=bool(request.echo and not request.return_token_ids),
        )