Merge tag 'v0.15.0rc1' into v0.15.0rc1-ori

82e40fb7 · zhuwenwen · 30a1922e · 58996f35 · 82e40fb7 · 82e40fb7
Commit 82e40fb7 authored Jan 27, 2026 by zhuwenwen
20 changed files
--- a/vllm/entrypoints/openai/responses/serving.py
+++ b/vllm/entrypoints/openai/responses/serving.py
@@ -63,6 +63,7 @@ from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import (
    ChatCompletionMessageParam,
    ChatTemplateContentFormatOption,
+    make_tool_call_id,
 )
 from vllm.entrypoints.logger import RequestLogger
 from vllm.entrypoints.mcp.tool_server import ToolServer
@@ -115,6 +116,7 @@ from vllm.entrypoints.openai.responses.utils import (
    extract_tool_types,
    should_continue_final_message,
 )
+from vllm.entrypoints.utils import get_max_tokens
 from vllm.exceptions import VLLMValidationError
 from vllm.inputs.data import TokensPrompt
 from vllm.logger import init_logger
@@ -250,6 +252,17 @@ class OpenAIServingResponses(OpenAIServing):
            self.default_sampling_params["stop_token_ids"].extend(
                get_stop_tokens_for_assistant_actions()
            )
+
+        # Handle tool call ID type for Kimi K2 (supporting test mocking via overrides)
+        hf_overrides = getattr(self.model_config, "hf_overrides", None)
+        if self.model_config.hf_text_config.model_type == "kimi_k2" or (
+            isinstance(hf_overrides, dict)
+            and hf_overrides.get("model_type") == "kimi_k2"
+        ):
+            self.tool_call_id_type = "kimi_k2"
+        else:
+            self.tool_call_id_type = "random"
+
        self.enable_auto_tools = enable_auto_tools
        # set up tool use
        self.tool_parser = self._get_tool_parser(
@@ -423,8 +436,11 @@ class OpenAIServingResponses(OpenAIServing):
                if maybe_error is not None:
                    return maybe_error

-                default_max_tokens = self.max_model_len - len(
-                    engine_prompt["prompt_token_ids"]
+                default_max_tokens = get_max_tokens(
+                    self.max_model_len,
+                    request,
+                    engine_prompt,
+                    self.default_sampling_params,
                )

                sampling_params = request.to_sampling_params(
@@ -954,25 +970,28 @@ class OpenAIServingResponses(OpenAIServing):
            enable_auto_tools=self.enable_auto_tools,
            tool_parser_cls=self.tool_parser,
        )
-        if content:
-            output_text = ResponseOutputText(
-                text=content,
-                annotations=[],  # TODO
-                type="output_text",
-                logprobs=(
-                    self._create_response_logprobs(
-                        token_ids=final_output.token_ids,
-                        logprobs=final_output.logprobs,
-                        tokenizer=tokenizer,
-                        top_logprobs=request.top_logprobs,
-                    )
-                    if request.is_include_output_logprobs()
-                    else None
-                ),
-            )
+
+        if content or (self.use_harmony and tool_calls):
+            res_text_part = None
+            if content:
+                res_text_part = ResponseOutputText(
+                    text=content,
+                    annotations=[],  # TODO
+                    type="output_text",
+                    logprobs=(
+                        self._create_response_logprobs(
+                            token_ids=final_output.token_ids,
+                            logprobs=final_output.logprobs,
+                            tokenizer=tokenizer,
+                            top_logprobs=request.top_logprobs,
+                        )
+                        if request.is_include_output_logprobs()
+                        else None
+                    ),
+                )
            message_item = ResponseOutputMessage(
                id=f"msg_{random_uuid()}",
-                content=[output_text],
+                content=[res_text_part] if res_text_part else [],
                role="assistant",
                status="completed",
                type="message",
@@ -984,17 +1003,28 @@ class OpenAIServingResponses(OpenAIServing):
        if message_item:
            outputs.append(message_item)
        if tool_calls:
-            tool_call_items = [
-                ResponseFunctionToolCall(
-                    id=f"fc_{random_uuid()}",
-                    call_id=f"call_{random_uuid()}",
-                    type="function_call",
-                    status="completed",
-                    name=tool_call.name,
-                    arguments=tool_call.arguments,
+            # We use a simple counter for history_tool_call_count because
+            # we don't track the history of tool calls in the Responses API yet.
+            # This means that the tool call index will start from 0 for each
+            # request.
+            tool_call_items = []
+            for history_tool_call_cnt, tool_call in enumerate(tool_calls):
+                tool_call_items.append(
+                    ResponseFunctionToolCall(
+                        id=f"fc_{random_uuid()}",
+                        call_id=tool_call.id
+                        if tool_call.id
+                        else make_tool_call_id(
+                            id_type=self.tool_call_id_type,
+                            func_name=tool_call.name,
+                            idx=history_tool_call_cnt,
+                        ),
+                        type="function_call",
+                        status="completed",
+                        name=tool_call.name,
+                        arguments=tool_call.arguments,
+                    )
                )
-                for tool_call in tool_calls
-            ]
            outputs.extend(tool_call_items)
        return outputs


--- a/vllm/entrypoints/pooling/classify/serving.py
+++ b/vllm/entrypoints/pooling/classify/serving.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 from http import HTTPStatus
-from typing import cast
+from typing import Final, cast

 import jinja2
 import numpy as np
@@ -11,18 +11,8 @@ from fastapi import Request
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
 from vllm.entrypoints.logger import RequestLogger
-from vllm.entrypoints.openai.chat_completion.protocol import (
-    ChatCompletionRequest,
-)
-from vllm.entrypoints.openai.engine.protocol import (
-    ErrorResponse,
-    UsageInfo,
-)
-from vllm.entrypoints.openai.engine.serving import (
-    ClassificationServeContext,
-    OpenAIServing,
-    ServeContext,
-)
+from vllm.entrypoints.openai.engine.protocol import ErrorResponse, UsageInfo
+from vllm.entrypoints.openai.engine.serving import OpenAIServing, ServeContext
 from vllm.entrypoints.openai.models.serving import OpenAIServingModels
 from vllm.entrypoints.pooling.classify.protocol import (
    ClassificationChatRequest,
@@ -39,60 +29,68 @@ from vllm.pooling_params import PoolingParams
 logger = init_logger(__name__)


-class ClassificationMixin(OpenAIServing):
-    chat_template: str | None
-    chat_template_content_format: ChatTemplateContentFormatOption
-    trust_request_chat_template: bool
+ClassificationServeContext = ServeContext[ClassificationRequest]
+
+
+class ServingClassification(OpenAIServing):
+    request_id_prefix = "classify"
+
+    def __init__(
+        self,
+        engine_client: EngineClient,
+        models: OpenAIServingModels,
+        *,
+        request_logger: RequestLogger | None,
+        chat_template: str | None = None,
+        chat_template_content_format: ChatTemplateContentFormatOption = "auto",
+        trust_request_chat_template: bool = False,
+        log_error_stack: bool = False,
+    ) -> None:
+        super().__init__(
+            engine_client=engine_client,
+            models=models,
+            request_logger=request_logger,
+            log_error_stack=log_error_stack,
+        )
+
+        self.chat_template = chat_template
+        self.chat_template_content_format: Final = chat_template_content_format
+        self.trust_request_chat_template = trust_request_chat_template

    async def _preprocess(
        self,
-        ctx: ServeContext,
+        ctx: ClassificationServeContext,
    ) -> ErrorResponse | None:
        """
        Process classification inputs: tokenize text, resolve adapters,
        and prepare model-specific inputs.
        """
-        ctx = cast(ClassificationServeContext, ctx)
        try:
-            request_obj = ctx.request
-
-            if isinstance(request_obj, ClassificationChatRequest):
-                chat_request = request_obj
-                messages = chat_request.messages
-                trust_request_chat_template = getattr(
-                    self,
-                    "trust_request_chat_template",
-                    False,
-                )
-                ret = self._validate_chat_template(
-                    request_chat_template=chat_request.chat_template,
-                    chat_template_kwargs=chat_request.chat_template_kwargs,
-                    trust_request_chat_template=trust_request_chat_template,
+            ctx.lora_request = self._maybe_get_adapters(ctx.request)
+
+            if isinstance(ctx.request, ClassificationChatRequest):
+                error_check_ret = self._validate_chat_template(
+                    request_chat_template=ctx.request.chat_template,
+                    chat_template_kwargs=ctx.request.chat_template_kwargs,
+                    trust_request_chat_template=self.trust_request_chat_template,
                )
-                if ret:
-                    return ret
+                if error_check_ret:
+                    return error_check_ret

                _, engine_prompts = await self._preprocess_chat(
-                    cast(ChatCompletionRequest, chat_request),
+                    ctx.request,
                    self.renderer,
-                    messages,
-                    chat_template=(
-                        chat_request.chat_template
-                        or getattr(self, "chat_template", None)
-                    ),
-                    chat_template_content_format=cast(
-                        ChatTemplateContentFormatOption,
-                        getattr(self, "chat_template_content_format", "auto"),
-                    ),
-                    add_generation_prompt=chat_request.add_generation_prompt,
-                    continue_final_message=chat_request.continue_final_message,
-                    add_special_tokens=chat_request.add_special_tokens,
+                    ctx.request.messages,
+                    chat_template=ctx.request.chat_template or self.chat_template,
+                    chat_template_content_format=self.chat_template_content_format,
+                    add_generation_prompt=ctx.request.add_generation_prompt,
+                    continue_final_message=ctx.request.continue_final_message,
+                    add_special_tokens=ctx.request.add_special_tokens,
                )
                ctx.engine_prompts = engine_prompts

-            elif isinstance(request_obj, ClassificationCompletionRequest):
-                completion_request = request_obj
-                input_data = completion_request.input
+            elif isinstance(ctx.request, ClassificationCompletionRequest):
+                input_data = ctx.request.input
                if input_data in (None, ""):
                    return self.create_error_response(
                        "Input or messages must be provided",
@@ -106,13 +104,10 @@ class ClassificationMixin(OpenAIServing):
                prompt_input = cast(str | list[str], input_data)
                ctx.engine_prompts = await renderer.render_prompt(
                    prompt_or_prompts=prompt_input,
-                    config=self._build_render_config(completion_request),
+                    config=self._build_render_config(ctx.request),
                )
            else:
-                return self.create_error_response(
-                    "Invalid classification request type",
-                    status_code=HTTPStatus.BAD_REQUEST,
-                )
+                return self.create_error_response("Invalid classification request type")

            return None

@@ -122,13 +117,14 @@ class ClassificationMixin(OpenAIServing):

    def _build_response(
        self,
-        ctx: ServeContext,
+        ctx: ClassificationServeContext,
    ) -> ClassificationResponse | ErrorResponse:
        """
        Convert model outputs to a formatted classification response
        with probabilities and labels.
        """
-        ctx = cast(ClassificationServeContext, ctx)
+        id2label = getattr(self.model_config.hf_config, "id2label", {})
+
        items: list[ClassificationData] = []
        num_prompt_tokens = 0

@@ -139,9 +135,7 @@ class ClassificationMixin(OpenAIServing):

            probs = classify_res.probs
            predicted_index = int(np.argmax(probs))
-            label = getattr(self.model_config.hf_config, "id2label", {}).get(
-                predicted_index
-            )
+            label = id2label.get(predicted_index)

            item = ClassificationData(
                index=idx,
@@ -174,32 +168,6 @@ class ClassificationMixin(OpenAIServing):
            add_special_tokens=request.add_special_tokens,
        )

-
-class ServingClassification(ClassificationMixin):
-    request_id_prefix = "classify"
-
-    def __init__(
-        self,
-        engine_client: EngineClient,
-        models: OpenAIServingModels,
-        *,
-        request_logger: RequestLogger | None,
-        chat_template: str | None = None,
-        chat_template_content_format: ChatTemplateContentFormatOption = "auto",
-        trust_request_chat_template: bool = False,
-        log_error_stack: bool = False,
-    ) -> None:
-        super().__init__(
-            engine_client=engine_client,
-            models=models,
-            request_logger=request_logger,
-            log_error_stack=log_error_stack,
-        )
-
-        self.chat_template = chat_template
-        self.chat_template_content_format = chat_template_content_format
-        self.trust_request_chat_template = trust_request_chat_template
-
    async def create_classify(
        self,
        request: ClassificationRequest,
@@ -215,11 +183,11 @@ class ServingClassification(ClassificationMixin):
            request_id=request_id,
        )

-        return await super().handle(ctx)  # type: ignore
+        return await self.handle(ctx)  # type: ignore[return-value]

    def _create_pooling_params(
        self,
-        ctx: ServeContext[ClassificationRequest],
+        ctx: ClassificationServeContext,
    ) -> PoolingParams | ErrorResponse:
        pooling_params = super()._create_pooling_params(ctx)
        if isinstance(pooling_params, ErrorResponse):

--- a/vllm/entrypoints/pooling/embed/serving.py
+++ b/vllm/entrypoints/pooling/embed/serving.py
@@ -6,21 +6,13 @@ from typing import Any, Final, cast

 import torch
 from fastapi import Request
-from fastapi.responses import Response
-from typing_extensions import assert_never, override
+from typing_extensions import assert_never

 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
 from vllm.entrypoints.logger import RequestLogger
-from vllm.entrypoints.openai.engine.protocol import (
-    ErrorResponse,
-    UsageInfo,
-)
-from vllm.entrypoints.openai.engine.serving import (
-    EmbeddingServeContext,
-    OpenAIServing,
-    ServeContext,
-)
+from vllm.entrypoints.openai.engine.protocol import ErrorResponse, UsageInfo
+from vllm.entrypoints.openai.engine.serving import OpenAIServing, ServeContext
 from vllm.entrypoints.openai.models.serving import OpenAIServingModels
 from vllm.entrypoints.pooling.embed.protocol import (
    EmbeddingBytesResponse,
@@ -33,19 +25,11 @@ from vllm.entrypoints.pooling.embed.protocol import (
 from vllm.entrypoints.renderer import RenderConfig
 from vllm.inputs.data import TokensPrompt
 from vllm.logger import init_logger
-from vllm.outputs import (
-    EmbeddingRequestOutput,
-    PoolingOutput,
-    PoolingRequestOutput,
-    RequestOutput,
-)
+from vllm.outputs import PoolingOutput, PoolingRequestOutput
 from vllm.pooling_params import PoolingParams
 from vllm.utils.async_utils import merge_async_iterators
 from vllm.utils.collection_utils import chunk_list
 from vllm.utils.serial_utils import (
-    EmbedDType,
-    EncodingFormat,
-    Endianness,
    encode_pooling_bytes,
    encode_pooling_output,
 )
@@ -53,9 +37,33 @@ from vllm.utils.serial_utils import (
 logger = init_logger(__name__)


-class EmbeddingMixin(OpenAIServing):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
+EmbeddingServeContext = ServeContext[EmbeddingRequest]
+
+
+class OpenAIServingEmbedding(OpenAIServing):
+    request_id_prefix = "embd"
+
+    def __init__(
+        self,
+        engine_client: EngineClient,
+        models: OpenAIServingModels,
+        *,
+        request_logger: RequestLogger | None,
+        chat_template: str | None,
+        chat_template_content_format: ChatTemplateContentFormatOption,
+        trust_request_chat_template: bool = False,
+        log_error_stack: bool = False,
+    ) -> None:
+        super().__init__(
+            engine_client=engine_client,
+            models=models,
+            request_logger=request_logger,
+            log_error_stack=log_error_stack,
+        )
+
+        self.chat_template = chat_template
+        self.chat_template_content_format: Final = chat_template_content_format
+        self.trust_request_chat_template = trust_request_chat_template

        pooler_config = self.model_config.pooler_config

@@ -69,32 +77,41 @@ class EmbeddingMixin(OpenAIServing):
            else None
        )

-    @override
    async def _preprocess(
        self,
-        ctx: ServeContext,
+        ctx: EmbeddingServeContext,
    ) -> ErrorResponse | None:
-        ctx = cast(EmbeddingServeContext, ctx)
        try:
            ctx.lora_request = self._maybe_get_adapters(ctx.request)

            if isinstance(ctx.request, EmbeddingChatRequest):
+                error_check_ret = self._validate_chat_template(
+                    request_chat_template=ctx.request.chat_template,
+                    chat_template_kwargs=ctx.request.chat_template_kwargs,
+                    trust_request_chat_template=self.trust_request_chat_template,
+                )
+                if error_check_ret is not None:
+                    return error_check_ret
+
                _, ctx.engine_prompts = await self._preprocess_chat(
                    ctx.request,
                    self.renderer,
                    ctx.request.messages,
-                    chat_template=ctx.request.chat_template or ctx.chat_template,
-                    chat_template_content_format=ctx.chat_template_content_format,
+                    chat_template=ctx.request.chat_template or self.chat_template,
+                    chat_template_content_format=self.chat_template_content_format,
                    add_generation_prompt=ctx.request.add_generation_prompt,
                    continue_final_message=ctx.request.continue_final_message,
                    add_special_tokens=ctx.request.add_special_tokens,
                )
-            else:
+            elif isinstance(ctx.request, EmbeddingCompletionRequest):
                renderer = self._get_completion_renderer()
                ctx.engine_prompts = await renderer.render_prompt(
                    prompt_or_prompts=ctx.request.input,
                    config=self._build_render_config(ctx.request),
                )
+            else:
+                return self.create_error_response("Invalid classification request type")
+
            return None
        except (ValueError, TypeError) as e:
            logger.exception("Error in preprocessing prompt inputs")
@@ -113,16 +130,15 @@ class EmbeddingMixin(OpenAIServing):
            add_special_tokens=request.add_special_tokens,
        )

-    @override
    def _build_response(
        self,
-        ctx: ServeContext,
-    ) -> EmbeddingResponse | Response | ErrorResponse:
-        final_res_batch_checked = cast(list[PoolingRequestOutput], ctx.final_res_batch)
+        ctx: EmbeddingServeContext,
+    ) -> EmbeddingResponse | EmbeddingBytesResponse | ErrorResponse:
+        final_res_batch_checked = ctx.final_res_batch

-        encoding_format: EncodingFormat = ctx.request.encoding_format
-        embed_dtype: EmbedDType = ctx.request.embed_dtype
-        endianness: Endianness = ctx.request.endianness
+        encoding_format = ctx.request.encoding_format
+        embed_dtype = ctx.request.embed_dtype
+        endianness = ctx.request.endianness

        def encode_float_base64():
            items: list[EmbeddingResponseData] = []
@@ -203,8 +219,8 @@ class EmbeddingMixin(OpenAIServing):
        self,
        ctx: EmbeddingServeContext,
        token_ids: list[int],
-        pooling_params,
-        trace_headers,
+        pooling_params: PoolingParams,
+        trace_headers: Mapping[str, str] | None,
        prompt_idx: int,
    ) -> list[AsyncGenerator[PoolingRequestOutput, None]]:
        """Process a single prompt using chunked processing."""
@@ -246,7 +262,7 @@ class EmbeddingMixin(OpenAIServing):

    def _validate_input(
        self,
-        request,
+        request: object,
        input_ids: list[int],
        input_text: str,
    ) -> TokensPrompt:
@@ -326,7 +342,7 @@ class EmbeddingMixin(OpenAIServing):
        pooling_params: PoolingParams,
        trace_headers: Mapping[str, str] | None,
        prompt_index: int,
-    ) -> AsyncGenerator[RequestOutput | PoolingRequestOutput, None]:
+    ) -> AsyncGenerator[PoolingRequestOutput, None]:
        """Create a generator for a single prompt using standard processing."""
        request_id_item = f"{ctx.request_id}-{prompt_index}"

@@ -347,7 +363,6 @@ class EmbeddingMixin(OpenAIServing):
            priority=getattr(ctx.request, "priority", 0),
        )

-    @override
    async def _prepare_generators(
        self,
        ctx: ServeContext,
@@ -363,9 +378,7 @@ class EmbeddingMixin(OpenAIServing):
            return await super()._prepare_generators(ctx)

        # Custom logic for chunked processing
-        generators: list[
-            AsyncGenerator[RequestOutput | PoolingRequestOutput, None]
-        ] = []
+        generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []

        try:
            trace_headers = (
@@ -419,10 +432,9 @@ class EmbeddingMixin(OpenAIServing):
            # TODO: Use a vllm-specific Validation Error
            return self.create_error_response(str(e))

-    @override
    async def _collect_batch(
        self,
-        ctx: ServeContext,
+        ctx: EmbeddingServeContext,
    ) -> ErrorResponse | None:
        """Collect and aggregate batch results
        with support for chunked processing.
@@ -431,7 +443,6 @@ class EmbeddingMixin(OpenAIServing):
        minimize memory usage.
        For regular requests, collects results normally.
        """
-        ctx = cast(EmbeddingServeContext, ctx)
        try:
            if ctx.engine_prompts is None:
                return self.create_error_response("Engine prompts not available")
@@ -527,12 +538,10 @@ class EmbeddingMixin(OpenAIServing):
                    except (ValueError, IndexError):
                        prompt_idx = result_idx  # Fallback to result_idx

-                    short_prompts_results[prompt_idx] = cast(
-                        PoolingRequestOutput, result
-                    )
+                    short_prompts_results[prompt_idx] = result

            # Finalize aggregated results
-            final_res_batch: list[PoolingRequestOutput | EmbeddingRequestOutput] = []
+            final_res_batch: list[PoolingRequestOutput] = []
            num_prompts = len(ctx.engine_prompts)

            for prompt_idx in range(num_prompts):
@@ -580,49 +589,19 @@ class EmbeddingMixin(OpenAIServing):
                            f"Failed to aggregate chunks for prompt {prompt_idx}"
                        )
                elif prompt_idx in short_prompts_results:
-                    final_res_batch.append(
-                        cast(PoolingRequestOutput, short_prompts_results[prompt_idx])
-                    )
+                    final_res_batch.append(short_prompts_results[prompt_idx])
                else:
                    return self.create_error_response(
                        f"Result not found for prompt {prompt_idx}"
                    )

-            ctx.final_res_batch = cast(
-                list[RequestOutput | PoolingRequestOutput], final_res_batch
-            )
+            ctx.final_res_batch = final_res_batch

            return None

        except Exception as e:
            return self.create_error_response(str(e))

-
-class OpenAIServingEmbedding(EmbeddingMixin):
-    request_id_prefix = "embd"
-
-    def __init__(
-        self,
-        engine_client: EngineClient,
-        models: OpenAIServingModels,
-        *,
-        request_logger: RequestLogger | None,
-        chat_template: str | None,
-        chat_template_content_format: ChatTemplateContentFormatOption,
-        trust_request_chat_template: bool = False,
-        log_error_stack: bool = False,
-    ) -> None:
-        super().__init__(
-            engine_client=engine_client,
-            models=models,
-            request_logger=request_logger,
-            log_error_stack=log_error_stack,
-        )
-
-        self.chat_template = chat_template
-        self.chat_template_content_format: Final = chat_template_content_format
-        self.trust_request_chat_template = trust_request_chat_template
-
    async def create_embedding(
        self,
        request: EmbeddingRequest,
@@ -645,16 +624,13 @@ class OpenAIServingEmbedding(EmbeddingMixin):
            raw_request=raw_request,
            model_name=model_name,
            request_id=request_id,
-            chat_template=self.chat_template,
-            chat_template_content_format=self.chat_template_content_format,
        )

-        return await super().handle(ctx)  # type: ignore
+        return await self.handle(ctx)  # type: ignore[return-value]

-    @override
    def _create_pooling_params(
        self,
-        ctx: ServeContext[EmbeddingRequest],
+        ctx: EmbeddingServeContext,
    ) -> PoolingParams | ErrorResponse:
        pooling_params = super()._create_pooling_params(ctx)
        if isinstance(pooling_params, ErrorResponse):
@@ -666,17 +642,3 @@ class OpenAIServingEmbedding(EmbeddingMixin):
            return self.create_error_response(str(e))

        return pooling_params
-
-    async def _preprocess(
-        self,
-        ctx: ServeContext,
-    ) -> ErrorResponse | None:
-        if isinstance(ctx.request, EmbeddingChatRequest):
-            error_check_ret = self._validate_chat_template(
-                request_chat_template=ctx.request.chat_template,
-                chat_template_kwargs=ctx.request.chat_template_kwargs,
-                trust_request_chat_template=self.trust_request_chat_template,
-            )
-            if error_check_ret is not None:
-                return error_check_ret
-        return await super()._preprocess(ctx)
--- a/vllm/entrypoints/utils.py
+++ b/vllm/entrypoints/utils.py
@@ -17,8 +17,10 @@ from starlette.background import BackgroundTask, BackgroundTasks

 from vllm import envs
 from vllm.engine.arg_utils import EngineArgs
+from vllm.inputs import EmbedsPrompt, TokensPrompt
 from vllm.logger import current_formatter_type, init_logger
 from vllm.platforms import current_platform
+from vllm.utils import length_from_prompt_token_ids_or_embeds
 from vllm.utils.argparse_utils import FlexibleArgumentParser

 if TYPE_CHECKING:
@@ -32,11 +34,15 @@ if TYPE_CHECKING:
        StreamOptions,
    )
    from vllm.entrypoints.openai.models.protocol import LoRAModulePath
+    from vllm.entrypoints.openai.responses.protocol import (
+        ResponsesRequest,
+    )
 else:
    ChatCompletionRequest = object
    CompletionRequest = object
    StreamOptions = object
    LoRAModulePath = object
+    ResponsesRequest = object


 logger = init_logger(__name__)
@@ -211,11 +217,26 @@ def _validate_truncation_size(

 def get_max_tokens(
    max_model_len: int,
-    request: "ChatCompletionRequest | CompletionRequest",
-    input_length: int,
+    request: "CompletionRequest | ChatCompletionRequest | ResponsesRequest",
+    prompt: TokensPrompt | EmbedsPrompt,
    default_sampling_params: dict,
 ) -> int:
-    max_tokens = getattr(request, "max_completion_tokens", None) or request.max_tokens
+    # NOTE: Avoid isinstance() for better efficiency
+    max_tokens: int | None = None
+    if max_tokens is None:
+        # ChatCompletionRequest
+        max_tokens = getattr(request, "max_completion_tokens", None)
+    if max_tokens is None:
+        # ResponsesRequest
+        max_tokens = getattr(request, "max_output_tokens", None)
+    if max_tokens is None:
+        # CompletionRequest (also a fallback for ChatCompletionRequest)
+        max_tokens = getattr(request, "max_tokens", None)
+
+    input_length = length_from_prompt_token_ids_or_embeds(
+        prompt.get("prompt_token_ids"),  # type: ignore[arg-type]
+        prompt.get("prompt_embeds"),  # type: ignore[arg-type]
+    )
    default_max_tokens = max_model_len - input_length
    max_output_tokens = current_platform.get_max_output_tokens(input_length)


--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -87,6 +87,7 @@ if TYPE_CHECKING:
    VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5  # seconds
    VLLM_PLUGINS: list[str] | None = None
    VLLM_LORA_RESOLVER_CACHE_DIR: str | None = None
+    VLLM_LORA_RESOLVER_HF_REPO_LIST: str | None = None
    # Deprecated env variables for profiling, kept for backward compatibility
    # See also vllm/config/profiler.py and `--profiler-config` argument
    VLLM_TORCH_CUDA_PROFILE: str | None = None
@@ -288,16 +289,11 @@ def use_aot_compile() -> bool:
    from vllm.model_executor.layers.batch_invariant import (
        vllm_is_batch_invariant,
    )
-    from vllm.platforms import current_platform
    from vllm.utils.torch_utils import is_torch_equal_or_newer

    default_value = (
        "1"
-        if is_torch_equal_or_newer("2.10.0.dev")
-        and not disable_compile_cache()
-        # Disabling AOT_COMPILE for CPU
-        # See: https://github.com/vllm-project/vllm/issues/32033
-        and not current_platform.is_cpu()
+        if is_torch_equal_or_newer("2.10.0.dev") and not disable_compile_cache()
        else "0"
    )

@@ -782,6 +778,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
    ),
    # Backend for Video IO
    # - "opencv": Default backend that uses OpenCV stream buffered backend.
+    # - "identity": Returns raw video bytes for model processor to handle.
    #
    # Custom backend implementations can be registered
    # via `@VIDEO_LOADER_REGISTRY.register("my_custom_video_loader")` and
@@ -873,6 +870,13 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_LORA_RESOLVER_CACHE_DIR": lambda: os.getenv(
        "VLLM_LORA_RESOLVER_CACHE_DIR", None
    ),
+    # A remote HF repo(s) containing one or more LoRA adapters, which
+    # may be downloaded and leveraged as needed. Only works if plugins
+    # are enabled and VLLM_ALLOW_RUNTIME_LORA_UPDATING is enabled.
+    # Values should be comma separated.
+    "VLLM_LORA_RESOLVER_HF_REPO_LIST": lambda: os.getenv(
+        "VLLM_LORA_RESOLVER_HF_REPO_LIST", None
+    ),
    # Enables torch CUDA profiling if set to 1.
    # Deprecated, see profiler_config.
    "VLLM_TORCH_CUDA_PROFILE": lambda: os.getenv("VLLM_TORCH_CUDA_PROFILE"),

--- a/vllm/logging_utils/__init__.py
+++ b/vllm/logging_utils/__init__.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+from vllm.logging_utils.access_log_filter import (
+    UvicornAccessLogFilter,
+    create_uvicorn_log_config,
+)
 from vllm.logging_utils.formatter import ColoredFormatter, NewLineFormatter
 from vllm.logging_utils.lazy import lazy
 from vllm.logging_utils.log_time import logtime
@@ -8,6 +12,8 @@ from vllm.logging_utils.log_time import logtime
 __all__ = [
    "NewLineFormatter",
    "ColoredFormatter",
+    "UvicornAccessLogFilter",
+    "create_uvicorn_log_config",
    "lazy",
    "logtime",
 ]
--- a/vllm/logging_utils/access_log_filter.py
+++ b/vllm/logging_utils/access_log_filter.py
--- a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py
+++ b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py
--- a/vllm/model_executor/layers/fused_moe/all2all_utils.py
+++ b/vllm/model_executor/layers/fused_moe/all2all_utils.py
--- a/vllm/model_executor/layers/fused_moe/config.py
+++ b/vllm/model_executor/layers/fused_moe/config.py
--- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py
+++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py
@@ -103,7 +103,14 @@ def run_cutlass_moe_fp8(
        or a2_scale.size(0) == a1q.shape[0]
    ), "Intermediate scale shape mismatch"
    assert out_dtype in [torch.half, torch.bfloat16], "Invalid output dtype"
-    if expert_map is not None:
+
+    # NOTE(rob): the expert_map is used for the STANDARD case and
+    # the batched format is used by the BATCHED case.
+    # TODO(rob): update the MK interface to only pass the expert_map
+    # during the STANDARD case to make this clearer across all kernels.
+    if use_batched_format:
+        assert expert_num_tokens is not None
+    else:
        assert expert_num_tokens is None

    # We have two modes: batched experts and non-batched experts.
@@ -379,7 +386,10 @@ class CutlassExpertsFp8(CutlassExpertsFp8Base):
        # needed for STANDARD activation format kernels in DP/EP mode.
        # Note that the BATCHED activation format does not use
        # the expert map for identifying experts.
-        return not moe_parallel_config.use_all2all_kernels
+        return not (
+            moe_parallel_config.use_fi_all2allv_kernels
+            or moe_parallel_config.use_deepep_ht_kernels
+        )

    def supports_chunking(self) -> bool:
        return True
@@ -641,10 +651,8 @@ def run_cutlass_moe_fp4(


 class CutlassExpertsFp4(mk.FusedMoEPermuteExpertsUnpermute):
-    @staticmethod
-    def expects_unquantized_inputs(
-        moe_config: FusedMoEConfig, quant_config: FusedMoEQuantConfig
-    ) -> bool:
+    @property
+    def expects_unquantized_inputs(self) -> bool:
        return True

    @staticmethod

--- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
--- a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
--- a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
--- a/vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/flashinfer_a2a_prepare_finalize.py
--- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
+++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
--- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
--- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
--- a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py