[Frontend] Add request_id to the Request object so they can be controlled...

[Frontend] Add request_id to the Request object so that requests can be better controlled via external load balancers (#21009)

Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com>

[Frontend] Add request_id to the Request object so they can be controlled...
[Frontend] Add request_id to the Request object so that requests can be better controlled via external load balancers (#21009)

Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com>
9fe98d42 · kourosh hakhamaneshi · GitHub · 29c6fbe5 · 9fe98d42 · 9fe98d42
Unverified commit 9fe98d42, authored Jul 25, 2025 by kourosh hakhamaneshi; committed by GitHub on Jul 25, 2025
3 changed files
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -1007,6 +1007,13 @@ class CompletionRequest(OpenAIBaseModel):
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."),
    )
+    request_id: str = Field(
+        default_factory=lambda: f"{random_uuid()}",
+        description=(
+            "The request_id related to this request. If the caller does "
+            "not set it, a random_uuid will be generated. This id is used "
+            "through out the inference process and return in response."),
+    )
    logits_processors: Optional[LogitsProcessors] = Field(
        default=None,
        description=(
@@ -1251,6 +1258,13 @@ class EmbeddingCompletionRequest(OpenAIBaseModel):
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."),
    )
+    request_id: str = Field(
+        default_factory=lambda: f"{random_uuid()}",
+        description=(
+            "The request_id related to this request. If the caller does "
+            "not set it, a random_uuid will be generated. This id is used "
+            "through out the inference process and return in response."),
+    )
    # --8<-- [end:embedding-extra-params]
@@ -1302,6 +1316,13 @@ class EmbeddingChatRequest(OpenAIBaseModel):
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."),
    )
+    request_id: str = Field(
+        default_factory=lambda: f"{random_uuid()}",
+        description=(
+            "The request_id related to this request. If the caller does "
+            "not set it, a random_uuid will be generated. This id is used "
+            "through out the inference process and return in response."),
+    )
    # --8<-- [end:chat-embedding-extra-params]
    @model_validator(mode="before")

--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -113,7 +113,9 @@ class OpenAIServingCompletion(OpenAIServing):
            return self.create_error_response(
                "Echo is unsupported with prompt embeds.")
-        request_id = f"cmpl-{self._base_request_id(raw_request)}"
+        request_id = (
+            f"cmpl-"
+            f"{self._base_request_id(raw_request, request.request_id)}")
        created_time = int(time.time())
        request_metadata = RequestResponseMetadata(request_id=request_id)

--- a/vllm/entrypoints/openai/serving_embedding.py
+++ b/vllm/entrypoints/openai/serving_embedding.py
@@ -163,8 +163,9 @@ class OpenAIServingEmbedding(EmbeddingMixin):
        for the API specification. This API mimics the OpenAI Embedding API.
        """
        model_name = self._get_model_name(request.model)
-        request_id = (f"{self.request_id_prefix}-"
+        request_id = (
-                      f"{self._base_request_id(raw_request)}")
+            f"{self.request_id_prefix}-"
+            f"{self._base_request_id(raw_request, request.request_id)}")
        ctx = EmbeddingServeContext(
            request=request,