Unverified Commit 066cf445 authored by Chang Su's avatar Chang Su Committed by GitHub
Browse files

[OAI] Add rid tracing for v1/embeddings and fix rid type in Chat (#6397)

parent 6dc6b306
......@@ -918,8 +918,8 @@ class FlashAttentionBackend(AttentionBackend):
and local_attn_metadata is not None
and (hasattr(layer, "use_irope") and layer.use_irope)
)
# When Spec Decode is enabled, forward_decode would be called with two modes:
# When Spec Decode is enabled, forward_decode would be called with two modes:
# 1. DRAFT_DECODE: we enable cascade attention when top_k > 1
# 2. IDLE: we don't need cascade attention, spec_info will be None in this case
use_cascade_attn = forward_batch.spec_info is not None and self.topk > 1
......
......@@ -1827,8 +1827,10 @@ def v1_embedding_request(all_requests, tokenizer_manager):
)
else:
prompt_kwargs = {"input_ids": prompts}
request_ids = [req.request_id for req in all_requests]
adapted_request = EmbeddingReqInput(
rid=request_ids,
**prompt_kwargs,
)
......
......@@ -393,7 +393,7 @@ class ChatCompletionRequest(BaseModel):
chat_template_kwargs: Optional[Dict] = None
# The request id.
rid: Optional[Union[List[str], str]] = None
rid: Optional[str] = None
# For PD disaggregation
bootstrap_host: Optional[str] = None
......@@ -469,6 +469,9 @@ class EmbeddingRequest(BaseModel):
dimensions: int = None
user: Optional[str] = None
# The request id.
rid: Optional[str] = None
class EmbeddingObject(BaseModel):
embedding: List[float]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment