[PD] let toy proxy handle /chat/completions (#19730)

Signed-off-by: Linkun <github@lkchen.net>

[PD] let toy proxy handle /chat/completions (#19730)
Signed-off-by: Linkun <github@lkchen.net>
4734704b · lkchen · GitHub · 8b8c209e · 4734704b
Unverified commit 4734704b, authored Jun 25, 2025 by lkchen and committed by GitHub on Jun 25, 2025.
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 15 additions and 7 deletions

tests/v1/kv_connector/nixl_integration/toy_proxy_server.py (+15 -7)

No files found.
--- a/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py
+++ b/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py
@@ -196,8 +196,7 @@ async def stream_service_response(client_info: dict, endpoint: str,
            yield chunk
-@app.post("/v1/completions")
+async def _handle_completions(api: str, request: Request):
-async def handle_completions(request: Request):
    try:
        req_data = await request.json()
        request_id = str(uuid.uuid4())
@@ -206,9 +205,8 @@ async def handle_completions(request: Request):
        prefill_client_info = get_next_client(request.app, 'prefill')
        # Send request to prefill service
-        response = await send_request_to_service(prefill_client_info,
+        response = await send_request_to_service(prefill_client_info, api,
-                                                 "/completions", req_data,
+                                                 req_data, request_id)
-                                                 request_id)
        # Extract the needed fields
        response_json = response.json()
@@ -224,7 +222,7 @@ async def handle_completions(request: Request):
        # Stream response from decode service
        async def generate_stream():
            async for chunk in stream_service_response(decode_client_info,
-                                                       "/completions",
+                                                       api,
                                                       req_data,
                                                       request_id=request_id):
                yield chunk
@@ -237,12 +235,22 @@ async def handle_completions(request: Request):
        import traceback
        exc_info = sys.exc_info()
        print("Error occurred in disagg prefill proxy server"
-              " - completions endpoint")
+              f" - {api} endpoint")
        print(e)
        print("".join(traceback.format_exception(*exc_info)))
        raise
+@app.post("/v1/completions")
+async def handle_completions(request: Request):
+    return await _handle_completions("/completions", request)
+@app.post("/v1/chat/completions")
+async def handle_chat_completions(request: Request):
+    return await _handle_completions("/chat/completions", request)
 @app.get("/healthcheck")
 async def healthcheck():
    """Simple endpoint to check if the server is running."""