Unverified commit 2ae809c5, authored by ch-tiger1, committed by GitHub
Browse files

Fix mini_lb for PD with long output: limit chunk size of decode response (#7301)


Signed-off-by: ch-tiger1 <xyz@ch-tech.ip-ddns.com>
Co-authored-by: ch-tiger1 <xyz@ch-tech.ip-ddns.com>
parent 1de4db9b
...@@ -18,6 +18,10 @@ from fastapi.responses import ORJSONResponse, Response, StreamingResponse ...@@ -18,6 +18,10 @@ from fastapi.responses import ORJSONResponse, Response, StreamingResponse
from sglang.srt.disaggregation.utils import PDRegistryRequest from sglang.srt.disaggregation.utils import PDRegistryRequest
# Size in bytes (64 KiB) of each chunk requested from the decode response
# stream; reading in bounded chunks prevents aiohttp's "Chunk too big" error.
AIOHTTP_STREAM_READ_CHUNK_SIZE = 64 * 1024
def setup_logger(): def setup_logger():
logger = logging.getLogger("pdlb") logger = logging.getLogger("pdlb")
...@@ -154,7 +158,9 @@ class MiniLoadBalancer: ...@@ -154,7 +158,9 @@ class MiniLoadBalancer:
else: else:
yield chunk yield chunk
else: else:
async for chunk in decode_response.content: async for chunk in decode_response.content.iter_chunked(
AIOHTTP_STREAM_READ_CHUNK_SIZE
):
yield chunk yield chunk
return StreamingResponse( return StreamingResponse(
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment