Unverified commit 2ae809c5, authored by ch-tiger1, committed by GitHub
Browse files

Fix mini_lb for PD with long output: limit chunk size of decode response (#7301)


Signed-off-by: ch-tiger1 <xyz@ch-tech.ip-ddns.com>
Co-authored-by: ch-tiger1 <xyz@ch-tech.ip-ddns.com>
parent 1de4db9b
......@@ -18,6 +18,10 @@ from fastapi.responses import ORJSONResponse, Response, StreamingResponse
from sglang.srt.disaggregation.utils import PDRegistryRequest
AIOHTTP_STREAM_READ_CHUNK_SIZE = (
1024 * 64
) # 64KB, to prevent aiohttp's "Chunk too big" error
def setup_logger():
logger = logging.getLogger("pdlb")
......@@ -154,7 +158,9 @@ class MiniLoadBalancer:
else:
yield chunk
else:
async for chunk in decode_response.content:
async for chunk in decode_response.content.iter_chunked(
AIOHTTP_STREAM_READ_CHUNK_SIZE
):
yield chunk
return StreamingResponse(
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment