"src/vscode:/vscode.git/clone" did not exist on "2bc82d6381c6bc5ec9c73e43a30f38434db5a9e1"
Unverified commit d279d499, authored by Yuhong Guo, committed via GitHub
Browse files

Fix aiohttp 'Chunk too big' in bench_serving (#6737)

parent 6cb00c63
...@@ -39,7 +39,6 @@ from transformers import ( ...@@ -39,7 +39,6 @@ from transformers import (
PreTrainedTokenizerFast, PreTrainedTokenizerFast,
) )
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
ASSISTANT_SUFFIX = "Assistant:" ASSISTANT_SUFFIX = "Assistant:"
global args global args
...@@ -51,6 +50,19 @@ def _get_bool_env_var(name: str, default: str = "false") -> bool: ...@@ -51,6 +50,19 @@ def _get_bool_env_var(name: str, default: str = "false") -> bool:
return value.lower() in ("true", "1") return value.lower() in ("true", "1")
def _create_bench_client_session():
    """Build an aiohttp ClientSession configured for long benchmark runs.

    Under heavy load the default 64 KiB read buffer can fill up before the
    aio thread drains it, producing aiohttp "Chunk too big" errors; the
    buffer is therefore raised to 10 MiB. The total timeout is stretched
    to 6 hours so very long benchmark sessions are not cut off.

    Returns:
        aiohttp.ClientSession: a session with the enlarged read buffer
        and extended total timeout applied.
    """
    six_hours_in_seconds = 6 * 60 * 60
    ten_mib_in_bytes = 10 * 1024 * 1024
    return aiohttp.ClientSession(
        timeout=aiohttp.ClientTimeout(total=six_hours_in_seconds),
        read_bufsize=ten_mib_in_bytes,
    )
@dataclass @dataclass
class RequestFuncInput: class RequestFuncInput:
prompt: str prompt: str
...@@ -106,7 +118,7 @@ async def async_request_trt_llm( ...@@ -106,7 +118,7 @@ async def async_request_trt_llm(
api_url = request_func_input.api_url api_url = request_func_input.api_url
assert api_url.endswith("generate_stream") assert api_url.endswith("generate_stream")
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: async with _create_bench_client_session() as session:
payload = { payload = {
"accumulate_tokens": True, "accumulate_tokens": True,
"text_input": request_func_input.prompt, "text_input": request_func_input.prompt,
...@@ -179,7 +191,7 @@ async def async_request_openai_completions( ...@@ -179,7 +191,7 @@ async def async_request_openai_completions(
prompt = request_func_input.prompt prompt = request_func_input.prompt
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: async with _create_bench_client_session() as session:
payload = { payload = {
"model": request_func_input.model, "model": request_func_input.model,
"prompt": prompt, "prompt": prompt,
...@@ -261,7 +273,7 @@ async def async_request_truss( ...@@ -261,7 +273,7 @@ async def async_request_truss(
prompt = request_func_input.prompt prompt = request_func_input.prompt
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: async with _create_bench_client_session() as session:
payload = { payload = {
"model": request_func_input.model, "model": request_func_input.model,
"prompt": prompt, "prompt": prompt,
...@@ -338,7 +350,7 @@ async def async_request_sglang_generate( ...@@ -338,7 +350,7 @@ async def async_request_sglang_generate(
api_url = request_func_input.api_url api_url = request_func_input.api_url
prompt = request_func_input.prompt prompt = request_func_input.prompt
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: async with _create_bench_client_session() as session:
payload = { payload = {
("text" if isinstance(prompt, str) else "input_ids"): prompt, ("text" if isinstance(prompt, str) else "input_ids"): prompt,
"sampling_params": { "sampling_params": {
...@@ -437,7 +449,7 @@ async def async_request_gserver( ...@@ -437,7 +449,7 @@ async def async_request_gserver(
async def async_request_profile(api_url: str) -> RequestFuncOutput: async def async_request_profile(api_url: str) -> RequestFuncOutput:
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: async with _create_bench_client_session() as session:
output = RequestFuncOutput() output = RequestFuncOutput()
try: try:
async with session.post(url=api_url) as response: async with session.post(url=api_url) as response:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment