[Misc] Support bench serve long context (#24373)

Signed-off-by: Ming Yang <minos.future@gmail.com>

[Misc] Support bench serve long context (#24373)
Signed-off-by: Ming Yang <minos.future@gmail.com>
1823a00d · Ming Yang · GitHub · ed16d0f2 · 1823a00d · 1823a00d
Unverified Commit 1823a00d authored Sep 08, 2025 by Ming Yang Committed by GitHub Sep 08, 2025
Show whitespace changes
Inline Side-by-side

Showing with 167 additions and 84 deletions

tests/benchmarks/test_serve_cli.py tests/benchmarks/test_serve_cli.py +31 -0

vllm/benchmarks/lib/endpoint_request_func.py vllm/benchmarks/lib/endpoint_request_func.py +136 -84

No files found.
--- a/tests/benchmarks/test_serve_cli.py
+++ b/tests/benchmarks/test_serve_cli.py
@@ -45,3 +45,34 @@ def test_bench_serve(server):
    print(result.stderr)

    assert result.returncode == 0, f"Benchmark failed: {result.stderr}"
+
+@pytest.mark.benchmark
+def test_bench_serve_chat(server):
+    """Smoke-test ``vllm bench serve`` against the chat completions endpoint.
+
+    Runs the CLI as a subprocess with the ``random`` dataset (5 prompts,
+    32 input tokens, 4 output tokens) pointed at ``/v1/chat/completions``
+    with endpoint type ``openai-chat``, and requires a zero exit code.
+    """
+    # Flag/value pairs are kept side by side so each option reads as a unit.
+    cli_args = ["vllm", "bench", "serve"]
+    cli_args += ["--model", MODEL_NAME]
+    cli_args += ["--host", server.host]
+    cli_args += ["--port", str(server.port)]
+    cli_args += ["--dataset-name", "random"]
+    cli_args += ["--random-input-len", "32"]
+    cli_args += ["--random-output-len", "4"]
+    cli_args += ["--num-prompts", "5"]
+    cli_args += ["--endpoint", "/v1/chat/completions"]
+    cli_args += ["--endpoint-type", "openai-chat"]
+
+    proc = subprocess.run(cli_args, capture_output=True, text=True)
+
+    # Echo both streams so a failing run is diagnosable from the test log.
+    for captured in (proc.stdout, proc.stderr):
+        print(captured)
+
+    assert proc.returncode == 0, f"Benchmark failed: {proc.stderr}"
--- a/vllm/benchmarks/lib/endpoint_request_func.py
+++ b/vllm/benchmarks/lib/endpoint_request_func.py
@@ -17,6 +17,47 @@ from tqdm.asyncio import tqdm
 AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)


+class StreamedResponseHandler:
+    """Reassemble complete SSE messages from arbitrarily split byte chunks.
+
+    Chunks are decoded incrementally and appended to ``buffer``. A message is
+    released once its terminating blank line has arrived, or once the
+    remaining buffer is a self-contained ``data: `` payload.
+    """
+
+    def __init__(self):
+        # Imported here so the class does not rely on a module-level import.
+        import codecs
+
+        # Decoded text that has not yet been emitted as a message.
+        self.buffer = ""
+        # A chunk may end in the middle of a multi-byte UTF-8 character. The
+        # incremental decoder holds those trailing bytes back until the rest
+        # arrives, instead of raising UnicodeDecodeError on the partial bytes.
+        self._decoder = codecs.getincrementaldecoder("utf-8")()
+
+    def add_chunk(self, chunk_bytes: bytes) -> list[str]:
+        """Add a chunk of bytes to the buffer and return any complete
+        messages.
+
+        Args:
+            chunk_bytes: Raw bytes read from the response stream. The chunk
+                may contain several messages, part of a message, or part of
+                a multi-byte character.
+
+        Returns:
+            The messages completed by this chunk, whitespace-stripped and in
+            arrival order. Empty when no message is complete yet.
+
+        Raises:
+            UnicodeDecodeError: If the stream contains bytes that are not
+                valid UTF-8.
+        """
+        self.buffer += self._decoder.decode(chunk_bytes)
+
+        messages = []
+
+        # Split by double newlines (SSE message separator)
+        while "\n\n" in self.buffer:
+            message, self.buffer = self.buffer.split("\n\n", 1)
+            message = message.strip()
+            if message:
+                messages.append(message)
+
+        # The separator of the last message may be missing (for example when
+        # the caller strips trailing whitespace from each chunk). Treat the
+        # leftover as complete if, after removing the "data: " prefix, it is
+        # the [DONE] sentinel or parses as JSON.
+        if self.buffer.startswith("data: "):
+            message_content = self.buffer.removeprefix("data: ").strip()
+            if message_content == "[DONE]":
+                messages.append(self.buffer.strip())
+                self.buffer = ""
+            elif message_content:
+                try:
+                    json.loads(message_content)
+                    messages.append(self.buffer.strip())
+                    self.buffer = ""
+                except json.JSONDecodeError:
+                    # Incomplete JSON, wait for more chunks.
+                    pass
+
+        return messages
+
+
 @dataclass
 class RequestFuncInput:
    """The input for the request function."""
@@ -102,18 +143,22 @@ async def async_request_openai_completions(
                                headers=headers) as response:
            if response.status == 200:
                first_chunk_received = False
-                async for chunk_bytes in response.content:
+                handler = StreamedResponseHandler()
+
+                async for chunk_bytes in response.content.iter_any():
                    chunk_bytes = chunk_bytes.strip()
                    if not chunk_bytes:
                        continue
-                    chunk_bytes = chunk_bytes.decode("utf-8")
+
+                    messages = handler.add_chunk(chunk_bytes)
+                    for message in messages:
                        # NOTE: SSE comments (often used as pings) start with
                        # a colon. These are not JSON data payload and should
                        # be skipped.
-                    if chunk_bytes.startswith(":"):
+                        if message.startswith(":"):
                            continue

-                    chunk = chunk_bytes.removeprefix("data: ")
+                        chunk = message.removeprefix("data: ")

                        if chunk != "[DONE]":
                            data = json.loads(chunk)
@@ -227,18 +272,21 @@ async def async_request_openai_chat_completions(
        async with session.post(url=api_url, json=payload,
                                headers=headers) as response:
            if response.status == 200:
-                async for chunk_bytes in response.content:
+                handler = StreamedResponseHandler()
+                async for chunk_bytes in response.content.iter_any():
                    chunk_bytes = chunk_bytes.strip()
                    if not chunk_bytes:
                        continue
-                    chunk_bytes = chunk_bytes.decode("utf-8")
+
+                    messages = handler.add_chunk(chunk_bytes)
+                    for message in messages:
                        # NOTE: SSE comments (often used as pings) start with
                        # a colon. These are not JSON data payload and should
                        # be skipped.
-                    if chunk_bytes.startswith(":"):
+                        if message.startswith(":"):
                            continue

-                    chunk = chunk_bytes.removeprefix("data: ")
+                        chunk = message.removeprefix("data: ")

                        if chunk != "[DONE]":
                            timestamp = time.perf_counter()
@@ -347,12 +395,16 @@ async def async_request_openai_audio(
                                    data=form,
                                    headers=headers) as response:
                if response.status == 200:
-                    async for chunk_bytes in response.content:
+                    handler = StreamedResponseHandler()
+
+                    async for chunk_bytes in response.content.iter_any():
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

-                        chunk = chunk_bytes.decode("utf-8").removeprefix(
+                        messages = handler.add_chunk(chunk_bytes)
+                        for message in messages:
+                            chunk = message.decode("utf-8").removeprefix(
                                "data: ")
                            if chunk != "[DONE]":
                                timestamp = time.perf_counter()