Unverified Commit 63cfe1b0 authored by Keyang Ru's avatar Keyang Ru Committed by GitHub

[router] Add gRPC E2E test suite (#11790)

parent 70f6309c
@@ -86,7 +86,7 @@ jobs:
  pytest-rust:
    if: github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-ci')
    runs-on: 4-gpu-a10
-   timeout-minutes: 25
+   timeout-minutes: 32
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
@@ -144,6 +144,12 @@ jobs:
          python3 -m pip --no-cache-dir install --upgrade --break-system-packages genai-bench==0.0.2
          pytest -m e2e -s -vv -o log_cli=true --log-cli-level=INFO
+     - name: Run Python E2E gRPC tests
+       run: |
+         bash scripts/killall_sglang.sh "nuk_gpus"
+         cd sgl-router
+         SHOW_ROUTER_LOGS=1 ROUTER_LOCAL_MODEL_PATH="/home/ubuntu/models" pytest py_test/e2e_grpc -s -vv -o log_cli=true --log-cli-level=INFO
      - name: Upload benchmark results
        if: success()
        uses: actions/upload-artifact@v4
...
"""
gRPC Router E2E Test - OpenAI Server API Compatibility
This test file is REUSED from test/srt/openai_server/basic/test_openai_server.py
with minimal changes:
- Swap popen_launch_server() → popen_launch_workers_and_router()
- Update teardown to cleanup router + workers
- All test logic and assertions remain identical
Run with:
python3 -m pytest e2e_grpc/basic/test_openai_server.py -v
python3 -m unittest e2e_grpc.basic.test_openai_server.TestOpenAIServer.test_chat_completion
"""
import json
# CHANGE: Import router launcher instead of server launcher
import sys
import unittest
from pathlib import Path
import openai
import requests
_TEST_DIR = Path(__file__).parent
sys.path.insert(0, str(_TEST_DIR.parent))
from fixtures import popen_launch_workers_and_router
from util import (
DEFAULT_MODEL_PATH,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
get_tokenizer,
kill_process_tree,
)
class TestOpenAIServer(CustomTestCase):
"""
Test OpenAI API through gRPC router.
REUSED from test/srt/openai_server/basic/test_openai_server.py
ONLY CHANGE: Server launch mechanism
    - Launches SGLang workers with --grpc-mode
- Launches gRPC router pointing to those workers
"""
@classmethod
def setUpClass(cls):
cls.model = DEFAULT_MODEL_PATH
cls.base_url = DEFAULT_URL_FOR_TEST
cls.api_key = "sk-123456"
cls.cluster = popen_launch_workers_and_router(
cls.model,
cls.base_url,
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
num_workers=1,
tp_size=2,
policy="round_robin",
api_key=cls.api_key,
)
cls.base_url += "/v1"
cls.tokenizer = get_tokenizer(cls.model)
@classmethod
def tearDownClass(cls):
# Cleanup router and workers
kill_process_tree(cls.cluster["router"].pid)
for worker in cls.cluster.get("workers", []):
kill_process_tree(worker.pid)
# ALL TEST METHODS BELOW ARE UNCHANGED FROM ORIGINAL
# They validate that the router maintains OpenAI API compatibility
def run_chat_completion(self, logprobs, parallel_sample_num):
client = openai.Client(api_key=self.api_key, base_url=self.base_url)
response = client.chat.completions.create(
model=self.model,
messages=[
{"role": "system", "content": "You are a helpful AI assistant"},
{
"role": "user",
"content": "What is the capital of France? Answer in a few words.",
},
],
temperature=0,
logprobs=logprobs is not None and logprobs > 0,
top_logprobs=logprobs,
n=parallel_sample_num,
)
if logprobs:
assert isinstance(
response.choices[0].logprobs.content[0].top_logprobs[0].token, str
)
ret_num_top_logprobs = len(
response.choices[0].logprobs.content[0].top_logprobs
)
assert (
ret_num_top_logprobs == logprobs
), f"{ret_num_top_logprobs} vs {logprobs}"
assert len(response.choices) == parallel_sample_num
assert response.choices[0].message.role == "assistant"
assert isinstance(response.choices[0].message.content, str)
assert response.id
assert response.created
assert response.usage.prompt_tokens > 0
assert response.usage.completion_tokens > 0
assert response.usage.total_tokens > 0
def run_chat_completion_stream(self, logprobs, parallel_sample_num=1):
client = openai.Client(api_key=self.api_key, base_url=self.base_url)
generator = client.chat.completions.create(
model=self.model,
messages=[
{"role": "system", "content": "You are a helpful AI assistant"},
{"role": "user", "content": "What is the capital of France?"},
],
temperature=0,
logprobs=logprobs is not None and logprobs > 0,
top_logprobs=logprobs,
stream=True,
stream_options={"include_usage": True},
n=parallel_sample_num,
)
is_firsts = {}
is_finished = {}
finish_reason_counts = {}
for response in generator:
usage = response.usage
if usage is not None:
assert usage.prompt_tokens > 0, f"usage.prompt_tokens was zero"
assert usage.completion_tokens > 0, f"usage.completion_tokens was zero"
assert usage.total_tokens > 0, f"usage.total_tokens was zero"
continue
index = response.choices[0].index
finish_reason = response.choices[0].finish_reason
if finish_reason is not None:
is_finished[index] = True
finish_reason_counts[index] = finish_reason_counts.get(index, 0) + 1
data = response.choices[0].delta
if is_firsts.get(index, True):
assert (
data.role == "assistant"
), f"data.role was not 'assistant' for first chunk"
is_firsts[index] = False
continue
if logprobs and not is_finished.get(index, False):
assert response.choices[0].logprobs, f"logprobs was not returned"
assert isinstance(
response.choices[0].logprobs.content[0].top_logprobs[0].token, str
), f"top_logprobs token was not a string"
assert isinstance(
response.choices[0].logprobs.content[0].top_logprobs, list
), f"top_logprobs was not a list"
ret_num_top_logprobs = len(
response.choices[0].logprobs.content[0].top_logprobs
)
assert (
ret_num_top_logprobs == logprobs
), f"{ret_num_top_logprobs} vs {logprobs}"
assert (
isinstance(data.content, str)
or isinstance(data.reasoning_content, str)
or (isinstance(data.tool_calls, list) and len(data.tool_calls) > 0)
or response.choices[0].finish_reason
)
assert response.id
assert response.created
        for index in range(parallel_sample_num):
            assert not is_firsts.get(
                index, True
            ), f"index {index} was not found in the response"
for index in range(parallel_sample_num):
assert (
index in finish_reason_counts
), f"No finish_reason found for index {index}"
assert (
finish_reason_counts[index] == 1
), f"Expected 1 finish_reason chunk for index {index}, got {finish_reason_counts[index]}"
def test_chat_completion(self):
for logprobs in [None, 5]:
for parallel_sample_num in [1, 2]:
self.run_chat_completion(logprobs, parallel_sample_num)
def test_chat_completion_stream(self):
for logprobs in [None, 5]:
for parallel_sample_num in [1, 2]:
self.run_chat_completion_stream(logprobs, parallel_sample_num)
def test_regex(self):
client = openai.Client(api_key=self.api_key, base_url=self.base_url)
regex = (
r"""\{\n"""
+ r""" "name": "[\w]+",\n"""
+ r""" "population": [\d]+\n"""
+ r"""\}"""
)
response = client.chat.completions.create(
model=self.model,
messages=[
{"role": "system", "content": "You are a helpful AI assistant"},
{"role": "user", "content": "Introduce the capital of France."},
],
temperature=0,
max_tokens=128,
extra_body={"regex": regex},
)
text = response.choices[0].message.content
try:
js_obj = json.loads(text)
except (TypeError, json.decoder.JSONDecodeError):
print("JSONDecodeError", text)
raise
assert isinstance(js_obj["name"], str)
assert isinstance(js_obj["population"], int)
def test_penalty(self):
client = openai.Client(api_key=self.api_key, base_url=self.base_url)
response = client.chat.completions.create(
model=self.model,
messages=[
{"role": "system", "content": "You are a helpful AI assistant"},
{"role": "user", "content": "Introduce the capital of France."},
],
temperature=0,
max_tokens=32,
frequency_penalty=1.0,
)
text = response.choices[0].message.content
assert isinstance(text, str)
def test_response_prefill(self):
client = openai.Client(api_key=self.api_key, base_url=self.base_url)
response = client.chat.completions.create(
model=self.model,
messages=[
{"role": "system", "content": "You are a helpful AI assistant"},
{
"role": "user",
"content": """
Extract the name, size, price, and color from this product description as a JSON object:
<description>
The SmartHome Mini is a compact smart home assistant available in black or white for only $49.99. At just 5 inches wide, it lets you control lights, thermostats, and other connected devices via voice or app—no matter where you place it in your home. This affordable little hub brings convenient hands-free control to your smart devices.
</description>
""",
},
{
"role": "assistant",
"content": "{\n",
},
],
temperature=0,
extra_body={"continue_final_message": True},
)
assert (
response.choices[0]
.message.content.strip()
.startswith('"name": "SmartHome Mini",')
)
def test_model_list(self):
client = openai.Client(api_key=self.api_key, base_url=self.base_url)
        # TODO: Update the logic here once the router's /v1/models response format matches the OpenAI API standard
models = list(client.models.list().models)
assert len(models) == 1
# assert isinstance(getattr(models[0], "max_model_len", None), int)
@unittest.skip("Skipping retrieve model test as it is not supported by the router")
def test_retrieve_model(self):
client = openai.Client(api_key=self.api_key, base_url=self.base_url)
retrieved_model = client.models.retrieve(self.model)
self.assertEqual(retrieved_model.id, self.model)
self.assertEqual(retrieved_model.root, self.model)
with self.assertRaises(openai.NotFoundError):
client.models.retrieve("non-existent-model")
if __name__ == "__main__":
unittest.main()
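The regex passed via extra_body in test_regex above is what makes the json.loads round-trip safe: it constrains decoding to a fixed two-field JSON shape. A standalone sanity check of that pattern, independent of any server; the sample string is invented for illustration:

import json
import re

regex = (
    r"""\{\n"""
    + r""" "name": "[\w]+",\n"""
    + r""" "population": [\d]+\n"""
    + r"""\}"""
)

# A completion that satisfies the constraint parses cleanly as JSON.
sample = '{\n "name": "Paris",\n "population": 2100000\n}'
assert re.fullmatch(regex, sample)
obj = json.loads(sample)
assert isinstance(obj["name"], str) and isinstance(obj["population"], int)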
"""
Pytest configuration for gRPC router e2e tests.
This module provides shared fixtures that can be used across all gRPC router tests.
"""
import sys
from pathlib import Path
import pytest
# Ensure router py_src is importable
_ROUTER_ROOT = Path(__file__).resolve().parents[2]
_ROUTER_SRC = _ROUTER_ROOT / "py_src"
if str(_ROUTER_SRC) not in sys.path:
sys.path.insert(0, str(_ROUTER_SRC))
# Ensure e2e_grpc test utilities are importable
_E2E_GRPC_DIR = Path(__file__).parent
if str(_E2E_GRPC_DIR) not in sys.path:
sys.path.insert(0, str(_E2E_GRPC_DIR))
# Pytest markers for test organization
def pytest_configure(config):
config.addinivalue_line("markers", "e2e: end-to-end tests with real workers")
config.addinivalue_line("markers", "grpc: gRPC-specific tests")
config.addinivalue_line("markers", "slow: slow-running tests")
config.addinivalue_line("markers", "pd: prefill-decode disaggregation tests")
"""
Usage:
python3 -m unittest openai_server.features.test_enable_thinking.TestEnableThinking.test_chat_completion_with_reasoning
python3 -m unittest openai_server.features.test_enable_thinking.TestEnableThinking.test_chat_completion_without_reasoning
python3 -m unittest openai_server.features.test_enable_thinking.TestEnableThinking.test_stream_chat_completion_with_reasoning
python3 -m unittest openai_server.features.test_enable_thinking.TestEnableThinking.test_stream_chat_completion_without_reasoning
"""
import asyncio
import json
import os
import sys
import time
import unittest
# CHANGE: Import router launcher instead of server launcher
from pathlib import Path
import openai
import requests
_TEST_DIR = Path(__file__).parent
sys.path.insert(0, str(_TEST_DIR.parent))
from fixtures import popen_launch_workers_and_router
from util import (
DEFAULT_ENABLE_THINKING_MODEL_PATH,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
get_tokenizer,
kill_process_tree,
)
class TestEnableThinking(CustomTestCase):
@classmethod
def setUpClass(cls):
# CHANGE: Launch gRPC router with integrated workers (single command)
cls.model = DEFAULT_ENABLE_THINKING_MODEL_PATH
cls.base_url = DEFAULT_URL_FOR_TEST
cls.api_key = "sk-1234"
cls.cluster = popen_launch_workers_and_router(
cls.model,
cls.base_url,
timeout=120,
api_key=cls.api_key,
router_args=[
"--reasoning-parser",
"qwen3",
],
num_workers=1,
tp_size=4,
)
cls.additional_chat_kwargs = {}
@classmethod
def tearDownClass(cls):
# Cleanup router and workers
kill_process_tree(cls.cluster["router"].pid)
for worker in cls.cluster.get("workers", []):
kill_process_tree(worker.pid)
def test_chat_completion_with_reasoning(self):
# Test non-streaming with "enable_thinking": True, reasoning_content should not be empty
client = requests.post(
f"{self.base_url}/v1/chat/completions",
headers={"Authorization": f"Bearer {self.api_key}"},
json={
"model": self.model,
"messages": [{"role": "user", "content": "Hello"}],
"temperature": 0,
"separate_reasoning": True,
"chat_template_kwargs": {"enable_thinking": True},
**self.additional_chat_kwargs,
},
)
self.assertEqual(client.status_code, 200, f"Failed with: {client.text}")
data = client.json()
self.assertIn("choices", data)
self.assertTrue(len(data["choices"]) > 0)
self.assertIn("message", data["choices"][0])
self.assertIn("reasoning_content", data["choices"][0]["message"])
self.assertIsNotNone(data["choices"][0]["message"]["reasoning_content"])
def test_chat_completion_without_reasoning(self):
# Test non-streaming with "enable_thinking": False, reasoning_content should be empty
client = requests.post(
f"{self.base_url}/v1/chat/completions",
headers={"Authorization": f"Bearer {self.api_key}"},
json={
"model": self.model,
"messages": [{"role": "user", "content": "Hello"}],
"temperature": 0,
"separate_reasoning": True,
"chat_template_kwargs": {"enable_thinking": False},
**self.additional_chat_kwargs,
},
)
self.assertEqual(client.status_code, 200, f"Failed with: {client.text}")
data = client.json()
self.assertIn("choices", data)
self.assertTrue(len(data["choices"]) > 0)
self.assertIn("message", data["choices"][0])
if "reasoning_content" in data["choices"][0]["message"]:
self.assertIsNone(data["choices"][0]["message"]["reasoning_content"])
def test_stream_chat_completion_with_reasoning(self):
# Test streaming with "enable_thinking": True, reasoning_content should not be empty
response = requests.post(
f"{self.base_url}/v1/chat/completions",
headers={"Authorization": f"Bearer {self.api_key}"},
json={
"model": self.model,
"messages": [{"role": "user", "content": "Hello"}],
"temperature": 0,
"separate_reasoning": True,
"stream": True,
"chat_template_kwargs": {"enable_thinking": True},
**self.additional_chat_kwargs,
},
stream=True,
)
self.assertEqual(response.status_code, 200, f"Failed with: {response.text}")
has_reasoning = False
has_content = False
print("\n=== Stream With Reasoning ===")
for line in response.iter_lines():
if line:
line = line.decode("utf-8")
if line.startswith("data:") and not line.startswith("data: [DONE]"):
data = json.loads(line[6:])
if "choices" in data and len(data["choices"]) > 0:
delta = data["choices"][0].get("delta", {})
if "reasoning_content" in delta and delta["reasoning_content"]:
has_reasoning = True
if "content" in delta and delta["content"]:
has_content = True
self.assertTrue(
has_reasoning,
"The reasoning content is not included in the stream response",
)
self.assertTrue(
has_content, "The stream response does not contain normal content"
)
def test_stream_chat_completion_without_reasoning(self):
# Test streaming with "enable_thinking": False, reasoning_content should be empty
response = requests.post(
f"{self.base_url}/v1/chat/completions",
headers={"Authorization": f"Bearer {self.api_key}"},
json={
"model": self.model,
"messages": [{"role": "user", "content": "Hello"}],
"temperature": 0,
"separate_reasoning": True,
"stream": True,
"chat_template_kwargs": {"enable_thinking": False},
**self.additional_chat_kwargs,
},
stream=True,
)
self.assertEqual(response.status_code, 200, f"Failed with: {response.text}")
has_reasoning = False
has_content = False
print("\n=== Stream Without Reasoning ===")
for line in response.iter_lines():
if line:
line = line.decode("utf-8")
if line.startswith("data:") and not line.startswith("data: [DONE]"):
data = json.loads(line[6:])
if "choices" in data and len(data["choices"]) > 0:
delta = data["choices"][0].get("delta", {})
if "reasoning_content" in delta and delta["reasoning_content"]:
has_reasoning = True
if "content" in delta and delta["content"]:
has_content = True
self.assertFalse(
has_reasoning,
"The reasoning content should not be included in the stream response",
)
self.assertTrue(
has_content, "The stream response does not contain normal content"
)
if __name__ == "__main__":
unittest.main()
"""
Usage:
python3 -m unittest openai_server.features.test_reasoning_content.TestReasoningContentAPI.test_streaming_separate_reasoning_false
python3 -m unittest openai_server.features.test_reasoning_content.TestReasoningContentAPI.test_streaming_separate_reasoning_true
python3 -m unittest openai_server.features.test_reasoning_content.TestReasoningContentAPI.test_streaming_separate_reasoning_true_stream_reasoning_false
python3 -m unittest openai_server.features.test_reasoning_content.TestReasoningContentAPI.test_nonstreaming_separate_reasoning_false
python3 -m unittest openai_server.features.test_reasoning_content.TestReasoningContentAPI.test_nonstreaming_separate_reasoning_true
python3 -m unittest openai_server.features.test_reasoning_content.TestReasoningContentStartup.test_nonstreaming
python3 -m unittest openai_server.features.test_reasoning_content.TestReasoningContentStartup.test_streaming
"""
import json
# CHANGE: Import router launcher instead of server launcher
import sys
import unittest
from pathlib import Path
import openai
import requests
_TEST_DIR = Path(__file__).parent
sys.path.insert(0, str(_TEST_DIR.parent))
from fixtures import popen_launch_workers_and_router
from util import (
DEFAULT_REASONING_MODEL_PATH,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
kill_process_tree,
)
class TestReasoningContentAPI(CustomTestCase):
@classmethod
def setUpClass(cls):
# CHANGE: Launch gRPC router with integrated workers (single command)
cls.model = DEFAULT_REASONING_MODEL_PATH
cls.base_url = DEFAULT_URL_FOR_TEST
cls.api_key = "sk-1234"
cls.cluster = popen_launch_workers_and_router(
cls.model,
cls.base_url,
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
api_key=cls.api_key,
router_args=[
"--reasoning-parser",
"deepseek_r1",
],
num_workers=1,
tp_size=2,
)
cls.base_url += "/v1"
@classmethod
def tearDownClass(cls):
# Cleanup router and workers
kill_process_tree(cls.cluster["router"].pid)
for worker in cls.cluster.get("workers", []):
kill_process_tree(worker.pid)
def test_streaming_separate_reasoning_false(self):
# Test streaming with separate_reasoning=False, reasoning_content should be empty
client = openai.Client(api_key=self.api_key, base_url=self.base_url)
payload = {
"model": self.model,
"messages": [
{
"role": "user",
"content": "What is 1+3?",
}
],
"max_tokens": 100,
"stream": True,
"extra_body": {"separate_reasoning": False},
}
response = client.chat.completions.create(**payload)
reasoning_content = ""
content = ""
for chunk in response:
if chunk.choices[0].delta.content:
content += chunk.choices[0].delta.content
elif chunk.choices[0].delta.reasoning_content:
reasoning_content += chunk.choices[0].delta.reasoning_content
assert len(reasoning_content) == 0
assert len(content) > 0
def test_streaming_separate_reasoning_true(self):
# Test streaming with separate_reasoning=True, reasoning_content should not be empty
client = openai.Client(api_key=self.api_key, base_url=self.base_url)
payload = {
"model": self.model,
"messages": [
{
"role": "user",
"content": "What is 1+3?",
}
],
"max_tokens": 100,
"stream": True,
"extra_body": {"separate_reasoning": True},
}
response = client.chat.completions.create(**payload)
reasoning_content = ""
content = ""
for chunk in response:
if chunk.choices[0].delta.content:
content += chunk.choices[0].delta.content
elif chunk.choices[0].delta.reasoning_content:
reasoning_content += chunk.choices[0].delta.reasoning_content
assert len(reasoning_content) > 0
assert len(content) > 0
def test_streaming_separate_reasoning_true_stream_reasoning_false(self):
        # Test streaming with separate_reasoning=True and stream_reasoning=False;
        # reasoning_content should still be returned, but in a single chunk rather than streamed token by token
client = openai.Client(api_key=self.api_key, base_url=self.base_url)
payload = {
"model": self.model,
"messages": [
{
"role": "user",
"content": "What is 1+3?",
}
],
"max_tokens": 100,
"stream": True,
"extra_body": {"separate_reasoning": True, "stream_reasoning": False},
}
response = client.chat.completions.create(**payload)
reasoning_content = ""
content = ""
first_chunk = False
for chunk in response:
if chunk.choices[0].delta.reasoning_content:
reasoning_content = chunk.choices[0].delta.reasoning_content
first_chunk = True
if chunk.choices[0].delta.content:
content += chunk.choices[0].delta.content
if not first_chunk:
reasoning_content = chunk.choices[0].delta.reasoning_content
first_chunk = True
if not first_chunk:
assert (
not chunk.choices[0].delta.reasoning_content
or len(chunk.choices[0].delta.reasoning_content) == 0
)
assert len(reasoning_content) > 0
assert len(content) > 0
def test_nonstreaming_separate_reasoning_false(self):
# Test non-streaming with separate_reasoning=False, reasoning_content should be empty
client = openai.Client(api_key=self.api_key, base_url=self.base_url)
payload = {
"model": self.model,
"messages": [
{
"role": "user",
"content": "What is 1+3?",
}
],
"max_tokens": 100,
"extra_body": {"separate_reasoning": False},
}
response = client.chat.completions.create(**payload)
assert (
not response.choices[0].message.reasoning_content
or len(response.choices[0].message.reasoning_content) == 0
)
assert len(response.choices[0].message.content) > 0
def test_nonstreaming_separate_reasoning_true(self):
# Test non-streaming with separate_reasoning=True, reasoning_content should not be empty
client = openai.Client(api_key=self.api_key, base_url=self.base_url)
payload = {
"model": self.model,
"messages": [
{
"role": "user",
"content": "What is 1+3?",
}
],
"max_tokens": 100,
"extra_body": {"separate_reasoning": True},
}
response = client.chat.completions.create(**payload)
assert len(response.choices[0].message.reasoning_content) > 0
assert len(response.choices[0].message.content) > 0
if __name__ == "__main__":
unittest.main()
"""
Fixtures for launching gRPC router + workers for e2e testing.
This module provides fixtures for launching SGLang workers and gRPC router separately:
1. Launch N SGLang workers with gRPC enabled
2. Launch router pointing to those workers
This approach gives more control and matches production deployment patterns.
"""
import socket
import subprocess
import time
from typing import Optional
import requests
def find_free_port() -> int:
"""Find an available port on localhost."""
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(("127.0.0.1", 0))
return s.getsockname()[1]
def wait_for_workers_ready(
router_url: str,
expected_workers: int,
timeout: int = 300,
api_key: Optional[str] = None,
) -> None:
"""
Wait for router to have all workers connected.
Polls the /workers endpoint until the 'total' field matches expected_workers.
Example response from /workers endpoint:
{"workers":[],"total":0,"stats":{"prefill_count":0,"decode_count":0,"regular_count":0}}
Args:
router_url: Base URL of router (e.g., "http://127.0.0.1:30000")
expected_workers: Number of workers expected to be connected
timeout: Max seconds to wait
api_key: Optional API key for authentication
"""
start_time = time.time()
last_error = None
attempt = 0
headers = {}
if api_key:
headers["Authorization"] = f"Bearer {api_key}"
with requests.Session() as session:
while time.time() - start_time < timeout:
attempt += 1
elapsed = int(time.time() - start_time)
# Print progress every 10 seconds
if elapsed > 0 and elapsed % 10 == 0 and attempt % 10 == 0:
print(f" Still waiting for workers... ({elapsed}/{timeout}s elapsed)")
try:
response = session.get(
f"{router_url}/workers", headers=headers, timeout=5
)
if response.status_code == 200:
data = response.json()
total_workers = data.get("total", 0)
if total_workers == expected_workers:
print(
f" All {expected_workers} workers connected after {elapsed}s"
)
return
else:
last_error = f"Workers: {total_workers}/{expected_workers}"
else:
last_error = f"HTTP {response.status_code}"
except requests.ConnectionError:
last_error = "Connection refused (router not ready yet)"
except requests.Timeout:
last_error = "Timeout"
except requests.RequestException as e:
last_error = str(e)
except (ValueError, KeyError) as e:
last_error = f"Invalid response: {e}"
time.sleep(1)
raise TimeoutError(
f"Router at {router_url} did not get {expected_workers} workers within {timeout}s.\n"
f"Last status: {last_error}\n"
f"Hint: Run with SHOW_ROUTER_LOGS=1 to see startup logs"
)
def popen_launch_workers_and_router(
model: str,
base_url: str,
timeout: int = 300,
num_workers: int = 2,
policy: str = "round_robin",
api_key: Optional[str] = None,
worker_args: Optional[list] = None,
router_args: Optional[list] = None,
tp_size: int = 1,
env: Optional[dict] = None,
stdout=None,
stderr=None,
) -> dict:
"""
Launch SGLang workers and gRPC router separately.
This approach:
1. Starts N SGLang workers with --grpc-mode flag
2. Waits for workers to initialize (process startup)
3. Starts a gRPC router pointing to those workers
4. Waits for router health check to pass (router validates worker connectivity)
This matches production deployment patterns better than the integrated approach.
Args:
model: Model path (e.g., /home/ubuntu/models/llama-3.1-8b-instruct)
base_url: Base URL for router (e.g., "http://127.0.0.1:8080")
timeout: Timeout for server startup (default: 300s)
num_workers: Number of workers to launch
policy: Routing policy (round_robin, random, power_of_two, cache_aware)
api_key: Optional API key for router
worker_args: Additional arguments for workers (e.g., ["--context-len", "8192"])
router_args: Additional arguments for router (e.g., ["--max-total-token", "1536"])
tp_size: Tensor parallelism size for workers (default: 1)
env: Optional environment variables for workers (e.g., {"SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION": "256"})
stdout: Optional file handle for worker stdout (default: subprocess.PIPE)
stderr: Optional file handle for worker stderr (default: subprocess.PIPE)
Returns:
dict with:
- workers: list of worker process objects
- worker_urls: list of gRPC worker URLs
- router: router process object
- base_url: router URL (HTTP endpoint)
Example:
>>> cluster = popen_launch_workers_and_router(model, base_url, num_workers=2)
>>> # Use cluster['base_url'] for HTTP requests
>>> # Cleanup:
>>> for worker in cluster['workers']:
>>> kill_process_tree(worker.pid)
>>> kill_process_tree(cluster['router'].pid)
"""
import os
show_output = os.environ.get("SHOW_ROUTER_LOGS", "0") == "1"
# Note: timeout parameter is used for router health check below
# Parse router port from base_url
if ":" in base_url.split("//")[-1]:
router_port = int(base_url.split(":")[-1])
else:
router_port = find_free_port()
print(f"\n{'='*70}")
print(f"Launching gRPC cluster (separate workers + router)")
print(f"{'='*70}")
print(f" Model: {model}")
print(f" Router port: {router_port}")
print(f" Workers: {num_workers}")
print(f" TP size: {tp_size}")
print(f" Policy: {policy}")
# Step 1: Launch workers with gRPC enabled
workers = []
worker_urls = []
for i in range(num_workers):
worker_port = find_free_port()
worker_url = f"grpc://127.0.0.1:{worker_port}"
worker_urls.append(worker_url)
print(f"\n[Worker {i+1}/{num_workers}]")
print(f" Port: {worker_port}")
print(f" URL: {worker_url}")
# Build worker command
worker_cmd = [
"python3",
"-m",
"sglang.launch_server",
"--model-path",
model,
"--host",
"127.0.0.1",
"--port",
str(worker_port),
"--grpc-mode", # Enable gRPC for this worker
"--mem-fraction-static",
"0.8",
"--attention-backend",
"fa3",
]
# Add TP size
if tp_size > 1:
worker_cmd.extend(["--tp-size", str(tp_size)])
# Add worker-specific args
if worker_args:
worker_cmd.extend(worker_args)
# Launch worker with optional environment variables
if show_output:
worker_proc = subprocess.Popen(
worker_cmd,
env=env,
stdout=stdout,
stderr=stderr,
)
else:
worker_proc = subprocess.Popen(
worker_cmd,
stdout=stdout if stdout is not None else subprocess.PIPE,
stderr=stderr if stderr is not None else subprocess.PIPE,
env=env,
)
workers.append(worker_proc)
print(f" PID: {worker_proc.pid}")
# Give workers a moment to start binding to ports
# The router will check worker health when it starts
print(f"\nWaiting for {num_workers} workers to initialize (20s)...")
time.sleep(20)
# Quick check: make sure worker processes are still alive
for i, worker in enumerate(workers):
if worker.poll() is not None:
print(f" ✗ Worker {i+1} died during startup (exit code: {worker.poll()})")
# Cleanup: kill all workers
for w in workers:
try:
w.kill()
except:
pass
raise RuntimeError(f"Worker {i+1} failed to start")
print(f"✓ All {num_workers} workers started (router will verify connectivity)")
# Step 2: Launch router pointing to workers
print(f"\n[Router]")
print(f" Port: {router_port}")
print(f" Worker URLs: {', '.join(worker_urls)}")
# Build router command
router_cmd = [
"python3",
"-m",
"sglang_router.launch_router",
"--host",
"127.0.0.1",
"--port",
str(router_port),
"--prometheus-port",
"9321",
"--policy",
policy,
"--model-path",
model,
]
# Add worker URLs
router_cmd.append("--worker-urls")
router_cmd.extend(worker_urls)
# Add API key
if api_key:
router_cmd.extend(["--api-key", api_key])
# Add router-specific args
if router_args:
router_cmd.extend(router_args)
if show_output:
print(f" Command: {' '.join(router_cmd)}")
# Launch router
if show_output:
router_proc = subprocess.Popen(router_cmd)
else:
router_proc = subprocess.Popen(
router_cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
print(f" PID: {router_proc.pid}")
# Wait for router to be ready
router_url = f"http://127.0.0.1:{router_port}"
print(f"\nWaiting for router to start at {router_url}...")
try:
wait_for_workers_ready(
            router_url, expected_workers=num_workers, timeout=timeout, api_key=api_key
)
print(f"✓ Router ready at {router_url}")
except TimeoutError:
print(f"✗ Router failed to start")
# Cleanup: kill router and all workers
try:
router_proc.kill()
except:
pass
for worker in workers:
try:
worker.kill()
except:
pass
raise
print(f"\n{'='*70}")
print(f"✓ gRPC cluster ready!")
print(f" Router: {router_url}")
print(f" Workers: {len(workers)}")
print(f"{'='*70}\n")
return {
"workers": workers,
"worker_urls": worker_urls,
"router": router_proc,
"base_url": router_url,
}
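Since this is a conftest-backed pytest suite, the launcher above can also be wrapped in a module-scoped fixture so several tests share one cluster. A minimal sketch under that assumption; the fixture name is illustrative and not part of this commit:

import pytest

from fixtures import popen_launch_workers_and_router
from util import DEFAULT_MODEL_PATH, DEFAULT_URL_FOR_TEST, kill_process_tree


@pytest.fixture(scope="module")
def grpc_cluster():
    # Launch one gRPC worker plus the router, hand the handles to the tests,
    # then tear everything down even if a test fails.
    cluster = popen_launch_workers_and_router(
        DEFAULT_MODEL_PATH,
        DEFAULT_URL_FOR_TEST,
        num_workers=1,
    )
    try:
        yield cluster
    finally:
        kill_process_tree(cluster["router"].pid)
        for worker in cluster["workers"]:
            kill_process_tree(worker.pid)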
[pytest]
# Show print statements and logs
log_cli = true
log_cli_level = INFO
log_cli_format = %(asctime)s [%(levelname)8s] %(message)s
log_cli_date_format = %Y-%m-%d %H:%M:%S
# Show stdout/stderr
addopts = -v -s --tb=short
# Capture settings
# -s means don't capture stdout (show print statements)
# --tb=short means short traceback format
"""
Standalone utilities for e2e_grpc tests.
This module provides all necessary utilities without depending on sglang Python package.
Extracted and adapted from:
- sglang.srt.utils.kill_process_tree
- sglang.srt.utils.hf_transformers_utils.get_tokenizer
- sglang.test.test_utils (constants and CustomTestCase)
"""
import os
import signal
import threading
import unittest
from pathlib import Path
from typing import Optional, Union
import psutil
try:
from transformers import (
AutoTokenizer,
PreTrainedTokenizer,
PreTrainedTokenizerBase,
PreTrainedTokenizerFast,
)
except ImportError:
raise ImportError(
"transformers is required for tokenizer utilities. "
"Install with: pip install transformers"
)
# ============================================================================
# Constants
# ============================================================================
# Server and timeout constants
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
DEFAULT_PORT_FOR_SRT_TEST_RUNNER = 20000
DEFAULT_URL_FOR_TEST = f"http://127.0.0.1:{DEFAULT_PORT_FOR_SRT_TEST_RUNNER + 1000}"
# File name constants for test output
STDOUT_FILENAME = "/tmp/sglang_test_stdout.txt"
STDERR_FILENAME = "/tmp/sglang_test_stderr.txt"
# Model base path - can be overridden via environment variable
# By default, use HuggingFace model identifiers (no local path prefix)
# Set ROUTER_LOCAL_MODEL_PATH to use local models (e.g., "/home/ubuntu/models")
ROUTER_LOCAL_MODEL_PATH = os.environ.get("ROUTER_LOCAL_MODEL_PATH", "")
# Helper function to build model paths
def _get_model_path(model_identifier: str) -> str:
"""
Build model path from base path and model identifier.
If ROUTER_LOCAL_MODEL_PATH is set, prepend it to the identifier.
Otherwise, return the identifier as-is (for HuggingFace download).
"""
if ROUTER_LOCAL_MODEL_PATH:
return os.path.join(ROUTER_LOCAL_MODEL_PATH, model_identifier)
return model_identifier
# Model paths used in e2e_grpc tests
# These can be either HuggingFace identifiers or local paths (depending on ROUTER_LOCAL_MODEL_PATH)
# Main test model - Llama 3.1 8B Instruct
DEFAULT_MODEL_PATH = _get_model_path("meta-llama/Llama-3.1-8B-Instruct")
# Small models for function calling tests
DEFAULT_SMALL_MODEL_PATH = _get_model_path("meta-llama/Llama-3.2-1B-Instruct")
# Reasoning models
DEFAULT_REASONING_MODEL_PATH = _get_model_path(
"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
)
# Thinking-enabled models
DEFAULT_ENABLE_THINKING_MODEL_PATH = _get_model_path("Qwen/Qwen3-30B-A3B")
# Function calling models
DEFAULT_QWEN_FUNCTION_CALLING_MODEL_PATH = _get_model_path("Qwen/Qwen2.5-7B-Instruct")
DEFAULT_MISTRAL_FUNCTION_CALLING_MODEL_PATH = _get_model_path(
"mistralai/Mistral-7B-Instruct-v0.3"
)
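# Illustration of the resolution above, assuming the CI settings shown in the
# workflow diff (ROUTER_LOCAL_MODEL_PATH="/home/ubuntu/models"):
#   ROUTER_LOCAL_MODEL_PATH unset:
#       _get_model_path("meta-llama/Llama-3.1-8B-Instruct")
#       -> "meta-llama/Llama-3.1-8B-Instruct"          (resolved via HuggingFace)
#   ROUTER_LOCAL_MODEL_PATH="/home/ubuntu/models":
#       _get_model_path("meta-llama/Llama-3.1-8B-Instruct")
#       -> "/home/ubuntu/models/meta-llama/Llama-3.1-8B-Instruct"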
# ============================================================================
# Process Management
# ============================================================================
def kill_process_tree(parent_pid, include_parent: bool = True, skip_pid: int = None):
"""
Kill the process and all its child processes.
Args:
parent_pid: PID of the parent process
include_parent: Whether to kill the parent process itself
skip_pid: Optional PID to skip during cleanup
"""
# Remove sigchld handler to avoid spammy logs
if threading.current_thread() is threading.main_thread():
signal.signal(signal.SIGCHLD, signal.SIG_DFL)
if parent_pid is None:
parent_pid = os.getpid()
include_parent = False
try:
itself = psutil.Process(parent_pid)
except psutil.NoSuchProcess:
return
children = itself.children(recursive=True)
for child in children:
if child.pid == skip_pid:
continue
try:
child.kill()
except psutil.NoSuchProcess:
pass
if include_parent:
try:
itself.kill()
except psutil.NoSuchProcess:
pass
# ============================================================================
# Tokenizer Utilities
# ============================================================================
def check_gguf_file(model_path: str) -> bool:
"""Check if the model path points to a GGUF file."""
if not isinstance(model_path, str):
return False
return model_path.endswith(".gguf")
def is_remote_url(path: str) -> bool:
"""Check if the path is a remote URL."""
if not isinstance(path, str):
return False
return path.startswith("http://") or path.startswith("https://")
def get_tokenizer(
tokenizer_name: str,
*args,
tokenizer_mode: str = "auto",
trust_remote_code: bool = False,
tokenizer_revision: Optional[str] = None,
**kwargs,
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
"""
Gets a tokenizer for the given model name via Huggingface.
Args:
tokenizer_name: Name or path of the tokenizer
tokenizer_mode: Mode for tokenizer loading ("auto", "slow")
trust_remote_code: Whether to trust remote code
tokenizer_revision: Specific revision to use
**kwargs: Additional arguments passed to AutoTokenizer.from_pretrained
Returns:
Loaded tokenizer instance
"""
if tokenizer_mode == "slow":
if kwargs.get("use_fast", False):
raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
kwargs["use_fast"] = False
# Handle special model name mapping
if tokenizer_name == "mistralai/Devstral-Small-2505":
tokenizer_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
is_gguf = check_gguf_file(tokenizer_name)
if is_gguf:
kwargs["gguf_file"] = tokenizer_name
tokenizer_name = Path(tokenizer_name).parent
# Note: Removed remote URL handling and local directory download
# as they depend on sglang-specific utilities
try:
tokenizer = AutoTokenizer.from_pretrained(
tokenizer_name,
*args,
trust_remote_code=trust_remote_code,
tokenizer_revision=tokenizer_revision,
**kwargs,
)
except TypeError as e:
# Handle specific errors
err_msg = (
"Failed to load the tokenizer. If you are running a model with "
"a custom tokenizer, please set the --trust-remote-code flag."
)
raise RuntimeError(err_msg) from e
if not isinstance(tokenizer, PreTrainedTokenizerFast):
print(
f"Warning: Using a slow tokenizer. This might cause a performance "
f"degradation. Consider using a fast tokenizer instead."
)
return tokenizer
def get_tokenizer_from_processor(processor):
"""Extract tokenizer from a processor object."""
if isinstance(processor, PreTrainedTokenizerBase):
return processor
return processor.tokenizer
# ============================================================================
# Test Utilities
# ============================================================================
class CustomTestCase(unittest.TestCase):
"""
Custom test case base class with retry support.
This provides automatic test retry functionality based on environment variables.
"""
def _callTestMethod(self, method):
"""Override to add retry logic."""
max_retry = int(os.environ.get("SGLANG_TEST_MAX_RETRY", "0"))
if max_retry == 0:
# No retry, just run once
return super(CustomTestCase, self)._callTestMethod(method)
# Retry logic
for attempt in range(max_retry + 1):
try:
return super(CustomTestCase, self)._callTestMethod(method)
except Exception as e:
if attempt < max_retry:
print(
f"Test failed on attempt {attempt + 1}/{max_retry + 1}, retrying..."
)
continue
else:
# Last attempt, re-raise the exception
raise
def setUp(self):
"""Print test method name at the start of each test."""
print(f"[Test Method] {self._testMethodName}", flush=True)
"""
python3 -m unittest openai_server.validation.test_large_max_new_tokens.TestLargeMaxNewTokens.test_chat_completion
"""
import os
# CHANGE: Import router launcher instead of server launcher
import sys
import time
import unittest
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
import openai
_TEST_DIR = Path(__file__).parent
sys.path.insert(0, str(_TEST_DIR.parent))
from fixtures import popen_launch_workers_and_router
from util import (
DEFAULT_MODEL_PATH,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
STDERR_FILENAME,
STDOUT_FILENAME,
CustomTestCase,
get_tokenizer,
kill_process_tree,
)
class TestLargeMaxNewTokens(CustomTestCase):
@classmethod
def setUpClass(cls):
cls.model = DEFAULT_MODEL_PATH
cls.base_url = DEFAULT_URL_FOR_TEST
cls.api_key = "sk-123456"
cls.stdout = open(STDOUT_FILENAME, "w")
cls.stderr = open(STDERR_FILENAME, "w")
cls.cluster = popen_launch_workers_and_router(
cls.model,
cls.base_url,
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
api_key=cls.api_key,
            worker_args=(
                "--max-total-tokens",
                "1536",
                "--context-length",
                "8192",
                "--decode-log-interval",
                "2",
            ),
num_workers=1,
tp_size=2,
env={"SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION": "256", **os.environ},
stdout=cls.stdout,
stderr=cls.stderr,
)
cls.base_url += "/v1"
cls.tokenizer = get_tokenizer(cls.model)
@classmethod
def tearDownClass(cls):
# Cleanup router and workers
kill_process_tree(cls.cluster["router"].pid)
for worker in cls.cluster.get("workers", []):
kill_process_tree(worker.pid)
cls.stdout.close()
cls.stderr.close()
os.remove(STDOUT_FILENAME)
os.remove(STDERR_FILENAME)
def run_chat_completion(self):
client = openai.Client(api_key=self.api_key, base_url=self.base_url)
response = client.chat.completions.create(
model=self.model,
messages=[
{"role": "system", "content": "You are a helpful AI assistant"},
{
"role": "user",
"content": "Please repeat the world 'hello' for 10000 times.",
},
],
temperature=0,
)
return response
def test_chat_completion(self):
num_requests = 4
all_requests_running = False
futures = []
with ThreadPoolExecutor(num_requests) as executor:
# Send multiple requests
for i in range(num_requests):
futures.append(executor.submit(self.run_chat_completion))
# Ensure that they are running concurrently
pt = 0
while pt >= 0:
time.sleep(5)
# Flush stderr to ensure logs are written
self.stderr.flush()
lines = open(STDERR_FILENAME).readlines()
for line in lines[pt:]:
print(line, end="", flush=True)
if f"#running-req: {num_requests}" in line:
all_requests_running = True
pt = -1
break
pt += 1
assert all_requests_running
if __name__ == "__main__":
unittest.main()
"""
gRPC Router E2E Test - OpenAI Server ignore_eos
This test file is REUSED from test/srt/openai_server/validation/test_openai_server_ignore_eos.py
with minimal changes:
- Swap popen_launch_server() → popen_launch_workers_and_router()
- Update teardown to cleanup router + workers
- All test logic and assertions remain identical
Run with:
pytest py_test/e2e_grpc/validation/test_openai_server_ignore_eos.py -v
"""
# CHANGE: Import router launcher instead of server launcher
import sys
from pathlib import Path
import openai
_TEST_DIR = Path(__file__).parent
sys.path.insert(0, str(_TEST_DIR.parent))
from fixtures import popen_launch_workers_and_router
from util import (
DEFAULT_MODEL_PATH,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
get_tokenizer,
kill_process_tree,
)
class TestOpenAIServerIgnoreEOS(CustomTestCase):
@classmethod
def setUpClass(cls):
# CHANGE: Launch gRPC router with integrated workers (single command)
cls.model = DEFAULT_MODEL_PATH
cls.base_url = DEFAULT_URL_FOR_TEST
cls.api_key = "sk-123456"
cls.cluster = popen_launch_workers_and_router(
cls.model,
cls.base_url,
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
api_key=cls.api_key,
num_workers=1,
tp_size=2,
)
cls.base_url += "/v1"
cls.tokenizer = get_tokenizer(cls.model)
@classmethod
def tearDownClass(cls):
# Cleanup router and workers
kill_process_tree(cls.cluster["router"].pid)
for worker in cls.cluster.get("workers", []):
kill_process_tree(worker.pid)
def test_ignore_eos(self):
"""
Test that ignore_eos=True allows generation to continue beyond EOS token
and reach the max_tokens limit.
"""
client = openai.Client(api_key=self.api_key, base_url=self.base_url)
max_tokens = 200
response_default = client.chat.completions.create(
model=self.model,
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Count from 1 to 20."},
],
temperature=0,
max_tokens=max_tokens,
extra_body={"ignore_eos": False},
)
response_ignore_eos = client.chat.completions.create(
model=self.model,
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Count from 1 to 20."},
],
temperature=0,
max_tokens=max_tokens,
extra_body={"ignore_eos": True},
)
default_tokens = len(
self.tokenizer.encode(response_default.choices[0].message.content)
)
ignore_eos_tokens = len(
self.tokenizer.encode(response_ignore_eos.choices[0].message.content)
)
# Check if ignore_eos resulted in more tokens or exactly max_tokens
# The ignore_eos response should either:
# 1. Have more tokens than the default response (if default stopped at EOS before max_tokens)
# 2. Have exactly max_tokens (if it reached the max_tokens limit)
self.assertTrue(
ignore_eos_tokens > default_tokens or ignore_eos_tokens >= max_tokens,
f"ignore_eos did not generate more tokens: {ignore_eos_tokens} vs {default_tokens}",
)
self.assertEqual(
response_ignore_eos.choices[0].finish_reason,
"length",
f"Expected finish_reason='length' for ignore_eos=True, got {response_ignore_eos.choices[0].finish_reason}",
)
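The pair of assertions above encodes a simple rule about the two completions; restated as a standalone predicate with hypothetical token counts (the numbers are invented for illustration):

def ignore_eos_check(default_tokens: int, ignore_eos_tokens: int, max_tokens: int) -> bool:
    # ignore_eos must either out-generate the default run (which stopped at EOS)
    # or run all the way to the max_tokens cap.
    return ignore_eos_tokens > default_tokens or ignore_eos_tokens >= max_tokens


assert ignore_eos_check(default_tokens=60, ignore_eos_tokens=200, max_tokens=200)
assert not ignore_eos_check(default_tokens=60, ignore_eos_tokens=55, max_tokens=200)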
@@ -625,8 +625,6 @@ pub struct ChatCompletionMessage {
     pub content: Option<String>,
     #[serde(skip_serializing_if = "Option::is_none")]
     pub tool_calls: Option<Vec<ToolCall>>,
-    /// Reasoning content for O1-style models (SGLang extension)
-    #[serde(skip_serializing_if = "Option::is_none")]
     pub reasoning_content: Option<String>,
     // Note: function_call is deprecated and not included
     // Note: refusal, annotations, audio are not added yet
@@ -669,8 +667,6 @@ pub struct ChatMessageDelta {
     pub content: Option<String>,
     #[serde(skip_serializing_if = "Option::is_none")]
     pub tool_calls: Option<Vec<ToolCallDelta>>,
-    /// Reasoning content delta for O1-style models (SGLang extension)
-    #[serde(skip_serializing_if = "Option::is_none")]
     pub reasoning_content: Option<String>,
 }
...