test: add kv routing test for sglang (#2424)

Signed-off-by: ishandhanani <82981111+ishandhanani@users.noreply.github.com> Co-authored-by: Neelay Shah <neelays@nvidia.com>

test: add kv routing test for sglang (#2424)
Signed-off-by: ishandhanani <82981111+ishandhanani@users.noreply.github.com> Co-authored-by: Neelay Shah <neelays@nvidia.com>
c0ed76da · ishandhanani · GitHub · c3ecaf6c · c0ed76da · c0ed76da
Unverified Commit c0ed76da authored Aug 13, 2025 by ishandhanani Committed by GitHub Aug 13, 2025
5 changed files
--- a/components/backends/sglang/README.md
+++ b/components/backends/sglang/README.md
@@ -141,6 +141,9 @@ cd $DYNAMO_HOME/components/backends/sglang

 ### Aggregated Serving with KV Routing

+> [!NOTE]
+> Until SGLang publishes a release newer than v0.5.0rc0, you must install SGLang from source to use KV routing. To do so, run `git clone https://github.com/sgl-project/sglang.git && cd sglang && uv pip install -e "python[all]"`. We will update this section once a newer SGLang release is available.
+
 ```bash
 cd $DYNAMO_HOME/components/backends/sglang
 ./launch/agg_router.sh

--- a/components/backends/sglang/launch/agg_router.sh
+++ b/components/backends/sglang/launch/agg_router.sh
@@ -5,8 +5,8 @@
 # Setup cleanup trap
 cleanup() {
    echo "Cleaning up background processes..."
-    kill $DYNAMO_PID 2>/dev/null || true
-    wait $DYNAMO_PID 2>/dev/null || true
+    kill $DYNAMO_PID $WORKER_PID 2>/dev/null || true
+    wait $DYNAMO_PID $WORKER_PID 2>/dev/null || true
    echo "Cleanup complete."
 }
 trap cleanup EXIT INT TERM
@@ -26,4 +26,14 @@ python3 -m dynamo.sglang.worker \
  --tp 1 \
  --trust-remote-code \
  --skip-tokenizer-init \
-  --kv-events-config '{"publisher": "zmq", "topic": "kv-events"}'
+  --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5557"}' &
+WORKER_PID=$!
+
+CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang.worker \
+  --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
+  --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
+  --page-size 16 \
+  --tp 1 \
+  --trust-remote-code \
+  --skip-tokenizer-init \
+  --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5558"}'
\ No newline at end of file
--- a/components/backends/sglang/src/dynamo/sglang/worker/main.py
+++ b/components/backends/sglang/src/dynamo/sglang/worker/main.py
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0

 import asyncio
+import json
 import logging
 import random
 import signal
@@ -357,10 +358,17 @@ async def init(
    handler.setup_metrics()

    # Set up ZMQ kv event publisher
+    if server_args.kv_events_config:
+        kv_events = json.loads(server_args.kv_events_config)
+        ep = kv_events.get("endpoint")
+        zmq_ep = ep.replace("*", get_ip()) if ep else None
+
        zmq_config = ZmqKvEventPublisherConfig(
            worker_id=endpoint.lease_id(),
            kv_block_size=server_args.page_size,
+            zmq_endpoint=zmq_ep,
        )
+        logging.info(f"Setting up ZMQ kv event publisher at {zmq_ep}")
        _ = ZmqKvEventPublisher(component=component, config=zmq_config)

    tasks = [endpoint.serve_endpoint(handler.generate)]

--- a/tests/README.md
+++ b/tests/README.md
@@ -95,3 +95,11 @@ via ```./container/build.sh --framework X``` and run via
 The tests will automatically use a local cache at `~/.cache/huggingface` to avoid
 repeated downloads of model files. This cache is shared across test runs to improve performance.

+## Running tests locally outside of a container
+
+To run tests outside of the development container, ensure that you have properly set up your environment and have installed the following dependencies in your `venv`:
+
+```bash
+uv pip install pytest-mypy
+uv pip install pytest-asyncio
+```
\ No newline at end of file
--- a/tests/serve/test_sglang.py
+++ b/tests/serve/test_sglang.py
@@ -3,6 +3,7 @@

 import logging
 import os
+import re
 from dataclasses import dataclass
 from typing import Any, List

@@ -14,6 +15,31 @@ from tests.utils.managed_process import ManagedProcess
 logger = logging.getLogger(__name__)


+def validate_log_patterns(log_file, patterns):
+    """Return True if every regex in `patterns` matches somewhere in `log_file`; raise AssertionError if the file is missing or any pattern has no match."""
+    if not os.path.exists(log_file):
+        raise AssertionError(f"Log file not found: {log_file}")
+
+    with open(log_file, "r", encoding="utf-8", errors="ignore") as f:
+        content = f.read()
+
+    compiled = [re.compile(p) for p in patterns]
+    missing = []
+
+    for pattern, rx in zip(patterns, compiled):
+        if not rx.search(content):
+            missing.append(pattern)
+
+    if missing:
+        # Attach the last 1000 characters of the log (or all of it, if shorter) to aid debugging
+        sample = content[-1000:] if len(content) > 1000 else content
+        raise AssertionError(
+            f"Missing expected log patterns: {missing}\n\nLog sample:\n{sample}"
+        )
+
+    return True
+
+
 @dataclass
 class SGLangConfig:
    """Configuration for SGLang test scenarios"""
@@ -28,7 +54,9 @@ class SGLangProcess(ManagedProcess):

    def __init__(self, script_name, request):
        self.port = 8000
-        sglang_dir = "/workspace/components/backends/sglang"
+        sglang_dir = os.environ.get(
+            "SGLANG_DIR", "/workspace/components/backends/sglang"
+        )
        script_path = os.path.join(sglang_dir, "launch", script_name)

        # Verify script exists
@@ -38,8 +66,17 @@ class SGLangProcess(ManagedProcess):
        # Make script executable and run it
        command = ["bash", script_path]

+        # Focus kv-router logs for kv_events run
+        env = os.environ.copy()
+        if script_name == "agg_router.sh":
+            env.setdefault(
+                "DYN_LOG",
+                "dynamo_llm::kv_router::publisher=trace,dynamo_llm::kv_router::scheduler=info",
+            )
+
        super().__init__(
            command=command,
+            env=env,
            timeout=900,
            display_output=True,
            working_dir=sglang_dir,
@@ -50,7 +87,6 @@ class SGLangProcess(ManagedProcess):
            delayed_start=60,  # Give SGLang more time to fully start
            terminate_existing=False,
            stragglers=[],  # Don't kill any stragglers automatically
-            log_dir=request.node.name,
        )

    def _check_models_api(self, response):
@@ -72,6 +108,9 @@ sglang_configs = {
    "disaggregated": SGLangConfig(
        script_name="disagg.sh", marks=[pytest.mark.gpu_2], name="disaggregated"
    ),
+    "kv_events": SGLangConfig(
+        script_name="agg_router.sh", marks=[pytest.mark.gpu_2], name="kv_events"
+    ),
 }


@@ -79,6 +118,7 @@ sglang_configs = {
    params=[
        pytest.param("aggregated", marks=[pytest.mark.gpu_1]),
        pytest.param("disaggregated", marks=[pytest.mark.gpu_2]),
+        pytest.param("kv_events", marks=[pytest.mark.gpu_2]),
    ]
 )
 def sglang_config_test(request):
@@ -104,6 +144,15 @@ def test_sglang_deployment(request, runtime_services, sglang_config_test):

    with SGLangProcess(config.script_name, request) as server:
        # Test chat completions
+        prompts = [
+            "why is roger federer the best tennis player of all time?",
+            "why is novak djokovic not the best tennis player of all time?",
+            "why is rafa nadal a sneaky good grass court player?",
+            "explain the difference between federer and nadal's backhand.",
+            "who is the most clutch tennis player in history?",
+        ]
+        responses = []
+        for prompt in prompts:
            response = requests.post(
                f"http://localhost:{server.port}/v1/chat/completions",
                json={
@@ -111,22 +160,35 @@ def test_sglang_deployment(request, runtime_services, sglang_config_test):
                    "messages": [
                        {
                            "role": "user",
-                        "content": "Why is Roger Federer the best tennis player of all time?",
+                            "content": prompt,
                        }
                    ],
                    "max_tokens": 50,
                },
                timeout=120,
            )
-
            assert response.status_code == 200
            result = response.json()
            assert "choices" in result
            assert len(result["choices"]) > 0
            content = result["choices"][0]["message"]["content"]
            assert len(content) > 0
+            responses.append(content)
            logger.info(f"SGLang {config.name} response: {content}")

+        # For kv_events (KV routing path), assert KV publisher/scheduler log lines appear
+        if config.name == "kv_events":
+            log_file = os.path.join(server.log_dir, "bash.log.txt")
+            assert os.path.exists(log_file), f"Log file not found: {log_file}"
+
+            patterns = [
+                r"ZMQ listener .* received batch with \d+ events \(seq=\d+\)",
+                r"Event processor for worker_id \d+ processing event: Stored\(",
+                r"Selected worker: \d+, logit: ",
+            ]
+
+            validate_log_patterns(log_file, patterns)
+
        # Test completions endpoint for disaggregated only
        if config.name == "disaggregated":
            response = requests.post(