ci: add sglang gpu based CI tests (#1744)

ffccc722 · ishandhanani · GitHub · ebd23361 · ffccc722
Unverified Commit ffccc722 authored Jul 07, 2025 by ishandhanani Committed by GitHub Jul 07, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 175 additions and 0 deletions

tests/serve/test_sglang.py tests/serve/test_sglang.py +175 -0

No files found.
--- a/tests/serve/test_sglang.py
+++ b/tests/serve/test_sglang.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+import logging
+import os
+from dataclasses import dataclass
+from typing import Any, List
+import pytest
+import requests
+from tests.utils.managed_process import ManagedProcess
+logger = logging.getLogger(__name__)
+@dataclass
+class SGLangConfig:
+    """Configuration for SGLang test scenarios"""
+    script_name: str
+    marks: List[Any]
+    name: str
+class SGLangProcess(ManagedProcess):
+    """Simple process manager for sglang shell scripts"""
+    def __init__(self, script_name, request):
+        self.port = 8000
+        sglang_dir = "/workspace/examples/sglang"
+        script_path = os.path.join(sglang_dir, "launch", script_name)
+        # Verify script exists
+        if not os.path.exists(script_path):
+            raise FileNotFoundError(f"SGLang script not found: {script_path}")
+        # Make script executable and run it
+        command = ["bash", script_path]
+        super().__init__(
+            command=command,
+            timeout=900,
+            display_output=True,
+            working_dir=sglang_dir,
+            health_check_ports=[],  # Disable port health check
+            health_check_urls=[
+                (f"http://localhost:{self.port}/v1/models", self._check_models_api)
+            ],
+            delayed_start=60,  # Give SGLang more time to fully start
+            terminate_existing=False,
+            stragglers=[],  # Don't kill any stragglers automatically
+            log_dir=request.node.name,
+        )
+    def _check_models_api(self, response):
+        """Check if models API is working and returns models"""
+        try:
+            if response.status_code != 200:
+                return False
+            data = response.json()
+            return data.get("data") and len(data["data"]) > 0
+        except Exception:
+            return False
+# SGLang test configurations
+sglang_configs = {
+    "aggregated": SGLangConfig(
+        script_name="agg.sh", marks=[pytest.mark.gpu_1], name="aggregated"
+    ),
+    "disaggregated": SGLangConfig(
+        script_name="disagg.sh", marks=[pytest.mark.gpu_2], name="disaggregated"
+    ),
+}
+@pytest.fixture(
+    params=[
+        pytest.param("aggregated", marks=[pytest.mark.gpu_1]),
+        pytest.param("disaggregated", marks=[pytest.mark.gpu_2]),
+    ]
+)
+def sglang_config_test(request):
+    """Fixture that provides different SGLang test configurations"""
+    return sglang_configs[request.param]
+@pytest.mark.e2e
+@pytest.mark.slow
+@pytest.mark.sglang
+def test_sglang_deployment(request, runtime_services, sglang_config_test):
+    """Test SGLang deployment scenarios"""
+    # First check if sglang is available
+    try:
+        import sglang
+        logger.info(f"SGLang version: {sglang.__version__}")
+    except ImportError:
+        pytest.skip("SGLang not available")
+    config = sglang_config_test
+    with SGLangProcess(config.script_name, request) as server:
+        # Test chat completions
+        response = requests.post(
+            f"http://localhost:{server.port}/v1/chat/completions",
+            json={
+                "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+                "messages": [
+                    {
+                        "role": "user",
+                        "content": "Why is Roger Federer the best tennis player of all time?",
+                    }
+                ],
+                "max_tokens": 50,
+            },
+            timeout=120,
+        )
+        assert response.status_code == 200
+        result = response.json()
+        assert "choices" in result
+        assert len(result["choices"]) > 0
+        content = result["choices"][0]["message"]["content"]
+        assert len(content) > 0
+        logger.info(f"SGLang {config.name} response: {content}")
+        # Test completions endpoint for disaggregated only
+        if config.name == "disaggregated":
+            response = requests.post(
+                f"http://localhost:{server.port}/v1/completions",
+                json={
+                    "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+                    "prompt": "Roger Federer is the greatest tennis player of all time",
+                    "max_tokens": 30,
+                },
+                timeout=120,
+            )
+            assert response.status_code == 200
+            result = response.json()
+            assert "choices" in result
+            assert len(result["choices"]) > 0
+            text = result["choices"][0]["text"]
+            assert len(text) > 0
+            logger.info(f"SGLang completions response: {text}")
+@pytest.mark.skip(
+    reason="Requires 4 GPUs - enable when hardware is consistently available"
+)
+def test_sglang_disagg_dp_attention(request, runtime_services):
+    """Test sglang disaggregated with DP attention (requires 4 GPUs)"""
+    with SGLangProcess("disagg_dp_attn.sh", request) as server:
+        # Test chat completions with the DP attention model
+        response = requests.post(
+            f"http://localhost:{server.port}/v1/chat/completions",
+            json={
+                "model": "silence09/DeepSeek-R1-Small-2layers",  # DP attention model
+                "messages": [{"role": "user", "content": "Tell me about MoE models"}],
+                "max_tokens": 50,
+            },
+            timeout=120,
+        )
+        assert response.status_code == 200
+        result = response.json()
+        assert "choices" in result
+        assert len(result["choices"]) > 0
+        content = result["choices"][0]["message"]["content"]
+        assert len(content) > 0
+        logger.info(f"SGLang DP attention response: {content}")