# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 import logging import os from dataclasses import dataclass from typing import Any, List import pytest import requests from tests.utils.managed_process import ManagedProcess logger = logging.getLogger(__name__) @dataclass class SGLangConfig: """Configuration for SGLang test scenarios""" script_name: str marks: List[Any] name: str class SGLangProcess(ManagedProcess): """Simple process manager for sglang shell scripts""" def __init__(self, script_name, request): self.port = 8000 sglang_dir = "/workspace/examples/sglang" script_path = os.path.join(sglang_dir, "launch", script_name) # Verify script exists if not os.path.exists(script_path): raise FileNotFoundError(f"SGLang script not found: {script_path}") # Make script executable and run it command = ["bash", script_path] super().__init__( command=command, timeout=900, display_output=True, working_dir=sglang_dir, health_check_ports=[], # Disable port health check health_check_urls=[ (f"http://localhost:{self.port}/v1/models", self._check_models_api) ], delayed_start=60, # Give SGLang more time to fully start terminate_existing=False, stragglers=[], # Don't kill any stragglers automatically log_dir=request.node.name, ) def _check_models_api(self, response): """Check if models API is working and returns models""" try: if response.status_code != 200: return False data = response.json() return data.get("data") and len(data["data"]) > 0 except Exception: return False # SGLang test configurations sglang_configs = { "aggregated": SGLangConfig( script_name="agg.sh", marks=[pytest.mark.gpu_1], name="aggregated" ), "disaggregated": SGLangConfig( script_name="disagg.sh", marks=[pytest.mark.gpu_2], name="disaggregated" ), } @pytest.fixture( params=[ pytest.param("aggregated", marks=[pytest.mark.gpu_1]), pytest.param("disaggregated", marks=[pytest.mark.gpu_2]), ] ) def sglang_config_test(request): """Fixture that provides different SGLang test configurations""" return sglang_configs[request.param] @pytest.mark.e2e @pytest.mark.slow @pytest.mark.sglang def test_sglang_deployment(request, runtime_services, sglang_config_test): """Test SGLang deployment scenarios""" # First check if sglang is available try: import sglang logger.info(f"SGLang version: {sglang.__version__}") except ImportError: pytest.skip("SGLang not available") config = sglang_config_test with SGLangProcess(config.script_name, request) as server: # Test chat completions response = requests.post( f"http://localhost:{server.port}/v1/chat/completions", json={ "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", "messages": [ { "role": "user", "content": "Why is Roger Federer the best tennis player of all time?", } ], "max_tokens": 50, }, timeout=120, ) assert response.status_code == 200 result = response.json() assert "choices" in result assert len(result["choices"]) > 0 content = result["choices"][0]["message"]["content"] assert len(content) > 0 logger.info(f"SGLang {config.name} response: {content}") # Test completions endpoint for disaggregated only if config.name == "disaggregated": response = requests.post( f"http://localhost:{server.port}/v1/completions", json={ "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", "prompt": "Roger Federer is the greatest tennis player of all time", "max_tokens": 30, }, timeout=120, ) assert response.status_code == 200 result = response.json() assert "choices" in result assert len(result["choices"]) > 0 text = result["choices"][0]["text"] assert len(text) > 0 logger.info(f"SGLang completions response: {text}") @pytest.mark.skip( reason="Requires 4 GPUs - enable when hardware is consistently available" ) def test_sglang_disagg_dp_attention(request, runtime_services): """Test sglang disaggregated with DP attention (requires 4 GPUs)""" with SGLangProcess("disagg_dp_attn.sh", request) as server: # Test chat completions with the DP attention model response = requests.post( f"http://localhost:{server.port}/v1/chat/completions", json={ "model": "silence09/DeepSeek-R1-Small-2layers", # DP attention model "messages": [{"role": "user", "content": "Tell me about MoE models"}], "max_tokens": 50, }, timeout=120, ) assert response.status_code == 200 result = response.json() assert "choices" in result assert len(result["choices"]) > 0 content = result["choices"][0]["message"]["content"] assert len(content) > 0 logger.info(f"SGLang DP attention response: {content}")