# test_sglang_profile.py

# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Test script for /engine/start_profile and /engine/stop_profile routes.

This script demonstrates the new custom engine route registration feature.
It starts a simple sglang server with dynamo and tests the profiling endpoints.

Usage:
    python test_sglang_profile.py
"""

import os
import signal
import subprocess
import sys
import time
from pathlib import Path

import requests

# Configuration shared by the frontend, the backend, and the test requests.
MODEL = "Qwen/Qwen3-0.6B"  # Small model for quick testing
HOST = "127.0.0.1"  # Address used to reach both the frontend and the backend system server
PORT = 30000  # Frontend HTTP port (passed to dynamo.frontend via --http-port)
SYSTEM_PORT = 9090  # Backend system server port (exported to the backend as DYN_SYSTEM_PORT)
# Directory for profiler output; wiped and recreated at the start of each run.
PROFILER_OUTPUT_DIR = "/tmp/dynamo_profiler_test"


def cleanup_output_dir():
    """Reset the profiler output directory to an empty state.

    Any existing tree at ``PROFILER_OUTPUT_DIR`` is removed, then the
    directory (including missing parents) is created again.
    """
    from shutil import rmtree

    out_dir = Path(PROFILER_OUTPUT_DIR)
    if out_dir.exists():
        # Drop leftovers from a previous run so the file listing at the end
        # of the test only reflects this run.
        rmtree(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)


def start_frontend():
    """Start the Dynamo frontend (HTTP server) and wait until it is healthy.

    Launches ``python -m dynamo.frontend --http-port PORT`` as a child process
    and polls ``http://HOST:PORT/health`` once per second for up to 30 seconds.

    Returns:
        subprocess.Popen: Handle of the running, healthy frontend process.

    Raises:
        SystemExit: With exit code 1 if the process exits before it becomes
            healthy, or if no HTTP 200 is received within the time limit.
    """
    print("\nStarting Dynamo frontend...")
    print(f"  - Frontend HTTP: http://{HOST}:{PORT}")

    cmd = [
        "python",
        "-m",
        "dynamo.frontend",
        "--http-port",
        str(PORT),
    ]

    print(f"Command: {' '.join(cmd)}")
    print("(Output will appear below)\n")

    process = subprocess.Popen(cmd)

    max_wait = 30  # seconds
    health_url = f"http://{HOST}:{PORT}/health"

    try:
        # Monotonic clock: the deadline must not move if the wall clock is adjusted.
        deadline = time.monotonic() + max_wait
        while time.monotonic() < deadline:
            try:
                response = requests.get(health_url, timeout=1)
                if response.status_code == 200:
                    print("✓ Frontend is ready!")
                    return process
            except requests.exceptions.RequestException:
                # Server not accepting connections yet; keep polling.
                pass

            if process.poll() is not None:
                print("✗ Frontend process died!")
                sys.exit(1)

            time.sleep(1)

        print("✗ Frontend failed to start in time!")
        sys.exit(1)
    except BaseException:
        # The caller only receives the handle on success, so on any failure
        # (timeout, early exit, Ctrl+C, unexpected error) the child has to be
        # killed here or it would be orphaned.
        if process.poll() is None:
            process.kill()
        raise


def start_sglang_backend():
    """Start the sglang backend (inference engine) and wait until it is healthy.

    Launches ``python -m dynamo.sglang`` for ``MODEL`` as a child process with
    ``SGLANG_TORCH_PROFILER_DIR`` and ``DYN_SYSTEM_PORT`` added to a copy of the
    current environment, then polls ``http://HOST:SYSTEM_PORT/health`` every
    two seconds for up to two minutes.

    Returns:
        subprocess.Popen: Handle of the running, healthy backend process.

    Raises:
        SystemExit: With exit code 1 if the process exits before it becomes
            healthy, or if no HTTP 200 is received within the time limit.
    """
    print("\nStarting SGLang backend...")
    print(f"  - Model: {MODEL}")
    print(f"  - System server: http://{HOST}:{SYSTEM_PORT}")

    # Pass settings through the environment without touching our own os.environ.
    env = os.environ.copy()
    env["SGLANG_TORCH_PROFILER_DIR"] = PROFILER_OUTPUT_DIR
    # NOTE(review): presumably selects the port of the backend's system
    # server, which is the port polled below - confirm against dynamo docs.
    env["DYN_SYSTEM_PORT"] = str(SYSTEM_PORT)

    cmd = [
        "python",
        "-m",
        "dynamo.sglang",
        "--model-path",
        MODEL,
        "--tp",
        "1",
        "--mem-fraction-static",
        "0.8",
    ]

    print(f"Command: {' '.join(cmd)}")
    print("(Output will appear below)")
    print("\nWaiting for backend to start...\n")

    process = subprocess.Popen(cmd, env=env)

    max_wait = 120  # 2 minutes
    health_url = f"http://{HOST}:{SYSTEM_PORT}/health"

    try:
        # Monotonic clock: the deadline must not move if the wall clock is adjusted.
        deadline = time.monotonic() + max_wait
        while time.monotonic() < deadline:
            try:
                response = requests.get(health_url, timeout=1)
                if response.status_code == 200:
                    print("✓ Backend is ready!")
                    return process
            except requests.exceptions.RequestException:
                # System server not accepting connections yet; keep polling.
                pass

            # Stop waiting as soon as the child has exited on its own.
            if process.poll() is not None:
                print("✗ Backend process died!")
                sys.exit(1)

            time.sleep(2)

        print("✗ Backend failed to start in time!")
        sys.exit(1)
    except BaseException:
        # The caller only receives the handle on success, so on any failure
        # (timeout, early exit, Ctrl+C, unexpected error) the child has to be
        # killed here or it would be orphaned.
        if process.poll() is None:
            process.kill()
        raise


def _response_body(response):
    """Return the JSON-decoded body of *response*, falling back to raw text.

    ``Response.json()`` raises a ``ValueError`` subclass when the body is not
    valid JSON. Falling back to the text keeps diagnostic printing from
    raising before the caller gets to check the status code.
    """
    try:
        return response.json()
    except ValueError:
        return response.text


def test_profiling_endpoints():
    """Test the /engine/start_profile and /engine/stop_profile endpoints.

    Runs against the backend system server (``HOST:SYSTEM_PORT``) and sends
    inference traffic through the frontend (``HOST:PORT``):

    1. start profiling with explicit parameters,
    2. list the models exposed by the frontend,
    3. send three completion requests so there is work to profile,
    4. stop profiling,
    5. start profiling again with an empty request body,
    6. check that an unknown /engine route returns 404,

    then stops profiling once more and lists what was written to
    ``PROFILER_OUTPUT_DIR``.

    Raises:
        AssertionError: If an endpoint returns an unexpected status code or
            the start/stop responses do not report status "ok".
        requests.exceptions.RequestException: If a request cannot be
            completed, including when it exceeds the per-request timeout.
    """
    base_url = f"http://{HOST}:{SYSTEM_PORT}"
    frontend_url = f"http://{HOST}:{PORT}"
    # Upper bound per HTTP call so a wedged server fails the test instead of
    # hanging it forever; generous because stopping a profile may take a while.
    request_timeout = 120

    print("\n" + "=" * 60)
    print("Testing /engine/start_profile and /engine/stop_profile")
    print("=" * 60)

    # Step 1: Start profiling with parameters (no num_steps so we control stop manually)
    print("\n1. Starting profiling with parameters...")
    response = requests.post(
        f"{base_url}/engine/start_profile",
        json={
            "output_dir": PROFILER_OUTPUT_DIR,
            "activities": ["CPU", "GPU"],
            "with_stack": True,
            "record_shapes": True,
        },
        timeout=request_timeout,
    )
    print(f"   Status: {response.status_code}")
    print(f"   Response: {_response_body(response)}")
    assert response.status_code == 200, f"Expected 200, got {response.status_code}"
    assert response.json()["status"] == "ok", "Expected status 'ok'"

    # Step 2: Check available models (informational only, nothing is asserted)
    print("\n2. Checking available models...")
    response = requests.get(f"{frontend_url}/v1/models", timeout=request_timeout)
    if response.status_code == 200:
        models = response.json()
        print(f"   Available models: {models}")

    # Step 3: Make a few inference requests to generate profiling data
    print("\n3. Making inference requests...")
    inference_url = f"{frontend_url}/v1/completions"
    for i in range(3):
        response = requests.post(
            inference_url,
            json={
                "model": MODEL,
                "prompt": f"Hello, this is test request {i+1}. ",
                "max_tokens": 10,
                "temperature": 0.8,
            },
            timeout=request_timeout,
        )
        print(f"   Request {i+1}: {response.status_code}")
        if response.status_code != 200:
            print(f"   Response: {response.text[:200]}")
        time.sleep(0.5)

    # Step 4: Stop profiling
    print("\n4. Stopping profiling...")
    response = requests.post(f"{base_url}/engine/stop_profile", timeout=request_timeout)
    print(f"   Status: {response.status_code}")
    print(f"   Response: {_response_body(response)}")
    assert response.status_code == 200, f"Expected 200, got {response.status_code}"
    assert response.json()["status"] == "ok", "Expected status 'ok'"

    # Step 5: Start profiling with an empty body (GET-like POST)
    print("\n5. Starting profiling with empty body...")
    response = requests.post(f"{base_url}/engine/start_profile", timeout=request_timeout)
    print(f"   Status: {response.status_code}")
    print(f"   Response: {_response_body(response)}")
    assert response.status_code == 200, f"Expected 200, got {response.status_code}"

    # Step 6: An unknown route must be rejected. The body is printed via the
    # fallback helper so a non-JSON body cannot mask the status check.
    print("\n6. Testing invalid route...")
    response = requests.post(
        f"{base_url}/engine/nonexistent_route", timeout=request_timeout
    )
    print(f"   Status: {response.status_code}")
    print(f"   Response: {_response_body(response)}")
    assert response.status_code == 404, f"Expected 404, got {response.status_code}"

    # Stop the profiling session opened in step 5; the result is not checked.
    requests.post(f"{base_url}/engine/stop_profile", timeout=request_timeout)

    print("\n" + "=" * 60)
    print("✓ All tests passed!")
    print("=" * 60)

    # Report (without asserting) whether any profiling files were created
    print(f"\nChecking profiler output directory: {PROFILER_OUTPUT_DIR}")
    if os.path.exists(PROFILER_OUTPUT_DIR):
        files = list(Path(PROFILER_OUTPUT_DIR).rglob("*"))
        if files:
            print(f"✓ Found {len(files)} files in output directory")
            for f in files[:5]:  # Show first 5 files
                print(f"  - {f}")
        else:
            print("⚠ No files found (profiling may not have run long enough)")
    else:
        print("⚠ Output directory not created")


def _stop_process(name, process, timeout=10):
    """Stop a child process with SIGTERM, escalating to a hard kill.

    Args:
        name: Label used in the progress messages (e.g. "backend").
        process: ``subprocess.Popen`` handle, or ``None`` if the process was
            never started, in which case nothing happens.
        timeout: Seconds to wait for exit after SIGTERM, and again after the
            kill.
    """
    if process is None:
        return

    print(f"  Stopping {name}...")
    process.send_signal(signal.SIGTERM)
    try:
        process.wait(timeout=timeout)
    except subprocess.TimeoutExpired:
        print(f"  Force killing {name}...")
        process.kill()
        try:
            # Reap the killed child; bounded so shutdown can never hang here.
            process.wait(timeout=timeout)
        except subprocess.TimeoutExpired:
            print(f"  ⚠ {name} still running after kill")


def main():
    """Run the end-to-end profiling test and always shut the servers down.

    Resets the profiler output directory, starts the frontend and then the
    backend, runs the endpoint tests, and finally stops whichever servers
    were started.

    Raises:
        SystemExit: With exit code 1 if the test raises an ``Exception``
            (the traceback is printed first). A ``KeyboardInterrupt`` is
            reported and swallowed.
    """
    frontend_process = None
    backend_process = None
    try:
        # Clean up output directory
        cleanup_output_dir()

        # Start frontend first
        frontend_process = start_frontend()

        # Start backend
        backend_process = start_sglang_backend()

        # Run tests
        print("\n" + "=" * 60)
        print("Both frontend and backend are ready!")
        print("=" * 60)
        time.sleep(2)  # Give everything a moment to fully settle
        test_profiling_endpoints()

        print("\n✓ Test completed successfully!")

    except KeyboardInterrupt:
        print("\n⚠ Interrupted by user")
    except Exception as e:
        print(f"\n✗ Test failed: {e}")
        import traceback

        traceback.print_exc()
        sys.exit(1)
    finally:
        # Cleanup: backend first, then frontend.
        print("\nShutting down servers...")
        try:
            _stop_process("backend", backend_process)
        finally:
            # Nested so the frontend is still stopped if stopping the backend
            # raises (for example a second Ctrl+C during the wait).
            _stop_process("frontend", frontend_process)

        print("✓ Servers stopped")


# Run the end-to-end test only when executed as a script, not on import.
if __name__ == "__main__":
    main()