# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Frontend API-surface compliance suite against a live Dynamo frontend.

Subject under test is Dynamo's HTTP surface (`/v1/responses` and
`/v1/messages` wire shapes, tool-call routing through both); sglang is just
the backend vehicle for producing real traffic.

Runs three suites sequentially against one server:

1. Upstream OpenResponses compliance-test.ts harness (bun/TypeScript
   validator against zod schemas generated from the OpenAPI spec).
2. `codex exec` smoke — forces the shell tool-call path through
   `/v1/responses`.
3. `claude -p` smoke — forces the Bash tool-call path through
   `/v1/messages` (Anthropic Messages API).

All external tooling (bun, node, the OpenResponses suite, and the codex /
claude CLIs) is installed lazily at test time by session-scoped fixtures
into a session-shared cache directory. The bun/node versions and the
OpenResponses SHA are pinned as module-level constants; the agent CLIs
float to the npm `latest` tag. FileLock coordination makes concurrent xdist
workers share a single install.
"""

import logging
import os
import platform
import shlex
import shutil
import subprocess
import tarfile
import time
import zipfile
from pathlib import Path

import pytest
import requests
from filelock import FileLock

from tests.serve.common import WORKSPACE_DIR
from tests.utils.engine_process import EngineConfig, EngineProcess

logger = logging.getLogger(__name__)

sglang_dir = os.environ.get("SGLANG_DIR") or os.path.join(
    WORKSPACE_DIR, "examples/backends/sglang"
)

COMPLIANCE_MODEL = "Qwen/Qwen3-VL-2B-Instruct"

# Pinned external-tool versions. Bun and node are pinned for reproducibility.
# The agent CLIs (@openai/codex, @anthropic-ai/claude-code) float to @latest
# so we automatically pick up protocol fixes — they're client-side harnesses,
# not Dynamo surface.
BUN_VERSION = "1.3.12"
NODE_VERSION = "20.19.0"
OPENRESPONSES_REPO = "https://github.com/openresponses/openresponses.git"
OPENRESPONSES_SHA = "fa29df5"

# Retry budget for network-touching installs. Exponential backoff starting
# at 2s; 3 attempts caps the worst-case wait at ~6s before we surface a
# clear "upstream unavailable" error.
_RETRY_COUNT = 3
_RETRY_BACKOFF_INITIAL_S = 2.0

# Wall-clock budget for each of the three suite subprocesses (bun, codex,
# claude). The pytest timeout on the test is sized from this value.
_SUITE_TIMEOUT_S = 180

# Prompt shared by the codex and claude smoke steps. The answer can only be
# produced by actually running `ls` in the agent's working directory.
_LS_PROMPT = (
    "What files exist in the current working directory? "
    "Use your shell tool to run ls and report each filename verbatim from the output."
)

# Env keys forwarded into codex/claude subprocesses. These agents run with tool
# permissions (`--dangerously-bypass-approvals-and-sandbox`, `--dangerously-skip-permissions`),
# and even against a local model they may emit telemetry; inheriting the whole
# CI environment would expose `GITHUB_TOKEN`, AWS creds, registry credentials,
# etc. Keep to a minimal allowlist covering only what the runtime needs:
# PATH to resolve the binaries, locale/TLS/proxy for HTTPS, HOME so Node/bun
# finds per-user caches, and NVIDIA/CUDA vars so any GPU-touching side effects
# see the same device the test was given.
_SUBPROCESS_ENV_ALLOWLIST: frozenset[str] = frozenset(
    {
        "PATH",
        "HOME",
        "LANG",
        "LC_ALL",
        "TZ",
        "SSL_CERT_FILE",
        "SSL_CERT_DIR",
        "REQUESTS_CA_BUNDLE",
        "CURL_CA_BUNDLE",
        "HTTP_PROXY",
        "HTTPS_PROXY",
        "NO_PROXY",
        "http_proxy",
        "https_proxy",
        "no_proxy",
        "LD_LIBRARY_PATH",
        "CUDA_VISIBLE_DEVICES",
        "NVIDIA_VISIBLE_DEVICES",
        "NVIDIA_DRIVER_CAPABILITIES",
    }
)

# `platform.machine()` spellings mapped to the architecture token each
# upstream project uses in its release-asset names. Keys are lower-case.
_BUN_ARCH_BY_MACHINE = {
    "x86_64": "x64",
    "amd64": "x64",
    "aarch64": "aarch64",
    "arm64": "aarch64",
}
_NODE_ARCH_BY_MACHINE = {
    "x86_64": "x64",
    "amd64": "x64",
    "aarch64": "arm64",
    "arm64": "arm64",
}


def _agent_subprocess_env(
    extra_env: dict[str, str], path_prepend: list[Path] | None = None
) -> dict[str, str]:
    """Build a minimal env for codex/claude subprocesses.

    The result is the allowlisted subset of `os.environ` merged with the
    explicit test-scoped vars in `extra_env` (which win on key collisions,
    including `PATH`).

    Optional `path_prepend` prepends directories to PATH so the
    fixture-installed node/codex/claude binaries take priority over
    anything on the inherited PATH.
    """
    base = {
        k: v for k in _SUBPROCESS_ENV_ALLOWLIST if (v := os.environ.get(k)) is not None
    }
    if path_prepend:
        existing = base.get("PATH", "")
        prefix = os.pathsep.join(str(p) for p in path_prepend)
        base["PATH"] = f"{prefix}{os.pathsep}{existing}" if existing else prefix
    base.update(extra_env)
    return base


# ---------------------------------------------------------------------------
# Tool-install fixtures
# ---------------------------------------------------------------------------


def _retry_network_op(fn, description: str):
    """Run `fn()` with a small exponential-backoff retry budget.

    Transient github/npm/nodejs.org blips shouldn't flake the test, so
    `OSError`, `requests.RequestException` and `subprocess.CalledProcessError`
    are retried up to `_RETRY_COUNT` times. Returns whatever `fn()` returns.

    Raises:
        RuntimeError: when every attempt failed. For a failed subprocess the
            captured stdout/stderr are embedded in the message so post-mortem
            doesn't require digging through logs.
    """
    last_err: BaseException | None = None
    for attempt in range(_RETRY_COUNT):
        try:
            return fn()
        except (OSError, requests.RequestException, subprocess.CalledProcessError) as e:
            last_err = e
            if attempt + 1 < _RETRY_COUNT:
                wait = _RETRY_BACKOFF_INITIAL_S * (2**attempt)
                logger.warning(
                    "%s failed (attempt %d/%d): %s — retrying in %.1fs",
                    description,
                    attempt + 1,
                    _RETRY_COUNT,
                    e,
                    wait,
                )
                time.sleep(wait)
    detail = ""
    if isinstance(last_err, subprocess.CalledProcessError):
        detail = f"\nstdout:\n{last_err.stdout or ''}\nstderr:\n{last_err.stderr or ''}"
    raise RuntimeError(
        f"{description} failed after {_RETRY_COUNT} attempts: {last_err}{detail}"
    ) from last_err


def _download_url(url: str, dest: Path) -> None:
    """Stream GET `url` into `dest` atomically via a `.part` sibling.

    The payload is written to `<dest>.part` and only moved onto `dest` once
    the whole body has been received, so `dest` never holds a truncated file.
    """
    tmp = dest.with_suffix(dest.suffix + ".part")
    with requests.get(url, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(tmp, "wb") as f:
            for chunk in r.iter_content(chunk_size=64 * 1024):
                if chunk:
                    f.write(chunk)
    # `replace` overwrites an existing destination on every platform.
    tmp.replace(dest)


def _bun_arch() -> str:
    """Return bun's release-asset architecture token for this machine.

    Raises:
        RuntimeError: if `platform.machine()` is not a known x86-64 / arm64
            spelling.
    """
    m = platform.machine()
    try:
        return _BUN_ARCH_BY_MACHINE[m.lower()]
    except KeyError:
        raise RuntimeError(f"Unsupported machine architecture for bun: {m}") from None


def _node_arch() -> str:
    """Return node's release-tarball architecture token for this machine.

    Raises:
        RuntimeError: if `platform.machine()` is not a known x86-64 / arm64
            spelling.
    """
    m = platform.machine()
    try:
        return _NODE_ARCH_BY_MACHINE[m.lower()]
    except KeyError:
        raise RuntimeError(f"Unsupported machine architecture for node: {m}") from None


def _session_shared_basetemp(tmp_path_factory) -> Path:
    """Return a temp root that is common to every xdist worker of a session.

    Under pytest-xdist each worker's `getbasetemp()` is its own
    subdirectory of the session root, so the parent is the directory all
    workers have in common. Without xdist, `getbasetemp()` is already the
    session root. Worker processes are recognised by the
    `PYTEST_XDIST_WORKER` environment variable that xdist sets for them.
    """
    base = Path(tmp_path_factory.getbasetemp())
    if os.environ.get("PYTEST_XDIST_WORKER"):
        return base.parent
    return base


@pytest.fixture(scope="session")
def _tools_cache(tmp_path_factory) -> Path:
    """Session-shared cache directory for downloaded compliance tooling.

    Lives under the session-wide pytest temp root so that all xdist workers
    in the same session resolve the same directory (and therefore the same
    FileLock files). Its lifetime follows pytest's basetemp retention.
    """
    base = _session_shared_basetemp(tmp_path_factory) / "_frontend_api_surface_tools"
    base.mkdir(parents=True, exist_ok=True)
    return base


@pytest.fixture(scope="session")
def _bun_binary(_tools_cache) -> Path:
    """Pinned-version bun executable.

    FileLock-coordinated so concurrent xdist workers share a single
    download. The binary is staged and made executable before it is renamed
    to its final path, so the cache-hit check never sees a half-installed
    file.
    """
    install_dir = _tools_cache / f"bun-{BUN_VERSION}"
    bun_bin = install_dir / "bun"
    with FileLock(str(_tools_cache / "bun.lock")):
        if bun_bin.exists():
            return bun_bin
        install_dir.mkdir(parents=True, exist_ok=True)
        arch = _bun_arch()
        url = (
            f"https://github.com/oven-sh/bun/releases/download/"
            f"bun-v{BUN_VERSION}/bun-linux-{arch}.zip"
        )
        zip_path = install_dir / "bun.zip"
        _retry_network_op(
            lambda: _download_url(url, zip_path),
            description=f"download bun v{BUN_VERSION} ({arch})",
        )
        with zipfile.ZipFile(zip_path) as zf:
            zf.extractall(install_dir)
        extracted = install_dir / f"bun-linux-{arch}" / "bun"
        # zipfile does not restore the executable bit, so set it on a staged
        # copy and publish that copy with a single rename.
        staged = install_dir / "bun.staged"
        shutil.copy(extracted, staged)
        staged.chmod(0o755)
        staged.replace(bun_bin)
        zip_path.unlink(missing_ok=True)
    return bun_bin


@pytest.fixture(scope="session")
def _node_bin(_tools_cache) -> Path:
    """Pinned-version node runtime `bin/` directory containing `node` and `npm`.

    FileLock-coordinated. On a cache miss the install directory is wiped
    first so that leftovers of an interrupted earlier install cannot
    collide with the fresh extraction.
    """
    install_dir = _tools_cache / f"node-v{NODE_VERSION}"
    bin_dir = install_dir / "bin"
    with FileLock(str(_tools_cache / "node.lock")):
        if (bin_dir / "node").exists() and (bin_dir / "npm").exists():
            return bin_dir
        # `shutil.move` onto an existing directory nests the source inside
        # it, so a partially populated tree must not survive into a redo.
        if install_dir.exists():
            shutil.rmtree(install_dir)
        install_dir.mkdir(parents=True, exist_ok=True)
        arch = _node_arch()
        tarball_name = f"node-v{NODE_VERSION}-linux-{arch}.tar.xz"
        url = f"https://nodejs.org/dist/v{NODE_VERSION}/{tarball_name}"
        tar_path = install_dir / tarball_name
        _retry_network_op(
            lambda: _download_url(url, tar_path),
            description=f"download node v{NODE_VERSION} ({arch})",
        )
        with tarfile.open(tar_path) as tf:
            # `filter="data"` is the safe extraction filter added in 3.12 and
            # required in 3.14; passing it explicitly silences the pytest
            # filterwarnings=error escalation of the DeprecationWarning.
            tf.extractall(install_dir, filter="data")
        # The tarball unpacks into a versioned top-level directory; hoist its
        # contents so `bin/` sits directly under `install_dir`.
        extracted = install_dir / f"node-v{NODE_VERSION}-linux-{arch}"
        for item in extracted.iterdir():
            shutil.move(str(item), str(install_dir / item.name))
        extracted.rmdir()
        tar_path.unlink(missing_ok=True)
    return bin_dir


@pytest.fixture(scope="session")
def _openresponses_suite(_tools_cache, _bun_binary) -> Path:
    """Pinned-SHA clone of the OpenResponses compliance suite with bun deps installed.

    A `.installed` sentinel file marks a completed setup so an interrupted
    prior install forces a clean redo. Clone, checkout and `bun install`
    all go through `_retry_network_op`: the clone is a blobless partial
    clone, so the checkout fetches file contents over the network too.
    """
    install_dir = _tools_cache / f"openresponses-{OPENRESPONSES_SHA}"
    sentinel = install_dir / ".installed"

    def _clone() -> None:
        # Start every attempt from an absent directory; git refuses to clone
        # into a non-empty one.
        if install_dir.exists():
            shutil.rmtree(install_dir)
        subprocess.run(
            [
                "git",
                "clone",
                "--filter=blob:none",
                OPENRESPONSES_REPO,
                str(install_dir),
            ],
            check=True,
            capture_output=True,
            text=True,
        )

    with FileLock(str(_tools_cache / "openresponses.lock")):
        if sentinel.exists():
            return install_dir
        _retry_network_op(_clone, description="clone openresponses")
        _retry_network_op(
            lambda: subprocess.run(
                ["git", "-C", str(install_dir), "checkout", OPENRESPONSES_SHA],
                check=True,
                capture_output=True,
                text=True,
            ),
            description=f"checkout openresponses {OPENRESPONSES_SHA}",
        )
        _retry_network_op(
            lambda: subprocess.run(
                [str(_bun_binary), "install", "--frozen-lockfile"],
                cwd=str(install_dir),
                check=True,
                capture_output=True,
                text=True,
            ),
            description="bun install openresponses deps",
        )
        sentinel.touch()
    return install_dir


def _install_npm_cli(
    tools_cache: Path,
    node_bin: Path,
    package: str,
    binary_name: str,
    slot: str,
) -> Path:
    """Install `package` into `{tools_cache}/{slot}` via npm.

    Shared helper for codex + claude. Returns the path to the CLI entry
    point, `node_modules/.bin/{binary_name}` under the slot directory. The
    install is skipped when that entry point already exists, and is
    serialised across workers with a per-slot FileLock.
    """
    install_dir = tools_cache / slot
    cli_bin = install_dir / "node_modules" / ".bin" / binary_name
    with FileLock(str(tools_cache / f"{slot}.lock")):
        if cli_bin.exists():
            return cli_bin
        install_dir.mkdir(parents=True, exist_ok=True)
        # npm is itself a node script, so the fixture-installed node must be
        # first on PATH while it runs.
        env = {
            **os.environ,
            "PATH": f"{node_bin}{os.pathsep}{os.environ.get('PATH', '')}",
        }
        _retry_network_op(
            lambda: subprocess.run(
                [
                    str(node_bin / "npm"),
                    "install",
                    "--prefix",
                    str(install_dir),
                    package,
                ],
                env=env,
                check=True,
                capture_output=True,
                text=True,
            ),
            description=f"npm install {package}",
        )
    return cli_bin


@pytest.fixture(scope="session")
def _codex_cli(_tools_cache, _node_bin) -> Path:
    """Path to the npm-installed `codex` CLI entry point."""
    return _install_npm_cli(
        _tools_cache,
        _node_bin,
        package="@openai/codex",
        binary_name="codex",
        slot="codex",
    )


@pytest.fixture(scope="session")
def _claude_cli(_tools_cache, _node_bin) -> Path:
    """Path to the npm-installed `claude` CLI entry point."""
    return _install_npm_cli(
        _tools_cache,
        _node_bin,
        package="@anthropic-ai/claude-code",
        binary_name="claude",
        slot="claude",
    )


# ---------------------------------------------------------------------------
# Test
# ---------------------------------------------------------------------------


@pytest.mark.sglang
@pytest.mark.e2e
@pytest.mark.gpu_1
@pytest.mark.model(COMPLIANCE_MODEL)
@pytest.mark.profiled_vram_gib(6.0)
@pytest.mark.requested_sglang_kv_tokens(512)
# Budget: tool-install fixtures (~30-60s first session run, near-zero on
# cache hit) + sglang cold start (30-60s) + bun compliance (up to 180s) +
# codex exec (up to 180s) + claude exec (up to 180s) + two inter-suite
# health checks + teardown. 750s leaves headroom for CI variance without
# masking real hangs.
@pytest.mark.timeout(750)
@pytest.mark.frontend_api_surface_compliance
@pytest.mark.pre_merge
def test_frontend_api_surface_compliance(
    request,
    runtime_services_dynamic_ports,
    dynamo_dynamic_ports,
    predownload_models,
    tmp_path,
    _bun_binary,
    _node_bin,
    _openresponses_suite,
    _codex_cli,
    _claude_cli,
):
    """Run all three API-surface suites against one live frontend.

    In order: the upstream OpenResponses compliance suite, the `codex exec`
    smoke (`/v1/responses` tool calls) and the `claude -p` smoke
    (`/v1/messages` tool calls). A health probe between suites separates
    "the previous suite broke the server" from "the next client is broken".
    """
    frontend_port = int(dynamo_dynamic_ports.frontend_port)
    system_port = int(dynamo_dynamic_ports.system_ports[0])

    config = EngineConfig(
        name="responses_compliance",
        directory=sglang_dir,
        marks=[],
        request_payloads=[],
        model=COMPLIANCE_MODEL,
        script_name="agg.sh",
        # Qwen3-VL-2B-specific flags: vision-model CUDA graph workaround +
        # model-aware reasoning/tool-call parsers. Forwarded verbatim to
        # `dynamo.sglang` by agg.sh's pass-through loop.
        #
        # Tool-call parser is `hermes`, not `qwen3_coder`: Qwen3-VL-Instruct
        # emits `<tool_call>{"name":..., "arguments":...}</tool_call>` (JSON
        # inside the tags — Hermes-style), while `qwen3_coder` expects the
        # XML-structured `<function=name><parameter=k>v</parameter></function>`
        # that Qwen3-Coder models emit. Using the wrong parser leaves tool
        # calls as raw text in the response and breaks end-to-end agent
        # flows (codex exec, etc.).
        script_args=[
            "--model-path",
            COMPLIANCE_MODEL,
            "--disable-piecewise-cuda-graph",
            "--dyn-reasoning-parser",
            "qwen3",
            "--dyn-tool-call-parser",
            "hermes",
        ],
        timeout=360,
        env={},
        frontend_port=frontend_port,
    )

    merged_env = {
        "DYN_HTTP_PORT": str(frontend_port),
        "DYN_SYSTEM_PORT": str(system_port),
        # agg.sh doesn't forward frontend args, but the frontend reads this
        # env var directly. Enables /v1/messages for the claude smoke step.
        "DYN_ENABLE_ANTHROPIC_API": "1",
    }

    codex_home = tmp_path / "codex_home"
    _write_codex_config(codex_home, frontend_port)

    # Marker file that the agents can only "see" by invoking their shell/Bash
    # tool; if a model answers from its prior without actually running `ls`,
    # the marker won't appear in stdout and the assertion fails. Proves the
    # tool-call paths through the frontend end-to-end (both /v1/responses
    # for codex and /v1/messages for claude), not just text generation.
    agent_cwd = tmp_path / "agent_cwd"
    agent_cwd.mkdir()
    marker_filename = "dynamo_compliance_marker.txt"
    (agent_cwd / marker_filename).write_text("compliance-smoke")

    # Isolated HOME so claude doesn't write session state into the runner's
    # ~/.claude during CI / local invocation.
    claude_home = tmp_path / "claude_home"
    claude_home.mkdir()

    with EngineProcess.from_script(config, request, extra_env=merged_env):
        _run_bun_compliance(_bun_binary, _openresponses_suite, frontend_port)
        _wait_for_frontend_healthy(frontend_port, after="bun compliance")
        _run_codex_exec_smoke(
            _codex_cli, _node_bin, codex_home, agent_cwd, marker_filename
        )
        _wait_for_frontend_healthy(frontend_port, after="codex exec smoke")
        _run_claude_exec_smoke(
            _claude_cli,
            _node_bin,
            claude_home,
            agent_cwd,
            marker_filename,
            frontend_port,
        )


def _attach_subprocess_log(
    name: str,
    cmd: list[str],
    result: subprocess.CompletedProcess,
    extra_env: dict[str, str] | None = None,
    cwd: str | None = None,
) -> None:
    """Attach a reproducible transcript of `cmd` to the Allure report.

    Lands in `test-results/allure-results/<uuid>-attachment.txt`, which the
    CI workflow uploads as an artifact on every run (pass or fail). Contents
    are a cut-and-paste-able shell invocation plus the raw stdout + stderr
    so a failing CI run can be reproduced locally from the artifact alone.

    Only explicitly listed env vars (`extra_env`) are recorded — not the
    inherited `os.environ` — to avoid leaking runner secrets into the
    artifact. CI runners keep HF tokens and cloud creds in env vars the
    subprocess inherits; we don't need those in the log to reproduce.
    """
    # Local import: `allure` is only available inside the test image (via
    # allure-pytest). Pre-commit's collection-only pytest runs in a clean
    # uvx env without it, so a module-level import would fail collection.
    import allure

    lines: list[str] = []
    if cwd:
        lines.append(f"$ cd {shlex.quote(cwd)}")
    if extra_env:
        for k, v in sorted(extra_env.items()):
            lines.append(f"$ export {k}={shlex.quote(v)}")
    lines.append("$ " + " ".join(shlex.quote(c) for c in cmd))
    lines.append("")
    lines.append(f"exit: {result.returncode}")
    lines.append("")
    lines.append("=== stdout ===")
    lines.append(result.stdout or "(empty)")
    lines.append("")
    lines.append("=== stderr ===")
    lines.append(result.stderr or "(empty)")

    allure.attach(
        "\n".join(lines),
        name=name,
        attachment_type=allure.attachment_type.TEXT,
    )


def _captured_text(data) -> str:
    """Normalise captured subprocess output (`None`, bytes or str) to str.

    `subprocess.TimeoutExpired` can carry partial output as bytes even when
    the process was started in text mode.
    """
    if data is None:
        return ""
    if isinstance(data, bytes):
        return data.decode("utf-8", errors="replace")
    return data


def _run_suite_subprocess(
    *,
    label: str,
    failure_subject: str,
    log_name: str,
    cmd: list[str],
    cwd: str,
    env: dict[str, str] | None = None,
    extra_env: dict[str, str] | None = None,
    timeout_s: float = _SUITE_TIMEOUT_S,
) -> subprocess.CompletedProcess:
    """Run one suite command, record its transcript, and fail on error.

    Shared by the bun, codex and claude steps. The transcript is attached to
    the Allure report and echoed to the logger whether the command passes or
    fails; a timeout is converted into a test failure that still carries the
    output captured up to that point.

    Args:
        label: Short name used as the logger prefix (`"<label> stdout"`).
        failure_subject: Subject of the failure message (`"<subject> failed"`).
        log_name: Allure attachment name.
        cmd: Argument vector to execute.
        cwd: Working directory for the command.
        env: Full environment for the child, or `None` to inherit.
        extra_env: Test-scoped variables to list in the transcript.
        timeout_s: Wall-clock limit in seconds.

    Returns:
        The completed process, guaranteed to have exit code 0.
    """
    try:
        result = subprocess.run(
            cmd,
            cwd=cwd,
            env=env,
            capture_output=True,
            text=True,
            timeout=timeout_s,
        )
    except subprocess.TimeoutExpired as e:
        partial_stdout = _captured_text(e.stdout)
        partial_stderr = _captured_text(e.stderr)
        # 124 mirrors coreutils `timeout`; the child never produced a real
        # exit status of its own.
        _attach_subprocess_log(
            name=log_name,
            cmd=cmd,
            result=subprocess.CompletedProcess(
                cmd,
                returncode=124,
                stdout=partial_stdout,
                stderr=f"{partial_stderr}\n[timed out after {timeout_s}s]",
            ),
            extra_env=extra_env,
            cwd=cwd,
        )
        pytest.fail(
            f"{failure_subject} timed out after {timeout_s}s.\n"
            f"stdout:\n{partial_stdout}\n\nstderr:\n{partial_stderr}"
        )

    _attach_subprocess_log(
        name=log_name,
        cmd=cmd,
        result=result,
        extra_env=extra_env,
        cwd=cwd,
    )
    if result.stdout:
        logger.info("%s stdout:\n%s", label, result.stdout)
    if result.stderr:
        logger.info("%s stderr:\n%s", label, result.stderr)
    if result.returncode != 0:
        pytest.fail(
            f"{failure_subject} failed (exit={result.returncode}).\n"
            f"stdout:\n{result.stdout}\n\nstderr:\n{result.stderr}"
        )
    return result


def _wait_for_frontend_healthy(
    frontend_port: int,
    timeout_s: float = 15.0,
    model: str = COMPLIANCE_MODEL,
    after: str = "bun compliance",
) -> None:
    """Confirm the frontend is still serving before the next subprocess fires.

    Without this check, if the previous suite accidentally destabilized the
    server (e.g. a hang that the suite timeout cut short) a failure of the
    next client looks identical to "the client is broken" in CI logs. The
    health probe collapses that ambiguity: if the frontend has crashed or
    the worker has deregistered, fail here with a clear message rather than
    letting the next client run and time out.

    Polls `/v1/models` every 0.5s until `model` is listed or `timeout_s`
    elapses. `after` names the suite that just ran and only affects the
    failure message.
    """
    deadline = time.monotonic() + timeout_s
    last_err: Exception | str | None = None
    while time.monotonic() < deadline:
        try:
            resp = requests.get(
                f"http://localhost:{frontend_port}/v1/models", timeout=2
            )
            if resp.ok:
                payload = resp.json()
                entries = payload.get("data", []) if isinstance(payload, dict) else []
                if any(isinstance(m, dict) and m.get("id") == model for m in entries):
                    return
                last_err = f"model not listed in {payload!r}"
            else:
                last_err = f"HTTP {resp.status_code}"
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on older `requests` releases.
            last_err = e
        time.sleep(0.5)
    pytest.fail(
        f"frontend unhealthy after {after} — /v1/models did not list "
        f"{model!r} within {timeout_s}s (last error: {last_err})"
    )


def _run_bun_compliance(
    bun_binary: Path, openresponses_dir: Path, frontend_port: int
) -> None:
    """Invoke compliance-test.ts against the running frontend.

    Fails the test when the harness exits non-zero or exceeds its time
    budget; the transcript is attached to the Allure report either way.
    """
    base_url = f"http://localhost:{frontend_port}/v1"
    logger.info("Running OpenResponses compliance suite against %s", base_url)

    cmd = [
        str(bun_binary),
        "run",
        "bin/compliance-test.ts",
        "--base-url",
        base_url,
        "--api-key",
        "sk-compliance-dummy",
        "--model",
        COMPLIANCE_MODEL,
        "--verbose",
    ]
    _run_suite_subprocess(
        label="compliance",
        failure_subject="OpenResponses compliance suite",
        log_name="bun_compliance_suite.log",
        cmd=cmd,
        cwd=str(openresponses_dir),
    )


def _write_codex_config(codex_home: Path, frontend_port: int) -> None:
    """Emit a minimal ~/.codex/config.toml pointing Codex at Dynamo.

    Using a per-test CODEX_HOME keeps the runner's global Codex state (if
    any) untouched. The directory is created if it does not exist.
    """
    codex_home.mkdir(parents=True, exist_ok=True)
    config_path = codex_home / "config.toml"
    config_path.write_text(
        f"""
[model_providers.local]
name = "local-dynamo"
base_url = "http://localhost:{frontend_port}/v1"
wire_api = "responses"
env_key = "LOCAL_API_KEY"
""".lstrip()
    )


def _run_codex_exec_smoke(
    codex_cli: Path, node_bin: Path, codex_home, cwd, marker_filename: str
) -> None:
    """Run `codex exec` against the Dynamo Responses endpoint and assert the
    tool-call path actually fires.

    We prompt codex to list `cwd`; `cwd` contains `marker_filename` and
    nothing else the model could pattern-match from prior knowledge. If
    codex answers without invoking its shell tool, the marker won't appear
    in stdout and the assertion fails — which proves we're testing the full
    Responses API tool-calling chain, not just text generation.
    """
    logger.info("Running codex exec smoke test against CODEX_HOME=%s", codex_home)

    # Isolate HOME for codex the same way we do for claude. CODEX_HOME
    # scopes codex's own state, but the agent still invokes a shell tool under
    # `--dangerously-bypass-approvals-and-sandbox`, which inherits HOME for
    # any shell/helper reads and writes. Point it at `codex_home` so nothing
    # escapes `tmp_path`.
    extra_env = {
        "CODEX_HOME": str(codex_home),
        "HOME": str(codex_home),
        "LOCAL_API_KEY": "sk-none",
    }
    # codex is a node script (`#!/usr/bin/env node`); prepend the fixture-
    # installed node runtime to PATH so the shebang resolves to it ahead of
    # the runner's system node (if any).
    env = _agent_subprocess_env(extra_env, path_prepend=[node_bin])

    cmd = [
        str(codex_cli),
        "-m",
        COMPLIANCE_MODEL,
        "-c",
        "model_provider=local",
        "exec",
        _LS_PROMPT,
        "--dangerously-bypass-approvals-and-sandbox",
    ]
    result = _run_suite_subprocess(
        label="codex",
        failure_subject="codex exec",
        log_name="codex_exec_smoke.log",
        cmd=cmd,
        cwd=str(cwd),
        env=env,
        extra_env=extra_env,
    )
    if marker_filename not in result.stdout:
        pytest.fail(
            "codex exec did not report the marker file — expected stdout to "
            f"contain {marker_filename!r} (implies the shell tool was invoked "
            f"and actually ran `ls` in {cwd}). Got:\n{result.stdout}"
        )


def _run_claude_exec_smoke(
    claude_cli: Path,
    node_bin: Path,
    claude_home,
    cwd,
    marker_filename: str,
    frontend_port: int,
) -> None:
    """Run `claude -p` against the Dynamo Anthropic Messages endpoint and
    assert the Bash tool-call path actually fires.

    Same marker-file pattern as the codex step but hitting /v1/messages: if
    claude answers without invoking its Bash tool, the marker won't appear
    in stdout and the assertion fails — which proves the full Anthropic
    Messages + tool-calling chain, not just text generation.

    Isolated HOME so claude doesn't write session state into the runner's
    `~/.claude`. An `ANTHROPIC_AUTH_TOKEN` is required even though Dynamo
    ignores the value: on a fresh HOME with no cached OAuth, the CLI aborts
    with "Not logged in" unless a bearer is supplied.
    """
    base_url = f"http://localhost:{frontend_port}"
    logger.info("Running claude exec smoke test against %s", base_url)

    extra_env = {
        "HOME": str(claude_home),
        "ANTHROPIC_BASE_URL": base_url,
        "ANTHROPIC_AUTH_TOKEN": "sk-none",
    }
    # claude shells out to `node` internally; make sure the fixture-installed
    # runtime resolves on PATH ahead of the runner's node.
    env = _agent_subprocess_env(extra_env, path_prepend=[node_bin])

    cmd = [
        str(claude_cli),
        "--model",
        COMPLIANCE_MODEL,
        "--dangerously-skip-permissions",
        "-p",
        _LS_PROMPT,
    ]
    result = _run_suite_subprocess(
        label="claude",
        failure_subject="claude -p",
        log_name="claude_exec_smoke.log",
        cmd=cmd,
        cwd=str(cwd),
        env=env,
        extra_env=extra_env,
    )
    if marker_filename not in result.stdout:
        pytest.fail(
            "claude -p did not report the marker file — expected stdout to "
            f"contain {marker_filename!r} (implies the Bash tool was invoked "
            f"and actually ran `ls` in {cwd}). Got:\n{result.stdout}"
        )