test_frontend_api_surface_compliance.py 26.4 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Frontend API-surface compliance suite against a live Dynamo frontend.

Subject under test is Dynamo's HTTP surface (`/v1/responses` and
`/v1/messages` wire shapes, tool-call routing through both); sglang is
just the backend vehicle for producing real traffic. Runs three suites
sequentially against one server:

1. Upstream OpenResponses compliance-test.ts harness (bun/TypeScript
   validator against zod schemas generated from the OpenAPI spec).
2. `codex exec` smoke — forces the shell tool-call path through
   `/v1/responses`.
3. `claude -p` smoke — forces the Bash tool-call path through
   `/v1/messages` (Anthropic Messages API).

All external tooling (bun, node, the OpenResponses suite, and the codex /
claude CLIs) is installed lazily at test time by session-scoped fixtures
into a session-shared cache directory. Versions and the OpenResponses
SHA are pinned as module-level constants. FileLock coordination makes
concurrent xdist workers share a single install.
"""

import logging
import os
import platform
import shlex
import shutil
import subprocess
import tarfile
import time
import zipfile
from pathlib import Path

import pytest
import requests
from filelock import FileLock

from tests.serve.common import WORKSPACE_DIR
from tests.utils.engine_process import EngineConfig, EngineProcess

# Module-level logger; subprocess output and retry warnings are logged here.
logger = logging.getLogger(__name__)

# Directory holding the sglang launch scripts (`agg.sh` is run from here).
# `SGLANG_DIR` overrides the in-repo default; an empty value counts as unset
# because of the `or`.
sglang_dir = os.environ.get("SGLANG_DIR") or os.path.join(
    WORKSPACE_DIR, "examples/backends/sglang"
)

# Model served by the backend and requested by all three client suites.
COMPLIANCE_MODEL = "Qwen/Qwen3-VL-2B-Instruct"

# Pinned external-tool versions. Bun and node are pinned for reproducibility.
# The agent CLIs (@openai/codex, @anthropic-ai/claude-code) float to @latest
# so we automatically pick up protocol fixes — they're client-side harnesses,
# not Dynamo surface.
BUN_VERSION = "1.3.12"
NODE_VERSION = "20.19.0"
OPENRESPONSES_REPO = "https://github.com/openresponses/openresponses.git"
# Abbreviated commit hash checked out right after cloning OPENRESPONSES_REPO.
OPENRESPONSES_SHA = "fa29df5"

# Retry budget for network-touching installs. Exponential backoff starting
# at 2s and doubling after each failure: with 3 attempts the sleeps are
# 2s + 4s, so retries add at most ~6s of waiting (on top of the operations'
# own run time) before the failure is surfaced as a RuntimeError.
_RETRY_COUNT = 3
_RETRY_BACKOFF_INITIAL_S = 2.0

# Env keys forwarded into codex/claude subprocesses. These agents run with tool
# permissions (`--dangerously-bypass-approvals-and-sandbox`, `--dangerously-skip-permissions`),
# and even against a local model they may emit telemetry; inheriting the whole
# CI environment would expose `GITHUB_TOKEN`, AWS creds, registry credentials,
# etc. Keep to a minimal allowlist covering only what the runtime needs:
# PATH to resolve the binaries, locale/TLS/proxy for HTTPS, HOME so Node/bun
# finds per-user caches, and NVIDIA/CUDA vars so any GPU-touching side effects
# see the same device the test was given.
_SUBPROCESS_ENV_ALLOWLIST: frozenset[str] = frozenset(
    {
        "PATH",
        "HOME",
        "LANG",
        "LC_ALL",
        "TZ",
        "SSL_CERT_FILE",
        "SSL_CERT_DIR",
        "REQUESTS_CA_BUNDLE",
        "CURL_CA_BUNDLE",
        "HTTP_PROXY",
        "HTTPS_PROXY",
        "NO_PROXY",
        "http_proxy",
        "https_proxy",
        "no_proxy",
        "LD_LIBRARY_PATH",
        "CUDA_VISIBLE_DEVICES",
        "NVIDIA_VISIBLE_DEVICES",
        "NVIDIA_DRIVER_CAPABILITIES",
    }
)


def _agent_subprocess_env(
    extra_env: dict[str, str], path_prepend: list[Path] | None = None
) -> dict[str, str]:
    """Build a minimal env for codex/claude subprocesses: allowlist from
    `os.environ` merged with explicit test-scoped vars. Optional
    `path_prepend` prepends directories to PATH so the fixture-installed
    node/codex/claude binaries resolve without contaminating the
    inherited PATH."""
    base = {
        k: v for k in _SUBPROCESS_ENV_ALLOWLIST if (v := os.environ.get(k)) is not None
    }
    if path_prepend:
        existing = base.get("PATH", "")
        prefix = os.pathsep.join(str(p) for p in path_prepend)
        base["PATH"] = f"{prefix}{os.pathsep}{existing}" if existing else prefix
    base.update(extra_env)
    return base


# ---------------------------------------------------------------------------
# Tool-install fixtures
# ---------------------------------------------------------------------------


def _retry_network_op(fn, description: str):
    """Call `fn()` up to `_RETRY_COUNT` times and return its first result.

    `OSError`, `requests.RequestException` and
    `subprocess.CalledProcessError` are treated as retryable; any other
    exception propagates immediately. Between attempts the helper sleeps
    with a doubling delay that starts at `_RETRY_BACKOFF_INITIAL_S`; no
    sleep follows the final attempt.

    Raises:
        RuntimeError: when every attempt failed. The last exception is
            chained as the cause, and for `CalledProcessError` its captured
            stdout/stderr are embedded in the message so the failure can be
            diagnosed without digging through logs.
    """
    failure: BaseException | None = None
    delay = _RETRY_BACKOFF_INITIAL_S
    for attempt_no in range(1, _RETRY_COUNT + 1):
        try:
            return fn()
        except (
            OSError,
            requests.RequestException,
            subprocess.CalledProcessError,
        ) as exc:
            failure = exc
        if attempt_no == _RETRY_COUNT:
            # Budget exhausted: fall through to the final error below.
            break
        logger.warning(
            "%s failed (attempt %d/%d): %s — retrying in %.1fs",
            description,
            attempt_no,
            _RETRY_COUNT,
            failure,
            delay,
        )
        time.sleep(delay)
        delay *= 2

    captured = ""
    if isinstance(failure, subprocess.CalledProcessError):
        captured = (
            f"\nstdout:\n{failure.stdout or ''}\nstderr:\n{failure.stderr or ''}"
        )
    raise RuntimeError(
        f"{description} failed after {_RETRY_COUNT} attempts: {failure}{captured}"
    ) from failure


def _download_url(url: str, dest: Path) -> None:
    """Stream GET `url` into `dest` atomically via a `.part` sibling.

    The body is written to `<dest>.part` in 64 KiB chunks and only moved
    onto `dest` once the transfer finished, so `dest` never holds a
    truncated download.

    Raises:
        requests.RequestException: on connection errors, timeouts or a
            non-2xx status (via `raise_for_status`).
        OSError: if the temporary file cannot be written or renamed.
    """
    tmp = dest.with_suffix(dest.suffix + ".part")
    try:
        with requests.get(url, stream=True, timeout=60) as r:
            r.raise_for_status()
            with open(tmp, "wb") as f:
                for chunk in r.iter_content(chunk_size=64 * 1024):
                    if chunk:
                        f.write(chunk)
        # `replace` overwrites an existing `dest` on every platform.
        tmp.replace(dest)
    except BaseException:
        # Don't leave a truncated `.part` file in the cache when the
        # transfer fails or is interrupted.
        tmp.unlink(missing_ok=True)
        raise


def _bun_arch() -> str:
    """Map `platform.machine()` to the architecture tag used in bun's
    release asset names.

    Raises:
        RuntimeError: for any machine type other than x86_64 / aarch64.
    """
    machine = platform.machine()
    arch = {"x86_64": "x64", "aarch64": "aarch64"}.get(machine)
    if arch is None:
        raise RuntimeError(f"Unsupported machine architecture for bun: {machine}")
    return arch


def _node_arch() -> str:
    """Map `platform.machine()` to the architecture tag used in Node.js
    tarball names.

    Raises:
        RuntimeError: for any machine type other than x86_64 / aarch64.
    """
    machine = platform.machine()
    arch = {"x86_64": "x64", "aarch64": "arm64"}.get(machine)
    if arch is None:
        raise RuntimeError(f"Unsupported machine architecture for node: {machine}")
    return arch


@pytest.fixture(scope="session")
def _tools_cache(tmp_path_factory) -> Path:
    """Session-shared cache directory for downloaded compliance tooling.

    Lives under the pytest basetemp. Under pytest-xdist every worker gets
    its own basetemp nested one level below the directory shared by the
    whole run, so in a worker the cache is placed in the *parent* of
    `getbasetemp()`. Otherwise each worker would have a private cache and
    the FileLock coordination in the install fixtures would not dedupe
    anything.

    Cleanup is left to pytest's handling of its base temporary directory.
    """
    root = Path(tmp_path_factory.getbasetemp())
    # pytest-xdist sets PYTEST_XDIST_WORKER (e.g. "gw0") in worker processes
    # only; it is absent in a non-distributed run.
    if os.environ.get("PYTEST_XDIST_WORKER"):
        root = root.parent
    base = root / "_frontend_api_surface_tools"
    base.mkdir(parents=True, exist_ok=True)
    return base


@pytest.fixture(scope="session")
def _bun_binary(_tools_cache) -> Path:
    """Pinned-version bun executable. FileLock-coordinated so concurrent
    workers sharing `_tools_cache` perform a single download.

    The existence of `<cache>/bun-<version>/bun` is the "install complete"
    marker, so the binary is staged under a temporary name, made
    executable, and only then renamed into place. An install interrupted
    mid-copy therefore never leaves a truncated or non-executable `bun`
    that a later run would treat as finished.

    Returns:
        Path to the executable.

    Raises:
        RuntimeError: unsupported architecture, or the download failed
            after all retries.
    """
    install_dir = _tools_cache / f"bun-{BUN_VERSION}"
    bun_bin = install_dir / "bun"
    with FileLock(str(_tools_cache / "bun.lock")):
        if bun_bin.exists():
            return bun_bin
        install_dir.mkdir(parents=True, exist_ok=True)
        arch = _bun_arch()
        url = (
            f"https://github.com/oven-sh/bun/releases/download/"
            f"bun-v{BUN_VERSION}/bun-linux-{arch}.zip"
        )
        zip_path = install_dir / "bun.zip"
        _retry_network_op(
            lambda: _download_url(url, zip_path),
            description=f"download bun v{BUN_VERSION} ({arch})",
        )
        with zipfile.ZipFile(zip_path) as zf:
            zf.extractall(install_dir)
        extracted_dir = install_dir / f"bun-linux-{arch}"
        # Stage, chmod, then publish with an atomic rename.
        staged = install_dir / "bun.staged"
        shutil.copy(extracted_dir / "bun", staged)
        staged.chmod(0o755)
        staged.replace(bun_bin)
        # Drop the extraction leftovers; only `bun` itself is needed.
        shutil.rmtree(extracted_dir, ignore_errors=True)
        zip_path.unlink(missing_ok=True)
    return bun_bin


@pytest.fixture(scope="session")
def _node_bin(_tools_cache) -> Path:
    """Pinned-version node runtime root `bin/` directory containing
    `node` and `npm`. FileLock-coordinated.

    A `.installed` sentinel marks a completed setup. If it is missing (or
    `node`/`npm` are), any existing install directory is wiped and the
    install is redone from scratch. Without this, re-running over a
    half-moved tree makes `shutil.move` nest directories inside the ones
    left by the interrupted attempt.

    Returns:
        Path to the `bin/` directory of the installed runtime.

    Raises:
        RuntimeError: unsupported architecture, or the download failed
            after all retries.
    """
    install_dir = _tools_cache / f"node-v{NODE_VERSION}"
    bin_dir = install_dir / "bin"
    sentinel = install_dir / ".installed"
    with FileLock(str(_tools_cache / "node.lock")):
        if (
            sentinel.exists()
            and (bin_dir / "node").exists()
            and (bin_dir / "npm").exists()
        ):
            return bin_dir
        # Incomplete or missing install: start from an empty directory.
        if install_dir.exists():
            shutil.rmtree(install_dir)
        install_dir.mkdir(parents=True, exist_ok=True)
        arch = _node_arch()
        tarball_name = f"node-v{NODE_VERSION}-linux-{arch}.tar.xz"
        url = f"https://nodejs.org/dist/v{NODE_VERSION}/{tarball_name}"
        tar_path = install_dir / tarball_name
        _retry_network_op(
            lambda: _download_url(url, tar_path),
            description=f"download node v{NODE_VERSION} ({arch})",
        )
        with tarfile.open(tar_path) as tf:
            # `filter="data"` is the safe extraction filter added in 3.12 and
            # required in 3.14; passing it explicitly silences the pytest
            # filterwarnings=error escalation of the DeprecationWarning.
            tf.extractall(install_dir, filter="data")
        # The tarball unpacks into a single versioned top-level directory;
        # hoist its contents so `bin/` sits directly under `install_dir`.
        extracted = install_dir / f"node-v{NODE_VERSION}-linux-{arch}"
        for item in extracted.iterdir():
            shutil.move(str(item), str(install_dir / item.name))
        extracted.rmdir()
        tar_path.unlink(missing_ok=True)
        sentinel.touch()
    return bin_dir


@pytest.fixture(scope="session")
def _openresponses_suite(_tools_cache, _bun_binary) -> Path:
    """Pinned-SHA clone of the OpenResponses compliance suite with bun
    deps installed. A `.installed` sentinel file marks a completed setup
    so an interrupted prior install forces a clean redo.

    All three steps (clone, checkout, `bun install`) go through
    `_retry_network_op`. The checkout is included because the clone uses
    `--filter=blob:none`, so checking out the pinned commit may still have
    to fetch blobs from the remote.

    Returns:
        Path to the checked-out repository root.

    Raises:
        RuntimeError: when any step still fails after all retries.
    """
    install_dir = _tools_cache / f"openresponses-{OPENRESPONSES_SHA}"
    sentinel = install_dir / ".installed"
    with FileLock(str(_tools_cache / "openresponses.lock")):
        if sentinel.exists():
            return install_dir

        def _fresh_clone() -> None:
            # `git clone` refuses a non-empty destination, so clear whatever
            # an interrupted install or a failed earlier attempt left behind.
            if install_dir.exists():
                shutil.rmtree(install_dir)
            subprocess.run(
                [
                    "git",
                    "clone",
                    "--filter=blob:none",
                    OPENRESPONSES_REPO,
                    str(install_dir),
                ],
                check=True,
                capture_output=True,
                text=True,
            )

        _retry_network_op(_fresh_clone, description="clone openresponses")
        _retry_network_op(
            lambda: subprocess.run(
                ["git", "-C", str(install_dir), "checkout", OPENRESPONSES_SHA],
                check=True,
                capture_output=True,
                text=True,
            ),
            description=f"checkout openresponses {OPENRESPONSES_SHA}",
        )
        _retry_network_op(
            lambda: subprocess.run(
                [str(_bun_binary), "install", "--frozen-lockfile"],
                cwd=str(install_dir),
                check=True,
                capture_output=True,
                text=True,
            ),
            description="bun install openresponses deps",
        )
        sentinel.touch()
    return install_dir


def _install_npm_cli(
    tools_cache: Path,
    node_bin: Path,
    package: str,
    binary_name: str,
    slot: str,
) -> Path:
    """npm-install `package` under `{tools_cache}/{slot}` and return the
    path of its `node_modules/.bin/{binary_name}` launcher.

    Guarded by a per-slot FileLock; the install is skipped when the
    launcher already exists. npm runs with the full inherited environment
    plus `node_bin` at the front of PATH, and is retried through
    `_retry_network_op`. Shared by the codex and claude fixtures.
    """
    prefix_dir = tools_cache / slot
    launcher = prefix_dir / "node_modules" / ".bin" / binary_name

    with FileLock(str(tools_cache / f"{slot}.lock")):
        if not launcher.exists():
            prefix_dir.mkdir(parents=True, exist_ok=True)

            npm_env = dict(os.environ)
            npm_env["PATH"] = f"{node_bin}{os.pathsep}{os.environ.get('PATH', '')}"
            npm_cmd = [
                str(node_bin / "npm"),
                "install",
                "--prefix",
                str(prefix_dir),
                package,
            ]

            def _npm_install():
                return subprocess.run(
                    npm_cmd,
                    env=npm_env,
                    check=True,
                    capture_output=True,
                    text=True,
                )

            _retry_network_op(_npm_install, description=f"npm install {package}")
    return launcher


@pytest.fixture(scope="session")
def _codex_cli(_tools_cache, _node_bin) -> Path:
    """Path to the `codex` launcher, npm-installed from `@openai/codex`
    into the `codex` slot of the tools cache."""
    return _install_npm_cli(
        tools_cache=_tools_cache,
        node_bin=_node_bin,
        package="@openai/codex",
        binary_name="codex",
        slot="codex",
    )


@pytest.fixture(scope="session")
def _claude_cli(_tools_cache, _node_bin) -> Path:
    """Path to the `claude` launcher, npm-installed from
    `@anthropic-ai/claude-code` into the `claude` slot of the tools cache."""
    return _install_npm_cli(
        tools_cache=_tools_cache,
        node_bin=_node_bin,
        package="@anthropic-ai/claude-code",
        binary_name="claude",
        slot="claude",
    )


# ---------------------------------------------------------------------------
# Test
# ---------------------------------------------------------------------------


@pytest.mark.sglang
@pytest.mark.e2e
@pytest.mark.gpu_1
@pytest.mark.model(COMPLIANCE_MODEL)
@pytest.mark.profiled_vram_gib(6.0)
@pytest.mark.requested_sglang_kv_tokens(512)
# Budget: tool-install fixtures (~30-60s first session run, near-zero on
# cache hit) + sglang cold start (30-60s) + bun compliance (up to 180s) +
# codex exec (up to 180s) + claude exec (up to 180s) + two inter-suite
# health checks + teardown. 750s leaves headroom for CI variance without
# masking real hangs.
@pytest.mark.timeout(750)
@pytest.mark.frontend_api_surface_compliance
@pytest.mark.pre_merge
def test_frontend_api_surface_compliance(
    request,
    runtime_services_dynamic_ports,
    dynamo_dynamic_ports,
    predownload_models,
    tmp_path,
    _bun_binary,
    _node_bin,
    _openresponses_suite,
    _codex_cli,
    _claude_cli,
):
    """Run all three API-surface suites against one live frontend.

    Launches `agg.sh` for `COMPLIANCE_MODEL` through `EngineProcess` and,
    while it is running, executes in order:

    1. the OpenResponses compliance harness (`_run_bun_compliance`),
    2. the `codex exec` smoke test (`_run_codex_exec_smoke`),
    3. the `claude -p` smoke test (`_run_claude_exec_smoke`),

    with a `/v1/models` health probe between consecutive steps. Each step
    calls `pytest.fail` itself, so the first failing step ends the test.

    `runtime_services_dynamic_ports` and `predownload_models` are requested
    but never referenced in the body — presumably needed only for their
    setup side effects (confirm against their definitions).
    """

    # Ports allocated for this test by the `dynamo_dynamic_ports` fixture.
    frontend_port = int(dynamo_dynamic_ports.frontend_port)
    system_port = int(dynamo_dynamic_ports.system_ports[0])

    config = EngineConfig(
        name="responses_compliance",
        directory=sglang_dir,
        marks=[],
        request_payloads=[],
        model=COMPLIANCE_MODEL,
        script_name="agg.sh",
        # Qwen3-VL-2B-specific flags: vision-model CUDA graph workaround +
        # model-aware reasoning/tool-call parsers. Forwarded verbatim to
        # `dynamo.sglang` by agg.sh's pass-through loop.
        #
        # Tool-call parser is `hermes`, not `qwen3_coder`: Qwen3-VL-Instruct
        # emits `<tool_call>{"name":..., "arguments":...}</tool_call>` (JSON
        # inside the tags — Hermes-style), while `qwen3_coder` expects the
        # XML-structured `<tool_call><function=name><parameter=k>v</parameter>
        # </function></tool_call>` that Qwen3-Coder models emit. Using the
        # wrong parser leaves tool calls as raw text in the response and
        # breaks end-to-end agent flows (codex exec, etc.).
        script_args=[
            "--model-path",
            COMPLIANCE_MODEL,
            "--disable-piecewise-cuda-graph",
            "--dyn-reasoning-parser",
            "qwen3",
            "--dyn-tool-call-parser",
            "hermes",
        ],
        timeout=360,
        env={},
        frontend_port=frontend_port,
    )

    # Extra environment passed to the engine launch via `extra_env`.
    merged_env = {
        "DYN_HTTP_PORT": str(frontend_port),
        "DYN_SYSTEM_PORT": str(system_port),
        # agg.sh doesn't forward frontend args, but the frontend reads this
        # env var directly. Enables /v1/messages for the claude smoke step.
        "DYN_ENABLE_ANTHROPIC_API": "1",
    }

    # Per-test CODEX_HOME containing a config.toml that points codex at the
    # local frontend.
    codex_home = tmp_path / "codex_home"
    _write_codex_config(codex_home, frontend_port)

    # Marker file that the agents can only "see" by invoking their shell/Bash
    # tool; if a model answers from its prior without actually running `ls`,
    # the marker won't appear in stdout and the assertion fails. Proves the
    # tool-call paths through the frontend end-to-end (both /v1/responses
    # for codex and /v1/messages for claude), not just text generation.
    agent_cwd = tmp_path / "agent_cwd"
    agent_cwd.mkdir()
    marker_filename = "dynamo_compliance_marker.txt"
    (agent_cwd / marker_filename).write_text("compliance-smoke")

    # Isolated HOME so claude doesn't write session state into the runner's
    # ~/.claude during CI / local invocation.
    claude_home = tmp_path / "claude_home"
    claude_home.mkdir()

    # The three suites run sequentially inside the engine's lifetime; the
    # health probes separate "frontend died" from "next client is broken".
    with EngineProcess.from_script(config, request, extra_env=merged_env):
        _run_bun_compliance(_bun_binary, _openresponses_suite, frontend_port)
        _wait_for_frontend_healthy(frontend_port)
        _run_codex_exec_smoke(
            _codex_cli, _node_bin, codex_home, agent_cwd, marker_filename
        )
        _wait_for_frontend_healthy(frontend_port)
        _run_claude_exec_smoke(
            _claude_cli,
            _node_bin,
            claude_home,
            agent_cwd,
            marker_filename,
            frontend_port,
        )


def _attach_subprocess_log(
    name: str,
    cmd: list[str],
    result: subprocess.CompletedProcess,
    extra_env: dict[str, str] | None = None,
    cwd: str | None = None,
) -> None:
    """Attach a replayable transcript of one subprocess run to Allure.

    The attachment is plain text: an optional `cd`, one `export` line per
    entry of `extra_env` (sorted by name), the shell-quoted command, the
    exit code, then raw stdout and stderr (`(empty)` when a stream has no
    content). It is meant to let a CI failure be reproduced locally from
    the uploaded artifact alone.

    Only the variables passed in `extra_env` are written — never the
    inherited `os.environ` — so runner secrets (HF tokens, cloud
    credentials, ...) do not leak into the artifact.
    """
    # Local import: `allure` is only available inside the test image (via
    # allure-pytest). Pre-commit's collection-only pytest runs in a clean
    # uvx env without it, so a module-level import would fail collection.
    import allure

    transcript: list[str] = []
    if cwd:
        transcript.append(f"$ cd {shlex.quote(cwd)}")
    if extra_env:
        transcript.extend(
            f"$ export {key}={shlex.quote(value)}"
            for key, value in sorted(extra_env.items())
        )
    transcript += [
        "$ " + shlex.join(cmd),
        "",
        f"exit: {result.returncode}",
        "",
        "=== stdout ===",
        result.stdout or "(empty)",
        "",
        "=== stderr ===",
        result.stderr or "(empty)",
    ]

    allure.attach(
        "\n".join(transcript),
        name=name,
        attachment_type=allure.attachment_type.TEXT,
    )


def _wait_for_frontend_healthy(
    frontend_port: int, timeout_s: float = 15.0, model: str = COMPLIANCE_MODEL
) -> None:
    """Confirm the frontend is still serving before the next subprocess fires.

    Polls `GET /v1/models` every 0.5s until `model` appears in the response's
    `data` list or `timeout_s` elapses.

    Without this check, if the preceding suite destabilized the server, the
    next client's failure looks identical to "the client is broken" in CI
    logs. The health probe collapses that ambiguity: if the frontend has
    crashed or the worker has deregistered, fail here with a clear message
    rather than letting the next client run and time out.

    Args:
        frontend_port: HTTP port of the frontend on localhost.
        timeout_s: Overall polling budget in seconds.
        model: Model id that must be listed for the frontend to count as
            healthy.

    Raises:
        pytest.fail's exception when the model is not listed in time. The
        message carries the most recent observation: a connection error, a
        non-OK status, or the model ids that were listed instead.
    """
    url = f"http://localhost:{frontend_port}/v1/models"
    deadline = time.monotonic() + timeout_s
    last_problem = "no probe completed"
    while time.monotonic() < deadline:
        try:
            resp = requests.get(url, timeout=2)
            if resp.ok:
                payload = resp.json()
                # Tolerate unexpected bodies instead of crashing the probe
                # with an AttributeError/TypeError.
                entries = payload.get("data") if isinstance(payload, dict) else None
                if not isinstance(entries, list):
                    entries = []
                listed = [m.get("id") for m in entries if isinstance(m, dict)]
                if model in listed:
                    return
                last_problem = (
                    f"HTTP {resp.status_code} but model not listed "
                    f"(listed: {listed!r})"
                )
            else:
                last_problem = f"HTTP {resp.status_code}"
        except (requests.RequestException, ValueError) as e:
            # ValueError covers JSON decode failures on requests versions
            # whose decode error is not a RequestException.
            last_problem = repr(e)
        time.sleep(0.5)
    # The probe runs after more than one suite, so the message does not name
    # a specific preceding step.
    pytest.fail(
        f"frontend unhealthy — /v1/models did not list "
        f"{model!r} within {timeout_s}s (last error: {last_problem})"
    )


def _run_bun_compliance(
    bun_binary: Path, openresponses_dir: Path, frontend_port: int
) -> None:
    """Run the OpenResponses `bin/compliance-test.ts` harness with bun
    against the frontend's `/v1` base URL.

    The harness runs from the suite checkout with a 180s timeout. Its
    transcript is attached to the Allure report and its output logged
    before the exit code is checked; a non-zero exit fails the test with
    both streams in the message.
    """
    endpoint = f"http://localhost:{frontend_port}/v1"
    logger.info("Running OpenResponses compliance suite against %s", endpoint)

    suite_dir = str(openresponses_dir)
    argv = [str(bun_binary), "run", "bin/compliance-test.ts"]
    argv += ["--base-url", endpoint]
    argv += ["--api-key", "sk-compliance-dummy"]
    argv += ["--model", COMPLIANCE_MODEL]
    argv.append("--verbose")

    proc = subprocess.run(
        argv,
        cwd=suite_dir,
        capture_output=True,
        text=True,
        timeout=180,
    )

    _attach_subprocess_log(
        name="bun_compliance_suite.log",
        cmd=argv,
        result=proc,
        cwd=suite_dir,
    )
    if proc.stdout:
        logger.info("compliance stdout:\n%s", proc.stdout)
    if proc.stderr:
        logger.info("compliance stderr:\n%s", proc.stderr)

    if proc.returncode == 0:
        return
    pytest.fail(
        f"OpenResponses compliance suite failed (exit={proc.returncode}).\n"
        f"stdout:\n{proc.stdout}\n\nstderr:\n{proc.stderr}"
    )


def _write_codex_config(codex_home, frontend_port: int) -> None:
    """Write `config.toml` into `codex_home`, defining a `local` model
    provider that targets the frontend's `/v1` endpoint over the
    `responses` wire API and reads its key from `LOCAL_API_KEY`.

    `codex_home` is created if missing (parents included). Keeping the
    config in a per-test directory leaves any global Codex state on the
    runner untouched.
    """
    codex_home.mkdir(parents=True, exist_ok=True)
    toml_lines = [
        "[model_providers.local]",
        'name = "local-dynamo"',
        f'base_url = "http://localhost:{frontend_port}/v1"',
        'wire_api = "responses"',
        'env_key = "LOCAL_API_KEY"',
    ]
    (codex_home / "config.toml").write_text("\n".join(toml_lines) + "\n")


def _run_codex_exec_smoke(
    codex_cli: Path, node_bin: Path, codex_home, cwd, marker_filename: str
) -> None:
    """Drive `codex exec` through the Dynamo Responses endpoint and check
    that its shell tool really ran.

    codex is asked to list `cwd`, which holds `marker_filename`. The name
    can only reach stdout if the model issued a tool call and codex
    executed `ls`; answering from prior knowledge fails the assertion. This
    covers the full Responses API tool-calling chain rather than plain text
    generation.

    The run is limited to 180s. The transcript is attached to Allure and
    the output logged before any check; the test fails on a non-zero exit
    or when the marker is missing from stdout.
    """
    logger.info("Running codex exec smoke test against CODEX_HOME=%s", codex_home)

    workdir = str(cwd)
    home = str(codex_home)
    # CODEX_HOME scopes codex's own state, but the shell tool launched under
    # `--dangerously-bypass-approvals-and-sandbox` inherits HOME for any
    # shell/helper reads and writes. Point both at `codex_home` so nothing
    # escapes the per-test directory.
    extra_env = {
        "CODEX_HOME": home,
        "HOME": home,
        "LOCAL_API_KEY": "sk-none",
    }
    # codex is a node script (`#!/usr/bin/env node`): put the fixture-
    # installed node first on PATH so the shebang resolves to it rather
    # than to a system node on the runner.
    child_env = _agent_subprocess_env(extra_env, path_prepend=[node_bin])

    prompt = (
        "What files exist in the current working directory? "
        "Use your shell tool to run ls and report each filename verbatim "
        "from the output."
    )
    argv = [
        str(codex_cli),
        "-m",
        COMPLIANCE_MODEL,
        "-c",
        "model_provider=local",
        "exec",
        prompt,
        "--dangerously-bypass-approvals-and-sandbox",
    ]
    proc = subprocess.run(
        argv,
        cwd=workdir,
        env=child_env,
        capture_output=True,
        text=True,
        timeout=180,
    )

    _attach_subprocess_log(
        name="codex_exec_smoke.log",
        cmd=argv,
        result=proc,
        extra_env=extra_env,
        cwd=workdir,
    )
    if proc.stdout:
        logger.info("codex stdout:\n%s", proc.stdout)
    if proc.stderr:
        logger.info("codex stderr:\n%s", proc.stderr)

    if proc.returncode != 0:
        pytest.fail(
            f"codex exec failed (exit={proc.returncode}).\n"
            f"stdout:\n{proc.stdout}\n\nstderr:\n{proc.stderr}"
        )

    if marker_filename in proc.stdout:
        return
    pytest.fail(
        "codex exec did not report the marker file — expected stdout to "
        f"contain {marker_filename!r} (implies the shell tool was invoked "
        f"and actually ran `ls` in {cwd}). Got:\n{proc.stdout}"
    )


def _run_claude_exec_smoke(
    claude_cli: Path,
    node_bin: Path,
    claude_home,
    cwd,
    marker_filename: str,
    frontend_port: int,
) -> None:
    """Drive `claude -p` through the Dynamo Anthropic Messages endpoint and
    check that its Bash tool really ran.

    Uses the same marker-file technique as the codex step, but over
    /v1/messages: `marker_filename` can only show up in stdout if claude
    invoked its Bash tool, so a text-only answer fails the assertion.

    HOME points at `claude_home` so no session state lands in the runner's
    `~/.claude`. `ANTHROPIC_AUTH_TOKEN` is set even though Dynamo ignores
    its value: with a fresh HOME and no cached OAuth the CLI otherwise
    aborts with "Not logged in".

    The run is limited to 180s. The transcript is attached to Allure and
    the output logged before any check; the test fails on a non-zero exit
    or when the marker is missing from stdout.
    """
    base_url = f"http://localhost:{frontend_port}"
    logger.info("Running claude exec smoke test against %s", base_url)

    workdir = str(cwd)
    extra_env = {
        "HOME": str(claude_home),
        "ANTHROPIC_BASE_URL": base_url,
        "ANTHROPIC_AUTH_TOKEN": "sk-none",
    }
    # claude shells out to `node`; put the fixture-installed runtime first
    # on PATH so it is the one that resolves.
    child_env = _agent_subprocess_env(extra_env, path_prepend=[node_bin])

    prompt = (
        "What files exist in the current working directory? "
        "Use your shell tool to run ls and report each filename verbatim "
        "from the output."
    )
    argv = [
        str(claude_cli),
        "--model",
        COMPLIANCE_MODEL,
        "--dangerously-skip-permissions",
        "-p",
        prompt,
    ]
    proc = subprocess.run(
        argv,
        cwd=workdir,
        env=child_env,
        capture_output=True,
        text=True,
        timeout=180,
    )

    _attach_subprocess_log(
        name="claude_exec_smoke.log",
        cmd=argv,
        result=proc,
        extra_env=extra_env,
        cwd=workdir,
    )
    if proc.stdout:
        logger.info("claude stdout:\n%s", proc.stdout)
    if proc.stderr:
        logger.info("claude stderr:\n%s", proc.stderr)

    if proc.returncode != 0:
        pytest.fail(
            f"claude -p failed (exit={proc.returncode}).\n"
            f"stdout:\n{proc.stdout}\n\nstderr:\n{proc.stderr}"
        )

    if marker_filename in proc.stdout:
        return
    pytest.fail(
        "claude -p did not report the marker file — expected stdout to "
        f"contain {marker_filename!r} (implies the Bash tool was invoked "
        f"and actually ran `ls` in {cwd}). Got:\n{proc.stdout}"
    )