test(fault_tolerance): replace hand-rolled SSE parser with openai SDK (#8536)

Signed-off-by: Dmitry Tokarev <dtokarev@nvidia.com>
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

test(fault_tolerance): replace hand-rolled SSE parser with openai SDK (#8536)
Signed-off-by: Dmitry Tokarev <dtokarev@nvidia.com>
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
48911230 · Dmitry Tokarev · GitHub · a2a2753d · 48911230 · 48911230
Unverified Commit 48911230 authored Apr 22, 2026 by Dmitry Tokarev Committed by GitHub Apr 22, 2026
Show whitespace changes
Inline Side-by-side

Showing with 63 additions and 153 deletions

.pre-commit-config.yaml .pre-commit-config.yaml +1 -0

tests/fault_tolerance/migration/utils.py tests/fault_tolerance/migration/utils.py +62 -153

No files found.
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -90,3 +90,4 @@ repos:
        - filelock
        - pyyaml
        - prometheus_client>=0.23.1
+        - openai
--- a/tests/fault_tolerance/migration/utils.py
+++ b/tests/fault_tolerance/migration/utils.py
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
-import json
 import logging
 import re
 import threading
@@ -9,6 +8,7 @@ import time
 import pytest
 import requests
+from openai import APIError, OpenAI
 from tests.utils.constants import FAULT_TOLERANCE_MODEL_NAME
 from tests.utils.managed_process import (
@@ -46,34 +46,18 @@ class DynamoFrontendProcess(BaseDynamoFrontendProcess):
        )
-def _parse_completion_sse_content(line: str) -> str | Exception | None:
+def _make_client(frontend_port: int) -> OpenAI:
-    """
+    """Build an OpenAI client pointed at the test frontend.
-    Parse an SSE line from the completions API and extract the text content.
-    Args:
-        line: Raw SSE line string
-    Returns:
+    max_retries=0 so fault-tolerance tests see the first error instead of
-        str: The text content if found
+    silent retries; api_key is a placeholder since the frontend doesn't auth.
-        Exception: If error event or parse error
-        None: If no content (e.g., [DONE] or empty)
    """
-    if line.startswith("event: error"):
+    return OpenAI(
-        return Exception(f"SSE error event received: {line}")
+        base_url=f"http://localhost:{frontend_port}/v1",
+        api_key="not-needed",
-    if not line.startswith("data: "):
+        max_retries=0,
-        return None  # Skip non-data lines
+        timeout=240,
+    )
-    data_str = line[6:]  # Remove "data: " prefix
-    if data_str == "[DONE]":
-        return None
-    try:
-        chunk = json.loads(data_str)
-        text = chunk["choices"][0].get("text")
-        return text  # May be None if no text content
-    except Exception as e:
-        return Exception(f"Error parsing response chunk: {e}")
 def start_completion_request(
@@ -102,14 +86,6 @@ def start_completion_request(
        prompt = "Tell me a long long long story about yourself?"
        if use_long_prompt:
            prompt += " Make sure it is" + " long" * 8000 + "!"
-        timeout = 240  # Extended timeout for long request
-        payload = {
-            "model": FAULT_TOLERANCE_MODEL_NAME,
-            "prompt": prompt,
-            "stream": stream,
-        }
-        headers = {"Content-Type": "application/json"}
        logger.info(
            f"Sending completion request (stream={stream}) with prompt: '{prompt[:50]}...'"
@@ -118,45 +94,30 @@ def start_completion_request(
        response_list.append((None, time.time()))  # start timestamp
        try:
-            with requests.post(
+            client = _make_client(frontend_port)
-                f"http://localhost:{frontend_port}/v1/completions",
-                headers=headers,
-                json=payload,
-                timeout=timeout,
-                stream=stream,
-            ) as response:
-                logger.info(
-                    f"Received response with status code: {response.status_code}"
-                )
-                if response.status_code != 200:
-                    response_list.append(
-                        (
-                            Exception(
-                                f"Request failed with status {response.status_code}: {response.text}"
-                            ),
-                            time.time(),
-                        )
-                    )
-                    return
            if stream:
-                    for line in response.iter_lines():
+                for chunk in client.completions.create(
-                        if line:
+                    model=FAULT_TOLERANCE_MODEL_NAME,
-                            content = _parse_completion_sse_content(
+                    prompt=prompt,
-                                line.decode("utf-8")
+                    stream=True,
-                            )
+                ):
-                            if content is not None:
+                    text = chunk.choices[0].text if chunk.choices else None
-                                response_list.append((content, time.time()))
+                    # Match the original hand-rolled parser: keep empty strings,
+                    # drop only None. Empty chunks (e.g. the first stream frame)
+                    # still count as a response arrival for delay measurement.
+                    if text is not None:
+                        response_list.append((text, time.time()))
            else:
-                    try:
+                resp = client.completions.create(
-                        content = response.json()["choices"][0]["text"]
+                    model=FAULT_TOLERANCE_MODEL_NAME,
-                        response_list.append((content, time.time()))
+                    prompt=prompt,
-                    except Exception as e:
+                    stream=False,
-                        response_list.append(
-                            (Exception(f"Error parsing response: {e}"), time.time())
                )
+                response_list.append((resp.choices[0].text, time.time()))
        except Exception as e:
+            # openai.APIError subclasses cover HTTP non-200, mid-stream
+            # structured `data: {"error": {...}}` frames, connection failures,
+            # and timeouts. Any other exception type is logged and recorded too.
            logger.error(f"Request failed with error: {e}")
            response_list.append((e, time.time()))
@@ -166,36 +127,6 @@ def start_completion_request(
    return request_thread, response_list
-def _parse_chat_completion_sse_content(line: str) -> str | Exception | None:
-    """
-    Parse an SSE line and extract the content.
-    Args:
-        line: Raw SSE line string
-    Returns:
-        str: The content delta if found
-        Exception: If error event or parse error
-        None: If no content (e.g., [DONE] or empty delta)
-    """
-    if line.startswith("event: error"):
-        return Exception(f"SSE error event received: {line}")
-    if not line.startswith("data: "):
-        return None  # Skip non-data lines
-    data_str = line[6:]  # Remove "data: " prefix
-    if data_str == "[DONE]":
-        return None
-    try:
-        chunk = json.loads(data_str)
-        content = chunk["choices"][0]["delta"].get("content")
-        return content  # May be None if delta has no content
-    except Exception as e:
-        return Exception(f"Error parsing response chunk: {e}")
 def start_chat_completion_request(
    frontend_port: int, stream: bool, use_long_prompt: bool = False
 ) -> tuple:
@@ -222,14 +153,6 @@ def start_chat_completion_request(
        prompt = "Tell me a long long long story about yourself?"
        if use_long_prompt:
            prompt += " Make sure it is" + " long" * 8000 + "!"
-        timeout = 240  # Extended timeout for long request
-        payload = {
-            "model": FAULT_TOLERANCE_MODEL_NAME,
-            "messages": [{"role": "user", "content": prompt}],
-            "stream": stream,
-        }
-        headers = {"Content-Type": "application/json"}
        logger.info(
            f"Sending chat completion request (stream={stream}) with prompt: '{prompt[:50]}...'"
@@ -238,45 +161,31 @@ def start_chat_completion_request(
        response_list.append((None, time.time()))  # start timestamp
        try:
-            with requests.post(
+            client = _make_client(frontend_port)
-                f"http://localhost:{frontend_port}/v1/chat/completions",
-                headers=headers,
-                json=payload,
-                timeout=timeout,
-                stream=stream,
-            ) as response:
-                logger.info(
-                    f"Received response with status code: {response.status_code}"
-                )
-                if response.status_code != 200:
-                    response_list.append(
-                        (
-                            Exception(
-                                f"Request failed with status {response.status_code}: {response.text}"
-                            ),
-                            time.time(),
-                        )
-                    )
-                    return
            if stream:
-                    for line in response.iter_lines():
+                for chunk in client.chat.completions.create(
-                        if line:
+                    model=FAULT_TOLERANCE_MODEL_NAME,
-                            content = _parse_chat_completion_sse_content(
+                    messages=[{"role": "user", "content": prompt}],
-                                line.decode("utf-8")
+                    stream=True,
-                            )
+                ):
+                    content = chunk.choices[0].delta.content if chunk.choices else None
+                    # Match the original hand-rolled parser: keep empty strings,
+                    # drop only None. Empty chunks (e.g. the first `role`-only
+                    # stream frame) still count as a response arrival for delay
+                    # measurement.
                    if content is not None:
                        response_list.append((content, time.time()))
            else:
-                    try:
+                resp = client.chat.completions.create(
-                        content = response.json()["choices"][0]["message"]["content"]
+                    model=FAULT_TOLERANCE_MODEL_NAME,
-                        response_list.append((content, time.time()))
+                    messages=[{"role": "user", "content": prompt}],
-                    except Exception as e:
+                    stream=False,
-                        response_list.append(
-                            (Exception(f"Error parsing response: {e}"), time.time())
                )
+                response_list.append((resp.choices[0].message.content, time.time()))
        except Exception as e:
+            # openai.APIError subclasses cover HTTP non-200, mid-stream
+            # structured `data: {"error": {...}}` frames, connection failures,
+            # and timeouts. Any other exception type is logged and recorded too.
            logger.error(f"Request failed with error: {e}")
            response_list.append((e, time.time()))
@@ -634,12 +543,12 @@ def run_migration_test(
            pytest.fail(
                "Request succeeded unexpectedly when migration should have failed"
            )
-        except Exception as e:
+        except APIError as e:
-            error_str = str(e)
+            # Expected: openai.APIError covers mid-stream structured error
-            assert (
+            # frames (DIS-1768 contract) and HTTP non-200 responses. A typed
-                "SSE error event received:" in error_str
+            # check is more robust than matching the exception's stringified
-                or "Request failed with status" in error_str
+            # message against a specific wire-format prefix.
-            ), f"Unexpected error: {e}"
+            logger.info(f"Got expected APIError: {e}")
        try:
            verify_migration_occurred(frontend)