test: fault tolerance tests (#1444)

Signed-off-by: Neelay Shah <neelays@nvidia.com> Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>

test: fault tolerance tests (#1444)
Signed-off-by: Neelay Shah <neelays@nvidia.com> Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
36f03d40 · Neelay Shah · GitHub · fb213a2f · 36f03d40 · 36f03d40
Unverified Commit 36f03d40 authored Jul 03, 2025 by Neelay Shah Committed by GitHub Jul 03, 2025
20 changed files
--- a/deploy/sdk/src/dynamo/sdk/cli/circus.py
+++ b/deploy/sdk/src/dynamo/sdk/cli/circus.py
@@ -86,6 +86,23 @@ def create_circus_watcher(
    use_sockets: bool = True,
    **kwargs: Any,
 ) -> Watcher:
+    log_dir = os.environ.get("DYN_CIRCUS_LOG_DIR", None)
+    if log_dir is not None:
+        prefix = f"{log_dir}/{name}"
+        os.makedirs(prefix, exist_ok=True)
+        stdout_stream = {
+            "class": "FileStream",
+            "filename": f"{prefix}/output.log",
+            "backup_count": 10,
+        }
+        stderr_stream = {
+            "class": "FileStream",
+            "filename": f"{prefix}/error.log",
+            "backup_count": 10,
+        }
+    else:
+        stdout_stream = None
+        stderr_stream = None
    return Watcher(
        name=name,
        cmd=shlex.quote(cmd) if psutil.POSIX else cmd,
@@ -94,7 +111,10 @@ def create_circus_watcher(
        stop_children=True,
        use_sockets=use_sockets,
        graceful_timeout=86400,
-        respawn=False,  # TODO
+        respawn=os.environ.get("DYN_CIRCUS_RESPAWN", "false").lower()
+        in ("true", "1", "yes"),
+        stdout_stream=stdout_stream,
+        stderr_stream=stderr_stream,
        **kwargs,
    )

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -162,6 +162,8 @@ markers = [
    "weekly: marks tests to run weekly",
    "gpu_1: marks tests to run on GPU",
    "gpu_2: marks tests to run on 2GPUs",
+    "gpu_4: marks tests to run on 4GPUs",
+    "gpu_8: marks tests to run on 8GPUs",
    "e2e: marks tests as end-to-end tests",
    "integration: marks tests as integration tests",
    "unit: marks tests as unit tests",

--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -15,6 +15,7 @@
 import logging
 import os
+import shutil
 import tempfile
 import pytest
@@ -23,15 +24,31 @@ from tests.utils.managed_process import ManagedProcess
 # Custom format inspired by your example
 LOG_FORMAT = "[TEST] %(asctime)s %(levelname)s %(name)s: %(message)s"
+DATE_FORMAT = "%Y-%m-%dT%H:%M:%S"
 # Configure logging
 logging.basicConfig(
    level=logging.INFO,
    format=LOG_FORMAT,
-    datefmt="%Y-%m-%dT%H:%M:%S",  # ISO 8601 UTC format
+    datefmt=DATE_FORMAT,  # ISO 8601 UTC format
 )
+@pytest.fixture(autouse=True)  # autouse: pytest applies this to tests in scope without them requesting it
+def logger(request):  # per-test fixture: mirrors root-logger output into "<test name>/test.log.txt"
+    log_path = os.path.join(request.node.name, "test.log.txt")  # log directory is named after the test node
+    logger = logging.getLogger()  # root logger; note this local shadows the fixture's own name
+    shutil.rmtree(request.node.name, ignore_errors=True)  # discard any directory left over from an earlier run
+    os.makedirs(request.node.name, exist_ok=True)
+    handler = logging.FileHandler(log_path, mode="w")
+    formatter = logging.Formatter(LOG_FORMAT, datefmt=DATE_FORMAT)  # same format/date format passed to logging.basicConfig
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+    yield  # the test runs here; no value is handed to the test
+    handler.close()  # teardown: close the log file, then detach the handler from the root logger
+    logger.removeHandler(handler)
 def pytest_collection_modifyitems(config, items):
    """
    This function is called to modify the list of tests to run.
@@ -69,7 +86,7 @@ class EtcdServer(ManagedProcess):
            timeout=timeout,
            display_output=False,
            health_check_ports=[port],
-            data_dir=tempfile.mkdtemp(prefix="etcd_"),
+            data_dir=data_dir,
            log_dir=request.node.name,
        )

--- a/tests/fault_tolerance/README.md
+++ b/tests/fault_tolerance/README.md
+<!--
+SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: Apache-2.0
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+# Fault Tolerance Test Suite
+As a large scale distributed inference serving framework in addition
+to providing high throughput and low latency, Dynamo needs to
+provide fault detection, resiliency, and quick recovery in the face of
+unforeseen failures. In order to test Dynamo we are developing a test
+suite to inject and measure the impact of different types of failure
+conditions.
+## Test Architecture
+The fault tolerance test suite is designed as a set of pytest
+configurations that launch typical dynamo serve graph deployments and
+then inject failures by terminating processes in the graph. To test
+the recovery time and impact of failures a set number of clients are
+launched in parallel. Each client sends a set number of synchronous
+requests. Log files are stored for each dynamo process as well as for
+each client and inspected using a post-processing script.
+> [!NOTE]
+> Test pass / failure is not an indication of SLA for recovery or resilience.
+> It only indicates that the test was executed and data was collected.
+> [!NOTE]
+> The test suite currently targets single node Dynamo Serve.
+> Support for Dynamo Deploy is a work in progress.
+###  Test Sequence Diagram
+```mermaid
+sequenceDiagram
+    participant Tester as Test Runner
+    participant Dynamo as DynamoServeProcess
+    participant Circus as CircusController
+    participant Client as Test Clients
+    participant Metrics as Metrics Collector
+    Tester->>Dynamo: Launch deployment graph
+    Dynamo-->>Tester: Signal ready
+    Tester->>Metrics: Start metrics collection
+    Tester->>Client: Spawn multiple clients
+    loop During Test
+        Client->>Dynamo: Send chat completion requests
+        Dynamo-->>Client: Return responses
+        Metrics->>Dynamo: Collect runtime metrics
+    end
+    Tester->>Dynamo: Inject failures (terminate components)
+    Dynamo-->>Tester: Recover/respawn as configured
+    Client-->>Tester: Log request results
+    Metrics-->>Tester: Log metrics data
+    Tester->>Dynamo: Shutdown deployment
+    Tester->>Metrics: Stop metrics collection
+    Tester->>Tester: Parse logs and summarize results
+```
+### Failure Scenarios
+The test suite includes several predefined fault injection scenarios designed to validate system resilience under various failure conditions. These scenarios are configured in `scenarios.py` and can be selected via pytest parameters. Below is a description of the available scenarios:
+| Scenario Name          | Description                                                                 | Affected Components                             | Timing Example     |
+|------------------------|-----------------------------------------------------------------------------|-------------------------------------------------|--------------------|
+| **decode_worker**      | Terminates decoder worker processes                                         | `dynamo_vllmworker`                             | 30 seconds         |
+| **prefill_worker**     | Terminates prefill worker processes                                         | `dynamo_prefillworker`                          | 30 seconds         |
+| **frontend**           | Terminates frontend processes handling client requests                      | `dynamo_frontend`                               | 30 seconds         |
+| **processor**          | Terminates processor nodes responsible for task orchestration               | `dynamo_processor`                              | 30 seconds         |
+| **vllm_worker**        | Terminates low-level VLLM worker processes                                  | `vllm_worker` (external to Dynamo)              | 30 seconds         |
+| **none**               | Baseline scenario with no failures                                          | N/A                                             | N/A                |
+#### Key Characteristics:
+1. **Timing**: Failures are injected at predefined intervals (e.g., 30 seconds after test start)
+2. **Severity**: The number of terminated processes can be configured (default: 1)
+3. **Scope**: Failures target specific components while leaving others operational
+#### Configuration:
+- **Injection Timing**: Defined in `failure_scenarios` dictionary in `scenarios.py`
+- **Process Count**: Adjustable via tuple values (e.g., `("dynamo_vllmworker", 1)` terminates 1 process)
+- **Component Mapping**:
+  - `dynamo_*`: Internal Dynamo services
+  - `vllm_worker`: External VLLM model workers
+#### Example Scenario Execution:
+Run all graph configurations injecting a decode_worker failure.
+```bash
+cd tests/fault_tolerance
+pytest test_runner.py -k decode_worker
+```
+### Test Results Directory
+For each test scenario a directory of log files is created and post processed to summarize the test.
+```
+test_worker_failure[agg-tp-2-dp-4-none]
+.
+├── client_0.log.txt
+├── client_1.log.txt
+├── client_2.log.txt
+├── client_3.log.txt
+├── client_4.log.txt
+├── client_5.log.txt
+├── client_6.log.txt
+├── client_7.log.txt
+├── dynamo_Frontend
+│   ├── error.log
+│   └── output.log
+├── dynamo.log.txt
+├── dynamo_Planner
+│   ├── error.log
+│   └── output.log
+├── dynamo_Processor
+│   ├── error.log
+│   └── output.log
+├── dynamo_VllmWorker
+│   ├── error.log
+│   └── output.log
+├── etcd.log.txt
+├── nats-server.log.txt
+├── nvidia-smi.log.txt
+├── test.log.txt
+└── watcher.log.txt
+```
+| File/Directory Name                | Description                                                                                      |
+|------------------------------------|------------------------------------------------------------------------------------------------|
+| **client_*.log.txt**               | Request/response logs for each client instance (contains JSON-formatted request details)        |
+| **dynamo_*/error.log**             | Error logs for specific Dynamo components (e.g., Frontend, Processor, VllmWorker)               |
+| **dynamo_*/output.log**            | Standard output logs for Dynamo components (service startup/shutdown messages)                |
+| **dynamo.log.txt**                 | Aggregate logs for Dynamo services (orchestration and initialization)                           |
+| **etcd.log.txt**                   | Logs for etcd, the distributed key-value store used for service coordination                    |
+| **nats-server.log.txt**            | Logs for NATS message broker, handling inter-service communication                             |
+| **nvidia-smi.log.txt**             | GPU monitoring logs (records utilization statistics during test execution)                      |
+| **test.log.txt**                   | Primary test execution log (contains fault injection timing, process management, and test status)|
+| **watcher.log.txt**                | Metrics collected by the watcher service (e.g., pending requests, active workers)               |
+### Summary Results
+Results are presented in table format after each test providing summary statistics.
+**Test Group:** agg-tp-2-dp-1
+**Test Command:**  dynamo serve graphs.agg:Frontend -f /workspace/tests/fault_tolerance/configs/agg_tp_2_dp_1.yaml --Frontend.port 8000 in /workspace/examples/llm
+|    Failure    |   Startup Time |   Success |   Failed |   Latency Before |   Latency After |   Pending Before |   Pending After |   Violations Before |   Violations After |   Recovery Time |
+|:-------------:|---------------:|----------:|---------:|-----------------:|----------------:|-----------------:|----------------:|--------------------:|-------------------:|----------------:|
+|     none      |          56.00 |    800.00 |     0.00 |             1.97 |             N/A |             0.00 |             N/A |                8.00 |                N/A |             N/A |
+|   frontend    |          56.00 |    656.00 |   144.00 |             1.96 |            1.96 |             0.00 |            0.00 |                0.00 |               0.00 |           17.53 |
+|   processor   |          57.00 |    584.00 |   216.00 |             1.96 |            1.96 |             0.00 |            0.00 |                0.00 |               0.00 |           25.96 |
+| decode_worker |          80.00 |    520.00 |   280.00 |             2.01 |            1.98 |             0.00 |            0.00 |                8.00 |               8.00 |           37.99 |
+|  vllm_worker  |          58.00 |    120.00 |   680.00 |             1.98 |             nan |             0.00 |            0.00 |                0.00 |               0.00 |             N/A |
+| Column Name           | Description                                                                 |
+|-----------------------|-----------------------------------------------------------------------------|
+| **Failure**           | Type of fault injection applied during the test (or 'none' for baseline)     |
+| **Startup Time**      | Time (seconds) taken for the service to become ready after initialization    |
+| **Success**           | Number of client requests that succeeded                                    |
+| **Failed**            | Number of client requests that failed or were invalid                       |
+| **Latency Before**    | Average request latency (seconds) for successful requests before fault injection |
+| **Latency After**     | Average request latency (seconds) for successful requests after fault injection (N/A if no fault) |
+| **Pending Before**    | Average number of pending requests observed before fault injection          |
+| **Pending After**     | Average number of pending requests observed after fault injection (N/A if no fault) |
+| **Violations Before** | Number of successful requests exceeding SLA latency before fault injection  |
+| **Violations After**  | Number of successful requests exceeding SLA latency after fault injection (N/A if no fault) |
+| **Recovery Time**     | Time (seconds) taken for failed components to recover after fault injection  |
+## Example Results
+The following results were obtained running on a single node with 8
+L40 GPUs using "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" with 8
+concurrent clients each sending 100 requests.
+### Aggregated Workers
+#### No Redundancy
+To demonstrate the failure and recovery time in the case that there is
+a single instance of each process we ran a simple "agg-tp-2-dp-1" configuration.
+```mermaid
+graph LR
+    Client["Client"]
+    Frontend["Frontend"]
+    Processor["Processor"]
+    Client --> Frontend
+    Frontend --> Processor
+    Processor --> DecodePool
+    %% Decode Worker Pool (vertical layout)
+    subgraph DecodePool["Decode Worker Pool"]
+        direction TB
+        subgraph Decode1["Decode 1"]
+            direction TB
+            D1GPU0["GPU 0"]
+            D1GPU1["GPU 1"]
+        end
+    end
+    %% Styling
+    style DecodePool stroke:#000,stroke-width:2px
+```
+#### Results:
+**Test Group:** agg-tp-2-dp-1
+**Test Command:**  dynamo serve graphs.agg:Frontend -f /workspace/tests/fault_tolerance/configs/agg_tp_2_dp_1.yaml --Frontend.port 8000 in /workspace/examples/llm
+|    Failure    |   Startup Time |   Success |   Failed |   Latency Before |   Latency After |   Pending Before |   Pending After |   Violations Before |   Violations After |   Recovery Time |
+|:-------------:|---------------:|----------:|---------:|-----------------:|----------------:|-----------------:|----------------:|--------------------:|-------------------:|----------------:|
+|     none      |          56.00 |    800.00 |     0.00 |             1.97 |             N/A |             0.00 |             N/A |                8.00 |                N/A |             N/A |
+|   frontend    |          56.00 |    656.00 |   144.00 |             1.96 |            1.96 |             0.00 |            0.00 |                0.00 |               0.00 |           17.53 |
+|   processor   |          57.00 |    584.00 |   216.00 |             1.96 |            1.96 |             0.00 |            0.00 |                0.00 |               0.00 |           25.96 |
+| decode_worker |          80.00 |    520.00 |   280.00 |             2.01 |            1.98 |             0.00 |            0.00 |                8.00 |               8.00 |           37.99 |
+|  vllm_worker  |          58.00 |    120.00 |   680.00 |             1.98 |             nan |             0.00 |            0.00 |                0.00 |               0.00 |             N/A |
+#### Summary:
+1. Dynamo does not currently detect and recover from direct vllm worker sub process failure. (WIP)
+2. Recovery time for the decode worker itself is the largest and a decode worker failure has the largest impact (as expected)
+3. Overall failure count is roughly equal to recovery time multiplied by number of clients (as expected).
+#### Redundant Workers (Over Provisioned)
+To demonstrate the failure and recovery time in the case that there
+are multiple instances of each process (except for the frontend) we
+ran a simple "agg-tp-2-dp-4" configuration.
+In this case we also consider the system to be "over provisioned" for
+the workload as multiple workers are not needed to maintain SLA for
+the 8 clients.
+```mermaid
+graph LR
+    Client["Client"]
+    Frontend["Frontend"]
+    Processor_1["Processor 1"]
+    Processor_2["Processor 2"]
+    Client --> Frontend
+    Frontend --> Processor_1
+    Frontend --> Processor_2
+    subgraph DecodePool["Decode Worker Pool"]
+        direction LR
+        subgraph Decode1["Decode 1"]
+            direction TB
+            D1GPU0["GPU 0"]
+            D1GPU1["GPU 1"]
+        end
+        subgraph Decode2["Decode 2"]
+            direction TB
+            D2GPU0["GPU 0"]
+            D2GPU1["GPU 1"]
+        end
+        subgraph Decode3["Decode 3"]
+            direction TB
+            D3GPU0["GPU 0"]
+            D3GPU1["GPU 1"]
+        end
+        subgraph Decode4["Decode 4"]
+            direction TB
+            D4GPU0["GPU 0"]
+            D4GPU1["GPU 1"]
+        end
+    end
+    Processor_1 --> DecodePool
+    Processor_2 --> DecodePool
+    style DecodePool stroke:#000,stroke-width:2px
+```
+#### Results:
+**Test Group:** agg-tp-2-dp-4
+**Test Command:**  dynamo serve graphs.agg:Frontend -f /workspace/tests/fault_tolerance/configs/agg_tp_2_dp_4.yaml --Frontend.port 8000 in /workspace/examples/llm
+|    Failure    |   Startup Time |   Success |   Failed |   Latency Before |   Latency After |   Pending Before |   Pending After |   Violations Before |   Violations After |   Recovery Time |
+|:-------------:|---------------:|----------:|---------:|-----------------:|----------------:|-----------------:|----------------:|--------------------:|-------------------:|----------------:|
+|     none      |          57.00 |    800.00 |     0.00 |             1.76 |             N/A |             0.00 |             N/A |                0.00 |                N/A |             N/A |
+|   frontend    |          57.00 |    672.00 |   128.00 |             1.77 |            1.74 |             0.00 |            0.00 |                0.00 |               0.00 |           16.65 |
+|   processor   |          52.00 |    680.00 |   120.00 |             1.79 |            1.78 |             0.00 |            0.00 |                0.00 |               0.00 |           21.25 |
+| decode_worker |          56.00 |    796.00 |     4.00 |             1.82 |            1.78 |             0.00 |            0.00 |                0.00 |               0.00 |           44.88 |
+|  vllm_worker  |          52.00 |    634.00 |   166.00 |             1.78 |            1.78 |             0.00 |            0.00 |                0.00 |               0.00 |             N/A |
+#### Summary:
+1. Dynamo does not currently detect and recover from direct vllm
+   worker sub process failure. In the case of redundant workers this
+   results in roughly 1/4 of the requests failing after the initial 30
+   seconds. (WIP)
+2. By immediately detecting a decode worker failure, Dynamo can limit
+   the failures and reroute requests to healthy workers with minimal
+   impact.
+3. While the processor was configured with redundancy - the system was
+   unable to instantiate two processors successfully leading to
+   failure when the processor was terminated. (WIP)
+#### Redundant Workers (Exact Provisioning)
+To demonstrate the failure and recovery time in the case that there
+are multiple instances of each process (except for the frontend) we
+ran a simple "agg-tp-2-dp-4" configuration.
+In this case we also consider the system to be "exact provisioned" for
+the workload as we limit the max-num-seqs for each decode worker to
+exactly 2. This artificially creates a scenario that results in queuing
+when a failure occurs before a worker is recovered.
+```mermaid
+graph LR
+    Client["Client"]
+    Frontend["Frontend"]
+    Processor_1["Processor 1"]
+    Processor_2["Processor 2"]
+    Client --> Frontend
+    Frontend --> Processor_1
+    Frontend --> Processor_2
+    subgraph DecodePool["Decode Worker Pool"]
+        direction LR
+        subgraph Decode1["Decode 1 (max 2 seq)"]
+            direction TB
+            D1GPU0["GPU 0"]
+            D1GPU1["GPU 1"]
+        end
+        subgraph Decode2["Decode 2 (max 2 seq)"]
+            direction TB
+            D2GPU0["GPU 0"]
+            D2GPU1["GPU 1"]
+        end
+        subgraph Decode3["Decode 3 (max 2 seq)"]
+            direction TB
+            D3GPU0["GPU 0"]
+            D3GPU1["GPU 1"]
+        end
+        subgraph Decode4["Decode 4 (max 2 seq)"]
+            direction TB
+            D4GPU0["GPU 0"]
+            D4GPU1["GPU 1"]
+        end
+    end
+    Processor_1 --> DecodePool
+    Processor_2 --> DecodePool
+    style DecodePool stroke:#000,stroke-width:2px
+```
+#### Results:
+**Test Group:** agg-tp-2-dp-4
+**Test Command:**  dynamo serve graphs.agg:Frontend -f /workspace/tests/fault_tolerance/configs/agg_tp_2_dp_4.yaml --Frontend.port 8000 --VllmWorker.max_num_seqs 2 in /workspace/examples/llm
+|    Failure    |   Startup Time |   Success |   Failed |   Latency Before |   Latency After |   Pending Before |   Pending After |   Violations Before |   Violations After |   Recovery Time |
+|:-------------:|---------------:|----------:|---------:|-----------------:|----------------:|-----------------:|----------------:|--------------------:|-------------------:|----------------:|
+|     none      |          57.00 |    800.00 |     0.00 |             1.77 |             N/A |             0.01 |             N/A |                0.00 |                N/A |             N/A |
+|   frontend    |          56.00 |    664.00 |   136.00 |             1.80 |            1.77 |             0.00 |            0.00 |                0.00 |               0.00 |           17.22 |
+|   processor   |          56.00 |    649.00 |   151.00 |             1.76 |            1.77 |             0.01 |            0.00 |                0.00 |               0.00 |           25.79 |
+| decode_worker |          56.00 |    798.00 |     2.00 |             1.77 |            1.89 |             0.00 |            0.13 |                0.00 |              84.00 |           44.57 |
+|  vllm_worker  |          56.00 |    632.00 |   168.00 |             1.80 |            2.23 |             0.00 |            0.38 |                0.00 |             232.00 |             N/A |
+#### Summary:
+1. Dynamo does not currently detect and recover from direct vllm
+   worker sub process failure. In the case of redundant workers this
+   results in roughly 1/4 of the requests failing after the initial 30
+   seconds. All requests after the initial 30 seconds would also be
+   subject to queuing as a result and we see increased SLA
+   violations. (WIP)
+2. By immediately detecting a decode worker failure, Dynamo can limit
+   the failures and reroute requests to healthy workers with minimal
+   impact. However during the recovery period requests are subject to
+   queuing and as a result we see increased SLA violations.
+3. While the processor was configured with redundancy - the system was
+   unable to instantiate two processors successfully leading to
+   failure when the processor was terminated. (WIP)
+### Disaggregated Workers
+#### No Redundancy
+To demonstrate the failure and recovery time in the case of a
+disaggregated deployment with a single instance for each process in
+the graph we ran a simple `disagg-p-tp-2-dp-1-d-tp-4-dp-1` configuration.
+```mermaid
+graph LR
+    Client["Client"]
+    Frontend["Frontend"]
+    Processor["Processor"]
+    PrefillQueue["Remote Prefill Queue"]
+    Client --> Frontend
+    Frontend --> Processor
+    Processor <--> DecodePool
+    %% Prefill Worker Pool (horizontal layout)
+    subgraph PrefillPool["Prefill Worker Pool"]
+        direction LR
+        subgraph Prefill1["Prefill 1"]
+            direction TB
+            P1GPU0["GPU 0"]
+            P1GPU1["GPU 1"]
+		end
+    end
+    %% Decode Worker Pool (vertical layout)
+    subgraph DecodePool["Decode Worker Pool"]
+        direction TB
+        subgraph Decode1["Decode 1"]
+            direction TB
+            D1GPU0["GPU 0"]
+            D1GPU1["GPU 1"]
+            D1GPU2["GPU 2"]
+            D1GPU3["GPU 3"]
+        end
+    end
+	PrefillQueue --> PrefillPool
+    DecodePool --> PrefillQueue
+    PrefillPool -.-> DecodePool
+    %% Styling
+    style PrefillPool stroke:#0066cc,stroke-width:2px
+    style DecodePool stroke:#000,stroke-width:2px
+```
+#### Results:
+**Test Group:** disagg-p-tp-2-dp-1-d-tp-4-dp-1
+**Test Command:**  dynamo serve graphs.disagg:Frontend -f /workspace/tests/fault_tolerance/configs/disagg_p_tp_2_dp_1_d_tp_4_dp_1.yaml --Frontend.port 8000 in /workspace/examples/llm
+|    Failure     |   Startup Time |   Success |   Failed |   Latency Before |   Latency After |   Pending Before |   Pending After |   Violations Before |   Violations After |   Recovery Time |
+|:--------------:|---------------:|----------:|---------:|-----------------:|----------------:|-----------------:|----------------:|--------------------:|-------------------:|----------------:|
+|      none      |          83.00 |    800.00 |     0.00 |             1.19 |             N/A |             0.01 |             N/A |                0.00 |                N/A |             N/A |
+|    frontend    |          78.00 |    664.00 |   136.00 |             1.19 |            1.19 |             0.07 |            0.02 |                0.00 |               0.00 |           17.24 |
+|   processor    |          77.00 |    576.00 |   224.00 |             1.19 |            1.19 |             0.00 |            0.00 |                0.00 |               0.00 |           26.90 |
+| decode_worker  |          72.00 |    200.00 |   600.00 |             1.20 |            1.28 |             0.03 |             N/A |                0.00 |               0.00 |             N/A |
+| prefill_worker |          81.00 |    798.00 |     2.00 |             1.19 |            1.22 |             0.05 |            0.05 |                0.00 |               0.00 |           42.31 |
+|  vllm_worker   |          83.00 |    797.00 |     3.00 |             1.19 |            1.22 |             0.00 |            0.03 |                0.00 |               8.00 |             N/A |
+#### Summary:
+1. Dynamo does not currently detect and recover from direct vllm
+   worker sub process failure. In this example the vllm sub process
+   failure targets a prefill worker and has the same overall impact.
+   (WIP)
+2. Prefill worker failure causes request timeout (30 sec) and in
+   addition during recovery time prefill requests are queued in the
+   prefill queue.
+3. Decode worker failure is currently permanent in the disaggregated
+   case as the prefill worker holds references to memory which are
+   not freed. This leads to total failure after fault injection.
+#### Redundant Workers
+To demonstrate the failure and recovery time in the case that there
+are multiple instances of each process (except for the frontend and
+decode worker) we ran a simple "disagg-p-tp-2-dp-2-d-tp-4-dp-1"
+configuration.
+```mermaid
+graph LR
+    Client["Client"]
+    Frontend["Frontend"]
+    Processor_1["Processor 1"]
+	Processor_2["Processor 2"]
+    PrefillQueue["Remote Prefill Queue"]
+    Client --> Frontend
+    Frontend --> Processor_1
+    Frontend --> Processor_2
+    Processor_1 <--> DecodePool
+	Processor_2 <--> DecodePool
+    %% Prefill Worker Pool (horizontal layout)
+    subgraph PrefillPool["Prefill Worker Pool"]
+        direction LR
+        subgraph Prefill1["Prefill 1"]
+            direction TB
+            P1GPU0["GPU 0"]
+            P1GPU1["GPU 1"]
+		end
+        subgraph Prefill2["Prefill 2"]
+            direction TB
+            P2GPU0["GPU 0"]
+            P2GPU1["GPU 1"]
+		end
+    end
+    %% Decode Worker Pool (vertical layout)
+    subgraph DecodePool["Decode Worker Pool"]
+        direction TB
+        subgraph Decode1["Decode 1"]
+            direction TB
+            D1GPU0["GPU 0"]
+            D1GPU1["GPU 1"]
+            D1GPU2["GPU 2"]
+            D1GPU3["GPU 3"]
+        end
+    end
+	PrefillQueue --> PrefillPool
+    DecodePool --> PrefillQueue
+    PrefillPool -.-> DecodePool
+    %% Styling
+    style PrefillPool stroke:#0066cc,stroke-width:2px
+    style DecodePool stroke:#000,stroke-width:2px
+```
+#### Results:
+**Test Group:** disagg-p-tp-2-dp-2-d-tp-4-dp-1
+**Test Command:**  dynamo serve graphs.disagg:Frontend -f /workspace/tests/fault_tolerance/configs/disagg_p_tp_2_dp_2_d_tp_4_dp_1.yaml --Frontend.port 8000 in /workspace/examples/llm
+|    Failure     |   Startup Time |   Success |   Failed |   Latency Before |   Latency After |   Pending Before |   Pending After |   Violations Before |   Violations After |   Recovery Time |
+|:--------------:|---------------:|----------:|---------:|-----------------:|----------------:|-----------------:|----------------:|--------------------:|-------------------:|----------------:|
+|      none      |          83.00 |    800.00 |     0.00 |             1.19 |             N/A |             0.00 |             N/A |                1.00 |                N/A |             N/A |
+|    frontend    |          82.00 |    704.00 |    96.00 |             1.19 |            1.17 |             0.00 |            0.01 |                1.00 |               0.00 |           12.95 |
+|   processor    |          78.00 |    795.00 |     5.00 |             1.20 |            1.18 |             0.02 |            0.01 |                1.00 |               0.00 |           25.91 |
+| decode_worker  |          78.00 |    199.00 |   601.00 |             1.21 |             nan |             0.00 |             N/A |                1.00 |               0.00 |             N/A |
+| prefill_worker |          77.00 |    800.00 |     0.00 |             1.22 |            1.18 |             0.00 |            0.01 |                1.00 |               1.00 |           45.14 |
+|  vllm_worker   |          77.00 |    799.00 |     1.00 |             1.20 |            1.16 |             0.02 |            0.00 |                1.00 |               1.00 |             N/A |
+#### Summary:
+1. Dynamo does not currently detect and recover from direct vllm
+   worker sub process failure. In this example the vllm sub process
+   failure targets a prefill worker and has the same overall impact.
+   Since the prefill workers are redundant - a failure has low impact.
+2. Redundant prefill workers are able to absorb the load and no
+   additional queuing is needed.
+3. Decode worker failure is currently permanent in the disaggregated
+   case as the prefill worker holds references to memory which are
+   not freed. This leads to total failure after fault injection.
+4. Redundant processors work in this case.
--- a/tests/fault_tolerance/__init__.py
+++ b/tests/fault_tolerance/__init__.py
--- a/tests/fault_tolerance/client.py
+++ b/tests/fault_tolerance/client.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import json
+import logging
+import os
+import random
+import time
+from datetime import datetime
+import requests
+def _get_random_prompt(length):
+    """Return ``length`` random single-digit tokens joined by single spaces."""
+    digits = list(map(str, range(10)))
+    sampled = random.choices(digits, k=length)
+    return " ".join(sampled)
+def _single_request(
+    url,
+    payload,
+    logger,
+    retry_attempts=1,
+    input_token_length=100,
+    output_token_length=100,
+    timeout=30,
+    retry_delay=1,
+):
+    """POST one chat request, retrying until a 200 response or attempts run out.
+
+    Args:
+        url: Endpoint the payload is posted to.
+        payload: Chat request dict. It is updated in place: the first message's
+            content is replaced by a random prompt and ``max_tokens`` is set.
+        logger: Logger used to report failed attempts.
+        retry_attempts: Maximum number of attempts; values <= 0 send nothing.
+        input_token_length: Number of random tokens in the generated prompt.
+        output_token_length: Value written to ``payload["max_tokens"]``.
+        timeout: Per-attempt timeout passed to ``requests.post``.
+        retry_delay: Seconds slept after every unsuccessful attempt.
+
+    Returns:
+        A dict with ``time`` (local completion timestamp), ``results`` (one
+        entry per attempt holding ``status``, ``result`` and
+        ``request_elapsed_time``) and ``total_time`` in seconds.
+    """
+    payload["messages"][0]["content"] = _get_random_prompt(input_token_length)
+    payload["max_tokens"] = output_token_length
+    start_time = time.time()
+    results = []
+    # Compare against zero so a negative attempt count cannot loop forever.
+    while retry_attempts > 0:
+        retry_attempts -= 1
+        start_request_time = time.time()
+        try:
+            response = requests.post(
+                url,
+                json=payload,
+                timeout=timeout,
+            )
+        except requests.RequestException as e:
+            # requests.Timeout is a RequestException, so this covers timeouts too.
+            results.append(
+                {
+                    "status": str(e),
+                    "result": None,
+                    "request_elapsed_time": time.time() - start_request_time,
+                }
+            )
+            logger.warning("Retrying due to Request failed: %s", e)
+            time.sleep(retry_delay)
+            continue
+        end_time = time.time()
+        try:
+            content = response.json()
+        except ValueError:
+            # Every JSON decode error requests can raise is a ValueError
+            # (stdlib json or simplejson); keep the HTTP status for such bodies.
+            content = None
+        results.append(
+            {
+                "status": response.status_code,
+                "result": content,
+                "request_elapsed_time": end_time - start_request_time,
+            }
+        )
+        if response.status_code == 200:
+            break
+        time.sleep(retry_delay)
+    return {
+        "time": datetime.now().strftime("%Y-%m-%dT%H:%M:%S"),
+        "results": results,
+        "total_time": time.time() - start_time,
+    }
+def client(
+    deployment_graph,
+    server_process,
+    payload,
+    log_dir,
+    index,
+    requests_per_client,
+    input_token_length,
+    output_token_length,
+    max_retries,
+    retry_delay=1,
+):
+    """Send a series of chat requests and log one JSON record per request.
+
+    Each record returned by ``_single_request`` is written as one line to
+    ``client_<index>.log.txt`` inside ``log_dir`` and flushed immediately.
+    Any exception is logged (with traceback) and ends the client early
+    instead of propagating to the caller.
+
+    Args:
+        deployment_graph: Object whose ``endpoints[0]`` is the URL path to hit.
+        server_process: Object whose ``port`` is the local port to connect to.
+        payload: Object whose ``payload_chat`` is the request body template.
+        log_dir: Directory receiving the per-client log file.
+        index: Client number used in the logger name and log file name.
+        requests_per_client: Number of requests to issue.
+        input_token_length: Prompt length forwarded to ``_single_request``.
+        output_token_length: ``max_tokens`` forwarded to ``_single_request``.
+        max_retries: Attempt budget forwarded to ``_single_request``.
+        retry_delay: Seconds to wait after a failed attempt.
+    """
+    logger = logging.getLogger(f"CLIENT: {index}")
+    try:
+        log_path = os.path.join(log_dir, f"client_{index}.log.txt")
+        with open(log_path, "w") as log:
+            url = f"http://localhost:{server_process.port}/{deployment_graph.endpoints[0]}"
+            for i in range(requests_per_client):
+                result = _single_request(
+                    url,
+                    payload.payload_chat,
+                    logger,
+                    max_retries,
+                    input_token_length=input_token_length,
+                    output_token_length=output_token_length,
+                    retry_delay=retry_delay,
+                )
+                # No attempt is recorded when max_retries is 0; report None
+                # rather than failing on an empty list and aborting the run.
+                last_attempt = result["results"][-1] if result["results"] else {}
+                logger.info(
+                    "Request: %s Status: %s Latency: %s",
+                    i,
+                    last_attempt.get("status"),
+                    last_attempt.get("request_elapsed_time"),
+                )
+                log.write(json.dumps(result) + "\n")
+                log.flush()
+    except Exception:
+        # Best-effort client: keep the traceback so failures are diagnosable.
+        logger.exception("Client %s aborted", index)
+    logger.info("Exiting")
--- a/tests/fault_tolerance/configs/agg_tp_1_dp_1.yaml
+++ b/tests/fault_tolerance/configs/agg_tp_1_dp_1.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+Common:
+  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  block-size: 64
+  max-model-len: 16384
+Frontend:
+  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  endpoint: dynamo.Processor.chat/completions
+  port: 8000
+Processor:
+  router: round-robin
+  router-num-threads: 4
+  common-configs: [model, block-size, max-model-len]
+VllmWorker:
+  enforce-eager: true
+  max-num-batched-tokens: 16384
+  enable-prefix-caching: true
+  ServiceArgs:
+    workers: 1
+    resources:
+      gpu: '1'
+  common-configs: [model, block-size, max-model-len]
+Planner:
+  environment: local
+  no-operation: true
\ No newline at end of file
--- a/tests/fault_tolerance/configs/agg_tp_1_dp_4.yaml
+++ b/tests/fault_tolerance/configs/agg_tp_1_dp_4.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+Common:
+  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  block-size: 64
+  max-model-len: 16384
+Frontend:
+  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  endpoint: dynamo.Processor.chat/completions
+  port: 8000
+Processor:
+  router: round-robin
+  router-num-threads: 4
+  common-configs: [model, block-size, max-model-len]
+VllmWorker:
+  enforce-eager: true
+  max-num-batched-tokens: 16384
+  enable-prefix-caching: true
+  ServiceArgs:
+    workers: 4
+    resources:
+      gpu: '1'
+  common-configs: [model, block-size, max-model-len]
+Planner:
+  environment: local
+  no-operation: true
--- a/tests/fault_tolerance/configs/agg_tp_1_dp_8.yaml
+++ b/tests/fault_tolerance/configs/agg_tp_1_dp_8.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+Common:
+  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  block-size: 64
+  max-model-len: 16384
+Frontend:
+  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  endpoint: dynamo.Processor.chat/completions
+  port: 8000
+Processor:
+  router: round-robin
+  router-num-threads: 4
+  common-configs: [model, block-size, max-model-len]
+VllmWorker:
+  enforce-eager: true
+  max-num-batched-tokens: 16384
+  enable-prefix-caching: true
+  ServiceArgs:
+    workers: 8
+    resources:
+      gpu: '1'
+  common-configs: [model, block-size, max-model-len]
+Planner:
+  environment: local
+  no-operation: true
--- a/tests/fault_tolerance/configs/agg_tp_2_dp_1.yaml
+++ b/tests/fault_tolerance/configs/agg_tp_2_dp_1.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+Common:
+  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  block-size: 64
+  max-model-len: 16384
+Frontend:
+  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  endpoint: dynamo.Processor.chat/completions
+  port: 8000
+Processor:
+  router: round-robin
+  router-num-threads: 4
+  common-configs: [model, block-size, max-model-len]
+VllmWorker:
+  enforce-eager: true
+  max-num-batched-tokens: 16384
+  enable-prefix-caching: true
+  tensor-parallel-size: 2
+  ServiceArgs:
+    workers: 1
+    resources:
+      gpu: '2'
+  common-configs: [model, block-size, max-model-len]
+Planner:
+  environment: local
+  no-operation: true
--- a/tests/fault_tolerance/configs/agg_tp_2_dp_2.yaml
+++ b/tests/fault_tolerance/configs/agg_tp_2_dp_2.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+Common:
+  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  block-size: 64
+  max-model-len: 16384
+Frontend:
+  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  endpoint: dynamo.Processor.chat/completions
+  port: 8000
+  ServiceArgs:
+    workers: 1
+    resources:
+      cpu: "10"
+      memory: "20Gi"
+Processor:
+  router: round-robin
+  router-num-threads: 4
+  common-configs: [model, block-size, max-model-len]
+  ServiceArgs:
+    workers: 2
+    resources:
+      cpu: "10"
+      memory: "20Gi"
+VllmWorker:
+  enforce-eager: true
+  max-num-batched-tokens: 16384
+  enable-prefix-caching: true
+  tensor-parallel-size: 2
+  ServiceArgs:
+    workers: 2
+    resources:
+      gpu: '2'
+  common-configs: [model, block-size, max-model-len]
+Planner:
+  environment: local
+  no-operation: true
--- a/tests/fault_tolerance/configs/agg_tp_2_dp_4.yaml
+++ b/tests/fault_tolerance/configs/agg_tp_2_dp_4.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+Common:
+  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  block-size: 64
+  max-model-len: 16384
+Frontend:
+  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  endpoint: dynamo.Processor.chat/completions
+  port: 8000
+Processor:
+  router: round-robin
+  router-num-threads: 4
+  common-configs: [model, block-size, max-model-len]
+  ServiceArgs:
+    workers: 2
+    resources:
+      cpu: "10"
+      memory: "20Gi"
+VllmWorker:
+  enforce-eager: true
+  max-num-batched-tokens: 16384
+  enable-prefix-caching: true
+  tensor-parallel-size: 2
+  ServiceArgs:
+    workers: 4
+    resources:
+      gpu: '2'
+  common-configs: [model, block-size, max-model-len]
+Planner:
+  environment: local
+  no-operation: true
--- a/tests/fault_tolerance/configs/disagg_p_tp_1_dp_1_d_tp_1_dp_1.yaml
+++ b/tests/fault_tolerance/configs/disagg_p_tp_1_dp_1_d_tp_1_dp_1.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+Common:
+  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  block-size: 64
+  max-model-len: 16384
+  kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
+Frontend:
+  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  endpoint: dynamo.Processor.chat/completions
+  port: 8000
+Processor:
+  router: round-robin
+  common-configs: [model, block-size]
+VllmWorker:
+  remote-prefill: true
+  conditional-disagg: true
+  max-local-prefill-length: 10
+  max-prefill-queue-size: 2
+  ServiceArgs:
+    workers: 1
+    resources:
+      gpu: '1'
+  common-configs: [model, block-size, max-model-len, kv-transfer-config]
+PrefillWorker:
+  max-num-batched-tokens: 16384
+  ServiceArgs:
+    workers: 1
+    resources:
+      gpu: '1'
+  common-configs: [model, block-size, max-model-len, kv-transfer-config]
+Planner:
+  environment: local
+  no-operation: true
--- a/tests/fault_tolerance/configs/disagg_p_tp_1_dp_1_d_tp_2_dp_1.yaml
+++ b/tests/fault_tolerance/configs/disagg_p_tp_1_dp_1_d_tp_2_dp_1.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+Common:
+  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  block-size: 64
+  max-model-len: 16384
+  kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
+Frontend:
+  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  endpoint: dynamo.Processor.chat/completions
+  port: 8000
+Processor:
+  router: round-robin
+  common-configs: [model, block-size]
+VllmWorker:
+  remote-prefill: true
+  conditional-disagg: true
+  max-local-prefill-length: 10
+  max-prefill-queue-size: 2
+  tensor-parallel-size: 2
+  ServiceArgs:
+    workers: 1
+    resources:
+      gpu: '2'
+  common-configs: [model, block-size, max-model-len, kv-transfer-config]
+PrefillWorker:
+  max-num-batched-tokens: 16384
+  tensor-parallel-size: 1
+  ServiceArgs:
+    workers: 1
+    resources:
+      gpu: '1'
+  common-configs: [model, block-size, max-model-len, kv-transfer-config]
+Planner:
+  environment: local
+  no-operation: true
--- a/tests/fault_tolerance/configs/disagg_p_tp_1_dp_2_d_tp_2_dp_1.yaml
+++ b/tests/fault_tolerance/configs/disagg_p_tp_1_dp_2_d_tp_2_dp_1.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+Common:
+  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  block-size: 64
+  max-model-len: 16384
+  kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
+Frontend:
+  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  endpoint: dynamo.Processor.chat/completions
+  port: 8000
+Processor:
+  router: round-robin
+  common-configs: [model, block-size]
+  ServiceArgs:
+    workers: 2
+    resources:
+      cpu: "10"
+      memory: "20Gi"
+VllmWorker:
+  remote-prefill: true
+  conditional-disagg: true
+  max-local-prefill-length: 10
+  max-prefill-queue-size: 2
+  tensor-parallel-size: 2
+  ServiceArgs:
+    workers: 1
+    resources:
+      gpu: '2'
+  common-configs: [model, block-size, max-model-len, kv-transfer-config]
+PrefillWorker:
+  max-num-batched-tokens: 16384
+  tensor-parallel-size: 1
+  ServiceArgs:
+    workers: 2
+    resources:
+      gpu: '1'
+  common-configs: [model, block-size, max-model-len, kv-transfer-config]
+Planner:
+  environment: local
+  no-operation: true
--- a/tests/fault_tolerance/configs/disagg_p_tp_1_dp_4_d_tp_4_dp_1.yaml
+++ b/tests/fault_tolerance/configs/disagg_p_tp_1_dp_4_d_tp_4_dp_1.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+Common:
+  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  block-size: 64
+  max-model-len: 16384
+  kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
+Frontend:
+  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  endpoint: dynamo.Processor.chat/completions
+  port: 8000
+Processor:
+  router: round-robin
+  common-configs: [model, block-size]
+VllmWorker:
+  remote-prefill: true
+  conditional-disagg: true
+  max-local-prefill-length: 10
+  max-prefill-queue-size: 2
+  tensor-parallel-size: 4
+  ServiceArgs:
+    workers: 1
+    resources:
+      gpu: '4'
+  common-configs: [model, block-size, max-model-len, kv-transfer-config]
+PrefillWorker:
+  max-num-batched-tokens: 16384
+  ServiceArgs:
+    workers: 4
+    resources:
+      gpu: '1'
+  common-configs: [model, block-size, max-model-len, kv-transfer-config]
+Planner:
+  environment: local
+  no-operation: true
--- a/tests/fault_tolerance/configs/disagg_p_tp_2_dp_1_d_tp_4_dp_1.yaml
+++ b/tests/fault_tolerance/configs/disagg_p_tp_2_dp_1_d_tp_4_dp_1.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+Common:
+  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  block-size: 64
+  max-model-len: 16384
+  kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
+Frontend:
+  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  endpoint: dynamo.Processor.chat/completions
+  port: 8000
+Processor:
+  router: round-robin
+  common-configs: [model, block-size]
+VllmWorker:
+  remote-prefill: true
+  conditional-disagg: true
+  max-local-prefill-length: 10
+  max-prefill-queue-size: 2
+  tensor-parallel-size: 4
+  ServiceArgs:
+    workers: 1
+    resources:
+      gpu: '4'
+  common-configs: [model, block-size, max-model-len, kv-transfer-config]
+PrefillWorker:
+  max-num-batched-tokens: 16384
+  tensor-parallel-size: 2
+  ServiceArgs:
+    workers: 1
+    resources:
+      gpu: '2'
+  common-configs: [model, block-size, max-model-len, kv-transfer-config]
+Planner:
+  environment: local
+  no-operation: true
--- a/tests/fault_tolerance/configs/disagg_p_tp_2_dp_2_d_tp_4_dp_1.yaml
+++ b/tests/fault_tolerance/configs/disagg_p_tp_2_dp_2_d_tp_4_dp_1.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+Common:
+  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  block-size: 64
+  max-model-len: 16384
+  kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
+Frontend:
+  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  endpoint: dynamo.Processor.chat/completions
+  port: 8000
+Processor:
+  router: round-robin
+  common-configs: [model, block-size]
+  ServiceArgs:
+    workers: 2
+    resources:
+      cpu: "10"
+      memory: "20Gi"
+VllmWorker:
+  remote-prefill: true
+  conditional-disagg: true
+  max-local-prefill-length: 10
+  max-prefill-queue-size: 2
+  tensor-parallel-size: 4
+  ServiceArgs:
+    workers: 1
+    resources:
+      gpu: '4'
+  common-configs: [model, block-size, max-model-len, kv-transfer-config]
+PrefillWorker:
+  max-num-batched-tokens: 16384
+  tensor-parallel-size: 2
+  ServiceArgs:
+    workers: 2
+    resources:
+      gpu: '2'
+  common-configs: [model, block-size, max-model-len, kv-transfer-config]
+Planner:
+  environment: local
+  no-operation: true
--- a/tests/fault_tolerance/conftest.py
+++ b/tests/fault_tolerance/conftest.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pytest
+def pytest_addoption(parser):
+    """Register the fault-tolerance command-line options on ``parser``."""
+    flag = {"action": "store_true", "default": False}
+    # (option name, keyword arguments), kept in registration order.
+    option_table = (
+        ("--requests-per-client", {"type": int, "default": 100}),
+        ("--clients", {"type": int, "default": 10}),
+        ("--no-respawn", flag),
+        ("--input-token-length", {"type": int, "default": 100}),
+        ("--output-token-length", {"type": int, "default": 100}),
+        ("--max-num-seqs", {"type": int, "default": None}),
+        ("--max-retries", {"type": int, "default": 1}),
+        ("--display-dynamo-output", flag),
+        ("--combine-process-logs", flag),
+        ("--hf-hub-offline", flag),
+    )
+    for option_name, option_kwargs in option_table:
+        parser.addoption(option_name, **option_kwargs)
+@pytest.fixture
+def display_dynamo_output(request):
+    """Value of the ``--display-dynamo-output`` flag."""
+    config = request.config
+    return config.getoption("--display-dynamo-output")
+@pytest.fixture
+def max_retries(request):
+    """Value of the ``--max-retries`` option."""
+    config = request.config
+    return config.getoption("--max-retries")
+@pytest.fixture
+def max_num_seqs(request):
+    """Value of the ``--max-num-seqs`` option."""
+    config = request.config
+    return config.getoption("--max-num-seqs")
+@pytest.fixture
+def num_clients(request):
+    """Value of the ``--clients`` option."""
+    config = request.config
+    return config.getoption("--clients")
+@pytest.fixture
+def input_token_length(request):
+    """Value of the ``--input-token-length`` option."""
+    config = request.config
+    return config.getoption("--input-token-length")
+@pytest.fixture
+def output_token_length(request):
+    """Value of the ``--output-token-length`` option."""
+    config = request.config
+    return config.getoption("--output-token-length")
+@pytest.fixture
+def requests_per_client(request):
+    """Value of the ``--requests-per-client`` option."""
+    config = request.config
+    return config.getoption("--requests-per-client")
+@pytest.fixture
+def respawn(request):
+    """True unless ``--no-respawn`` was passed."""
+    disabled = request.config.getoption("--no-respawn")
+    return not disabled
+@pytest.fixture
+def separate_process_logs(request):
+    """True unless ``--combine-process-logs`` was passed."""
+    combined = request.config.getoption("--combine-process-logs")
+    return not combined
+@pytest.fixture
+def hf_hub_offline(request):
+    """Value of the ``--hf-hub-offline`` flag."""
+    config = request.config
+    return config.getoption("--hf-hub-offline")
--- a/tests/fault_tolerance/parse_results.py
+++ b/tests/fault_tolerance/parse_results.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import json
+import os
+import re
+from datetime import datetime
+from typing import Any
+import pandas as pd
+from tabulate import tabulate
+def parse_test_log(file_path):
+    """Extract startup duration, fault time and launch command from a test log.
+
+    Returns ``(startup_seconds, fault_time, start_cmd)``; every element is
+    ``None`` when the file does not exist, and individual elements are ``None``
+    when the matching log line was not found.
+    """
+    if not os.path.isfile(file_path):
+        return None, None, None
+    def _stamp(text):
+        # The timestamp is the second space-separated field of the line.
+        return datetime.fromisoformat(text.split(" ")[1].replace("T", " "))
+    launched = became_ready = injected = command = None
+    with open(file_path, "r") as handle:
+        for raw in handle:
+            entry = raw.strip()
+            if "Running command: dynamo serve" in entry:
+                launched = _stamp(entry)
+                command = entry.split("Running command:")[1]
+                continue
+            if "Deployment Ready" in entry:
+                became_ready = _stamp(entry)
+                continue
+            if "Injecting failure for:" in entry:
+                injected = _stamp(entry)
+    if launched and became_ready:
+        startup_seconds = (became_ready - launched).total_seconds()
+    else:
+        startup_seconds = None
+    return startup_seconds, injected, command
+def _attempt_succeeded(result, expected_length):
+    """Return True when a logged attempt holds a long-enough chat completion."""
+    body = result.get("result")
+    if not body or "choices" not in body or not body["choices"]:
+        return False
+    content = body["choices"][0]["message"]["content"]
+    return bool(content) and len(content) >= expected_length
+def parse_client_logs(test_dir, expected_length=100):
+    """Load every ``client_*.log.txt`` file in ``test_dir`` into a DataFrame.
+
+    Each line of a client log is one JSON record; every attempt listed in its
+    ``results`` becomes one row with the columns ``time``, ``status``,
+    ``request_elapsed_time``, ``request_number``, ``client`` and ``success``.
+    An attempt counts as successful when its first choice's message content
+    has at least ``expected_length`` characters.
+
+    Returns:
+        A DataFrame sorted by ``time``, or ``None`` when no rows were found.
+    """
+    all_logs = []
+    for file in os.listdir(test_dir):
+        if not (file.startswith("client_") and file.endswith(".log.txt")):
+            continue
+        # "client_<index>.log.txt" -> "<index>"; constant for the whole file.
+        client_id = file.split("_")[1].split(".")[0]
+        with open(os.path.join(test_dir, file), "r") as f:
+            request_number = 0
+            for line in f:
+                line = line.strip()
+                if not line:
+                    # Blank lines carry no record; skip instead of failing.
+                    continue
+                data = json.loads(line)
+                for result in data["results"]:
+                    all_logs.append(
+                        {
+                            "time": datetime.fromisoformat(
+                                data["time"].replace("T", " ")
+                            ),
+                            "status": result["status"],
+                            "request_elapsed_time": result["request_elapsed_time"],
+                            "request_number": request_number,
+                            "client": client_id,
+                            "success": _attempt_succeeded(result, expected_length),
+                        }
+                    )
+                request_number += 1
+    if all_logs:
+        df = pd.DataFrame(all_logs)
+        df.sort_values("time", inplace=True)
+        return df
+    return None
+def calculate_metrics(df, fault_time, sla=2.1):
+    """Summarise request outcomes before and after an injected fault.
+
+    Rows with ``time <= fault_time`` count as "before"; later rows as "after".
+    With a falsy ``fault_time`` every row is "before" and all "after" values
+    are ``None``. Latency figures and SLA violations consider only rows whose
+    ``success`` is true.
+
+    Returns:
+        ``(success, failure, avg_before, std_before, avg_after, std_after,
+        violations_before, violations_after)``.
+    """
+    total_success = df["success"].sum()
+    total_failure = len(df) - total_success
+    if fault_time:
+        pre = df[df["time"] <= fault_time]
+        post = df[df["time"] > fault_time]
+    else:
+        pre, post = df, None
+    # Latencies of successful requests before the fault.
+    pre_latency = pre[pre["success"]]["request_elapsed_time"]
+    mean_pre = pre_latency.mean()
+    spread_pre = pre_latency.std()
+    over_sla_pre = (pre_latency > sla).sum()
+    mean_post = spread_post = over_sla_post = None
+    if post is not None and not post.empty:
+        # Latencies of successful requests after the fault.
+        post_latency = post[post["success"]]["request_elapsed_time"]
+        mean_post = post_latency.mean()
+        spread_post = post_latency.std()
+        over_sla_post = (post_latency > sla).sum()
+    return (
+        total_success,
+        total_failure,
+        mean_pre,
+        spread_pre,
+        mean_post,
+        spread_post,
+        over_sla_pre,
+        over_sla_post,
+    )
+def parse_process_log(log_dir, process_name):
+    process_ready_line = {
+        "dynamo_Frontend": "added model",
+        "dynamo_VllmWorker": "Starting VllmWorker instance with all registered endpoints",
+        "dynamo_Processor": "Starting Processor instance with all registered endpoints",
+        "dynamo_PrefillWorker": "Starting PrefillWorker instance with all registered endpoints",
+    }
+    process_shutdown_line = {
+        "dynamo_Frontend": "SIGTERM received, starting graceful shutdown",
+        "dynamo_VllmWorker": "Received shutdown signal, shutting down DistributedRuntime",
+        "dynamo_Processor": "Received signal 15, initiating graceful shutdown",
+        "dynamo_PrefillWorker": "Shutdown hooks completed successfully",
+    }
+    process_log_path = os.path.join(log_dir, "error.log")
+    if not os.path.isfile(process_log_path):
+        return None, None
+    process_ready = []
+    process_shutdown = []
+    process_start_time = None
+    with open(process_log_path, "r") as f:
+        for line in f:
+            clean_line = re.sub(r"\x1b\[.*?m", "", line.strip())  # Remove ANSI codes
+            if not clean_line:
+                continue
+            parts = clean_line.split()
+            if len(parts) < 2:
+                continue
+            try:
+                # Parse timestamp (remove 'Z' for naive datetime)
+                timestamp = datetime.fromisoformat(parts[0].replace("Z", ""))
+            except ValueError:
+                continue
+            if not process_start_time:
+                process_start_time = timestamp
+            log_message = " ".join(parts[1:])
+            relative_time = (timestamp - process_start_time).total_seconds()
+            # Check for process start lines
+            if process_name in process_ready_line:
+                if process_ready_line[process_name] in log_message:
+                    process_ready.append((timestamp, log_message, relative_time))
+            # Check for process end lines
+            if process_name in process_shutdown_line:
+                if process_shutdown_line[process_name] in log_message:
+                    process_shutdown.append((timestamp, log_message, relative_time))
+    return process_ready, process_shutdown
+def parse_watcher_log(test_dir, fault_time):
+    before_requests = []
+    after_requests = []
+    watcher_log_path = os.path.join(test_dir, "watcher.log.txt")
+    if not os.path.isfile(watcher_log_path):
+        return None, None
+    with open(watcher_log_path, "r") as f:
+        for line in f:
+            try:
+                data = json.loads(line.strip())
+            except json.JSONDecodeError:
+                continue
+            if "metrics" not in data:
+                continue
+            entry_time = datetime.fromisoformat(data["time"].replace("T", " "))
+            for metric in data["metrics"]:
+                if len(metric) != 2:
+                    continue
+                _, metric_data = metric
+                if (
+                    "num_requests_waiting" in metric_data
+                    and "request_active_slots" in metric_data
+                    and metric_data["request_active_slots"] > 0
+                ):
+                    if fault_time is None or entry_time <= fault_time:
+                        before_requests.append(metric_data["num_requests_waiting"])
+                    else:
+                        after_requests.append(metric_data["num_requests_waiting"])
+    avg_before = (
+        sum(before_requests) / len(before_requests) if before_requests else None
+    )
+    avg_after = sum(after_requests) / len(after_requests) if after_requests else None
+    return avg_before, avg_after
+def calculate_recovery_time(test_dir, failure_type, fault_time):
+    processes = [
+        "dynamo_Frontend",
+        "dynamo_Processor",
+        "dynamo_VllmWorker",
+        "dynamo_PrefillWorker",
+    ]
+    process_start_ends = {}
+    start_time = None
+    for process in processes:
+        starts, ends = parse_process_log(os.path.join(test_dir, process), process)
+        if starts:
+            process_start_ends[process] = (starts, ends)
+    if failure_type == "processor":
+        start_time = process_start_ends["dynamo_Processor"][0][-1][0]
+    elif failure_type == "frontend":
+        start_time = process_start_ends["dynamo_Frontend"][0][-1][0]
+    elif failure_type == "decode_worker":
+        start_times = [
+            x
+            for x in process_start_ends["dynamo_VllmWorker"][0]
+            if "VllmWorker:1" in x[1]
+        ]
+        if not start_times:
+            return None
+        start_time = start_times[-1][0]
+    elif failure_type == "prefill_worker":
+        if "dynamo_PrefillWorker" not in process_start_ends:
+            return None
+        start_times = [
+            x
+            for x in process_start_ends["dynamo_PrefillWorker"][0]
+            if "PrefillWorker:1" in x[1]
+        ]
+        start_time = start_times[-1][0]
+    if not start_time:
+        return None
+    if fault_time > start_time:
+        return None
+    return (start_time - fault_time).total_seconds()
+def process_test_directory(test_dir):
+    test_name = test_dir.split("test_worker_failure[", 1)[1].rstrip("]")
+    failure_type = test_name.split("-")[-1]
+    test_prefix = "-".join(test_name.split("-")[:-1])
+    startup_time, fault_time, start_cmd = parse_test_log(
+        os.path.join(test_dir, "test.log.txt")
+    )
+    df = parse_client_logs(test_dir)
+    if df is None or df.empty:
+        return None
+    pending_requests_before, pending_requests_after = parse_watcher_log(
+        test_dir, fault_time
+    )
+    (
+        success,
+        failure,
+        avg_before,
+        std_before,
+        avg_after,
+        std_after,
+        violations_before,
+        violations_after,
+    ) = calculate_metrics(df, fault_time)
+    recovery_time = calculate_recovery_time(test_dir, failure_type, fault_time)
+    return {
+        "test": test_prefix,
+        "cmd": start_cmd,
+        "failure": failure_type,
+        "start_time": startup_time,
+        "success_requests": success,
+        "failed_requests": failure,
+        "avg_latency_before": avg_before,
+        "std_latency_before": std_before,
+        "avg_latency_after": avg_after,
+        "std_latency_after": std_after,
+        "pending_requests_before": pending_requests_before,
+        "pending_requests_after": pending_requests_after,
+        "violations_before": violations_before,
+        "violations_after": violations_after,
+        "recovery_time": recovery_time,
+    }
+def main(logs_dir, tablefmt, log_paths=[]):
+    results = []
+    if log_paths:
+        for log_path in log_paths:
+            result = process_test_directory(log_path)
+            if result:
+                results.append(result)
+    elif logs_dir:
+        for entry in os.listdir(logs_dir):
+            if entry.startswith("test_worker_failure[") and os.path.isdir(
+                os.path.join(logs_dir, entry)
+            ):
+                result = process_test_directory(os.path.join(logs_dir, entry))
+                if result:
+                    results.append(result)
+    # Group results by test prefix
+    grouped: dict[str, list[dict[str, Any]]] = {}
+    commands = {}
+    for res in results:
+        test_prefix = res["test"]
+        if test_prefix not in grouped:
+            grouped[test_prefix] = []
+            commands[test_prefix] = res["cmd"]
+        grouped[test_prefix].append(res)
+    order = [
+        "none",
+        "frontend",
+        "processor",
+        "decode_worker",
+        "prefill_worker",
+        "vllm_worker",
+    ]
+    # Print grouped tables
+    for test_prefix, group in grouped.items():
+        new_group = []
+        for failure in order:
+            for res in group:
+                if failure == res["failure"]:
+                    new_group.append(res)
+        group = new_group
+        headers = [
+            "Failure",
+            "Startup Time",
+            "Success",
+            "Failed",
+            "Latency Before",
+            "Latency After",
+            "Pending Before",
+            "Pending After",
+            "Violations Before",
+            "Violations After",
+            "Recovery Time",
+        ]
+        rows = []
+        for res in group:
+            row = [
+                res["failure"],
+                res["start_time"],  # if res["start_time"] is not None else "N/A",
+                res["success_requests"],
+                res["failed_requests"],
+                res["avg_latency_before"],
+                res["avg_latency_after"],
+                res["pending_requests_before"],
+                res["pending_requests_after"],
+                res["violations_before"],
+                res["violations_after"],
+                res["recovery_time"],
+            ]
+            rows.append(row)
+        print(f"\nTest Group: {test_prefix}")
+        print(f"\nTest Command: {commands[test_prefix]}")
+        print(
+            tabulate(
+                rows,
+                headers,
+                tablefmt=tablefmt,
+                floatfmt=".2f",
+                missingval="N/A",
+                numalign="right",
+                stralign="center",
+            )
+        )
+        print("\n" + "=" * 80)
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Parse test results")
+    parser.add_argument("--log-dir", default=".", help="Path to the logs directory")
+    parser.add_argument(
+        "--format", choices=["fancy", "markdown"], default="fancy", help="Table format"
+    )
+    args = parser.parse_args()
+    # Map format choices to tabulate formats
+    tablefmt = (
+        "fancy_grid" if args.format == "fancy" else "pipe"
+    )  # Using pipe for markdown compatibility
+    main(args.log_dir, tablefmt)