Unverified Commit 36f03d40 authored by Neelay Shah's avatar Neelay Shah Committed by GitHub
Browse files

test: fault tolerance tests (#1444)


Signed-off-by: default avatarNeelay Shah <neelays@nvidia.com>
Co-authored-by: default avatarcoderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
parent fb213a2f
...@@ -86,6 +86,23 @@ def create_circus_watcher( ...@@ -86,6 +86,23 @@ def create_circus_watcher(
use_sockets: bool = True, use_sockets: bool = True,
**kwargs: Any, **kwargs: Any,
) -> Watcher: ) -> Watcher:
log_dir = os.environ.get("DYN_CIRCUS_LOG_DIR", None)
if log_dir is not None:
prefix = f"{log_dir}/{name}"
os.makedirs(prefix, exist_ok=True)
stdout_stream = {
"class": "FileStream",
"filename": f"{prefix}/output.log",
"backup_count": 10,
}
stderr_stream = {
"class": "FileStream",
"filename": f"{prefix}/error.log",
"backup_count": 10,
}
else:
stdout_stream = None
stderr_stream = None
return Watcher( return Watcher(
name=name, name=name,
cmd=shlex.quote(cmd) if psutil.POSIX else cmd, cmd=shlex.quote(cmd) if psutil.POSIX else cmd,
...@@ -94,7 +111,10 @@ def create_circus_watcher( ...@@ -94,7 +111,10 @@ def create_circus_watcher(
stop_children=True, stop_children=True,
use_sockets=use_sockets, use_sockets=use_sockets,
graceful_timeout=86400, graceful_timeout=86400,
respawn=False, # TODO respawn=os.environ.get("DYN_CIRCUS_RESPAWN", "false").lower()
in ("true", "1", "yes"),
stdout_stream=stdout_stream,
stderr_stream=stderr_stream,
**kwargs, **kwargs,
) )
......
...@@ -162,6 +162,8 @@ markers = [ ...@@ -162,6 +162,8 @@ markers = [
"weekly: marks tests to run weekly", "weekly: marks tests to run weekly",
"gpu_1: marks tests to run on GPU", "gpu_1: marks tests to run on GPU",
"gpu_2: marks tests to run on 2GPUs", "gpu_2: marks tests to run on 2GPUs",
"gpu_4: marks tests to run on 4GPUs",
"gpu_8: marks tests to run on 8GPUs",
"e2e: marks tests as end-to-end tests", "e2e: marks tests as end-to-end tests",
"integration: marks tests as integration tests", "integration: marks tests as integration tests",
"unit: marks tests as unit tests", "unit: marks tests as unit tests",
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
import logging import logging
import os import os
import shutil
import tempfile import tempfile
import pytest import pytest
...@@ -23,15 +24,31 @@ from tests.utils.managed_process import ManagedProcess ...@@ -23,15 +24,31 @@ from tests.utils.managed_process import ManagedProcess
# Custom format inspired by your example # Custom format inspired by your example
LOG_FORMAT = "[TEST] %(asctime)s %(levelname)s %(name)s: %(message)s" LOG_FORMAT = "[TEST] %(asctime)s %(levelname)s %(name)s: %(message)s"
DATE_FORMAT = "%Y-%m-%dT%H:%M:%S"
# Configure logging # Configure logging
logging.basicConfig( logging.basicConfig(
level=logging.INFO, level=logging.INFO,
format=LOG_FORMAT, format=LOG_FORMAT,
datefmt="%Y-%m-%dT%H:%M:%S", # ISO 8601 UTC format datefmt=DATE_FORMAT, # ISO 8601 UTC format
) )
@pytest.fixture(autouse=True)
def logger(request):
log_path = os.path.join(request.node.name, "test.log.txt")
logger = logging.getLogger()
shutil.rmtree(request.node.name, ignore_errors=True)
os.makedirs(request.node.name, exist_ok=True)
handler = logging.FileHandler(log_path, mode="w")
formatter = logging.Formatter(LOG_FORMAT, datefmt=DATE_FORMAT)
handler.setFormatter(formatter)
logger.addHandler(handler)
yield
handler.close()
logger.removeHandler(handler)
def pytest_collection_modifyitems(config, items): def pytest_collection_modifyitems(config, items):
""" """
This function is called to modify the list of tests to run. This function is called to modify the list of tests to run.
...@@ -69,7 +86,7 @@ class EtcdServer(ManagedProcess): ...@@ -69,7 +86,7 @@ class EtcdServer(ManagedProcess):
timeout=timeout, timeout=timeout,
display_output=False, display_output=False,
health_check_ports=[port], health_check_ports=[port],
data_dir=tempfile.mkdtemp(prefix="etcd_"), data_dir=data_dir,
log_dir=request.node.name, log_dir=request.node.name,
) )
......
<!--
SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Fault Tolerance Test Suite
As a large scale distributed inference serving framework in addition
to providing high throughput and low latency, Dynamo needs to
provide fault detection, resilency, and quick recovery in the face of
unforseen failures. In order to test Dynamo we are developing a test
suite to inject and measure the impact of different types of failure
conditions.
## Test Architecture
The fault tolerance test suite is designed as a set of pytest
configurations that launch typical dynamo serve graph deployments and
then inject failures by terminating processes in the graph. To test
the recovery time and impact of failures a set number of clients are
launched in parallel. Each client sends a set number of synchronous
requests. Log files are stored for each dynamo process as well as for
each client and inspected using a post-processing script.
> [!NOTE]
> Test pass / failure is not an indication of SLA for recovery or resilience
> It only indicates is the test was executed and data was collected
> [!NOTE]
> The test suite currently targets single node Dynamo Serve.
> Support for Dynamo Deploy is a work in progress.
### Test Sequence Diagram
```mermaid
sequenceDiagram
participant Tester as Test Runner
participant Dynamo as DynamoServeProcess
participant Circus as CircusController
participant Client as Test Clients
participant Metrics as Metrics Collector
Tester->>Dynamo: Launch deployment graph
Dynamo-->>Tester: Signal ready
Tester->>Metrics: Start metrics collection
Tester->>Client: Spawn multiple clients
loop During Test
Client->>Dynamo: Send chat completion requests
Dynamo-->>Client: Return responses
Metrics->>Dynamo: Collect runtime metrics
end
Tester->>Dynamo: Inject failures (terminate components)
Dynamo-->>Tester: Recover/respawn as configured
Client-->>Tester: Log request results
Metrics-->>Tester: Log metrics data
Tester->>Dynamo: Shutdown deployment
Tester->>Metrics: Stop metrics collection
Tester->>Tester: Parse logs and summarize results
```
### Failure Scenarios
The test suite includes several predefined fault injection scenarios designed to validate system resilience under various failure conditions. These scenarios are configured in `scenarios.py` and can be selected via pytest parameters. Below is a description of the available scenarios:
| Scenario Name | Description | Affected Components | Timing Example |
|------------------------|-----------------------------------------------------------------------------|-------------------------------------------------|--------------------|
| **decode_worker** | Terminates decoder worker processes | `dynamo_vllmworker` | 30 seconds |
| **prefill_worker** | Terminates prefill worker processes | `dynamo_prefillworker` | 30 seconds |
| **frontend** | Terminates frontend processes handling client requests | `dynamo_frontend` | 30 seconds |
| **processor** | Terminates processor nodes responsible for task orchestration | `dynamo_processor` | 30 seconds |
| **vllm_worker** | Terminates low-level VLLM worker processes | `vllm_worker` (external to Dynamo) | 30 seconds |
| **none** | Baseline scenario with no failures | N/A | N/A |
#### Key Characteristics:
1. **Timing**: Failures are injected at predefined intervals (e.g., 30 seconds after test start)
2. **Severity**: The number of terminated processes can be configured (default: 1)
3. **Scope**: Failures target specific components while leaving others operational
#### Configuration:
- **Injection Timing**: Defined in `failure_scenarios` dictionary in `scenarios.py`
- **Process Count**: Adjustable via tuple values (e.g., `("dynamo_vllmworker", 1)` terminates 1 process)
- **Component Mapping**:
- `dynamo_*`: Internal Dynamo services
- `vllm_worker`: External VLLM model workers
#### Example Scenario Execution:
Run all graph configurations injecting a decode_worker failure.
```bash
cd tests/fault_tolerance
pytest test_runner.py -k decode_worker
```
### Test Results Directory
For each test scenario a directory of log files is created and post processed to summarize the test.
```
test_worker_failure[agg-tp-2-dp-4-none]
.
├── client_0.log.txt
├── client_1.log.txt
├── client_2.log.txt
├── client_3.log.txt
├── client_4.log.txt
├── client_5.log.txt
├── client_6.log.txt
├── client_7.log.txt
├── dynamo_Frontend
│   ├── error.log
│   └── output.log
├── dynamo.log.txt
├── dynamo_Planner
│   ├── error.log
│   └── output.log
├── dynamo_Processor
│   ├── error.log
│   └── output.log
├── dynamo_VllmWorker
│   ├── error.log
│   └── output.log
├── etcd.log.txt
├── nats-server.log.txt
├── nvidia-smi.log.txt
├── test.log.txt
└── watcher.log.txt
```
| File/Directory Name | Description |
|------------------------------------|------------------------------------------------------------------------------------------------|
| **client_*.log.txt** | Request/response logs for each client instance (contains JSON-formatted request details) |
| **dynamo_*/error.log** | Error logs for specific Dynamo components (e.g., Frontend, Processor, VllmWorker) |
| **dynamo_*/output.log** | Standard output logs for Dynamo components (service startup/shutdown messages) |
| **dynamo.log.txt** | Aggregate logs for Dynamo services (orchestration and initialization) |
| **etcd.log.txt** | Logs for etcd, the distributed key-value store used for service coordination |
| **nats-server.log.txt** | Logs for NATS message broker, handling inter-service communication |
| **nvidia-smi.log.txt** | GPU monitoring logs (records utilization statistics during test execution) |
| **test.log.txt** | Primary test execution log (contains fault injection timing, process management, and test status)|
| **watcher.log.txt** | Metrics collected by the watcher service (e.g., pending requests, active workers) |
### Summary Results
Results are presented in table format after each test providing summary statistics.
**Test Group:** agg-tp-2-dp-1
**Test Command:** dynamo serve graphs.agg:Frontend -f /workspace/tests/fault_tolerance/configs/agg_tp_2_dp_1.yaml --Frontend.port 8000 in /workspace/examples/llm
| Failure | Startup Time | Success | Failed | Latency Before | Latency After | Pending Before | Pending After | Violations Before | Violations After | Recovery Time |
|:-------------:|---------------:|----------:|---------:|-----------------:|----------------:|-----------------:|----------------:|--------------------:|-------------------:|----------------:|
| none | 56.00 | 800.00 | 0.00 | 1.97 | N/A | 0.00 | N/A | 8.00 | N/A | N/A |
| frontend | 56.00 | 656.00 | 144.00 | 1.96 | 1.96 | 0.00 | 0.00 | 0.00 | 0.00 | 17.53 |
| processor | 57.00 | 584.00 | 216.00 | 1.96 | 1.96 | 0.00 | 0.00 | 0.00 | 0.00 | 25.96 |
| decode_worker | 80.00 | 520.00 | 280.00 | 2.01 | 1.98 | 0.00 | 0.00 | 8.00 | 8.00 | 37.99 |
| vllm_worker | 58.00 | 120.00 | 680.00 | 1.98 | nan | 0.00 | 0.00 | 0.00 | 0.00 | N/A |
| Column Name | Description |
|-----------------------|-----------------------------------------------------------------------------|
| **Failure** | Type of fault injection applied during the test (or 'none' for baseline) |
| **Startup Time** | Time (seconds) taken for the service to become ready after initialization |
| **Success** | Number of client requests that succeeded |
| **Failed** | Number of client requests that failed or were invalid |
| **Latency Before** | Average request latency (seconds) for successful requests before fault injection |
| **Latency After** | Average request latency (seconds) for successful requests after fault injection (N/A if no fault) |
| **Pending Before** | Average number of pending requests observed before fault injection |
| **Pending After** | Average number of pending requests observed after fault injection (N/A if no fault) |
| **Violations Before** | Number of successful requests exceeding SLA latency before fault injection |
| **Violations After** | Number of successful requests exceeding SLA latency after fault injection (N/A if no fault) |
| **Recovery Time** | Time (seconds) taken for failed components to recover after fault injection |
## Example Results
The following results were obtained running on a single node with 8
L40 GPUs using "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" with 8
concurrent clients each sending 100 requests.
### Aggregated Workers
#### No Redundancy
To demonstrate the failure and recovery time in the case that there is
a single instance of each process we ran a simmple "agg-tp-2-dp-1" configuration.
```mermaid
graph LR
Client["Client"]
Frontend["Frontend"]
Processor["Processor"]
Client --> Frontend
Frontend --> Processor
Processor --> DecodePool
%% Decode Worker Pool (vertical layout)
subgraph DecodePool["Decode Worker Pool"]
direction TB
subgraph Decode1["Decode 1"]
direction TB
D1GPU0["GPU 0"]
D1GPU1["GPU 1"]
end
end
%% Styling
style DecodePool stroke:#000,stroke-width:2px
```
#### Results:
**Test Group: agg-tp-2-dp-1**
**Test Command:** dynamo serve graphs.agg:Frontend -f /workspace/tests/fault_tolerance/configs/agg_tp_2_dp_1.yaml --Frontend.port 8000 in /workspace/examples/llm
| Failure | Startup Time | Success | Failed | Latency Before | Latency After | Pending Before | Pending After | Violations Before | Violations After | Recovery Time |
|:-------------:|---------------:|----------:|---------:|-----------------:|----------------:|-----------------:|----------------:|--------------------:|-------------------:|----------------:|
| none | 56.00 | 800.00 | 0.00 | 1.97 | N/A | 0.00 | N/A | 8.00 | N/A | N/A |
| frontend | 56.00 | 656.00 | 144.00 | 1.96 | 1.96 | 0.00 | 0.00 | 0.00 | 0.00 | 17.53 |
| processor | 57.00 | 584.00 | 216.00 | 1.96 | 1.96 | 0.00 | 0.00 | 0.00 | 0.00 | 25.96 |
| decode_worker | 80.00 | 520.00 | 280.00 | 2.01 | 1.98 | 0.00 | 0.00 | 8.00 | 8.00 | 37.99 |
| vllm_worker | 58.00 | 120.00 | 680.00 | 1.98 | nan | 0.00 | 0.00 | 0.00 | 0.00 | N/A |
#### Summary:
1. Dynamo does not currently detect and recover from direct vllm worker sub process failure. (WIP)
2. Recovery time for the decode worker itself is the largest and a decode worker failure has the largest impact (as expected)
3. Overall failure count is roughly equal to recovery time multiplied by number of clients (as expected).
#### Redundant Workers (Over Provisoned)
To demonstrate the failure and recovery time in the case that there
are multiple instances of each process (except for the frontend) we
ran a simple "agg-tp-2-dp-4" configuration.
In this case we also consider the system to be "over provisioned" for
the workload as multiple workers are not needed to maintain SLA for
the 8 clients.
```mermaid
graph LR
Client["Client"]
Frontend["Frontend"]
Processor_1["Processor 1"]
Processor_2["Processor 2"]
Client --> Frontend
Frontend --> Processor_1
Frontend --> Processor_2
subgraph DecodePool["Decode Worker Pool"]
direction LR
subgraph Decode1["Decode 1"]
direction TB
D1GPU0["GPU 0"]
D1GPU1["GPU 1"]
end
subgraph Decode2["Decode 2"]
direction TB
D2GPU0["GPU 0"]
D2GPU1["GPU 1"]
end
subgraph Decode3["Decode 3"]
direction TB
D3GPU0["GPU 0"]
D3GPU1["GPU 1"]
end
subgraph Decode4["Decode 4"]
direction TB
D4GPU0["GPU 0"]
D4GPU1["GPU 1"]
end
end
Processor_1 --> DecodePool
Processor_2 --> DecodePool
style DecodePool stroke:#000,stroke-width:2px
```
#### Results:
**Test Group:** agg-tp-2-dp-4
**Test Command:** dynamo serve graphs.agg:Frontend -f /workspace/tests/fault_tolerance/configs/agg_tp_2_dp_4.yaml --Frontend.port 8000 in /workspace/examples/llm
| Failure | Startup Time | Success | Failed | Latency Before | Latency After | Pending Before | Pending After | Violations Before | Violations After | Recovery Time |
|:-------------:|---------------:|----------:|---------:|-----------------:|----------------:|-----------------:|----------------:|--------------------:|-------------------:|----------------:|
| none | 57.00 | 800.00 | 0.00 | 1.76 | N/A | 0.00 | N/A | 0.00 | N/A | N/A |
| frontend | 57.00 | 672.00 | 128.00 | 1.77 | 1.74 | 0.00 | 0.00 | 0.00 | 0.00 | 16.65 |
| processor | 52.00 | 680.00 | 120.00 | 1.79 | 1.78 | 0.00 | 0.00 | 0.00 | 0.00 | 21.25 |
| decode_worker | 56.00 | 796.00 | 4.00 | 1.82 | 1.78 | 0.00 | 0.00 | 0.00 | 0.00 | 44.88 |
| vllm_worker | 52.00 | 634.00 | 166.00 | 1.78 | 1.78 | 0.00 | 0.00 | 0.00 | 0.00 | N/A |
#### Summary:
1. Dynamo does not currently detect and recover from direct vllm
worker sub process failure. In the case of redundant workers this
results in roughtly 1/4 the requests failing after the initial 30
seconds. (WIP)
2. By immediately detecting a decode worker failure, Dynamo can limit
the failures and reroute requests to healthy workers with minimal
impact.
3. While the processor was configured with redundancy - the system was
unable to instantiate two processors successfully leading to
failure when the processor was terminated. (WIP)
#### Redundant Workers (Exact Provisioning)
To demonstrate the failure and recovery time in the case that there
are multiple instances of each process (except for the frontend) we
ran a simple "agg-tp-2-dp-4" configuration.
In this case we also consider the system to be "exact provisioned" for
the workload as we limit the max-num-seqs for each decode worker to
exactly 2. This artificially creates a scenario that results in queing
when a failur occurs before a worker is recovered.
```mermaid
graph LR
Client["Client"]
Frontend["Frontend"]
Processor_1["Processor 1"]
Processor_2["Processor 2"]
Client --> Frontend
Frontend --> Processor_1
Frontend --> Processor_2
subgraph DecodePool["Decode Worker Pool"]
direction LR
subgraph Decode1["Decode 1 (max 2 seq)"]
direction TB
D1GPU0["GPU 0"]
D1GPU1["GPU 1"]
end
subgraph Decode2["Decode 2 (max 2 seq)"]
direction TB
D2GPU0["GPU 0"]
D2GPU1["GPU 1"]
end
subgraph Decode3["Decode 3 (max 2 seq)"]
direction TB
D3GPU0["GPU 0"]
D3GPU1["GPU 1"]
end
subgraph Decode4["Decode 4 (max 2 seq)"]
direction TB
D4GPU0["GPU 0"]
D4GPU1["GPU 1"]
end
end
Processor_1 --> DecodePool
Processor_2 --> DecodePool
style DecodePool stroke:#000,stroke-width:2px
```
#### Results:
**Test Group:** agg-tp-2-dp-4
**Test Command:** dynamo serve graphs.agg:Frontend -f /workspace/tests/fault_tolerance/configs/agg_tp_2_dp_4.yaml --Frontend.port 8000 --VllmWorker.max_num_seqs 2 in /workspace/examples/llm
| Failure | Startup Time | Success | Failed | Latency Before | Latency After | Pending Before | Pending After | Violations Before | Violations After | Recovery Time |
|:-------------:|---------------:|----------:|---------:|-----------------:|----------------:|-----------------:|----------------:|--------------------:|-------------------:|----------------:|
| none | 57.00 | 800.00 | 0.00 | 1.77 | N/A | 0.01 | N/A | 0.00 | N/A | N/A |
| frontend | 56.00 | 664.00 | 136.00 | 1.80 | 1.77 | 0.00 | 0.00 | 0.00 | 0.00 | 17.22 |
| processor | 56.00 | 649.00 | 151.00 | 1.76 | 1.77 | 0.01 | 0.00 | 0.00 | 0.00 | 25.79 |
| decode_worker | 56.00 | 798.00 | 2.00 | 1.77 | 1.89 | 0.00 | 0.13 | 0.00 | 84.00 | 44.57 |
| vllm_worker | 56.00 | 632.00 | 168.00 | 1.80 | 2.23 | 0.00 | 0.38 | 0.00 | 232.00 | N/A |
#### Summary:
1. Dynamo does not currently detect and recover from direct vllm
worker sub process failure. In the case of redundant workers this
results in roughtly 1/4 the requests failing after the initial 30
seconds. All requests after the initial 30 seconds would also be
subject to queing as a result and we see increased SLA
violations. (WIP)
2. By immediately detecting a decode worker failure, Dynamo can limit
the failures and reroute requests to healthy workers with minimal
impact. However during the recovery period requests are subject to
queing and as a results we see increased SLA violations.
3. While the processor was configured with redundancy - the system was
unable to instantiate two processors successfully leading to
failure when the processor was terminated. (WIP)
### Disaggregated Workers
#### No Redunancy
To demonstrate the failure and recovery time in the case of a
disaaggregated deployment with a single instance for each process in
the graph we ran a simple `disagg-p-tp-2-dp-1-d-tp-4-dp-1` configuration.
```mermaid
graph LR
Client["Client"]
Frontend["Frontend"]
Processor["Processor"]
PrefillQueue["Remote Prefill Queue"]
Client --> Frontend
Frontend --> Processor
Processor <--> DecodePool
%% Prefill Worker Pool (horizontal layout)
subgraph PrefillPool["Prefill Worker Pool"]
direction LR
subgraph Prefill1["Prefill 1"]
direction TB
P1GPU0["GPU 0"]
P1GPU1["GPU 1"]
end
end
%% Decode Worker Pool (vertical layout)
subgraph DecodePool["Decode Worker Pool"]
direction TB
subgraph Decode1["Decode 1"]
direction TB
D1GPU0["GPU 0"]
D1GPU1["GPU 1"]
D1GPU2["GPU 2"]
D1GPU3["GPU 3"]
end
end
PrefillQueue --> PrefillPool
DecodePool --> PrefillQueue
PrefillPool -.-> DecodePool
%% Styling
style PrefillPool stroke:#0066cc,stroke-width:2px
style DecodePool stroke:#000,stroke-width:2px
```
#### Results:
**Test Group:** disagg-p-tp-2-dp-1-d-tp-4-dp-1
**Test Command:** dynamo serve graphs.disagg:Frontend -f /workspace/tests/fault_tolerance/configs/disagg_p_tp_2_dp_1_d_tp_4_dp_1.yaml --Frontend.port 8000 in /workspace/examples/llm
| Failure | Startup Time | Success | Failed | Latency Before | Latency After | Pending Before | Pending After | Violations Before | Violations After | Recovery Time |
|:--------------:|---------------:|----------:|---------:|-----------------:|----------------:|-----------------:|----------------:|--------------------:|-------------------:|----------------:|
| none | 83.00 | 800.00 | 0.00 | 1.19 | N/A | 0.01 | N/A | 0.00 | N/A | N/A |
| frontend | 78.00 | 664.00 | 136.00 | 1.19 | 1.19 | 0.07 | 0.02 | 0.00 | 0.00 | 17.24 |
| processor | 77.00 | 576.00 | 224.00 | 1.19 | 1.19 | 0.00 | 0.00 | 0.00 | 0.00 | 26.90 |
| decode_worker | 72.00 | 200.00 | 600.00 | 1.20 | 1.28 | 0.03 | N/A | 0.00 | 0.00 | N/A |
| prefill_worker | 81.00 | 798.00 | 2.00 | 1.19 | 1.22 | 0.05 | 0.05 | 0.00 | 0.00 | 42.31 |
| vllm_worker | 83.00 | 797.00 | 3.00 | 1.19 | 1.22 | 0.00 | 0.03 | 0.00 | 8.00 | N/A |
#### Summary:
1. Dynamo does not currently detect and recover from direct vllm
worker sub process failure. In this example the vllm sub process
failure targets a prefill worker and has the same overall impact.
(WIP)
2. Prefill worker failure causes request timeout (30 sec) and in
addition during recovery time prefill requests are queued in the
prefill queue.
3. Decode worker failure is currently permanent in the disaggregated
case as the prefill worker holds references to memory and which are
not freed. This leads to total failure after fault injection.
#### Redundant Workers
To demonstrate the failure and recovery time in the case that there
are multiple instances of each process (except for the frontend and
decode worker) we ran a simple "disagg-p-tp-2-dp-2-d-tp-4-dp-1"
configuration.
```mermaid
graph LR
Client["Client"]
Frontend["Frontend"]
Processor_1["Processor 1"]
Processor_2["Processor 2"]
PrefillQueue["Remote Prefill Queue"]
Client --> Frontend
Frontend --> Processor_1
Frontend --> Processor_2
Processor_1 <--> DecodePool
Processor_2 <--> DecodePool
%% Prefill Worker Pool (horizontal layout)
subgraph PrefillPool["Prefill Worker Pool"]
direction LR
subgraph Prefill1["Prefill 1"]
direction TB
P1GPU0["GPU 0"]
P1GPU1["GPU 1"]
end
subgraph Prefill2["Prefill 2"]
direction TB
P2GPU0["GPU 0"]
P2GPU1["GPU 1"]
end
end
%% Decode Worker Pool (vertical layout)
subgraph DecodePool["Decode Worker Pool"]
direction TB
subgraph Decode1["Decode 1"]
direction TB
D1GPU0["GPU 0"]
D1GPU1["GPU 1"]
D1GPU2["GPU 2"]
D1GPU3["GPU 3"]
end
end
PrefillQueue --> PrefillPool
DecodePool --> PrefillQueue
PrefillPool -.-> DecodePool
%% Styling
style PrefillPool stroke:#0066cc,stroke-width:2px
style DecodePool stroke:#000,stroke-width:2px
```
#### Results:
**Test Group:** disagg-p-tp-2-dp-2-d-tp-4-dp-1
**Test Command:** dynamo serve graphs.disagg:Frontend -f /workspace/tests/fault_tolerance/configs/disagg_p_tp_2_dp_2_d_tp_4_dp_1.yaml --Frontend.port 8000 in /workspace/examples/llm
| Failure | Startup Time | Success | Failed | Latency Before | Latency After | Pending Before | Pending After | Violations Before | Violations After | Recovery Time |
|:--------------:|---------------:|----------:|---------:|-----------------:|----------------:|-----------------:|----------------:|--------------------:|-------------------:|----------------:|
| none | 83.00 | 800.00 | 0.00 | 1.19 | N/A | 0.00 | N/A | 1.00 | N/A | N/A |
| frontend | 82.00 | 704.00 | 96.00 | 1.19 | 1.17 | 0.00 | 0.01 | 1.00 | 0.00 | 12.95 |
| processor | 78.00 | 795.00 | 5.00 | 1.20 | 1.18 | 0.02 | 0.01 | 1.00 | 0.00 | 25.91 |
| decode_worker | 78.00 | 199.00 | 601.00 | 1.21 | nan | 0.00 | N/A | 1.00 | 0.00 | N/A |
| prefill_worker | 77.00 | 800.00 | 0.00 | 1.22 | 1.18 | 0.00 | 0.01 | 1.00 | 1.00 | 45.14 |
| vllm_worker | 77.00 | 799.00 | 1.00 | 1.20 | 1.16 | 0.02 | 0.00 | 1.00 | 1.00 | N/A |
#### Summary:
1. Dynamo does not currently detect and recover from direct vllm
worker sub process failure. In this example the vllm sub process
failure targets a prefill worker and has the same overall impact.
Since the prefill workers are redundant - a failure has low impact.
2. Redundant prefill workers are able to absorb the load and no
additional queing is needed.
3. Decode worker failure is currently permanent in the disaggregated
case as the prefill worker holds references to memory and which are
not freed. This leads to total failure after fault injection.
4. Redundant processors work in this case.
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import logging
import os
import random
import time
from datetime import datetime
import requests
def _get_random_prompt(length):
word_list = [f"{i}" for i in range(10)]
return " ".join(random.choices(word_list, k=length))
def _single_request(
url,
payload,
logger,
retry_attempts=1,
input_token_length=100,
output_token_length=100,
timeout=30,
retry_delay=1,
):
prompt = _get_random_prompt(input_token_length)
payload["messages"][0]["content"] = prompt
payload["max_tokens"] = output_token_length
response = None
end_time = None
start_time = time.time()
results = []
while retry_attempts:
start_request_time = time.time()
try:
response = requests.post(
url,
json=payload,
timeout=timeout,
)
end_time = time.time()
content = None
try:
content = response.json()
except json.JSONDecodeError:
pass
results.append(
{
"status": response.status_code,
"result": content,
"request_elapsed_time": end_time - start_request_time,
}
)
if response.status_code != 200:
time.sleep(retry_delay)
retry_attempts -= 1
continue
else:
break
except (requests.RequestException, requests.Timeout) as e:
results.append(
{
"status": str(e),
"result": None,
"request_elapsed_time": time.time() - start_request_time,
}
)
logger.warning("Retrying due to Request failed: %s", e)
time.sleep(retry_delay)
retry_attempts -= 1
continue
return {
"time": datetime.now().strftime("%Y-%m-%dT%H:%M:%S"),
"results": results,
"total_time": time.time() - start_time,
}
def client(
deployment_graph,
server_process,
payload,
log_dir,
index,
requests_per_client,
input_token_length,
output_token_length,
max_retries,
retry_delay=1,
):
logger = logging.getLogger(f"CLIENT: {index}")
try:
log_path = os.path.join(log_dir, f"client_{index}.log.txt")
with open(log_path, "w") as log:
url = f"http://localhost:{server_process.port}/{deployment_graph.endpoints[0]}"
for i in range(requests_per_client):
result = _single_request(
url,
payload.payload_chat,
logger,
max_retries,
input_token_length=input_token_length,
output_token_length=output_token_length,
retry_delay=retry_delay,
)
logger.info(
f"Request: {i} Status: {result['results'][-1]['status']} Latency: {result['results'][-1]['request_elapsed_time']}"
)
log.write(json.dumps(result) + "\n")
log.flush()
except Exception as e:
logger.error(str(e))
logger.info("Exiting")
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size: 64
max-model-len: 16384
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions
port: 8000
Processor:
router: round-robin
router-num-threads: 4
common-configs: [model, block-size, max-model-len]
VllmWorker:
enforce-eager: true
max-num-batched-tokens: 16384
enable-prefix-caching: true
ServiceArgs:
workers: 1
resources:
gpu: '1'
common-configs: [model, block-size, max-model-len]
Planner:
environment: local
no-operation: true
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size: 64
max-model-len: 16384
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions
port: 8000
Processor:
router: round-robin
router-num-threads: 4
common-configs: [model, block-size, max-model-len]
VllmWorker:
enforce-eager: true
max-num-batched-tokens: 16384
enable-prefix-caching: true
ServiceArgs:
workers: 4
resources:
gpu: '1'
common-configs: [model, block-size, max-model-len]
Planner:
environment: local
no-operation: true
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size: 64
max-model-len: 16384
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions
port: 8000
Processor:
router: round-robin
router-num-threads: 4
common-configs: [model, block-size, max-model-len]
VllmWorker:
enforce-eager: true
max-num-batched-tokens: 16384
enable-prefix-caching: true
ServiceArgs:
workers: 8
resources:
gpu: '1'
common-configs: [model, block-size, max-model-len]
Planner:
environment: local
no-operation: true
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size: 64
max-model-len: 16384
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions
port: 8000
Processor:
router: round-robin
router-num-threads: 4
common-configs: [model, block-size, max-model-len]
VllmWorker:
enforce-eager: true
max-num-batched-tokens: 16384
enable-prefix-caching: true
tensor-parallel-size: 2
ServiceArgs:
workers: 1
resources:
gpu: '2'
common-configs: [model, block-size, max-model-len]
Planner:
environment: local
no-operation: true
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size: 64
max-model-len: 16384
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions
port: 8000
ServiceArgs:
workers: 1
resources:
cpu: "10"
memory: "20Gi"
Processor:
router: round-robin
router-num-threads: 4
common-configs: [model, block-size, max-model-len]
ServiceArgs:
workers: 2
resources:
cpu: "10"
memory: "20Gi"
VllmWorker:
enforce-eager: true
max-num-batched-tokens: 16384
enable-prefix-caching: true
tensor-parallel-size: 2
ServiceArgs:
workers: 2
resources:
gpu: '2'
common-configs: [model, block-size, max-model-len]
Planner:
environment: local
no-operation: true
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size: 64
max-model-len: 16384
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions
port: 8000
Processor:
router: round-robin
router-num-threads: 4
common-configs: [model, block-size, max-model-len]
ServiceArgs:
workers: 2
resources:
cpu: "10"
memory: "20Gi"
VllmWorker:
enforce-eager: true
max-num-batched-tokens: 16384
enable-prefix-caching: true
tensor-parallel-size: 2
ServiceArgs:
workers: 4
resources:
gpu: '2'
common-configs: [model, block-size, max-model-len]
Planner:
environment: local
no-operation: true
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size: 64
max-model-len: 16384
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions
port: 8000
Processor:
router: round-robin
common-configs: [model, block-size]
VllmWorker:
remote-prefill: true
conditional-disagg: true
max-local-prefill-length: 10
max-prefill-queue-size: 2
ServiceArgs:
workers: 1
resources:
gpu: '1'
common-configs: [model, block-size, max-model-len, kv-transfer-config]
PrefillWorker:
max-num-batched-tokens: 16384
ServiceArgs:
workers: 1
resources:
gpu: '1'
common-configs: [model, block-size, max-model-len, kv-transfer-config]
Planner:
environment: local
no-operation: true
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size: 64
max-model-len: 16384
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions
port: 8000
Processor:
router: round-robin
common-configs: [model, block-size]
VllmWorker:
remote-prefill: true
conditional-disagg: true
max-local-prefill-length: 10
max-prefill-queue-size: 2
tensor-parallel-size: 2
ServiceArgs:
workers: 1
resources:
gpu: '2'
common-configs: [model, block-size, max-model-len, kv-transfer-config]
PrefillWorker:
max-num-batched-tokens: 16384
tensor-parallel-size: 1
ServiceArgs:
workers: 1
resources:
gpu: '1'
common-configs: [model, block-size, max-model-len, kv-transfer-config]
Planner:
environment: local
no-operation: true
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size: 64
max-model-len: 16384
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions
port: 8000
Processor:
router: round-robin
common-configs: [model, block-size]
ServiceArgs:
workers: 2
resources:
cpu: "10"
memory: "20Gi"
VllmWorker:
remote-prefill: true
conditional-disagg: true
max-local-prefill-length: 10
max-prefill-queue-size: 2
tensor-parallel-size: 2
ServiceArgs:
workers: 1
resources:
gpu: '2'
common-configs: [model, block-size, max-model-len, kv-transfer-config]
PrefillWorker:
max-num-batched-tokens: 16384
tensor-parallel-size: 1
ServiceArgs:
workers: 2
resources:
gpu: '1'
common-configs: [model, block-size, max-model-len, kv-transfer-config]
Planner:
environment: local
no-operation: true
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size: 64
max-model-len: 16384
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions
port: 8000
Processor:
router: round-robin
common-configs: [model, block-size]
VllmWorker:
remote-prefill: true
conditional-disagg: true
max-local-prefill-length: 10
max-prefill-queue-size: 2
tensor-parallel-size: 4
ServiceArgs:
workers: 1
resources:
gpu: '4'
common-configs: [model, block-size, max-model-len, kv-transfer-config]
PrefillWorker:
max-num-batched-tokens: 16384
ServiceArgs:
workers: 4
resources:
gpu: '1'
common-configs: [model, block-size, max-model-len, kv-transfer-config]
Planner:
environment: local
no-operation: true
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size: 64
max-model-len: 16384
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions
port: 8000
Processor:
router: round-robin
common-configs: [model, block-size]
VllmWorker:
remote-prefill: true
conditional-disagg: true
max-local-prefill-length: 10
max-prefill-queue-size: 2
tensor-parallel-size: 4
ServiceArgs:
workers: 1
resources:
gpu: '4'
common-configs: [model, block-size, max-model-len, kv-transfer-config]
PrefillWorker:
max-num-batched-tokens: 16384
tensor-parallel-size: 2
ServiceArgs:
workers: 1
resources:
gpu: '2'
common-configs: [model, block-size, max-model-len, kv-transfer-config]
Planner:
environment: local
no-operation: true
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size: 64
max-model-len: 16384
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions
port: 8000
Processor:
router: round-robin
common-configs: [model, block-size]
ServiceArgs:
workers: 2
resources:
cpu: "10"
memory: "20Gi"
VllmWorker:
remote-prefill: true
conditional-disagg: true
max-local-prefill-length: 10
max-prefill-queue-size: 2
tensor-parallel-size: 4
ServiceArgs:
workers: 1
resources:
gpu: '4'
common-configs: [model, block-size, max-model-len, kv-transfer-config]
PrefillWorker:
max-num-batched-tokens: 16384
tensor-parallel-size: 2
ServiceArgs:
workers: 2
resources:
gpu: '2'
common-configs: [model, block-size, max-model-len, kv-transfer-config]
Planner:
environment: local
no-operation: true
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pytest
def pytest_addoption(parser):
parser.addoption("--requests-per-client", type=int, default=100)
parser.addoption("--clients", type=int, default=10)
parser.addoption("--no-respawn", action="store_true", default=False)
parser.addoption("--input-token-length", type=int, default=100)
parser.addoption("--output-token-length", type=int, default=100)
parser.addoption("--max-num-seqs", type=int, default=None)
parser.addoption("--max-retries", type=int, default=1)
parser.addoption("--display-dynamo-output", action="store_true", default=False)
parser.addoption("--combine-process-logs", action="store_true", default=False)
parser.addoption("--hf-hub-offline", action="store_true", default=False)
@pytest.fixture
def display_dynamo_output(request):
return request.config.getoption("--display-dynamo-output")
@pytest.fixture
def max_retries(request):
return request.config.getoption("--max-retries")
@pytest.fixture
def max_num_seqs(request):
return request.config.getoption("--max-num-seqs")
@pytest.fixture
def num_clients(request):
return request.config.getoption("--clients")
@pytest.fixture
def input_token_length(request):
return request.config.getoption("--input-token-length")
@pytest.fixture
def output_token_length(request):
return request.config.getoption("--output-token-length")
@pytest.fixture
def requests_per_client(request):
return request.config.getoption("--requests-per-client")
@pytest.fixture
def respawn(request):
return not request.config.getoption("--no-respawn")
@pytest.fixture
def separate_process_logs(request):
return not request.config.getoption("--combine-process-logs")
@pytest.fixture
def hf_hub_offline(request):
return request.config.getoption("--hf-hub-offline")
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
import os
import re
from datetime import datetime
from typing import Any
import pandas as pd
from tabulate import tabulate
def parse_test_log(file_path):
start_time = None
ready_time = None
fault_time = None
start_cmd = None
if not os.path.isfile(file_path):
return None, None, None
with open(file_path, "r") as f:
for line in f:
line = line.strip()
if "Running command: dynamo serve" in line:
start_time = datetime.fromisoformat(
line.split(" ")[1].replace("T", " ")
)
start_cmd = line.split("Running command:")[1]
elif "Deployment Ready" in line:
ready_time = datetime.fromisoformat(
line.split(" ")[1].replace("T", " ")
)
elif "Injecting failure for:" in line:
fault_time = datetime.fromisoformat(
line.split(" ")[1].replace("T", " ")
)
startup_time = (
(ready_time - start_time).total_seconds() if start_time and ready_time else None
)
return startup_time, fault_time, start_cmd
def parse_client_logs(test_dir, expected_length=100):
all_logs = []
for file in os.listdir(test_dir):
if file.startswith("client_") and file.endswith(".log.txt"):
with open(os.path.join(test_dir, file), "r") as f:
request_number = 0
for line in f:
request_number += 1
data = json.loads(line.strip())
for result in data["results"]:
log_entry = {
"time": datetime.fromisoformat(
data["time"].replace("T", " ")
),
"status": result["status"],
"request_elapsed_time": result["request_elapsed_time"],
"request_number": request_number - 1,
"client": file.split("_")[1].split(".")[0],
}
if (
"result" in result
and result["result"]
and "choices" in result["result"]
and result["result"]["choices"]
):
log_entry["success"] = True
content = result["result"]["choices"][0]["message"][
"content"
]
if not content or len(content) < expected_length:
log_entry["success"] = False
else:
log_entry["success"] = False
all_logs.append(log_entry)
if len(all_logs):
df = pd.DataFrame(all_logs)
df.sort_values("time", inplace=True)
return df
return None
def calculate_metrics(df, fault_time, sla=2.1):
success = df["success"].sum()
failure = len(df) - success
if fault_time:
before_fault = df[df["time"] <= fault_time]
after_fault = df[df["time"] > fault_time]
else:
before_fault = df
after_fault = None
# Existing latency metrics (only successful requests)
successful_before = before_fault[before_fault["success"]]
avg_before = successful_before["request_elapsed_time"].mean()
std_before = successful_before["request_elapsed_time"].std()
avg_after, std_after = None, None
if after_fault is not None and not after_fault.empty:
successful_after = after_fault[after_fault["success"]]
avg_after = successful_after["request_elapsed_time"].mean()
std_after = successful_after["request_elapsed_time"].std()
# SLA violations (only successful requests exceeding the SLA)
violations_before = (successful_before["request_elapsed_time"] > sla).sum()
violations_after = (
(successful_after["request_elapsed_time"] > sla).sum()
if after_fault is not None and not after_fault.empty
else None
)
return (
success,
failure,
avg_before,
std_before,
avg_after,
std_after,
violations_before,
violations_after,
)
def parse_process_log(log_dir, process_name):
process_ready_line = {
"dynamo_Frontend": "added model",
"dynamo_VllmWorker": "Starting VllmWorker instance with all registered endpoints",
"dynamo_Processor": "Starting Processor instance with all registered endpoints",
"dynamo_PrefillWorker": "Starting PrefillWorker instance with all registered endpoints",
}
process_shutdown_line = {
"dynamo_Frontend": "SIGTERM received, starting graceful shutdown",
"dynamo_VllmWorker": "Received shutdown signal, shutting down DistributedRuntime",
"dynamo_Processor": "Received signal 15, initiating graceful shutdown",
"dynamo_PrefillWorker": "Shutdown hooks completed successfully",
}
process_log_path = os.path.join(log_dir, "error.log")
if not os.path.isfile(process_log_path):
return None, None
process_ready = []
process_shutdown = []
process_start_time = None
with open(process_log_path, "r") as f:
for line in f:
clean_line = re.sub(r"\x1b\[.*?m", "", line.strip()) # Remove ANSI codes
if not clean_line:
continue
parts = clean_line.split()
if len(parts) < 2:
continue
try:
# Parse timestamp (remove 'Z' for naive datetime)
timestamp = datetime.fromisoformat(parts[0].replace("Z", ""))
except ValueError:
continue
if not process_start_time:
process_start_time = timestamp
log_message = " ".join(parts[1:])
relative_time = (timestamp - process_start_time).total_seconds()
# Check for process start lines
if process_name in process_ready_line:
if process_ready_line[process_name] in log_message:
process_ready.append((timestamp, log_message, relative_time))
# Check for process end lines
if process_name in process_shutdown_line:
if process_shutdown_line[process_name] in log_message:
process_shutdown.append((timestamp, log_message, relative_time))
return process_ready, process_shutdown
def parse_watcher_log(test_dir, fault_time):
before_requests = []
after_requests = []
watcher_log_path = os.path.join(test_dir, "watcher.log.txt")
if not os.path.isfile(watcher_log_path):
return None, None
with open(watcher_log_path, "r") as f:
for line in f:
try:
data = json.loads(line.strip())
except json.JSONDecodeError:
continue
if "metrics" not in data:
continue
entry_time = datetime.fromisoformat(data["time"].replace("T", " "))
for metric in data["metrics"]:
if len(metric) != 2:
continue
_, metric_data = metric
if (
"num_requests_waiting" in metric_data
and "request_active_slots" in metric_data
and metric_data["request_active_slots"] > 0
):
if fault_time is None or entry_time <= fault_time:
before_requests.append(metric_data["num_requests_waiting"])
else:
after_requests.append(metric_data["num_requests_waiting"])
avg_before = (
sum(before_requests) / len(before_requests) if before_requests else None
)
avg_after = sum(after_requests) / len(after_requests) if after_requests else None
return avg_before, avg_after
def calculate_recovery_time(test_dir, failure_type, fault_time):
processes = [
"dynamo_Frontend",
"dynamo_Processor",
"dynamo_VllmWorker",
"dynamo_PrefillWorker",
]
process_start_ends = {}
start_time = None
for process in processes:
starts, ends = parse_process_log(os.path.join(test_dir, process), process)
if starts:
process_start_ends[process] = (starts, ends)
if failure_type == "processor":
start_time = process_start_ends["dynamo_Processor"][0][-1][0]
elif failure_type == "frontend":
start_time = process_start_ends["dynamo_Frontend"][0][-1][0]
elif failure_type == "decode_worker":
start_times = [
x
for x in process_start_ends["dynamo_VllmWorker"][0]
if "VllmWorker:1" in x[1]
]
if not start_times:
return None
start_time = start_times[-1][0]
elif failure_type == "prefill_worker":
if "dynamo_PrefillWorker" not in process_start_ends:
return None
start_times = [
x
for x in process_start_ends["dynamo_PrefillWorker"][0]
if "PrefillWorker:1" in x[1]
]
start_time = start_times[-1][0]
if not start_time:
return None
if fault_time > start_time:
return None
return (start_time - fault_time).total_seconds()
def process_test_directory(test_dir):
test_name = test_dir.split("test_worker_failure[", 1)[1].rstrip("]")
failure_type = test_name.split("-")[-1]
test_prefix = "-".join(test_name.split("-")[:-1])
startup_time, fault_time, start_cmd = parse_test_log(
os.path.join(test_dir, "test.log.txt")
)
df = parse_client_logs(test_dir)
if df is None or df.empty:
return None
pending_requests_before, pending_requests_after = parse_watcher_log(
test_dir, fault_time
)
(
success,
failure,
avg_before,
std_before,
avg_after,
std_after,
violations_before,
violations_after,
) = calculate_metrics(df, fault_time)
recovery_time = calculate_recovery_time(test_dir, failure_type, fault_time)
return {
"test": test_prefix,
"cmd": start_cmd,
"failure": failure_type,
"start_time": startup_time,
"success_requests": success,
"failed_requests": failure,
"avg_latency_before": avg_before,
"std_latency_before": std_before,
"avg_latency_after": avg_after,
"std_latency_after": std_after,
"pending_requests_before": pending_requests_before,
"pending_requests_after": pending_requests_after,
"violations_before": violations_before,
"violations_after": violations_after,
"recovery_time": recovery_time,
}
def main(logs_dir, tablefmt, log_paths=[]):
results = []
if log_paths:
for log_path in log_paths:
result = process_test_directory(log_path)
if result:
results.append(result)
elif logs_dir:
for entry in os.listdir(logs_dir):
if entry.startswith("test_worker_failure[") and os.path.isdir(
os.path.join(logs_dir, entry)
):
result = process_test_directory(os.path.join(logs_dir, entry))
if result:
results.append(result)
# Group results by test prefix
grouped: dict[str, list[dict[str, Any]]] = {}
commands = {}
for res in results:
test_prefix = res["test"]
if test_prefix not in grouped:
grouped[test_prefix] = []
commands[test_prefix] = res["cmd"]
grouped[test_prefix].append(res)
order = [
"none",
"frontend",
"processor",
"decode_worker",
"prefill_worker",
"vllm_worker",
]
# Print grouped tables
for test_prefix, group in grouped.items():
new_group = []
for failure in order:
for res in group:
if failure == res["failure"]:
new_group.append(res)
group = new_group
headers = [
"Failure",
"Startup Time",
"Success",
"Failed",
"Latency Before",
"Latency After",
"Pending Before",
"Pending After",
"Violations Before",
"Violations After",
"Recovery Time",
]
rows = []
for res in group:
row = [
res["failure"],
res["start_time"], # if res["start_time"] is not None else "N/A",
res["success_requests"],
res["failed_requests"],
res["avg_latency_before"],
res["avg_latency_after"],
res["pending_requests_before"],
res["pending_requests_after"],
res["violations_before"],
res["violations_after"],
res["recovery_time"],
]
rows.append(row)
print(f"\nTest Group: {test_prefix}")
print(f"\nTest Command: {commands[test_prefix]}")
print(
tabulate(
rows,
headers,
tablefmt=tablefmt,
floatfmt=".2f",
missingval="N/A",
numalign="right",
stralign="center",
)
)
print("\n" + "=" * 80)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Parse test results")
parser.add_argument("--log-dir", default=".", help="Path to the logs directory")
parser.add_argument(
"--format", choices=["fancy", "markdown"], default="fancy", help="Table format"
)
args = parser.parse_args()
# Map format choices to tabulate formats
tablefmt = (
"fancy_grid" if args.format == "fancy" else "pipe"
) # Using pipe for markdown compatibility
main(args.log_dir, tablefmt)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment