test: fault tolerance tests (#1444)

Signed-off-by: Neelay Shah <neelays@nvidia.com> Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>

test: fault tolerance tests (#1444)
Signed-off-by: Neelay Shah <neelays@nvidia.com> Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
36f03d40 · Neelay Shah · GitHub · fb213a2f · 36f03d40 · 36f03d40
Unverified Commit 36f03d40 authored Jul 03, 2025 by Neelay Shah Committed by GitHub Jul 03, 2025
20 changed files
--- a/deploy/sdk/src/dynamo/sdk/cli/circus.py
+++ b/deploy/sdk/src/dynamo/sdk/cli/circus.py
@@ -86,6 +86,23 @@ def create_circus_watcher(
    use_sockets: bool = True,
    **kwargs: Any,
 ) -> Watcher:
+    log_dir = os.environ.get("DYN_CIRCUS_LOG_DIR", None)
+    if log_dir is not None:
+        prefix = f"{log_dir}/{name}"
+        os.makedirs(prefix, exist_ok=True)
+        stdout_stream = {
+            "class": "FileStream",
+            "filename": f"{prefix}/output.log",
+            "backup_count": 10,
+        }
+        stderr_stream = {
+            "class": "FileStream",
+            "filename": f"{prefix}/error.log",
+            "backup_count": 10,
+        }
+    else:
+        stdout_stream = None
+        stderr_stream = None
    return Watcher(
        name=name,
        cmd=shlex.quote(cmd) if psutil.POSIX else cmd,
@@ -94,7 +111,10 @@ def create_circus_watcher(
        stop_children=True,
        use_sockets=use_sockets,
        graceful_timeout=86400,
-        respawn=False,  # TODO
+        respawn=os.environ.get("DYN_CIRCUS_RESPAWN", "false").lower()
+        in ("true", "1", "yes"),
+        stdout_stream=stdout_stream,
+        stderr_stream=stderr_stream,
        **kwargs,
    )

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -162,6 +162,8 @@ markers = [
    "weekly: marks tests to run weekly",
    "gpu_1: marks tests to run on GPU",
    "gpu_2: marks tests to run on 2GPUs",
+    "gpu_4: marks tests to run on 4GPUs",
+    "gpu_8: marks tests to run on 8GPUs",
    "e2e: marks tests as end-to-end tests",
    "integration: marks tests as integration tests",
    "unit: marks tests as unit tests",

--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -15,6 +15,7 @@
 import logging
 import os
+import shutil
 import tempfile
 import pytest
@@ -23,15 +24,31 @@ from tests.utils.managed_process import ManagedProcess
 # Custom format inspired by your example
 LOG_FORMAT = "[TEST] %(asctime)s %(levelname)s %(name)s: %(message)s"
+DATE_FORMAT = "%Y-%m-%dT%H:%M:%S"
 # Configure logging
 logging.basicConfig(
    level=logging.INFO,
    format=LOG_FORMAT,
-    datefmt="%Y-%m-%dT%H:%M:%S",  # ISO 8601 UTC format
+    datefmt=DATE_FORMAT,  # ISO 8601-style timestamp; NOTE(review): logging renders local time by default, so this is UTC only if a converter is set elsewhere - confirm
 )
+@pytest.fixture(autouse=True)
+def logger(request):
+    """Mirror root-logger output into ``<test name>/test.log.txt`` for each test.
+
+    The directory named after the test node is wiped and recreated before the
+    test runs, a file handler using ``LOG_FORMAT``/``DATE_FORMAT`` is attached
+    to the root logger, and the handler is detached and closed afterwards.
+    The fixture yields no value; it exists only for its side effects.
+    """
+    log_dir = request.node.name
+    # Start from an empty per-test directory so logs of earlier runs never mix in.
+    shutil.rmtree(log_dir, ignore_errors=True)
+    os.makedirs(log_dir, exist_ok=True)
+    handler = logging.FileHandler(os.path.join(log_dir, "test.log.txt"), mode="w")
+    handler.setFormatter(logging.Formatter(LOG_FORMAT, datefmt=DATE_FORMAT))
+    root_logger = logging.getLogger()
+    root_logger.addHandler(handler)
+    yield
+    # Detach before closing: a record arriving at an already-closed handler that
+    # is still attached can make FileHandler reopen the file in "w" mode and
+    # truncate the log that was just written.
+    root_logger.removeHandler(handler)
+    handler.close()
 def pytest_collection_modifyitems(config, items):
    """
    This function is called to modify the list of tests to run.
@@ -69,7 +86,7 @@ class EtcdServer(ManagedProcess):
            timeout=timeout,
            display_output=False,
            health_check_ports=[port],
-            data_dir=tempfile.mkdtemp(prefix="etcd_"),
+            data_dir=data_dir,
            log_dir=request.node.name,
        )

--- a/tests/fault_tolerance/README.md
+++ b/tests/fault_tolerance/README.md
--- a/tests/fault_tolerance/__init__.py
+++ b/tests/fault_tolerance/__init__.py
--- a/tests/fault_tolerance/client.py
+++ b/tests/fault_tolerance/client.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import json
+import logging
+import os
+import random
+import time
+from datetime import datetime
+import requests
+def _get_random_prompt(length):
+    """Return ``length`` random single-digit words joined by single spaces."""
+    digits = [str(value) for value in range(10)]
+    picked = random.choices(digits, k=length)
+    return " ".join(picked)
+def _single_request(
+    url,
+    payload,
+    logger,
+    retry_attempts=1,
+    input_token_length=100,
+    output_token_length=100,
+    timeout=30,
+    retry_delay=1,
+):
+    """POST one chat request with a random prompt, retrying on failure.
+
+    Args:
+        url: Endpoint the request is posted to.
+        payload: Chat request template; ``payload["messages"][0]["content"]``
+            and ``payload["max_tokens"]`` are overridden on a private copy,
+            the caller's dict is left untouched.
+        logger: Logger used to report transport failures.
+        retry_attempts: Maximum number of attempts; values <= 0 send nothing.
+        input_token_length: Number of random words placed in the prompt.
+        output_token_length: Value sent as ``max_tokens``.
+        timeout: Per-attempt timeout handed to ``requests.post``.
+        retry_delay: Seconds slept after every unsuccessful attempt.
+
+    Returns:
+        A dict with ``time`` (completion timestamp string), ``results`` (one
+        entry per attempt holding ``status``, ``result`` and
+        ``request_elapsed_time``) and ``total_time`` (seconds, retries and
+        delays included). For failed transports ``status`` is the error text.
+    """
+    import copy
+
+    # The same template is reused for every request (and possibly by several
+    # clients), so never write the per-request prompt into the caller's object.
+    request_payload = copy.deepcopy(payload)
+    request_payload["messages"][0]["content"] = _get_random_prompt(input_token_length)
+    request_payload["max_tokens"] = output_token_length
+    start_time = time.time()
+    results = []
+    # Compare against zero instead of relying on truthiness: a negative count
+    # would otherwise never reach zero and loop forever.
+    while retry_attempts > 0:
+        start_request_time = time.time()
+        try:
+            response = requests.post(url, json=request_payload, timeout=timeout)
+        except requests.RequestException as e:
+            # requests.Timeout is a RequestException, so one clause covers both.
+            results.append(
+                {
+                    "status": str(e),
+                    "result": None,
+                    "request_elapsed_time": time.time() - start_request_time,
+                }
+            )
+            logger.warning("Retrying due to Request failed: %s", e)
+        else:
+            elapsed = time.time() - start_request_time
+            try:
+                content = response.json()
+            except ValueError:
+                # Every JSON decode error requests can raise (stdlib or
+                # simplejson backed) is a ValueError; a non-JSON body is
+                # recorded with its HTTP status rather than as a transport error.
+                content = None
+            results.append(
+                {
+                    "status": response.status_code,
+                    "result": content,
+                    "request_elapsed_time": elapsed,
+                }
+            )
+            if response.status_code == 200:
+                break
+        # Reached only for unsuccessful attempts: back off, then consume one try.
+        time.sleep(retry_delay)
+        retry_attempts -= 1
+    return {
+        "time": datetime.now().strftime("%Y-%m-%dT%H:%M:%S"),
+        "results": results,
+        "total_time": time.time() - start_time,
+    }
+def client(
+    deployment_graph,
+    server_process,
+    payload,
+    log_dir,
+    index,
+    requests_per_client,
+    input_token_length,
+    output_token_length,
+    max_retries,
+    retry_delay=1,
+):
+    """Send a sequence of chat requests and record every outcome.
+
+    Requests go to
+    ``http://localhost:<server_process.port>/<deployment_graph.endpoints[0]>``
+    through ``_single_request``; one JSON line per request is written to
+    ``<log_dir>/client_<index>.log.txt`` and flushed immediately.
+
+    Args:
+        deployment_graph: Object whose ``endpoints[0]`` is the URL path.
+        server_process: Object whose ``port`` is the local port to contact.
+        payload: Object whose ``payload_chat`` is the chat request body.
+        log_dir: Directory receiving the client's result file.
+        index: Client number, used in the logger name and the file name.
+        requests_per_client: Number of requests to issue.
+        input_token_length: Prompt length forwarded to ``_single_request``.
+        output_token_length: ``max_tokens`` forwarded to ``_single_request``.
+        max_retries: Attempt budget forwarded to ``_single_request``.
+        retry_delay: Delay between attempts forwarded to ``_single_request``.
+
+    Any exception is logged together with its traceback and ends the client
+    early; nothing is raised to the caller.
+    """
+    logger = logging.getLogger(f"CLIENT: {index}")
+    try:
+        log_path = os.path.join(log_dir, f"client_{index}.log.txt")
+        with open(log_path, "w") as log:
+            url = f"http://localhost:{server_process.port}/{deployment_graph.endpoints[0]}"
+            for i in range(requests_per_client):
+                result = _single_request(
+                    url,
+                    payload.payload_chat,
+                    logger,
+                    max_retries,
+                    input_token_length=input_token_length,
+                    output_token_length=output_token_length,
+                    retry_delay=retry_delay,
+                )
+                # With an attempt budget <= 0 no attempt is recorded; report
+                # "None" instead of failing on an empty list.
+                attempts = result["results"]
+                last_attempt = attempts[-1] if attempts else {}
+                logger.info(
+                    "Request: %s Status: %s Latency: %s",
+                    i,
+                    last_attempt.get("status"),
+                    last_attempt.get("request_elapsed_time"),
+                )
+                log.write(json.dumps(result) + "\n")
+                log.flush()
+    except Exception as e:
+        # Best-effort by design: a failing client must not propagate into the
+        # test driver, but the traceback is kept so the failure is diagnosable.
+        logger.exception("%s", e)
+    logger.info("Exiting")
--- a/tests/fault_tolerance/configs/agg_tp_1_dp_1.yaml
+++ b/tests/fault_tolerance/configs/agg_tp_1_dp_1.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+Common:
+  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  block-size: 64
+  max-model-len: 16384
+Frontend:
+  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  endpoint: dynamo.Processor.chat/completions
+  port: 8000
+Processor:
+  router: round-robin
+  router-num-threads: 4
+  common-configs: [model, block-size, max-model-len]
+VllmWorker:
+  enforce-eager: true
+  max-num-batched-tokens: 16384
+  enable-prefix-caching: true
+  ServiceArgs:
+    workers: 1
+    resources:
+      gpu: '1'
+  common-configs: [model, block-size, max-model-len]
+Planner:
+  environment: local
+  no-operation: true
\ No newline at end of file
--- a/tests/fault_tolerance/configs/agg_tp_1_dp_4.yaml
+++ b/tests/fault_tolerance/configs/agg_tp_1_dp_4.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+Common:
+  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  block-size: 64
+  max-model-len: 16384
+Frontend:
+  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  endpoint: dynamo.Processor.chat/completions
+  port: 8000
+Processor:
+  router: round-robin
+  router-num-threads: 4
+  common-configs: [model, block-size, max-model-len]
+VllmWorker:
+  enforce-eager: true
+  max-num-batched-tokens: 16384
+  enable-prefix-caching: true
+  ServiceArgs:
+    workers: 4
+    resources:
+      gpu: '1'
+  common-configs: [model, block-size, max-model-len]
+Planner:
+  environment: local
+  no-operation: true
--- a/tests/fault_tolerance/configs/agg_tp_1_dp_8.yaml
+++ b/tests/fault_tolerance/configs/agg_tp_1_dp_8.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+Common:
+  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  block-size: 64
+  max-model-len: 16384
+Frontend:
+  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  endpoint: dynamo.Processor.chat/completions
+  port: 8000
+Processor:
+  router: round-robin
+  router-num-threads: 4
+  common-configs: [model, block-size, max-model-len]
+VllmWorker:
+  enforce-eager: true
+  max-num-batched-tokens: 16384
+  enable-prefix-caching: true
+  ServiceArgs:
+    workers: 8
+    resources:
+      gpu: '1'
+  common-configs: [model, block-size, max-model-len]
+Planner:
+  environment: local
+  no-operation: true
--- a/tests/fault_tolerance/configs/agg_tp_2_dp_1.yaml
+++ b/tests/fault_tolerance/configs/agg_tp_2_dp_1.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+Common:
+  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  block-size: 64
+  max-model-len: 16384
+Frontend:
+  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  endpoint: dynamo.Processor.chat/completions
+  port: 8000
+Processor:
+  router: round-robin
+  router-num-threads: 4
+  common-configs: [model, block-size, max-model-len]
+VllmWorker:
+  enforce-eager: true
+  max-num-batched-tokens: 16384
+  enable-prefix-caching: true
+  tensor-parallel-size: 2
+  ServiceArgs:
+    workers: 1
+    resources:
+      gpu: '2'
+  common-configs: [model, block-size, max-model-len]
+Planner:
+  environment: local
+  no-operation: true
--- a/tests/fault_tolerance/configs/agg_tp_2_dp_2.yaml
+++ b/tests/fault_tolerance/configs/agg_tp_2_dp_2.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+Common:
+  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  block-size: 64
+  max-model-len: 16384
+Frontend:
+  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  endpoint: dynamo.Processor.chat/completions
+  port: 8000
+  ServiceArgs:
+    workers: 1
+    resources:
+      cpu: "10"
+      memory: "20Gi"
+Processor:
+  router: round-robin
+  router-num-threads: 4
+  common-configs: [model, block-size, max-model-len]
+  ServiceArgs:
+    workers: 2
+    resources:
+      cpu: "10"
+      memory: "20Gi"
+VllmWorker:
+  enforce-eager: true
+  max-num-batched-tokens: 16384
+  enable-prefix-caching: true
+  tensor-parallel-size: 2
+  ServiceArgs:
+    workers: 2
+    resources:
+      gpu: '2'
+  common-configs: [model, block-size, max-model-len]
+Planner:
+  environment: local
+  no-operation: true
--- a/tests/fault_tolerance/configs/agg_tp_2_dp_4.yaml
+++ b/tests/fault_tolerance/configs/agg_tp_2_dp_4.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+Common:
+  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  block-size: 64
+  max-model-len: 16384
+Frontend:
+  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  endpoint: dynamo.Processor.chat/completions
+  port: 8000
+Processor:
+  router: round-robin
+  router-num-threads: 4
+  common-configs: [model, block-size, max-model-len]
+  ServiceArgs:
+    workers: 2
+    resources:
+      cpu: "10"
+      memory: "20Gi"
+VllmWorker:
+  enforce-eager: true
+  max-num-batched-tokens: 16384
+  enable-prefix-caching: true
+  tensor-parallel-size: 2
+  ServiceArgs:
+    workers: 4
+    resources:
+      gpu: '2'
+  common-configs: [model, block-size, max-model-len]
+Planner:
+  environment: local
+  no-operation: true
--- a/tests/fault_tolerance/configs/disagg_p_tp_1_dp_1_d_tp_1_dp_1.yaml
+++ b/tests/fault_tolerance/configs/disagg_p_tp_1_dp_1_d_tp_1_dp_1.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+Common:
+  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  block-size: 64
+  max-model-len: 16384
+  kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
+Frontend:
+  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  endpoint: dynamo.Processor.chat/completions
+  port: 8000
+Processor:
+  router: round-robin
+  common-configs: [model, block-size]
+VllmWorker:
+  remote-prefill: true
+  conditional-disagg: true
+  max-local-prefill-length: 10
+  max-prefill-queue-size: 2
+  ServiceArgs:
+    workers: 1
+    resources:
+      gpu: '1'
+  common-configs: [model, block-size, max-model-len, kv-transfer-config]
+PrefillWorker:
+  max-num-batched-tokens: 16384
+  ServiceArgs:
+    workers: 1
+    resources:
+      gpu: '1'
+  common-configs: [model, block-size, max-model-len, kv-transfer-config]
+Planner:
+  environment: local
+  no-operation: true
--- a/tests/fault_tolerance/configs/disagg_p_tp_1_dp_1_d_tp_2_dp_1.yaml
+++ b/tests/fault_tolerance/configs/disagg_p_tp_1_dp_1_d_tp_2_dp_1.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+Common:
+  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  block-size: 64
+  max-model-len: 16384
+  kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
+Frontend:
+  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  endpoint: dynamo.Processor.chat/completions
+  port: 8000
+Processor:
+  router: round-robin
+  common-configs: [model, block-size]
+VllmWorker:
+  remote-prefill: true
+  conditional-disagg: true
+  max-local-prefill-length: 10
+  max-prefill-queue-size: 2
+  tensor-parallel-size: 2
+  ServiceArgs:
+    workers: 1
+    resources:
+      gpu: '2'
+  common-configs: [model, block-size, max-model-len, kv-transfer-config]
+PrefillWorker:
+  max-num-batched-tokens: 16384
+  tensor-parallel-size: 1
+  ServiceArgs:
+    workers: 1
+    resources:
+      gpu: '1'
+  common-configs: [model, block-size, max-model-len, kv-transfer-config]
+Planner:
+  environment: local
+  no-operation: true
--- a/tests/fault_tolerance/configs/disagg_p_tp_1_dp_2_d_tp_2_dp_1.yaml
+++ b/tests/fault_tolerance/configs/disagg_p_tp_1_dp_2_d_tp_2_dp_1.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+Common:
+  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  block-size: 64
+  max-model-len: 16384
+  kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
+Frontend:
+  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  endpoint: dynamo.Processor.chat/completions
+  port: 8000
+Processor:
+  router: round-robin
+  common-configs: [model, block-size]
+  ServiceArgs:
+    workers: 2
+    resources:
+      cpu: "10"
+      memory: "20Gi"
+VllmWorker:
+  remote-prefill: true
+  conditional-disagg: true
+  max-local-prefill-length: 10
+  max-prefill-queue-size: 2
+  tensor-parallel-size: 2
+  ServiceArgs:
+    workers: 1
+    resources:
+      gpu: '2'
+  common-configs: [model, block-size, max-model-len, kv-transfer-config]
+PrefillWorker:
+  max-num-batched-tokens: 16384
+  tensor-parallel-size: 1
+  ServiceArgs:
+    workers: 2
+    resources:
+      gpu: '1'
+  common-configs: [model, block-size, max-model-len, kv-transfer-config]
+Planner:
+  environment: local
+  no-operation: true
--- a/tests/fault_tolerance/configs/disagg_p_tp_1_dp_4_d_tp_4_dp_1.yaml
+++ b/tests/fault_tolerance/configs/disagg_p_tp_1_dp_4_d_tp_4_dp_1.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+Common:
+  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  block-size: 64
+  max-model-len: 16384
+  kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
+Frontend:
+  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  endpoint: dynamo.Processor.chat/completions
+  port: 8000
+Processor:
+  router: round-robin
+  common-configs: [model, block-size]
+VllmWorker:
+  remote-prefill: true
+  conditional-disagg: true
+  max-local-prefill-length: 10
+  max-prefill-queue-size: 2
+  tensor-parallel-size: 4
+  ServiceArgs:
+    workers: 1
+    resources:
+      gpu: '4'
+  common-configs: [model, block-size, max-model-len, kv-transfer-config]
+PrefillWorker:
+  max-num-batched-tokens: 16384
+  ServiceArgs:
+    workers: 4
+    resources:
+      gpu: '1'
+  common-configs: [model, block-size, max-model-len, kv-transfer-config]
+Planner:
+  environment: local
+  no-operation: true
--- a/tests/fault_tolerance/configs/disagg_p_tp_2_dp_1_d_tp_4_dp_1.yaml
+++ b/tests/fault_tolerance/configs/disagg_p_tp_2_dp_1_d_tp_4_dp_1.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+Common:
+  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  block-size: 64
+  max-model-len: 16384
+  kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
+Frontend:
+  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  endpoint: dynamo.Processor.chat/completions
+  port: 8000
+Processor:
+  router: round-robin
+  common-configs: [model, block-size]
+VllmWorker:
+  remote-prefill: true
+  conditional-disagg: true
+  max-local-prefill-length: 10
+  max-prefill-queue-size: 2
+  tensor-parallel-size: 4
+  ServiceArgs:
+    workers: 1
+    resources:
+      gpu: '4'
+  common-configs: [model, block-size, max-model-len, kv-transfer-config]
+PrefillWorker:
+  max-num-batched-tokens: 16384
+  tensor-parallel-size: 2
+  ServiceArgs:
+    workers: 1
+    resources:
+      gpu: '2'
+  common-configs: [model, block-size, max-model-len, kv-transfer-config]
+Planner:
+  environment: local
+  no-operation: true
--- a/tests/fault_tolerance/configs/disagg_p_tp_2_dp_2_d_tp_4_dp_1.yaml
+++ b/tests/fault_tolerance/configs/disagg_p_tp_2_dp_2_d_tp_4_dp_1.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+Common:
+  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  block-size: 64
+  max-model-len: 16384
+  kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
+Frontend:
+  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  endpoint: dynamo.Processor.chat/completions
+  port: 8000
+Processor:
+  router: round-robin
+  common-configs: [model, block-size]
+  ServiceArgs:
+    workers: 2
+    resources:
+      cpu: "10"
+      memory: "20Gi"
+VllmWorker:
+  remote-prefill: true
+  conditional-disagg: true
+  max-local-prefill-length: 10
+  max-prefill-queue-size: 2
+  tensor-parallel-size: 4
+  ServiceArgs:
+    workers: 1
+    resources:
+      gpu: '4'
+  common-configs: [model, block-size, max-model-len, kv-transfer-config]
+PrefillWorker:
+  max-num-batched-tokens: 16384
+  tensor-parallel-size: 2
+  ServiceArgs:
+    workers: 2
+    resources:
+      gpu: '2'
+  common-configs: [model, block-size, max-model-len, kv-transfer-config]
+Planner:
+  environment: local
+  no-operation: true
--- a/tests/fault_tolerance/conftest.py
+++ b/tests/fault_tolerance/conftest.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pytest
+def pytest_addoption(parser):
+    """Register the fault-tolerance command-line options on ``parser``.
+
+    Integer options carry the default used when the flag is omitted; the
+    boolean options are ``store_true`` switches defaulting to ``False``.
+    """
+    # Kept as an ordered table so options are registered in a fixed sequence.
+    option_table = (
+        ("--requests-per-client", {"type": int, "default": 100}),
+        ("--clients", {"type": int, "default": 10}),
+        ("--no-respawn", {"action": "store_true", "default": False}),
+        ("--input-token-length", {"type": int, "default": 100}),
+        ("--output-token-length", {"type": int, "default": 100}),
+        ("--max-num-seqs", {"type": int, "default": None}),
+        ("--max-retries", {"type": int, "default": 1}),
+        ("--display-dynamo-output", {"action": "store_true", "default": False}),
+        ("--combine-process-logs", {"action": "store_true", "default": False}),
+        ("--hf-hub-offline", {"action": "store_true", "default": False}),
+    )
+    for flag, settings in option_table:
+        parser.addoption(flag, **settings)
+@pytest.fixture
+def display_dynamo_output(request):
+    """Expose the value of the ``--display-dynamo-output`` command-line flag."""
+    show_output = request.config.getoption("--display-dynamo-output")
+    return show_output
+@pytest.fixture
+def max_retries(request):
+    """Expose the value of the ``--max-retries`` command-line option."""
+    retry_budget = request.config.getoption("--max-retries")
+    return retry_budget
+@pytest.fixture
+def max_num_seqs(request):
+    """Expose the value of the ``--max-num-seqs`` command-line option."""
+    seq_limit = request.config.getoption("--max-num-seqs")
+    return seq_limit
+@pytest.fixture
+def num_clients(request):
+    """Expose the value of the ``--clients`` command-line option."""
+    client_count = request.config.getoption("--clients")
+    return client_count
+@pytest.fixture
+def input_token_length(request):
+    """Expose the value of the ``--input-token-length`` command-line option."""
+    prompt_length = request.config.getoption("--input-token-length")
+    return prompt_length
+@pytest.fixture
+def output_token_length(request):
+    """Expose the value of the ``--output-token-length`` command-line option."""
+    completion_length = request.config.getoption("--output-token-length")
+    return completion_length
+@pytest.fixture
+def requests_per_client(request):
+    """Expose the value of the ``--requests-per-client`` command-line option."""
+    request_count = request.config.getoption("--requests-per-client")
+    return request_count
+@pytest.fixture
+def respawn(request):
+    """Return the negation of the ``--no-respawn`` command-line flag."""
+    respawn_disabled = request.config.getoption("--no-respawn")
+    return not respawn_disabled
+@pytest.fixture
+def separate_process_logs(request):
+    """Return the negation of the ``--combine-process-logs`` command-line flag."""
+    combine_logs = request.config.getoption("--combine-process-logs")
+    return not combine_logs
+@pytest.fixture
+def hf_hub_offline(request):
+    """Expose the value of the ``--hf-hub-offline`` command-line flag."""
+    offline = request.config.getoption("--hf-hub-offline")
+    return offline
--- a/tests/fault_tolerance/parse_results.py
+++ b/tests/fault_tolerance/parse_results.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import json
+import os
+import re
+from datetime import datetime
+from typing import Any
+import pandas as pd
+from tabulate import tabulate
+def parse_test_log(file_path):
+    """Extract startup duration, fault time and launch command from a test log.
+
+    The log is scanned line by line for three markers: the ``dynamo serve``
+    launch command, the "Deployment Ready" message and the failure-injection
+    message. When a marker occurs several times the last occurrence wins.
+
+    Returns:
+        ``(startup_time, fault_time, start_cmd)`` where ``startup_time`` is the
+        number of seconds between launch and readiness (``None`` unless both
+        were seen), ``fault_time`` is the datetime of the injection line and
+        ``start_cmd`` is the text following "Running command:". All three are
+        ``None`` when ``file_path`` is not an existing file.
+    """
+    if not os.path.isfile(file_path):
+        return None, None, None
+    def _stamp(entry):
+        # The timestamp is the second space-separated field of the line.
+        return datetime.fromisoformat(entry.split(" ")[1].replace("T", " "))
+    started_at = ready_at = injected_at = launch_cmd = None
+    with open(file_path, "r") as log_file:
+        for raw in log_file:
+            entry = raw.strip()
+            if "Running command: dynamo serve" in entry:
+                started_at = _stamp(entry)
+                launch_cmd = entry.split("Running command:")[1]
+            elif "Deployment Ready" in entry:
+                ready_at = _stamp(entry)
+            elif "Injecting failure for:" in entry:
+                injected_at = _stamp(entry)
+    if started_at and ready_at:
+        startup_time = (ready_at - started_at).total_seconds()
+    else:
+        startup_time = None
+    return startup_time, injected_at, launch_cmd
+def _response_succeeded(result, expected_length):
+    """Return True when ``result`` carries a completion of sufficient length."""
+    payload = result.get("result") or {}
+    choices = payload.get("choices") or []
+    if not choices:
+        return False
+    # A response may carry a choice without "message"/"content"; count it as a
+    # failed request instead of raising.
+    message = choices[0].get("message") or {}
+    content = message.get("content")
+    return bool(content) and len(content) >= expected_length
+def parse_client_logs(test_dir, expected_length=100):
+    """Load every ``client_*.log.txt`` file in ``test_dir`` into one DataFrame.
+
+    Each log line is a JSON object with a "time" stamp and a "results" list;
+    one row is produced per result. Blank lines and lines that are not valid
+    JSON are skipped so that a partially written log does not abort the parse.
+
+    Args:
+        test_dir: Directory containing the client log files.
+        expected_length: Minimum length of the first choice's message content
+            for a request to count as successful.
+
+    Returns:
+        A DataFrame sorted by "time" with the columns "time", "status",
+        "request_elapsed_time", "request_number" (zero-based line index in the
+        client's file), "client" (text between the first "_" of the file name
+        and the following ".") and "success"; ``None`` when no rows were found.
+    """
+    all_logs = []
+    # Sorted listing plus a stable sort below keeps row order reproducible
+    # when several entries share a timestamp.
+    for filename in sorted(os.listdir(test_dir)):
+        if not (filename.startswith("client_") and filename.endswith(".log.txt")):
+            continue
+        client = filename.split("_")[1].split(".")[0]
+        with open(os.path.join(test_dir, filename), "r") as f:
+            for request_number, line in enumerate(f):
+                line = line.strip()
+                if not line:
+                    continue
+                try:
+                    data = json.loads(line)
+                except json.JSONDecodeError:
+                    continue
+                for result in data["results"]:
+                    all_logs.append(
+                        {
+                            "time": datetime.fromisoformat(
+                                data["time"].replace("T", " ")
+                            ),
+                            "status": result["status"],
+                            "request_elapsed_time": result["request_elapsed_time"],
+                            "request_number": request_number,
+                            "client": client,
+                            "success": _response_succeeded(result, expected_length),
+                        }
+                    )
+    if not all_logs:
+        return None
+    return pd.DataFrame(all_logs).sort_values("time", kind="mergesort")
+def calculate_metrics(df, fault_time, sla=2.1):
+    """Compute request counts and latency statistics around a fault.
+
+    Args:
+        df: DataFrame with "time", "success" and "request_elapsed_time" columns.
+        fault_time: Moment of fault injection; rows with ``time <= fault_time``
+            count as "before". When falsy, every row counts as "before".
+        sla: Latency threshold; a successful request slower than this is a
+            violation.
+
+    Returns:
+        ``(success, failure, avg_before, std_before, avg_after, std_after,
+        violations_before, violations_after)``. The three "after" values are
+        ``None`` when there is no fault time or no row after it.
+    """
+    def latency_stats(frame):
+        # Latency figures consider successful requests only.
+        latencies = frame[frame["success"]]["request_elapsed_time"]
+        return latencies.mean(), latencies.std(), (latencies > sla).sum()
+    n_success = df["success"].sum()
+    n_failure = len(df) - n_success
+    before_fault = df[df["time"] <= fault_time] if fault_time else df
+    avg_before, std_before, violations_before = latency_stats(before_fault)
+    avg_after = std_after = violations_after = None
+    if fault_time:
+        after_fault = df[df["time"] > fault_time]
+        if not after_fault.empty:
+            avg_after, std_after, violations_after = latency_stats(after_fault)
+    return (
+        n_success,
+        n_failure,
+        avg_before,
+        std_before,
+        avg_after,
+        std_after,
+        violations_before,
+        violations_after,
+    )
+def parse_process_log(log_dir, process_name):
+    """Collect readiness and shutdown events from ``<log_dir>/error.log``.
+
+    ANSI colour sequences are stripped from each line, the first
+    whitespace-separated token is parsed as an ISO timestamp (a "Z" is
+    removed so the result is naive) and the remaining tokens, re-joined with
+    single spaces, form the message. Lines without a parsable timestamp are
+    ignored.
+
+    Returns:
+        ``(ready, shutdown)``: two lists of ``(timestamp, message,
+        seconds_since_first_timestamp)`` tuples for the lines containing the
+        ready or shutdown marker of ``process_name``. Both lists are empty for
+        an unknown process name; ``(None, None)`` is returned when the log
+        file does not exist.
+    """
+    ready_markers = {
+        "dynamo_Frontend": "added model",
+        "dynamo_VllmWorker": "Starting VllmWorker instance with all registered endpoints",
+        "dynamo_Processor": "Starting Processor instance with all registered endpoints",
+        "dynamo_PrefillWorker": "Starting PrefillWorker instance with all registered endpoints",
+    }
+    shutdown_markers = {
+        "dynamo_Frontend": "SIGTERM received, starting graceful shutdown",
+        "dynamo_VllmWorker": "Received shutdown signal, shutting down DistributedRuntime",
+        "dynamo_Processor": "Received signal 15, initiating graceful shutdown",
+        "dynamo_PrefillWorker": "Shutdown hooks completed successfully",
+    }
+    log_file = os.path.join(log_dir, "error.log")
+    if not os.path.isfile(log_file):
+        return None, None
+    ready_marker = ready_markers.get(process_name)
+    shutdown_marker = shutdown_markers.get(process_name)
+    ready_events, shutdown_events = [], []
+    first_timestamp = None
+    with open(log_file, "r") as handle:
+        for raw in handle:
+            text = re.sub(r"\x1b\[.*?m", "", raw.strip())  # Remove ANSI codes
+            tokens = text.split()
+            # Covers empty lines too: they yield no tokens at all.
+            if len(tokens) < 2:
+                continue
+            try:
+                timestamp = datetime.fromisoformat(tokens[0].replace("Z", ""))
+            except ValueError:
+                continue
+            if not first_timestamp:
+                first_timestamp = timestamp
+            message = " ".join(tokens[1:])
+            event = (
+                timestamp,
+                message,
+                (timestamp - first_timestamp).total_seconds(),
+            )
+            if ready_marker is not None and ready_marker in message:
+                ready_events.append(event)
+            if shutdown_marker is not None and shutdown_marker in message:
+                shutdown_events.append(event)
+    return ready_events, shutdown_events
+def parse_watcher_log(test_dir, fault_time):
+    """Average the waiting-request counts logged before and after a fault.
+
+    Reads ``<test_dir>/watcher.log.txt`` as JSON lines. Lines that are not
+    valid JSON or lack a "metrics" key are skipped. Each metric is expected to
+    be a two-element pair whose second element holds "num_requests_waiting"
+    and "request_active_slots"; only entries with at least one active slot
+    contribute.
+
+    Returns:
+        ``(avg_before, avg_after)``; an average is ``None`` when it has no
+        samples. With ``fault_time=None`` every sample counts as "before".
+        ``(None, None)`` is returned when the log file does not exist.
+    """
+    log_file = os.path.join(test_dir, "watcher.log.txt")
+    if not os.path.isfile(log_file):
+        return None, None
+    def mean_or_none(values):
+        return sum(values) / len(values) if values else None
+    waiting_before, waiting_after = [], []
+    with open(log_file, "r") as handle:
+        for raw in handle:
+            try:
+                record = json.loads(raw.strip())
+            except json.JSONDecodeError:
+                continue
+            if "metrics" not in record:
+                continue
+            recorded_at = datetime.fromisoformat(record["time"].replace("T", " "))
+            for metric in record["metrics"]:
+                if len(metric) != 2:
+                    continue
+                _, stats = metric
+                if (
+                    "num_requests_waiting" not in stats
+                    or "request_active_slots" not in stats
+                ):
+                    continue
+                if not stats["request_active_slots"] > 0:
+                    continue
+                is_before = fault_time is None or recorded_at <= fault_time
+                bucket = waiting_before if is_before else waiting_after
+                bucket.append(stats["num_requests_waiting"])
+    return mean_or_none(waiting_before), mean_or_none(waiting_after)
+def calculate_recovery_time(test_dir, failure_type, fault_time):
+    """Return the seconds between fault injection and the component's last ready line.
+
+    Args:
+        test_dir: Test directory holding one sub-directory per process.
+        failure_type: One of "processor", "frontend", "decode_worker" or
+            "prefill_worker"; any other value yields ``None``.
+        fault_time: Datetime at which the fault was injected, or ``None``.
+
+    Returns:
+        Seconds from ``fault_time`` to the last matching ready event of the
+        affected process, or ``None`` when the fault time is unknown, the
+        process logged no matching ready event, or that event precedes the
+        fault.
+    """
+    # failure type -> (process log directory, text a ready line must contain,
+    # or None to accept any ready line of that process)
+    restart_markers = {
+        "processor": ("dynamo_Processor", None),
+        "frontend": ("dynamo_Frontend", None),
+        "decode_worker": ("dynamo_VllmWorker", "VllmWorker:1"),
+        "prefill_worker": ("dynamo_PrefillWorker", "PrefillWorker:1"),
+    }
+    # Without a fault timestamp there is nothing to measure against.
+    if fault_time is None or failure_type not in restart_markers:
+        return None
+    process, instance_tag = restart_markers[failure_type]
+    # Only the affected process's log is needed.
+    ready_events, _ = parse_process_log(os.path.join(test_dir, process), process)
+    if not ready_events:
+        # Log missing or the process never reported ready.
+        return None
+    if instance_tag is not None:
+        ready_events = [event for event in ready_events if instance_tag in event[1]]
+        if not ready_events:
+            return None
+    restarted_at = ready_events[-1][0]
+    # A last ready event older than the fault means no restart was observed.
+    if fault_time > restarted_at:
+        return None
+    return (restarted_at - fault_time).total_seconds()
+def process_test_directory(test_dir):
+    """Summarize one fault-tolerance test directory as a flat result record.
+
+    The path is expected to contain ``test_worker_failure[<prefix>-<failure>]``:
+    the text after the last "-" is the failure type and everything before it
+    is the test prefix.
+
+    Returns:
+        A dict with the test prefix, launch command, failure type, startup
+        time, request counts, latency statistics, pending-request averages,
+        SLA violations and recovery time; ``None`` when the path does not
+        contain the expected marker or no client log entries were found.
+    """
+    marker = "test_worker_failure["
+    # Drop trailing path separators (e.g. from shell completion) so the
+    # closing bracket of the test id really is the last character.
+    normalized = test_dir.rstrip("/" + os.sep)
+    if marker not in normalized:
+        return None
+    test_name = normalized.split(marker, 1)[1].rstrip("]")
+    test_prefix, _, failure_type = test_name.rpartition("-")
+    startup_time, fault_time, start_cmd = parse_test_log(
+        os.path.join(test_dir, "test.log.txt")
+    )
+    df = parse_client_logs(test_dir)
+    if df is None or df.empty:
+        return None
+    pending_requests_before, pending_requests_after = parse_watcher_log(
+        test_dir, fault_time
+    )
+    (
+        success,
+        failure,
+        avg_before,
+        std_before,
+        avg_after,
+        std_after,
+        violations_before,
+        violations_after,
+    ) = calculate_metrics(df, fault_time)
+    recovery_time = calculate_recovery_time(test_dir, failure_type, fault_time)
+    return {
+        "test": test_prefix,
+        "cmd": start_cmd,
+        "failure": failure_type,
+        "start_time": startup_time,
+        "success_requests": success,
+        "failed_requests": failure,
+        "avg_latency_before": avg_before,
+        "std_latency_before": std_before,
+        "avg_latency_after": avg_after,
+        "std_latency_after": std_after,
+        "pending_requests_before": pending_requests_before,
+        "pending_requests_after": pending_requests_after,
+        "violations_before": violations_before,
+        "violations_after": violations_after,
+        "recovery_time": recovery_time,
+    }
+def main(logs_dir, tablefmt, log_paths=None):
+    """Print one summary table per test group.
+
+    Args:
+        logs_dir: Directory scanned for ``test_worker_failure[...]``
+            sub-directories; used only when ``log_paths`` is empty.
+        tablefmt: Table format name passed through to ``tabulate``.
+        log_paths: Optional explicit list of test directories to process
+            instead of scanning ``logs_dir``.
+    """
+    if log_paths:
+        test_dirs = list(log_paths)
+    elif logs_dir:
+        # Sorted so that the output order does not depend on the filesystem.
+        test_dirs = [
+            os.path.join(logs_dir, entry)
+            for entry in sorted(os.listdir(logs_dir))
+            if entry.startswith("test_worker_failure[")
+            and os.path.isdir(os.path.join(logs_dir, entry))
+        ]
+    else:
+        test_dirs = []
+    results = []
+    for test_dir in test_dirs:
+        result = process_test_directory(test_dir)
+        if result:
+            results.append(result)
+    # Group results by test prefix; the first result's command labels the group.
+    grouped: dict[str, list[dict[str, Any]]] = {}
+    commands = {}
+    for res in results:
+        grouped.setdefault(res["test"], []).append(res)
+        commands.setdefault(res["test"], res["cmd"])
+    order = [
+        "none",
+        "frontend",
+        "processor",
+        "decode_worker",
+        "prefill_worker",
+        "vllm_worker",
+    ]
+    rank = {failure: position for position, failure in enumerate(order)}
+    headers = [
+        "Failure",
+        "Startup Time",
+        "Success",
+        "Failed",
+        "Latency Before",
+        "Latency After",
+        "Pending Before",
+        "Pending After",
+        "Violations Before",
+        "Violations After",
+        "Recovery Time",
+    ]
+    # Result keys in the same order as the headers above.
+    columns = [
+        "failure",
+        "start_time",
+        "success_requests",
+        "failed_requests",
+        "avg_latency_before",
+        "avg_latency_after",
+        "pending_requests_before",
+        "pending_requests_after",
+        "violations_before",
+        "violations_after",
+        "recovery_time",
+    ]
+    # Print grouped tables
+    for test_prefix, group in grouped.items():
+        # Stable sort: known failure types in the canonical order, any other
+        # failure type kept at the end instead of being dropped from the table.
+        ordered = sorted(group, key=lambda res: rank.get(res["failure"], len(order)))
+        rows = [[res[column] for column in columns] for res in ordered]
+        print(f"\nTest Group: {test_prefix}")
+        print(f"\nTest Command: {commands[test_prefix]}")
+        print(
+            tabulate(
+                rows,
+                headers,
+                tablefmt=tablefmt,
+                floatfmt=".2f",
+                missingval="N/A",
+                numalign="right",
+                stralign="center",
+            )
+        )
+        print("\n" + "=" * 80)
+if __name__ == "__main__":
+    # Command-line entry point: parse the options and print the summary tables.
+    parser = argparse.ArgumentParser(description="Parse test results")
+    parser.add_argument("--log-dir", default=".", help="Path to the logs directory")
+    parser.add_argument(
+        "--format", choices=["fancy", "markdown"], default="fancy", help="Table format"
+    )
+    args = parser.parse_args()
+    # Map format choices to tabulate formats; "pipe" is used for markdown
+    # compatibility.
+    tabulate_formats = {"fancy": "fancy_grid", "markdown": "pipe"}
+    tablefmt = tabulate_formats[args.format]
+    main(args.log_dir, tablefmt)