feat(fault-injection): Enable runtime CUDA fault injection toggling without pod restarts (#4679)

d2c23e41 · nv-oviya · GitHub · 300e5d55 · d2c23e41 · d2c23e41
Unverified Commit d2c23e41 authored Dec 08, 2025 by nv-oviya Committed by GitHub Dec 08, 2025
4 changed files
--- a/tests/fault_tolerance/hardware/fault_injection_service/cuda_fault_injection/README.md
+++ b/tests/fault_tolerance/hardware/fault_injection_service/cuda_fault_injection/README.md
@@ -6,13 +6,16 @@
 ## What This Does
-Makes CUDA calls return error codes to simulate various GPU failures. Uses LD_PRELOAD to intercept CUDA library calls.
+Intercepts CUDA calls to simulate GPU failures using LD_PRELOAD. Faults persist across pod restarts via hostPath volumes, enabling realistic hardware failure testing.
 ```
-Pod calls cudaMalloc() → LD_PRELOAD intercepts → Returns error → Pod crashes
+Pod calls cudaMalloc() → LD_PRELOAD intercepts → Checks /host-fault/cuda_fault_enabled → Returns error → Pod crashes
 ```
-**Result**: Realistic GPU failure testing without hardware damage.
+**Key Features**:
+- **Persistent faults**: hostPath volume (`/var/lib/cuda-fault-test`) survives pod restarts on same node
+- **Runtime toggle**: Enable/disable faults without pod restarts via `/host-fault/cuda_fault_enabled`
+- **Node-specific**: Faults only on target node, healthy nodes unaffected
 ## Scope
@@ -35,13 +38,20 @@ This library simulates **software/orchestration-level failures** that occur when
 | **43** | GPU stopped responding | `CUDA_ERROR_LAUNCH_TIMEOUT` | Hung kernel |
 | **74** | NVLink error | `CUDA_ERROR_PEER_ACCESS_UNSUPPORTED` | Multi-GPU communication failure |
+## How It Works
+1. **Deployment patching**: Adds hostPath volume + init container to compile library
+2. **LD_PRELOAD injection**: Environment variable loads library before CUDA
+3. **Runtime control**: Toggle file (`/host-fault/cuda_fault_enabled`) controls fault state
+4. **Node persistence**: hostPath ensures faults survive pod restarts on same node
 ## Files in This Directory
 | File | Purpose |
 |------|---------|
-| `cuda_intercept.c` | C library source that intercepts CUDA calls |
+| `cuda_intercept.c` | C library that intercepts CUDA calls and checks fault markers |
-| `inject_into_pods.py` | Helper functions for patching Kubernetes deployments |
+| `inject_into_pods.py` | Kubernetes deployment patcher (adds hostPath volume + library) |
-| `Makefile` | Builds the `.so` library locally (optional, for standalone testing) |
+| `Makefile` | Local build (optional, for testing) |
 ## Prerequisites

--- a/tests/fault_tolerance/hardware/fault_injection_service/cuda_fault_injection/cuda_intercept.c
+++ b/tests/fault_tolerance/hardware/fault_injection_service/cuda_fault_injection/cuda_intercept.c
@@ -59,19 +59,20 @@ static const xid_mapping_t xid_mappings[] = {
 };
 // Get XID type and corresponding CUDA error
+// Supports runtime toggling via /host-fault/cuda_fault_enabled (falls back to /tmp/cuda_fault_enabled if that file cannot be opened)
 static void
 get_fault_config(int* inject, int* xid_type, cudaError_t* error_code)
 {
  static int initialized = 0;
-  static int cached_inject = 0;
+  static int env_inject = 0;   // From environment variable
  static int cached_xid = 79;  // Default to XID 79
  static cudaError_t cached_error = cudaErrorNoDevice;
  if (!initialized) {
-    // Check if injection is enabled
+    // Check if injection is enabled via environment
    char* env = getenv("CUDA_FAULT_INJECTION_ENABLED");
    if (env) {
-      cached_inject = (strcmp(env, "1") == 0 || strcmp(env, "true") == 0);
+      env_inject = (strcmp(env, "1") == 0 || strcmp(env, "true") == 0);
    }
    // Get XID type
@@ -85,8 +86,7 @@ get_fault_config(int* inject, int* xid_type, cudaError_t* error_code)
        if (xid_mappings[i].xid == cached_xid) {
          cached_error = xid_mappings[i].cuda_error;
          fprintf(
-              stderr, "[CUDA FAULT INJECTION] ENABLED - Simulating XID %d (%s)\n", cached_xid,
+              stderr, "[CUDA FAULT INJECTION] Library loaded - XID %d (%s)\n", cached_xid, xid_mappings[i].description);
-              xid_mappings[i].description);
          found = 1;
          break;
        }
@@ -97,16 +97,37 @@ get_fault_config(int* inject, int* xid_type, cudaError_t* error_code)
        cached_xid = 79;
        cached_error = cudaErrorNoDevice;
      }
-    } else {
-      fprintf(
-          stderr, "[CUDA FAULT INJECTION] %s (default: XID 79 - GPU fell off bus)\n",
-          cached_inject ? "ENABLED" : "DISABLED");
    }
    initialized = 1;
  }
-  *inject = cached_inject;
+  // Runtime toggle: Check node-persistent fault marker on EVERY call
+  // Use hostPath (/host-fault) so fault persists across pod restarts on same node
+  // Pod reschedules to different node → no file there → automatic recovery!
+  int runtime_inject = env_inject;  // Default to env var
+  // Check hostPath first (persistent across restarts on same node)
+  FILE* toggle_file = fopen("/host-fault/cuda_fault_enabled", "r");
+  if (toggle_file) {
+    char toggle_value[4] = {0};
+    if (fgets(toggle_value, sizeof(toggle_value), toggle_file)) {
+      runtime_inject = (toggle_value[0] == '1');
+    }
+    fclose(toggle_file);
+  } else {
+    // Fallback to ephemeral /tmp for backwards compatibility
+    toggle_file = fopen("/tmp/cuda_fault_enabled", "r");
+    if (toggle_file) {
+      char toggle_value[4] = {0};
+      if (fgets(toggle_value, sizeof(toggle_value), toggle_file)) {
+        runtime_inject = (toggle_value[0] == '1');
+      }
+      fclose(toggle_file);
+    }
+  }
+  *inject = runtime_inject;
  *xid_type = cached_xid;
  *error_code = cached_error;
 }

--- a/tests/fault_tolerance/hardware/fault_injection_service/cuda_fault_injection/inject_into_pods.py
+++ b/tests/fault_tolerance/hardware/fault_injection_service/cuda_fault_injection/inject_into_pods.py
@@ -201,6 +201,18 @@ def _patch_service_for_injection(
            {"name": "cuda-fault-lib", "emptyDir": {}}
        )
+        # Add hostPath volume for persistent fault marker (survives pod restarts on same node)
+        # This simulates persistent hardware failure!
+        service["extraPodSpec"]["volumes"].append(
+            {
+                "name": "node-fault-marker",
+                "hostPath": {
+                    "path": "/var/lib/cuda-fault-test",
+                    "type": "DirectoryOrCreate",
+                },
+            }
+        )
        # Add init container to decode base64
        if "initContainers" not in service["extraPodSpec"]:
            service["extraPodSpec"]["initContainers"] = []
@@ -247,7 +259,7 @@ def _patch_service_for_injection(
            if vm.get("name") != "cuda-fault-lib"
        ]
-        # Add mount
+        # Add mount for compiled library
        service["extraPodSpec"]["mainContainer"]["volumeMounts"].append(
            {
                "name": "cuda-fault-lib",
@@ -256,8 +268,18 @@ def _patch_service_for_injection(
            }
        )
+        # Add mount for persistent fault marker (hostPath)
+        service["extraPodSpec"]["mainContainer"]["volumeMounts"].append(
+            {
+                "name": "node-fault-marker",
+                "mountPath": "/host-fault",
+                "readOnly": False,  # Need write access
+            }
+        )
        print("      ✓ Added init container to compile library")
        print("      ✓ Added ConfigMap volume mount")
+        print("      ✓ Added hostPath volume for persistent fault marker")
    # Add node affinity to pin pods to target node (simulates real XID 79 behavior)
    if target_node and enable:
@@ -287,14 +309,15 @@ def _patch_service_for_injection(
            service["extraPodSpec"]["volumes"] = [
                v
                for v in service["extraPodSpec"]["volumes"]
-                if v.get("name") not in ["cuda-fault-lib", "cuda-fault-lib-source"]
+                if v.get("name")
+                not in ["cuda-fault-lib", "cuda-fault-lib-source", "node-fault-marker"]
            ]
        if "volumeMounts" in service["extraPodSpec"].get("mainContainer", {}):
            service["extraPodSpec"]["mainContainer"]["volumeMounts"] = [
                vm
                for vm in service["extraPodSpec"]["mainContainer"]["volumeMounts"]
-                if vm.get("name") != "cuda-fault-lib"
+                if vm.get("name") not in ["cuda-fault-lib", "node-fault-marker"]
            ]
        # Remove init container
@@ -323,6 +346,7 @@ def patch_deployment_env(
    use_configmap=True,
    target_node=None,
    xid_type=79,
+    passthrough_mode=False,
 ):
    """Patch deployment to add/remove LD_PRELOAD environment variable.
@@ -334,6 +358,8 @@ def patch_deployment_env(
        target_node: If provided, adds node affinity to pin pods to this node
                    (simulates real XID where pods crash on the faulty node)
        xid_type: XID error type to simulate (79, 48, 94, 95, 43, 74). Default: 79
+        passthrough_mode: If True, set CUDA_FAULT_INJECTION_ENABLED=0 (library loaded but disabled)
+                         Allows baseline testing before enabling faults via toggle
    """
    custom_api = client.CustomObjectsApi()
    apps_api = client.AppsV1Api()
@@ -385,9 +411,14 @@ def patch_deployment_env(
            # Prepare environment variables
            new_envs = []
            if enable:
+                # Set CUDA_FAULT_INJECTION_ENABLED based on passthrough_mode
+                fault_enabled_value = "0" if passthrough_mode else "1"
                new_envs = [
                    {"name": "LD_PRELOAD", "value": lib_path},
-                    {"name": "CUDA_FAULT_INJECTION_ENABLED", "value": "1"},
+                    {
+                        "name": "CUDA_FAULT_INJECTION_ENABLED",
+                        "value": fault_enabled_value,
+                    },
                    {"name": "CUDA_XID_TYPE", "value": str(xid_type)},
                ]
@@ -400,6 +431,28 @@ def patch_deployment_env(
            available_services = list(services.keys())
            print(f"    → Available services: {available_services}")
+            # Set aggressive update strategy when enabling (allow all pods to update at once)
+            # This ensures all pods get CUDA faults, not just the first few
+            if enable:
+                if "updateStrategy" not in spec:
+                    spec["updateStrategy"] = {}
+                if "rollingUpdate" not in spec["updateStrategy"]:
+                    spec["updateStrategy"]["rollingUpdate"] = {}
+                # Allow all pods to be unavailable during update
+                spec["updateStrategy"]["rollingUpdate"]["maxUnavailable"] = "100%"
+                # Don't create surge pods
+                spec["updateStrategy"]["rollingUpdate"]["maxSurge"] = 0
+                print("    → Set update strategy: maxUnavailable=100%, maxSurge=0")
+                print("       (All pods will update simultaneously)")
+            else:
+                # Restore default update strategy when disabling
+                if "updateStrategy" in spec:
+                    spec["updateStrategy"] = {
+                        "rollingUpdate": {"maxUnavailable": "25%", "maxSurge": "25%"}
+                    }
+                    print("    → Restored default update strategy (maxUnavailable=25%)")
            for service_name in services_to_patch:
                if service_name in services:
                    print(f"    → Patching service: {service_name}")
@@ -465,6 +518,38 @@ def patch_deployment_env(
            print(f"    Services patched: {', '.join(patched_services)}")
            if use_configmap and enable:
                print(f"    Library mounted at: {lib_path}")
+            # Force restart all worker pods when enabling to apply changes immediately
+            if enable:
+                print(
+                    "    → Force-deleting all worker pods to apply changes immediately..."
+                )
+                core_api = client.CoreV1Api()
+                try:
+                    worker_pods = core_api.list_namespaced_pod(
+                        namespace=namespace,
+                        label_selector=f"nvidia.com/dynamo-graph-deployment-name={deployment_name},nvidia.com/dynamo-component-type=worker",
+                    )
+                    deleted_count = 0
+                    for pod in worker_pods.items:
+                        try:
+                            core_api.delete_namespaced_pod(
+                                name=pod.metadata.name,
+                                namespace=namespace,
+                                grace_period_seconds=0,
+                            )
+                            deleted_count += 1
+                        except Exception as e:
+                            print(
+                                f"      ⚠ Could not delete pod {pod.metadata.name}: {e}"
+                            )
+                    print(
+                        f"    ✓ Deleted {deleted_count} pod(s) - they will restart with CUDA library"
+                    )
+                except Exception as e:
+                    print(f"      ⚠ Could not list/delete pods: {e}")
+                    print("         Pods will eventually restart, but may take longer")
            return True
        except ApiException as e:
@@ -505,11 +590,15 @@ def patch_deployment_env(
        if enable:
            # Add new env vars
+            # Set CUDA_FAULT_INJECTION_ENABLED based on passthrough_mode
+            fault_enabled_value = "0" if passthrough_mode else "1"
            container.env.append(
                client.V1EnvVar(name="LD_PRELOAD", value="/tmp/cuda_intercept.so")
            )
            container.env.append(
-                client.V1EnvVar(name="CUDA_FAULT_INJECTION_ENABLED", value="1")
+                client.V1EnvVar(
+                    name="CUDA_FAULT_INJECTION_ENABLED", value=fault_enabled_value
+                )
            )
            container.env.append(
                client.V1EnvVar(name="CUDA_XID_TYPE", value=str(xid_type))

--- a/tests/fault_tolerance/hardware/fault_injection_service/helpers/cuda_fault_injection.py
+++ b/tests/fault_tolerance/hardware/fault_injection_service/helpers/cuda_fault_injection.py
@@ -37,7 +37,7 @@ class CUDAFaultInjector:
            lib_dir = Path(__file__).parent.parent / "cuda_fault_injection"
        self.lib_dir = lib_dir
-        self.lib_path = lib_dir / "fake_cuda_xid79.so"
+        self.lib_path = lib_dir / "cuda_intercept.so"
        self.lib_built = False
    def build_library(self) -> bool:
@@ -101,12 +101,57 @@ class CUDAFaultInjector:
            traceback.print_exc()
            return False
+    def check_if_cuda_library_deployed(
+        self, deployment_name: str, namespace: str
+    ) -> bool:
+        """
+        Check whether the DynamoGraphDeployment already carries an LD_PRELOAD env var.
+        Two spec layouts are inspected:
+          - spec.services[*].extraPodSpec.mainContainer.env (any service)
+          - spec.workerSpec.podSpec.containers[*].env, for containers named
+            "vllm-worker" or "worker"
+        Only the presence of an env entry named LD_PRELOAD is checked; its value
+        is not compared against the fault library path.
+        Args:
+            deployment_name: Name of the deployment
+            namespace: Kubernetes namespace
+        Returns:
+            True if an LD_PRELOAD env entry is found in either layout, False
+            otherwise (including when the deployment cannot be read)
+        """
+        def has_ld_preload(env) -> bool:
+            # Tolerate a missing/None env list and non-dict entries
+            return any(
+                isinstance(entry, dict) and entry.get("name") == "LD_PRELOAD"
+                for entry in (env or [])
+            )
+        try:
+            k8s_custom = client.CustomObjectsApi()
+            # Get the DynamoGraphDeployment
+            dgd = k8s_custom.get_namespaced_custom_object(
+                group="nvidia.com",
+                version="v1alpha1",
+                namespace=namespace,
+                plural="dynamographdeployments",
+                name=deployment_name,
+            )
+            spec = dgd.get("spec") or {}
+            # Per-service layout: the same path verify_env_var_set reads the
+            # CUDA_FAULT_INJECTION_ENABLED variable from
+            for service in (spec.get("services") or {}).values():
+                main_container = (
+                    ((service or {}).get("extraPodSpec") or {}).get("mainContainer")
+                    or {}
+                )
+                if has_ld_preload(main_container.get("env")):
+                    return True
+            # workerSpec layout (the only one the original check looked at)
+            worker_spec = spec.get("workerSpec") or {}
+            pod_spec = worker_spec.get("podSpec") or {}
+            for container in pod_spec.get("containers") or []:
+                if container.get("name") in ["vllm-worker", "worker"]:
+                    if has_ld_preload(container.get("env")):
+                        return True
+            return False
+        except Exception:
+            # If we can't read the deployment, assume it's not deployed
+            return False
    def patch_deployment_for_cuda_fault(
        self,
        deployment_name: str,
        namespace: str,
        target_node: Optional[str] = None,
        xid_type: int = 79,
+        passthrough_mode: bool = False,
    ) -> bool:
        """
        Patch deployment to enable CUDA fault injection.
@@ -116,6 +161,7 @@ class CUDAFaultInjector:
        - Init container to compile library
        - LD_PRELOAD environment variable
        - CUDA_XID_TYPE environment variable
+        - CUDA_FAULT_INJECTION_ENABLED (0 in passthrough mode, 1 otherwise)
        - Node affinity (if target_node specified)
        Args:
@@ -123,6 +169,8 @@ class CUDAFaultInjector:
            namespace: Kubernetes namespace
            target_node: Node to pin pods to (simulates real XID behavior)
            xid_type: XID error type to simulate (79, 48, 94, 95, 43, 74). Default: 79
+            passthrough_mode: If True, set CUDA_FAULT_INJECTION_ENABLED=0
+                            (library loaded but faults disabled for baseline)
        Returns:
            True if patch succeeded
@@ -149,6 +197,7 @@ class CUDAFaultInjector:
                use_configmap=True,
                target_node=target_node,
                xid_type=xid_type,
+                passthrough_mode=passthrough_mode,
            )
        except Exception as e:
@@ -339,6 +388,248 @@ class CUDAFaultInjector:
            traceback.print_exc()
            return False
+    def enable_cuda_faults_via_toggle(
+        self, pods: List[client.V1Pod], namespace: str, enable: bool = True
+    ) -> bool:
+        """
+        Enable or disable CUDA faults on running pods by writing a toggle file.
+        For each pod, runs `kubectl exec` in the pod's first container to write
+        "1" (enable) or "0" (disable) to /host-fault/cuda_fault_enabled, then reads
+        the file back to verify the value. Pods are not restarted and no
+        environment variable is modified.
+        Args:
+            pods: List of pods to toggle faults on
+            namespace: Kubernetes namespace
+            enable: True to enable faults, False to disable
+        Returns:
+            True if the toggle was written and verified on at least one pod;
+            False if `pods` is empty or every pod failed
+        """
+        if not pods:
+            return False
+        toggle_value = "1" if enable else "0"
+        action = "Enabling" if enable else "Disabling"
+        print(f"\n[→] {action} CUDA faults via toggle on {len(pods)} pods...")
+        success_count = 0
+        failed_pods = []
+        for pod in pods:
+            pod_name = pod.metadata.name
+            try:
+                # Use the first container listed in the pod spec as the exec target
+                container_name = (
+                    pod.spec.containers[0].name if pod.spec.containers else None
+                )
+                if not container_name:
+                    failed_pods.append((pod_name, "No container found"))
+                    continue
+                # Write toggle file under /host-fault, then cat it back so the
+                # written value can be verified from stdout
+                exec_command = [
+                    "sh",
+                    "-c",
+                    f'mkdir -p /host-fault && echo "{toggle_value}" > /host-fault/cuda_fault_enabled && cat /host-fault/cuda_fault_enabled',
+                ]
+                result = subprocess.run(
+                    [
+                        "kubectl",
+                        "exec",
+                        "-n",
+                        namespace,
+                        pod_name,
+                        "-c",
+                        container_name,
+                        "--",
+                    ]
+                    + exec_command,
+                    capture_output=True,
+                    text=True,
+                    timeout=10,
+                )
+                if result.returncode == 0:
+                    # stdout is the content read back from the toggle file
+                    actual_value = result.stdout.strip()
+                    if actual_value == toggle_value:
+                        print(
+                            f"    ✓ Toggle={toggle_value} in {pod_name}/{container_name}"
+                        )
+                        success_count += 1
+                    else:
+                        failed_pods.append(
+                            (
+                                pod_name,
+                                f"Verify failed: expected '{toggle_value}', got '{actual_value}'",
+                            )
+                        )
+                else:
+                    failed_pods.append(
+                        (pod_name, f"Exec failed: {result.stderr.strip()}")
+                    )
+            except Exception as e:
+                # Includes subprocess timeouts; record and move on to the next pod
+                failed_pods.append((pod_name, str(e)))
+                continue
+        if failed_pods:
+            print(f"    ⚠ Failed to toggle {len(failed_pods)} pods:")
+            for pod_name, error in failed_pods:
+                print(f"       - {pod_name}: {error}")
+        print(f"    → Result: {success_count}/{len(pods)} pods toggled successfully")
+        # NOTE(review): partial success still returns True; callers that need every
+        # pod toggled must inspect the printed result - confirm this is intended
+        return success_count > 0
+    def disable_cuda_faults_via_toggle(
+        self, pods: List[client.V1Pod], namespace: str
+    ) -> bool:
+        """
+        Turn CUDA fault injection off on running pods.
+        Thin wrapper that delegates to enable_cuda_faults_via_toggle with
+        enable=False.
+        Args:
+            pods: List of pod objects to disable faults on
+            namespace: Kubernetes namespace
+        Returns:
+            The result of enable_cuda_faults_via_toggle for enable=False
+        """
+        toggled = self.enable_cuda_faults_via_toggle(
+            pods=pods, namespace=namespace, enable=False
+        )
+        return toggled
+    def cleanup_node_fault_markers(
+        self, pods: List[client.V1Pod], namespace: str
+    ) -> bool:
+        """
+        Remove persistent fault marker files from node hostPath.
+        Runs `rm -f /host-fault/cuda_fault_enabled` via `kubectl exec` in the first
+        container of a pod on each node (node taken from pod.spec.node_name). Once
+        a node has been cleaned, remaining pods on that node are skipped; if the
+        exec fails, the next pod on the same node is tried.
+        Args:
+            pods: List of pods (to access nodes)
+            namespace: Kubernetes namespace
+        Returns:
+            True if `pods` is empty or the marker was removed on at least one node;
+            False if every attempt failed
+        """
+        if not pods:
+            return True
+        print("    [->] Cleaning persistent fault markers from nodes...")
+        nodes_cleaned = set()
+        for pod in pods:
+            pod_name = pod.metadata.name
+            node_name = pod.spec.node_name
+            # Skip if we already cleaned this node
+            if node_name in nodes_cleaned:
+                continue
+            try:
+                container_name = (
+                    pod.spec.containers[0].name if pod.spec.containers else None
+                )
+                if not container_name:
+                    continue
+                # `rm -f` exits 0 when the file is already absent and non-zero when
+                # it cannot be deleted, so the exit status reflects the real outcome.
+                # (A trailing `; echo ok` would force exit status 0 and hide failures.)
+                exec_command = [
+                    "sh",
+                    "-c",
+                    "rm -f /host-fault/cuda_fault_enabled",
+                ]
+                result = subprocess.run(
+                    [
+                        "kubectl",
+                        "exec",
+                        "-n",
+                        namespace,
+                        pod_name,
+                        "-c",
+                        container_name,
+                        "--",
+                    ]
+                    + exec_command,
+                    capture_output=True,
+                    text=True,
+                    timeout=10,
+                )
+                if result.returncode == 0:
+                    print(f"    ✓ Cleaned fault marker on node {node_name}")
+                    nodes_cleaned.add(node_name)
+                else:
+                    print(
+                        f"    ⚠ Could not clean fault marker via {pod_name}: {result.stderr.strip()}"
+                    )
+            except Exception as e:
+                # Best-effort cleanup: report the problem and try the next pod
+                print(f"    ⚠ Could not clean fault marker via {pod_name}: {e}")
+                continue
+        return len(nodes_cleaned) > 0
+    def verify_env_var_set(
+        self,
+        deployment_name: str,
+        namespace: str,
+        expected_value: str,
+        max_wait: int = 30,
+    ) -> bool:
+        """
+        Poll the DynamoGraphDeployment until CUDA_FAULT_INJECTION_ENABLED matches.
+        Reads spec.services[<name>].extraPodSpec.mainContainer.env for the
+        "VllmDecodeWorker" and "VllmPrefillWorker" services, retrying about once
+        per second. A service that is present passes only when it defines the
+        variable and every such entry equals `expected_value`; a service that
+        lacks the variable does not pass. Services absent from the spec are
+        skipped, so a spec containing neither service verifies immediately.
+        Args:
+            deployment_name: Name of the DynamoGraphDeployment
+            namespace: Kubernetes namespace
+            expected_value: Expected value ("0" or "1")
+            max_wait: Maximum seconds to wait
+        Returns:
+            True as soon as every present worker service matches; False if that
+            does not happen within `max_wait` seconds
+        """
+        env_name = "CUDA_FAULT_INJECTION_ENABLED"
+        worker_services = ("VllmDecodeWorker", "VllmPrefillWorker")
+        k8s_custom = client.CustomObjectsApi()
+        start_time = time.time()
+        while time.time() - start_time < max_wait:
+            try:
+                dgd = k8s_custom.get_namespaced_custom_object(
+                    group="nvidia.com",
+                    version="v1alpha1",
+                    namespace=namespace,
+                    plural="dynamographdeployments",
+                    name=deployment_name,
+                )
+                services = dgd["spec"]["services"]
+                all_match = True
+                # Check both worker services
+                for service_name in worker_services:
+                    if service_name not in services:
+                        continue
+                    env_vars = (
+                        services[service_name]
+                        .get("extraPodSpec", {})
+                        .get("mainContainer", {})
+                        .get("env", [])
+                    )
+                    values = [
+                        env_var.get("value")
+                        for env_var in env_vars
+                        if env_var.get("name") == env_name
+                    ]
+                    # A missing variable is a mismatch, not a pass
+                    if not values or any(v != expected_value for v in values):
+                        all_match = False
+                        break
+                if all_match:
+                    return True
+            except Exception:
+                # API or schema errors are treated as "not yet"; retry until timeout
+                pass
+            time.sleep(1)
+        return False
    def trigger_pod_restart(self, pods: List[client.V1Pod], namespace: str):
        """
        Delete pods to trigger restart with new env vars.