feat(fault-injection): Add Kubernetes deployment manifests (#4044)

0e6bb7bf · nv-oviya · GitHub · e10319f3 · 0e6bb7bf · 0e6bb7bf
Unverified Commit 0e6bb7bf authored Nov 26, 2025 by nv-oviya Committed by GitHub Nov 26, 2025
4 changed files
--- a/tests/fault_tolerance/hardware/fault-injection-service/deploy/api-service.yaml
+++ b/tests/fault_tolerance/hardware/fault-injection-service/deploy/api-service.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+---
+# Service account that the fault-injection-api Deployment in this file runs
+# as (referenced there via serviceAccountName).
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: fault-injection-api
+  namespace: fault-injection-system
+
+---
+# Cluster-wide permissions for the fault-injection API.
+# NOTE(review): the README ConfigMap in chaos-mesh-gpu.yaml also lists
+# TimeChaos, but "timechaos" is not granted here - confirm whether the API
+# needs it.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: fault-injection-api
+rules:
+# Core API group: read access plus patch.
+- apiGroups:
+  - ""
+  resources:
+  - nodes
+  - pods
+  - services
+  verbs:
+  - get
+  - list
+  - watch
+  - patch
+# Workload controllers: read-only.
+- apiGroups:
+  - apps
+  resources:
+  - deployments
+  - daemonsets
+  - statefulsets
+  verbs:
+  - get
+  - list
+  - watch
+# NetworkPolicies: read, create and delete (no watch, no update).
+- apiGroups:
+  - networking.k8s.io
+  resources:
+  - networkpolicies
+  verbs:
+  - get
+  - list
+  - create
+  - delete
+# Chaos Mesh experiment resources: read, watch, create and delete.
+- apiGroups:
+  - chaos-mesh.org
+  resources:
+  - networkchaos
+  - podchaos
+  - stresschaos
+  - iochaos
+  verbs:
+  - get
+  - list
+  - create
+  - delete
+  - watch
+
+---
+# Grants the fault-injection-api ClusterRole to the fault-injection-api
+# service account in the fault-injection-system namespace.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: fault-injection-api
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: fault-injection-api
+subjects:
+- kind: ServiceAccount
+  name: fault-injection-api
+  namespace: fault-injection-system
+
+---
+# Single-replica Deployment of the fault-injection API container, which
+# exposes HTTP on port 8080 and runs as the fault-injection-api service
+# account.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: fault-injection-api
+  namespace: fault-injection-system
+  labels:
+    app: fault-injection-api
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: fault-injection-api
+  template:
+    metadata:
+      labels:
+        app: fault-injection-api
+    spec:
+      serviceAccountName: fault-injection-api
+      # Use host network to communicate with hostNetwork agents.
+      # ClusterFirstWithHostNet keeps cluster DNS resolution available while
+      # the pod shares the node's network namespace.
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet
+      # Tolerate GPU node taints
+      tolerations:
+      - key: nvidia.com/gpu
+        operator: Exists
+        effect: NoSchedule
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              # Require GPU nodes (A100 pools)
+              - key: nvidia.com/gpu.present
+                operator: In
+                values:
+                - "true"
+              # Also require this instance type. Both expressions sit in the
+              # same term under requiredDuringScheduling..., so they are ANDed
+              # and this is a hard requirement, not a preference.
+              - key: node.kubernetes.io/instance-type
+                operator: In
+                values:
+                - Standard_ND96amsr_A100_v4
+      containers:
+      - name: api
+        # Replace with your Azure Container Registry (ACR)
+        image: dynamoci.azurecr.io/fault-injection-api:latest
+        imagePullPolicy: Always
+        ports:
+        - name: http
+          containerPort: 8080
+          protocol: TCP
+        env:
+        - name: PYTHONUNBUFFERED
+          value: "1"
+        # Liveness and readiness both poll GET /health on port 8080; only the
+        # initial delay and polling period differ.
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: 8080
+          initialDelaySeconds: 10
+          periodSeconds: 30
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: 8080
+          initialDelaySeconds: 5
+          periodSeconds: 10
+        resources:
+          requests:
+            memory: "256Mi"
+            cpu: "100m"
+          limits:
+            memory: "1Gi"
+            cpu: "500m"
+
+---
+# ClusterIP Service in front of the fault-injection API pods (TCP 8080).
+apiVersion: v1
+kind: Service
+metadata:
+  name: fault-injection-api
+  namespace: fault-injection-system
+  labels:
+    app: fault-injection-api
+spec:
+  type: ClusterIP
+  # Route to pods labelled app=fault-injection-api.
+  selector:
+    app: fault-injection-api
+  ports:
+  - name: http
+    protocol: TCP
+    port: 8080
+    targetPort: 8080
+
--- a/tests/fault_tolerance/hardware/fault-injection-service/deploy/chaos-mesh-gpu.yaml
+++ b/tests/fault_tolerance/hardware/fault-injection-service/deploy/chaos-mesh-gpu.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# ChaosMesh Setup for GPU Fault Injection
+# Install ChaosMesh if not already present
+---
+# Namespace targeted by the Helm install command documented below
+# (helm install ... -n chaos-mesh).
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: chaos-mesh
+---
+# Placeholder document (comments only): ChaosMesh itself is installed with Helm, not from this manifest.
+#
+# Installation commands:
+#
+# helm repo add chaos-mesh https://charts.chaos-mesh.org
+# helm install chaos-mesh chaos-mesh/chaos-mesh -n chaos-mesh \
+#   --set chaosDaemon.runtime=containerd \
+#   --set chaosDaemon.socketPath=/run/containerd/containerd.sock \
+#   --set dashboard.create=true \
+#   --set dashboard.securityMode=false
+#
+# Verify installation:
+#   kubectl get pods -n chaos-mesh
+#
+# Access dashboard:
+#   kubectl port-forward -n chaos-mesh svc/chaos-dashboard 2333:2333
+#   open http://localhost:2333
+---
+# Documentation-only ConfigMap: its single key, README.md, holds a Markdown
+# overview of ChaosMesh experiment types and two kubectl usage examples.
+# Everything under "README.md: |" is literal string data, including the
+# lines that start with '#'.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: chaos-mesh-gpu-experiments
+  namespace: fault-injection-system
+data:
+  README.md: |
+    # ChaosMesh GPU Fault Injection
+
+    ChaosMesh provides the following chaos types for GPU fault injection:
+
+    ## PodChaos
+    - **pod-kill**: Kill GPU pods (simulates XID 79)
+    - **container-kill**: Kill GPU containers
+    - **pod-failure**: Make GPU pods unavailable
+
+    ## StressChaos
+    - **memory-stress**: Stress GPU node memory (simulates XID 48, 94, 95)
+    - **cpu-stress**: Stress GPU node CPU (can trigger thermal issues)
+
+    ## IOChaos
+    - **fault**: Inject I/O errors on GPU devices
+    - **latency**: Add I/O latency
+
+    ## TimeChaos
+    - **time-offset**: Offset system time (can trigger XID 119, 120 timeouts)
+
+    ## NetworkChaos (for multi-GPU scenarios)
+    - **partition**: Isolate GPU nodes
+    - **loss**: Packet loss between GPU nodes (NVLink errors)
+    - **delay**: Network delay
+
+    ## Usage Examples
+
+    See gpu_chaos_mesh.py for programmatic injection via API.
+
+    Or use kubectl:
+
+
+    ```bash
+    # Kill GPU pod
+    kubectl apply -f - <<EOF
+    apiVersion: chaos-mesh.org/v1alpha1
+    kind: PodChaos
+    metadata:
+      name: gpu-pod-kill
+      namespace: dynamo-oviya
+    spec:
+      action: pod-kill
+      mode: one
+      selector:
+        namespaces: ["dynamo-oviya"]
+        labelSelectors:
+          app: vllm-worker
+        nodeSelectors:
+          kubernetes.io/hostname: <gpu-node-name>
+      duration: 60s
+    EOF
+    ```
+
+    ```bash
+    # Memory stress on GPU node
+    kubectl apply -f - <<EOF
+    apiVersion: chaos-mesh.org/v1alpha1
+    kind: StressChaos
+    metadata:
+      name: gpu-memory-stress
+      namespace: dynamo-oviya
+    spec:
+      mode: one
+      selector:
+        namespaces: ["dynamo-oviya"]
+        nodeSelectors:
+          kubernetes.io/hostname: <gpu-node-name>
+      stressors:
+        memory:
+          workers: 4
+          size: 8GB
+      duration: 60s
+    EOF
+    ```
+
--- a/tests/fault_tolerance/hardware/fault-injection-service/deploy/gpu-fault-injector-kernel.yaml
+++ b/tests/fault_tolerance/hardware/fault-injection-service/deploy/gpu-fault-injector-kernel.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# GPU Fault Injector with Kernel-Level Access (Privileged DaemonSet)
+# Similar to Strobelight's privileged configuration for kernel-level operations
+---
+# Service account that the gpu-fault-injector-kernel DaemonSet in this file
+# runs as (referenced there via serviceAccountName).
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: gpu-fault-injector-kernel
+  namespace: fault-injection-system
+---
+# Cluster-wide permissions for the kernel-level injector agents.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: gpu-fault-injector-kernel
+rules:
+# Read-only view of nodes and pods.
+- apiGroups:
+  - ""
+  resources:
+  - nodes
+  - pods
+  verbs:
+  - get
+  - list
+  - watch
+# Permission to open exec sessions into pods (cluster-wide, since this is a
+# ClusterRole bound through a ClusterRoleBinding).
+- apiGroups:
+  - ""
+  resources:
+  - pods/exec
+  verbs:
+  - create
+---
+# Grants the gpu-fault-injector-kernel ClusterRole to the service account of
+# the same name in the fault-injection-system namespace.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: gpu-fault-injector-kernel
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: gpu-fault-injector-kernel
+subjects:
+- kind: ServiceAccount
+  name: gpu-fault-injector-kernel
+  namespace: fault-injection-system
+---
+# Privileged per-node agent. Scheduled onto every node labelled
+# nvidia.com/gpu.present=true; shares the host PID, network and IPC
+# namespaces and serves HTTP on port 8083.
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: gpu-fault-injector-kernel
+  namespace: fault-injection-system
+  labels:
+    app: gpu-fault-injector-kernel
+    component: fault-injection
+spec:
+  selector:
+    matchLabels:
+      app: gpu-fault-injector-kernel
+  template:
+    metadata:
+      labels:
+        app: gpu-fault-injector-kernel
+        component: fault-injection
+    spec:
+      serviceAccountName: gpu-fault-injector-kernel
+      # Share the host's process, network and IPC namespaces.
+      hostPID: true
+      hostNetwork: true
+      hostIPC: true
+
+      # Node selector: Only run on GPU nodes
+      nodeSelector:
+        nvidia.com/gpu.present: "true"
+
+      # Tolerate the nvidia.com/gpu NoSchedule taint so the agent can land
+      # on tainted GPU nodes.
+      tolerations:
+      - effect: NoSchedule
+        key: nvidia.com/gpu
+        operator: Exists
+
+      containers:
+      - name: gpu-fault-injector
+        # UPDATE: Replace with your Azure Container Registry (ACR)
+        # Example: yourregistry.azurecr.io/gpu-fault-injector:latest
+        image: dynamoci.azurecr.io/gpu-fault-injector:latest
+        imagePullPolicy: Always
+
+        # PRIVILEGED MODE - Required for kernel-level operations
+        securityContext:
+          privileged: true
+          runAsUser: 0  # Must run as root
+          # Capability names are listed without the CAP_ prefix, as
+          # Kubernetes requires (CAP_PERFMON is written PERFMON).
+          capabilities:
+            add:
+            - SYS_ADMIN      # System administration (mount, sysctl, etc)
+            - SYS_PTRACE     # Process tracing
+            - SYS_RESOURCE   # Resource limits
+            - NET_ADMIN      # Network administration
+            - BPF            # eBPF program loading
+            - SYS_BOOT       # Reboot/poweroff (for severe faults)
+            - SYS_MODULE     # Kernel module loading
+            - DAC_OVERRIDE   # Bypass file permissions
+            - PERFMON        # Performance monitoring
+
+        env:
+        # Pod identity, injected through the downward API.
+        - name: NODE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: spec.nodeName
+        - name: POD_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.name
+        - name: POD_NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        # Feature switches passed to the agent as the string "true".
+        - name: PRIVILEGED_MODE
+          value: "true"
+        - name: ENABLE_KERNEL_INJECTION
+          value: "true"
+        - name: ENABLE_EBPF
+          value: "true"
+
+        resources:
+          requests:
+            memory: "256Mi"
+            cpu: "100m"
+          limits:
+            memory: "1Gi"
+            cpu: "1000m"
+
+        volumeMounts:
+        # Kernel access
+        - name: sys-kernel-debug
+          mountPath: /sys/kernel/debug
+        - name: sys-kernel-tracing
+          mountPath: /sys/kernel/tracing
+        - name: sys-kernel-btf
+          mountPath: /sys/kernel/btf
+        - name: lib-modules
+          mountPath: /lib/modules
+          readOnly: true
+
+        # System access (host trees exposed under /host/...)
+        - name: sys
+          mountPath: /host/sys
+        - name: proc
+          mountPath: /host/proc
+        - name: dev
+          mountPath: /host/dev
+
+        # PCI bus access for XID 79 injection (host /sys/bus/pci mounted at
+        # its native path, in addition to being visible under /host/sys)
+        - name: sys-bus-pci
+          mountPath: /sys/bus/pci
+
+        # CUDA/NVML libraries
+        - name: nvidia-driver
+          mountPath: /usr/local/nvidia
+          readOnly: true
+
+        # Configuration (from the gpu-fault-injector-config ConfigMap)
+        - name: config
+          mountPath: /etc/gpu-fault-injector
+          readOnly: true
+
+        # Logs (backed by a host directory, see the logs volume)
+        - name: logs
+          mountPath: /var/log/gpu-fault-injector
+
+        ports:
+        - containerPort: 8083
+          name: http
+          protocol: TCP
+
+        # Liveness and readiness both poll GET /health on port 8083.
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: 8083
+          initialDelaySeconds: 30
+          periodSeconds: 30
+          timeoutSeconds: 5
+
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: 8083
+          initialDelaySeconds: 10
+          periodSeconds: 10
+          timeoutSeconds: 5
+
+      volumes:
+      # Kernel volumes
+      - name: sys-kernel-debug
+        hostPath:
+          path: /sys/kernel/debug
+          type: DirectoryOrCreate
+      - name: sys-kernel-tracing
+        hostPath:
+          path: /sys/kernel/tracing
+          type: DirectoryOrCreate
+      - name: sys-kernel-btf
+        hostPath:
+          path: /sys/kernel/btf
+          type: DirectoryOrCreate
+      - name: lib-modules
+        hostPath:
+          path: /lib/modules
+          type: Directory
+
+      # System volumes
+      - name: sys
+        hostPath:
+          path: /sys
+          type: Directory
+      - name: proc
+        hostPath:
+          path: /proc
+          type: Directory
+      - name: dev
+        hostPath:
+          path: /dev
+          type: Directory
+
+      # PCI bus
+      - name: sys-bus-pci
+        hostPath:
+          path: /sys/bus/pci
+          type: Directory
+
+      # NVIDIA driver (type Directory: the path must already exist on the
+      # node)
+      - name: nvidia-driver
+        hostPath:
+          path: /usr/local/nvidia
+          type: Directory
+
+      # Configuration
+      - name: config
+        configMap:
+          name: gpu-fault-injector-config
+
+      # Logs (host directory is created if missing)
+      - name: logs
+        hostPath:
+          path: /var/log/gpu-fault-injector
+          type: DirectoryOrCreate
+---
+# Agent configuration. The gpu-fault-injector-kernel DaemonSet mounts this
+# ConfigMap read-only at /etc/gpu-fault-injector (its "config" volume).
+# Everything under "config.yaml: |" is literal file content, including the
+# lines that start with '#'. The logging.file path lies under
+# /var/log/gpu-fault-injector, which the DaemonSet mounts from the host.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: gpu-fault-injector-config
+  namespace: fault-injection-system
+data:
+  config.yaml: |
+    # GPU Fault Injector Configuration
+
+    # Injection methods (in order of preference)
+    methods:
+      - kernel_level
+      - chaos_mesh
+      - nvidia_smi
+
+    # Safety settings
+    safety:
+      require_confirmation: false  # Set to true in production
+      max_concurrent_faults: 3
+      cooldown_seconds: 30
+
+    # Kernel-level injection settings
+    kernel:
+      enable_ebpf: true
+      enable_pci_manipulation: true
+      enable_cuda_interception: true
+      nvcc_path: /usr/local/cuda/bin/nvcc
+
+    # Logging
+    logging:
+      level: info
+      file: /var/log/gpu-fault-injector/agent.log
+---
+# Headless Service over the injector DaemonSet pods (TCP 8083).
+apiVersion: v1
+kind: Service
+metadata:
+  name: gpu-fault-injector-kernel
+  namespace: fault-injection-system
+  labels:
+    app: gpu-fault-injector-kernel
+spec:
+  type: ClusterIP
+  # Headless service for DaemonSet
+  clusterIP: None
+  ports:
+  - name: http
+    protocol: TCP
+    port: 8083
+    targetPort: 8083
+  selector:
+    app: gpu-fault-injector-kernel
+
--- a/tests/fault_tolerance/hardware/fault-injection-service/deploy/namespace.yaml
+++ b/tests/fault_tolerance/hardware/fault-injection-service/deploy/namespace.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+---
+# Namespace that holds the namespaced fault-injection resources defined in
+# the other manifests of this directory.
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: fault-injection-system
+  labels:
+    name: fault-injection-system
+