Unverified Commit 0e6bb7bf authored by nv-oviya's avatar nv-oviya Committed by GitHub
Browse files

feat(fault-injection): Add Kubernetes deployment manifests (#4044)

parent e10319f3
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
---
# Service account used by the fault-injection API pods
# (referenced by the Deployment's serviceAccountName and by the ClusterRoleBinding).
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: fault-injection-system
  name: fault-injection-api
---
# Cluster-wide permissions for the fault-injection API.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: fault-injection-api
rules:
  # Core API group: read nodes/pods/services and patch them.
  - apiGroups:
      - ""
    resources:
      - nodes
      - pods
      - services
    verbs:
      - get
      - list
      - watch
      - patch
  # Workload controllers: read-only.
  - apiGroups:
      - apps
    resources:
      - deployments
      - daemonsets
      - statefulsets
    verbs:
      - get
      - list
      - watch
  # NetworkPolicies: may be created and deleted, not updated or watched.
  - apiGroups:
      - networking.k8s.io
    resources:
      - networkpolicies
    verbs:
      - get
      - list
      - create
      - delete
  # Chaos Mesh experiment resources.
  - apiGroups:
      - chaos-mesh.org
    resources:
      - networkchaos
      - podchaos
      - stresschaos
      - iochaos
    verbs:
      - get
      - list
      - create
      - delete
      - watch
---
# Grants the fault-injection-api ClusterRole to the API's service account.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: fault-injection-api
subjects:
  - kind: ServiceAccount
    namespace: fault-injection-system
    name: fault-injection-api
roleRef:
  kind: ClusterRole
  name: fault-injection-api
  apiGroup: rbac.authorization.k8s.io
---
# Single-replica Deployment of the fault-injection API, pinned to GPU nodes
# and sharing the node's network namespace.
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: fault-injection-system
  name: fault-injection-api
  labels:
    app: fault-injection-api
spec:
  replicas: 1
  selector:
    matchLabels:
      app: fault-injection-api
  template:
    metadata:
      labels:
        app: fault-injection-api
    spec:
      serviceAccountName: fault-injection-api
      # Host networking is used so the API can reach the agents, which also
      # run with hostNetwork.
      dnsPolicy: ClusterFirstWithHostNet
      hostNetwork: true
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              # Both expressions live in one required term, so a node must
              # satisfy BOTH to be eligible.
              - matchExpressions:
                  # GPU nodes only (A100 pools)
                  - key: nvidia.com/gpu.present
                    operator: In
                    values:
                      - 'true'
                  # Instance type is a hard requirement here (not a soft
                  # preference); change the value for other node pools.
                  - key: node.kubernetes.io/instance-type
                    operator: In
                    values:
                      - Standard_ND96amsr_A100_v4
      # Allow scheduling onto nodes tainted for GPU workloads
      tolerations:
        - key: nvidia.com/gpu
          effect: NoSchedule
          operator: Exists
      containers:
        - name: api
          # Replace with your Azure Container Registry (ACR)
          image: dynamoci.azurecr.io/fault-injection-api:latest
          imagePullPolicy: Always
          env:
            - name: PYTHONUNBUFFERED
              value: '1'
          ports:
            - name: http
              protocol: TCP
              containerPort: 8080
          resources:
            limits:
              cpu: 500m
              memory: 1Gi
            requests:
              cpu: 100m
              memory: 256Mi
          # Readiness and liveness both poll the same /health endpoint.
          readinessProbe:
            initialDelaySeconds: 5
            periodSeconds: 10
            httpGet:
              port: 8080
              path: /health
          livenessProbe:
            initialDelaySeconds: 10
            periodSeconds: 30
            httpGet:
              port: 8080
              path: /health
---
# In-cluster endpoint for the fault-injection API (port 8080 -> pod port 8080).
apiVersion: v1
kind: Service
metadata:
  namespace: fault-injection-system
  name: fault-injection-api
  labels:
    app: fault-injection-api
spec:
  # Routes to pods labelled by the fault-injection-api Deployment template.
  selector:
    app: fault-injection-api
  type: ClusterIP
  ports:
    - name: http
      protocol: TCP
      targetPort: 8080
      port: 8080
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# ChaosMesh Setup for GPU Fault Injection
# Install ChaosMesh if not already present
---
# Namespace that the Helm commands below install ChaosMesh into (-n chaos-mesh).
apiVersion: v1
kind: Namespace
metadata:
  name: chaos-mesh
---
# Placeholder only: ChaosMesh itself is installed with Helm, not from this file.
#
# Installation commands:
#
#   helm repo add chaos-mesh https://charts.chaos-mesh.org
#   helm install chaos-mesh chaos-mesh/chaos-mesh -n chaos-mesh \
#     --set chaosDaemon.runtime=containerd \
#     --set chaosDaemon.socketPath=/run/containerd/containerd.sock \
#     --set dashboard.create=true \
#     --set dashboard.securityMode=false
#
# Note: dashboard.securityMode=false turns off the dashboard's token
# authentication; use it on disposable test clusters only.
#
# Verify installation:
#   kubectl get pods -n chaos-mesh
#
# Access dashboard:
#   kubectl port-forward -n chaos-mesh svc/chaos-dashboard 2333:2333
#   open http://localhost:2333
---
# Documentation-only ConfigMap: carries a Markdown README describing the
# ChaosMesh chaos types and two example kubectl invocations.
apiVersion: v1
kind: ConfigMap
metadata:
  name: chaos-mesh-gpu-experiments
  namespace: fault-injection-system
data:
  # Literal block scalar: everything below is stored verbatim as the value of
  # README.md. The namespace "dynamo-oviya" in the examples is an
  # environment-specific value; substitute your own when copying them.
  README.md: |
    # ChaosMesh GPU Fault Injection
    ChaosMesh provides the following chaos types for GPU fault injection:
    ## PodChaos
    - **pod-kill**: Kill GPU pods (simulates XID 79)
    - **container-kill**: Kill GPU containers
    - **pod-failure**: Make GPU pods unavailable
    ## StressChaos
    - **memory-stress**: Stress GPU node memory (simulates XID 48, 94, 95)
    - **cpu-stress**: Stress GPU node CPU (can trigger thermal issues)
    ## IOChaos
    - **fault**: Inject I/O errors on GPU devices
    - **latency**: Add I/O latency
    ## TimeChaos
    - **time-offset**: Offset system time (can trigger XID 119, 120 timeouts)
    ## NetworkChaos (for multi-GPU scenarios)
    - **partition**: Isolate GPU nodes
    - **loss**: Packet loss between GPU nodes (NVLink errors)
    - **delay**: Network delay
    ## Usage Examples
    See gpu_chaos_mesh.py for programmatic injection via API.
    Or use kubectl:
    ```bash
    # Kill GPU pod
    kubectl apply -f - <<EOF
    apiVersion: chaos-mesh.org/v1alpha1
    kind: PodChaos
    metadata:
      name: gpu-pod-kill
      namespace: dynamo-oviya
    spec:
      action: pod-kill
      mode: one
      selector:
        namespaces: ["dynamo-oviya"]
        labelSelectors:
          app: vllm-worker
        nodeSelectors:
          kubernetes.io/hostname: <gpu-node-name>
      duration: 60s
    EOF
    ```
    ```bash
    # Memory stress on GPU node
    kubectl apply -f - <<EOF
    apiVersion: chaos-mesh.org/v1alpha1
    kind: StressChaos
    metadata:
      name: gpu-memory-stress
      namespace: dynamo-oviya
    spec:
      mode: one
      selector:
        namespaces: ["dynamo-oviya"]
        nodeSelectors:
          kubernetes.io/hostname: <gpu-node-name>
      stressors:
        memory:
          workers: 4
          size: 8GB
      duration: 60s
    EOF
    ```
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# GPU Fault Injector with Kernel-Level Access (Privileged DaemonSet)
# Similar to Strobelight's privileged configuration for kernel-level operations
---
# Service account the privileged GPU fault-injector DaemonSet pods run as.
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: fault-injection-system
  name: gpu-fault-injector-kernel
---
# Cluster-wide permissions for the GPU fault-injector agent.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: gpu-fault-injector-kernel
rules:
  # Read-only view of nodes and pods.
  - apiGroups:
      - ""
    resources:
      - nodes
      - pods
    verbs:
      - get
      - list
      - watch
  # Permission to open exec sessions into pods.
  - apiGroups:
      - ""
    resources:
      - pods/exec
    verbs:
      - create
---
# Grants the gpu-fault-injector-kernel ClusterRole to the agent's service account.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: gpu-fault-injector-kernel
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: gpu-fault-injector-kernel
subjects:
  - kind: ServiceAccount
    namespace: fault-injection-system
    name: gpu-fault-injector-kernel
---
# Privileged per-node agent for kernel-level GPU fault injection.
# One pod is scheduled on every node labelled nvidia.com/gpu.present=true.
# The pod shares the host's PID, network and IPC namespaces and mounts large
# parts of the host filesystem (/sys, /proc, /dev); deploy to test clusters only.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: gpu-fault-injector-kernel
  namespace: fault-injection-system
  labels:
    app: gpu-fault-injector-kernel
    component: fault-injection
spec:
  selector:
    matchLabels:
      app: gpu-fault-injector-kernel
  template:
    metadata:
      labels:
        app: gpu-fault-injector-kernel
        component: fault-injection
    spec:
      serviceAccountName: gpu-fault-injector-kernel
      # Share the host's PID, network and IPC namespaces
      hostPID: true
      hostNetwork: true
      hostIPC: true
      # Node selector: Only run on GPU nodes
      nodeSelector:
        nvidia.com/gpu.present: "true"
      # Tolerate the GPU node taint so the agent can land on GPU nodes
      tolerations:
        - effect: NoSchedule
          key: nvidia.com/gpu
          operator: Exists
      containers:
        - name: gpu-fault-injector
          # UPDATE: Replace with your Azure Container Registry (ACR)
          # Example: yourregistry.azurecr.io/gpu-fault-injector:latest
          image: dynamoci.azurecr.io/gpu-fault-injector:latest
          imagePullPolicy: Always
          # PRIVILEGED MODE - Required for kernel-level operations
          securityContext:
            privileged: true
            runAsUser: 0  # Must run as root
            capabilities:
              # Kubernetes expects capability names WITHOUT the "CAP_" prefix
              # (e.g. PERFMON, not CAP_PERFMON).
              add:
                - SYS_ADMIN  # System administration (mount, sysctl, etc)
                - SYS_PTRACE  # Process tracing
                - SYS_RESOURCE  # Resource limits
                - NET_ADMIN  # Network administration
                - BPF  # eBPF program loading
                - SYS_BOOT  # Reboot/poweroff (for severe faults)
                - SYS_MODULE  # Kernel module loading
                - DAC_OVERRIDE  # Bypass file permissions
                - PERFMON  # Performance monitoring
          env:
            # Downward API: expose the pod's own node, name and namespace
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: POD_NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
            # Feature switches read by the agent (string values)
            - name: PRIVILEGED_MODE
              value: "true"
            - name: ENABLE_KERNEL_INJECTION
              value: "true"
            - name: ENABLE_EBPF
              value: "true"
          resources:
            requests:
              memory: "256Mi"
              cpu: "100m"
            limits:
              memory: "1Gi"
              cpu: "1000m"
          volumeMounts:
            # Kernel access
            - name: sys-kernel-debug
              mountPath: /sys/kernel/debug
            - name: sys-kernel-tracing
              mountPath: /sys/kernel/tracing
            - name: sys-kernel-btf
              mountPath: /sys/kernel/btf
            - name: lib-modules
              mountPath: /lib/modules
              readOnly: true
            # System access (host trees are exposed under /host/...)
            - name: sys
              mountPath: /host/sys
            - name: proc
              mountPath: /host/proc
            - name: dev
              mountPath: /host/dev
            # PCI bus access for XID 79 injection
            - name: sys-bus-pci
              mountPath: /sys/bus/pci
            # CUDA/NVML libraries
            - name: nvidia-driver
              mountPath: /usr/local/nvidia
              readOnly: true
            # Configuration (from the gpu-fault-injector-config ConfigMap)
            - name: config
              mountPath: /etc/gpu-fault-injector
              readOnly: true
            # Logs
            - name: logs
              mountPath: /var/log/gpu-fault-injector
          ports:
            - containerPort: 8083
              name: http
              protocol: TCP
          # Liveness and readiness both poll /health on the agent port
          livenessProbe:
            httpGet:
              path: /health
              port: 8083
            initialDelaySeconds: 30
            periodSeconds: 30
            timeoutSeconds: 5
          readinessProbe:
            httpGet:
              path: /health
              port: 8083
            initialDelaySeconds: 10
            periodSeconds: 10
            timeoutSeconds: 5
      volumes:
        # Kernel volumes
        - name: sys-kernel-debug
          hostPath:
            path: /sys/kernel/debug
            type: DirectoryOrCreate
        - name: sys-kernel-tracing
          hostPath:
            path: /sys/kernel/tracing
            type: DirectoryOrCreate
        - name: sys-kernel-btf
          hostPath:
            path: /sys/kernel/btf
            type: DirectoryOrCreate
        - name: lib-modules
          hostPath:
            path: /lib/modules
            type: Directory
        # System volumes
        - name: sys
          hostPath:
            path: /sys
            type: Directory
        - name: proc
          hostPath:
            path: /proc
            type: Directory
        - name: dev
          hostPath:
            path: /dev
            type: Directory
        # PCI bus
        - name: sys-bus-pci
          hostPath:
            path: /sys/bus/pci
            type: Directory
        # NVIDIA driver (type Directory: the path must already exist on the node)
        - name: nvidia-driver
          hostPath:
            path: /usr/local/nvidia
            type: Directory
        # Configuration
        - name: config
          configMap:
            name: gpu-fault-injector-config
        # Logs (written to the host; directory is created if missing)
        - name: logs
          hostPath:
            path: /var/log/gpu-fault-injector
            type: DirectoryOrCreate
---
# Agent configuration; the DaemonSet mounts this ConfigMap read-only at
# /etc/gpu-fault-injector.
apiVersion: v1
kind: ConfigMap
metadata:
  name: gpu-fault-injector-config
  namespace: fault-injection-system
data:
  # Literal block scalar: the text below is stored verbatim as config.yaml
  # (the "#" lines inside it are part of the file content, not of this manifest).
  config.yaml: |
    # GPU Fault Injector Configuration
    # Injection methods (in order of preference)
    methods:
      - kernel_level
      - chaos_mesh
      - nvidia_smi
    # Safety settings
    safety:
      require_confirmation: false # Set to true in production
      max_concurrent_faults: 3
      cooldown_seconds: 30
    # Kernel-level injection settings
    kernel:
      enable_ebpf: true
      enable_pci_manipulation: true
      enable_cuda_interception: true
      nvcc_path: /usr/local/cuda/bin/nvcc
    # Logging
    logging:
      level: info
      file: /var/log/gpu-fault-injector/agent.log
---
# Headless Service in front of the gpu-fault-injector-kernel DaemonSet pods.
apiVersion: v1
kind: Service
metadata:
  namespace: fault-injection-system
  name: gpu-fault-injector-kernel
  labels:
    app: gpu-fault-injector-kernel
spec:
  selector:
    app: gpu-fault-injector-kernel
  type: ClusterIP
  # Headless service for DaemonSet
  clusterIP: None
  ports:
    - name: http
      protocol: TCP
      port: 8083
      targetPort: 8083
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
---
# Namespace that holds the fault-injection API, the GPU fault-injector agent
# and their ConfigMaps. The explicit document start keeps this manifest a
# separate YAML document when manifests are concatenated.
apiVersion: v1
kind: Namespace
metadata:
  name: fault-injection-system
  labels:
    name: fault-injection-system
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment