Unverified Commit e10319f3 authored by nv-oviya's avatar nv-oviya Committed by GitHub
Browse files

feat(fault-injection): Add GPU fault injector agent (#4043)


Signed-off-by: default avatarOviya Seeniraj <oseeniraj@nvidia.com>
Signed-off-by: default avatarHarrison Saturley-Hall <harrison.saturley.hall@gmail.com>
Signed-off-by: default avatarHarrison King Saturley-Hall <hsaturleyhal@nvidia.com>
Co-authored-by: default avatarHarrison Saturley-Hall <hsaturleyhal@nvidia.com>
parent 39a9d0b2
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# GPU Fault Injector Agent - XID 79 Injection via nsenter+kmsg
# Runs as privileged DaemonSet on GPU nodes to inject XID errors
#
# NOTE: GPU nodes are AMD64/x86_64 architecture
# Build with: docker buildx build --platform linux/amd64 --load -t <image> .
FROM nvcr.io/nvidia/cuda:12.3.0-devel-ubuntu22.04
# Install system dependencies:
#   - python3 / python3-pip: runtime and package installer for the agent
#   - curl: used by the HEALTHCHECK below
#   - util-linux: provides nsenter; systemd: provides journalctl
#   - kmod, pciutils: kernel-module and PCI inspection tools
# NOTE(review): nvidia-smi is not installed by this step; presumably it is
# supplied by the base image or the NVIDIA container runtime - confirm.
# The apt package lists are removed in the same layer to keep the image small.
RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get install -y --no-install-recommends \
python3 \
python3-pip \
curl \
util-linux \
systemd \
kmod \
pciutils \
&& rm -rf /var/lib/apt/lists/*
# Install Python packages (requirements are copied before the agent code so
# this layer is only rebuilt when requirements.txt changes)
COPY requirements.txt /tmp/
RUN pip3 install --no-cache-dir -r /tmp/requirements.txt
# Create working directory
WORKDIR /app
# Copy agent code (API server and the XID injector module it imports)
COPY agent.py /app/
COPY gpu_xid_injector.py /app/
# Create log directory
RUN mkdir -p /var/log/gpu-fault-injector
# Disable Python output buffering so log lines reach the container log promptly
ENV PYTHONUNBUFFERED=1
# Port the agent's HTTP API listens on
EXPOSE 8083
# Health check: poll the agent's /health endpoint; curl -f fails on HTTP errors
HEALTHCHECK --interval=30s --timeout=5s --start-period=30s --retries=3 \
CMD curl -f http://localhost:8083/health || exit 1
# Run agent (resolved relative to WORKDIR /app)
CMD ["python3", "agent.py"]
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
"""
GPU Fault Injector Agent - Runs as DaemonSet on GPU nodes.
This agent provides privileged access for XID error injection:
- XID injection via nsenter+kmsg (writes to host's /dev/kmsg)
- Triggers NVSentinel syslog-health-monitor detection
- Initiates complete fault tolerance workflow
Accepts ANY XID error code for testing flexibility.
Pre-defined messages for all DCGM/NVSentinel monitored XIDs:
- Devastating: 79, 74, 48, 94, 95, 119, 120, 140
- Memory: 31, 32, 43, 63, 64
- PCIe: 38, 39, 42
- Thermal: 60, 61, 62
- Power: 54, 56, 57
- Graphics: 13, 45, 69
Unknown XIDs use generic error message format.
NVSentinel detects XIDs and handles actions based on its own rules.
See gpu_xid_injector.py for complete XID descriptions.
"""
import logging
import os
import subprocess
from datetime import datetime, timezone
from typing import Any, Optional, Type
import uvicorn
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
# Configure logging once at import time; the module logger below inherits it
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)
# Import the kernel-level XID injector (nsenter+kmsg). The import is optional:
# when gpu_xid_injector cannot be imported, the name stays None and
# KERNEL_XID_AVAILABLE records that injection is unavailable.
GPUXIDInjectorKernel: Optional[Type[Any]] = None
try:
    from gpu_xid_injector import GPUXIDInjectorKernel  # type: ignore[assignment]

    KERNEL_XID_AVAILABLE = True
except ImportError:
    logger.warning("Kernel-level XID injector not available")
    KERNEL_XID_AVAILABLE = False
# ============================================================================
# Models and Enums
# ============================================================================
class XIDInjectRequest(BaseModel):
    """Request model for XID error injection via nsenter+kmsg.

    Used as the request body of the POST /inject-xid endpoint.
    """

    # Caller-chosen identifier; used as the key under which the fault is recorded
    fault_id: str
    # XID error code to inject; the endpoint rejects values outside 1-1000
    xid_type: int
    # GPU device ID forwarded to the injector (defaults to GPU 0)
    gpu_id: int = 0
    # NOTE(review): not read by the /inject-xid handler in this file
    duration: Optional[int] = None
# ============================================================================
# GPU Fault Injector
# ============================================================================
class GPUFaultInjector:
    """
    Node-local state and helpers behind the fault-injection API.

    On construction it probes for DCGM (``dcgmi``), asks ``nvidia-smi`` for
    the GPU count and, when the kernel-level injector module was importable,
    instantiates it for XID injection via nsenter+kmsg. Injected faults are
    recorded in ``active_faults`` keyed by fault id.
    """

    def __init__(self):
        self.active_faults: dict[str, dict[str, Any]] = {}
        self.node_name = os.getenv("NODE_NAME", "unknown")
        self.dcgm_available = self._check_dcgm()
        self.gpu_count = self._get_gpu_count()

        # Start from "no injector"; only upgraded if construction succeeds.
        self.kernel_xid_injector = None
        self.kernel_xid_available = False
        injector_importable = KERNEL_XID_AVAILABLE and GPUXIDInjectorKernel is not None
        if injector_importable:
            try:
                self.kernel_xid_injector = GPUXIDInjectorKernel()
                # Availability mirrors whether the injector has privileged access.
                self.kernel_xid_available = self.kernel_xid_injector.privileged
                logger.info(
                    f"Kernel-level XID injector initialized (privileged: {self.kernel_xid_available})"
                )
            except Exception as e:
                logger.warning(f"Kernel XID injector not available: {e}")

        logger.info(f"GPU Fault Injector initialized on node: {self.node_name}")
        logger.info(f"DCGM available: {self.dcgm_available}")
        logger.info(f"GPU count: {self.gpu_count}")
        logger.info(f"XID 79 injection (nsenter+kmsg): {self.kernel_xid_available}")

    def _check_dcgm(self) -> bool:
        """Return True when ``dcgmi discovery -l`` runs and exits with status 0."""
        probe_cmd = ["dcgmi", "discovery", "-l"]
        try:
            completed = subprocess.run(
                probe_cmd, capture_output=True, text=True, timeout=5
            )
        except Exception as e:
            logger.warning(f"DCGM not available: {e}")
            return False
        return completed.returncode == 0

    def _get_gpu_count(self) -> int:
        """Return the GPU count reported by ``nvidia-smi``, or 0 on any failure."""
        query_cmd = ["nvidia-smi", "--query-gpu=count", "--format=csv,noheader"]
        try:
            completed = subprocess.run(
                query_cmd,
                capture_output=True,
                text=True,
                timeout=5,
            )
            if completed.returncode != 0:
                return 0
            # Only the first output line is parsed.
            first_line = completed.stdout.strip().split("\n")[0]
            return int(first_line)
        except Exception as e:
            logger.error(f"Failed to get GPU count: {e}")
            return 0

    def _run_command(self, command: list[str], timeout: int = 30) -> tuple[bool, str]:
        """Run *command* (an argv list) and report the outcome.

        Returns:
            ``(True, stripped stdout)`` on exit status 0,
            ``(False, stripped stderr)`` on a non-zero exit status,
            ``(False, "Command timed out")`` when *timeout* seconds elapse, and
            ``(False, str(error))`` for any other failure to run the command.
        """
        try:
            completed = subprocess.run(
                command, capture_output=True, text=True, timeout=timeout
            )
        except subprocess.TimeoutExpired:
            return False, "Command timed out"
        except Exception as e:
            return False, str(e)

        if completed.returncode == 0:
            return True, completed.stdout.strip()
        return False, completed.stderr.strip()
# ============================================================================
# FastAPI Application
# ============================================================================
# Module-level singletons shared by all request handlers.
# NOTE: constructing GPUFaultInjector here runs its dcgmi / nvidia-smi probes
# at import time.
app = FastAPI(title="GPU Fault Injector Agent", version="1.0.0")
injector = GPUFaultInjector()
@app.get("/health")
async def health_check():
    """Report a fixed "healthy" status plus a snapshot of the injector's state."""
    snapshot = {
        "status": "healthy",
        "node": injector.node_name,
        "gpu_count": injector.gpu_count,
        "dcgm_available": injector.dcgm_available,
        "active_faults": len(injector.active_faults),
    }
    return snapshot
@app.post("/inject-xid")
async def inject_xid(request: XIDInjectRequest):
    """
    Inject an XID error via nsenter+kmsg (intended to trigger NVSentinel detection).

    Accepts any XID error code in the range 1-1000. Pre-defined messages exist
    for the DCGM/NVSentinel monitored XIDs:

    Devastating (always FAIL):
    - 79: GPU fell off bus | 74: NVLink error | 48: ECC DBE | 94/95: ECC errors
    - 119/120: GSP errors | 140: ECC unrecovered

    Subsystem (may WARN/escalate):
    - Memory: 31, 32, 43, 63, 64 (MMU, PBDMA, page retirement)
    - PCIe: 38, 39, 42 (bus, fabric, replay rate)
    - Thermal: 60, 61, 62 (temperature limits)
    - Power: 54, 56, 57 (power/clock state)
    - Graphics: 13, 45, 69 (SM exceptions)

    Unknown XIDs use a generic error message.

    Args:
        request: Fault id, XID code and target GPU id.

    Returns:
        A dict describing the injected fault (status, node, fault_id,
        xid_type, gpu_id, injector message, timestamp).

    Raises:
        HTTPException 400: XID outside 1-1000, or negative GPU id.
        HTTPException 503: the kernel-level injector is unavailable.
        HTTPException 500: the injector reported a failure.
    """
    # Local import keeps this handler self-contained; asyncio is stdlib.
    import asyncio

    logger.info(
        f"Received XID {request.xid_type} injection request for GPU {request.gpu_id}"
    )

    # Validate XID type is a reasonable integer (basic sanity check)
    xid_in_range = isinstance(request.xid_type, int) and 1 <= request.xid_type <= 1000
    if not xid_in_range:
        raise HTTPException(
            status_code=400,
            detail=(
                f"Invalid XID type: {request.xid_type}. "
                f"XID must be an integer between 1-1000. "
                f"Common XIDs: 79 (bus error), 74 (NVLink), 48/94/95 (ECC errors)."
            ),
        )

    # A negative index can never name a GPU; reject it here instead of
    # forwarding it to the injector's nvidia-smi lookup.
    if request.gpu_id < 0:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid GPU id: {request.gpu_id}. GPU id must be >= 0.",
        )

    kernel_injector = injector.kernel_xid_injector
    if not injector.kernel_xid_available or not kernel_injector:
        raise HTTPException(
            status_code=503,
            detail=f"Kernel-level XID injector not available. XID {request.xid_type} requires privileged access to syslog/kmsg.",
        )

    # The injector shells out (nvidia-smi, nsenter) with multi-second
    # timeouts. Run it in a worker thread so the event loop keeps serving
    # other requests, such as /health, while the injection is in progress.
    success, message = await asyncio.to_thread(
        kernel_injector.inject_xid,
        xid_type=request.xid_type,
        gpu_id=request.gpu_id,
    )
    if not success:
        raise HTTPException(status_code=500, detail=message)

    # One timestamp for both the stored record and the response, so the two
    # always agree.
    injected_at = datetime.now(timezone.utc).isoformat()

    # Track the fault (a repeated fault_id replaces the earlier record)
    injector.active_faults[request.fault_id] = {
        "type": f"xid_{request.xid_type}",
        "gpu_id": request.gpu_id,
        "timestamp": injected_at,
    }
    return {
        "status": "injected",
        "node": injector.node_name,
        "fault_id": request.fault_id,
        "xid_type": request.xid_type,
        "gpu_id": request.gpu_id,
        "message": message,
        "timestamp": injected_at,
    }
@app.get("/faults")
async def list_active_faults():
    """Return the ids and count of the faults recorded on this node."""
    recorded = injector.active_faults
    fault_ids = list(recorded.keys())
    return {
        "node": injector.node_name,
        "active_faults": fault_ids,
        "count": len(recorded),
    }
if __name__ == "__main__":
    # Serve the API directly when this file is executed as a script.
    # Binds to all interfaces on port 8083.
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=8083,
        log_level="info",
    )
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
"""
GPU XID Error Injector via nsenter+kmsg.
Injects fake XID messages to host's /dev/kmsg to trigger NVSentinel detection.
Uses nsenter to enter host namespaces and write kernel messages that NVSentinel
syslog-health-monitor can detect naturally.
Method: nsenter --target 1 (all namespaces) → echo to /dev/kmsg → NVSentinel detection
Supported XIDs:
===============
This injector accepts ANY integer XID error code; it performs no range check itself (callers such as the agent API may restrict the range, e.g. to 1-1000).
Pre-defined Messages for All DCGM/NVSentinel Monitored XIDs:
-------------------------------------------------------------
Based on DCGM health monitoring subsystems and NVSentinel detection rules.
DEVASTATING XIDs (DCGM_HEALTH_RESULT_FAIL - always monitored):
- 79: GPU fell off bus (most critical - node-level action)
- 74: NVLink uncorrectable error (multi-GPU communication failure)
- 48: Double-bit ECC error (severe memory error)
- 94: Contained ECC error (less severe memory error)
- 95: Uncontained error (very severe, GPU reset required)
- 119: GSP RPC Timeout (GPU Service Processor communication)
- 120: GSP Error (GPU Service Processor internal error)
- 140: ECC unrecovered error (persistent memory issue)
SUBSYSTEM XIDs (DCGM_HEALTH_RESULT_WARN - may escalate):
Memory Subsystem (DCGM_HEALTH_WATCH_MEM):
- 31: MMU Error
- 32: PBDMA Error
- 43: Reset Channel Verification Error
- 63: Pending Page Retirements
- 64: Row Remap Failure
PCIe Subsystem (DCGM_HEALTH_WATCH_PCIE):
- 38: PCIe Bus Error
- 39: PCIe Fabric Error
- 42: PCIe Replay Rate exceeded
Thermal Subsystem (DCGM_HEALTH_WATCH_THERMAL):
- 60: Clocks Event: Thermal limit exceeded
- 61: EDPP Power Brake: Thermal limit
- 62: Thermal Violations detected
Power Subsystem (DCGM_HEALTH_WATCH_POWER):
- 54: Power state change event
- 56: Clock change event
- 57: Clocks Event: Power limit exceeded
Graphics/Common XIDs:
- 13: Graphics Engine Exception
- 45: Preemptive Cleanup (due to previous errors)
- 69: Graphics Exception: Class Error
Unknown XIDs:
-------------
Any XID not in XID_MESSAGES dict will use a generic error message format.
NVSentinel will parse and handle based on its own XID database and rules.
Note: XIDs 43, 48, 74, 94, 95 are already supported via CUDA interception
(cuda_intercept.c LD_PRELOAD). kmsg injection adds complementary syslog-based
detection path for NVSentinel's syslog-health-monitor.
"""
import logging
import os
import subprocess
from typing import Dict, Tuple
logger = logging.getLogger(__name__)
# XID error code to descriptive message mapping
# Based on DCGM XID database and NVSentinel monitoring rules
# Source: DCGM/modules/health/DcgmHealthWatch.cpp BuildXidMappings()
XID_MESSAGES: Dict[int, str] = {
    # Devastating XIDs (DCGM_HEALTH_RESULT_FAIL - always monitored)
    79: "GPU has fallen off the bus",
    48: "DBE (Double Bit Error) ECC Error",
    74: "NVLink: Uncorrectable error",
    94: "Contained ECC error",
    95: "Uncontained error - GPU requires reset",
    119: "GSP RPC Timeout",
    120: "GSP Error",
    140: "ECC unrecovered error",
    # Memory Subsystem XIDs (DCGM_HEALTH_WATCH_MEM)
    31: "MMU Error",
    32: "PBDMA Error",
    43: "Reset Channel Verification Error",
    63: "Pending Page Retirements",
    64: "Row Remap Failure",
    # PCIe Subsystem XIDs (DCGM_HEALTH_WATCH_PCIE)
    38: "PCIe Bus Error",
    39: "PCIe Fabric Error",
    42: "PCIe Replay Rate exceeded",
    # 74 already defined above (can be PCIe or NVLink context)
    # Thermal Subsystem XIDs (DCGM_HEALTH_WATCH_THERMAL)
    60: "Clocks Event: Thermal limit exceeded",
    61: "EDPP Power Brake: Thermal limit",
    62: "Thermal Violations detected",
    # 63 can be thermal or memory context ("Thermal diode detects short")
    # Power Subsystem XIDs (DCGM_HEALTH_WATCH_POWER)
    54: "Power state change event",
    56: "Clock change event",
    57: "Clocks Event: Power limit exceeded",
    # Common Graphics XIDs (often seen in test environments)
    13: "Graphics Engine Exception",
    # 31 is defined once, in the Memory section above. Each key must appear
    # only once: in a dict literal a repeated key silently replaces the
    # earlier value, which previously turned XID 31 into
    # "GPU stopped responding" instead of the documented "MMU Error".
    45: "Preemptive Cleanup, due to previous errors",
    69: "Graphics Exception: Class Error",
}
class GPUXIDInjectorKernel:
    """
    XID injector via nsenter+kmsg (intended to trigger NVSentinel detection).

    Any integer XID code is accepted. Codes present in ``XID_MESSAGES`` are
    written with their specific description; every other code falls back to
    a generic message.

    Categories of pre-defined messages:
        Devastating XIDs (always trigger FAIL):
        - 79: GPU fell off bus, 74: NVLink error, 48: ECC DBE, 94/95: ECC errors
        - 119/120: GSP errors, 140: ECC unrecovered
        Subsystem XIDs (trigger WARN, may escalate):
        - Memory (31, 32, 43, 63, 64): MMU, PBDMA, page retirement errors
        - PCIe (38, 39, 42): Bus, fabric, replay rate errors
        - Thermal (60, 61, 62): Temperature limit violations
        - Power (54, 56, 57): Power/clock state changes
        - Graphics (13, 45, 69): SM exceptions, preemptive cleanup
    """

    def __init__(self):
        self.node_name = os.getenv("NODE_NAME", "unknown")
        self.privileged = self._check_privileged()
        logger.info(f"XID Injector initialized on {self.node_name}")
        logger.info(f"Privileged: {self.privileged}")
        logger.info(f"Known XIDs with specific messages: {sorted(XID_MESSAGES.keys())}")
        logger.info("Method: nsenter+kmsg → NVSentinel detection → Full FT workflow")
        logger.info("Note: Accepts ANY XID value - unknown XIDs use generic message")

    def _check_privileged(self) -> bool:
        """Return True when running with effective UID 0 (required for nsenter)."""
        return os.geteuid() == 0

    def _normalize_pci_address(self, pci_addr: str) -> str:
        """
        Shorten the PCI domain of an nvidia-smi bus id to at most 4 characters.

        nvidia-smi returns: 00000001:00:00.0 (8-digit domain)
        kernel expects:     0001:00:00.0     (4-digit domain)

        Only the last four characters of the domain are kept. Input that does
        not have at least three colon-separated parts is returned unchanged.
        """
        parts = pci_addr.split(":")
        if len(parts) < 3:
            return pci_addr
        # Keep only last 4 digits of domain
        domain = parts[0][-4:] if len(parts[0]) > 4 else parts[0]
        normalized = f"{domain}:{parts[1]}:{parts[2]}"
        logger.debug(f"Normalized PCI address: {pci_addr} -> {normalized}")
        return normalized

    def inject_xid(self, xid_type: int, gpu_id: int = 0) -> Tuple[bool, str]:
        """
        Inject an XID error code via nsenter+kmsg.

        Any integer XID is accepted; codes without an entry in
        ``XID_MESSAGES`` use a generic error message.

        Args:
            xid_type: XID error code (any integer, commonly 1-255)
            gpu_id: GPU device ID (default: 0)

        Returns:
            Tuple of (success: bool, message: str). Fails without attempting
            injection when the process is not privileged.
        """
        logger.info(f"Injecting XID {xid_type} for GPU {gpu_id}")
        if not self.privileged:
            return (
                False,
                f"XID {xid_type} injection requires privileged mode (nsenter needs root)",
            )
        success, msg = self._inject_fake_xid_to_kmsg(gpu_id, xid_type)
        if success:
            logger.info(f"XID {xid_type} injected successfully: {msg}")
        else:
            logger.error(f"XID {xid_type} injection failed: {msg}")
        return success, msg

    # Convenience methods for specific XIDs (backward compatibility)
    def inject_xid_79_gpu_fell_off_bus(self, gpu_id: int = 0) -> Tuple[bool, str]:
        """Inject XID 79 (GPU Fell Off Bus) - most critical hardware failure."""
        return self.inject_xid(79, gpu_id)

    def inject_xid_74_nvlink_error(self, gpu_id: int = 0) -> Tuple[bool, str]:
        """Inject XID 74 (NVLink error) - multi-GPU communication failure."""
        return self.inject_xid(74, gpu_id)

    def inject_xid_48_ecc_dbe(self, gpu_id: int = 0) -> Tuple[bool, str]:
        """Inject XID 48 (Double-bit ECC error) - severe memory error."""
        return self.inject_xid(48, gpu_id)

    def inject_xid_94_ecc_contained(self, gpu_id: int = 0) -> Tuple[bool, str]:
        """Inject XID 94 (Contained ECC error) - less severe memory error."""
        return self.inject_xid(94, gpu_id)

    def inject_xid_95_uncontained(self, gpu_id: int = 0) -> Tuple[bool, str]:
        """Inject XID 95 (Uncontained error) - very severe, GPU reset required."""
        return self.inject_xid(95, gpu_id)

    @staticmethod
    def _build_kmsg_command(kmsg_message: str) -> list:
        """
        Build the nsenter argv that writes *kmsg_message* to the host's /dev/kmsg.

        The message is handed to the shell as positional parameter ``$1``
        instead of being spliced into the script text, so quotes or other
        shell metacharacters in it cannot change the command that runs.
        """
        return [
            "nsenter",
            "--target",
            "1",  # Target host PID 1 (init)
            "--mount",  # Enter mount namespace (for /dev/kmsg access)
            "--uts",  # Enter UTS namespace (hostname)
            "--ipc",  # Enter IPC namespace
            "--pid",  # Enter PID namespace (appear as host process)
            "--",
            "sh",
            "-c",
            'echo "$1" > /dev/kmsg',
            "sh",  # $0 of the inline script
            kmsg_message,  # $1 of the inline script
        ]

    def _inject_fake_xid_to_kmsg(self, gpu_id: int, xid: int) -> Tuple[bool, str]:
        """
        Inject a fake XID message to the host's /dev/kmsg via nsenter.

        Looks up the GPU's PCI bus id with nvidia-smi, formats the message and
        writes it from inside the namespaces of PID 1.

        Message format: "NVRM: NVRM: Xid (PCI:address): xid, message"
        The "NVRM:" prefix is deliberately duplicated (original author's note:
        /dev/kmsg splits on the first colon).

        Args:
            gpu_id: GPU device ID (from nvidia-smi)
            xid: XID error code; codes missing from ``XID_MESSAGES`` get a
                generic description

        Returns:
            Tuple of (success: bool, message: str). Every failure, including
            unexpected exceptions, is reported as ``(False, reason)``.
        """
        try:
            # Get PCI address for the GPU
            pci_result = subprocess.run(
                [
                    "nvidia-smi",
                    "--query-gpu=pci.bus_id",
                    "--format=csv,noheader",
                    "-i",
                    str(gpu_id),
                ],
                capture_output=True,
                text=True,
                timeout=10,
            )
            if pci_result.returncode != 0:
                return (
                    False,
                    f"Failed to get PCI address for GPU {gpu_id}: {pci_result.stderr}",
                )

            # Refuse to inject a message with an empty "PCI:" field, and use
            # only the first line so the kmsg record stays a single line.
            pci_lines = pci_result.stdout.strip().splitlines()
            if not pci_lines:
                return (
                    False,
                    f"nvidia-smi returned no PCI address for GPU {gpu_id}",
                )
            pci_addr_full = pci_lines[0].strip()
            pci_addr = self._normalize_pci_address(pci_addr_full)

            # Get appropriate error message for this XID type
            # If XID is known, use specific message; otherwise use generic format
            error_msg = XID_MESSAGES.get(
                xid, f"Graphics Exception: XID {xid} occurred on GPU"
            )

            # Format XID message (duplicate "NVRM:" for /dev/kmsg parsing)
            # Format matches NVSentinel pattern: NVRM: Xid (PCI:addr): code, description
            xid_message = f"NVRM: NVRM: Xid (PCI:{pci_addr}): {xid}, {error_msg}"
            logger.debug(f"Formatted XID message: {xid_message}")

            # Write to host's /dev/kmsg via nsenter
            kmsg_message = f"<3>{xid_message}"  # <3> = kernel error priority
            nsenter_result = subprocess.run(
                self._build_kmsg_command(kmsg_message),
                capture_output=True,
                text=True,
                timeout=5,
            )
            if nsenter_result.returncode != 0:
                return (
                    False,
                    f"Failed to write to host /dev/kmsg: {nsenter_result.stderr}",
                )
            return (
                True,
                f"XID {xid} injected for GPU {gpu_id} (PCI: {pci_addr}) → NVSentinel",
            )
        except Exception as e:
            # Boundary handler: callers rely on a (bool, str) result, so
            # unexpected errors are logged and reported rather than raised.
            logger.error(f"XID injection failed: {type(e).__name__}: {e}")
            return False, f"Failed to inject XID: {e}"
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
fastapi==0.111.0
httpx==0.26.0
kubernetes==28.1.0
pydantic==2.5.3
python-multipart==0.0.20
pyyaml==6.0.1
uvicorn[standard]==0.27.0
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment