feat(fault-injection): Add GPU fault injector agent (#4043)

Signed-off-by: Oviya Seeniraj <oseeniraj@nvidia.com> Signed-off-by: Harrison Saturley-Hall <harrison.saturley.hall@gmail.com> Signed-off-by: Harrison King Saturley-Hall <hsaturleyhal@nvidia.com> Co-authored-by: Harrison Saturley-Hall <hsaturleyhal@nvidia.com>

feat(fault-injection): Add GPU fault injector agent (#4043)
Signed-off-by: Oviya Seeniraj <oseeniraj@nvidia.com> Signed-off-by: Harrison Saturley-Hall <harrison.saturley.hall@gmail.com> Signed-off-by: Harrison King Saturley-Hall <hsaturleyhal@nvidia.com> Co-authored-by: Harrison Saturley-Hall <hsaturleyhal@nvidia.com>
e10319f3 · nv-oviya · GitHub · 39a9d0b2 · e10319f3 · e10319f3
Unverified Commit e10319f3 authored Nov 26, 2025 by nv-oviya Committed by GitHub Nov 26, 2025
4 changed files
--- a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/Dockerfile
+++ b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/Dockerfile
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# GPU Fault Injector Agent - XID 79 Injection via nsenter+kmsg
+# Runs as privileged DaemonSet on GPU nodes to inject XID errors
+#
+# NOTE: GPU nodes are AMD64/x86_64 architecture
+# Build with: docker buildx build --platform linux/amd64 --load -t <image> .
+FROM nvcr.io/nvidia/cuda:12.3.0-devel-ubuntu22.04
+# Install system dependencies: Python/pip for the agent, curl for the health
+# check, plus host-inspection tools (nsenter, journalctl, kmod, pciutils).
+# NOTE(review): nvidia-smi is used by the agent but is not in this package
+# list - presumably supplied by the NVIDIA container runtime; confirm.
+# DEBIAN_FRONTEND is attached to `apt-get install`: a VAR=value prefix only
+# applies to the one command it precedes, so prefixing `apt-get update` would
+# leave the install step running with the default (interactive) frontend.
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    python3 \
+    python3-pip \
+    curl \
+    util-linux \
+    systemd \
+    kmod \
+    pciutils \
+    && rm -rf /var/lib/apt/lists/*
+# Install Python packages
+COPY requirements.txt /tmp/
+RUN pip3 install --no-cache-dir -r /tmp/requirements.txt
+# Create working directory
+WORKDIR /app
+# Copy agent code
+COPY agent.py /app/
+COPY gpu_xid_injector.py /app/
+# Create log directory
+RUN mkdir -p /var/log/gpu-fault-injector
+# Unbuffered stdout/stderr so log lines appear immediately in container logs
+ENV PYTHONUNBUFFERED=1
+# Port the agent's HTTP API listens on (see uvicorn.run in agent.py)
+EXPOSE 8083
+# Health check against the agent's /health endpoint
+HEALTHCHECK --interval=30s --timeout=5s --start-period=30s --retries=3 \
+    CMD curl -f http://localhost:8083/health || exit 1
+# Run agent
+CMD ["python3", "agent.py"]
--- a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py
+++ b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/agent.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+"""
+GPU Fault Injector Agent - Runs as DaemonSet on GPU nodes.
+This agent provides privileged access for XID error injection:
+- XID injection via nsenter+kmsg (writes to host's /dev/kmsg)
+- Triggers NVSentinel syslog-health-monitor detection
+- Initiates complete fault tolerance workflow
+Accepts ANY XID error code for testing flexibility.
+Pre-defined messages for all DCGM/NVSentinel monitored XIDs:
+- Devastating: 79, 74, 48, 94, 95, 119, 120, 140
+- Memory: 31, 32, 43, 63, 64
+- PCIe: 38, 39, 42
+- Thermal: 60, 61, 62
+- Power: 54, 56, 57
+- Graphics: 13, 45, 69
+Unknown XIDs use generic error message format.
+NVSentinel detects XIDs and handles actions based on its own rules.
+See gpu_xid_injector.py for complete XID descriptions.
+"""
+import logging
+import os
+import subprocess
+from datetime import datetime, timezone
+from typing import Any, Optional, Type
+import uvicorn
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
+# Import kernel-level XID injector (for XID 79 via nsenter+kmsg)
+GPUXIDInjectorKernel: Optional[Type[Any]] = None
+try:
+    from gpu_xid_injector import GPUXIDInjectorKernel  # type: ignore[assignment]
+    KERNEL_XID_AVAILABLE = True
+except ImportError:
+    logger.warning("Kernel-level XID injector not available")
+    KERNEL_XID_AVAILABLE = False
+# ============================================================================
+# Models and Enums
+# ============================================================================
+class XIDInjectRequest(BaseModel):
+    """Request body for POST /inject-xid (XID error injection via nsenter+kmsg)."""
+    # Caller-chosen identifier; the endpoint uses it as the key under which
+    # the injected fault is recorded
+    fault_id: str
+    # XID error code to inject; the endpoint rejects values outside 1-1000
+    xid_type: int
+    # GPU index passed through to the injector (defaults to GPU 0)
+    gpu_id: int = 0
+    # Optional duration; not read by the /inject-xid handler in this file
+    # NOTE(review): unit and intended use are not shown here - confirm with callers
+    duration: Optional[int] = None
+# ============================================================================
+# GPU Fault Injector
+# ============================================================================
+class GPUFaultInjector:
+    """
+    GPU fault injection operations with DCGM integration.
+    Supports ANY XID injection via nsenter+kmsg (27+ pre-defined messages).
+    Accepts any XID value (1-1000) for comprehensive fault tolerance testing.
+    """
+    def __init__(self):
+        self.active_faults: dict[str, dict[str, Any]] = {}
+        self.node_name = os.getenv("NODE_NAME", "unknown")
+        self.dcgm_available = self._check_dcgm()
+        self.gpu_count = self._get_gpu_count()
+        # Initialize kernel-level XID injector (XID 79 via nsenter+kmsg)
+        self.kernel_xid_injector = None
+        self.kernel_xid_available = False
+        if KERNEL_XID_AVAILABLE and GPUXIDInjectorKernel is not None:
+            try:
+                self.kernel_xid_injector = GPUXIDInjectorKernel()
+                self.kernel_xid_available = self.kernel_xid_injector.privileged
+                logger.info(
+                    f"Kernel-level XID injector initialized (privileged: {self.kernel_xid_available})"
+                )
+            except Exception as e:
+                logger.warning(f"Kernel XID injector not available: {e}")
+        logger.info(f"GPU Fault Injector initialized on node: {self.node_name}")
+        logger.info(f"DCGM available: {self.dcgm_available}")
+        logger.info(f"GPU count: {self.gpu_count}")
+        logger.info(f"XID 79 injection (nsenter+kmsg): {self.kernel_xid_available}")
+    def _check_dcgm(self) -> bool:
+        """Check if DCGM is available"""
+        try:
+            result = subprocess.run(
+                ["dcgmi", "discovery", "-l"], capture_output=True, text=True, timeout=5
+            )
+            return result.returncode == 0
+        except Exception as e:
+            logger.warning(f"DCGM not available: {e}")
+            return False
+    def _get_gpu_count(self) -> int:
+        """Get number of GPUs on this node"""
+        try:
+            result = subprocess.run(
+                ["nvidia-smi", "--query-gpu=count", "--format=csv,noheader"],
+                capture_output=True,
+                text=True,
+                timeout=5,
+            )
+            if result.returncode == 0:
+                return int(result.stdout.strip().split("\n")[0])
+            return 0
+        except Exception as e:
+            logger.error(f"Failed to get GPU count: {e}")
+            return 0
+    def _run_command(self, command: list[str], timeout: int = 30) -> tuple[bool, str]:
+        """Run shell command with timeout"""
+        try:
+            result = subprocess.run(
+                command, capture_output=True, text=True, timeout=timeout
+            )
+            success = result.returncode == 0
+            output = result.stdout if success else result.stderr
+            return success, output.strip()
+        except subprocess.TimeoutExpired:
+            return False, "Command timed out"
+        except Exception as e:
+            return False, str(e)
+# ============================================================================
+# FastAPI Application
+# ============================================================================
+# ASGI application object served by uvicorn
+app = FastAPI(title="GPU Fault Injector Agent", version="1.0.0")
+# Single module-level injector shared by all endpoints. Constructing it runs
+# the DCGM / nvidia-smi probes, so importing this module has side effects.
+injector = GPUFaultInjector()
+@app.get("/health")
+async def health_check():
+    """Health check endpoint"""
+    return {
+        "status": "healthy",
+        "node": injector.node_name,
+        "gpu_count": injector.gpu_count,
+        "dcgm_available": injector.dcgm_available,
+        "active_faults": len(injector.active_faults),
+    }
+@app.post("/inject-xid")
+async def inject_xid(request: XIDInjectRequest):
+    """
+    Inject ANY XID error via nsenter+kmsg (triggers NVSentinel detection).
+    Accepts any XID error code (1-1000) for maximum testing flexibility.
+    Pre-defined messages for all DCGM/NVSentinel monitored XIDs:
+    Devastating (always FAIL):
+    - 79: GPU fell off bus | 74: NVLink error | 48: ECC DBE | 94/95: ECC errors
+    - 119/120: GSP errors | 140: ECC unrecovered
+    Subsystem (may WARN/escalate):
+    - Memory: 31, 32, 43, 63, 64 (MMU, PBDMA, page retirement)
+    - PCIe: 38, 39, 42 (bus, fabric, replay rate)
+    - Thermal: 60, 61, 62 (temperature limits)
+    - Power: 54, 56, 57 (power/clock state)
+    - Graphics: 13, 45, 69 (SM exceptions)
+    Unknown XIDs use generic error message - NVSentinel will parse and handle
+    based on its own XID database.
+    """
+    logger.info(
+        f"Received XID {request.xid_type} injection request for GPU {request.gpu_id}"
+    )
+    # Validate XID type is a reasonable integer (basic sanity check)
+    if (
+        not isinstance(request.xid_type, int)
+        or request.xid_type < 1
+        or request.xid_type > 1000
+    ):
+        raise HTTPException(
+            status_code=400,
+            detail=(
+                f"Invalid XID type: {request.xid_type}. "
+                f"XID must be an integer between 1-1000. "
+                f"Common XIDs: 79 (bus error), 74 (NVLink), 48/94/95 (ECC errors)."
+            ),
+        )
+    if not injector.kernel_xid_available or not injector.kernel_xid_injector:
+        raise HTTPException(
+            status_code=503,
+            detail=f"Kernel-level XID injector not available. XID {request.xid_type} requires privileged access to syslog/kmsg.",
+        )
+    # Use the generic inject_xid method which supports multiple XID types
+    success, message = injector.kernel_xid_injector.inject_xid(
+        xid_type=request.xid_type, gpu_id=request.gpu_id
+    )
+    if not success:
+        raise HTTPException(status_code=500, detail=message)
+    # Track the fault
+    injector.active_faults[request.fault_id] = {
+        "type": f"xid_{request.xid_type}",
+        "gpu_id": request.gpu_id,
+        "timestamp": datetime.now(timezone.utc).isoformat(),
+    }
+    return {
+        "status": "injected",
+        "node": injector.node_name,
+        "fault_id": request.fault_id,
+        "xid_type": request.xid_type,
+        "gpu_id": request.gpu_id,
+        "message": message,
+        "timestamp": datetime.now(timezone.utc).isoformat(),
+    }
+@app.get("/faults")
+async def list_active_faults():
+    """List active faults on this node"""
+    return {
+        "node": injector.node_name,
+        "active_faults": list(injector.active_faults.keys()),
+        "count": len(injector.active_faults),
+    }
+if __name__ == "__main__":
+    uvicorn.run(
+        app,
+        host="0.0.0.0",
+        port=8083,
+        log_level="info",
+    )
--- a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/gpu_xid_injector.py
+++ b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/gpu_xid_injector.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+"""
+GPU XID Error Injector via nsenter+kmsg.
+Injects fake XID messages to host's /dev/kmsg to trigger NVSentinel detection.
+Uses nsenter to enter host namespaces and write kernel messages that NVSentinel
+syslog-health-monitor can detect naturally.
+Method: nsenter --target 1 (all namespaces) → echo to /dev/kmsg → NVSentinel detection
+Supported XIDs:
+===============
+This injector accepts ANY XID error code (1-255+) for maximum testing flexibility.
+Pre-defined Messages for All DCGM/NVSentinel Monitored XIDs:
+-------------------------------------------------------------
+Based on DCGM health monitoring subsystems and NVSentinel detection rules.
+DEVASTATING XIDs (DCGM_HEALTH_RESULT_FAIL - always monitored):
+- 79:  GPU fell off bus (most critical - node-level action)
+- 74:  NVLink uncorrectable error (multi-GPU communication failure)
+- 48:  Double-bit ECC error (severe memory error)
+- 94:  Contained ECC error (less severe memory error)
+- 95:  Uncontained error (very severe, GPU reset required)
+- 119: GSP RPC Timeout (GPU Service Processor communication)
+- 120: GSP Error (GPU Service Processor internal error)
+- 140: ECC unrecovered error (persistent memory issue)
+SUBSYSTEM XIDs (DCGM_HEALTH_RESULT_WARN - may escalate):
+Memory Subsystem (DCGM_HEALTH_WATCH_MEM):
+- 31:  MMU Error
+- 32:  PBDMA Error
+- 43:  Reset Channel Verification Error
+- 63:  Pending Page Retirements
+- 64:  Row Remap Failure
+PCIe Subsystem (DCGM_HEALTH_WATCH_PCIE):
+- 38:  PCIe Bus Error
+- 39:  PCIe Fabric Error
+- 42:  PCIe Replay Rate exceeded
+Thermal Subsystem (DCGM_HEALTH_WATCH_THERMAL):
+- 60:  Clocks Event: Thermal limit exceeded
+- 61:  EDPP Power Brake: Thermal limit
+- 62:  Thermal Violations detected
+Power Subsystem (DCGM_HEALTH_WATCH_POWER):
+- 54:  Power state change event
+- 56:  Clock change event
+- 57:  Clocks Event: Power limit exceeded
+Graphics/Common XIDs:
+- 13:  Graphics Engine Exception
+- 45:  Preemptive Cleanup (due to previous errors)
+- 69:  Graphics Exception: Class Error
+Unknown XIDs:
+-------------
+Any XID not in XID_MESSAGES dict will use a generic error message format.
+NVSentinel will parse and handle based on its own XID database and rules.
+Note: XIDs 43, 48, 74, 94, 95 are already supported via CUDA interception
+(cuda_intercept.c LD_PRELOAD). kmsg injection adds complementary syslog-based
+detection path for NVSentinel's syslog-health-monitor.
+"""
+import logging
+import os
+import subprocess
+from typing import Dict, Tuple
+logger = logging.getLogger(__name__)
+# XID error code to descriptive message mapping
+# Based on DCGM XID database and NVSentinel monitoring rules
+# Source: DCGM/modules/health/DcgmHealthWatch.cpp BuildXidMappings()
+XID_MESSAGES: Dict[int, str] = {
+    # Devastating XIDs (DCGM_HEALTH_RESULT_FAIL - always monitored)
+    79: "GPU has fallen off the bus",
+    48: "DBE (Double Bit Error) ECC Error",
+    74: "NVLink: Uncorrectable error",
+    94: "Contained ECC error",
+    95: "Uncontained error - GPU requires reset",
+    119: "GSP RPC Timeout",
+    120: "GSP Error",
+    140: "ECC unrecovered error",
+    # Memory Subsystem XIDs (DCGM_HEALTH_WATCH_MEM)
+    31: "MMU Error",
+    32: "PBDMA Error",
+    43: "Reset Channel Verification Error",
+    63: "Pending Page Retirements",
+    64: "Row Remap Failure",
+    # PCIe Subsystem XIDs (DCGM_HEALTH_WATCH_PCIE)
+    38: "PCIe Bus Error",
+    39: "PCIe Fabric Error",
+    42: "PCIe Replay Rate exceeded",
+    # 74 already defined above (can be PCIe or NVLink context)
+    # Thermal Subsystem XIDs (DCGM_HEALTH_WATCH_THERMAL)
+    60: "Clocks Event: Thermal limit exceeded",
+    61: "EDPP Power Brake: Thermal limit",
+    62: "Thermal Violations detected",
+    # 63 can be thermal or memory context ("Thermal diode detects short")
+    # Power Subsystem XIDs (DCGM_HEALTH_WATCH_POWER)
+    54: "Power state change event",
+    56: "Clock change event",
+    57: "Clocks Event: Power limit exceeded",
+    # Common Graphics XIDs (often seen in test environments)
+    13: "Graphics Engine Exception",
+    31: "GPU stopped responding",  # Can be both MMU or timeout context
+    45: "Preemptive Cleanup, due to previous errors",
+    69: "Graphics Exception: Class Error",
+}
+class GPUXIDInjectorKernel:
+    """
+    XID injector via nsenter+kmsg (triggers NVSentinel detection).
+    Accepts ANY XID error code for maximum flexibility in testing.
+    Pre-defined messages exist for common critical XIDs, but any XID value
+    can be injected - NVSentinel will parse and handle based on its own rules.
+    Pre-defined messages for all DCGM/NVSentinel monitored XIDs:
+    Devastating XIDs (always trigger FAIL):
+    - 79: GPU fell off bus, 74: NVLink error, 48: ECC DBE, 94/95: ECC errors
+    - 119/120: GSP errors, 140: ECC unrecovered
+    Subsystem XIDs (trigger WARN, may escalate):
+    - Memory (31, 32, 43, 63, 64): MMU, PBDMA, page retirement errors
+    - PCIe (38, 39, 42): Bus, fabric, replay rate errors
+    - Thermal (60, 61, 62, 63): Temperature limit violations
+    - Power (54, 56, 57): Power/clock state changes
+    - Graphics (13, 45, 69): SM exceptions, preemptive cleanup
+    Unknown XIDs use a generic error message format.
+    """
+    def __init__(self):
+        self.node_name = os.getenv("NODE_NAME", "unknown")
+        self.privileged = self._check_privileged()
+        logger.info(f"XID Injector initialized on {self.node_name}")
+        logger.info(f"Privileged: {self.privileged}")
+        logger.info(f"Known XIDs with specific messages: {sorted(XID_MESSAGES.keys())}")
+        logger.info("Method: nsenter+kmsg → NVSentinel detection → Full FT workflow")
+        logger.info("Note: Accepts ANY XID value - unknown XIDs use generic message")
+    def _check_privileged(self) -> bool:
+        """Check if we have privileged access (required for nsenter)"""
+        return os.geteuid() == 0
+    def _normalize_pci_address(self, pci_addr: str) -> str:
+        """
+        Normalize PCI address from nvidia-smi format to kernel sysfs format.
+        nvidia-smi returns: 00000001:00:00.0 (8-digit domain)
+        kernel expects:     0001:00:00.0     (4-digit domain)
+        Azure VMs use extended PCI addresses, but the kernel shortens them.
+        """
+        parts = pci_addr.split(":")
+        if len(parts) >= 3:
+            # Keep only last 4 digits of domain
+            domain = parts[0][-4:] if len(parts[0]) > 4 else parts[0]
+            normalized = f"{domain}:{parts[1]}:{parts[2]}"
+            logger.debug(f"Normalized PCI address: {pci_addr} -> {normalized}")
+            return normalized
+        return pci_addr
+    def inject_xid(self, xid_type: int, gpu_id: int = 0) -> Tuple[bool, str]:
+        """
+        Inject ANY XID error code via nsenter+kmsg.
+        This method accepts any integer XID value for maximum testing flexibility.
+        Pre-defined messages exist for well-known XIDs (79, 74, 48, etc.), but
+        any XID can be injected. Unknown XIDs use a generic error message.
+        Args:
+            xid_type: XID error code (any integer, commonly 1-255)
+            gpu_id: GPU device ID (default: 0)
+        Returns:
+            Tuple of (success: bool, message: str)
+        """
+        logger.info(f"Injecting XID {xid_type} for GPU {gpu_id}")
+        if not self.privileged:
+            return (
+                False,
+                f"XID {xid_type} injection requires privileged mode (nsenter needs root)",
+            )
+        success, msg = self._inject_fake_xid_to_kmsg(gpu_id, xid_type)
+        if success:
+            logger.info(f"XID {xid_type} injected successfully: {msg}")
+            return True, msg
+        else:
+            logger.error(f"XID {xid_type} injection failed: {msg}")
+            return False, msg
+    # Convenience methods for specific XIDs (backward compatibility)
+    def inject_xid_79_gpu_fell_off_bus(self, gpu_id: int = 0) -> Tuple[bool, str]:
+        """Inject XID 79 (GPU Fell Off Bus) - most critical hardware failure."""
+        return self.inject_xid(79, gpu_id)
+    def inject_xid_74_nvlink_error(self, gpu_id: int = 0) -> Tuple[bool, str]:
+        """Inject XID 74 (NVLink error) - multi-GPU communication failure."""
+        return self.inject_xid(74, gpu_id)
+    def inject_xid_48_ecc_dbe(self, gpu_id: int = 0) -> Tuple[bool, str]:
+        """Inject XID 48 (Double-bit ECC error) - severe memory error."""
+        return self.inject_xid(48, gpu_id)
+    def inject_xid_94_ecc_contained(self, gpu_id: int = 0) -> Tuple[bool, str]:
+        """Inject XID 94 (Contained ECC error) - less severe memory error."""
+        return self.inject_xid(94, gpu_id)
+    def inject_xid_95_uncontained(self, gpu_id: int = 0) -> Tuple[bool, str]:
+        """Inject XID 95 (Uncontained error) - very severe, GPU reset required."""
+        return self.inject_xid(95, gpu_id)
+    def _inject_fake_xid_to_kmsg(self, gpu_id: int, xid: int) -> Tuple[bool, str]:
+        """
+        Inject fake XID message to host's /dev/kmsg via nsenter.
+        Uses nsenter to enter all host namespaces (PID 1) and write to /dev/kmsg.
+        Creates real kernel messages with proper metadata that NVSentinel can detect.
+        Message format: "NVRM: NVRM: Xid (PCI:address): xid, message"
+        Duplicate "NVRM:" needed because /dev/kmsg splits on first colon.
+        Args:
+            gpu_id: GPU device ID (from nvidia-smi)
+            xid: XID error code (currently only 79 is used by public API)
+        Returns:
+            Tuple of (success: bool, message: str)
+        Note: This method accepts any XID code as a parameter for extensibility.
+        To add support for other XIDs (74, 48, 95, etc.), create corresponding
+        public methods like inject_xid_74_nvlink_error() and update the error
+        message template for each XID type.
+        """
+        try:
+            # Get PCI address for the GPU
+            pci_result = subprocess.run(
+                [
+                    "nvidia-smi",
+                    "--query-gpu=pci.bus_id",
+                    "--format=csv,noheader",
+                    "-i",
+                    str(gpu_id),
+                ],
+                capture_output=True,
+                text=True,
+                timeout=10,
+            )
+            if pci_result.returncode != 0:
+                return (
+                    False,
+                    f"Failed to get PCI address for GPU {gpu_id}: {pci_result.stderr}",
+                )
+            pci_addr_full = pci_result.stdout.strip()
+            pci_addr = self._normalize_pci_address(pci_addr_full)
+            # Get appropriate error message for this XID type
+            # If XID is known, use specific message; otherwise use generic format
+            error_msg = XID_MESSAGES.get(
+                xid, f"Graphics Exception: XID {xid} occurred on GPU"
+            )
+            # Format XID message (duplicate "NVRM:" for /dev/kmsg parsing)
+            # Format matches NVSentinel pattern: NVRM: Xid (PCI:addr): code, description
+            xid_message = f"NVRM: NVRM: Xid (PCI:{pci_addr}): {xid}, {error_msg}"
+            logger.debug(f"Formatted XID message: {xid_message}")
+            # Write to host's /dev/kmsg via nsenter
+            kmsg_message = f"<3>{xid_message}"  # <3> = kernel error priority
+            nsenter_cmd = [
+                "nsenter",
+                "--target",
+                "1",  # Target host PID 1 (init)
+                "--mount",  # Enter mount namespace (for /dev/kmsg access)
+                "--uts",  # Enter UTS namespace (hostname)
+                "--ipc",  # Enter IPC namespace
+                "--pid",  # Enter PID namespace (appear as host process)
+                "--",
+                "sh",
+                "-c",
+                f"echo '{kmsg_message}' > /dev/kmsg",
+            ]
+            nsenter_result = subprocess.run(
+                nsenter_cmd, capture_output=True, text=True, timeout=5
+            )
+            if nsenter_result.returncode != 0:
+                return (
+                    False,
+                    f"Failed to write to host /dev/kmsg: {nsenter_result.stderr}",
+                )
+            return (
+                True,
+                f"XID {xid} injected for GPU {gpu_id} (PCI: {pci_addr}) → NVSentinel",
+            )
+        except Exception as e:
+            logger.error(f"XID injection failed: {type(e).__name__}: {e}")
+            return False, f"Failed to inject XID: {e}"
--- a/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/requirements.txt
+++ b/tests/fault_tolerance/hardware/fault-injection-service/agents/gpu-fault-injector/requirements.txt
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+fastapi==0.111.0
+httpx==0.26.0
+kubernetes==28.1.0
+pydantic==2.5.3
+python-multipart==0.0.20
+pyyaml==6.0.1
+uvicorn[standard]==0.27.0