feat(fault-injection): Add CUDA fault injection library foundation (#4038)

Signed-off-by: nv-oviya <oseeniraj@nvidia.com> Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>

feat(fault-injection): Add CUDA fault injection library foundation (#4038)
Signed-off-by: nv-oviya <oseeniraj@nvidia.com> Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
78934278 · nv-oviya · GitHub · 2e5e68b4 · 78934278 · 78934278
Unverified Commit 78934278 authored Nov 26, 2025 by nv-oviya Committed by GitHub Nov 26, 2025
5 changed files
--- a/tests/fault_tolerance/hardware/fault-injection-service/cuda-fault-injection/.gitignore
+++ b/tests/fault_tolerance/hardware/fault-injection-service/cuda-fault-injection/.gitignore
+# Compiled library
+*.so
+*.o
+*.a
+
+# Build artifacts
+*.dylib
+*.dll
+
+# Editor files
+*.swp
+*~
+.DS_Store
+
+# Test outputs
+test_output.txt
+
--- a/tests/fault_tolerance/hardware/fault-injection-service/cuda-fault-injection/Makefile
+++ b/tests/fault_tolerance/hardware/fault-injection-service/cuda-fault-injection/Makefile
+# Makefile for CUDA Intercept Library
+# Simulates various XID errors via LD_PRELOAD
+#
+# GCC Version: Requires gcc 4.8+ (any modern gcc works)
+#              Tested with gcc 7.5, 9.4, 11.x, 13.x
+#              Uses standard C99 features only
+
+.PHONY: all clean test help
+
+# Compiler settings
+CC = gcc
+CFLAGS = -fPIC -Wall -Wextra -std=c99
+LDFLAGS = -shared
+LDLIBS = -ldl
+TARGET = cuda_intercept.so
+SOURCE = cuda_intercept.c
+
+all: $(TARGET)
+
+$(TARGET): $(SOURCE)
+	@echo "Building CUDA fault injection library..."
+	$(CC) $(CFLAGS) $(SOURCE) $(LDFLAGS) -o $(TARGET) $(LDLIBS)
+	@echo "✓ Built: $(TARGET)"
+	@echo ""
+	@echo "Usage:"
+	@echo "  LD_PRELOAD=./$(TARGET) python -c 'import torch; print(torch.cuda.is_available())'"
+	@echo ""
+
+clean:
+	@echo "Cleaning..."
+	rm -f $(TARGET)
+	@echo "✓ Cleaned"
+
+test: $(TARGET)
+	@echo "Testing CUDA fault injection..."
+	@echo ""
+	@echo "Test 1: With fault injection enabled (default)"
+	@CUDA_FAULT_INJECTION_ENABLED=1 LD_PRELOAD=./$(TARGET) python3 -c 'import torch; print("CUDA available:", torch.cuda.is_available()); print("Device count:", torch.cuda.device_count())' || echo "✓ Expected failure"
+	@echo ""
+	@echo "Test 2: With fault injection disabled"
+	@CUDA_FAULT_INJECTION_ENABLED=0 LD_PRELOAD=./$(TARGET) python3 -c 'import torch; print("CUDA available:", torch.cuda.is_available()); print("Device count:", torch.cuda.device_count())'
+	@echo ""
+
+help:
+	@echo "CUDA Fault Injection Library - Makefile"
+	@echo ""
+	@echo "Targets:"
+	@echo "  make          - Build the library"
+	@echo "  make test     - Run basic tests"
+	@echo "  make clean    - Remove built files"
+	@echo "  make help     - Show this help"
+	@echo ""
+	@echo "Usage in tests:"
+	@echo "  export LD_PRELOAD=/path/to/fake_cuda_xid79.so"
+	@echo "  export CUDA_FAULT_INJECTION_ENABLED=1"
+	@echo "  python -m vllm.entrypoints.api_server"
+
--- a/tests/fault_tolerance/hardware/fault-injection-service/cuda-fault-injection/README.md
+++ b/tests/fault_tolerance/hardware/fault-injection-service/cuda-fault-injection/README.md
+# CUDA Fault Injection - Test Library
+
+**Purpose**: Safely simulate GPU failures (XID errors) in tests without breaking real hardware.
+
+> **⚠️ Note**: This directory contains the **C library source code** only. The library is **compiled in-pod** during Kubernetes tests for Linux compatibility. You do **not** need to build it locally unless doing standalone local testing.
+
+## What This Does
+
+Makes CUDA calls return error codes to simulate various GPU failures. Uses LD_PRELOAD to intercept CUDA library calls.
+
+```
+Pod calls cudaMalloc() → LD_PRELOAD intercepts → Returns error → Pod crashes
+```
+
+**Result**: Realistic GPU failure testing without hardware damage.
+
+## Scope
+
+This library simulates **software/orchestration-level failures** that occur when GPU hardware becomes inaccessible or unusable:
+- ✅ **In scope**: CUDA API failures due to GPU becoming unavailable (XID errors, device not found, ECC errors)
+- ✅ **Use case**: Testing Kubernetes pod rescheduling, inference failover, recovery orchestration
+- ❌ **Out of scope**: Bit-level Silent Data Corruption (SDC), compute errors, incorrect results
+- ❌ **Not modeled**: General GPU faulting phenomena at the computation/memory level
+
+**Note**: SDC detection mechanisms will not trigger with this approach, as we intercept at the CUDA API layer, not at the hardware/computation layer.
+
+## Supported XID Errors
+
+| XID | Description | CUDA Error | Use Case |
+|-----|-------------|------------|----------|
+| **79** | GPU fell off bus | `CUDA_ERROR_NO_DEVICE` | Most common, node-level failure |
+| **48** | Double-bit ECC error | `CUDA_ERROR_ECC_UNCORRECTABLE` | Memory corruption |
+| **94** | Contained ECC error | `CUDA_ERROR_ECC_UNCORRECTABLE` | Recoverable memory error |
+| **95** | Uncontained error | `CUDA_ERROR_UNKNOWN` | Fatal GPU error |
+| **43** | GPU stopped responding | `CUDA_ERROR_LAUNCH_TIMEOUT` | Hung kernel |
+| **74** | NVLink error | `CUDA_ERROR_PEER_ACCESS_UNSUPPORTED` | Multi-GPU communication failure |
+
+## Files in This Directory
+
+| File | Purpose |
+|------|---------|
+| `cuda_intercept.c` | C library source that intercepts CUDA calls |
+| `inject_into_pods.py` | Helper functions for patching Kubernetes deployments |
+| `Makefile` | Builds the `.so` library locally (optional, for standalone testing) |
+
+## Prerequisites
+
+- **gcc compiler** (for building the library)
+- **kubectl** with cluster access
+- Python packages: `kubernetes`, `requests`
+- No local compilation needed (compiled in-pod)
+
+### For Standalone Local Testing (Optional)
+- **gcc** (version 7.5+ recommended, any modern gcc works)
+- **CUDA development headers** (optional, uses runtime API only)
+
+## Writing Your Own Test
+
+### Import Helper Functions
+
+```python
+import sys
+from pathlib import Path
+
+# Add cuda-fault-injection to path
+cuda_injection_dir = Path(__file__).parent.parent / "cuda-fault-injection"
+sys.path.insert(0, str(cuda_injection_dir))
+
+from inject_into_pods import (
+    create_cuda_fault_configmap,      # Step 1: Create ConfigMap with library source
+    patch_deployment_env,             # Step 2: Patch deployment to use it
+    delete_cuda_fault_configmap       # Cleanup: Remove ConfigMap
+)
+```
\ No newline at end of file
--- a/tests/fault_tolerance/hardware/fault-injection-service/cuda-fault-injection/cuda_intercept.c
+++ b/tests/fault_tolerance/hardware/fault-injection-service/cuda-fault-injection/cuda_intercept.c
+/*
+ * CUDA Intercept Library
+ *
+ * This library intercepts CUDA calls and returns appropriate error codes
+ * to simulate various GPU failures (XIDs).
+ *
+ * Supported XID types (set via CUDA_XID_TYPE environment variable):
+ *   79  - GPU fell off bus (CUDA_ERROR_NO_DEVICE) - DEFAULT
+ *   48  - Double-bit ECC error (CUDA_ERROR_ECC_UNCORRECTABLE)
+ *   94  - Contained ECC error (CUDA_ERROR_ECC_UNCORRECTABLE)
+ *   95  - Uncontained error (CUDA_ERROR_UNKNOWN)
+ *   43  - GPU stopped responding (CUDA_ERROR_LAUNCH_TIMEOUT)
+ *   74  - NVLink error (CUDA_ERROR_PEER_ACCESS_UNSUPPORTED)
+ *
+ * Compile:
+ *   gcc -shared -fPIC -ldl cuda_intercept.c -o cuda_intercept.so
+ *
+ * Use:
+ *   export CUDA_FAULT_INJECTION_ENABLED=1
+ *   export CUDA_XID_TYPE=79  # or 48, 94, 95, 43, 74
+ *   LD_PRELOAD=/path/to/cuda_intercept.so python -m vllm.entrypoints.api_server
+ */
+
+#include <dlfcn.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+typedef int cudaError_t;
+typedef struct cudaDeviceProp_st {
+  char name[256];
+  size_t totalGlobalMem;
+  // ... other fields (we don't need them)
+} cudaDeviceProp;
+
+// CUDA error codes (from cuda_runtime_api.h)
+#define cudaSuccess 0
+#define cudaErrorNoDevice 100               // XID 79: GPU fell off bus
+#define cudaErrorEccUncorrectable 214       // XID 48, 94: ECC errors
+#define cudaErrorUnknown 999                // XID 95: Uncontained error
+#define cudaErrorLaunchTimeout 6            // XID 43: GPU stopped responding
+#define cudaErrorPeerAccessUnsupported 217  // XID 74: NVLink error
+
+// XID error type mapping
+typedef struct {
+  int xid;
+  cudaError_t cuda_error;
+  const char* description;
+} xid_mapping_t;
+
+static const xid_mapping_t xid_mappings[] = {
+    {79, cudaErrorNoDevice, "GPU fell off bus"},
+    {48, cudaErrorEccUncorrectable, "Double-bit ECC error"},
+    {94, cudaErrorEccUncorrectable, "Contained ECC error"},
+    {95, cudaErrorUnknown, "Uncontained error"},
+    {43, cudaErrorLaunchTimeout, "GPU stopped responding"},
+    {74, cudaErrorPeerAccessUnsupported, "NVLink error"},
+    {0, 0, NULL}  // Sentinel
+};
+
+// Get XID type and corresponding CUDA error
+static void
+get_fault_config(int* inject, int* xid_type, cudaError_t* error_code)
+{
+  static int initialized = 0;
+  static int cached_inject = 0;
+  static int cached_xid = 79;  // Default to XID 79
+  static cudaError_t cached_error = cudaErrorNoDevice;
+
+  if (!initialized) {
+    // Check if injection is enabled
+    char* env = getenv("CUDA_FAULT_INJECTION_ENABLED");
+    if (env) {
+      cached_inject = (strcmp(env, "1") == 0 || strcmp(env, "true") == 0);
+    }
+
+    // Get XID type
+    char* xid_env = getenv("CUDA_XID_TYPE");
+    if (xid_env) {
+      cached_xid = atoi(xid_env);
+
+      // Find corresponding CUDA error
+      int found = 0;
+      for (int i = 0; xid_mappings[i].description != NULL; i++) {
+        if (xid_mappings[i].xid == cached_xid) {
+          cached_error = xid_mappings[i].cuda_error;
+          fprintf(
+              stderr, "[CUDA FAULT INJECTION] ENABLED - Simulating XID %d (%s)\n", cached_xid,
+              xid_mappings[i].description);
+          found = 1;
+          break;
+        }
+      }
+
+      if (!found) {
+        fprintf(stderr, "[CUDA FAULT INJECTION] WARNING: Unknown XID %d, defaulting to XID 79\n", cached_xid);
+        cached_xid = 79;
+        cached_error = cudaErrorNoDevice;
+      }
+    } else {
+      fprintf(
+          stderr, "[CUDA FAULT INJECTION] %s (default: XID 79 - GPU fell off bus)\n",
+          cached_inject ? "ENABLED" : "DISABLED");
+    }
+
+    initialized = 1;
+  }
+
+  *inject = cached_inject;
+  *xid_type = cached_xid;
+  *error_code = cached_error;
+}
+
+// Check if fault should be injected
+static int
+should_inject_fault()
+{
+  int inject, xid;
+  cudaError_t error;
+  get_fault_config(&inject, &xid, &error);
+  return inject;
+}
+
+// Get the error code to return
+static cudaError_t
+get_error_code()
+{
+  int inject, xid;
+  cudaError_t error;
+  get_fault_config(&inject, &xid, &error);
+  return error;
+}
+
+// Log helper
+static void
+log_intercept(const char* func_name, cudaError_t error_code)
+{
+  if (should_inject_fault()) {
+    int inject, xid;
+    cudaError_t err;
+    get_fault_config(&inject, &xid, &err);
+    fprintf(stderr, "[XID %d SIM] %s() intercepted -> error %d\n", xid, func_name, error_code);
+  }
+}
+
+// Intercept: Get device count
+cudaError_t
+cudaGetDeviceCount(int* count)
+{
+  if (should_inject_fault()) {
+    cudaError_t error = get_error_code();
+    log_intercept("cudaGetDeviceCount", error);
+    if (count)
+      *count = 0;
+    return error;
+  }
+
+  // If disabled, call real function
+  typedef cudaError_t (*real_func_t)(int*);
+  real_func_t real_func = (real_func_t)dlsym(RTLD_NEXT, "cudaGetDeviceCount");
+  if (real_func) {
+    return real_func(count);
+  }
+  return cudaErrorNoDevice;
+}
+
+// Intercept: Set device
+cudaError_t
+cudaSetDevice(int device)
+{
+  if (should_inject_fault()) {
+    cudaError_t error = get_error_code();
+    log_intercept("cudaSetDevice", error);
+    return error;
+  }
+
+  typedef cudaError_t (*real_func_t)(int);
+  real_func_t real_func = (real_func_t)dlsym(RTLD_NEXT, "cudaSetDevice");
+  if (real_func) {
+    return real_func(device);
+  }
+  return cudaErrorNoDevice;
+}
+
+// Intercept: Get device
+cudaError_t
+cudaGetDevice(int* device)
+{
+  if (should_inject_fault()) {
+    cudaError_t error = get_error_code();
+    log_intercept("cudaGetDevice", error);
+    return error;
+  }
+
+  typedef cudaError_t (*real_func_t)(int*);
+  real_func_t real_func = (real_func_t)dlsym(RTLD_NEXT, "cudaGetDevice");
+  if (real_func) {
+    return real_func(device);
+  }
+  return cudaErrorNoDevice;
+}
+
+// Intercept: Malloc
+cudaError_t
+cudaMalloc(void** devPtr, size_t size)
+{
+  if (should_inject_fault()) {
+    cudaError_t error = get_error_code();
+    log_intercept("cudaMalloc", error);
+    return error;
+  }
+
+  typedef cudaError_t (*real_func_t)(void**, size_t);
+  real_func_t real_func = (real_func_t)dlsym(RTLD_NEXT, "cudaMalloc");
+  if (real_func) {
+    return real_func(devPtr, size);
+  }
+  return cudaErrorNoDevice;
+}
+
+// Intercept: Free
+cudaError_t
+cudaFree(void* devPtr)
+{
+  if (should_inject_fault()) {
+    cudaError_t error = get_error_code();
+    log_intercept("cudaFree", error);
+    return error;
+  }
+
+  typedef cudaError_t (*real_func_t)(void*);
+  real_func_t real_func = (real_func_t)dlsym(RTLD_NEXT, "cudaFree");
+  if (real_func) {
+    return real_func(devPtr);
+  }
+  return cudaErrorNoDevice;
+}
+
+// Intercept: Memcpy
+cudaError_t
+cudaMemcpy(void* dst, const void* src, size_t count, int kind)
+{
+  if (should_inject_fault()) {
+    cudaError_t error = get_error_code();
+    log_intercept("cudaMemcpy", error);
+    return error;
+  }
+
+  typedef cudaError_t (*real_func_t)(void*, const void*, size_t, int);
+  real_func_t real_func = (real_func_t)dlsym(RTLD_NEXT, "cudaMemcpy");
+  if (real_func) {
+    return real_func(dst, src, count, kind);
+  }
+  return cudaErrorNoDevice;
+}
+
+// Intercept: Device synchronize
+cudaError_t
+cudaDeviceSynchronize(void)
+{
+  if (should_inject_fault()) {
+    cudaError_t error = get_error_code();
+    log_intercept("cudaDeviceSynchronize", error);
+    return error;
+  }
+
+  typedef cudaError_t (*real_func_t)(void);
+  real_func_t real_func = (real_func_t)dlsym(RTLD_NEXT, "cudaDeviceSynchronize");
+  if (real_func) {
+    return real_func();
+  }
+  return cudaErrorNoDevice;
+}
+
+// Intercept: Get device properties
+cudaError_t
+cudaGetDeviceProperties(cudaDeviceProp* prop, int device)
+{
+  if (should_inject_fault()) {
+    cudaError_t error = get_error_code();
+    log_intercept("cudaGetDeviceProperties", error);
+    return error;
+  }
+
+  typedef cudaError_t (*real_func_t)(cudaDeviceProp*, int);
+  real_func_t real_func = (real_func_t)dlsym(RTLD_NEXT, "cudaGetDeviceProperties");
+  if (real_func) {
+    return real_func(prop, device);
+  }
+  return cudaErrorNoDevice;
+}
--- a/tests/fault_tolerance/hardware/fault-injection-service/cuda-fault-injection/inject_into_pods.py
+++ b/tests/fault_tolerance/hardware/fault-injection-service/cuda-fault-injection/inject_into_pods.py