"lib/bindings/vscode:/vscode.git/clone" did not exist on "d0a63635849ab1c29f4b3cbe419a19730a575da1"
Unverified Commit 78934278 authored by nv-oviya's avatar nv-oviya Committed by GitHub
Browse files

feat(fault-injection): Add CUDA fault injection library foundation (#4038)


Signed-off-by: default avatarnv-oviya <oseeniraj@nvidia.com>
Co-authored-by: default avatarcoderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
parent 2e5e68b4
# Compiled library
*.so
*.o
*.a
# Build artifacts
*.dylib
*.dll
# Editor files
*.swp
*~
.DS_Store
# Test outputs
test_output.txt
# Makefile for CUDA Intercept Library
# Simulates various XID errors via LD_PRELOAD
#
# GCC Version: Requires gcc 4.8+ (any modern gcc works)
# Tested with gcc 7.5, 9.4, 11.x, 13.x
# Uses standard C99 features only
.PHONY: all clean test help
# Compiler settings
CC = gcc
CFLAGS = -fPIC -Wall -Wextra -std=c99
LDFLAGS = -shared
LDLIBS = -ldl
TARGET = cuda_intercept.so
SOURCE = cuda_intercept.c
all: $(TARGET)
$(TARGET): $(SOURCE)
@echo "Building CUDA fault injection library..."
$(CC) $(CFLAGS) $(SOURCE) $(LDFLAGS) -o $(TARGET) $(LDLIBS)
@echo "✓ Built: $(TARGET)"
@echo ""
@echo "Usage:"
@echo " LD_PRELOAD=./$(TARGET) python -c 'import torch; print(torch.cuda.is_available())'"
@echo ""
clean:
@echo "Cleaning..."
rm -f $(TARGET)
@echo "✓ Cleaned"
test: $(TARGET)
@echo "Testing CUDA fault injection..."
@echo ""
@echo "Test 1: With fault injection enabled (default)"
@CUDA_FAULT_INJECTION_ENABLED=1 LD_PRELOAD=./$(TARGET) python3 -c 'import torch; print("CUDA available:", torch.cuda.is_available()); print("Device count:", torch.cuda.device_count())' || echo "✓ Expected failure"
@echo ""
@echo "Test 2: With fault injection disabled"
@CUDA_FAULT_INJECTION_ENABLED=0 LD_PRELOAD=./$(TARGET) python3 -c 'import torch; print("CUDA available:", torch.cuda.is_available()); print("Device count:", torch.cuda.device_count())'
@echo ""
help:
@echo "CUDA Fault Injection Library - Makefile"
@echo ""
@echo "Targets:"
@echo " make - Build the library"
@echo " make test - Run basic tests"
@echo " make clean - Remove built files"
@echo " make help - Show this help"
@echo ""
@echo "Usage in tests:"
@echo " export LD_PRELOAD=/path/to/fake_cuda_xid79.so"
@echo " export CUDA_FAULT_INJECTION_ENABLED=1"
@echo " python -m vllm.entrypoints.api_server"
# CUDA Fault Injection - Test Library
**Purpose**: Safely simulate GPU failures (XID errors) in tests without breaking real hardware.
> **⚠️ Note**: This directory contains the **C library source code** only. The library is **compiled in-pod** during Kubernetes tests for Linux compatibility. You do **not** need to build it locally unless doing standalone local testing.
## What This Does
Makes CUDA calls return error codes to simulate various GPU failures. Uses LD_PRELOAD to intercept CUDA library calls.
```
Pod calls cudaMalloc() → LD_PRELOAD intercepts → Returns error → Pod crashes
```
**Result**: Realistic GPU failure testing without hardware damage.
## Scope
This library simulates **software/orchestration-level failures** that occur when GPU hardware becomes inaccessible or unusable:
-**In scope**: CUDA API failures due to GPU becoming unavailable (XID errors, device not found, ECC errors)
-**Use case**: Testing Kubernetes pod rescheduling, inference failover, recovery orchestration
-**Out of scope**: Bit-level Silent Data Corruption (SDC), compute errors, incorrect results
-**Not modeled**: General GPU faulting phenomena at the computation/memory level
**Note**: SDC detection mechanisms will not trigger with this approach, as we intercept at the CUDA API layer, not at the hardware/computation layer.
## Supported XID Errors
| XID | Description | CUDA Error | Use Case |
|-----|-------------|------------|----------|
| **79** | GPU fell off bus | `CUDA_ERROR_NO_DEVICE` | Most common, node-level failure |
| **48** | Double-bit ECC error | `CUDA_ERROR_ECC_UNCORRECTABLE` | Memory corruption |
| **94** | Contained ECC error | `CUDA_ERROR_ECC_UNCORRECTABLE` | Recoverable memory error |
| **95** | Uncontained error | `CUDA_ERROR_UNKNOWN` | Fatal GPU error |
| **43** | GPU stopped responding | `CUDA_ERROR_LAUNCH_TIMEOUT` | Hung kernel |
| **74** | NVLink error | `CUDA_ERROR_PEER_ACCESS_UNSUPPORTED` | Multi-GPU communication failure |
## Files in This Directory
| File | Purpose |
|------|---------|
| `cuda_intercept.c` | C library source that intercepts CUDA calls |
| `inject_into_pods.py` | Helper functions for patching Kubernetes deployments |
| `Makefile` | Builds the `.so` library locally (optional, for standalone testing) |
## Prerequisites
- **gcc compiler** (for building the library)
- **kubectl** with cluster access
- Python packages: `kubernetes`, `requests`
- No local compilation needed (compiled in-pod)
### For Standalone Local Testing (Optional)
- **gcc** (version 7.5+ recommended, any modern gcc works)
- **CUDA development headers** (optional, uses runtime API only)
## Writing Your Own Test
### Import Helper Functions
```python
import sys
from pathlib import Path
# Add cuda-fault-injection to path
cuda_injection_dir = Path(__file__).parent.parent / "cuda-fault-injection"
sys.path.insert(0, str(cuda_injection_dir))
from inject_into_pods import (
create_cuda_fault_configmap, # Step 1: Create ConfigMap with library source
patch_deployment_env, # Step 2: Patch deployment to use it
delete_cuda_fault_configmap # Cleanup: Remove ConfigMap
)
```
\ No newline at end of file
/*
* CUDA Intercept Library
*
* This library intercepts CUDA calls and returns appropriate error codes
* to simulate various GPU failures (XIDs).
*
* Supported XID types (set via CUDA_XID_TYPE environment variable):
* 79 - GPU fell off bus (CUDA_ERROR_NO_DEVICE) - DEFAULT
* 48 - Double-bit ECC error (CUDA_ERROR_ECC_UNCORRECTABLE)
* 94 - Contained ECC error (CUDA_ERROR_ECC_UNCORRECTABLE)
* 95 - Uncontained error (CUDA_ERROR_UNKNOWN)
* 43 - GPU stopped responding (CUDA_ERROR_LAUNCH_TIMEOUT)
* 74 - NVLink error (CUDA_ERROR_PEER_ACCESS_UNSUPPORTED)
*
* Compile:
* gcc -shared -fPIC -ldl cuda_intercept.c -o cuda_intercept.so
*
* Use:
* export CUDA_FAULT_INJECTION_ENABLED=1
* export CUDA_XID_TYPE=79 # or 48, 94, 95, 43, 74
* LD_PRELOAD=/path/to/cuda_intercept.so python -m vllm.entrypoints.api_server
*/
#include <dlfcn.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
typedef int cudaError_t;
typedef struct cudaDeviceProp_st {
char name[256];
size_t totalGlobalMem;
// ... other fields (we don't need them)
} cudaDeviceProp;
// CUDA error codes (from cuda_runtime_api.h)
#define cudaSuccess 0
#define cudaErrorNoDevice 100 // XID 79: GPU fell off bus
#define cudaErrorEccUncorrectable 214 // XID 48, 94: ECC errors
#define cudaErrorUnknown 999 // XID 95: Uncontained error
#define cudaErrorLaunchTimeout 6 // XID 43: GPU stopped responding
#define cudaErrorPeerAccessUnsupported 217 // XID 74: NVLink error
// XID error type mapping
typedef struct {
int xid;
cudaError_t cuda_error;
const char* description;
} xid_mapping_t;
static const xid_mapping_t xid_mappings[] = {
{79, cudaErrorNoDevice, "GPU fell off bus"},
{48, cudaErrorEccUncorrectable, "Double-bit ECC error"},
{94, cudaErrorEccUncorrectable, "Contained ECC error"},
{95, cudaErrorUnknown, "Uncontained error"},
{43, cudaErrorLaunchTimeout, "GPU stopped responding"},
{74, cudaErrorPeerAccessUnsupported, "NVLink error"},
{0, 0, NULL} // Sentinel
};
// Get XID type and corresponding CUDA error
static void
get_fault_config(int* inject, int* xid_type, cudaError_t* error_code)
{
static int initialized = 0;
static int cached_inject = 0;
static int cached_xid = 79; // Default to XID 79
static cudaError_t cached_error = cudaErrorNoDevice;
if (!initialized) {
// Check if injection is enabled
char* env = getenv("CUDA_FAULT_INJECTION_ENABLED");
if (env) {
cached_inject = (strcmp(env, "1") == 0 || strcmp(env, "true") == 0);
}
// Get XID type
char* xid_env = getenv("CUDA_XID_TYPE");
if (xid_env) {
cached_xid = atoi(xid_env);
// Find corresponding CUDA error
int found = 0;
for (int i = 0; xid_mappings[i].description != NULL; i++) {
if (xid_mappings[i].xid == cached_xid) {
cached_error = xid_mappings[i].cuda_error;
fprintf(
stderr, "[CUDA FAULT INJECTION] ENABLED - Simulating XID %d (%s)\n", cached_xid,
xid_mappings[i].description);
found = 1;
break;
}
}
if (!found) {
fprintf(stderr, "[CUDA FAULT INJECTION] WARNING: Unknown XID %d, defaulting to XID 79\n", cached_xid);
cached_xid = 79;
cached_error = cudaErrorNoDevice;
}
} else {
fprintf(
stderr, "[CUDA FAULT INJECTION] %s (default: XID 79 - GPU fell off bus)\n",
cached_inject ? "ENABLED" : "DISABLED");
}
initialized = 1;
}
*inject = cached_inject;
*xid_type = cached_xid;
*error_code = cached_error;
}
// Check if fault should be injected
static int
should_inject_fault()
{
int inject, xid;
cudaError_t error;
get_fault_config(&inject, &xid, &error);
return inject;
}
// Get the error code to return
static cudaError_t
get_error_code()
{
int inject, xid;
cudaError_t error;
get_fault_config(&inject, &xid, &error);
return error;
}
// Log helper
static void
log_intercept(const char* func_name, cudaError_t error_code)
{
if (should_inject_fault()) {
int inject, xid;
cudaError_t err;
get_fault_config(&inject, &xid, &err);
fprintf(stderr, "[XID %d SIM] %s() intercepted -> error %d\n", xid, func_name, error_code);
}
}
// Intercept: Get device count
cudaError_t
cudaGetDeviceCount(int* count)
{
if (should_inject_fault()) {
cudaError_t error = get_error_code();
log_intercept("cudaGetDeviceCount", error);
if (count)
*count = 0;
return error;
}
// If disabled, call real function
typedef cudaError_t (*real_func_t)(int*);
real_func_t real_func = (real_func_t)dlsym(RTLD_NEXT, "cudaGetDeviceCount");
if (real_func) {
return real_func(count);
}
return cudaErrorNoDevice;
}
// Intercept: Set device
cudaError_t
cudaSetDevice(int device)
{
if (should_inject_fault()) {
cudaError_t error = get_error_code();
log_intercept("cudaSetDevice", error);
return error;
}
typedef cudaError_t (*real_func_t)(int);
real_func_t real_func = (real_func_t)dlsym(RTLD_NEXT, "cudaSetDevice");
if (real_func) {
return real_func(device);
}
return cudaErrorNoDevice;
}
// Intercept: Get device
cudaError_t
cudaGetDevice(int* device)
{
if (should_inject_fault()) {
cudaError_t error = get_error_code();
log_intercept("cudaGetDevice", error);
return error;
}
typedef cudaError_t (*real_func_t)(int*);
real_func_t real_func = (real_func_t)dlsym(RTLD_NEXT, "cudaGetDevice");
if (real_func) {
return real_func(device);
}
return cudaErrorNoDevice;
}
// Intercept: Malloc
cudaError_t
cudaMalloc(void** devPtr, size_t size)
{
if (should_inject_fault()) {
cudaError_t error = get_error_code();
log_intercept("cudaMalloc", error);
return error;
}
typedef cudaError_t (*real_func_t)(void**, size_t);
real_func_t real_func = (real_func_t)dlsym(RTLD_NEXT, "cudaMalloc");
if (real_func) {
return real_func(devPtr, size);
}
return cudaErrorNoDevice;
}
// Intercept: Free
cudaError_t
cudaFree(void* devPtr)
{
if (should_inject_fault()) {
cudaError_t error = get_error_code();
log_intercept("cudaFree", error);
return error;
}
typedef cudaError_t (*real_func_t)(void*);
real_func_t real_func = (real_func_t)dlsym(RTLD_NEXT, "cudaFree");
if (real_func) {
return real_func(devPtr);
}
return cudaErrorNoDevice;
}
// Intercept: Memcpy
cudaError_t
cudaMemcpy(void* dst, const void* src, size_t count, int kind)
{
if (should_inject_fault()) {
cudaError_t error = get_error_code();
log_intercept("cudaMemcpy", error);
return error;
}
typedef cudaError_t (*real_func_t)(void*, const void*, size_t, int);
real_func_t real_func = (real_func_t)dlsym(RTLD_NEXT, "cudaMemcpy");
if (real_func) {
return real_func(dst, src, count, kind);
}
return cudaErrorNoDevice;
}
// Intercept: Device synchronize
cudaError_t
cudaDeviceSynchronize(void)
{
if (should_inject_fault()) {
cudaError_t error = get_error_code();
log_intercept("cudaDeviceSynchronize", error);
return error;
}
typedef cudaError_t (*real_func_t)(void);
real_func_t real_func = (real_func_t)dlsym(RTLD_NEXT, "cudaDeviceSynchronize");
if (real_func) {
return real_func();
}
return cudaErrorNoDevice;
}
// Intercept: Get device properties
cudaError_t
cudaGetDeviceProperties(cudaDeviceProp* prop, int device)
{
if (should_inject_fault()) {
cudaError_t error = get_error_code();
log_intercept("cudaGetDeviceProperties", error);
return error;
}
typedef cudaError_t (*real_func_t)(cudaDeviceProp*, int);
real_func_t real_func = (real_func_t)dlsym(RTLD_NEXT, "cudaGetDeviceProperties");
if (real_func) {
return real_func(prop, device);
}
return cudaErrorNoDevice;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment