Unverified Commit d2c23e41 authored by nv-oviya's avatar nv-oviya Committed by GitHub
Browse files

feat(fault-injection): Enable runtime CUDA fault injection toggling without pod restarts (#4679)

parent 300e5d55
...@@ -6,13 +6,16 @@ ...@@ -6,13 +6,16 @@
## What This Does ## What This Does
Makes CUDA calls return error codes to simulate various GPU failures. Uses LD_PRELOAD to intercept CUDA library calls. Intercepts CUDA calls to simulate GPU failures using LD_PRELOAD. Faults persist across pod restarts via hostPath volumes, enabling realistic hardware failure testing.
``` ```
Pod calls cudaMalloc() → LD_PRELOAD intercepts → Returns error → Pod crashes Pod calls cudaMalloc() → LD_PRELOAD intercepts → Checks /host-fault/cuda_fault_enabled → Returns error → Pod crashes
``` ```
**Result**: Realistic GPU failure testing without hardware damage. **Key Features**:
- **Persistent faults**: hostPath volume (`/var/lib/cuda-fault-test`) survives pod restarts on same node
- **Runtime toggle**: Enable/disable faults without pod restarts via `/host-fault/cuda_fault_enabled`
- **Node-specific**: Faults only on target node, healthy nodes unaffected
## Scope ## Scope
...@@ -35,13 +38,20 @@ This library simulates **software/orchestration-level failures** that occur when ...@@ -35,13 +38,20 @@ This library simulates **software/orchestration-level failures** that occur when
| **43** | GPU stopped responding | `CUDA_ERROR_LAUNCH_TIMEOUT` | Hung kernel | | **43** | GPU stopped responding | `CUDA_ERROR_LAUNCH_TIMEOUT` | Hung kernel |
| **74** | NVLink error | `CUDA_ERROR_PEER_ACCESS_UNSUPPORTED` | Multi-GPU communication failure | | **74** | NVLink error | `CUDA_ERROR_PEER_ACCESS_UNSUPPORTED` | Multi-GPU communication failure |
## How It Works
1. **Deployment patching**: Adds hostPath volume + init container to compile library
2. **LD_PRELOAD injection**: Environment variable loads library before CUDA
3. **Runtime control**: Toggle file (`/host-fault/cuda_fault_enabled`) controls fault state
4. **Node persistence**: hostPath ensures faults survive pod restarts on same node
## Files in This Directory ## Files in This Directory
| File | Purpose | | File | Purpose |
|------|---------| |------|---------|
| `cuda_intercept.c` | C library source that intercepts CUDA calls | | `cuda_intercept.c` | C library that intercepts CUDA calls and checks fault markers |
| `inject_into_pods.py` | Helper functions for patching Kubernetes deployments | | `inject_into_pods.py` | Kubernetes deployment patcher (adds hostPath volume + library) |
| `Makefile` | Builds the `.so` library locally (optional, for standalone testing) | | `Makefile` | Local build (optional, for testing) |
## Prerequisites ## Prerequisites
......
...@@ -59,19 +59,20 @@ static const xid_mapping_t xid_mappings[] = { ...@@ -59,19 +59,20 @@ static const xid_mapping_t xid_mappings[] = {
}; };
// Get XID type and corresponding CUDA error // Get XID type and corresponding CUDA error
// Supports runtime toggling via /host-fault/cuda_fault_enabled (falls back to /tmp/cuda_fault_enabled)
static void static void
get_fault_config(int* inject, int* xid_type, cudaError_t* error_code) get_fault_config(int* inject, int* xid_type, cudaError_t* error_code)
{ {
static int initialized = 0; static int initialized = 0;
static int cached_inject = 0; static int env_inject = 0; // From environment variable
static int cached_xid = 79; // Default to XID 79 static int cached_xid = 79; // Default to XID 79
static cudaError_t cached_error = cudaErrorNoDevice; static cudaError_t cached_error = cudaErrorNoDevice;
if (!initialized) { if (!initialized) {
// Check if injection is enabled // Check if injection is enabled via environment
char* env = getenv("CUDA_FAULT_INJECTION_ENABLED"); char* env = getenv("CUDA_FAULT_INJECTION_ENABLED");
if (env) { if (env) {
cached_inject = (strcmp(env, "1") == 0 || strcmp(env, "true") == 0); env_inject = (strcmp(env, "1") == 0 || strcmp(env, "true") == 0);
} }
// Get XID type // Get XID type
...@@ -85,8 +86,7 @@ get_fault_config(int* inject, int* xid_type, cudaError_t* error_code) ...@@ -85,8 +86,7 @@ get_fault_config(int* inject, int* xid_type, cudaError_t* error_code)
if (xid_mappings[i].xid == cached_xid) { if (xid_mappings[i].xid == cached_xid) {
cached_error = xid_mappings[i].cuda_error; cached_error = xid_mappings[i].cuda_error;
fprintf( fprintf(
stderr, "[CUDA FAULT INJECTION] ENABLED - Simulating XID %d (%s)\n", cached_xid, stderr, "[CUDA FAULT INJECTION] Library loaded - XID %d (%s)\n", cached_xid, xid_mappings[i].description);
xid_mappings[i].description);
found = 1; found = 1;
break; break;
} }
...@@ -97,16 +97,37 @@ get_fault_config(int* inject, int* xid_type, cudaError_t* error_code) ...@@ -97,16 +97,37 @@ get_fault_config(int* inject, int* xid_type, cudaError_t* error_code)
cached_xid = 79; cached_xid = 79;
cached_error = cudaErrorNoDevice; cached_error = cudaErrorNoDevice;
} }
} else {
fprintf(
stderr, "[CUDA FAULT INJECTION] %s (default: XID 79 - GPU fell off bus)\n",
cached_inject ? "ENABLED" : "DISABLED");
} }
initialized = 1; initialized = 1;
} }
*inject = cached_inject; // Runtime toggle: Check node-persistent fault marker on EVERY call
// Use hostPath (/host-fault) so fault persists across pod restarts on same node
// Pod reschedules to different node → no file there → automatic recovery!
int runtime_inject = env_inject; // Default to env var
// Check hostPath first (persistent across restarts on same node)
FILE* toggle_file = fopen("/host-fault/cuda_fault_enabled", "r");
if (toggle_file) {
char toggle_value[4] = {0};
if (fgets(toggle_value, sizeof(toggle_value), toggle_file)) {
runtime_inject = (toggle_value[0] == '1');
}
fclose(toggle_file);
} else {
// Fallback to ephemeral /tmp for backwards compatibility
toggle_file = fopen("/tmp/cuda_fault_enabled", "r");
if (toggle_file) {
char toggle_value[4] = {0};
if (fgets(toggle_value, sizeof(toggle_value), toggle_file)) {
runtime_inject = (toggle_value[0] == '1');
}
fclose(toggle_file);
}
}
*inject = runtime_inject;
*xid_type = cached_xid; *xid_type = cached_xid;
*error_code = cached_error; *error_code = cached_error;
} }
......
...@@ -201,6 +201,18 @@ def _patch_service_for_injection( ...@@ -201,6 +201,18 @@ def _patch_service_for_injection(
{"name": "cuda-fault-lib", "emptyDir": {}} {"name": "cuda-fault-lib", "emptyDir": {}}
) )
# Add hostPath volume for persistent fault marker (survives pod restarts on same node)
# This simulates persistent hardware failure!
service["extraPodSpec"]["volumes"].append(
{
"name": "node-fault-marker",
"hostPath": {
"path": "/var/lib/cuda-fault-test",
"type": "DirectoryOrCreate",
},
}
)
# Add init container to decode base64 # Add init container to decode base64
if "initContainers" not in service["extraPodSpec"]: if "initContainers" not in service["extraPodSpec"]:
service["extraPodSpec"]["initContainers"] = [] service["extraPodSpec"]["initContainers"] = []
...@@ -247,7 +259,7 @@ def _patch_service_for_injection( ...@@ -247,7 +259,7 @@ def _patch_service_for_injection(
if vm.get("name") != "cuda-fault-lib" if vm.get("name") != "cuda-fault-lib"
] ]
# Add mount # Add mount for compiled library
service["extraPodSpec"]["mainContainer"]["volumeMounts"].append( service["extraPodSpec"]["mainContainer"]["volumeMounts"].append(
{ {
"name": "cuda-fault-lib", "name": "cuda-fault-lib",
...@@ -256,8 +268,18 @@ def _patch_service_for_injection( ...@@ -256,8 +268,18 @@ def _patch_service_for_injection(
} }
) )
# Add mount for persistent fault marker (hostPath)
service["extraPodSpec"]["mainContainer"]["volumeMounts"].append(
{
"name": "node-fault-marker",
"mountPath": "/host-fault",
"readOnly": False, # Need write access
}
)
print(" ✓ Added init container to compile library") print(" ✓ Added init container to compile library")
print(" ✓ Added ConfigMap volume mount") print(" ✓ Added ConfigMap volume mount")
print(" ✓ Added hostPath volume for persistent fault marker")
# Add node affinity to pin pods to target node (simulates real XID 79 behavior) # Add node affinity to pin pods to target node (simulates real XID 79 behavior)
if target_node and enable: if target_node and enable:
...@@ -287,14 +309,15 @@ def _patch_service_for_injection( ...@@ -287,14 +309,15 @@ def _patch_service_for_injection(
service["extraPodSpec"]["volumes"] = [ service["extraPodSpec"]["volumes"] = [
v v
for v in service["extraPodSpec"]["volumes"] for v in service["extraPodSpec"]["volumes"]
if v.get("name") not in ["cuda-fault-lib", "cuda-fault-lib-source"] if v.get("name")
not in ["cuda-fault-lib", "cuda-fault-lib-source", "node-fault-marker"]
] ]
if "volumeMounts" in service["extraPodSpec"].get("mainContainer", {}): if "volumeMounts" in service["extraPodSpec"].get("mainContainer", {}):
service["extraPodSpec"]["mainContainer"]["volumeMounts"] = [ service["extraPodSpec"]["mainContainer"]["volumeMounts"] = [
vm vm
for vm in service["extraPodSpec"]["mainContainer"]["volumeMounts"] for vm in service["extraPodSpec"]["mainContainer"]["volumeMounts"]
if vm.get("name") != "cuda-fault-lib" if vm.get("name") not in ["cuda-fault-lib", "node-fault-marker"]
] ]
# Remove init container # Remove init container
...@@ -323,6 +346,7 @@ def patch_deployment_env( ...@@ -323,6 +346,7 @@ def patch_deployment_env(
use_configmap=True, use_configmap=True,
target_node=None, target_node=None,
xid_type=79, xid_type=79,
passthrough_mode=False,
): ):
"""Patch deployment to add/remove LD_PRELOAD environment variable. """Patch deployment to add/remove LD_PRELOAD environment variable.
...@@ -334,6 +358,8 @@ def patch_deployment_env( ...@@ -334,6 +358,8 @@ def patch_deployment_env(
target_node: If provided, adds node affinity to pin pods to this node target_node: If provided, adds node affinity to pin pods to this node
(simulates real XID where pods crash on the faulty node) (simulates real XID where pods crash on the faulty node)
xid_type: XID error type to simulate (79, 48, 94, 95, 43, 74). Default: 79 xid_type: XID error type to simulate (79, 48, 94, 95, 43, 74). Default: 79
passthrough_mode: If True, set CUDA_FAULT_INJECTION_ENABLED=0 (library loaded but disabled)
Allows baseline testing before enabling faults via toggle
""" """
custom_api = client.CustomObjectsApi() custom_api = client.CustomObjectsApi()
apps_api = client.AppsV1Api() apps_api = client.AppsV1Api()
...@@ -385,9 +411,14 @@ def patch_deployment_env( ...@@ -385,9 +411,14 @@ def patch_deployment_env(
# Prepare environment variables # Prepare environment variables
new_envs = [] new_envs = []
if enable: if enable:
# Set CUDA_FAULT_INJECTION_ENABLED based on passthrough_mode
fault_enabled_value = "0" if passthrough_mode else "1"
new_envs = [ new_envs = [
{"name": "LD_PRELOAD", "value": lib_path}, {"name": "LD_PRELOAD", "value": lib_path},
{"name": "CUDA_FAULT_INJECTION_ENABLED", "value": "1"}, {
"name": "CUDA_FAULT_INJECTION_ENABLED",
"value": fault_enabled_value,
},
{"name": "CUDA_XID_TYPE", "value": str(xid_type)}, {"name": "CUDA_XID_TYPE", "value": str(xid_type)},
] ]
...@@ -400,6 +431,28 @@ def patch_deployment_env( ...@@ -400,6 +431,28 @@ def patch_deployment_env(
available_services = list(services.keys()) available_services = list(services.keys())
print(f" → Available services: {available_services}") print(f" → Available services: {available_services}")
# Set aggressive update strategy when enabling (allow all pods to update at once)
# This ensures all pods get CUDA faults, not just the first few
if enable:
if "updateStrategy" not in spec:
spec["updateStrategy"] = {}
if "rollingUpdate" not in spec["updateStrategy"]:
spec["updateStrategy"]["rollingUpdate"] = {}
# Allow all pods to be unavailable during update
spec["updateStrategy"]["rollingUpdate"]["maxUnavailable"] = "100%"
# Don't create surge pods
spec["updateStrategy"]["rollingUpdate"]["maxSurge"] = 0
print(" → Set update strategy: maxUnavailable=100%, maxSurge=0")
print(" (All pods will update simultaneously)")
else:
# Restore default update strategy when disabling
if "updateStrategy" in spec:
spec["updateStrategy"] = {
"rollingUpdate": {"maxUnavailable": "25%", "maxSurge": "25%"}
}
print(" → Restored default update strategy (maxUnavailable=25%)")
for service_name in services_to_patch: for service_name in services_to_patch:
if service_name in services: if service_name in services:
print(f" → Patching service: {service_name}") print(f" → Patching service: {service_name}")
...@@ -465,6 +518,38 @@ def patch_deployment_env( ...@@ -465,6 +518,38 @@ def patch_deployment_env(
print(f" Services patched: {', '.join(patched_services)}") print(f" Services patched: {', '.join(patched_services)}")
if use_configmap and enable: if use_configmap and enable:
print(f" Library mounted at: {lib_path}") print(f" Library mounted at: {lib_path}")
# Force restart all worker pods when enabling to apply changes immediately
if enable:
print(
" → Force-deleting all worker pods to apply changes immediately..."
)
core_api = client.CoreV1Api()
try:
worker_pods = core_api.list_namespaced_pod(
namespace=namespace,
label_selector=f"nvidia.com/dynamo-graph-deployment-name={deployment_name},nvidia.com/dynamo-component-type=worker",
)
deleted_count = 0
for pod in worker_pods.items:
try:
core_api.delete_namespaced_pod(
name=pod.metadata.name,
namespace=namespace,
grace_period_seconds=0,
)
deleted_count += 1
except Exception as e:
print(
f" ⚠ Could not delete pod {pod.metadata.name}: {e}"
)
print(
f" ✓ Deleted {deleted_count} pod(s) - they will restart with CUDA library"
)
except Exception as e:
print(f" ⚠ Could not list/delete pods: {e}")
print(" Pods will eventually restart, but may take longer")
return True return True
except ApiException as e: except ApiException as e:
...@@ -505,11 +590,15 @@ def patch_deployment_env( ...@@ -505,11 +590,15 @@ def patch_deployment_env(
if enable: if enable:
# Add new env vars # Add new env vars
# Set CUDA_FAULT_INJECTION_ENABLED based on passthrough_mode
fault_enabled_value = "0" if passthrough_mode else "1"
container.env.append( container.env.append(
client.V1EnvVar(name="LD_PRELOAD", value="/tmp/cuda_intercept.so") client.V1EnvVar(name="LD_PRELOAD", value="/tmp/cuda_intercept.so")
) )
container.env.append( container.env.append(
client.V1EnvVar(name="CUDA_FAULT_INJECTION_ENABLED", value="1") client.V1EnvVar(
name="CUDA_FAULT_INJECTION_ENABLED", value=fault_enabled_value
)
) )
container.env.append( container.env.append(
client.V1EnvVar(name="CUDA_XID_TYPE", value=str(xid_type)) client.V1EnvVar(name="CUDA_XID_TYPE", value=str(xid_type))
......
...@@ -37,7 +37,7 @@ class CUDAFaultInjector: ...@@ -37,7 +37,7 @@ class CUDAFaultInjector:
lib_dir = Path(__file__).parent.parent / "cuda_fault_injection" lib_dir = Path(__file__).parent.parent / "cuda_fault_injection"
self.lib_dir = lib_dir self.lib_dir = lib_dir
self.lib_path = lib_dir / "fake_cuda_xid79.so" self.lib_path = lib_dir / "cuda_intercept.so"
self.lib_built = False self.lib_built = False
def build_library(self) -> bool: def build_library(self) -> bool:
...@@ -101,12 +101,57 @@ class CUDAFaultInjector: ...@@ -101,12 +101,57 @@ class CUDAFaultInjector:
traceback.print_exc() traceback.print_exc()
return False return False
def check_if_cuda_library_deployed(
    self, deployment_name: str, namespace: str
) -> bool:
    """
    Check if CUDA fault injection is already deployed to the deployment.

    The DynamoGraphDeployment is searched for an ``LD_PRELOAD`` environment
    variable in two places:

    1. ``spec.services.<name>.extraPodSpec.mainContainer.env`` for every
       service, which is the per-service layout the rest of this file reads
       and patches.
    2. ``spec.workerSpec.podSpec.containers[*].env`` for containers named
       ``vllm-worker`` or ``worker`` (the layout this method originally
       checked; kept for backward compatibility).

    Args:
        deployment_name: Name of the deployment
        namespace: Kubernetes namespace

    Returns:
        True if CUDA fault library is already deployed, False otherwise
        (including when the deployment cannot be read).
    """

    def _has_ld_preload(env_list) -> bool:
        # Tolerate a missing/null env list.
        return any(
            entry.get("name") == "LD_PRELOAD" for entry in (env_list or [])
        )

    try:
        k8s_custom = client.CustomObjectsApi()

        # Get the DynamoGraphDeployment
        dgd = k8s_custom.get_namespaced_custom_object(
            group="nvidia.com",
            version="v1alpha1",
            namespace=namespace,
            plural="dynamographdeployments",
            name=deployment_name,
        )

        spec = dgd.get("spec", {}) or {}

        # Per-service layout: env lives on each service's main container.
        for service in (spec.get("services") or {}).values():
            main_container = (
                (service or {}).get("extraPodSpec") or {}
            ).get("mainContainer") or {}
            if _has_ld_preload(main_container.get("env")):
                return True

        # Legacy layout: explicit worker pod spec with named containers.
        worker_spec = spec.get("workerSpec", {}) or {}
        pod_spec = worker_spec.get("podSpec", {}) or {}
        for container in pod_spec.get("containers", []) or []:
            if container.get("name") in ["vllm-worker", "worker"]:
                if _has_ld_preload(container.get("env")):
                    return True

        return False

    except Exception:
        # If we can't read the deployment, assume it's not deployed
        return False
def patch_deployment_for_cuda_fault( def patch_deployment_for_cuda_fault(
self, self,
deployment_name: str, deployment_name: str,
namespace: str, namespace: str,
target_node: Optional[str] = None, target_node: Optional[str] = None,
xid_type: int = 79, xid_type: int = 79,
passthrough_mode: bool = False,
) -> bool: ) -> bool:
""" """
Patch deployment to enable CUDA fault injection. Patch deployment to enable CUDA fault injection.
...@@ -116,6 +161,7 @@ class CUDAFaultInjector: ...@@ -116,6 +161,7 @@ class CUDAFaultInjector:
- Init container to compile library - Init container to compile library
- LD_PRELOAD environment variable - LD_PRELOAD environment variable
- CUDA_XID_TYPE environment variable - CUDA_XID_TYPE environment variable
- CUDA_FAULT_INJECTION_ENABLED (0 in passthrough mode, 1 otherwise)
- Node affinity (if target_node specified) - Node affinity (if target_node specified)
Args: Args:
...@@ -123,6 +169,8 @@ class CUDAFaultInjector: ...@@ -123,6 +169,8 @@ class CUDAFaultInjector:
namespace: Kubernetes namespace namespace: Kubernetes namespace
target_node: Node to pin pods to (simulates real XID behavior) target_node: Node to pin pods to (simulates real XID behavior)
xid_type: XID error type to simulate (79, 48, 94, 95, 43, 74). Default: 79 xid_type: XID error type to simulate (79, 48, 94, 95, 43, 74). Default: 79
passthrough_mode: If True, set CUDA_FAULT_INJECTION_ENABLED=0
(library loaded but faults disabled for baseline)
Returns: Returns:
True if patch succeeded True if patch succeeded
...@@ -149,6 +197,7 @@ class CUDAFaultInjector: ...@@ -149,6 +197,7 @@ class CUDAFaultInjector:
use_configmap=True, use_configmap=True,
target_node=target_node, target_node=target_node,
xid_type=xid_type, xid_type=xid_type,
passthrough_mode=passthrough_mode,
) )
except Exception as e: except Exception as e:
...@@ -339,6 +388,248 @@ class CUDAFaultInjector: ...@@ -339,6 +388,248 @@ class CUDAFaultInjector:
traceback.print_exc() traceback.print_exc()
return False return False
def enable_cuda_faults_via_toggle(
    self, pods: List[client.V1Pod], namespace: str, enable: bool = True
) -> bool:
    """
    Enable or disable CUDA faults on running pods via the toggle file.

    For each pod, runs ``kubectl exec`` in the pod's first container and
    writes "1" (enable) or "0" (disable) to ``/host-fault/cuda_fault_enabled``,
    then reads the file back to verify the value. No pod is restarted and no
    environment variable is changed; the pods must already have the
    ``/host-fault`` mount and the CUDA intercept library loaded for the
    toggle to have any effect.

    Args:
        pods: List of pods to toggle faults on
        namespace: Kubernetes namespace
        enable: True to enable faults, False to disable

    Returns:
        True if the toggle was written and verified on at least one pod.
        False if ``pods`` is empty or every pod failed.
    """
    if not pods:
        return False

    toggle_value = "1" if enable else "0"
    action = "Enabling" if enable else "Disabling"

    print(f"\n[→] {action} CUDA faults via toggle on {len(pods)} pods...")

    # Write toggle file to hostPath (persists across pod restarts on same node).
    # The command is identical for every pod, so build it once.
    exec_command = [
        "sh",
        "-c",
        f'mkdir -p /host-fault && echo "{toggle_value}" > /host-fault/cuda_fault_enabled && cat /host-fault/cuda_fault_enabled',
    ]

    success_count = 0
    failed_pods = []

    for pod in pods:
        pod_name = pod.metadata.name

        try:
            # Get the main container name from pod spec
            container_name = (
                pod.spec.containers[0].name if pod.spec.containers else None
            )
            if not container_name:
                failed_pods.append((pod_name, "No container found"))
                continue

            result = subprocess.run(
                [
                    "kubectl",
                    "exec",
                    "-n",
                    namespace,
                    pod_name,
                    "-c",
                    container_name,
                    "--",
                ]
                + exec_command,
                capture_output=True,
                text=True,
                timeout=10,
            )

            if result.returncode != 0:
                failed_pods.append(
                    (pod_name, f"Exec failed: {result.stderr.strip()}")
                )
                continue

            # The trailing `cat` echoes the file back so the write is verified.
            actual_value = result.stdout.strip()
            if actual_value == toggle_value:
                print(
                    f" ✓ Toggle={toggle_value} in {pod_name}/{container_name}"
                )
                success_count += 1
            else:
                failed_pods.append(
                    (
                        pod_name,
                        f"Verify failed: expected '{toggle_value}', got '{actual_value}'",
                    )
                )

        except Exception as e:
            # Best-effort per pod (timeouts, missing kubectl, ...): record and move on.
            failed_pods.append((pod_name, str(e)))

    if failed_pods:
        print(f" ⚠ Failed to toggle {len(failed_pods)} pods:")
        for pod_name, error in failed_pods:
            print(f" - {pod_name}: {error}")

    print(f" → Result: {success_count}/{len(pods)} pods toggled successfully")
    return success_count > 0
def disable_cuda_faults_via_toggle(
    self, pods: List[client.V1Pod], namespace: str
) -> bool:
    """
    Turn CUDA faults off on running pods.

    Convenience wrapper that delegates to ``enable_cuda_faults_via_toggle``
    with ``enable=False``.

    Args:
        pods: Pod objects whose faults should be switched off
        namespace: Kubernetes namespace the pods live in

    Returns:
        Whatever ``enable_cuda_faults_via_toggle`` reports for the disable request
    """
    toggled_off = self.enable_cuda_faults_via_toggle(
        pods, namespace, enable=False
    )
    return toggled_off
def cleanup_node_fault_markers(
    self, pods: List[client.V1Pod], namespace: str
) -> bool:
    """
    Remove persistent fault marker files from node hostPath.

    Runs ``kubectl exec`` in the first container of one pod per node and
    deletes ``/host-fault/cuda_fault_enabled``. A node counts as cleaned as
    soon as one exec on it succeeds; further pods on that node are skipped.
    Failures are reported but never raised (best-effort cleanup).

    Args:
        pods: List of pods (to access nodes)
        namespace: Kubernetes namespace

    Returns:
        True if ``pods`` is empty or at least one node was cleaned,
        False if every attempt failed.
    """
    if not pods:
        return True

    print(" [->] Cleaning persistent fault markers from nodes...")

    # Remove the persistent marker file from hostPath.
    # The command is identical for every pod, so build it once.
    exec_command = [
        "sh",
        "-c",
        'rm -f /host-fault/cuda_fault_enabled 2>/dev/null; echo "ok"',
    ]

    success_count = 0
    nodes_cleaned = set()

    for pod in pods:
        pod_name = pod.metadata.name
        node_name = pod.spec.node_name

        # Skip if we already cleaned this node
        if node_name in nodes_cleaned:
            continue

        try:
            container_name = (
                pod.spec.containers[0].name if pod.spec.containers else None
            )
            if not container_name:
                print(f" ⚠ No container found in {pod_name}, skipping")
                continue

            result = subprocess.run(
                [
                    "kubectl",
                    "exec",
                    "-n",
                    namespace,
                    pod_name,
                    "-c",
                    container_name,
                    "--",
                ]
                + exec_command,
                capture_output=True,
                text=True,
                timeout=10,
            )

            if result.returncode == 0:
                print(f" ✓ Cleaned fault marker on node {node_name}")
                nodes_cleaned.add(node_name)
                success_count += 1
            else:
                # Leave the node unmarked so another pod on it can retry.
                print(
                    f" ⚠ Could not clean fault marker via {pod_name}: {result.stderr.strip()}"
                )

        except Exception as e:
            # Best-effort cleanup: report the failure and try the next pod.
            print(f" ⚠ Could not clean fault marker via {pod_name}: {e}")

    return success_count > 0
def verify_env_var_set(
    self,
    deployment_name: str,
    namespace: str,
    expected_value: str,
    max_wait: int = 30,
) -> bool:
    """
    Verify that CUDA_FAULT_INJECTION_ENABLED env var is set to expected value.

    Polls the DynamoGraphDeployment roughly once per second until the value
    matches or ``max_wait`` seconds have elapsed. Only the ``VllmDecodeWorker``
    and ``VllmPrefillWorker`` services are inspected, via
    ``extraPodSpec.mainContainer.env``.

    Verification succeeds when at least one inspected service carries the
    variable and every occurrence equals ``expected_value``. A deployment in
    which neither service exists, or in which the variable is not set
    anywhere, is NOT treated as verified (previously this returned True
    vacuously).

    Args:
        deployment_name: Name of the DynamoGraphDeployment
        namespace: Kubernetes namespace
        expected_value: Expected value ("0" or "1")
        max_wait: Maximum seconds to wait

    Returns:
        True if verified, False on timeout
    """
    k8s_custom = client.CustomObjectsApi()
    env_name = "CUDA_FAULT_INJECTION_ENABLED"
    worker_services = ("VllmDecodeWorker", "VllmPrefillWorker")
    start_time = time.time()

    while time.time() - start_time < max_wait:
        observed_values = []
        try:
            dgd = k8s_custom.get_namespaced_custom_object(
                group="nvidia.com",
                version="v1alpha1",
                namespace=namespace,
                plural="dynamographdeployments",
                name=deployment_name,
            )

            services = dgd["spec"]["services"]

            # Check both worker services; a service missing from the DGD is skipped.
            for service_name in worker_services:
                if service_name not in services:
                    continue
                env_vars = (
                    services[service_name]
                    .get("extraPodSpec", {})
                    .get("mainContainer", {})
                    .get("env", [])
                )
                observed_values.extend(
                    env_var.get("value")
                    for env_var in env_vars
                    if env_var.get("name") == env_name
                )
        except Exception:
            # Transient API error or unexpected object shape: retry until timeout.
            observed_values = []

        # Require at least one occurrence so "not set at all" cannot pass.
        if observed_values and all(
            value == expected_value for value in observed_values
        ):
            return True

        time.sleep(1)

    return False
def trigger_pod_restart(self, pods: List[client.V1Pod], namespace: str): def trigger_pod_restart(self, pods: List[client.V1Pod], namespace: str):
""" """
Delete pods to trigger restart with new env vars. Delete pods to trigger restart with new env vars.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment