Unverified Commit 8f0ac731 authored by Keiven C's avatar Keiven C Committed by GitHub
Browse files

feat: add runtime sanity checks for runtime images (#4788)


Signed-off-by: default avatarKeiven Chang <keivenchang@users.noreply.github.com>
Co-authored-by: default avatarKeiven Chang <keivenchang@users.noreply.github.com>
parent 3bad9666
...@@ -151,6 +151,26 @@ runs: ...@@ -151,6 +151,26 @@ runs:
# Exit with the build's exit code # Exit with the build's exit code
exit ${BUILD_EXIT_CODE} exit ${BUILD_EXIT_CODE}
- name: Run Sanity Check on Runtime Image
if: inputs.target == 'runtime'
shell: bash
run: |
IMAGE_TAG="${{ steps.build.outputs.image_tag }}"
echo "Running sanity check on image: $IMAGE_TAG"
# Run the sanity check script inside the container
# The script is located in /workspace/deploy/sanity_check.py in runtime containers
set +e
docker run --rm "$IMAGE_TAG" python /workspace/deploy/sanity_check.py --runtime-check --no-gpu-check
SANITY_CHECK_EXIT_CODE=$?
set -e
if [ ${SANITY_CHECK_EXIT_CODE} -ne 0 ]; then
echo "ERROR: Sanity check failed - ai-dynamo packages not properly installed"
exit ${SANITY_CHECK_EXIT_CODE}
else
echo "✅ Sanity check passed"
fi
- name: Capture Build Metrics - name: Capture Build Metrics
id: metrics id: metrics
shell: bash shell: bash
......
...@@ -92,6 +92,7 @@ Options: ...@@ -92,6 +92,7 @@ Options:
--thorough-check Enable thorough checking (file permissions, directory sizes, HuggingFace model details) --thorough-check Enable thorough checking (file permissions, directory sizes, HuggingFace model details)
--terse Enable terse output mode (show only essential info and errors) --terse Enable terse output mode (show only essential info and errors)
--runtime-check Skip compile-time dependency checks (Rust, Cargo, Maturin) for runtime containers --runtime-check Skip compile-time dependency checks (Rust, Cargo, Maturin) for runtime containers
and validate ai-dynamo packages (ai-dynamo-runtime and ai-dynamo)
""" """
import datetime import datetime
...@@ -299,10 +300,12 @@ class SystemInfo(NodeInfo): ...@@ -299,10 +300,12 @@ class SystemInfo(NodeInfo):
thorough_check: bool = False, thorough_check: bool = False,
terse: bool = False, terse: bool = False,
runtime_check: bool = False, runtime_check: bool = False,
no_gpu_check: bool = False,
): ):
self.thorough_check = thorough_check self.thorough_check = thorough_check
self.terse = terse self.terse = terse
self.runtime_check = runtime_check self.runtime_check = runtime_check
self.no_gpu_check = no_gpu_check
if hostname is None: if hostname is None:
hostname = platform.node() hostname = platform.node()
...@@ -325,7 +328,8 @@ class SystemInfo(NodeInfo): ...@@ -325,7 +328,8 @@ class SystemInfo(NodeInfo):
self.add_child(OSInfo()) self.add_child(OSInfo())
self.add_child(UserInfo()) self.add_child(UserInfo())
# Add GPU info (always show, even if not found) # Add GPU info (always show, even if not found) unless --no-gpu-check
if not self.no_gpu_check:
gpu_info = GPUInfo() gpu_info = GPUInfo()
self.add_child(gpu_info) self.add_child(gpu_info)
...@@ -359,7 +363,11 @@ class SystemInfo(NodeInfo): ...@@ -359,7 +363,11 @@ class SystemInfo(NodeInfo):
self._add_error_only_components() self._add_error_only_components()
# Add Dynamo workspace info (always show, even if not found) # Add Dynamo workspace info (always show, even if not found)
self.add_child(DynamoInfo(thorough_check=self.thorough_check)) self.add_child(
DynamoInfo(
thorough_check=self.thorough_check, runtime_check=self.runtime_check
)
)
def _get_ip_address(self) -> Optional[str]: def _get_ip_address(self) -> Optional[str]:
"""Get the primary IP address of the system.""" """Get the primary IP address of the system."""
...@@ -1094,6 +1102,16 @@ class FilePermissionsInfo(NodeInfo): ...@@ -1094,6 +1102,16 @@ class FilePermissionsInfo(NodeInfo):
dynamo_root = DynamoInfo.find_workspace() dynamo_root = DynamoInfo.find_workspace()
if not dynamo_root: if not dynamo_root:
# In runtime check mode, workspace not being found is expected
if self.runtime_check:
self.add_child(
NodeInfo(
label="Dynamo workspace",
desc="not needed for runtime container",
status=NodeStatus.INFO,
)
)
else:
self.add_child( self.add_child(
NodeInfo( NodeInfo(
label="Dynamo workspace", label="Dynamo workspace",
...@@ -1840,25 +1858,78 @@ class FrameworkInfo(NodeInfo): ...@@ -1840,25 +1858,78 @@ class FrameworkInfo(NodeInfo):
] ]
frameworks_found = 0 frameworks_found = 0
gpu_dependent_found = 0
for module_name, display_name in frameworks_to_check: for module_name, display_name in frameworks_to_check:
# Regular import for all frameworks # First check if module exists without importing (for GPU-dependent modules)
import importlib.metadata
import importlib.util
spec = importlib.util.find_spec(module_name)
if not spec:
# Module not installed at all
continue
# Module exists, try to get version from metadata (doesn't require import)
version = None
try: try:
module = __import__(module_name) version = importlib.metadata.version(module_name)
version = getattr(module, "__version__", "installed") except Exception:
frameworks_found += 1 # Try alternative package names
alt_names = {
"tensorrt_llm": "tensorrt-llm",
"sglang": "sglang",
"vllm": "vllm",
}
if module_name in alt_names:
try:
version = importlib.metadata.version(alt_names[module_name])
except Exception:
pass
# Get module path # Get module path from spec
module_path = None module_path = None
if hasattr(module, "__file__") and module.__file__: if spec.origin:
module_path = self._replace_home_with_var(module.__file__) module_path = self._replace_home_with_var(spec.origin)
# Get executable path # Get executable path (special handling for each framework)
exec_path = None exec_path = None
exec_path_raw = shutil.which(module_name) exec_names = {
"vllm": "vllm",
"sglang": "sglang",
"tensorrt_llm": "trtllm-build",
}
if module_name in exec_names:
exec_path_raw = shutil.which(exec_names[module_name])
if exec_path_raw: if exec_path_raw:
exec_path = self._replace_home_with_var(exec_path_raw) exec_path = self._replace_home_with_var(exec_path_raw)
# Now try to import to get runtime version if needed
gpu_required = False
try:
module = __import__(module_name)
# Get version from module if not already found
if not version:
version = getattr(module, "__version__", "installed")
except ImportError as e:
# Check if it's a GPU-related error
error_msg = str(e).lower()
if "libcuda" in error_msg or "cuda" in error_msg:
gpu_required = True
gpu_dependent_found += 1
except Exception:
pass
# If we found the module (either importable or just installed)
if spec:
frameworks_found += 1
if not version:
version = "installed"
# Add status indicator to version for GPU-dependent modules
if gpu_required:
version = f"{version} (requires GPU)"
package_info = PythonPackageInfo( package_info = PythonPackageInfo(
package_name=display_name, package_name=display_name,
version=version, version=version,
...@@ -1868,9 +1939,6 @@ class FrameworkInfo(NodeInfo): ...@@ -1868,9 +1939,6 @@ class FrameworkInfo(NodeInfo):
is_installed=True, is_installed=True,
) )
self.add_child(package_info) self.add_child(package_info)
except (ImportError, Exception):
# Framework not installed - don't add it
pass
# If no frameworks found, set status to ERROR (X) and show what's missing # If no frameworks found, set status to ERROR (X) and show what's missing
if frameworks_found == 0: if frameworks_found == 0:
...@@ -1881,6 +1949,9 @@ class FrameworkInfo(NodeInfo): ...@@ -1881,6 +1949,9 @@ class FrameworkInfo(NodeInfo):
missing_frameworks.append(f"no {module_name}") missing_frameworks.append(f"no {module_name}")
missing_text = ", ".join(missing_frameworks) missing_text = ", ".join(missing_frameworks)
self.desc = missing_text self.desc = missing_text
elif gpu_dependent_found > 0:
# At least one framework needs GPU
self.status = NodeStatus.WARNING
class PythonPackageInfo(NodeInfo): class PythonPackageInfo(NodeInfo):
...@@ -1962,8 +2033,14 @@ class PythonPathInfo(NodeInfo): ...@@ -1962,8 +2033,14 @@ class PythonPathInfo(NodeInfo):
class DynamoRuntimeInfo(NodeInfo): class DynamoRuntimeInfo(NodeInfo):
"""Dynamo runtime components information""" """Dynamo runtime components information"""
def __init__(self, workspace_dir: str, thorough_check: bool = False): def __init__(
self,
workspace_dir: Optional[str],
thorough_check: bool = False,
runtime_check: bool = False,
):
self.thorough_check = thorough_check self.thorough_check = thorough_check
self.runtime_check = runtime_check
# Try to get package version # Try to get package version
import importlib.metadata import importlib.metadata
...@@ -1993,7 +2070,8 @@ class DynamoRuntimeInfo(NodeInfo): ...@@ -1993,7 +2070,8 @@ class DynamoRuntimeInfo(NodeInfo):
if pth_file: if pth_file:
self.add_child(pth_file) self.add_child(pth_file)
# Check for multiple _core*.so files # Check for multiple _core*.so files (only if workspace exists)
if workspace_dir:
multiple_so_warning = self._check_multiple_core_so(workspace_dir) multiple_so_warning = self._check_multiple_core_so(workspace_dir)
if multiple_so_warning: if multiple_so_warning:
self.add_child(multiple_so_warning) self.add_child(multiple_so_warning)
...@@ -2001,12 +2079,21 @@ class DynamoRuntimeInfo(NodeInfo): ...@@ -2001,12 +2079,21 @@ class DynamoRuntimeInfo(NodeInfo):
# Discover runtime components from source # Discover runtime components from source
components = self._discover_runtime_components(workspace_dir) components = self._discover_runtime_components(workspace_dir)
# For runtime check, always try to import the core modules
if self.runtime_check:
# Force check of essential runtime modules
essential_components = ["dynamo._core", "dynamo.runtime"]
for comp in essential_components:
if comp not in components:
components.append(comp)
# Find where each component actually is and add them # Find where each component actually is and add them
if components: if components:
# Calculate max width for alignment # Calculate max width for alignment
max_len = max(len(comp) for comp in components) max_len = max(len(comp) for comp in components)
components_found = False components_found = False
import_failures = []
for component in components: for component in components:
try: try:
# Try to import to find actual location # Try to import to find actual location
...@@ -2041,16 +2128,31 @@ class DynamoRuntimeInfo(NodeInfo): ...@@ -2041,16 +2128,31 @@ class DynamoRuntimeInfo(NodeInfo):
label=padded_name, desc=error_msg, status=NodeStatus.ERROR label=padded_name, desc=error_msg, status=NodeStatus.ERROR
) )
self.add_child(module_node) self.add_child(module_node)
import_failures.append(component)
# Don't set components_found to True for failed imports # Don't set components_found to True for failed imports
# Update status and value based on whether we found components # Update status and value based on whether we found components
if components_found: if components_found:
# For runtime check, fail if any essential component failed to import
if self.runtime_check and import_failures:
essential_failed = any(
comp in import_failures
for comp in ["dynamo._core", "dynamo.runtime"]
)
if essential_failed:
self.status = NodeStatus.ERROR
self.desc = "ai-dynamo-runtime - FAILED (essential modules not importable)"
else:
self.status = NodeStatus.OK
else:
self.status = NodeStatus.OK self.status = NodeStatus.OK
# If not installed but components work via PYTHONPATH, update the message # If not installed but components work via PYTHONPATH, update the message
if not is_installed: if not is_installed and self.status == NodeStatus.OK:
self.desc = "ai-dynamo-runtime (via PYTHONPATH)" self.desc = "ai-dynamo-runtime (via PYTHONPATH)"
else: else:
self.status = NodeStatus.ERROR self.status = NodeStatus.ERROR
if self.runtime_check:
self.desc = "ai-dynamo-runtime - FAILED (no components found)"
else: else:
# No components discovered at all # No components discovered at all
self.status = NodeStatus.ERROR self.status = NodeStatus.ERROR
...@@ -2102,7 +2204,7 @@ class DynamoRuntimeInfo(NodeInfo): ...@@ -2102,7 +2204,7 @@ class DynamoRuntimeInfo(NodeInfo):
return None return None
def _discover_runtime_components(self, workspace_dir: str) -> list: def _discover_runtime_components(self, workspace_dir: Optional[str]) -> list:
"""Discover ai-dynamo-runtime components from filesystem. """Discover ai-dynamo-runtime components from filesystem.
Returns: Returns:
...@@ -2195,8 +2297,14 @@ class DynamoRuntimeInfo(NodeInfo): ...@@ -2195,8 +2297,14 @@ class DynamoRuntimeInfo(NodeInfo):
class DynamoFrameworkInfo(NodeInfo): class DynamoFrameworkInfo(NodeInfo):
"""Dynamo framework components information""" """Dynamo framework components information"""
def __init__(self, workspace_dir: str, thorough_check: bool = False): def __init__(
self,
workspace_dir: Optional[str],
thorough_check: bool = False,
runtime_check: bool = False,
):
self.thorough_check = thorough_check self.thorough_check = thorough_check
self.runtime_check = runtime_check
# Try to get package version # Try to get package version
import importlib.metadata import importlib.metadata
...@@ -2248,6 +2356,16 @@ class DynamoFrameworkInfo(NodeInfo): ...@@ -2248,6 +2356,16 @@ class DynamoFrameworkInfo(NodeInfo):
# Discover framework components from source # Discover framework components from source
components = self._discover_framework_components(workspace_dir) components = self._discover_framework_components(workspace_dir)
# For runtime check, always try to import at least one framework component
if self.runtime_check and not components:
# Try common framework components even if not discovered
components = [
"dynamo.frontend",
"dynamo.vllm",
"dynamo.sglang",
"dynamo.trtllm",
]
# Find where each component actually is and add them # Find where each component actually is and add them
if components: if components:
# Sort components for consistent output # Sort components for consistent output
...@@ -2257,6 +2375,7 @@ class DynamoFrameworkInfo(NodeInfo): ...@@ -2257,6 +2375,7 @@ class DynamoFrameworkInfo(NodeInfo):
max_len = max(len(comp) for comp in components) max_len = max(len(comp) for comp in components)
components_found = False components_found = False
import_failures = []
for component in components: for component in components:
try: try:
# Try to import to find actual location # Try to import to find actual location
...@@ -2281,21 +2400,29 @@ class DynamoFrameworkInfo(NodeInfo): ...@@ -2281,21 +2400,29 @@ class DynamoFrameworkInfo(NodeInfo):
label=padded_name, desc=error_msg, status=NodeStatus.ERROR label=padded_name, desc=error_msg, status=NodeStatus.ERROR
) )
self.add_child(component_node) self.add_child(component_node)
import_failures.append(component)
# Don't set components_found to True for failed imports # Don't set components_found to True for failed imports
# Update status and value based on whether we found components # Update status and value based on whether we found components
if components_found: if components_found:
# For runtime check, we need at least one component to work
if self.runtime_check and len(import_failures) == len(components):
self.status = NodeStatus.ERROR
self.desc = "ai-dynamo - FAILED (no components importable)"
else:
self.status = NodeStatus.OK self.status = NodeStatus.OK
# If not installed but components work via PYTHONPATH, update the message # If not installed but components work via PYTHONPATH, update the message
if not is_installed: if not is_installed and self.status == NodeStatus.OK:
self.desc = "ai-dynamo (via PYTHONPATH)" self.desc = "ai-dynamo (via PYTHONPATH)"
else: else:
self.status = NodeStatus.ERROR self.status = NodeStatus.ERROR
if self.runtime_check:
self.desc = "ai-dynamo - FAILED (no components found)"
else: else:
# No components discovered at all # No components discovered at all
self.status = NodeStatus.ERROR self.status = NodeStatus.ERROR
def _discover_framework_components(self, workspace_dir: str) -> list: def _discover_framework_components(self, workspace_dir: Optional[str]) -> list:
"""Discover ai-dynamo framework components from filesystem. """Discover ai-dynamo framework components from filesystem.
Returns: Returns:
...@@ -2326,12 +2453,37 @@ class DynamoFrameworkInfo(NodeInfo): ...@@ -2326,12 +2453,37 @@ class DynamoFrameworkInfo(NodeInfo):
class DynamoInfo(NodeInfo): class DynamoInfo(NodeInfo):
"""Dynamo workspace information""" """Dynamo workspace information"""
def __init__(self, thorough_check: bool = False): def __init__(self, thorough_check: bool = False, runtime_check: bool = False):
self.thorough_check = thorough_check self.thorough_check = thorough_check
self.runtime_check = runtime_check
# Find workspace directory # Find workspace directory
workspace_dir = DynamoInfo.find_workspace() workspace_dir = DynamoInfo.find_workspace()
# For runtime check, we don't need a workspace - just check packages
if self.runtime_check and not workspace_dir:
super().__init__(
label="Dynamo",
desc="Runtime container - checking installed packages",
status=NodeStatus.INFO,
)
# Check runtime components even without workspace
runtime_info = DynamoRuntimeInfo(
None,
thorough_check=self.thorough_check,
runtime_check=self.runtime_check,
)
self.add_child(runtime_info)
# Check framework components even without workspace
framework_info = DynamoFrameworkInfo(
None,
thorough_check=self.thorough_check,
runtime_check=self.runtime_check,
)
self.add_child(framework_info)
return
if not workspace_dir: if not workspace_dir:
# Show error when workspace is not found # Show error when workspace is not found
super().__init__( super().__init__(
...@@ -2368,13 +2520,17 @@ class DynamoInfo(NodeInfo): ...@@ -2368,13 +2520,17 @@ class DynamoInfo(NodeInfo):
# Always add runtime components # Always add runtime components
runtime_info = DynamoRuntimeInfo( runtime_info = DynamoRuntimeInfo(
workspace_dir, thorough_check=self.thorough_check workspace_dir,
thorough_check=self.thorough_check,
runtime_check=self.runtime_check,
) )
self.add_child(runtime_info) self.add_child(runtime_info)
# Always add framework components # Always add framework components
framework_info = DynamoFrameworkInfo( framework_info = DynamoFrameworkInfo(
workspace_dir, thorough_check=self.thorough_check workspace_dir,
thorough_check=self.thorough_check,
runtime_check=self.runtime_check,
) )
self.add_child(framework_info) self.add_child(framework_info)
...@@ -2513,8 +2669,14 @@ def main(): ...@@ -2513,8 +2669,14 @@ def main():
) )
parser.add_argument( parser.add_argument(
"--runtime-check", "--runtime-check",
"--runtime",
action="store_true",
help="Skip compile-time dependency checks (Rust, Cargo, Maturin) for runtime containers and validate ai-dynamo packages",
)
parser.add_argument(
"--no-gpu-check",
action="store_true", action="store_true",
help="Skip compile-time dependency checks (Rust, Cargo, Maturin) for runtime containers", help="Skip GPU detection and information collection (useful for CI environments without GPU access)",
) )
args = parser.parse_args() args = parser.parse_args()
...@@ -2527,6 +2689,7 @@ def main(): ...@@ -2527,6 +2689,7 @@ def main():
thorough_check=args.thorough_check, thorough_check=args.thorough_check,
terse=args.terse, terse=args.terse,
runtime_check=args.runtime_check, runtime_check=args.runtime_check,
no_gpu_check=args.no_gpu_check,
) )
tree.print_tree() tree.print_tree()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment