"docs/backends/vscode:/vscode.git/clone" did not exist on "7604295938ba67f27d9e258c8759e2944318a89b"
Unverified commit be5b8a58, authored by Tushar Sharma and committed by GitHub
Browse files

test: improve deploy test logging for CI debuggability (#7243)


Signed-off-by: Tushar Sharma <tusharma@nvidia.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
parent b6a3b0c6
......@@ -93,6 +93,7 @@ runs:
FRAMEWORK: ${{ inputs.framework }}
PROFILE: ${{ inputs.profile }}
IMAGE: ${{ inputs.image }}
DYN_TEST_OUTPUT_PATH: ${{ github.workspace }}/test-output
TEST_NAME: ${{ inputs.test_name }}
EXTRA_PYTEST_ARGS: ${{ inputs.extra_pytest_args }}
run: |
......@@ -174,3 +175,12 @@ runs:
name: test-results-${{ steps.test-name.outputs.name }}-${{ inputs.platform_arch }}-${{ github.run_id }}-${{ job.check_run_id }}
path: test-results/pytest_deploy_${{ steps.test-name.outputs.name }}_${{ inputs.platform_arch }}_${{ github.run_id }}_${{ job.check_run_id }}.xml
retention-days: 7
# Publish the contents of <workspace>/test-output as a build artifact.
# This is the same directory that DYN_TEST_OUTPUT_PATH is set to in the test
# step's env, so it is presumably where the deploy tests write pod logs.
- name: Upload Pod Logs
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f #v6
# Run even when an earlier step failed, so logs from failing runs are kept.
if: always()
with:
# Artifact name is made unique per test, architecture, run and check run.
name: pod-logs-${{ steps.test-name.outputs.name }}-${{ inputs.platform_arch }}-${{ github.run_id }}-${{ job.check_run_id }}
path: ${{ github.workspace }}/test-output/
# A run that produced no output only emits a warning instead of failing.
if-no-files-found: warn
retention-days: 7
......@@ -8,7 +8,7 @@ import re
import secrets
import time
from dataclasses import dataclass, field
from typing import Any, List, Optional
from typing import Any, List, Literal, Optional
import kr8s
import requests
......@@ -476,6 +476,31 @@ class PodProcess:
return False # Timed out
@dataclass
class PodStatusDetail:
    """Point-in-time status of one container belonging to a pod.

    Instances are plain value objects; ``format`` renders one as a single
    human-readable line suitable for embedding in an error message.
    """

    # Name of the pod that owns the container.
    pod_name: str
    # Name of the container inside that pod.
    container_name: str
    # Coarse container state.
    state: Literal["Waiting", "Terminated", "Running", "Unknown"]
    # Short reason string for the state; empty when there is none.
    reason: str = ""
    # Longer free-form message for the state; empty when there is none.
    message: str = ""
    # Exit code of the container; None when no exit code was recorded.
    exit_code: Optional[int] = None
    # Number of times the container has been restarted.
    restart_count: int = 0

    def format(self) -> str:
        """Return a one-line summary of this container status.

        The line always starts with ``<pod>/<container>: <state>``. The
        reason, message, exit code and restart count are appended, in that
        order, only when they carry information (non-empty string, exit code
        that is not None, restart count above zero).
        """
        pieces = [f"{self.pod_name}/{self.container_name}: {self.state}"]
        if self.reason:
            pieces.append(f": {self.reason}")
        if self.message:
            pieces.append(f" ({self.message})")
        # An exit code of 0 is still reported; only None is omitted.
        if self.exit_code is not None:
            pieces.append(f" (exit_code={self.exit_code})")
        if self.restart_count > 0:
            pieces.append(f" [restarts={self.restart_count}]")
        return "".join(pieces)
@dataclass
class ManagedDeployment:
log_dir: str
......@@ -684,7 +709,89 @@ class ManagedDeployment:
f"Unexpected exception while checking deployment status: {e}"
)
await asyncio.sleep(sleep)
raise TimeoutError("Deployment failed to become ready within timeout")
# Collect pod diagnostics before raising
pod_details = await self._get_pod_status_details()
elapsed = time.time() - start_time
msg = (
f"Deployment {self._deployment_name} failed to reach "
f"Ready={desired_ready_condition_val}, state={desired_state_val} "
f"within {elapsed:.0f}s (timeout={timeout}s)"
)
if pod_details:
detail_lines = "\n".join(f" {d.format()}" for d in pod_details)
msg += f"\n\nPod status at timeout:\n{detail_lines}"
raise TimeoutError(msg)
async def _get_pod_status_details(self) -> List[PodStatusDetail]:
    """Collect container-level status for all pods owned by this deployment.

    Pods are selected in ``self.namespace`` by the
    ``nvidia.com/dynamo-graph-deployment-name`` label set to this
    deployment's name.

    Returns:
        One ``PodStatusDetail`` per reported container status. A pod that
        reports no container statuses contributes a single placeholder entry
        with ``container_name="*"`` and the pod phase in ``reason``.
        An empty list is returned when the Kubernetes client is not
        initialized or when listing pods raises ``ApiException``, so this
        best-effort diagnostic never masks the error the caller is about
        to report.
    """
    # Guard explicitly instead of asserting: an AssertionError here would
    # escape the ApiException handler and break the "never raises" contract.
    if self._core_api is None:
        self._logger.debug(
            "Failed to collect pod status details: Kubernetes API not initialized"
        )
        return []

    label = f"nvidia.com/dynamo-graph-deployment-name={self._deployment_name}"
    # Only the API call can raise ApiException, so keep the try body minimal.
    try:
        pods = await self._core_api.list_namespaced_pod(
            self.namespace, label_selector=label
        )
    except exceptions.ApiException as e:
        self._logger.debug(f"Failed to collect pod status details: {e}")
        return []

    details: List[PodStatusDetail] = []
    for pod in pods.items:
        pod_name = pod.metadata.name
        pod_status = pod.status
        phase = pod_status.phase if pod_status else "Unknown"
        container_statuses = pod_status.container_statuses if pod_status else None

        if not container_statuses:
            # No per-container data: report the pod phase instead.
            details.append(
                PodStatusDetail(
                    pod_name=pod_name,
                    container_name="*",
                    state="Unknown",
                    reason=f"{phase} (no container status)",
                )
            )
            continue

        for cs in container_statuses:
            state: Literal["Waiting", "Terminated", "Running", "Unknown"] = "Unknown"
            reason = ""
            message = ""
            exit_code: Optional[int] = None

            container_state = cs.state
            if container_state and container_state.waiting:
                state = "Waiting"
                reason = container_state.waiting.reason or ""
                message = container_state.waiting.message or ""
            elif container_state and container_state.terminated:
                state = "Terminated"
                reason = container_state.terminated.reason or ""
                # Terminated states carry a message too; keep it so the
                # failure detail is not lost from the report.
                message = container_state.terminated.message or ""
                exit_code = container_state.terminated.exit_code
            elif container_state and container_state.running:
                state = "Running"

            details.append(
                PodStatusDetail(
                    pod_name=pod_name,
                    container_name=cs.name,
                    state=state,
                    reason=reason,
                    message=message,
                    exit_code=exit_code,
                    restart_count=cs.restart_count or 0,
                )
            )
    return details
async def _restart_nats(self):
NATS_STS_NAME = "dynamo-platform-nats"
......@@ -778,9 +885,10 @@ class ManagedDeployment:
pod_names: list[str] = []
for service_name in service_names:
for original_name in service_names:
label_selector = (
f"nvidia.com/selector={self._deployment_name}-{service_name.lower()}"
f"nvidia.com/dynamo-graph-deployment-name={self._deployment_name},"
f"nvidia.com/dynamo-component={original_name}"
)
assert self._core_api is not None, "Kubernetes API not initialized"
pods: client.V1PodList = await self._core_api.list_namespaced_pod(
......@@ -812,11 +920,11 @@ class ManagedDeployment:
if not service_names:
service_names = [service.name for service in self.deployment_spec.services]
for service_name in service_names:
# List pods for this service using the selector label
# nvidia.com/selector: deployment-name-service
for original_name in service_names:
# List pods using stable labels that are not affected by worker hash suffixes.
label_selector = (
f"nvidia.com/selector={self._deployment_name}-{service_name.lower()}"
f"nvidia.com/dynamo-graph-deployment-name={self._deployment_name},"
f"nvidia.com/dynamo-component={original_name}"
)
pods: list[Pod] = []
......@@ -826,7 +934,7 @@ class ManagedDeployment:
):
pods.append(pod) # type: ignore[arg-type]
result[service_name] = pods
result[original_name] = pods
return result
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment