Commit be5b8a58 (unverified), authored by Tushar Sharma, committed by GitHub
Browse files

test: improve deploy test logging for CI debuggability (#7243)


Signed-off-by: Tushar Sharma <tusharma@nvidia.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
parent b6a3b0c6
...@@ -93,6 +93,7 @@ runs: ...@@ -93,6 +93,7 @@ runs:
FRAMEWORK: ${{ inputs.framework }} FRAMEWORK: ${{ inputs.framework }}
PROFILE: ${{ inputs.profile }} PROFILE: ${{ inputs.profile }}
IMAGE: ${{ inputs.image }} IMAGE: ${{ inputs.image }}
DYN_TEST_OUTPUT_PATH: ${{ github.workspace }}/test-output
TEST_NAME: ${{ inputs.test_name }} TEST_NAME: ${{ inputs.test_name }}
EXTRA_PYTEST_ARGS: ${{ inputs.extra_pytest_args }} EXTRA_PYTEST_ARGS: ${{ inputs.extra_pytest_args }}
run: | run: |
...@@ -174,3 +175,12 @@ runs: ...@@ -174,3 +175,12 @@ runs:
name: test-results-${{ steps.test-name.outputs.name }}-${{ inputs.platform_arch }}-${{ github.run_id }}-${{ job.check_run_id }} name: test-results-${{ steps.test-name.outputs.name }}-${{ inputs.platform_arch }}-${{ github.run_id }}-${{ job.check_run_id }}
path: test-results/pytest_deploy_${{ steps.test-name.outputs.name }}_${{ inputs.platform_arch }}_${{ github.run_id }}_${{ job.check_run_id }}.xml path: test-results/pytest_deploy_${{ steps.test-name.outputs.name }}_${{ inputs.platform_arch }}_${{ github.run_id }}_${{ job.check_run_id }}.xml
retention-days: 7 retention-days: 7
# Publish the contents of the workspace's test-output/ directory as a build
# artifact. This is the same directory exported to the test step through
# DYN_TEST_OUTPUT_PATH, so whatever the deploy tests write there is kept.
- name: Upload Pod Logs
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f #v6
# always(): run even when an earlier step failed, which is when the logs
# are most useful.
if: always()
with:
# Same suffix scheme as the test-results artifact so the two can be paired.
name: pod-logs-${{ steps.test-name.outputs.name }}-${{ inputs.platform_arch }}-${{ github.run_id }}-${{ job.check_run_id }}
path: ${{ github.workspace }}/test-output/
# An empty or missing directory only produces a warning; it must not fail the job.
if-no-files-found: warn
retention-days: 7
...@@ -8,7 +8,7 @@ import re ...@@ -8,7 +8,7 @@ import re
import secrets import secrets
import time import time
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Any, List, Optional from typing import Any, List, Literal, Optional
import kr8s import kr8s
import requests import requests
...@@ -476,6 +476,31 @@ class PodProcess: ...@@ -476,6 +476,31 @@ class PodProcess:
return False # Timed out return False # Timed out
@dataclass
class PodStatusDetail:
    """Status snapshot of one container inside one pod.

    Instances are plain value holders; ``format()`` renders them as a single
    line suitable for embedding in error messages and logs.
    """

    # Name of the pod that owns the container.
    pod_name: str
    # Name of the container within the pod.
    container_name: str
    # Coarse container state.
    state: Literal["Waiting", "Terminated", "Running", "Unknown"]
    # Short reason for the state; empty when none was provided.
    reason: str = ""
    # Longer free-form explanation; empty when none was provided.
    message: str = ""
    # Exit code of the container, or None when there is none to report.
    exit_code: Optional[int] = None
    # Number of restarts recorded for the container.
    restart_count: int = 0

    def format(self) -> str:
        """Return a one-line, human-readable rendering of this snapshot.

        The line always starts with ``<pod>/<container>: <state>``. Optional
        pieces are appended in a fixed order and only when they carry
        information: the reason, the message, the exit code (including an
        exit code of 0), and the restart count (only when greater than zero).
        """
        pieces = [f"{self.pod_name}/{self.container_name}: {self.state}"]
        if self.reason:
            pieces.append(f": {self.reason}")
        if self.message:
            pieces.append(f" ({self.message})")
        # Compare against None explicitly: an exit code of 0 is still reported.
        if self.exit_code is not None:
            pieces.append(f" (exit_code={self.exit_code})")
        if self.restart_count > 0:
            pieces.append(f" [restarts={self.restart_count}]")
        return "".join(pieces)
@dataclass @dataclass
class ManagedDeployment: class ManagedDeployment:
log_dir: str log_dir: str
...@@ -684,7 +709,89 @@ class ManagedDeployment: ...@@ -684,7 +709,89 @@ class ManagedDeployment:
f"Unexpected exception while checking deployment status: {e}" f"Unexpected exception while checking deployment status: {e}"
) )
await asyncio.sleep(sleep) await asyncio.sleep(sleep)
raise TimeoutError("Deployment failed to become ready within timeout")
# Collect pod diagnostics before raising
pod_details = await self._get_pod_status_details()
elapsed = time.time() - start_time
msg = (
f"Deployment {self._deployment_name} failed to reach "
f"Ready={desired_ready_condition_val}, state={desired_state_val} "
f"within {elapsed:.0f}s (timeout={timeout}s)"
)
if pod_details:
detail_lines = "\n".join(f" {d.format()}" for d in pod_details)
msg += f"\n\nPod status at timeout:\n{detail_lines}"
raise TimeoutError(msg)
async def _get_pod_status_details(self) -> List[PodStatusDetail]:
    """Collect container-level status for all pods owned by this deployment.

    Pods are selected with the label
    ``nvidia.com/dynamo-graph-deployment-name=<deployment name>`` in
    ``self.namespace``.

    This is a best-effort diagnostics helper, called while an error is
    already being reported. It therefore never raises: an uninitialized API
    client, an API or transport failure, or an unexpectedly shaped pod
    object all yield an empty list, so the caller's own error is not
    replaced by a secondary one.

    Returns:
        One ``PodStatusDetail`` per reported container status. A pod that
        reports no container statuses contributes a single placeholder
        entry with ``container_name="*"`` and ``state="Unknown"``, whose
        reason carries the pod phase. Empty list on any failure.
    """
    # Explicit check rather than ``assert``: an AssertionError would escape
    # to the caller, and asserts are stripped entirely under ``python -O``.
    if self._core_api is None:
        self._logger.debug(
            "Failed to collect pod status details: Kubernetes API not initialized"
        )
        return []

    label = f"nvidia.com/dynamo-graph-deployment-name={self._deployment_name}"
    try:
        pods = await self._core_api.list_namespaced_pod(
            self.namespace, label_selector=label
        )

        details: List[PodStatusDetail] = []
        for pod in pods.items or []:
            pod_name = pod.metadata.name
            pod_status = pod.status
            # A missing status or an unset phase is reported as "Unknown"
            # instead of rendering the literal string "None".
            phase = (pod_status.phase if pod_status else None) or "Unknown"
            container_statuses = (
                pod_status.container_statuses if pod_status else None
            )

            if not container_statuses:
                # No container status reported for this pod; record its
                # phase so the pod still shows up in the diagnostics.
                details.append(
                    PodStatusDetail(
                        pod_name=pod_name,
                        container_name="*",
                        state="Unknown",
                        reason=f"{phase} (no container status)",
                    )
                )
                continue

            for cs in container_statuses:
                state: Literal[
                    "Waiting", "Terminated", "Running", "Unknown"
                ] = "Unknown"
                reason = ""
                message = ""
                exit_code: Optional[int] = None

                if cs.state and cs.state.waiting:
                    state = "Waiting"
                    reason = cs.state.waiting.reason or ""
                    message = cs.state.waiting.message or ""
                elif cs.state and cs.state.terminated:
                    state = "Terminated"
                    reason = cs.state.terminated.reason or ""
                    # Keep the termination message too; it often holds the
                    # most specific explanation of why the container died.
                    message = cs.state.terminated.message or ""
                    exit_code = cs.state.terminated.exit_code
                elif cs.state and cs.state.running:
                    state = "Running"

                details.append(
                    PodStatusDetail(
                        pod_name=pod_name,
                        container_name=cs.name,
                        state=state,
                        reason=reason,
                        message=message,
                        exit_code=exit_code,
                        restart_count=cs.restart_count or 0,
                    )
                )
        return details
    except Exception as e:
        # Deliberately broad: diagnostics must not mask the caller's error.
        # asyncio.CancelledError derives from BaseException, so task
        # cancellation still propagates.
        self._logger.debug(f"Failed to collect pod status details: {e}")
        return []
async def _restart_nats(self): async def _restart_nats(self):
NATS_STS_NAME = "dynamo-platform-nats" NATS_STS_NAME = "dynamo-platform-nats"
...@@ -778,9 +885,10 @@ class ManagedDeployment: ...@@ -778,9 +885,10 @@ class ManagedDeployment:
pod_names: list[str] = [] pod_names: list[str] = []
for service_name in service_names: for original_name in service_names:
label_selector = ( label_selector = (
f"nvidia.com/selector={self._deployment_name}-{service_name.lower()}" f"nvidia.com/dynamo-graph-deployment-name={self._deployment_name},"
f"nvidia.com/dynamo-component={original_name}"
) )
assert self._core_api is not None, "Kubernetes API not initialized" assert self._core_api is not None, "Kubernetes API not initialized"
pods: client.V1PodList = await self._core_api.list_namespaced_pod( pods: client.V1PodList = await self._core_api.list_namespaced_pod(
...@@ -812,11 +920,11 @@ class ManagedDeployment: ...@@ -812,11 +920,11 @@ class ManagedDeployment:
if not service_names: if not service_names:
service_names = [service.name for service in self.deployment_spec.services] service_names = [service.name for service in self.deployment_spec.services]
for service_name in service_names: for original_name in service_names:
# List pods for this service using the selector label # List pods using stable labels that are not affected by worker hash suffixes.
# nvidia.com/selector: deployment-name-service
label_selector = ( label_selector = (
f"nvidia.com/selector={self._deployment_name}-{service_name.lower()}" f"nvidia.com/dynamo-graph-deployment-name={self._deployment_name},"
f"nvidia.com/dynamo-component={original_name}"
) )
pods: list[Pod] = [] pods: list[Pod] = []
...@@ -826,7 +934,7 @@ class ManagedDeployment: ...@@ -826,7 +934,7 @@ class ManagedDeployment:
): ):
pods.append(pod) # type: ignore[arg-type] pods.append(pod) # type: ignore[arg-type]
result[service_name] = pods result[original_name] = pods
return result return result
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment