# Composite action: deploys a DynamoGraphDeployment into a Kubernetes
# namespace, waits for its pods, sends one chat-completion request through a
# port-forward to the frontend pod, validates the response, then deletes the
# deployment and uploads the deploy log as an artifact.
#
# Security note: inputs are never interpolated directly into `run:` scripts.
# They are handed to the shell through `env:` and referenced as quoted
# variables, so a crafted input value cannot inject shell commands.
name: 'Dynamo Graph Deployment Test'
description: 'Deploy a DynamoGraphDeployment to Kubernetes, validate it serves requests, and cleanup'

inputs:
  # Kubernetes Configuration
  kubeconfig_base64:
    description: 'Base64-encoded kubeconfig for cluster access'
    required: true
  namespace:
    description: 'Kubernetes namespace for deployment'
    required: true

  # Deployment Configuration
  deployment_file:
    description: 'Path to the DynamoGraphDeployment YAML file (relative to examples/backends/)'
    required: true
  framework:
    description: 'Framework name (vllm, sglang, trtllm)'
    required: true
  framework_runtime_image:
    description: 'Full container image reference for the framework runtime'
    required: true

  # Model Configuration
  model_name:
    description: 'Model name to test (e.g., Qwen/Qwen3-0.6B)'
    required: false
    default: 'Qwen/Qwen3-0.6B'

  # Test Configuration
  pod_ready_timeout:
    description: 'Timeout for pods to become ready (kubectl wait format)'
    required: false
    default: '300s'
  model_available_max_attempts:
    description: 'Maximum attempts to wait for model availability'
    required: false
    default: '30'
  model_available_retry_delay:
    description: 'Delay between model availability checks (seconds)'
    required: false
    default: '5'
  port_forward_delay:
    description: 'Delay after port-forward to allow connection (seconds)'
    required: false
    default: '10'
  test_identifier:
    description: 'Unique identifier for test output (used for log file and artifact naming)'
    required: true

  # Request Configuration
  max_tokens:
    description: 'Maximum tokens for test request'
    required: false
    default: '30'
  temperature:
    description: 'Temperature for test request'
    required: false
    default: '0.0'
  test_prompt:
    description: 'Test prompt to send (optional, uses default if not provided)'
    required: false
    default: ''

  # Validation Configuration
  min_response_length:
    description: 'Minimum expected response content length'
    required: false
    default: '100'
  skip_cleanup:
    description: 'Skip cleanup step (useful for debugging)'
    required: false
    default: 'false'

outputs:
  graph_name:
    description: 'Name of the deployed DynamoGraphDeployment'
    value: ${{ steps.deploy.outputs.graph_name }}
  test_result:
    description: 'Test result (0=pass, 1=fail)'
    value: ${{ steps.test.outputs.test_result }}
  test_log_path:
    description: 'Path to test output log'
    value: ${{ steps.setup-test-names.outputs.test_output_log_file }}
  artifact_name:
    description: 'Name of the uploaded artifact'
    value: ${{ steps.setup-test-names.outputs.artifact_name }}

runs:
  using: "composite"
  steps:
    # Decode the kubeconfig into the workspace and export KUBECONFIG for every
    # later step of the calling job (via GITHUB_ENV).
    - name: Setup Kubeconfig
      id: setup-kubeconfig
      shell: bash
      env:
        KUBECONFIG_BASE64: ${{ inputs.kubeconfig_base64 }}
        KUBECONFIG: ${{ github.workspace }}/.kubeconfig
        NAMESPACE: ${{ inputs.namespace }}
      run: |
        # Create the file with a restrictive umask so the credentials are
        # never readable by other users, not even briefly before the chmod.
        (umask 077 && printf '%s\n' "$KUBECONFIG_BASE64" | base64 -d > "$KUBECONFIG")
        chmod 600 "$KUBECONFIG"
        echo "KUBECONFIG=${KUBECONFIG}" >> "$GITHUB_ENV"
        kubectl config set-context --current --namespace="$NAMESPACE"
        kubectl config get-contexts

    # Derive the log file name and artifact name from the test identifier.
    - name: Setup Test Output Names
      id: setup-test-names
      shell: bash
      env:
        TEST_IDENTIFIER: ${{ inputs.test_identifier }}
      run: |
        echo "test_output_log_file=deploy_test_output_${TEST_IDENTIFIER}.log" >> "$GITHUB_OUTPUT"
        # Replace underscores with dashes for artifact name (GitHub artifact naming convention)
        ARTIFACT_NAME="test-results-${TEST_IDENTIFIER//_/-}"
        echo "artifact_name=${ARTIFACT_NAME}" >> "$GITHUB_OUTPUT"

    # Patch the runtime image into the deployment file, apply it, and wait for
    # the pods labelled with the graph name to become ready.
    - name: Deploy and Test
      id: deploy
      shell: bash
      working-directory: ${{ github.workspace }}/examples/backends/${{ inputs.framework }}
      env:
        KUBECONFIG: ${{ github.workspace }}/.kubeconfig
        NAMESPACE: ${{ inputs.namespace }}
        FRAMEWORK: ${{ inputs.framework }}
        FRAMEWORK_RUNTIME_IMAGE: ${{ inputs.framework_runtime_image }}
        DEPLOYMENT_FILE: ${{ inputs.deployment_file }}
        MODEL_NAME: ${{ inputs.model_name }}
        POD_READY_TIMEOUT: ${{ inputs.pod_ready_timeout }}
        # The log is written at the workspace root (not in this step's
        # working directory) because the upload step below resolves its
        # relative `path` from the workspace root.
        TEST_LOG_FILE: ${{ github.workspace }}/${{ steps.setup-test-names.outputs.test_output_log_file }}
      run: |
        set -x
        kubectl config set-context --current --namespace="$NAMESPACE"

        # Redirect all output to a log file while still showing it
        exec > >(tee -a "$TEST_LOG_FILE") 2>&1

        export KUBE_NS="$NAMESPACE"
        GRAPH_NAME=$(yq e '.metadata.name' "$DEPLOYMENT_FILE")
        export GRAPH_NAME
        # yq prints the literal string "null" when the key is absent.
        if [ -z "$GRAPH_NAME" ] || [ "$GRAPH_NAME" = "null" ]; then
          echo "::error::Could not read .metadata.name from ${DEPLOYMENT_FILE}"
          exit 1
        fi
        echo "graph_name=${GRAPH_NAME}" >> "$GITHUB_OUTPUT"

        # Update the deployment file with the runtime image
        # Use strenv() to ensure the image string is treated as plain string, not parsed as YAML
        yq -i '.spec.services.[].extraPodSpec.mainContainer.image = strenv(FRAMEWORK_RUNTIME_IMAGE)' "$DEPLOYMENT_FILE"

        echo "=== DEPLOYMENT CONFIGURATION ==="
        echo "Framework: ${FRAMEWORK}"
        echo "Runtime Image: ${FRAMEWORK_RUNTIME_IMAGE}"
        echo "Graph Name: ${GRAPH_NAME}"
        echo "Namespace: ${KUBE_NS}"
        echo ""
        echo "=== UPDATED DEPLOYMENT FILE ==="
        cat "$DEPLOYMENT_FILE"

        # Apply the deployment
        kubectl apply -n "$KUBE_NS" -f "$DEPLOYMENT_FILE"

        # Wait for pods to be ready
        echo "=== WAITING FOR PODS ==="
        # Give the cluster time to create the pods; `kubectl wait` fails
        # immediately when no resource matches the selector yet.
        sleep 20
        echo "Waiting for pods with label nvidia.com/dynamo-graph-deployment-name: $GRAPH_NAME"
        if ! kubectl wait --for=condition=ready pod \
          -l "nvidia.com/dynamo-graph-deployment-name=$GRAPH_NAME" \
          -n "$KUBE_NS" \
          --timeout="$POD_READY_TIMEOUT"; then
          echo "::error::Pods failed to become ready within timeout"
          # Signals the debug step below that this specific failure happened.
          echo "deploy_failed=true" >> "$GITHUB_OUTPUT"
          exit 1
        fi

        echo "=== FINAL POD STATUSES ==="
        kubectl get pods -l "nvidia.com/dynamo-graph-deployment-name=$GRAPH_NAME" -n "$KUBE_NS" -o wide
        echo ""
        kubectl get all -n "$KUBE_NS"

    # Runs only when the deploy step reported a pod-readiness failure; writes
    # pod status and recent logs into the job summary.
    - name: Debug Pod Failure
      id: debug-failure
      if: failure() && steps.deploy.outputs.deploy_failed == 'true'
      shell: bash
      env:
        KUBECONFIG: ${{ github.workspace }}/.kubeconfig
        NAMESPACE: ${{ inputs.namespace }}
        FRAMEWORK: ${{ inputs.framework }}
        DEPLOYMENT_FILE: ${{ inputs.deployment_file }}
        GRAPH_NAME: ${{ steps.deploy.outputs.graph_name }}
      run: |
        POD_SELECTOR="nvidia.com/dynamo-graph-deployment-name=${GRAPH_NAME}"

        # Everything inside the braces is appended to the job summary.
        # kubectl calls are best-effort: a failing diagnostic command must not
        # prevent the remaining diagnostics from being collected.
        {
          echo "## ❌ Pod Readiness Failure: ${FRAMEWORK} / ${DEPLOYMENT_FILE}"
          echo ""
          echo "**Graph Name:** \`${GRAPH_NAME}\`"
          echo "**Namespace:** \`${NAMESPACE}\`"
          echo ""

          echo "### All relevant Pods in Namespace"
          echo '```'
          kubectl get pods -l "$POD_SELECTOR" -n "$NAMESPACE" -o wide 2>&1 || true
          echo '```'
          echo ""

          # echo "### Pod Descriptions"
          # echo '```'
          # kubectl describe pods -l "$POD_SELECTOR" -n "$NAMESPACE" 2>&1
          # echo '```'
          # echo ""

          echo "### Pod Logs (last 30 lines per container)"
          echo ""

          # Get logs pod by pod for better readability
          PODS=$(kubectl get pods -l "$POD_SELECTOR" -n "$NAMESPACE" -o jsonpath='{.items[*].metadata.name}' || true)
          if [ -z "$PODS" ]; then
            echo "_No pods found matching the deployment label_"
          else
            # Pod names contain no whitespace, so word-splitting is intended.
            for POD in $PODS; do
              echo "#### Pod: \`${POD}\`"
              echo '```'
              kubectl logs --tail=30 --all-containers=true "$POD" -n "$NAMESPACE" 2>&1 || echo "No logs available for ${POD}"
              echo '```'
              echo ""
            done
          fi
        } >> "$GITHUB_STEP_SUMMARY"

    # Port-forward to the frontend pod, wait until the model is listed by
    # /v1/models, send one chat completion, and validate the response.
    # Writes test_result (0=pass, 1=fail) to the step outputs on every exit.
    - name: Run Validation Tests
      id: test
      shell: bash
      env:
        KUBECONFIG: ${{ github.workspace }}/.kubeconfig
        NAMESPACE: ${{ inputs.namespace }}
        FRAMEWORK: ${{ inputs.framework }}
        MODEL_NAME: ${{ inputs.model_name }}
        MAX_ATTEMPTS: ${{ inputs.model_available_max_attempts }}
        RETRY_DELAY: ${{ inputs.model_available_retry_delay }}
        PORT_FORWARD_DELAY: ${{ inputs.port_forward_delay }}
        MAX_TOKENS: ${{ inputs.max_tokens }}
        TEMPERATURE: ${{ inputs.temperature }}
        MIN_RESPONSE_LENGTH: ${{ inputs.min_response_length }}
        TEST_PROMPT: ${{ inputs.test_prompt }}
        GRAPH_NAME: ${{ steps.deploy.outputs.graph_name }}
      run: |
        set -x

        PORT_FORWARD_PID=""
        # Pessimistic default: any unexpected exit is reported as a failure.
        TEST_RESULT=1

        # Runs on every exit path (including aborts caused by `bash -e`):
        # stops the background port-forward and publishes the result.
        finish() {
          if [ -n "$PORT_FORWARD_PID" ]; then
            kill "$PORT_FORWARD_PID" 2>/dev/null || true
          fi
          echo "test_result=${TEST_RESULT}" >> "$GITHUB_OUTPUT"
        }
        trap finish EXIT

        # Get frontend pod and setup port-forward
        FRONTEND_POD=$(kubectl get pods -n "$NAMESPACE" \
          -l "nvidia.com/dynamo-component-type=frontend,nvidia.com/dynamo-graph-deployment-name=${GRAPH_NAME}" \
          -o jsonpath='{.items[0].metadata.name}')
        CONTAINER_PORT=$(kubectl get pod "$FRONTEND_POD" -n "$NAMESPACE" \
          -o jsonpath='{.spec.containers[0].ports[?(@.name=="http")].containerPort}')
        echo "Frontend Pod: ${FRONTEND_POD}"
        echo "Container Port: ${CONTAINER_PORT}"
        if [ -z "$FRONTEND_POD" ] || [ -z "$CONTAINER_PORT" ]; then
          echo "::error::Could not determine frontend pod or its 'http' container port"
          exit 1
        fi

        kubectl port-forward "pod/${FRONTEND_POD}" "8000:${CONTAINER_PORT}" -n "$NAMESPACE" &
        PORT_FORWARD_PID=$!
        LLM_URL="http://localhost:8000"
        sleep "$PORT_FORWARD_DELAY"
        echo "LLM URL: ${LLM_URL}"
        echo "Model Name: ${MODEL_NAME}"

        # Wait for model to be available
        MODEL_READY=false
        MODELS_RESPONSE=""
        for ((ATTEMPT = 1; ATTEMPT <= MAX_ATTEMPTS; ATTEMPT++)); do
          MODELS_RESPONSE=$(curl -s --retry 5 --retry-delay 2 --retry-connrefused "${LLM_URL}/v1/models" || true)
          # any(...) yields a single boolean. A plain `.data[]?.id == $m`
          # emits one boolean per model and `jq -e` only honours the last one,
          # so the check would fail whenever the model is not listed last.
          if jq -e --arg MODEL_NAME "$MODEL_NAME" 'any(.data[]?; .id == $MODEL_NAME)' \
            <<<"$MODELS_RESPONSE" >/dev/null 2>&1; then
            echo "Model $MODEL_NAME is available in /v1/models"
            MODEL_READY=true
            break
          fi
          echo "Waiting for model $MODEL_NAME... (attempt $ATTEMPT/$MAX_ATTEMPTS)"
          sleep "$RETRY_DELAY"
        done

        if [ "$MODEL_READY" != "true" ]; then
          echo "Model $MODEL_NAME not found after $MAX_ATTEMPTS attempts"
          echo "Last response: $MODELS_RESPONSE"
          exit 1
        fi

        # Use default prompt if not provided
        if [ -z "$TEST_PROMPT" ]; then
          TEST_PROMPT="In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden."
        fi

        # Build the request body with jq so the prompt and model name are
        # JSON-escaped correctly (quotes, backslashes and newlines in a
        # caller-supplied prompt would otherwise yield invalid JSON).
        REQUEST_BODY=$(jq -n \
          --arg model "$MODEL_NAME" \
          --arg prompt "$TEST_PROMPT" \
          --argjson max_tokens "$MAX_TOKENS" \
          --argjson temperature "$TEMPERATURE" \
          '{
            model: $model,
            messages: [{role: "user", content: $prompt}],
            stream: false,
            max_tokens: $max_tokens,
            temperature: $temperature
          }')

        # Send test request. A curl failure is tolerated here so that the
        # validation below reports it as an invalid response.
        RESPONSE=$(curl -s -N --retry 10 --retry-delay 5 --retry-connrefused \
          -X POST "${LLM_URL}/v1/chat/completions" \
          -H 'accept: application/json' \
          -H 'Content-Type: application/json' \
          -d "$REQUEST_BODY" 2>&1 || true)
        echo "Response: $RESPONSE"

        # Validate response
        if ! jq -e . <<<"$RESPONSE" >/dev/null 2>&1; then
          echo "❌ Test failed: Response is not valid JSON"
          echo "Got: $RESPONSE"
        elif ! jq -e '.choices[0].message.role == "assistant"' <<<"$RESPONSE" >/dev/null 2>&1; then
          echo "❌ Test failed: Message role is not 'assistant'"
          echo "Got: $(jq '.choices[0].message.role' <<<"$RESPONSE")"
        elif ! jq -e --arg expected "$MODEL_NAME" '.model == $expected' <<<"$RESPONSE" >/dev/null 2>&1; then
          echo "❌ Test failed: Model name mismatch"
          echo "Expected: ${MODEL_NAME}"
          echo "Got: $(jq '.model' <<<"$RESPONSE")"
        elif ! jq -e --argjson min "$MIN_RESPONSE_LENGTH" '.choices[0].message.content | length > $min' \
          <<<"$RESPONSE" >/dev/null 2>&1; then
          echo "❌ Test failed: Response too short (min: ${MIN_RESPONSE_LENGTH})"
          echo "Got length: $(jq '.choices[0].message.content | length' <<<"$RESPONSE")"
        else
          echo "✅ Test passed: Response matches expected format and content"
          TEST_RESULT=0
        fi

        # The EXIT trap stops the port-forward and writes test_result.
        exit "$TEST_RESULT"

    # Always delete the deployment unless the caller opted out.
    - name: Cleanup Deployment
      if: always() && inputs.skip_cleanup != 'true'
      shell: bash
      env:
        KUBECONFIG: ${{ github.workspace }}/.kubeconfig
        NAMESPACE: ${{ inputs.namespace }}
        GRAPH_NAME: ${{ steps.deploy.outputs.graph_name }}
      run: |
        set -x

        echo "=== PRE-CLEANUP STATUS ==="
        kubectl get dynamographdeployments -n "$NAMESPACE" || true
        kubectl get pods -n "$NAMESPACE" || true

        # The graph name is empty when the deploy step failed before it could
        # publish its output; there is nothing to delete by name in that case.
        if [ -z "$GRAPH_NAME" ]; then
          echo "No graph name recorded by the deploy step; skipping delete"
        else
          echo "Deleting DynamoGraphDeployment: ${GRAPH_NAME}"
          kubectl delete dynamographdeployments "$GRAPH_NAME" -n "$NAMESPACE" --timeout=60s || true
        fi

    # Upload the deploy log even when earlier steps failed.
    - name: Upload Test Results
      uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f  # v6
      if: always()
      with:
        name: ${{ steps.setup-test-names.outputs.artifact_name }}
        path: ${{ steps.setup-test-names.outputs.test_output_log_file }}
        retention-days: 7