feat: add test metrics upload (#3648)

Signed-off-by: nv-nmailhot <nmailhot@nvidia.com>

feat: add test metrics upload (#3648)
Signed-off-by: nv-nmailhot <nmailhot@nvidia.com>
1ddb62b4 · nv-nmailhot · GitHub · fdcc8d5b · 1ddb62b4 · 1ddb62b4
Unverified Commit 1ddb62b4 authored Oct 16, 2025 by nv-nmailhot Committed by GitHub Oct 16, 2025
3 changed files
--- a/.github/actions/pytest/action.yml
+++ b/.github/actions/pytest/action.yml
@@ -12,11 +12,37 @@ inputs:
    description: 'Maximum number of cores available to docker'
    required: false
    default: '10'
+  framework:
+    description: 'Framework name for test metrics'
+    required: false
+    default: 'unknown'
+  test_type:
+    description: 'Test type (unit, e2e, integration)'
+    required: false
+    default: 'e2e'
+  platform_arch:
+    description: 'Platform architecture (amd64, arm64)'
+    required: false
+    default: 'amd64'


 runs:
  using: "composite"
  steps:
+    - name: Setup Test Environment
+      shell: bash
+      run: |
+        # Host directory that the test step mounts into the container for reports
+        mkdir -p test-results
+
+        # Resolve the platform architecture, falling back to amd64 when empty
+        PLATFORM_ARCH="${{ inputs.platform_arch }}"
+        if [[ -z "${PLATFORM_ARCH}" ]]; then
+          PLATFORM_ARCH="amd64"
+        fi
+        echo "PLATFORM_ARCH=${PLATFORM_ARCH}" >> "$GITHUB_ENV"
+        echo "🏗️  Platform architecture: ${PLATFORM_ARCH}"
+
    - name: Run tests
      shell: bash
      env:
@@ -25,9 +51,64 @@ runs:
        PYTEST_XML_FILE: pytest_test_report.xml
        HF_HOME: /runner/_work/_temp
      run: |
+        # Run pytest with detailed output and JUnit XML
+        set +e  # Don't exit on test failures
+
        docker run --runtime=nvidia --rm --gpus all -w /workspace \
          --cpus=${NUM_CPUS} \
          --network host \
          --name ${{ env.CONTAINER_ID }}_pytest \
+          -v "$(pwd)/test-results:/test-results" \
          ${{ inputs.image_tag }} \
-          bash -c "pytest -xsv --basetemp=/tmp --junitxml=${{ env.PYTEST_XML_FILE }} -m \"${{ inputs.pytest_marks }}\""
+          bash -c "pytest -v --tb=short --basetemp=/tmp --junitxml=/test-results/${{ env.PYTEST_XML_FILE }} --durations=10 -m \"${{ inputs.pytest_marks }}\""
+
+        TEST_EXIT_CODE=$?
+        echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> $GITHUB_ENV
+        echo "🧪 Tests completed with exit code: ${TEST_EXIT_CODE}"
+
+        # Always continue to results processing
+        exit 0
+
+    - name: Process Test Results
+      shell: bash
+      run: |
+        # Summarize the JUnit report and write the metadata read by the metrics upload
+        # Check for JUnit XML file and determine test status
+        JUNIT_FILE="test-results/pytest_test_report.xml"
+
+        if [[ -f "$JUNIT_FILE" ]]; then
+          echo "✅ JUnit XML generated successfully"
+          # Extract basic test counts for status determination
+          TOTAL_TESTS=$(grep -o 'tests="[0-9]*"' "$JUNIT_FILE" | grep -o '[0-9]*' | head -1 || echo "0")
+          FAILED_TESTS=$(grep -o 'failures="[0-9]*"' "$JUNIT_FILE" | grep -o '[0-9]*' | head -1 || echo "0")
+          ERROR_TESTS=$(grep -o 'errors="[0-9]*"' "$JUNIT_FILE" | grep -o '[0-9]*' | head -1 || echo "0")
+          echo "📊 ${TOTAL_TESTS} tests completed (${FAILED_TESTS} failed, ${ERROR_TESTS} errors)"
+
+          # Create metadata file with step context information
+          METADATA_FILE="test-results/test_metadata.json"
+          echo '{' > "$METADATA_FILE"
+          echo '  "job_name": "${{ github.job }}",' >> "$METADATA_FILE"
+          echo '  "framework": "${{ inputs.framework }}",' >> "$METADATA_FILE"
+          echo '  "test_type": "${{ inputs.test_type }}",' >> "$METADATA_FILE"
+          echo '  "platform_arch": "${{ env.PLATFORM_ARCH }}",' >> "$METADATA_FILE"
+          echo '  "junit_xml_file": "pytest_test_report.xml",' >> "$METADATA_FILE"
+          echo '  "step_name": "Run ${{ inputs.test_type }} tests"' >> "$METADATA_FILE"
+          echo '}' >> "$METADATA_FILE"
+          echo "📝 Created test metadata file"
+        else
+          echo "⚠️  JUnit XML file not found - test results may not be available for upload"
+          TOTAL_TESTS=0
+          FAILED_TESTS=1  # Not read again; the exit status comes from TEST_EXIT_CODE
+          ERROR_TESTS=0
+        fi
+
+        # Propagate the recorded pytest exit code; fail if it was never recorded
+        exit "${TEST_EXIT_CODE:-1}"
+
+    - name: Upload Test Results
+      uses: actions/upload-artifact@v4
+      if: always()  # Always upload test results, even if tests failed
+      with:
+        name: test-results-${{ inputs.framework }}-${{ inputs.test_type }}-${{ env.PLATFORM_ARCH }}
+        path: test-results/  # whole directory: JUnit XML and test_metadata.json
+        retention-days: 7
--- a/.github/workflows/container-validation-backends.yml
+++ b/.github/workflows/container-validation-backends.yml
@@ -126,12 +126,18 @@ jobs:
        with:
          image_tag: ${{ steps.build-image.outputs.image_tag }}
          pytest_marks: "unit and vllm and gpu_1"
+          framework: "vllm"
+          test_type: "unit"
+          platform_arch: ${{ matrix.platform.arch }}
      - name: Run e2e tests
        if: ${{ matrix.platform.arch != 'arm64' }}
        uses: ./.github/actions/pytest
        with:
          image_tag: ${{ steps.build-image.outputs.image_tag }}
          pytest_marks: "e2e and vllm and gpu_1 and not slow"
+          framework: "vllm"
+          test_type: "e2e, gpu_1"
+          platform_arch: ${{ matrix.platform.arch }}

  sglang:
    needs: changed-files
@@ -190,6 +196,9 @@ jobs:
        with:
          image_tag: ${{ steps.build-image.outputs.image_tag }}
          pytest_marks: "unit and sglang and gpu_1"
+          framework: "sglang"
+          test_type: "unit"
+          platform_arch: ${{ matrix.platform.arch }}
      - name: Run e2e tests
        # OPS-1140: Uncomment the below line
        # if: ${{ matrix.platform.arch != 'arm64' }}
@@ -197,6 +206,9 @@ jobs:
        with:
          image_tag: ${{ steps.build-image.outputs.image_tag }}
          pytest_marks: "e2e and sglang and gpu_1"
+          framework: "sglang"
+          test_type: "e2e, gpu_1"
+          platform_arch: ${{ matrix.platform.arch }}

  trtllm:
    needs: changed-files
@@ -247,12 +259,18 @@ jobs:
        with:
          image_tag: ${{ steps.build-image.outputs.image_tag }}
          pytest_marks: "unit and trtllm_marker and gpu_1"
+          framework: "trtllm"
+          test_type: "unit"
+          platform_arch: ${{ matrix.platform.arch }}
      - name: Run e2e tests
        if: ${{ matrix.platform.arch != 'arm64' }}
        uses: ./.github/actions/pytest
        with:
          image_tag: ${{ steps.build-image.outputs.image_tag }}
          pytest_marks: "e2e and trtllm_marker and gpu_1 and not slow"
+          framework: "trtllm"
+          test_type: "e2e, gpu_1"
+          platform_arch: ${{ matrix.platform.arch }}

  # Upload metrics for this workflow and all its jobs
  upload-workflow-metrics:
@@ -283,14 +301,23 @@ jobs:
          merge-multiple: true
        continue-on-error: true  # Don't fail if artifacts don't exist

+      - name: Download test results
+        uses: actions/download-artifact@v4
+        with:
+          pattern: test-results-*  # artifacts uploaded by the pytest composite action
+          path: test-results/
+          merge-multiple: true  # NOTE(review): artifacts appear to share file names, so merging may overwrite them - confirm
+        continue-on-error: true  # Don't fail if artifacts don't exist
+
      - name: Upload Complete Workflow Metrics
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          WORKFLOW_INDEX: ${{ secrets.WORKFLOW_INDEX }}
          JOB_INDEX: ${{ secrets.JOB_INDEX }}
          STEPS_INDEX: ${{ secrets.STEPS_INDEX }}
-          # Container index configuration
+          # Container and test index configuration
          CONTAINER_INDEX: ${{ secrets.CONTAINER_INDEX }}
+          TEST_INDEX: ${{ secrets.TEST_INDEX }}
        run: |
          # Upload complete workflow metrics including container metrics
          python3 .github/workflows/upload_complete_workflow_metrics.py
--- a/.github/workflows/upload_complete_workflow_metrics.py
+++ b/.github/workflows/upload_complete_workflow_metrics.py
@@ -6,10 +6,12 @@ This version runs as the final job in a workflow and captures metrics for
 the entire workflow including all previous jobs.
 """

+import glob
 import json
 import os
 import re
 import time
+import xml.etree.ElementTree as ET
 from datetime import datetime, timezone
 from typing import Any, Dict, Optional
 from urllib.parse import urlparse
@@ -66,6 +68,16 @@ FIELD_BUILD_TARGET = "s_build_target"
 FIELD_BUILD_FRAMEWORK = "s_build_framework"
 FIELD_BUILD_SIZE_BYTES = "l_build_size_bytes"

+# Test Info: fields of the per-test documents built by _upload_test_metrics
+FIELD_FRAMEWORK = "s_framework"  # Framework name read from the test metadata file
+FIELD_ERROR_MESSAGE = "s_error_message"  # Failure/error/skip message, cut to 1000 characters
+FIELD_TEST_NAME = "s_test_name"  # Test name (e.g., test_sglang_deployment[aggregated])
+FIELD_TEST_CLASSNAME = (
+    "s_test_classname"  # Test class name (e.g., tests.serve.test_sglang)
+)
+FIELD_TEST_DURATION = "l_test_duration_ms"  # JUnit "time" attribute converted to whole milliseconds
+FIELD_TEST_STATUS = "s_test_status"  # Test status (passed, failed, error, skipped)
+

 class BuildMetricsReader:
    """Reader for build metrics from environment variables and artifacts"""
@@ -630,6 +642,8 @@ class WorkflowMetricsUploader:

        if is_framework_job:
            self._upload_container_metrics(job_data)
+            # Also upload test metrics if available for this framework job
+            self._upload_test_metrics(job_data)

    def _upload_job_step_metrics(self, job_data: Dict[str, Any]) -> int:
        """Extract and post metrics for all steps in a job"""
@@ -801,6 +815,177 @@ class WorkflowMetricsUploader:
         except Exception as e:
             print(f"❌ Failed to upload container metrics: {e}")

+    @staticmethod
+    def _junit_outcome(testcase):
+        """Classify one JUnit <testcase> element as (status, message).
+
+        The first child found among <failure>, <error> and <skipped>, checked
+        in that order, decides the status ("failed", "error" or "skipped"); a
+        test case with none of them is "passed" with an empty message. The
+        message is the child's "message" attribute; for failures and errors
+        the element text is used when that attribute is empty.
+        """
+        for tag, status, fall_back_to_text in (
+            ("failure", "failed", True),
+            ("error", "error", True),
+            ("skipped", "skipped", False),
+        ):
+            elem = testcase.find(tag)
+            if elem is None:
+                continue
+            message = elem.get("message", "")
+            if not message and fall_back_to_text and elem.text:
+                message = elem.text
+            return status, message
+        return "passed", ""
+
+    def _upload_test_metrics(self, job_data: Dict[str, Any]) -> None:
+        """Upload one document per test case parsed from the JUnit XML report.
+
+        Reads test-results/test_metadata.json for the framework, test type,
+        step name and JUnit file name, parses that JUnit file and posts each
+        test case to the index named by the TEST_INDEX environment variable.
+        Returns without uploading when TEST_INDEX, the results directory or the
+        metadata file is missing. Processing and upload failures are printed,
+        not raised.
+
+        Args:
+            job_data: Job record; "id" is required, "name" and "completed_at"
+                are read when present.
+        """
+        # Function-scope import so the method does not rely on a module-level one
+        import hashlib
+
+        test_index = os.getenv("TEST_INDEX")
+        if not test_index:
+            print("⚠️  TEST_INDEX not configured, skipping test metrics upload")
+            return
+
+        job_name = job_data.get("name", "")
+        job_id = str(job_data["id"])
+
+        print(f"🧪 Looking for test results for job '{job_name}'")
+
+        # Look for test results directory
+        test_results_dir = "test-results"
+        if not os.path.exists(test_results_dir):
+            print(f"⚠️  Test results directory not found: {test_results_dir}")
+            return
+
+        # Look for metadata files to get accurate step and framework info
+        metadata_files = glob.glob(f"{test_results_dir}/test_metadata.json")
+
+        if not metadata_files:
+            print(f"⚠️  No test metadata files found in {test_results_dir}")
+            return
+
+        print(f"📄 Found {len(metadata_files)} test metadata files")
+
+        # All tests of the job share one timestamp: the job completion time, or
+        # the current UTC time when the job record does not carry one
+        timestamp = job_data.get("completed_at") or datetime.now(
+            timezone.utc
+        ).isoformat()
+
+        total_tests_processed = 0
+
+        # Process each metadata file
+        for metadata_file in metadata_files:
+            try:
+                # Read metadata to get accurate step and framework info
+                with open(metadata_file, "r", encoding="utf-8") as f:
+                    metadata = json.load(f)
+
+                framework = metadata.get("framework", "unknown")
+                test_type = metadata.get("test_type", "unknown")
+                step_name = metadata.get("step_name", "Run tests")
+                junit_xml_file = metadata.get(
+                    "junit_xml_file", "pytest_test_report.xml"
+                )
+
+                # Construct step ID from metadata
+                test_step_id = f"{job_id}_{step_name.lower().replace(' ', '_')}"
+
+                print("📋 Processing test results:")
+                print(f"   Framework: {framework}")
+                print(f"   Test Type: {test_type}")
+                print(f"   Step Name: {step_name}")
+                print(f"   Step ID: {test_step_id}")
+
+                # Find the corresponding XML file
+                xml_file = f"{test_results_dir}/{junit_xml_file}"
+                if not os.path.exists(xml_file):
+                    print(f"⚠️  JUnit XML file not found: {xml_file}")
+                    continue
+
+                print(f"📄 Processing JUnit XML: {xml_file}")
+
+                root = ET.parse(xml_file).getroot()
+
+                # iter() visits every <testcase> below the root, whether the root
+                # is a <testsuites> wrapper or a single <testsuite>
+                for testcase in root.iter("testcase"):
+                    test_classname = testcase.get("classname", "")
+                    test_name = testcase.get("name", "")
+                    test_full_name = (
+                        f"{test_classname}::{test_name}"
+                        if test_classname
+                        else test_name
+                    )
+
+                    # A missing or malformed time must not abort the whole file
+                    try:
+                        test_time = float(testcase.get("time") or 0)
+                        duration_ms = int(test_time * 1000)  # seconds -> ms
+                    except (ValueError, OverflowError):
+                        test_time, duration_ms = 0.0, 0
+
+                    test_status, error_msg = self._junit_outcome(testcase)
+
+                    # hash() of a str is salted per process, so a digest is used
+                    # to keep the document ID identical across runs
+                    name_digest = hashlib.sha256(
+                        f"{test_step_id}::{test_full_name}".encode("utf-8")
+                    ).hexdigest()[:16]
+
+                    test_data = {
+                        # Identity & Context
+                        FIELD_ID: f"github-test-{job_id}-{name_digest}",
+                        FIELD_STEP_ID: test_step_id,
+                        FIELD_JOB_ID: job_id,
+                        # Test Info
+                        FIELD_FRAMEWORK: framework,
+                        FIELD_TEST_NAME: test_name,
+                        FIELD_TEST_CLASSNAME: test_classname,
+                        FIELD_TEST_DURATION: duration_ms,
+                        FIELD_TEST_STATUS: test_status,
+                        # Also set general status field
+                        FIELD_STATUS: test_status,
+                    }
+                    if error_msg:
+                        # Limit error message length
+                        test_data[FIELD_ERROR_MESSAGE] = error_msg[:1000]
+                    test_data["@timestamp"] = timestamp
+
+                    # Add common context fields (repo, branch, pr_id, etc.)
+                    self.add_common_context_fields(test_data)
+
+                    # Upload individual test
+                    try:
+                        self.post_to_db(test_index, test_data)
+                        print(
+                            f"✅ Uploaded test: {test_full_name} ({test_status}, {test_time:.3f}s)"
+                        )
+                        total_tests_processed += 1
+                    except Exception as e:
+                        print(f"❌ Failed to upload test {test_full_name}: {e}")
+
+            except Exception as e:
+                print(f"❌ Failed to process metadata file {metadata_file}: {e}")
+
+        print(f"📊 Processed {total_tests_processed} individual tests total")
+        print("   " + "=" * 50)
+

 def main():
     """Main function to upload complete GitHub Actions workflow metrics"""