ci: Adding nightly pipeline workflow (#4204)

Signed-off-by: pvijayakrish <pvijayakrish@nvidia.com>
Signed-off-by: Pavithra Vijayakrishnan <160681768+pvijayakrish@users.noreply.github.com>

ci: Adding nightly pipeline workflow (#4204)
Signed-off-by: pvijayakrish <pvijayakrish@nvidia.com> Signed-off-by: Pavithra Vijayakrishnan <160681768+pvijayakrish@users.noreply.github.com>
a9078ed0 · Pavithra Vijayakrishnan · GitHub · d03c0976 · a9078ed0 · a9078ed0
Unverified Commit a9078ed0 authored Dec 05, 2025 by Pavithra Vijayakrishnan Committed by GitHub Dec 05, 2025
12 changed files
--- a/.github/actions/docker-build/action.yml
+++ b/.github/actions/docker-build/action.yml
@@ -49,6 +49,12 @@ inputs:
  torch_backend:
    description: 'Optional override for TORCH_BACKEND build-arg (e.g., cu129)'
    required: false
+  enable_kvbm:
+    description: 'Enable KVBM support (optional)'
+    required: false
+  dynamo_base_image:
+    description: 'Pre-built Dynamo base image to use instead of building from scratch'
+    required: false

 outputs:
  image_tag:
@@ -72,14 +78,9 @@ runs:
        aws ecr get-login-password --region ${{ inputs.aws_default_region }} | docker login --username AWS --password-stdin ${ECR_HOSTNAME}
    - name: Login to NGC
      if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name == 'push'
-      shell: bash
-      run: |
-        echo "${{ inputs.ngc_ci_access_token }}" | docker login nvcr.io -u '$oauthtoken' --password-stdin
-    - name: Cleanup
-      if: always()
-      shell: bash
-      run: |
-        docker system prune -af
+      uses: ./.github/actions/docker-login
+      with:
+        ngc_ci_access_token: ${{ inputs.ngc_ci_access_token }}
    - name: Build image
      id: build
      shell: bash
@@ -125,6 +126,12 @@ runs:
        if [ -n "${{ inputs.torch_backend }}" ]; then
          EXTRA_ARGS+=" --build-arg TORCH_BACKEND=${{ inputs.torch_backend }}"
        fi
+        if [ -n "${{ inputs.dynamo_base_image }}" ]; then
+          EXTRA_ARGS+=" --dynamo-base-image ${{ inputs.dynamo_base_image }}"
+        fi
+        if [ -n "${{ inputs.enable_kvbm }}" ]; then
+          EXTRA_ARGS+=" --build-arg ENABLE_KVBM=${{ inputs.enable_kvbm }}"
+        fi

        # Execute build and capture output (show on console AND save to file)
        ./container/build.sh --tag "$IMAGE_TAG" \
@@ -289,7 +296,7 @@ runs:
      uses: actions/upload-artifact@v4
      if: always()
      with:
-        name: build-metrics-${{ inputs.framework }}-${{ env.PLATFORM_ARCH }}-${{ github.run_id }}-${{ job.check_run_id }}
+        name: build-metrics-${{ inputs.framework }}-${{ inputs.target }}-${{ env.PLATFORM_ARCH }}-${{ github.run_id }}-${{ job.check_run_id }}
        path: build-metrics/build-${{ inputs.framework }}-${{ env.PLATFORM_ARCH }}-${{ github.run_id }}-${{ job.check_run_id }}.json
        retention-days: 7

--- a/.github/actions/docker-login/action.yml
+++ b/.github/actions/docker-login/action.yml
+# Composite action that logs the Docker client in to up to three registries
+# (AWS ECR, NVIDIA NGC, Azure ACR). Every input is optional; each login step
+# runs only when all of the inputs it needs are non-empty, so callers pass
+# credentials only for the registries they use.
+#
+# Inputs are handed to the shell through `env:` rather than being expanded
+# into the script text with `${{ }}`. Expression expansion happens before the
+# shell parses the script, so a value containing quotes, `$` or backticks
+# would otherwise change the script itself.
+name: 'Docker Login'
+description: 'Login to multiple container registries (ECR, NGC, ACR)'
+
+inputs:
+  ngc_ci_access_token:
+    description: 'NGC CI Access Token'
+    required: false
+  aws_default_region:
+    description: 'AWS Default Region'
+    required: false
+  aws_account_id:
+    description: 'AWS Account ID'
+    required: false
+  azure_acr_hostname:
+    description: 'Azure ACR hostname'
+    required: false
+  azure_acr_user:
+    description: 'Azure ACR user'
+    required: false
+  azure_acr_password:
+    description: 'Azure ACR password'
+    required: false
+
+runs:
+  using: "composite"
+  steps:
+    # ECR: needs both the region and the account id to build the registry hostname.
+    - name: ECR Login
+      shell: bash
+      if: ${{ inputs.aws_default_region != '' && inputs.aws_account_id != '' }}
+      env:
+        ECR_REGION: ${{ inputs.aws_default_region }}
+        ECR_HOSTNAME: ${{ inputs.aws_account_id }}.dkr.ecr.${{ inputs.aws_default_region }}.amazonaws.com
+      run: |
+        set -euo pipefail
+        aws ecr get-login-password --region "${ECR_REGION}" | docker login --username AWS --password-stdin "${ECR_HOSTNAME}"
+    # NGC: the username is the literal string $oauthtoken (single-quoted so the shell does not expand it).
+    - name: NGC Login
+      if: ${{ inputs.ngc_ci_access_token != '' }}
+      shell: bash
+      env:
+        NGC_CI_ACCESS_TOKEN: ${{ inputs.ngc_ci_access_token }}
+      run: |
+        set -euo pipefail
+        printf '%s\n' "${NGC_CI_ACCESS_TOKEN}" | docker login nvcr.io -u '$oauthtoken' --password-stdin
+    # ACR: needs hostname, user and password together.
+    - name: ACR Login
+      shell: bash
+      if: ${{ inputs.azure_acr_hostname != '' && inputs.azure_acr_user != '' && inputs.azure_acr_password != '' }}
+      env:
+        AZURE_ACR_HOSTNAME: ${{ inputs.azure_acr_hostname }}
+        AZURE_ACR_USER: ${{ inputs.azure_acr_user }}
+        AZURE_ACR_PASSWORD: ${{ inputs.azure_acr_password }}
+      run: |
+        set -euo pipefail
+        printf '%s\n' "${AZURE_ACR_PASSWORD}" | docker login "${AZURE_ACR_HOSTNAME}" --username "${AZURE_ACR_USER}" --password-stdin
--- a/.github/actions/docker-tag-push/action.yml
+++ b/.github/actions/docker-tag-push/action.yml
+name: 'Docker Tag and Push'
 description: 'Tag and Push Docker Images'

 inputs:
  local_image:
    description: 'Local Image Name:Tag'
    required: true
-  push_tag:
-    description: 'Target Name:Tag'
+  push_tags:
+    description: 'Target Name:Tag (newline-separated list for multiple tags)'
    required: true
  aws_push:
    description: 'Push to AWS Boolean'
@@ -38,37 +39,48 @@ inputs:
    required: false

 outputs:
-  image_tag:
-    description: 'Image Tag'
-    value: ${{ inputs.push_tag }}
+  image_tags:
+    description: 'Image Tags'
+    value: ${{ inputs.push_tags }}

 runs:
  using: "composite"
  steps:
    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v3
-    - name: ACR Login
-      shell: bash
-      if: ${{ inputs.azure_push == 'true' }}
-      run: |
-        echo "${{ inputs.azure_acr_password }}" | docker login ${{ inputs.azure_acr_hostname }} --username ${{ inputs.azure_acr_user }} --password-stdin
+
    - name: ECR Tag and Push
      shell: bash
      if: ${{ inputs.aws_push == 'true' }}
      env:
        LOCAL_IMAGE: ${{ inputs.local_image }}
-        PUSH_TAG: ${{ inputs.push_tag }}
+        PUSH_TAGS: ${{ inputs.push_tags }}
        ECR_HOSTNAME: ${{ inputs.aws_account_id }}.dkr.ecr.${{ inputs.aws_default_region }}.amazonaws.com
      run: |
-        docker tag ${LOCAL_IMAGE} ${ECR_HOSTNAME}/${PUSH_TAG}
-        docker push ${ECR_HOSTNAME}/${PUSH_TAG}
+        set -euo pipefail
+        while IFS= read -r TAG; do
+          if [ -z "$TAG" ]; then
+            continue
+          fi
+          echo "Tagging and pushing: ${ECR_HOSTNAME}/${TAG}"
+          docker tag "${LOCAL_IMAGE}" "${ECR_HOSTNAME}/${TAG}"
+          docker push "${ECR_HOSTNAME}/${TAG}"
+        done <<< "$PUSH_TAGS"
+
    - name: ACR Tag and Push
      shell: bash
      if: ${{ inputs.azure_push == 'true' }}
      env:
        LOCAL_IMAGE: ${{ inputs.local_image }}
-        PUSH_TAG: ${{ inputs.push_tag }}
+        PUSH_TAGS: ${{ inputs.push_tags }}
        AZURE_ACR_HOSTNAME: ${{ inputs.azure_acr_hostname }}
      run: |
-        docker tag ${LOCAL_IMAGE} ${AZURE_ACR_HOSTNAME}/${PUSH_TAG}
-        docker push ${AZURE_ACR_HOSTNAME}/${PUSH_TAG}
+        set -euo pipefail
+        while IFS= read -r TAG; do
+          if [ -z "$TAG" ]; then
+            continue
+          fi
+          echo "Tagging and pushing: ${AZURE_ACR_HOSTNAME}/${TAG}"
+          docker tag "${LOCAL_IMAGE}" "${AZURE_ACR_HOSTNAME}/${TAG}"
+          docker push "${AZURE_ACR_HOSTNAME}/${TAG}"
+        done <<< "$PUSH_TAGS"
--- a/.github/actions/pytest/action.yml
+++ b/.github/actions/pytest/action.yml
@@ -24,6 +24,10 @@ inputs:
    description: 'Platform architecture (amd64, arm64)'
    required: false
    default: 'amd64'
+  dry_run:
+    description: 'Run pytest in dry-run mode (collect tests only, do not execute)'
+    required: false
+    default: 'false'


 runs:
@@ -54,21 +58,32 @@ runs:
        # Run pytest with detailed output and JUnit XML
        set +e  # Don't exit on test failures

+        # Determine docker runtime flags and pytest command based on dry_run mode
+        if [[ "${{ inputs.dry_run }}" == "true" ]]; then
+          echo "🔍 Running pytest in dry-run mode (collect-only, no GPU required)"
+          GPU_FLAGS=""
+          PYTEST_CMD="pytest -v --collect-only -m \"${{ inputs.pytest_marks }}\""
+        else
+          echo "🚀 Running pytest in normal mode"
+          PYTEST_CMD="pytest -v --tb=short --basetemp=/tmp -o cache_dir=/tmp/.pytest_cache --junitxml=/workspace/test-results/${{ env.PYTEST_XML_FILE }} --durations=10 -m \"${{ inputs.pytest_marks }}\""
+
          # Detect GPU availability and conditionally add GPU flags
          GPU_FLAGS=""
          if command -v nvidia-smi &> /dev/null && nvidia-smi &> /dev/null; then
-          echo "GPU detected, enabling GPU runtime"
+            echo "✓ GPU detected, enabling GPU runtime"
            GPU_FLAGS="--runtime=nvidia --gpus all"
          else
-          echo "No GPU detected, running in CPU-only mode"
+            echo "⚠️  No GPU detected, running in CPU-only mode"
+          fi
        fi

-        docker run ${GPU_FLAGS} --rm -w /workspace \
+        # Run without --rm so we can copy results even if container crashes (example SIGSEGV exit 139)
+        docker run ${GPU_FLAGS} -w /workspace \
          --cpus=${NUM_CPUS} \
          --network host \
          --name ${{ env.CONTAINER_ID }}_pytest \
          ${{ inputs.image_tag }} \
-          bash -c "mkdir -p /workspace/test-results && pytest -v --tb=short --basetemp=/tmp -o cache_dir=/tmp/.pytest_cache --junitxml=/workspace/test-results/${{ env.PYTEST_XML_FILE }} --durations=10 -m \"${{ inputs.pytest_marks }}\""
+          bash -c "mkdir -p /workspace/test-results && ${PYTEST_CMD}"

        TEST_EXIT_CODE=$?
        echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> $GITHUB_ENV
@@ -92,6 +107,13 @@ runs:
        STR_TEST_TYPE=$(echo "${{ inputs.test_type }}" | tr ', ' '_')
        echo "STR_TEST_TYPE=${STR_TEST_TYPE}" >> $GITHUB_ENV

+        # Skip XML processing if in dry-run mode
+        if [[ "${{ inputs.dry_run }}" == "true" ]]; then
+          echo "✅ Dry-run mode: Test collection completed"
+          echo "⏭️  No JUnit XML generated (dry-run mode)"
+          exit 0
+        fi
+
        # Check for JUnit XML file and determine test status
        JUNIT_FILE="test-results/pytest_test_report.xml"

@@ -133,7 +155,7 @@ runs:

    - name: Upload Test Results
      uses: actions/upload-artifact@v4
-      if: always()  # Always upload test results, even if tests failed
+      if: always() && inputs.dry_run != 'true'  # Skip upload in dry-run mode
      with:
        name: test-results-${{ inputs.framework }}-${{ env.STR_TEST_TYPE }}-${{ env.PLATFORM_ARCH }}
        path: |

--- a/.github/workflows/container-validation-backends.yml
+++ b/.github/workflows/container-validation-backends.yml
@@ -72,11 +72,10 @@ jobs:
        with:
          driver: docker
      - name: Login to ECR
-        shell: bash
-        env:
-          ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
-        run: |
-          aws ecr get-login-password --region ${{ secrets.AWS_DEFAULT_REGION }} | docker login --username AWS --password-stdin ${ECR_HOSTNAME}
+        uses: ./.github/actions/docker-login
+        with:
+          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
+          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
      - name: Linter
        shell: bash
        env:
@@ -120,7 +119,7 @@ jobs:
        uses: ./.github/actions/docker-tag-push
        with:
          local_image: dynamo-operator:latest
-          push_tag: ai-dynamo/dynamo:${{ github.sha }}-operator-${{ matrix.platform.arch }}
+          push_tags: ai-dynamo/dynamo:${{ github.sha }}-operator-${{ matrix.platform.arch }}
          aws_push: 'false'
          azure_push: 'true'
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
@@ -165,11 +164,18 @@ jobs:
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+      - name: Login to Container Registries
+        uses: ./.github/actions/docker-login
+        with:
+          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
+          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
+          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
+          ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
      - name: Docker Tag and Push
        uses: ./.github/actions/docker-tag-push
        with:
          local_image: ${{ steps.build-image.outputs.image_tag }}
-          push_tag: ai-dynamo/dynamo:${{ github.sha }}-vllm-${{ matrix.platform.arch }}
+          push_tags: ai-dynamo/dynamo:${{ github.sha }}-vllm-${{ matrix.platform.arch }}
          # OPS-1145: Switch aws_push to true
          aws_push: 'false'
          azure_push: 'true'
@@ -223,11 +229,18 @@ jobs:
          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}

+      - name: Login to Container Registries
+        uses: ./.github/actions/docker-login
+        with:
+          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
+          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
+          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
+          ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
      - name: Docker Tag and Push
        uses: ./.github/actions/docker-tag-push
        with:
          local_image: ${{ steps.build-image.outputs.image_tag }}
-          push_tag: ai-dynamo/dynamo:${{ github.sha }}-sglang-${{ matrix.platform.arch }}
+          push_tags: ai-dynamo/dynamo:${{ github.sha }}-sglang-${{ matrix.platform.arch }}
          # OPS-1145: Switch aws_push to true
          aws_push: 'false'
          azure_push: 'true'
@@ -281,11 +294,18 @@ jobs:
          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}

+      - name: Login to Container Registries
+        uses: ./.github/actions/docker-login
+        with:
+          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
+          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
+          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
+          ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
      - name: Docker Tag and Push
        uses: ./.github/actions/docker-tag-push
        with:
          local_image: ${{ steps.build-image.outputs.image_tag }}
-          push_tag: ai-dynamo/dynamo:${{ github.sha }}-trtllm-${{ matrix.platform.arch }}
+          push_tags: ai-dynamo/dynamo:${{ github.sha }}-trtllm-${{ matrix.platform.arch }}
          # OPS-1145: Switch aws_push to true
          aws_push: 'false'
          azure_push: 'true'

--- a/.github/workflows/container-validation-dynamo.yml
+++ b/.github/workflows/container-validation-dynamo.yml
@@ -33,8 +33,9 @@ jobs:
        uses: docker/setup-buildx-action@v3
      - name: Login to NGC
        if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name == 'push'
-        run: |
-          echo "${{ secrets.NGC_CI_ACCESS_TOKEN }}" | docker login nvcr.io -u '$oauthtoken' --password-stdin
+        uses: ./.github/actions/docker-login
+        with:
+          ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
      - name: Define Image Tag
        id: define_image_tag
        run: |

--- a/.github/workflows/nightly-ci.yml
+++ b/.github/workflows/nightly-ci.yml
-name: Nightly CI
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+name: Nightly CI pipeline

 on:
  schedule:
    - cron: '0 8 * * *' # Every day at 12:00 AM PST (08:00 UTC)
-  workflow_dispatch:
+
+permissions:
+  contents: read
+
+defaults:
+  run:
+    shell: bash --noprofile --norc -eo pipefail {0}
+
+env:
+  REGISTRY_IMAGE: ai-dynamo/dynamo
+  NIGHTLY_IMAGE_PREFIX: nightly
+
+############################## BUILD JOBS ##############################

 jobs:
-  vllm:
+  build-amd64:
+    name: Build ${{ matrix.framework }} (amd64)
+    runs-on: cpu-amd-m5-4xlarge
+    timeout-minutes: 120
    strategy:
      fail-fast: false
      matrix:
-        platform:
-          - { arch: amd64, runner: gpu-l40-amd64 }
-          - { arch: arm64, runner: cpu-arm-r8g-4xlarge }
-    name: vllm (${{ matrix.platform.arch }})
-    runs-on: ${{ matrix.platform.runner }}
+        framework: [vllm, trtllm, sglang]
+    env:
+      ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
    steps:
-      - name: Checkout code
-        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 #v4.3.1
-      - name: Build vLLM Docker Image
-        id: build-vllm
+      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
+      - name: Login to Container Registries
+        uses: ./.github/actions/docker-login
+        with:
+          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
+          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
+          ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
+          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
+          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
+          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
+      - name: Pull existing images for cache
+        shell: bash
+        continue-on-error: true
+        run: |
+          echo "Attempting to pull existing images for layer caching..."
+          docker pull "${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:main-${{ matrix.framework }}-framework-amd64" || echo "Framework image not found in cache"
+          docker pull "${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-amd64" || echo "Runtime image not found in cache"
+          echo "Cache pull completed"
+      - name: Build Framework Image
+        id: build_framework
        uses: ./.github/actions/docker-build
        with:
-          framework: vllm
+          framework: ${{ matrix.framework }}
+          target: framework
+          platform: linux/amd64
+          base_image_tag: ''
+          runtime_image_tag: ''
+          cuda_version: ''
+          torch_backend: ''
+          ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
+          ci_token: ${{ secrets.CI_TOKEN }}
+          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
+          sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
+          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
+          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          image_tag: framework-${{ matrix.framework }}-amd64:${{ github.run_id }}
+      - name: Tag and Push Framework Images
+        uses: ./.github/actions/docker-tag-push
+        with:
+          local_image: framework-${{ matrix.framework }}-amd64:${{ github.run_id }}
+          push_tags: |
+            ${{ env.REGISTRY_IMAGE }}:main-${{ matrix.framework }}-framework-amd64
+            ${{ env.REGISTRY_IMAGE }}:main-${{ matrix.framework }}-framework-amd64-run-${{ github.run_id }}
+          aws_push: 'true'
+          azure_push: 'false'
+          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
+          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
+      - name: Build Runtime Image
+        id: build_runtime
+        uses: ./.github/actions/docker-build
+        with:
+          framework: ${{ matrix.framework }}
          target: runtime
-          platform: linux/${{ matrix.platform.arch }}
-          base_image_tag: ${{ matrix.platform.arch == 'arm64' && '25.06-cuda12.9-devel-ubuntu24.04' || '' }}
-          runtime_image_tag: ${{ matrix.platform.arch == 'arm64' && '12.9.0-runtime-ubuntu24.04' || '' }}
-          cuda_version: ${{ matrix.platform.arch == 'arm64' && '129' || '' }}
-          torch_backend: ${{ matrix.platform.arch == 'arm64' && 'cu129' || '' }}
+          platform: linux/amd64
+          base_image_tag: ''
+          runtime_image_tag: ''
+          cuda_version: ''
+          torch_backend: ''
          ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
          ci_token: ${{ secrets.CI_TOKEN }}
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
@@ -36,70 +98,77 @@ jobs:
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          image_tag: nightly-vllm-${{ matrix.platform.arch }}
-      - name: Tag and Push vLLM Nightly Image
+          image_tag: runtime-${{ matrix.framework }}-amd64:${{ github.run_id }}
+      - name: Tag and Push Runtime Images
        uses: ./.github/actions/docker-tag-push
        with:
-          local_image: ${{ steps.build-vllm.outputs.image_tag }}
-          # Tag the image nightly
-          push_tag: ai-dynamo/dynamo:nightly-vllm-${{ matrix.platform.arch }}
-          aws_push: 'false'
+          local_image: runtime-${{ matrix.framework }}-amd64:${{ github.run_id }}
+          push_tags: |
+            ${{ env.REGISTRY_IMAGE }}:${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-amd64
+            ${{ env.REGISTRY_IMAGE }}:${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-amd64-run-${{ github.run_id }}
+          aws_push: 'true'
          azure_push: 'true'
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
-      - name: Run unit tests
-        if: ${{ matrix.platform.arch != 'arm64' }}
-        uses: ./.github/actions/pytest
-        with:
-          image_tag: nightly-vllm-${{ matrix.platform.arch }}
-          pytest_marks: "vllm and unit"
-          framework: "vllm"
-          test_type: "unit"
-          platform_arch: ${{ matrix.platform.arch }}
-      - name: Run e2e tests
-        if: ${{ matrix.platform.arch != 'arm64' }}
-        uses: ./.github/actions/pytest
-        with:
-          image_tag: nightly-vllm-${{ matrix.platform.arch }}
-          pytest_marks: "nightly and vllm and gpu_1"
-          framework: "vllm"
-          test_type: "e2e"
-          platform_arch: ${{ matrix.platform.arch }}

-  ####################
-  # Framework Builds #
-  ####################
-  vllm-framework:
+  build-arm64:
+    name: Build ${{ matrix.framework }} (arm64)
+    runs-on: cpu-arm-r8g-4xlarge
+    timeout-minutes: 120
    strategy:
      fail-fast: false
      matrix:
-        platform:
-          - { arch: amd64, runner: cpu-amd-m5-4xlarge }
-          - { arch: arm64, runner: cpu-arm-r8g-4xlarge }
-    name: vllm-framework (${{ matrix.platform.arch }})
-    runs-on: ${{ matrix.platform.runner }}
+        include:
+          - framework: vllm
+            base_image_tag: '25.06-cuda12.9-devel-ubuntu24.04'
+            runtime_image_tag: '12.9.0-runtime-ubuntu24.04'
+            cuda_version: '129'
+            torch_backend: 'cu129'
+          - framework: trtllm
+            base_image_tag: '25.06-py3'
+            runtime_image_tag: ''
+            cuda_version: '129'
+            torch_backend: 'cu129'
+          - framework: sglang
+            base_image_tag: ''
+            runtime_image_tag: ''
+            cuda_version: ''
+            torch_backend: ''
    env:
-      FRAMEWORK: vllm
-    steps: &framework-build-steps
-      - name: Checkout code
-        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 #v4.3.1
+      ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
+    steps:
+      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
+      - name: Login to Container Registries
+        uses: ./.github/actions/docker-login
        with:
-          ref: main
-      - name: Build Image
-        id: build-image
+          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
+          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
+          ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
+          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
+          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
+          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
+      - name: Pull existing images for cache
+        shell: bash
+        continue-on-error: true
+        run: |
+          echo "Attempting to pull existing images for layer caching..."
+          docker pull "${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:main-${{ matrix.framework }}-framework-arm64" || echo "Framework image not found in cache"
+          docker pull "${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-arm64" || echo "Runtime image not found in cache"
+          echo "Cache pull completed"
+      - name: Build Framework Image
+        id: build_framework
        uses: ./.github/actions/docker-build
        with:
-          framework: ${{ env.FRAMEWORK }}
+          framework: ${{ matrix.framework }}
          target: framework
-          platform: linux/${{ matrix.platform.arch }}
-          # Ternary operations that are specific to vllm/arm64, empty str for all other combinations
-          base_image_tag: ${{ (matrix.platform.arch == 'arm64' && env.FRAMEWORK == 'vllm') && '25.06-cuda12.9-devel-ubuntu24.04' || '' }}
-          runtime_image_tag: ${{ (matrix.platform.arch == 'arm64' && env.FRAMEWORK == 'vllm') && '12.9.0-runtime-ubuntu24.04' || '' }}
-          cuda_version: ${{ (matrix.platform.arch == 'arm64' && env.FRAMEWORK == 'vllm') && '129' || '' }}
-          torch_backend: ${{ (matrix.platform.arch == 'arm64' && env.FRAMEWORK == 'vllm') && 'cu129' || '' }}
+          platform: linux/arm64
+          base_image_tag: ${{ matrix.base_image_tag }}
+          runtime_image_tag: ${{ matrix.runtime_image_tag }}
+          cuda_version: ${{ matrix.cuda_version }}
+          torch_backend: ${{ matrix.torch_backend }}
          ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
          ci_token: ${{ secrets.CI_TOKEN }}
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
@@ -107,39 +176,630 @@ jobs:
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-      - name: Docker Tag and Push
+          image_tag: framework-${{ matrix.framework }}-arm64:${{ github.run_id }}
+      - name: Tag and Push Framework Images
        uses: ./.github/actions/docker-tag-push
        with:
-          local_image: ${{ steps.build-image.outputs.image_tag }}
-          push_tag: ai-dynamo/dynamo:main-${{ env.FRAMEWORK }}-framework-${{ matrix.platform.arch }}
+          local_image: framework-${{ matrix.framework }}-arm64:${{ github.run_id }}
+          push_tags: |
+            ${{ env.REGISTRY_IMAGE }}:main-${{ matrix.framework }}-framework-arm64
+            ${{ env.REGISTRY_IMAGE }}:main-${{ matrix.framework }}-framework-arm64-run-${{ github.run_id }}
          aws_push: 'true'
          azure_push: 'false'
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
+      - name: Build Runtime Image
+        id: build_runtime
+        uses: ./.github/actions/docker-build
+        with:
+          framework: ${{ matrix.framework }}
+          target: runtime
+          platform: linux/arm64
+          base_image_tag: ${{ matrix.base_image_tag }}
+          runtime_image_tag: ${{ matrix.runtime_image_tag }}
+          cuda_version: ${{ matrix.cuda_version }}
+          torch_backend: ${{ matrix.torch_backend }}
+          ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
+          ci_token: ${{ secrets.CI_TOKEN }}
+          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
+          sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
+          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
+          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          image_tag: runtime-${{ matrix.framework }}-arm64:${{ github.run_id }}
+      - name: Tag and Push Runtime Images
+        uses: ./.github/actions/docker-tag-push
+        with:
+          local_image: runtime-${{ matrix.framework }}-arm64:${{ github.run_id }}
+          push_tags: |
+            ${{ env.REGISTRY_IMAGE }}:${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-arm64
+            ${{ env.REGISTRY_IMAGE }}:${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-arm64-run-${{ github.run_id }}
+          aws_push: 'true'
+          azure_push: 'true'
+          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
+          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
-  sglang-framework:
+
+############################## TEST JOBS ##############################
+
+  # Runs the unit-marked pytest suite for every framework/arch pair against the
+  # image tagged <NIGHTLY_IMAGE_PREFIX>-<framework>-<arch> in ECR.
+  unit-tests:
+    name: ${{ matrix.framework }}-${{ matrix.arch.arch }}-unit
+    needs: [build-amd64, build-arm64]
+    # always(): start even when a build job failed; the first step then fails
+    # only the matrix legs whose own build did not succeed.
+    if: always()
+    runs-on: ${{ matrix.arch.runner }}
+    timeout-minutes: 45
+    strategy:
+      fail-fast: false
+      matrix:
+        framework: [vllm, trtllm, sglang]
+        arch:
+          - arch: amd64
+            runner: gpu-l40-amd64
+          - arch: arm64
+            runner: cpu-arm-r8g-4xlarge
+    steps:
+      - uses: actions/checkout@v4
+      - name: Check if build succeeded
+        id: check_build
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          set +x
+          echo "Checking build status for ${{ matrix.framework }} (${{ matrix.arch.arch }})"
+          # Name of the build job whose conclusion gates this matrix leg
+          BUILD_JOB_NAME="Build ${{ matrix.framework }} (${{ matrix.arch.arch }})"
+          # Query GitHub API for job status using curl (token from env to avoid log exposure).
+          # curl is tested directly in the `if`: the default run shell uses `-e`, so a
+          # failing plain assignment would abort the step before any `$?` check ran.
+          # stderr is not captured, so curl diagnostics cannot end up in the JSON.
+          if ! JOBS=$(curl -s -S -L --fail-with-body \
+            -H "Authorization: Bearer ${GITHUB_TOKEN}" \
+            -H "Accept: application/vnd.github.v3+json" \
+            "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100"); then
+            echo "Error: Failed to query GitHub API"
+            exit 1
+          fi
+          # Find the specific build job and check its conclusion
+          # (an unmatched job name yields an empty string, which also fails below)
+          BUILD_STATUS=$(echo "$JOBS" | jq -r --arg job_name "$BUILD_JOB_NAME" '.jobs[] | select(.name == $job_name) | .conclusion')
+          echo "Build status for '$BUILD_JOB_NAME': $BUILD_STATUS"
+          if [ "$BUILD_STATUS" != "success" ]; then
+            echo "Build failed or did not complete successfully. Failing tests."
+            exit 1
+          fi
+          echo "Build succeeded. Proceeding with tests."
+      - name: Login to Container Registries
+        uses: ./.github/actions/docker-login
+        with:
+          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
+          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
+      - name: Pull nightly image
+        shell: bash
+        env:
+          ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
+          IMAGE_TAG: ${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
+        run: |
+          # Pull from ECR, then retag to the short local name passed to the pytest action
+          docker pull ${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG}
+          docker tag ${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG} ${IMAGE_TAG}
+      - name: Run Unit Tests
+        uses: ./.github/actions/pytest
+        with:
+          image_tag: ${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
+          pytest_marks: "unit and (nightly or post_merge or pre_merge)"
+          framework: ${{ matrix.framework }}
+          test_type: unit
+          platform_arch: ${{ matrix.arch.arch }}
+          cpu_limit: '8'
+          # dry_run is 'true' only for the arm64 legs
+          dry_run: ${{ matrix.arch.arch == 'arm64' && 'true' || 'false' }}
+
+
+  # Runs the integration-marked pytest suite for every framework/arch pair
+  # against the image tagged <NIGHTLY_IMAGE_PREFIX>-<framework>-<arch> in ECR.
+  integration-tests:
+    name: ${{ matrix.framework }}-${{ matrix.arch.arch }}-integ
+    needs: [build-amd64, build-arm64]
+    # always(): start even when a build job failed; the first step then fails
+    # only the matrix legs whose own build did not succeed.
+    if: always()
+    runs-on: ${{ matrix.arch.runner }}
+    timeout-minutes: ${{ matrix.arch.timeout }}
    strategy:
      fail-fast: false
      matrix:
-        platform:
-          - { arch: amd64, runner: cpu-amd-m5-4xlarge }
-          - { arch: arm64, runner: cpu-arm-r8g-4xlarge }
-    name: sglang-framework (${{ matrix.platform.arch }})
-    runs-on: ${{ matrix.platform.runner }}
+        framework: [vllm, trtllm, sglang]
+        arch:
+          - arch: amd64
+            runner: gpu-l40-amd64
+            timeout: 90
+          - arch: arm64
+            runner: cpu-arm-r8g-4xlarge
+            timeout: 90
+    steps:
+      - uses: actions/checkout@v4
+      - name: Check if build succeeded
+        id: check_build
        env:
-      FRAMEWORK: sglang
-    steps: *framework-build-steps
-  trtllm-framework:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          set +x
+          echo "Checking build status for ${{ matrix.framework }} (${{ matrix.arch.arch }})"
+          # Name of the build job whose conclusion gates this matrix leg
+          BUILD_JOB_NAME="Build ${{ matrix.framework }} (${{ matrix.arch.arch }})"
+          # curl is tested directly in the `if`: the default run shell uses `-e`, so a
+          # failing plain assignment would abort the step before any `$?` check ran.
+          # stderr is not captured, so curl diagnostics cannot end up in the JSON.
+          if ! JOBS=$(curl -s -S -L --fail-with-body \
+            -H "Authorization: Bearer ${GITHUB_TOKEN}" \
+            -H "Accept: application/vnd.github.v3+json" \
+            "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100"); then
+            echo "Error: Failed to query GitHub API"
+            exit 1
+          fi
+          # An unmatched job name yields an empty string, which also fails below
+          BUILD_STATUS=$(echo "$JOBS" | jq -r --arg job_name "$BUILD_JOB_NAME" '.jobs[] | select(.name == $job_name) | .conclusion')
+          echo "Build status for '$BUILD_JOB_NAME': $BUILD_STATUS"
+          if [ "$BUILD_STATUS" != "success" ]; then
+            echo "Build failed or did not complete successfully. Marking tests as failed."
+            exit 1
+          fi
+          echo "Build succeeded. Proceeding with tests."
+      - name: Login to Container Registries
+        uses: ./.github/actions/docker-login
+        with:
+          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
+          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
+      - name: Pull nightly image
+        shell: bash
+        env:
+          ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
+          IMAGE_TAG: ${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
+        run: |
+          # Pull from ECR, then retag to the short local name passed to the pytest action
+          docker pull ${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG}
+          docker tag ${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG} ${IMAGE_TAG}
+      - name: Run Integration Tests
+        uses: ./.github/actions/pytest
+        with:
+          image_tag: ${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
+          pytest_marks: "integration and (nightly or post_merge or pre_merge)"
+          framework: ${{ matrix.framework }}
+          test_type: integration
+          platform_arch: ${{ matrix.arch.arch }}
+          # dry_run is 'true' only for the arm64 legs
+          dry_run: ${{ matrix.arch.arch == 'arm64' && 'true' || 'false' }}
+
+
+  # Runs the per-framework e2e tests marked gpu_1 for every framework/arch pair
+  # against the image tagged <NIGHTLY_IMAGE_PREFIX>-<framework>-<arch> in ECR.
+  e2e-single-gpu-tests:
+    name: ${{ matrix.framework }}-${{ matrix.arch.arch }}-1gpu-e2e
+    needs: [build-amd64, build-arm64]
+    # always(): start even when a build job failed; the first step then fails
+    # only the matrix legs whose own build did not succeed.
+    if: always()
+    runs-on: ${{ matrix.arch.runner }}
+    timeout-minutes: ${{ matrix.arch.timeout }}
    strategy:
      fail-fast: false
      matrix:
-        platform:
-          - { arch: amd64, runner: cpu-amd-m5-4xlarge }
-          - { arch: arm64, runner: cpu-arm-r8g-4xlarge }
-    name: trtllm-framework (${{ matrix.platform.arch }})
-    runs-on: ${{ matrix.platform.runner }}
+        framework: [vllm, trtllm, sglang]
+        arch:
+          - arch: amd64
+            runner: gpu-l40-amd64
+            timeout: 120
+          - arch: arm64
+            runner: cpu-arm-r8g-4xlarge
+            timeout: 120
+    steps:
+      - uses: actions/checkout@v4
+      - name: Check if build succeeded
+        id: check_build
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          set +x
+          echo "Checking build status for ${{ matrix.framework }} (${{ matrix.arch.arch }})"
+          # Name of the build job whose conclusion gates this matrix leg
+          BUILD_JOB_NAME="Build ${{ matrix.framework }} (${{ matrix.arch.arch }})"
+          # curl is tested directly in the `if`: the default run shell uses `-e`, so a
+          # failing plain assignment would abort the step before any `$?` check ran.
+          # stderr is not captured, so curl diagnostics cannot end up in the JSON.
+          # An API failure fails the leg: no later step reads a `skip` output, so
+          # exiting 0 here would run the tests without the build having been verified.
+          if ! JOBS=$(curl -s -S -L --fail-with-body \
+            -H "Authorization: Bearer ${GITHUB_TOKEN}" \
+            -H "Accept: application/vnd.github.v3+json" \
+            "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100"); then
+            echo "Error: Failed to query GitHub API"
+            exit 1
+          fi
+          # An unmatched job name yields an empty string, which also fails below
+          BUILD_STATUS=$(echo "$JOBS" | jq -r --arg job_name "$BUILD_JOB_NAME" '.jobs[] | select(.name == $job_name) | .conclusion')
+          echo "Build status for '$BUILD_JOB_NAME': $BUILD_STATUS"
+          if [ "$BUILD_STATUS" != "success" ]; then
+            echo "Build failed or did not complete successfully. Failing tests."
+            exit 1
+          fi
+          echo "Build succeeded. Proceeding with tests."
+      - name: Login to Container Registries
+        uses: ./.github/actions/docker-login
+        with:
+          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
+          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
+      - name: Pull nightly image
+        shell: bash
+        env:
+          ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
+          IMAGE_TAG: ${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
+        run: |
+          # Pull from ECR, then retag to the short local name passed to the pytest action
+          docker pull ${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG}
+          docker tag ${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG} ${IMAGE_TAG}
+      - name: Run E2E Tests (gpu_1)
+        uses: ./.github/actions/pytest
+        with:
+          image_tag: ${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
+          pytest_marks: "${{ matrix.framework }} and e2e and gpu_1"
+          framework: ${{ matrix.framework }}
+          test_type: e2e-single-gpu
+          platform_arch: ${{ matrix.arch.arch }}
+          # dry_run is 'true' only for the arm64 legs
+          dry_run: ${{ matrix.arch.arch == 'arm64' && 'true' || 'false' }}
+
+
+  # Runs the e2e tests marked gpu_2 for every framework/arch pair against the
+  # image tagged <NIGHTLY_IMAGE_PREFIX>-<framework>-<arch> in ECR.
+  e2e-multi-gpu-tests:
+    name: ${{ matrix.framework }}-${{ matrix.arch.arch }}-2gpu-e2e
+    needs: [build-amd64, build-arm64]
+    # always(): start even when a build job failed; the first step then fails
+    # only the matrix legs whose own build did not succeed.
+    if: always()
+    runs-on: ${{ matrix.arch.runner }}
+    timeout-minutes: ${{ matrix.arch.timeout }}
+    strategy:
+      fail-fast: false
+      matrix:
+        framework: [vllm, trtllm, sglang]
+        arch:
+          - arch: amd64
+            runner: gpu-l40-amd64
+            timeout: 150
+          - arch: arm64
+            runner: cpu-arm-r8g-4xlarge
+            timeout: 150
+    steps:
+      - uses: actions/checkout@v4
+      - name: Check if build succeeded
+        id: check_build
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          set +x
+          echo "Checking build status for ${{ matrix.framework }} (${{ matrix.arch.arch }})"
+          # Name of the build job whose conclusion gates this matrix leg
+          BUILD_JOB_NAME="Build ${{ matrix.framework }} (${{ matrix.arch.arch }})"
+          # curl is tested directly in the `if`: the default run shell uses `-e`, so a
+          # failing plain assignment would abort the step before any `$?` check ran.
+          # stderr is not captured, so curl diagnostics cannot end up in the JSON.
+          # An API failure fails the leg: no later step reads a `skip` output, so
+          # exiting 0 here would run the tests without the build having been verified.
+          if ! JOBS=$(curl -s -S -L --fail-with-body \
+            -H "Authorization: Bearer ${GITHUB_TOKEN}" \
+            -H "Accept: application/vnd.github.v3+json" \
+            "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100"); then
+            echo "Error: Failed to query GitHub API"
+            exit 1
+          fi
+          # An unmatched job name yields an empty string, which also fails below
+          BUILD_STATUS=$(echo "$JOBS" | jq -r --arg job_name "$BUILD_JOB_NAME" '.jobs[] | select(.name == $job_name) | .conclusion')
+          echo "Build status for '$BUILD_JOB_NAME': $BUILD_STATUS"
+          if [ "$BUILD_STATUS" != "success" ]; then
+            echo "Build failed or did not complete successfully. Marking tests as failed."
+            exit 1
+          fi
+          echo "Build succeeded. Proceeding with tests."
+      - name: Login to Container Registries
+        uses: ./.github/actions/docker-login
+        with:
+          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
+          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
+      - name: Pull nightly image
+        shell: bash
        env:
-      FRAMEWORK: trtllm
-    steps: *framework-build-steps
+          ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
+          IMAGE_TAG: ${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
+        run: |
+          # Pull from ECR, then retag to the short local name passed to the pytest action
+          docker pull ${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG}
+          docker tag ${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG} ${IMAGE_TAG}
+      - name: Run E2E Tests (gpu_2)
+        uses: ./.github/actions/pytest
+        with:
+          image_tag: ${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
+          pytest_marks: "(nightly or post_merge or pre_merge) and e2e and gpu_2"
+          framework: ${{ matrix.framework }}
+          test_type: e2e-multi-gpu
+          platform_arch: ${{ matrix.arch.arch }}
+          # NOTE(review): dry_run is 'true' on every leg here, unlike the other test
+          # jobs which only set it for arm64 - confirm this is intentional.
+          dry_run: 'true'
+
+
+  # component-tests:
+  #   name: ${{ matrix.framework }}-${{ matrix.arch.arch }}-${{ matrix.component }}
+  #   needs: [build-amd64, build-arm64]
+  #   if: always()
+  #   runs-on: ${{ matrix.arch.runner }}
+  #   timeout-minutes: ${{ matrix.arch.timeout }}
+  #   strategy:
+  #     fail-fast: false
+  #     matrix:
+  #       framework: [vllm, trtllm, sglang]
+  #       arch:
+  #         - arch: amd64
+  #           runner: gpu-l40-amd64
+  #           timeout: 90
+  #           component: router
+  #           marks: "nightly and router"
+  #         - arch: amd64
+  #           runner: gpu-l40-amd64
+  #           timeout: 90
+  #           component: planner
+  #           marks: "nightly and planner"
+  #         - arch: amd64
+  #           runner: gpu-l40-amd64
+  #           timeout: 150
+  #           component: kvbm
+  #           marks: "nightly and (kvbm or kvbm_v2)"
+  #         - arch: arm64
+  #           runner: cpu-arm-r8g-4xlarge
+  #           timeout: 60
+  #           component: router
+  #           marks: "nightly and router"
+  #         - arch: arm64
+  #           runner: cpu-arm-r8g-4xlarge
+  #           timeout: 60
+  #           component: planner
+  #           marks: "nightly and planner"
+  #         - arch: arm64
+  #           runner: cpu-arm-r8g-4xlarge
+  #           timeout: 150
+  #           component: kvbm
+  #           marks: "nightly and (kvbm or kvbm_v2)"
+  #         - arch: amd64
+  #           runner: gpu-l40-amd64
+  #           timeout: 90
+  #           component: router
+  #           marks: "nightly and router"
+  #         - arch: amd64
+  #           runner: gpu-l40-amd64
+  #           timeout: 90
+  #           component: planner
+  #           marks: "nightly and planner"
+  #         - arch: amd64
+  #           runner: gpu-l40-amd64
+  #           timeout: 150
+  #           component: kvbm
+  #           marks: "nightly and (kvbm or kvbm_v2)"
+  #         - arch: arm64
+  #           runner: cpu-arm-r8g-4xlarge
+  #           timeout: 60
+  #           component: router
+  #           marks: "nightly and router"
+  #         - arch: arm64
+  #           runner: cpu-arm-r8g-4xlarge
+  #           timeout: 60
+  #           component: planner
+  #           marks: "nightly and planner"
+  #         - arch: arm64
+  #           runner: cpu-arm-r8g-4xlarge
+  #           timeout: 150
+  #           component: kvbm
+  #           marks: "nightly and (kvbm or kvbm_v2)"
+  #         - arch: amd64
+  #           runner: gpu-l40-amd64
+  #           timeout: 90
+  #           component: router
+  #           marks: "nightly and router"
+  #         - arch: amd64
+  #           runner: gpu-l40-amd64
+  #           timeout: 90
+  #           component: planner
+  #           marks: "nightly and planner"
+  #         - arch: amd64
+  #           runner: gpu-l40-amd64
+  #           timeout: 150
+  #           component: kvbm
+  #           marks: "nightly and (kvbm or kvbm_v2)"
+  #         - arch: arm64
+  #           runner: cpu-arm-r8g-4xlarge
+  #           timeout: 60
+  #           component: router
+  #           marks: "nightly and router"
+  #         - arch: arm64
+  #           runner: cpu-arm-r8g-4xlarge
+  #           timeout: 60
+  #           component: planner
+  #           marks: "nightly and planner"
+  #         - arch: arm64
+  #           runner: cpu-arm-r8g-4xlarge
+  #           timeout: 150
+  #           component: kvbm
+  #           marks: "nightly and (kvbm or kvbm_v2)"
+
+  #   steps:
+  #     - uses: actions/checkout@v4
+  #     - name: Check if build succeeded
+  #       id: check_build
+  #       env:
+  #         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+  #       run: |
+  #         set +x
+  #         echo "Checking build status for ${{ matrix.framework }} (${{ matrix.arch.arch }})"
+
+  #         if [ "${{ matrix.arch.arch }}" = "amd64" ]; then
+  #           BUILD_JOB_NAME="Build ${{ matrix.framework }} (amd64)"
+  #         else
+  #           BUILD_JOB_NAME="Build ${{ matrix.framework }} (arm64)"
+  #         fi
+
+  #         JOBS=$(curl -s -S -L --fail-with-body \
+  #           -H "Authorization: Bearer ${GITHUB_TOKEN}" \
+  #           -H "Accept: application/vnd.github.v3+json" \
+  #           "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100" 2>&1)
+
+  #         if [ $? -ne 0 ]; then
+  #           echo "Error: Failed to query GitHub API"
+  #           echo "skip=true" >> $GITHUB_OUTPUT
+  #           exit 0
+  #         fi
+
+  #         BUILD_STATUS=$(echo "$JOBS" | jq -r --arg job_name "$BUILD_JOB_NAME" '.jobs[] | select(.name == $job_name) | .conclusion')
+
+  #         echo "Build status for '$BUILD_JOB_NAME': $BUILD_STATUS"
+
+  #         if [ "$BUILD_STATUS" != "success" ]; then
+  #           echo "Build failed or did not complete successfully. Marking tests as failed."
+  #           exit 1
+  #         fi
+
+  #         echo "Build succeeded. Proceeding with tests."
+  #     - name: Login to Container Registries
+  #       uses: ./.github/actions/docker-login
+  #       with:
+  #         aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
+  #         aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
+  #     - name: Pull nightly image
+  #       shell: bash
+  #       env:
+  #         ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
+  #         IMAGE_TAG: ${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
+  #       run: |
+  #         docker pull ${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG}
+  #         docker tag ${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG} ${IMAGE_TAG}
+  #     - name: Run Component Tests (${{ matrix.component }})
+  #       uses: ./.github/actions/pytest
+  #       with:
+  #         image_tag: ${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
+  #         pytest_marks: "${{ matrix.marks }}"
+  #         framework: ${{ matrix.framework }}
+  #         test_type: component-${{ matrix.component }}
+  #         platform_arch: ${{ matrix.arch.arch }}
+
+  ############################## RESULTS SUMMARY ##############################
+  # Builds a markdown table (one row per job of this run) from the GitHub jobs
+  # API, prints it, appends it to the job summary and uploads it as an artifact.
+  results-summary:
+    name: Results Summary
+    runs-on: ubuntu-latest
+    if: always()
+    needs: [build-amd64, build-arm64, unit-tests, integration-tests, e2e-single-gpu-tests, e2e-multi-gpu-tests] # component-tests
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+      - name: Gather job metadata
+        id: gather
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          # pipefail: without it a failed curl is masked by jq's exit status and the
+          # table would silently come out empty.
+          set +x -eo pipefail
+          echo "# Nightly CI Results Summary" > results.md
+          echo "" >> results.md
+          echo "| Stage | Status | Runner | Duration (min) | Artifacts |" >> results.md
+          echo "|-------|--------|--------|----------------|-----------|" >> results.md
+
+          # One compact JSON object per job, one job per line. curl's stderr is
+          # left visible so an API failure shows its reason in the log.
+          curl -s -S -L --fail-with-body \
+            -H "Authorization: Bearer ${GITHUB_TOKEN}" \
+            -H "Accept: application/vnd.github.v3+json" \
+            "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100" \
+            | jq -c '.jobs[] | {id, name, runner_name, conclusion, started_at, completed_at}' > jobs.jsonl
+
+          # IFS= and -r keep every JSON line byte-for-byte (no backslash processing)
+          while IFS= read -r job_entry; do
+            job_id=$(echo "$job_entry" | jq -r '.id')
+            name=$(echo "$job_entry" | jq -r '.name')
+            runner=$(echo "$job_entry" | jq -r '.runner_name')
+            status=$(echo "$job_entry" | jq -r '.conclusion')
+            started=$(echo "$job_entry" | jq -r '.started_at')
+            completed=$(echo "$job_entry" | jq -r '.completed_at')
+            # Duration is only computable once both timestamps are present
+            minutes="N/A"
+            if [[ "$started" != "null" && "$completed" != "null" ]]; then
+              start_epoch=$(date -d "$started" +%s)
+              end_epoch=$(date -d "$completed" +%s)
+              minutes=$(( (end_epoch - start_epoch)/60 ))
+            fi
+            artifact_link="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}#job-$job_id"
+            printf "| %s | %s | %s | %s | [Log & Artifacts](%s) |\n" "$name" "$status" "$runner" "$minutes" "$artifact_link" >> results.md
+          done < jobs.jsonl
+
+          echo "" >> results.md
+          echo "---" >> results.md
+      - name: Display workflow summary
+        run: cat results.md
+      - name: Upload results summary as job summary
+        run: cat results.md >> "$GITHUB_STEP_SUMMARY"
+      - name: Upload results as artifact for Slack
+        uses: actions/upload-artifact@v4
+        # always(): upload whatever was written even if an earlier step failed
+        if: always()
+        with:
+          name: nightly-results-summary
+          path: results.md
+          retention-days: 7
+
+
+  ############################## SLACK NOTIFICATION ##############################
+  # Posts a pass/fail summary of this run to Slack. Only runs for scheduled
+  # runs outside forks, and only when the webhook secret is configured.
+  notify-slack:
+    name: Notify Slack
+    runs-on: cpu-amd-m5-4xlarge
+    if: always() && github.event_name == 'schedule' && !github.event.repository.fork
+    needs: results-summary
+    permissions:
+      contents: read
+    env:
+      # Secrets cannot be referenced in step-level `if:`, so expose a flag instead
+      HAS_SLACK_WEBHOOK: ${{ secrets.SLACK_NOTIFY_NIGHTLY_WEBHOOK_URL != '' }}
+    steps:
+      - name: Send Slack notification
+        if: env.HAS_SLACK_WEBHOOK == 'true'
+        continue-on-error: true
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_NOTIFY_NIGHTLY_WEBHOOK_URL }}
+          RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+        run: |
+          set -euo pipefail
+
+          JOBS_JSON=$(mktemp)
+          trap 'rm -f "$JOBS_JSON"' EXIT
+
+          # -f makes HTTP error responses fail here instead of being saved as job data
+          if ! curl -fsSL \
+            -H "Authorization: Bearer ${GITHUB_TOKEN}" \
+            -H "Accept: application/vnd.github+json" \
+            "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100" \
+            > "$JOBS_JSON"; then
+            echo "Error: Failed to fetch job data from GitHub API"
+            exit 1
+          fi
+
+          if [ ! -s "$JOBS_JSON" ]; then
+            echo "Error: No job data received"
+            exit 1
+          fi
+
+          # Count only completed jobs: this job is still running when it queries
+          # the API, so including it would make an all-green run read (N-1)/N.
+          TOTAL_JOBS=$(jq '[.jobs[] | select(.status == "completed")] | length' "$JOBS_JSON")
+          SUCCESS_COUNT=$(jq '[.jobs[] | select(.conclusion == "success")] | length' "$JOBS_JSON")
+          FAILED_COUNT=$(jq '[.jobs[] | select(.conclusion == "failure")] | length' "$JOBS_JSON")
+
+          if [ "$FAILED_COUNT" -eq 0 ]; then
+            STATUS="Success ✅"
+          else
+            STATUS="Failed ❌"
+          fi
+
+          # Main message with summary
+          SUMMARY_TEXT="*Nightly CI Pipeline - ${STATUS}*"$'\n'"Summary: ${SUCCESS_COUNT}/${TOTAL_JOBS} jobs passed"$'\n'"<${RUN_URL}|View Workflow Summary>"
+
+          if [ "$FAILED_COUNT" -eq 0 ]; then
+            # Success - simple message
+            PAYLOAD=$(jq -n \
+              --arg text "$SUMMARY_TEXT" \
+              '{text: $text}')
+          else
+            # Failed - message with blocks
+            FAILED_JOBS=$(jq -r '.jobs[] | select(.conclusion == "failure") | "• " + .name' "$JOBS_JSON")
+            FAILED_JOBS_TEXT="*Failed Jobs (${FAILED_COUNT}):*"$'\n'"${FAILED_JOBS}"
+
+            # jq builds the payload so job names are JSON-escaped correctly
+            PAYLOAD=$(jq -n \
+              --arg summary "$SUMMARY_TEXT" \
+              --arg failed "$FAILED_JOBS_TEXT" \
+              '{
+                text: $summary,
+                blocks: [
+                  {
+                    type: "section",
+                    text: {
+                      type: "mrkdwn",
+                      text: $summary
+                    }
+                  },
+                  {
+                    type: "section",
+                    text: {
+                      type: "mrkdwn",
+                      text: $failed
+                    }
+                  }
+                ]
+              }')
+          fi
+
+          if curl -sSf -X POST -H "Content-Type: application/json" -d "$PAYLOAD" "$SLACK_WEBHOOK_URL"; then
+            echo "Slack notification sent successfully"
+          else
+            echo "Warning: Failed to send Slack notification"
+            exit 1
+          fi
--- a/container/Dockerfile
+++ b/container/Dockerfile
@@ -384,10 +384,15 @@ RUN uv pip install \
    /opt/dynamo/wheelhouse/ai_dynamo*any.whl \
    /opt/dynamo/wheelhouse/nixl/nixl*.whl && \
    if [ "$ENABLE_KVBM" = "true" ]; then \
-        uv pip install /opt/dynamo/wheelhouse/kvbm*.whl; \
-    fi \
-    && cd /workspace/benchmarks \
-    && UV_GIT_LFS=1 uv pip install --no-cache .
+        KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \
+        if [ -z "$KVBM_WHEEL" ]; then \
+            echo "ERROR: ENABLE_KVBM is true but no KVBM wheel found in wheelhouse" >&2; \
+            exit 1; \
+        fi; \
+        uv pip install "$KVBM_WHEEL"; \
+    fi && \
+    cd /workspace/benchmarks && \
+    UV_GIT_LFS=1 uv pip install --no-cache .

 # Setup launch banner in common directory accessible to all users
 RUN --mount=type=bind,source=./container/launch_message/runtime.txt,target=/opt/dynamo/launch_message.txt \

--- a/container/Dockerfile.sglang
+++ b/container/Dockerfile.sglang
@@ -376,6 +376,7 @@ RUN chmod 755 /opt/dynamo/.launch_screen && \
 USER dynamo

 # Copy tests, benchmarks, deploy and components for CI with correct ownership
+COPY --chown=dynamo: pyproject.toml /workspace/
 COPY --chown=dynamo: tests /workspace/tests
 COPY --chown=dynamo: examples /workspace/examples
 COPY --chown=dynamo: benchmarks /workspace/benchmarks
@@ -477,7 +478,7 @@ RUN curl --retry 3 --retry-delay 2 -LSso /usr/local/bin/clang-format https://git
    && rm -rf clangd_18.1.3 clangd.zip

 # Editable install of dynamo
-COPY pyproject.toml README.md hatch_build.py /workspace/
+COPY README.md hatch_build.py /workspace/
 RUN python3 -m pip install --no-deps -e .

 # Install Python development packages

--- a/container/Dockerfile.trtllm
+++ b/container/Dockerfile.trtllm
@@ -334,12 +334,17 @@ RUN uv pip install \
      --no-cache \
      /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
      /opt/dynamo/wheelhouse/ai_dynamo*any.whl \
-      /opt/dynamo/wheelhouse/nixl/nixl*.whl \
-    && if [ "${ENABLE_KVBM}" = "true" ]; then \
-        uv pip install --no-cache /opt/dynamo/wheelhouse/kvbm*.whl; \
-       fi \
-    && cd /workspace/benchmarks \
-    && UV_GIT_LFS=1 uv pip install --no-cache .
+      /opt/dynamo/wheelhouse/nixl/nixl*.whl && \
+    if [ "${ENABLE_KVBM}" = "true" ]; then \
+        KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \
+        if [ -z "$KVBM_WHEEL" ]; then \
+            echo "ERROR: ENABLE_KVBM is true but no KVBM wheel found in wheelhouse" >&2; \
+            exit 1; \
+        fi; \
+        uv pip install --no-cache "$KVBM_WHEEL"; \
+    fi && \
+    cd /workspace/benchmarks && \
+    UV_GIT_LFS=1 uv pip install --no-cache .

 # Install common and test dependencies
 RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \
@@ -352,7 +357,8 @@ RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requi
        --requirement /tmp/requirements.test.txt \
        cupy-cuda13x

-# Copy tests, benchmarks, deploy and components for CI
+# Copy tests, benchmarks, deploy and components for CI with correct ownership
+COPY --chown=dynamo: pyproject.toml /workspace/
 COPY --chown=dynamo: tests /workspace/tests
 COPY --chown=dynamo: examples /workspace/examples
 COPY --chown=dynamo: deploy /workspace/deploy
@@ -442,7 +448,7 @@ COPY --from=dynamo_base /usr/local/cargo /usr/local/cargo
 RUN uv pip install --no-cache maturin[patchelf]

 # Editable install of dynamo
-COPY pyproject.toml README.md hatch_build.py /workspace/
+COPY README.md hatch_build.py /workspace/
 RUN uv pip install --no-cache --no-deps -e .

 CMD []
--- a/container/Dockerfile.vllm
+++ b/container/Dockerfile.vllm
@@ -275,12 +275,17 @@ COPY --chown=dynamo: --from=dynamo_base /opt/dynamo/wheelhouse/ /opt/dynamo/whee
 RUN uv pip install \
      /opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
      /opt/dynamo/wheelhouse/ai_dynamo*any.whl \
-      /opt/dynamo/wheelhouse/nixl/nixl*.whl \
-    && if [ "${ENABLE_KVBM}" = "true" ]; then \
-        uv pip install /opt/dynamo/wheelhouse/kvbm*.whl; \
-       fi \
-    && cd /workspace/benchmarks \
-    && UV_GIT_LFS=1 uv pip install --no-cache .
+      /opt/dynamo/wheelhouse/nixl/nixl*.whl && \
+    if [ "${ENABLE_KVBM}" = "true" ]; then \
+        KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \
+        if [ -z "$KVBM_WHEEL" ]; then \
+            echo "ERROR: ENABLE_KVBM is true but no KVBM wheel found in wheelhouse" >&2; \
+            exit 1; \
+        fi; \
+        uv pip install "$KVBM_WHEEL"; \
+    fi && \
+    cd /workspace/benchmarks && \
+    UV_GIT_LFS=1 uv pip install --no-cache .

 # Install common and test dependencies
 RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \

--- a/container/build.sh
+++ b/container/build.sh
@@ -898,7 +898,7 @@ if [[ -z "${DEV_IMAGE_INPUT:-}" ]]; then
        # Use BuildKit for enhanced metadata
        if [ -z "$RUN_PREFIX" ]; then
            if docker buildx version &>/dev/null; then
-                docker buildx build --progress=plain --load -f "${SOURCE_DIR}/Dockerfile" --target runtime $PLATFORM $BUILD_ARGS $CACHE_FROM $CACHE_TO --tag $DYNAMO_BASE_IMAGE $BUILD_CONTEXT_ARG $BUILD_CONTEXT $NO_CACHE 2>&1 | tee "${BASE_BUILD_LOG}"
+                docker buildx build --builder default --progress=plain --load -f "${SOURCE_DIR}/Dockerfile" --target runtime $PLATFORM $BUILD_ARGS $CACHE_FROM $CACHE_TO --tag $DYNAMO_BASE_IMAGE $BUILD_CONTEXT_ARG $BUILD_CONTEXT $NO_CACHE 2>&1 | tee "${BASE_BUILD_LOG}"
                BUILD_EXIT_CODE=${PIPESTATUS[0]}
            else
                DOCKER_BUILDKIT=1 docker build --progress=plain -f "${SOURCE_DIR}/Dockerfile" --target runtime $PLATFORM $BUILD_ARGS $CACHE_FROM $CACHE_TO --tag $DYNAMO_BASE_IMAGE $BUILD_CONTEXT_ARG $BUILD_CONTEXT $NO_CACHE 2>&1 | tee "${BASE_BUILD_LOG}"
@@ -924,7 +924,7 @@ if [[ -z "${DEV_IMAGE_INPUT:-}" ]]; then
        # Use BuildKit for enhanced metadata
        if [ -z "$RUN_PREFIX" ]; then
            if docker buildx version &>/dev/null; then
-                docker buildx build --progress=plain --load -f $DOCKERFILE $TARGET_STR $PLATFORM $BUILD_ARGS $CACHE_FROM $CACHE_TO $TAG $LATEST_TAG $BUILD_CONTEXT_ARG $BUILD_CONTEXT $NO_CACHE 2>&1 | tee "${FRAMEWORK_BUILD_LOG}"
+                docker buildx build --builder default --progress=plain --load -f $DOCKERFILE $TARGET_STR $PLATFORM $BUILD_ARGS $CACHE_FROM $CACHE_TO $TAG $LATEST_TAG $BUILD_CONTEXT_ARG $BUILD_CONTEXT $NO_CACHE 2>&1 | tee "${FRAMEWORK_BUILD_LOG}"
                BUILD_EXIT_CODE=${PIPESTATUS[0]}
            else
                DOCKER_BUILDKIT=1 docker build --progress=plain -f $DOCKERFILE $TARGET_STR $PLATFORM $BUILD_ARGS $CACHE_FROM $CACHE_TO $TAG $LATEST_TAG $BUILD_CONTEXT_ARG $BUILD_CONTEXT $NO_CACHE 2>&1 | tee "${FRAMEWORK_BUILD_LOG}"
@@ -946,7 +946,7 @@ if [[ -z "${DEV_IMAGE_INPUT:-}" ]]; then
        # Use BuildKit for enhanced metadata
        if [ -z "$RUN_PREFIX" ]; then
            if docker buildx version &>/dev/null; then
-                docker buildx build --progress=plain --load -f $DOCKERFILE $TARGET_STR $PLATFORM $BUILD_ARGS $CACHE_FROM $CACHE_TO $TAG $LATEST_TAG $BUILD_CONTEXT_ARG $BUILD_CONTEXT $NO_CACHE 2>&1 | tee "${SINGLE_BUILD_LOG}"
+                docker buildx build --builder default --progress=plain --load -f $DOCKERFILE $TARGET_STR $PLATFORM $BUILD_ARGS $CACHE_FROM $CACHE_TO $TAG $LATEST_TAG $BUILD_CONTEXT_ARG $BUILD_CONTEXT $NO_CACHE 2>&1 | tee "${SINGLE_BUILD_LOG}"
                BUILD_EXIT_CODE=${PIPESTATUS[0]}
            else
                DOCKER_BUILDKIT=1 docker build --progress=plain -f $DOCKERFILE $TARGET_STR $PLATFORM $BUILD_ARGS $CACHE_FROM $CACHE_TO $TAG $LATEST_TAG $BUILD_CONTEXT_ARG $BUILD_CONTEXT $NO_CACHE 2>&1 | tee "${SINGLE_BUILD_LOG}"