# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Reusable workflow (invoked via `workflow_call`) that handles one
# framework / CUDA version / platform combination:
#   1. build          - builds the image through ./.github/actions/build-flavor
#   2. test           - sanity check, CPU-only tests and single-GPU tests
#   3. multi-gpu-test - multi-GPU tests (skipped when platform is arm64)
#   4. compliance     - runs ./.github/actions/compliance-scan on the runtime image
#   5. copy-to-acr    - copies the runtime image from ECR to the Azure registry
#
# Values coming from `inputs`, `secrets`, `needs` and `steps` are handed to
# inline shell scripts through `env:` instead of being expanded inside the
# script text, so they are always treated as data and never as shell syntax.

name: Build, Test, and Copy Framework Image

on:
  workflow_call:
    inputs:
      framework:
        description: 'Framework name (vllm, sglang, trtllm)'
        required: true
        type: string
      target:
        description: 'Target stage for Docker rendering'
        required: true
        type: string
      platform:
        description: 'Platform to build (amd64 or arm64)'
        required: true
        type: string
      cuda_version:
        description: 'CUDA version to build (e.g., 12.9, 13.0)'
        required: true
        type: string
      build_timeout_minutes:
        description: 'Timeout in minutes for the build step'
        required: false
        type: number
        default: 60
      run_cpu_only_tests:
        description: 'Whether to run CPU-only tests'
        required: false
        type: boolean
        default: true
      cpu_only_test_markers:
        description: 'CPU-only pytest markers'
        required: false
        type: string
      cpu_only_test_timeout_minutes:
        description: 'Timeout in minutes for CPU tests'
        required: false
        type: number
        default: 10
      run_single_gpu_tests:
        description: 'Whether to run single GPU tests'
        required: false
        type: boolean
        default: true
      single_gpu_test_markers:
        description: 'Single GPU pytest markers'
        required: false
        type: string
      single_gpu_test_timeout_minutes:
        description: 'Timeout in minutes for single GPU tests'
        required: false
        type: number
        default: 30
      run_multi_gpu_tests:
        description: 'Whether to run multi-gpu tests'
        required: false
        type: boolean
        default: true
      multi_gpu_test_markers:
        description: 'Multi GPU pytest markers'
        required: false
        type: string
      multi_gpu_test_timeout_minutes:
        description: 'Timeout in minutes for multi GPU tests'
        required: false
        type: number
        default: 30
      copy_to_acr:
        description: 'Whether to copy images to ACR'
        required: false
        type: boolean
        default: true
      copy_timeout_minutes:
        description: 'Timeout in minutes for the copy to ACR step'
        required: false
        type: number
        default: 10
      builder_name:
        description: 'Buildkit builder name'
        required: true
        type: string
      extra_tags:
        description: 'Additional tags (newline-separated, -$platform suffix auto-appended)'
        required: false
        type: string
        default: ''
      build_image:
        description: 'Whether to build image'
        required: false
        type: boolean
        default: true
      no_cache:
        description: 'Disable Docker build cache'
        required: false
        type: boolean
        default: false
      push_image:
        description: 'Push image to registry'
        required: false
        type: boolean
        default: true
      no_load:
        description: 'Do not load the image into docker (you must have dind installed if you want to load the image)'
        required: false
        type: boolean
        default: true
      show_summary:
        description: 'Show summary'
        required: false
        type: boolean
        default: false
      make_efa:
        description: 'Enable AWS EFA support in the build'
        required: false
        type: boolean
        default: false
      build_only:
        description: 'Build and push only — skip all tests, show summary'
        required: false
        type: boolean
        default: false
      sanitized_ref_name:
        description: 'Sanitized git ref name for branch-tagged images (used with build_only)'
        required: false
        type: string
        default: ''
    secrets:
      AWS_DEFAULT_REGION:
        required: true
      AWS_ACCOUNT_ID:
        required: true
      AZURE_ACR_HOSTNAME:
        required: true
      AZURE_ACR_USER:
        required: true
      AZURE_ACR_PASSWORD:
        required: true
      SCCACHE_S3_BUCKET:
        required: false
      AWS_ACCESS_KEY_ID:
        required: false
      AWS_SECRET_ACCESS_KEY:
        required: false
      HF_TOKEN:
        required: false

jobs:
  # ============================================================================
  # BUILD
  # ============================================================================
  # Delegates the build to the local build-flavor action and re-exports the
  # two tag outputs of that step so downstream jobs can locate the images.
  build:
    if: inputs.build_image
    name: Build cuda${{ inputs.cuda_version }}-${{ inputs.platform }}
    runs-on: prod-builder-v3
    timeout-minutes: ${{ inputs.build_timeout_minutes }}
    outputs:
      target_tag_plain: ${{ steps.build.outputs.target_tag_plain }}
      test_tag_plain: ${{ steps.build.outputs.test_tag_plain }}
    env:
      FRAMEWORK: ${{ inputs.framework }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
        with:
          lfs: true
      - name: Build
        id: build
        uses: ./.github/actions/build-flavor
        with:
          framework: ${{ inputs.framework }}
          target: ${{ inputs.target }}
          platform: ${{ inputs.platform }}
          cuda_version: ${{ inputs.cuda_version }}
          builder_name: ${{ inputs.builder_name }}
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
          sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          hf_token: ${{ secrets.HF_TOKEN }}
          build_timeout_minutes: ${{ inputs.build_timeout_minutes }}
          push_image: ${{ inputs.push_image }}
          no_load: ${{ inputs.no_load }}
          no_cache: ${{ inputs.no_cache }}
          make_efa: ${{ inputs.make_efa }}
          extra_tags: ${{ inputs.extra_tags }}
          build_only: ${{ inputs.build_only }}
          sanitized_ref_name: ${{ inputs.sanitized_ref_name }}
          # The summary is only requested when the image is also pushed.
          show_summary: ${{ inputs.push_image && inputs.show_summary }}

  # ============================================================================
  # TEST
  # ============================================================================
  # Pulls the runtime and test images from ECR using the tags emitted by the
  # build job, runs the sanity check, then the CPU-only and single-GPU suites.
  test:
    if: |
      !inputs.build_only &&
      ( inputs.run_cpu_only_tests || inputs.run_single_gpu_tests ) &&
      inputs.build_image
    needs: [build]
    name: Test cuda${{ inputs.cuda_version }}-${{ inputs.platform }}
    runs-on: ${{ inputs.platform == 'amd64' && 'prod-tester-amd-gpu-v1' || 'prod-tester-arm-v1' }}
    env:
      FRAMEWORK: ${{ inputs.framework }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
      - name: Calculate target tag
        id: calculate-target-tag
        shell: bash
        env:
          CUDA_VERSION_RAW: ${{ inputs.cuda_version }}
          TARGET_PLATFORM: ${{ inputs.platform }}
          ECR_REGISTRY: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
          TARGET_TAG_PLAIN: ${{ needs.build.outputs.target_tag_plain }}
          TEST_TAG_PLAIN: ${{ needs.build.outputs.test_tag_plain }}
        run: |
          # Keep only the part before the first dot (e.g. 12.9 -> 12).
          CUDA_VERSION="${CUDA_VERSION_RAW%%.*}"
          echo "cuda_version_plain=${CUDA_VERSION}" >> "$GITHUB_OUTPUT"
          RUNTIME_IMAGE="${ECR_REGISTRY}/ai-dynamo/dynamo:${TARGET_TAG_PLAIN}-cuda${CUDA_VERSION}-${TARGET_PLATFORM}"
          echo "runtime_image=${RUNTIME_IMAGE}" >> "$GITHUB_OUTPUT"
          TEST_IMAGE="${ECR_REGISTRY}/ai-dynamo/dynamo:${TEST_TAG_PLAIN}-cuda${CUDA_VERSION}-${TARGET_PLATFORM}"
          echo "test_image=${TEST_IMAGE}" >> "$GITHUB_OUTPUT"
      - name: Docker Login
        uses: ./.github/actions/docker-login
        with:
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
      - name: Pull relevant images
        shell: bash
        env:
          RUNTIME_IMAGE_REF: ${{ steps.calculate-target-tag.outputs.runtime_image }}
          TEST_IMAGE_REF: ${{ steps.calculate-target-tag.outputs.test_image }}
        run: |
          source ./.github/scripts/retry_docker.sh
          start_time=$(date +%s)
          retry_pull "${RUNTIME_IMAGE_REF}"
          retry_pull "${TEST_IMAGE_REF}"
          retry_pull quay.io/minio/minio
          end_time=$(date +%s)
          duration=$((end_time - start_time))
          echo "⏱️  Image pull duration: ${duration}s"
      - name: Run Sanity Check on Runtime Image
        shell: bash
        env:
          RUNTIME_IMAGE_REF: ${{ steps.calculate-target-tag.outputs.runtime_image }}
        run: |
          echo "Running sanity check on image: ${RUNTIME_IMAGE_REF}"

          # Run the sanity check script inside the container
          # The script is located in /workspace/deploy/sanity_check.py in runtime containers
          export WORKSPACE=/workspace

          # Errexit is switched off around `docker run` so the exit code can be
          # captured and reported below instead of aborting the step at once.
          set +e
          docker run --rm "${RUNTIME_IMAGE_REF}" python "${WORKSPACE}/deploy/sanity_check.py" --runtime-check --no-gpu-check
          SANITY_CHECK_EXIT_CODE=$?
          set -e
          if [ "${SANITY_CHECK_EXIT_CODE}" -ne 0 ]; then
            echo "ERROR: Sanity check failed - ai-dynamo packages not properly installed"
            exit "${SANITY_CHECK_EXIT_CODE}"
          else
            echo "✅ Sanity check passed"
          fi

      # Run CPU-only tests first (parallelized for speed)
      # These are unit tests marked with gpu_0 that don't require GPU hardware
      - name: Run CPU-only tests (parallelized)
        if: ${{ inputs.run_cpu_only_tests }}
        timeout-minutes: ${{ inputs.cpu_only_test_timeout_minutes }}
        uses: ./.github/actions/pytest
        with:
          image_tag: ${{ steps.calculate-target-tag.outputs.test_image }}
          pytest_marks: ${{ inputs.cpu_only_test_markers }}
          framework: ${{ inputs.framework }}
          test_type: "pre_merge_cpu"
          platform_arch: ${{ inputs.platform }}
          hf_token: ${{ secrets.HF_TOKEN }}
          parallel_mode: 'auto'
          dind_as_sidecar: 'true'

      # Run GPU tests sequentially (only on amd64 runners with GPU)
      # These are e2e tests marked with gpu_1 that require GPU hardware
      - name: Run GPU tests (sequential)
        timeout-minutes: ${{ inputs.single_gpu_test_timeout_minutes }}
        # We only run GPU tests on amd64
        if: ${{ inputs.platform == 'amd64' && inputs.run_single_gpu_tests == true }}
        uses: ./.github/actions/pytest
        with:
          image_tag: ${{ steps.calculate-target-tag.outputs.test_image }}
          pytest_marks: ${{ inputs.single_gpu_test_markers }}
          framework: ${{ inputs.framework }}
          test_type: "pre_merge_gpu"
          platform_arch: ${{ inputs.platform }}
          hf_token: ${{ secrets.HF_TOKEN }}
          parallel_mode: 'none'
          dind_as_sidecar: 'true'

  # ============================================================================
  # MULTI-GPU TESTS
  # ============================================================================
  # Runs the multi-GPU pytest markers against the test image on a dedicated
  # runner label; only the test image (not the runtime image) is pulled here.
  multi-gpu-test:
    # Multi-GPU support limited to AMD64 only
    if: |
      !inputs.build_only &&
      inputs.run_multi_gpu_tests &&
      inputs.build_image &&
      ( inputs.platform != 'arm64' )
    needs: [build]
    name: Multi-gpu test cuda${{ inputs.cuda_version }}-${{ inputs.platform }}
    runs-on: prod-tester-amd-gpu-4-v1
    env:
      FRAMEWORK: ${{ inputs.framework }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
      - name: Calculate target tag
        id: calculate-target-tag
        shell: bash
        env:
          CUDA_VERSION_RAW: ${{ inputs.cuda_version }}
          TARGET_PLATFORM: ${{ inputs.platform }}
          ECR_REGISTRY: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
          TARGET_TAG_PLAIN: ${{ needs.build.outputs.target_tag_plain }}
          TEST_TAG_PLAIN: ${{ needs.build.outputs.test_tag_plain }}
        run: |
          # Keep only the part before the first dot (e.g. 12.9 -> 12).
          CUDA_VERSION="${CUDA_VERSION_RAW%%.*}"
          echo "cuda_version_plain=${CUDA_VERSION}" >> "$GITHUB_OUTPUT"
          RUNTIME_IMAGE="${ECR_REGISTRY}/ai-dynamo/dynamo:${TARGET_TAG_PLAIN}-cuda${CUDA_VERSION}-${TARGET_PLATFORM}"
          echo "runtime_image=${RUNTIME_IMAGE}" >> "$GITHUB_OUTPUT"
          TEST_IMAGE="${ECR_REGISTRY}/ai-dynamo/dynamo:${TEST_TAG_PLAIN}-cuda${CUDA_VERSION}-${TARGET_PLATFORM}"
          echo "test_image=${TEST_IMAGE}" >> "$GITHUB_OUTPUT"
      - name: Docker Login
        uses: ./.github/actions/docker-login
        with:
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
      - name: Pull relevant images
        shell: bash
        env:
          TEST_IMAGE_REF: ${{ steps.calculate-target-tag.outputs.test_image }}
        run: |
          source ./.github/scripts/retry_docker.sh
          start_time=$(date +%s)
          retry_pull "${TEST_IMAGE_REF}"
          retry_pull quay.io/minio/minio
          end_time=$(date +%s)
          duration=$((end_time - start_time))
          echo "⏱️  Image pull duration: ${duration}s"

      # Run GPU tests sequentially (only on amd64 runners with GPU)
      # These are e2e tests marked with gpu_2 or gpu_4 that require GPU hardware
      - name: Run GPU tests (sequential)
        timeout-minutes: ${{ inputs.multi_gpu_test_timeout_minutes }}
        uses: ./.github/actions/pytest
        with:
          image_tag: ${{ steps.calculate-target-tag.outputs.test_image }}
          pytest_marks: ${{ inputs.multi_gpu_test_markers }}
          framework: ${{ inputs.framework }}
          test_type: "pre_merge_gpu"
          platform_arch: ${{ inputs.platform }}
          hf_token: ${{ secrets.HF_TOKEN }}
          parallel_mode: 'none'
          dind_as_sidecar: 'true'

  # ============================================================================
  # COMPLIANCE — Generate attribution CSVs for dpkg and Python packages
  # ============================================================================
  # Only runs when the image was both built and pushed, because the scan is
  # pointed at the runtime image reference in ECR.
  compliance:
    if: inputs.build_image && inputs.push_image
    needs: [build]
    name: Compliance cuda${{ inputs.cuda_version }}-${{ inputs.platform }}
    runs-on: prod-builder-v3
    steps:
      - name: Checkout repository
        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
      - name: Docker Login
        uses: ./.github/actions/docker-login
        with:
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
      - name: Calculate image URI
        id: images
        shell: bash
        env:
          CUDA_VERSION_RAW: ${{ inputs.cuda_version }}
          TARGET_PLATFORM: ${{ inputs.platform }}
          ECR_REGISTRY: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
          TARGET_TAG_PLAIN: ${{ needs.build.outputs.target_tag_plain }}
        run: |
          # Keep only the part before the first dot (e.g. 12.9 -> 12).
          CUDA_VERSION="${CUDA_VERSION_RAW%%.*}"
          echo "cuda_major=${CUDA_VERSION}" >> "$GITHUB_OUTPUT"
          RUNTIME_IMAGE="${ECR_REGISTRY}/ai-dynamo/dynamo:${TARGET_TAG_PLAIN}-cuda${CUDA_VERSION}-${TARGET_PLATFORM}"
          echo "runtime_image=${RUNTIME_IMAGE}" >> "$GITHUB_OUTPUT"
      - name: Compliance scan
        uses: ./.github/actions/compliance-scan
        with:
          image: ${{ steps.images.outputs.runtime_image }}
          artifact_name: compliance-${{ inputs.framework }}-${{ inputs.target }}${{ inputs.make_efa && '-efa' || '' }}-cuda${{ steps.images.outputs.cuda_major }}-${{ inputs.platform }}
          arch: ${{ inputs.platform }}
          framework: ${{ inputs.framework }}
          cuda_version: ${{ inputs.cuda_version }}

  # ============================================================================
  # COPY TO ACR
  # ============================================================================
  # NOTE(review): this job depends on `build` and `test` only, so the result of
  # `multi-gpu-test` and `compliance` does not gate the copy - confirm that this
  # is intended.
  copy-to-acr:
    needs: [build, test]
    # Run if copy_to_acr is true AND build succeeded AND (test succeeded OR test was skipped)
    # `always()` keeps this condition evaluated even when `test` was skipped.
    if: |
      always() &&
      !inputs.build_only &&
      inputs.copy_to_acr &&
      needs.build.result == 'success' &&
      (needs.test.result == 'success' || needs.test.result == 'skipped')
    name: copy-to-acr cuda${{ inputs.cuda_version }}-${{ inputs.platform }}
    runs-on: prod-default-small-v2
    outputs:
      target_tag_plain: ${{ needs.build.outputs.target_tag_plain }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
      - name: Calculate target tag
        id: calculate-target-tag
        shell: bash
        env:
          CUDA_VERSION_RAW: ${{ inputs.cuda_version }}
        run: |
          # Keep only the part before the first dot (e.g. 12.9 -> 12).
          CUDA_VERSION="${CUDA_VERSION_RAW%%.*}"
          echo "cuda_version_plain=${CUDA_VERSION}" >> "$GITHUB_OUTPUT"
      - name: Copy image to target registry
        timeout-minutes: ${{ inputs.copy_timeout_minutes }}
        uses: ./.github/actions/skopeo-copy
        with:
          source_registry: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
          source_image: ai-dynamo/dynamo
          source_tag: ${{ needs.build.outputs.target_tag_plain }}-cuda${{ steps.calculate-target-tag.outputs.cuda_version_plain }}-${{ inputs.platform }}
          target_registry: ${{ secrets.AZURE_ACR_HOSTNAME }}
          target_image: ai-dynamo/dynamo
          target_tag: ${{ needs.build.outputs.target_tag_plain }}-cuda${{ steps.calculate-target-tag.outputs.cuda_version_plain }}-${{ inputs.platform }}
          source_aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          source_aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          target_azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
          target_azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          target_azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}