# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Reusable workflow (workflow_call) that builds one framework image and then,
# depending on the inputs, tests it, scans it for compliance and copies it to ACR.
#
# Jobs:
#   build          - builds the image via ./.github/actions/build-flavor and exposes
#                    the resulting tags plus the test-runner / compliance matrices.
#   test           - CPU-only and single-GPU pytest runs, one matrix entry per built arch.
#   multi-gpu-test - multi-GPU pytest run on the prod-tester-amd-gpu-4-v1 runner.
#   compliance     - compliance scan, one matrix entry per built platform.
#   copy-to-acr    - copies the runtime image from ECR to ACR via ./.github/actions/skopeo-copy.
#
# Shell steps receive expression values through `env:` rather than inline
# `${{ }}` substitution, so the values are never parsed as shell code.

name: Build, Test, and Copy Framework Image

on:
  workflow_call:
    inputs:
      framework:
        description: 'Framework name (e.g. dynamo, vllm, sglang, trtllm)'
        required: true
        type: string
      builder_flavor:
        description: 'Optional BuildKit routing flavor override (vllm, sglang, trtllm, general)'
        required: false
        type: string
        default: ''
      target:
        description: 'Target stage for Docker rendering'
        required: true
        type: string
      platform:
        description: 'Docker platform(s) to build (e.g. linux/amd64,linux/arm64)'
        required: true
        type: string
      cuda_version:
        description: 'CUDA version to build (e.g., 12.9, 13.0)'
        required: false
        type: string
        default: ''
      cpu_only:
        description: 'Build and test this target as a CPU-only image variant'
        required: false
        type: boolean
        default: false
      build_timeout_minutes:
        description: 'Timeout in minutes for the build step'
        required: false
        type: number
        default: 60
      run_cpu_only_tests:
        description: 'Whether to run CPU-only tests'
        required: false
        type: boolean
        default: true
      cpu_only_test_markers:
        description: 'CPU-only pytest markers'
        required: false
        type: string
      cpu_only_test_timeout_minutes:
        description: 'Timeout in minutes for CPU tests'
        required: false
        type: number
        default: 10
      run_single_gpu_tests:
        description: 'Whether to run single GPU tests'
        required: false
        type: boolean
        default: true
      single_gpu_test_markers:
        description: 'Single GPU pytest markers'
        required: false
        type: string
      single_gpu_test_timeout_minutes:
        description: 'Timeout in minutes for single GPU tests'
        required: false
        type: number
        default: 30
      run_multi_gpu_tests:
        description: 'Whether to run multi-gpu tests'
        required: false
        type: boolean
        default: true
      multi_gpu_test_markers:
        description: 'Multi GPU pytest markers'
        required: false
        type: string
      multi_gpu_test_timeout_minutes:
        description: 'Timeout in minutes for multi GPU tests'
        required: false
        type: number
        default: 30
      copy_to_acr:
        description: 'Whether to copy images to ACR'
        required: false
        type: boolean
        default: true
      copy_timeout_minutes:
        description: 'Timeout in minutes for the copy to ACR step'
        required: false
        type: number
        default: 10
      builder_name:
        description: 'Buildkit builder name'
        required: true
        type: string
      extra_tags:
        description: 'Additional tags (newline-separated)'
        required: false
        type: string
        default: ''
      build_image:
        description: 'Whether to build image'
        required: false
        type: boolean
        default: true
      no_cache:
        description: 'Disable Docker build cache'
        required: false
        type: boolean
        default: false
      push_image:
        description: 'Push image to registry'
        required: false
        type: boolean
        default: true
      no_load:
        description: 'Do not load the image into docker (you must have dind installed if you want to load the image)'
        required: false
        type: boolean
        default: true
      show_summary:
        description: 'Show summary'
        required: false
        type: boolean
        default: false
      make_efa:
        description: 'Enable AWS EFA support in the build'
        required: false
        type: boolean
        default: false
      fresh_builder:
        description: 'Always create a fresh K8s BuildKit builder (skip remote worker routing)'
        required: false
        type: boolean
        default: false
    secrets:
      AWS_DEFAULT_REGION:
        required: true
      AWS_ACCOUNT_ID:
        required: true
      AZURE_ACR_HOSTNAME:
        required: true
      AZURE_ACR_USER:
        required: true
      AZURE_ACR_PASSWORD:
        required: true
      SCCACHE_S3_BUCKET:
        required: false
      AWS_ACCESS_KEY_ID:
        required: false
      AWS_SECRET_ACCESS_KEY:
        required: false
      HF_TOKEN:
        required: false

jobs:
  # ============================================================================
  # BUILD
  # ============================================================================
  build:
    if: inputs.build_image
    name: Build ${{ inputs.cpu_only && 'cpu' || format('cuda{0}', inputs.cuda_version) }}
    runs-on: prod-builder-v3
    timeout-minutes: ${{ inputs.build_timeout_minutes }}
    # Tags come from the build-flavor action; the two JSON arrays drive the
    # matrices of the `test` and `compliance` jobs.
    outputs:
      target_tag_plain: ${{ steps.build.outputs.target_tag_plain }}
      test_tag_plain: ${{ steps.build.outputs.test_tag_plain }}
      image_variant_label: ${{ steps.build.outputs.image_variant_label }}
      image_tag_suffix: ${{ steps.build.outputs.image_tag_suffix }}
      compliance_arches: ${{ steps.compliance-arches.outputs.arches }}
      test_runners: ${{ steps.test-runners.outputs.runners }}
    env:
      FRAMEWORK: ${{ inputs.framework }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
        with:
          lfs: true

      - name: Compute compliance arches
        id: compliance-arches
        shell: bash
        env:
          PLATFORM: ${{ inputs.platform }}
        run: |
          # Convert comma-separated Docker platform string to JSON array
          #   "linux/amd64,linux/arm64" -> ["linux/amd64","linux/arm64"]
          #   "linux/amd64"             -> ["linux/amd64"]
          # Splitting happens inside jq (on commas and whitespace) so the value is
          # never subject to shell word splitting or glob expansion.
          JSON=$(jq -cn --arg p "$PLATFORM" '[$p | splits("[,\\s]+") | select(length > 0)]')
          echo "arches=${JSON}" >> "$GITHUB_OUTPUT"

      - name: Compute test runners
        id: test-runners
        shell: bash
        env:
          PLATFORM: ${{ inputs.platform }}
        run: |
          # Emit a JSON array of {arch, runner} objects for platforms actually being built
          #   "linux/amd64,linux/arm64" -> both runners
          #   "linux/amd64"             -> amd64 runner only (covers EFA and other single-arch)
          #   "linux/arm64"             -> arm64 runner only
          if [[ "$PLATFORM" == *"amd64"* && "$PLATFORM" == *"arm64"* ]]; then
            echo 'runners=[{"arch":"amd64","runner":"prod-tester-amd-gpu-v1"},{"arch":"arm64","runner":"prod-tester-arm-v1"}]' >> "$GITHUB_OUTPUT"
          elif [[ "$PLATFORM" == *"arm64"* ]]; then
            echo 'runners=[{"arch":"arm64","runner":"prod-tester-arm-v1"}]' >> "$GITHUB_OUTPUT"
          else
            echo 'runners=[{"arch":"amd64","runner":"prod-tester-amd-gpu-v1"}]' >> "$GITHUB_OUTPUT"
          fi

      - name: Build
        id: build
        uses: ./.github/actions/build-flavor
        with:
          framework: ${{ inputs.framework }}
          builder_flavor: ${{ inputs.builder_flavor }}
          target: ${{ inputs.target }}
          platform: ${{ inputs.platform }}
          cuda_version: ${{ inputs.cuda_version }}
          cpu_only: ${{ inputs.cpu_only }}
          builder_name: ${{ inputs.builder_name }}
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
          sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
          hf_token: ${{ secrets.HF_TOKEN }}
          build_timeout_minutes: ${{ inputs.build_timeout_minutes }}
          push_image: ${{ inputs.push_image }}
          no_load: ${{ inputs.no_load }}
          no_cache: ${{ inputs.no_cache }}
          make_efa: ${{ inputs.make_efa }}
          fresh_builder: ${{ inputs.fresh_builder }}
          extra_tags: ${{ inputs.extra_tags }}
          # The summary is only requested when the image is also pushed.
          show_summary: ${{ inputs.push_image && inputs.show_summary }}

  # ============================================================================
  # TEST
  # ============================================================================
  test:
    if: |
      (
        inputs.run_cpu_only_tests ||
        inputs.run_single_gpu_tests
      ) &&
      inputs.build_image
    needs: [build]
    name: Test ${{ inputs.cpu_only && 'cpu' || format('cuda{0}', inputs.cuda_version) }} (${{ matrix.arch }})
    strategy:
      fail-fast: false
      matrix:
        # One {arch, runner} entry per platform that the build job reported.
        include: ${{ fromJson(needs.build.outputs.test_runners) }}
    runs-on: ${{ matrix.runner }}
    env:
      FRAMEWORK: ${{ inputs.framework }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2

      - name: Calculate target tag
        id: calculate-target-tag
        shell: bash
        env:
          ECR_REGISTRY: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
          TARGET_TAG: ${{ needs.build.outputs.target_tag_plain }}${{ needs.build.outputs.image_tag_suffix }}
          TEST_TAG: ${{ needs.build.outputs.test_tag_plain }}${{ needs.build.outputs.image_tag_suffix }}
        run: |
          echo "runtime_image=${ECR_REGISTRY}/ai-dynamo/dynamo:${TARGET_TAG}" >> "$GITHUB_OUTPUT"
          echo "test_image=${ECR_REGISTRY}/ai-dynamo/dynamo:${TEST_TAG}" >> "$GITHUB_OUTPUT"

      - name: Docker Login
        uses: ./.github/actions/docker-login
        with:
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}

      - name: Pull relevant images
        shell: bash
        env:
          PULL_RUNTIME_IMAGE: ${{ steps.calculate-target-tag.outputs.runtime_image }}
          PULL_TEST_IMAGE: ${{ steps.calculate-target-tag.outputs.test_image }}
        run: |
          source ./.github/scripts/retry_docker.sh
          start_time=$(date +%s)
          retry_pull "$PULL_RUNTIME_IMAGE"
          retry_pull "$PULL_TEST_IMAGE"
          retry_pull quay.io/minio/minio
          end_time=$(date +%s)
          duration=$((end_time - start_time))
          echo "⏱️ Image pull duration: ${duration}s"

      # Run CPU-only tests first (parallelized for speed)
      # These are unit tests marked with gpu_0 that don't require GPU hardware
      - name: Run CPU-only tests (parallelized)
        if: ${{ inputs.run_cpu_only_tests }}
        timeout-minutes: ${{ inputs.cpu_only_test_timeout_minutes }}
        uses: ./.github/actions/pytest
        with:
          image_tag: ${{ steps.calculate-target-tag.outputs.test_image }}
          pytest_marks: ${{ inputs.cpu_only_test_markers }}
          test_suite_name: ${{ inputs.framework }}
          test_type: "pre_merge_cpu"
          platform_arch: ${{ matrix.arch }}
          hf_token: ${{ secrets.HF_TOKEN }}
          parallel_mode: 'auto'
          dind_as_sidecar: 'true'

      # Run GPU tests sequentially (only on amd64 runners with GPU)
      # These are e2e tests marked with gpu_1 that require GPU hardware
      - name: Run GPU tests (sequential)
        timeout-minutes: ${{ inputs.single_gpu_test_timeout_minutes }}
        if: inputs.run_single_gpu_tests && matrix.arch == 'amd64'
        uses: ./.github/actions/pytest
        with:
          image_tag: ${{ steps.calculate-target-tag.outputs.test_image }}
          pytest_marks: ${{ inputs.single_gpu_test_markers }}
          test_suite_name: ${{ inputs.framework }}
          test_type: "pre_merge_gpu"
          platform_arch: ${{ matrix.arch }}
          hf_token: ${{ secrets.HF_TOKEN }}
          parallel_mode: 'none'
          dind_as_sidecar: 'true'

  # ============================================================================
  # MULTI-GPU TESTS
  # ============================================================================
  multi-gpu-test:
    # Multi-GPU support limited to AMD64 only: the job always runs on an amd64
    # runner with platform_arch amd64, so it is skipped when the requested
    # platform list does not include amd64 (same rule as the single-GPU step's
    # `matrix.arch == 'amd64'` guard).
    if: |
      inputs.run_multi_gpu_tests &&
      inputs.build_image &&
      contains(inputs.platform, 'amd64')
    needs: [build]
    name: Multi-gpu test ${{ inputs.cpu_only && 'cpu' || format('cuda{0}', inputs.cuda_version) }}
    runs-on: prod-tester-amd-gpu-4-v1
    env:
      FRAMEWORK: ${{ inputs.framework }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2

      - name: Calculate target tag
        id: calculate-target-tag
        shell: bash
        env:
          ECR_REGISTRY: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
          TARGET_TAG: ${{ needs.build.outputs.target_tag_plain }}${{ needs.build.outputs.image_tag_suffix }}
          TEST_TAG: ${{ needs.build.outputs.test_tag_plain }}${{ needs.build.outputs.image_tag_suffix }}
        run: |
          echo "runtime_image=${ECR_REGISTRY}/ai-dynamo/dynamo:${TARGET_TAG}" >> "$GITHUB_OUTPUT"
          echo "test_image=${ECR_REGISTRY}/ai-dynamo/dynamo:${TEST_TAG}" >> "$GITHUB_OUTPUT"

      - name: Docker Login
        uses: ./.github/actions/docker-login
        with:
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}

      # Only the test image is pulled here (the runtime image is not needed).
      - name: Pull relevant images
        shell: bash
        env:
          PULL_TEST_IMAGE: ${{ steps.calculate-target-tag.outputs.test_image }}
        run: |
          source ./.github/scripts/retry_docker.sh
          start_time=$(date +%s)
          retry_pull "$PULL_TEST_IMAGE"
          retry_pull quay.io/minio/minio
          end_time=$(date +%s)
          duration=$((end_time - start_time))
          echo "⏱️ Image pull duration: ${duration}s"

      # Run GPU tests sequentially (only on amd64 runners with GPU)
      # These are e2e tests marked with gpu_2 or gpu_4 that require GPU hardware
      - name: Run GPU tests (sequential)
        timeout-minutes: ${{ inputs.multi_gpu_test_timeout_minutes }}
        uses: ./.github/actions/pytest
        with:
          image_tag: ${{ steps.calculate-target-tag.outputs.test_image }}
          pytest_marks: ${{ inputs.multi_gpu_test_markers }}
          test_suite_name: ${{ inputs.framework }}
          # NOTE(review): same test_type as the single-GPU step in the `test` job;
          # confirm the pytest action does not key its outputs on this value alone.
          test_type: "pre_merge_gpu"
          platform_arch: amd64
          hf_token: ${{ secrets.HF_TOKEN }}
          parallel_mode: 'none'
          dind_as_sidecar: 'true'

  # ============================================================================
  # COMPLIANCE — Generate attribution CSVs for dpkg and Python packages
  # ============================================================================
  compliance:
    if: inputs.build_image && inputs.push_image
    needs: [build]
    strategy:
      fail-fast: false
      matrix:
        # One entry per Docker platform string, e.g. "linux/amd64".
        arch: ${{ fromJson(needs.build.outputs.compliance_arches) }}
    name: Compliance ${{ inputs.cpu_only && 'cpu' || format('cuda{0}', inputs.cuda_version) }}-${{ matrix.arch }}
    runs-on: prod-builder-v3
    steps:
      - name: Checkout repository
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2

      - name: Docker Login
        uses: ./.github/actions/docker-login
        with:
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}

      - name: Calculate image URI and arch suffix
        id: images
        shell: bash
        env:
          IMAGE_VARIANT_LABEL: ${{ needs.build.outputs.image_variant_label }}
          ECR_REGISTRY: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
          TARGET_TAG: ${{ needs.build.outputs.target_tag_plain }}${{ needs.build.outputs.image_tag_suffix }}
          ARCH: ${{ matrix.arch }}
        run: |
          echo "image_variant_label=${IMAGE_VARIANT_LABEL}" >> "$GITHUB_OUTPUT"
          echo "runtime_image=${ECR_REGISTRY}/ai-dynamo/dynamo:${TARGET_TAG}" >> "$GITHUB_OUTPUT"
          # Sanitize arch for artifact name: linux/amd64 -> amd64 (artifact names can't contain /)
          echo "arch_suffix=${ARCH#linux/}" >> "$GITHUB_OUTPUT"

      - name: Compliance scan
        uses: ./.github/actions/compliance-scan
        with:
          image: ${{ steps.images.outputs.runtime_image }}
          artifact_name: compliance-${{ inputs.framework }}-${{ inputs.target }}${{ inputs.make_efa && '-efa' || '' }}-${{ steps.images.outputs.image_variant_label }}-${{ steps.images.outputs.arch_suffix }}
          arch: ${{ matrix.arch }}
          framework: ${{ inputs.framework }}
          target: ${{ inputs.target }}
          cuda_version: ${{ inputs.cuda_version }}

  # ============================================================================
  # COPY TO ACR
  # ============================================================================
  copy-to-acr:
    needs: [build, test]
    # Run if copy_to_acr is true AND build succeeded AND (test succeeded OR test was skipped)
    # always() is required so that a skipped `test` job does not skip this job too.
    # NOTE(review): the source image is read from ECR, so this presumably relies on
    # push_image having been true for the build — confirm with callers.
    if: |
      always() &&
      inputs.copy_to_acr &&
      needs.build.result == 'success' &&
      (needs.test.result == 'success' || needs.test.result == 'skipped')
    name: copy-to-acr ${{ inputs.cpu_only && 'cpu' || format('cuda{0}', inputs.cuda_version) }}
    runs-on: prod-skopeo-v1
    outputs:
      target_tag_plain: ${{ needs.build.outputs.target_tag_plain }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2

      - name: Copy image to target registry
        timeout-minutes: ${{ inputs.copy_timeout_minutes }}
        uses: ./.github/actions/skopeo-copy
        with:
          source_registry: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
          source_image: ai-dynamo/dynamo
          source_tag: ${{ needs.build.outputs.target_tag_plain }}${{ needs.build.outputs.image_tag_suffix }}
          target_registry: ${{ secrets.AZURE_ACR_HOSTNAME }}
          target_image: ai-dynamo/dynamo
          target_tag: ${{ needs.build.outputs.target_tag_plain }}${{ needs.build.outputs.image_tag_suffix }}
          source_aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          source_aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          target_azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
          target_azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          target_azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
          override_arch: amd64  # We are using AMD64 images only on the rest of the clusters.