Unverified Commit 6ac17b99 authored by Ran Rubin's avatar Ran Rubin Committed by GitHub
Browse files

ci: Using Dynamo Builder (#5914)

parent 50e17783
name: 'Bootstrap Buildkit'
description: 'Bootstrap buildkit builders using remote workers or Kubernetes driver'
# This action supports two buildkit driver modes:
#
# 1. Remote Driver (when buildkit_worker_addresses is provided)
# Uses pre-provisioned remote buildkit workers specified via buildkit_worker_addresses.
# This is the preferred mode for faster, more reliable builds.
#
# 2. Kubernetes Driver (fallback, when buildkit_worker_addresses is empty)
# Dynamically creates buildkit pods in Kubernetes. Use this as a fallback when
# remote buildkit workers are unavailable or unreachable. The Kubernetes driver
# is slower due to pod startup time but provides on-demand build capacity.
#
# Options:
# - skip_bootstrap: Set to 'true' to only create the builder without bootstrapping.
# Useful for cleanup jobs that need to remove the builder but not bootstrap it.
inputs:
# Passed as `--name` to every `docker buildx create` / `docker buildx inspect` call below.
builder_name:
description: 'Name for the buildx builder'
required: true
# Selects the driver mode: non-empty -> remote driver, empty -> Kubernetes driver.
buildkit_worker_addresses:
description: 'Comma-separated list of remote buildkit worker addresses. If empty, falls back to Kubernetes driver.'
required: false
default: ''
# Kubernetes driver inputs (used only when buildkit_worker_addresses is empty)
ephemeral_storage:
description: 'Ephemeral storage request for Kubernetes driver'
required: false
default: '400Gi'
namespace:
description: 'Kubernetes namespace for buildkit pods'
required: false
default: 'buildkit'
replicas:
description: 'Number of buildkit replicas'
required: false
default: '1'
requests_cpu:
description: 'CPU requests for buildkit pods'
required: false
default: '12'
requests_memory:
description: 'Memory requests for buildkit pods'
required: false
default: '26Gi'
limits_memory:
description: 'Memory limits for buildkit pods'
required: false
default: '29Gi'
# Forwarded verbatim as the value of the `tolerations` driver option.
tolerations:
description: 'Tolerations for buildkit pods'
required: false
default: "key=buildkit-fallback-worker,value=true,operator=Equal,effect=NoSchedule"
# When 'true', the "Bootstrap buildkit" step is skipped and the Kubernetes
# fallback warning is not written to the job summary.
skip_bootstrap:
description: 'Skip the bootstrap step (only create the builder)'
required: false
default: 'false'
runs:
using: "composite"
steps:
- name: Define remote buildkit builders
  if: inputs.buildkit_worker_addresses != ''
  shell: bash
  env:
    # Inputs are handed over through the environment instead of being pasted
    # into the script text, so their content can never be parsed as shell code.
    BUILDER_NAME: ${{ inputs.builder_name }}
    WORKER_ADDRESSES: ${{ inputs.buildkit_worker_addresses }}
  run: |
    # Registers one remote-driver node per address: the first address creates
    # the builder and selects it (--use), every further one is appended to it.
    IFS=',' read -ra ADDR_LIST <<< "$WORKER_ADDRESSES"
    MODE_FLAG="--use"
    for addr in "${ADDR_LIST[@]}"; do
      # Strip surrounding whitespace so "a, b" and "a,b" behave the same.
      addr="${addr#"${addr%%[![:space:]]*}"}"
      addr="${addr%"${addr##*[![:space:]]}"}"
      # Ignore empty entries produced by doubled or stray commas.
      if [[ -z "$addr" ]]; then
        continue
      fi
      docker buildx create "$MODE_FLAG" --name "$BUILDER_NAME" --driver remote "$addr"
      MODE_FLAG="--append"
    done
    # MODE_FLAG only stays at --use when no node was registered at all.
    if [[ "$MODE_FLAG" == "--use" ]]; then
      echo "::error::buildkit_worker_addresses did not contain any usable address"
      exit 1
    fi
- name: Create Kubernetes builder for both platforms
  if: inputs.buildkit_worker_addresses == ''
  shell: bash
  env:
    # Inputs are handed over through the environment instead of being pasted
    # into the script text, so their content can never be parsed as shell code.
    BUILDER_NAME: ${{ inputs.builder_name }}
    EPHEMERAL_STORAGE: ${{ inputs.ephemeral_storage }}
    K8S_NAMESPACE: ${{ inputs.namespace }}
    REPLICAS: ${{ inputs.replicas }}
    REQUESTS_CPU: ${{ inputs.requests_cpu }}
    REQUESTS_MEMORY: ${{ inputs.requests_memory }}
    LIMITS_MEMORY: ${{ inputs.limits_memory }}
    TOLERATIONS: ${{ inputs.tolerations }}
    SKIP_BOOTSTRAP: ${{ inputs.skip_bootstrap }}
  run: |
    # Fallback path: create a Kubernetes-driver builder with one node per
    # architecture, unless a builder with this name is already registered.
    if docker buildx inspect "$BUILDER_NAME" > /dev/null 2>&1; then
      echo "✅ Builder '${BUILDER_NAME}' already exists. Skipping creation."
    else
      echo "K8s Builder '${BUILDER_NAME}' does not exist. Creating it."
      # Driver options shared by both architecture nodes.
      COMMON_OPTS=(
        "--driver-opt=requests.ephemeral-storage=${EPHEMERAL_STORAGE}"
        "--driver-opt=namespace=${K8S_NAMESPACE}"
        "--driver-opt=loadbalance=sticky"
        "--driver-opt=replicas=${REPLICAS}"
        "--driver-opt=requests.cpu=${REQUESTS_CPU}"
        "--driver-opt=requests.memory=${REQUESTS_MEMORY}"
        "--driver-opt=limits.memory=${LIMITS_MEMORY}"
      )
      # The amd64 node creates and selects the builder; arm64 is appended to it.
      MODE_FLAG="--use"
      for arch in amd64 arm64; do
        # nodeselector/tolerations values contain commas, so the whole
        # key=value pair is wrapped in literal double quotes for buildx.
        docker buildx create "$MODE_FLAG" --name "$BUILDER_NAME" --driver kubernetes "--platform=linux/${arch}" \
          "${COMMON_OPTS[@]}" \
          "--driver-opt=\"nodeselector=kubernetes.io/arch=${arch},role=dynamo-builder-fallback\"" \
          "--driver-opt=\"tolerations=${TOLERATIONS}\""
        MODE_FLAG="--append"
      done
    fi
    sleep 3 # Give the builders some time to be ready
    # Surface the fallback in the job annotations and summary, except for
    # callers that only create the builder (skip_bootstrap == 'true').
    if [[ "$SKIP_BOOTSTRAP" != "true" ]]; then
      echo "::warning::Build is using fallback pod. Please alert the ops team."
      {
        echo "## ⚠️ Fallback Build Warning"
        echo "This build is running on a **fallback pod**. Please alert the ops team."
        echo ""
      } >> "$GITHUB_STEP_SUMMARY"
    fi
- name: Bootstrap buildkit
  if: inputs.skip_bootstrap != 'true'
  shell: bash
  env:
    # Passed via the environment and quoted below so the name is always a
    # single argument and is never parsed as shell code.
    BUILDER_NAME: ${{ inputs.builder_name }}
  run: |
    # `inspect --bootstrap` starts the builder's nodes before printing them.
    echo "Bootstrapping buildkit..."
    docker buildx inspect "$BUILDER_NAME" --bootstrap
...@@ -322,4 +322,3 @@ runs: ...@@ -322,4 +322,3 @@ runs:
name: build-metrics-${{ inputs.framework }}-${{ inputs.target }}-${{ env.PLATFORM_ARCH }}-${{ github.run_id }}-${{ job.check_run_id }} name: build-metrics-${{ inputs.framework }}-${{ inputs.target }}-${{ env.PLATFORM_ARCH }}-${{ github.run_id }}-${{ job.check_run_id }}
path: build-metrics/build-${{ inputs.framework }}-${{ env.PLATFORM_ARCH }}-${{ github.run_id }}-${{ job.check_run_id }}.json path: build-metrics/build-${{ inputs.framework }}-${{ env.PLATFORM_ARCH }}-${{ github.run_id }}-${{ job.check_run_id }}.json
retention-days: 7 retention-days: 7
name: 'Docker Build'
description: 'Build Dynamo container images'
# Composite action that runs ./container/build.sh for one framework/target/platform
# combination, then records build timing and BuildKit metrics as an artifact.
inputs:
framework:
description: 'Framework to build'
required: true
default: 'vllm'
target:
description: 'Target to build'
required: false
default: 'runtime'
platform:
description: 'Docker platform to build on, ie. linux/amd64'
required: false
default: 'linux/amd64'
image_tag:
description: 'Custom image tag'
required: true
# Exposed to the build step as GITHUB_TOKEN.
ci_token:
description: 'CI Token'
required: false
aws_default_region:
description: 'AWS Default Region'
required: false
sccache_s3_bucket:
description: 'SCCache S3 Bucket'
required: false
# Combined with aws_default_region to form the ECR hostname used for cache refs.
aws_account_id:
description: 'AWS Account ID'
required: false
aws_access_key_id:
description: 'AWS Access Key ID'
required: false
aws_secret_access_key:
description: 'AWS Secret Access Key'
required: false
base_image_tag:
description: 'Optional override for base image tag passed to build.sh'
required: false
runtime_image_tag:
description: 'Optional override for RUNTIME_IMAGE_TAG build-arg'
required: false
# NOTE(review): the description says "Optional" but the input is required, and the
# build step always derives the CUDA major version from it for the registry cache
# tags - confirm which wording is intended.
cuda_version:
description: 'Optional override for CUDA_VERSION build-arg'
required: true
enable_kvbm:
description: 'Enable KVBM support (optional)'
required: false
dynamo_base_image:
description: 'Pre-built Dynamo base image to use instead of building from scratch'
required: false
no_cache:
description: 'Disable Docker build cache'
required: false
extra_tags:
description: 'Additional image tags (newline-separated list of full image:tag references)'
required: false
default: ''
# When 'true' the build step passes both --push and --no-load.
push_image:
description: 'Push the image to the registry'
required: false
default: 'false'
# Only consulted when push_image is not 'true'.
no_load:
description: 'Do not load the image into docker (useful for validation-only builds)'
required: false
default: 'true'
use_sccache:
description: 'Use SCCache for caching'
required: false
default: 'false'
ci:
description: 'CI mode: for frontend target, uses existing buildx builder and pushes EPP image to ECR'
required: false
default: 'false'
outputs:
# Echo of the image_tag input, emitted by the build step.
image_tag:
description: 'Image Tag'
value: ${{ steps.build.outputs.image_tag }}
runs:
using: "composite"
steps:
- name: Build image
  id: build
  shell: bash
  env:
    GITHUB_TOKEN: ${{ inputs.ci_token }}
    AWS_DEFAULT_REGION: ${{ inputs.aws_default_region }}
    SCCACHE_S3_BUCKET: ${{ inputs.sccache_s3_bucket }}
    AWS_ACCESS_KEY_ID: ${{ inputs.aws_access_key_id }}
    AWS_SECRET_ACCESS_KEY: ${{ inputs.aws_secret_access_key }}
    PLATFORM: ${{ inputs.platform }}
    ECR_HOSTNAME: ${{ inputs.aws_account_id }}.dkr.ecr.${{ inputs.aws_default_region }}.amazonaws.com
    GITHUB_RUN_ID: ${{ github.run_id }}
    GITHUB_JOB: ${{ github.job }}
    GITHUB_REF_NAME: ${{ github.ref_name }}
    CUDA_VERSION: ${{ inputs.cuda_version }}
    # Remaining inputs are passed through the environment (INPUT_ prefix to
    # avoid clashing with variables build.sh may read) rather than being
    # pasted into the script text, so their content is never parsed as shell.
    INPUT_IMAGE_TAG: ${{ inputs.image_tag }}
    INPUT_FRAMEWORK: ${{ inputs.framework }}
    INPUT_TARGET: ${{ inputs.target }}
    INPUT_CI: ${{ inputs.ci }}
    INPUT_BASE_IMAGE_TAG: ${{ inputs.base_image_tag }}
    INPUT_RUNTIME_IMAGE_TAG: ${{ inputs.runtime_image_tag }}
    INPUT_DYNAMO_BASE_IMAGE: ${{ inputs.dynamo_base_image }}
    INPUT_ENABLE_KVBM: ${{ inputs.enable_kvbm }}
    INPUT_NO_CACHE: ${{ inputs.no_cache }}
    INPUT_USE_SCCACHE: ${{ inputs.use_sccache }}
    INPUT_PUSH_IMAGE: ${{ inputs.push_image }}
    INPUT_NO_LOAD: ${{ inputs.no_load }}
    INPUT_EXTRA_TAGS: ${{ inputs.extra_tags }}
  run: |
    # Runs container/build.sh for the requested framework/target/platform.
    # Exports BUILD_START_TIME, BUILD_END_TIME and BUILD_LOG_FILE to later
    # steps and emits the image_tag output.
    set -x
    IMAGE_TAG="$INPUT_IMAGE_TAG"
    CUDA_VERSION_MAJOR=${CUDA_VERSION%%.*}
    BUILD_START_TIME=$(date -u +%Y-%m-%dT%H:%M:%SZ)
    echo "BUILD_START_TIME=${BUILD_START_TIME}" >> "$GITHUB_ENV"
    echo "image_tag=$IMAGE_TAG" >> "$GITHUB_OUTPUT"
    # Create build logs directory
    mkdir -p build-logs
    BUILD_LOG_FILE="build-logs/build-${INPUT_FRAMEWORK}-$(echo "$PLATFORM" | sed 's/linux\///').log"
    echo "BUILD_LOG_FILE=${BUILD_LOG_FILE}" >> "$GITHUB_ENV"
    echo "📝 Build log will be saved to: ${BUILD_LOG_FILE}"
    # Optional arguments are collected in an array so that each flag and each
    # value stays a separate word. (A space-joined string previously glued
    # " --ci" onto the following flag, yielding e.g. "--ci--build-arg".)
    EXTRA_ARGS=()
    # Set base cache args and set --cache-to if this is a main commit
    # TODO: Fix this - Skip cache for frontend target - a different docker driver is used for the EPP build, which causes issues with cache export
    if [[ "$INPUT_TARGET" != "frontend" ]]; then
      CACHE_REPO="${ECR_HOSTNAME}/ai-dynamo/dynamo"
      CACHE_KEY="${INPUT_FRAMEWORK}-cuda${CUDA_VERSION_MAJOR}-${PLATFORM##*/}"
      EXTRA_ARGS+=(--cache-to type=inline)
      EXTRA_ARGS+=(--cache-from "type=registry,ref=${CACHE_REPO}:${CACHE_KEY}-cache")
      EXTRA_ARGS+=(--cache-from "type=registry,ref=${CACHE_REPO}:main-${CACHE_KEY}")
      EXTRA_ARGS+=(--cache-from "type=registry,ref=${CACHE_REPO}:release-${CACHE_KEY}-cache")
      if [[ "$GITHUB_REF_NAME" =~ ^release ]]; then
        # Release branches also use release cache
        EXTRA_ARGS+=(--cache-to "type=registry,ref=${CACHE_REPO}:release-${CACHE_KEY}-cache,mode=max")
      elif [[ "$GITHUB_REF_NAME" == "main" ]]; then
        EXTRA_ARGS+=(--cache-to "type=registry,ref=${CACHE_REPO}:${CACHE_KEY}-cache,mode=max")
      fi
    fi
    echo "${EXTRA_ARGS[*]}"
    # Collect optional overrides provided by the workflow
    if [[ "$INPUT_CI" == "true" ]]; then
      # CI mode for frontend: use existing buildx builder, push EPP to registry
      EXTRA_ARGS+=(--ci)
    fi
    if [ -n "$INPUT_BASE_IMAGE_TAG" ]; then
      EXTRA_ARGS+=(--base-image-tag "$INPUT_BASE_IMAGE_TAG")
    fi
    if [ -n "$INPUT_RUNTIME_IMAGE_TAG" ]; then
      EXTRA_ARGS+=(--build-arg "RUNTIME_IMAGE_TAG=${INPUT_RUNTIME_IMAGE_TAG}")
    fi
    if [ -n "$CUDA_VERSION" ]; then
      EXTRA_ARGS+=(--build-arg "CUDA_VERSION=${CUDA_VERSION}")
    fi
    if [ -n "$INPUT_DYNAMO_BASE_IMAGE" ]; then
      EXTRA_ARGS+=(--dynamo-base-image "$INPUT_DYNAMO_BASE_IMAGE")
    fi
    if [ -n "$INPUT_ENABLE_KVBM" ]; then
      EXTRA_ARGS+=(--build-arg "ENABLE_KVBM=${INPUT_ENABLE_KVBM}")
    fi
    if [ "$INPUT_NO_CACHE" == "true" ]; then
      EXTRA_ARGS+=(--no-cache)
    fi
    if [ "$INPUT_USE_SCCACHE" == "true" ]; then
      EXTRA_ARGS+=(--build-arg CARGO_BUILD_JOBS=4 --use-sccache)
    fi
    if [ "$INPUT_PUSH_IMAGE" == "true" ]; then
      EXTRA_ARGS+=(--push --no-load)
    elif [ "$INPUT_NO_LOAD" == "true" ]; then
      EXTRA_ARGS+=(--no-load)
    fi
    # Add extra tags (each as a separate --tag argument). `read` with the
    # default IFS trims surrounding whitespace; blank lines are skipped.
    if [ -n "$INPUT_EXTRA_TAGS" ]; then
      while read -r EXTRA_TAG; do
        if [ -n "$EXTRA_TAG" ]; then
          EXTRA_ARGS+=(--tag "$EXTRA_TAG")
        fi
      done <<< "$INPUT_EXTRA_TAGS"
    fi
    # Execute build and capture output (show on console AND save to file).
    # The step shell runs with `-e -o pipefail`, so errexit is suspended around
    # the pipeline; otherwise a failing build would abort the script before the
    # exit code and BUILD_END_TIME are recorded.
    set +e
    ./container/build.sh --tag "$IMAGE_TAG" \
      --target "$INPUT_TARGET" \
      --vllm-max-jobs 10 \
      --no-tag-latest \
      --framework "$INPUT_FRAMEWORK" \
      --platform "$PLATFORM" \
      --sccache-bucket "$SCCACHE_S3_BUCKET" \
      --sccache-region "$AWS_DEFAULT_REGION" \
      "${EXTRA_ARGS[@]}" 2>&1 | tee "${BUILD_LOG_FILE}"
    BUILD_EXIT_CODE=${PIPESTATUS[0]}
    set -e
    BUILD_END_TIME=$(date -u +%Y-%m-%dT%H:%M:%SZ)
    echo "BUILD_END_TIME=${BUILD_END_TIME}" >> "$GITHUB_ENV"
    # Exit with the build's exit code
    exit ${BUILD_EXIT_CODE}
- name: Capture Build Metrics
  id: metrics
  shell: bash
  run: |
    # Writes a small per-job JSON file (timing, image size, platform) under
    # build-metrics/ and exports PLATFORM_ARCH for the steps that follow.
    mkdir -p build-metrics

    # Timestamps recorded by the build step
    BUILD_START_TIME="${{ env.BUILD_START_TIME }}"
    BUILD_END_TIME="${{ env.BUILD_END_TIME }}"

    # Convert both to epoch seconds and take the difference
    start_ts=$(date -d "$BUILD_START_TIME" +%s)
    end_ts=$(date -d "$BUILD_END_TIME" +%s)
    BUILD_DURATION_SEC=$((end_ts - start_ts))
    echo "🕐 Build timing:"
    echo " Start: ${BUILD_START_TIME}"
    echo " End: ${BUILD_END_TIME}"
    echo " Duration: ${BUILD_DURATION_SEC} seconds"

    # Image size comes from the local docker image store; 0 when it cannot be read
    IMAGE_TAG="${{ steps.build.outputs.image_tag }}"
    if [ -z "$IMAGE_TAG" ]; then
      IMAGE_SIZE_BYTES=0
      echo "⚠️ No image tag available"
    else
      IMAGE_SIZE_BYTES=$(docker image inspect "$IMAGE_TAG" --format='{{.Size}}' 2>/dev/null || echo "0")
      echo "📦 Image size: ${IMAGE_SIZE_BYTES} bytes"
    fi

    # Platform without its "linux/" prefix, shared with later steps
    PLATFORM_ARCH=$(echo "${{ inputs.platform }}" | sed 's/linux\///')
    echo " Architecture: ${PLATFORM_ARCH}"
    echo "PLATFORM_ARCH=${PLATFORM_ARCH}" >> $GITHUB_ENV
    JOB_KEY="${{ inputs.framework }}-${PLATFORM_ARCH}"
    echo " Job Key: ${JOB_KEY}"

    # Emit the job-specific metrics file and show it in the log
    METRICS_FILE="build-metrics/metrics-${{ inputs.framework }}-${PLATFORM_ARCH}-${{ github.run_id }}-${{ job.check_run_id }}.json"
    cat > "$METRICS_FILE" << EOF
    {
    "framework": "${{ inputs.framework }}",
    "target": "${{ inputs.target }}",
    "platform": "${{ inputs.platform }}",
    "platform_arch": "${PLATFORM_ARCH}",
    "image_size_bytes": ${IMAGE_SIZE_BYTES},
    "build_start_time": "${BUILD_START_TIME}",
    "build_end_time": "${BUILD_END_TIME}",
    "build_duration_sec": ${BUILD_DURATION_SEC}
    }
    EOF
    cat "$METRICS_FILE"
- name: Generate Comprehensive Build Metrics
  id: comprehensive-metrics
  if: always()
  shell: bash
  run: |
    # Parses the BuildKit log(s) with parse_buildkit_output.py and merges them
    # with the per-job metrics file into one JSON under build-metrics/.
    # Parser problems are reported but never fail the step.
    echo "=========================================="
    echo "📊 GENERATING COMPREHENSIVE BUILD METRICS"
    echo "=========================================="
    # Create metrics directory
    mkdir -p build-metrics
    PLATFORM_ARCH="${{ env.PLATFORM_ARCH }}"
    # This step runs even when an earlier step failed, in which case
    # PLATFORM_ARCH was never exported. Derive it here and export it so the
    # output file name and the upload step's artifact name/path still agree.
    if [ -z "$PLATFORM_ARCH" ]; then
      PLATFORM_ARCH=$(echo "${{ inputs.platform }}" | sed 's/linux\///')
      echo "PLATFORM_ARCH=${PLATFORM_ARCH}" >> "$GITHUB_ENV"
    fi
    WORKFLOW_ID="${{ github.run_id }}"
    JOB_ID="${{ job.check_run_id }}"
    # Make parser executable
    chmod +x .github/scripts/parse_buildkit_output.py
    # Check for build logs and build stage arguments dynamically
    # Use the BUILD_LOG_FILE set during the build step
    BUILD_LOG="${{ env.BUILD_LOG_FILE }}"
    # Path to container metadata created in previous step
    CONTAINER_METADATA="build-metrics/metrics-${{ inputs.framework }}-${PLATFORM_ARCH}-${WORKFLOW_ID}-${JOB_ID}.json"
    # Output single comprehensive JSON with all build stages
    COMPREHENSIVE_JSON="build-metrics/build-${{ inputs.framework }}-${PLATFORM_ARCH}-${WORKFLOW_ID}-${JOB_ID}.json"
    echo "🚀 Parsing BuildKit outputs and merging with container metrics..."
    # Build stage arguments dynamically based on which logs exist
    STAGE_ARGS=()
    if [ -f "$BUILD_LOG" ]; then
      echo " ✓ Found base image log: ${BUILD_LOG}"
      STAGE_ARGS+=("runtime:${BUILD_LOG}")
    else
      echo " ℹ️ No image log found"
    fi
    # Check for any additional stage logs (e.g., build-logs/stage3-*.log).
    # The -f test also filters out the unexpanded glob when nothing matches.
    for extra_log in build-logs/stage*.log; do
      if [ -f "$extra_log" ]; then
        stage_name=$(basename "$extra_log" .log)
        echo " ✓ Found additional stage log: ${extra_log} (${stage_name})"
        STAGE_ARGS+=("${stage_name}:${extra_log}")
      fi
    done
    echo "Container Metadata: ${CONTAINER_METADATA}"
    echo "Output: ${COMPREHENSIVE_JSON}"
    echo ""
    # Run parser with all discovered stages
    # Usage: parse_buildkit_output.py <output_json> <stage1_name:log_file> [stage2_name:log_file] ... [--metadata=<file>]
    # errexit is suspended so the parser's exit code can be inspected below.
    set +e
    python3 .github/scripts/parse_buildkit_output.py \
      "$COMPREHENSIVE_JSON" \
      "${STAGE_ARGS[@]}" \
      "--metadata=${CONTAINER_METADATA}"
    PARSER_EXIT_CODE=$?
    set -e
    echo ""
    echo "📊 Parser exit code: ${PARSER_EXIT_CODE}"
    if [ ${PARSER_EXIT_CODE} -eq 0 ] && [ -f "$COMPREHENSIVE_JSON" ]; then
      echo "✅ Comprehensive build metrics generated successfully"
      echo "📄 Output file: ${COMPREHENSIVE_JSON}"
    else
      echo "⚠️ Metrics generation had issues but continuing..."
    fi
# Publish the merged metrics JSON as a workflow artifact, whether or not the
# preceding steps succeeded.
- name: Upload Comprehensive Build Metrics
  if: always()
  uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f #v6
  with:
    retention-days: 7
    name: build-metrics-${{ inputs.framework }}-${{ inputs.target }}-${{ env.PLATFORM_ARCH }}-${{ github.run_id }}-${{ job.check_run_id }}
    path: build-metrics/build-${{ inputs.framework }}-${{ env.PLATFORM_ARCH }}-${{ github.run_id }}-${{ job.check_run_id }}.json
name: 'Dynamo Graph Deployment Test'
description: 'Deploy a DynamoGraphDeployment to Kubernetes, validate it serves requests, and cleanup'
# Composite action: writes a kubeconfig, applies the deployment file with the given
# runtime image, waits for pods, sends one chat-completion request through a
# port-forward, deletes the deployment and uploads the captured log.
inputs:
# Kubernetes Configuration
# Decoded into <workspace>/.kubeconfig by the first step.
kubeconfig_base64:
description: 'Base64-encoded kubeconfig for cluster access'
required: true
namespace:
description: 'Kubernetes namespace for deployment'
required: true
# Deployment Configuration
deployment_file:
description: 'Path to the DynamoGraphDeployment YAML file (relative to examples/backends/<framework>)'
required: true
# Also selects the working directory of the deploy step.
framework:
description: 'Framework name (vllm, sglang, trtllm)'
required: true
# Written into every service's main container image before applying the file.
framework_runtime_image:
description: 'Full container image reference for the framework runtime'
required: true
# Model Configuration
model_name:
description: 'Model name to test (e.g., Qwen/Qwen3-0.6B)'
required: false
default: 'Qwen/Qwen3-0.6B'
# Test Configuration
pod_ready_timeout:
description: 'Timeout for pods to become ready (kubectl wait format)'
required: false
default: '300s'
model_available_max_attempts:
description: 'Maximum attempts to wait for model availability'
required: false
default: '30'
model_available_retry_delay:
description: 'Delay between model availability checks (seconds)'
required: false
default: '5'
port_forward_delay:
description: 'Delay after port-forward to allow connection (seconds)'
required: false
default: '10'
test_identifier:
description: 'Unique identifier for test output (used for log file and artifact naming)'
required: true
# Request Configuration
max_tokens:
description: 'Maximum tokens for test request'
required: false
default: '30'
temperature:
description: 'Temperature for test request'
required: false
default: '0.0'
test_prompt:
description: 'Test prompt to send (optional, uses default if not provided)'
required: false
default: ''
# Validation Configuration
# Compared (strictly greater-than) against jq's `length` of the response content.
min_response_length:
description: 'Minimum expected response content length'
required: false
default: '100'
skip_cleanup:
description: 'Skip cleanup step (useful for debugging)'
required: false
default: 'false'
outputs:
graph_name:
description: 'Name of the deployed DynamoGraphDeployment'
value: ${{ steps.deploy.outputs.graph_name }}
test_result:
description: 'Test result (0=pass, 1=fail)'
value: ${{ steps.test.outputs.test_result }}
# File name only (deploy_test_output_<test_identifier>.log), without a directory.
test_log_path:
description: 'Path to test output log'
value: ${{ steps.setup-test-names.outputs.test_output_log_file }}
artifact_name:
description: 'Name of the uploaded artifact'
value: ${{ steps.setup-test-names.outputs.artifact_name }}
runs:
using: "composite"
steps:
- name: Setup Kubeconfig
  id: setup-kubeconfig
  shell: bash
  env:
    # The encoded kubeconfig and namespace are passed through the environment
    # so they are never embedded in (or parsed as part of) the script text.
    KUBECONFIG_BASE64: ${{ inputs.kubeconfig_base64 }}
    NAMESPACE: ${{ inputs.namespace }}
    KUBECONFIG_PATH: ${{ github.workspace }}/.kubeconfig
  run: |
    # Decodes the kubeconfig into the workspace, exports KUBECONFIG for all
    # later steps and pins the current context to the target namespace.
    # Restrictive umask: the credentials file is owner-only from creation on.
    umask 077
    printf '%s\n' "$KUBECONFIG_BASE64" | base64 -d > "$KUBECONFIG_PATH"
    chmod 600 "$KUBECONFIG_PATH"
    echo "KUBECONFIG=${KUBECONFIG_PATH}" >> "$GITHUB_ENV"
    export KUBECONFIG="$KUBECONFIG_PATH"
    kubectl config set-context --current --namespace="$NAMESPACE"
    kubectl config get-contexts
- name: Setup Test Output Names
  id: setup-test-names
  shell: bash
  run: |
    # Derives the log file name and the artifact name from the test identifier.
    test_id="${{ inputs.test_identifier }}"
    {
      echo "test_output_log_file=deploy_test_output_${test_id}.log"
      # Artifact names use dashes where the identifier has underscores
      # (GitHub artifact naming convention).
      echo "artifact_name=test-results-${test_id//_/-}"
    } >> $GITHUB_OUTPUT
- name: Deploy and Test
  id: deploy
  shell: bash
  working-directory: ${{ github.workspace }}/examples/backends/${{ inputs.framework }}
  env:
    NAMESPACE: ${{ inputs.namespace }}
    FRAMEWORK: ${{ inputs.framework }}
    FRAMEWORK_RUNTIME_IMAGE: ${{ inputs.framework_runtime_image }}
    DEPLOYMENT_FILE: ${{ inputs.deployment_file }}
    MODEL_NAME: ${{ inputs.model_name }}
    POD_READY_TIMEOUT: ${{ inputs.pod_ready_timeout }}
    # Absolute path in the workspace root. This step runs in
    # examples/backends/<framework>, while the upload step and the
    # test_log_path output refer to the bare file name relative to the
    # workspace; a relative path here would put the log where neither finds it.
    TEST_OUTPUT_LOG: ${{ github.workspace }}/${{ steps.setup-test-names.outputs.test_output_log_file }}
  run: |
    # Patches the runtime image into the deployment file, applies it and waits
    # for the graph's pods to become ready. Emits graph_name, and
    # deploy_failed=true when the readiness wait times out.
    set -x
    export KUBECONFIG=${{ github.workspace }}/.kubeconfig
    kubectl config set-context --current --namespace="$NAMESPACE"
    # Redirect all output to a log file while still showing it
    exec > >(tee -a "$TEST_OUTPUT_LOG") 2>&1
    export KUBE_NS="$NAMESPACE"
    # Assign first, export second: `export VAR=$(cmd)` would hide a yq failure.
    GRAPH_NAME=$(yq e '.metadata.name' "$DEPLOYMENT_FILE")
    export GRAPH_NAME
    if [ -z "$GRAPH_NAME" ] || [ "$GRAPH_NAME" = "null" ]; then
      echo "::error::Could not read .metadata.name from ${DEPLOYMENT_FILE}"
      exit 1
    fi
    echo "graph_name=${GRAPH_NAME}" >> "$GITHUB_OUTPUT"
    # Update the deployment file with the runtime image
    # Use strenv() to ensure the image string is treated as plain string, not parsed as YAML
    yq -i '.spec.services.[].extraPodSpec.mainContainer.image = strenv(FRAMEWORK_RUNTIME_IMAGE)' "$DEPLOYMENT_FILE"
    echo "=== DEPLOYMENT CONFIGURATION ==="
    echo "Framework: ${FRAMEWORK}"
    echo "Runtime Image: ${FRAMEWORK_RUNTIME_IMAGE}"
    echo "Graph Name: ${GRAPH_NAME}"
    echo "Namespace: ${KUBE_NS}"
    echo ""
    echo "=== UPDATED DEPLOYMENT FILE ==="
    cat "$DEPLOYMENT_FILE"
    # Apply the deployment
    kubectl apply -n "$KUBE_NS" -f "$DEPLOYMENT_FILE"
    # Wait for pods to be ready
    echo "=== WAITING FOR PODS ==="
    # Fixed pause before `kubectl wait` (presumably so the pods exist by then).
    sleep 20
    echo "Waiting for pods with label nvidia.com/dynamo-graph-deployment-name: $GRAPH_NAME"
    if ! kubectl wait --for=condition=ready pod \
      -l "nvidia.com/dynamo-graph-deployment-name=$GRAPH_NAME" \
      -n "${KUBE_NS}" \
      --timeout="${POD_READY_TIMEOUT}"; then
      echo "::error::Pods failed to become ready within timeout"
      # Signals the "Debug Pod Failure" step to collect diagnostics.
      echo "deploy_failed=true" >> "$GITHUB_OUTPUT"
      exit 1
    fi
    echo "=== FINAL POD STATUSES ==="
    kubectl get pods -l "nvidia.com/dynamo-graph-deployment-name=$GRAPH_NAME" -n "$KUBE_NS" -o wide
    echo ""
    kubectl get all -n "$KUBE_NS"
- name: Debug Pod Failure
  id: debug-failure
  if: failure() && steps.deploy.outputs.deploy_failed == 'true'
  shell: bash
  env:
    NAMESPACE: ${{ inputs.namespace }}
    FRAMEWORK: ${{ inputs.framework }}
    DEPLOYMENT_FILE: ${{ inputs.deployment_file }}
    GRAPH_NAME: ${{ steps.deploy.outputs.graph_name }}
  run: |
    # Appends a readiness-failure report to the job summary: the pod list for
    # this graph followed by the last 30 log lines of each pod.
    export KUBECONFIG=${{ github.workspace }}/.kubeconfig
    POD_SELECTOR="nvidia.com/dynamo-graph-deployment-name=$GRAPH_NAME"

    # Report header and pod overview
    {
      echo "## ❌ Pod Readiness Failure: ${FRAMEWORK} / ${DEPLOYMENT_FILE}"
      echo ""
      echo "**Graph Name:** \`${GRAPH_NAME}\`"
      echo "**Namespace:** \`${NAMESPACE}\`"
      echo ""
      echo "### All relevant Pods in Namespace"
      echo '```'
      kubectl get pods -l "$POD_SELECTOR" -n ${NAMESPACE} -o wide 2>&1
      echo '```'
      echo ""
    } >> "$GITHUB_STEP_SUMMARY"

    # echo "### Pod Descriptions" >> "$GITHUB_STEP_SUMMARY"
    # echo '```' >> "$GITHUB_STEP_SUMMARY"
    # kubectl describe pods -l "nvidia.com/dynamo-graph-deployment-name=$GRAPH_NAME" -n ${NAMESPACE} >> "$GITHUB_STEP_SUMMARY" 2>&1
    # echo '```' >> "$GITHUB_STEP_SUMMARY"
    # echo "" >> "$GITHUB_STEP_SUMMARY"

    {
      echo "### Pod Logs (last 30 lines per container)"
      echo ""
    } >> "$GITHUB_STEP_SUMMARY"

    # One section per pod keeps the logs readable
    PODS=$(kubectl get pods -l "$POD_SELECTOR" -n ${NAMESPACE} -o jsonpath='{.items[*].metadata.name}')
    if [ -n "$PODS" ]; then
      for POD in $PODS; do
        {
          echo "#### Pod: \`${POD}\`"
          echo '```'
          kubectl logs --tail=30 --all-containers=true ${POD} -n ${NAMESPACE} 2>&1 || echo "No logs available for ${POD}"
          echo '```'
          echo ""
        } >> "$GITHUB_STEP_SUMMARY"
      done
    else
      echo "_No pods found matching the deployment label_" >> "$GITHUB_STEP_SUMMARY"
    fi
- name: Run Validation Tests
  id: test
  shell: bash
  env:
    NAMESPACE: ${{ inputs.namespace }}
    FRAMEWORK: ${{ inputs.framework }}
    MODEL_NAME: ${{ inputs.model_name }}
    MAX_ATTEMPTS: ${{ inputs.model_available_max_attempts }}
    RETRY_DELAY: ${{ inputs.model_available_retry_delay }}
    PORT_FORWARD_DELAY: ${{ inputs.port_forward_delay }}
    MAX_TOKENS: ${{ inputs.max_tokens }}
    TEMPERATURE: ${{ inputs.temperature }}
    MIN_RESPONSE_LENGTH: ${{ inputs.min_response_length }}
    TEST_PROMPT: ${{ inputs.test_prompt }}
    GRAPH_NAME: ${{ steps.deploy.outputs.graph_name }}
  run: |
    # Port-forwards to the frontend pod, waits for the model to be listed in
    # /v1/models, sends one chat-completion request and validates the reply.
    # Emits test_result (0=pass, 1=fail) and exits with the same code.
    set -x
    export KUBECONFIG=${{ github.workspace }}/.kubeconfig
    # Get frontend pod and setup port-forward
    FRONTEND_POD=$(kubectl get pods -n "${NAMESPACE}" \
      -l "nvidia.com/dynamo-component-type=frontend,nvidia.com/dynamo-graph-deployment-name=${GRAPH_NAME}" \
      -o jsonpath='{.items[0].metadata.name}')
    CONTAINER_PORT=$(kubectl get pod "$FRONTEND_POD" -n "${NAMESPACE}" \
      -o jsonpath='{.spec.containers[0].ports[?(@.name=="http")].containerPort}')
    echo "Frontend Pod: ${FRONTEND_POD}"
    echo "Container Port: ${CONTAINER_PORT}"
    kubectl port-forward "pod/$FRONTEND_POD" "8000:${CONTAINER_PORT}" -n "${NAMESPACE}" &
    PORT_FORWARD_PID=$!
    # Stop the port-forward on every exit path, including errexit aborts.
    trap 'kill "$PORT_FORWARD_PID" 2>/dev/null || true' EXIT
    LLM_URL="http://localhost:8000"
    sleep "${PORT_FORWARD_DELAY}"
    echo "LLM URL: ${LLM_URL}"
    echo "Model Name: ${MODEL_NAME}"
    # Wait for model to be available
    MODELS_RESPONSE=""
    ATTEMPT=1
    while [ "$ATTEMPT" -le "$MAX_ATTEMPTS" ]; do
      MODELS_RESPONSE=$(curl -s --retry 5 --retry-delay 2 --retry-connrefused "${LLM_URL}/v1/models" || true)
      # any() yields a single boolean. A bare `.data[]?.id == $m` emits one
      # boolean per model and `jq -e` only honours the last, so the model was
      # missed whenever it was not the final entry in the list.
      if echo "$MODELS_RESPONSE" | jq -e --arg MODEL_NAME "$MODEL_NAME" 'any(.data[]?; .id == $MODEL_NAME)' >/dev/null 2>&1; then
        echo "Model $MODEL_NAME is available in /v1/models"
        break
      fi
      echo "Waiting for model $MODEL_NAME... (attempt $ATTEMPT/$MAX_ATTEMPTS)"
      sleep "${RETRY_DELAY}"
      ATTEMPT=$((ATTEMPT + 1))
    done
    if [ "$ATTEMPT" -gt "$MAX_ATTEMPTS" ]; then
      echo "Model $MODEL_NAME not found after $MAX_ATTEMPTS attempts"
      echo "Last response: $MODELS_RESPONSE"
      echo "test_result=1" >> "$GITHUB_OUTPUT"
      exit 1
    fi
    # Use default prompt if not provided
    # NOTE(review): the default prompt text looks truncated mid-sentence in two
    # places ("hinting at ests that", "lost familt clue is hidden"); it is kept
    # unchanged here - confirm whether that is intended.
    if [ -z "$TEST_PROMPT" ]; then
      TEST_PROMPT="In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden."
    fi
    # Build the request body with jq so quotes, backslashes or newlines in the
    # prompt or model name are escaped instead of corrupting the JSON.
    PAYLOAD=$(jq -n \
      --arg model "$MODEL_NAME" \
      --arg prompt "$TEST_PROMPT" \
      --argjson max_tokens "$MAX_TOKENS" \
      --argjson temperature "$TEMPERATURE" \
      '{model: $model, messages: [{role: "user", content: $prompt}], stream: false, max_tokens: $max_tokens, temperature: $temperature}')
    # Send test request. A transport failure must not abort the script here:
    # the validation below then fails cleanly and test_result is still written.
    RESPONSE=$(curl -s -N --no-buffer --retry 10 --retry-delay 5 --retry-connrefused \
      -X POST "${LLM_URL}/v1/chat/completions" \
      -H 'accept: application/json' \
      -H 'Content-Type: application/json' \
      -d "$PAYLOAD" 2>&1 || true)
    echo "Response: $RESPONSE"
    # Validate response; expected values are passed to jq as variables rather
    # than being spliced into the filter text.
    TEST_RESULT=0
    if ! echo "$RESPONSE" | jq -e . >/dev/null 2>&1; then
      echo "❌ Test failed: Response is not valid JSON"
      echo "Got: $RESPONSE"
      TEST_RESULT=1
    elif ! echo "$RESPONSE" | jq -e '.choices[0].message.role == "assistant"' >/dev/null 2>&1; then
      echo "❌ Test failed: Message role is not 'assistant'"
      echo "Got: $(echo "$RESPONSE" | jq '.choices[0].message.role')"
      TEST_RESULT=1
    elif ! echo "$RESPONSE" | jq -e --arg model "$MODEL_NAME" '.model == $model' >/dev/null 2>&1; then
      echo "❌ Test failed: Model name mismatch"
      echo "Expected: ${MODEL_NAME}"
      echo "Got: $(echo "$RESPONSE" | jq '.model')"
      TEST_RESULT=1
    elif ! echo "$RESPONSE" | jq -e --argjson min "$MIN_RESPONSE_LENGTH" '.choices[0].message.content | length > $min' >/dev/null 2>&1; then
      echo "❌ Test failed: Response too short (min: ${MIN_RESPONSE_LENGTH})"
      echo "Got length: $(echo "$RESPONSE" | jq '.choices[0].message.content | length')"
      TEST_RESULT=1
    else
      echo "✅ Test passed: Response matches expected format and content"
    fi
    echo "test_result=${TEST_RESULT}" >> "$GITHUB_OUTPUT"
    # The EXIT trap stops the port-forward; the exit code is preserved.
    exit $TEST_RESULT
- name: Cleanup Deployment
  if: always() && inputs.skip_cleanup != 'true'
  shell: bash
  env:
    NAMESPACE: ${{ inputs.namespace }}
    GRAPH_NAME: ${{ steps.deploy.outputs.graph_name }}
  run: |
    # Best-effort removal of the DynamoGraphDeployment created by the deploy
    # step; none of the kubectl calls here may fail the job.
    set -x
    export KUBECONFIG=${{ github.workspace }}/.kubeconfig
    echo "=== PRE-CLEANUP STATUS ==="
    kubectl get dynamographdeployments -n "$NAMESPACE" || true
    kubectl get pods -n "$NAMESPACE" || true
    # The deploy step emits graph_name before it applies anything, so an empty
    # value means nothing was applied and there is nothing to delete.
    if [ -z "$GRAPH_NAME" ]; then
      echo "No graph name recorded by the deploy step; nothing to delete."
      exit 0
    fi
    echo "Deleting DynamoGraphDeployment: ${GRAPH_NAME}"
    kubectl delete dynamographdeployments "$GRAPH_NAME" -n "$NAMESPACE" --timeout=60s || true
- name: Upload Test Results
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f #v6
if: always()
with:
name: ${{ steps.setup-test-names.outputs.artifact_name }}
path: ${{ steps.setup-test-names.outputs.test_output_log_file }}
retention-days: 7
name: 'Initialize Dynamo Builder'
description: 'Route buildkit workers and bootstrap buildx builder for dynamo builds'
# This action combines buildkit worker discovery and builder bootstrapping into a single step.
# It wraps route_buildkit.sh and bootstrap-buildkit action to simplify workflow configuration.
#
# How it works:
# 1. Discovers available BuildKit pods via Kubernetes DNS using route_buildkit.sh
# 2. Routes pods to the specified flavor based on modulo-3 strategy (see route_buildkit.sh)
# 3. Bootstraps a docker buildx builder using the discovered workers
# 4. Falls back to Kubernetes driver if no remote workers are available
#
# Architecture modes:
# - Single arch: Set arch to 'amd64' or 'arm64' to initialize for one architecture
# - Multi arch: Set all_arch to 'true' to initialize for both amd64 and arm64
#
# Flavor routing:
# BuildKit pods are assigned to flavors based on pod index modulo 3 (see route_buildkit.sh):
# - Pool 0 (mod 0): vllm-cuda12, sglang-cuda12
# - Pool 1 (mod 1): vllm-cuda13, sglang-cuda13
# - Pool 2 (mod 2): trtllm-cuda13, general (and any other flavor/CUDA combination)
#
# Usage examples:
# # Initialize for both architectures with general flavor:
# - uses: ./.github/actions/init-dynamo-builder
# with:
# builder_name: my-builder
# flavor: general
# all_arch: 'true'
#
# # Initialize for single architecture with specific flavor and CUDA version:
# - uses: ./.github/actions/init-dynamo-builder
# with:
# builder_name: my-builder
# flavor: vllm
# arch: amd64
# cuda_version: '12.9'
inputs:
builder_name:
description: 'Name for the buildx builder'
required: true
flavor:
description: 'Buildkit flavor (vllm, trtllm, sglang, general)'
required: false
default: 'general'
arch:
description: 'Target architecture (amd64, arm64). Ignored if all_arch is true.'
required: false
default: 'amd64'
all_arch:
description: 'If true, initialize builder for both amd64 and arm64 architectures'
required: false
default: 'false'
cuda_version:
description: 'CUDA version (12.9, 13.0). Optional for general flavor.'
required: false
default: ''
# Passthrough inputs for bootstrap-buildkit (kubernetes fallback)
ephemeral_storage:
description: 'Ephemeral storage request for Kubernetes driver'
required: false
default: '400Gi'
namespace:
description: 'Kubernetes namespace for buildkit pods'
required: false
default: 'buildkit'
replicas:
description: 'Number of buildkit replicas'
required: false
default: '1'
requests_cpu:
description: 'CPU requests for buildkit pods'
required: false
default: '12'
requests_memory:
description: 'Memory requests for buildkit pods'
required: false
default: '26Gi'
limits_memory:
description: 'Memory limits for buildkit pods'
required: false
default: '29Gi'
tolerations:
description: 'Tolerations for buildkit pods'
required: false
default: "key=buildkit-fallback-worker,value=true,operator=Equal,effect=NoSchedule"
runs:
using: "composite"
steps:
# Runs route_buildkit.sh for the requested flavor and architecture(s).
# continue-on-error lets the following steps run even when the script exits
# non-zero (it does so when it cannot find any remote BuildKit pod).
- name: Route buildkit workers
  id: route-buildkit
  continue-on-error: true
  shell: bash
  env:
    # Inputs are handed to the script as environment variables instead of
    # being pasted into the shell text, so their content is never parsed as
    # shell syntax. The INPUT_ prefix avoids clashing with variables that
    # route_buildkit.sh reads or sets itself.
    INPUT_FLAVOR: ${{ inputs.flavor }}
    INPUT_ARCH: ${{ inputs.arch }}
    INPUT_ALL_ARCH: ${{ inputs.all_arch }}
    INPUT_CUDA_VERSION: ${{ inputs.cuda_version }}
  run: |
    # --cuda is only passed when a version was supplied.
    CUDA_ARGS=()
    if [[ -n "$INPUT_CUDA_VERSION" ]]; then
      CUDA_ARGS=(--cuda "$INPUT_CUDA_VERSION")
    fi
    # all_arch takes precedence over the single-arch input.
    if [[ "$INPUT_ALL_ARCH" == "true" ]]; then
      TARGET_ARCH="all"
    else
      TARGET_ARCH="$INPUT_ARCH"
    fi
    echo "running with --arch ${TARGET_ARCH} --flavor ${INPUT_FLAVOR} ${CUDA_ARGS[*]}"
    .github/scripts/route_buildkit.sh --arch "$TARGET_ARCH" --flavor "$INPUT_FLAVOR" "${CUDA_ARGS[@]}"
# Turns the per-architecture outputs of the route-buildkit step into the single
# comma-separated worker_addresses output consumed by the bootstrap step.
- name: Prepare worker addresses and platform
  id: prepare
  shell: bash
  env:
    # Expression results are passed through the environment rather than being
    # interpolated into the script text.
    ALL_ARCH: ${{ inputs.all_arch }}
    AMD64_ADDRS: ${{ steps.route-buildkit.outputs[format('{0}_amd64', inputs.flavor)] }}
    ARM64_ADDRS: ${{ steps.route-buildkit.outputs[format('{0}_arm64', inputs.flavor)] }}
    SINGLE_ARCH_ADDRS: ${{ steps.route-buildkit.outputs[format('{0}_{1}', inputs.flavor, inputs.arch)] }}
  run: |
    if [[ "$ALL_ARCH" == "true" ]]; then
      # Combine both architecture outputs for multi-arch builds
      if [[ -n "$AMD64_ADDRS" && -n "$ARM64_ADDRS" ]]; then
        WORKER_ADDRESSES="${AMD64_ADDRS},${ARM64_ADDRS}"
      else
        # At most one of the two is non-empty here, so plain concatenation
        # yields that one (or the empty string when neither is set).
        WORKER_ADDRESSES="${AMD64_ADDRS}${ARM64_ADDRS}"
      fi
    else
      # Single architecture build
      WORKER_ADDRESSES="$SINGLE_ARCH_ADDRS"
    fi
    echo "worker_addresses=${WORKER_ADDRESSES}" >> "$GITHUB_OUTPUT"
- name: Bootstrap buildkit
uses: ./.github/actions/bootstrap-buildkit
with:
builder_name: ${{ inputs.builder_name }}
buildkit_worker_addresses: ${{ steps.prepare.outputs.worker_addresses }}
ephemeral_storage: ${{ inputs.ephemeral_storage }}
namespace: ${{ inputs.namespace }}
replicas: ${{ inputs.replicas }}
requests_cpu: ${{ inputs.requests_cpu }}
requests_memory: ${{ inputs.requests_memory }}
limits_memory: ${{ inputs.limits_memory }}
tolerations: ${{ inputs.tolerations }}
...@@ -8,10 +8,6 @@ inputs: ...@@ -8,10 +8,6 @@ inputs:
image_tag: image_tag:
description: 'Image Tag to run tests on' description: 'Image Tag to run tests on'
required: true required: true
cpu_limit:
description: 'Maximum number of cores available to docker'
required: false
default: '10'
framework: framework:
description: 'Framework name for test metrics' description: 'Framework name for test metrics'
required: false required: false
...@@ -38,7 +34,18 @@ inputs: ...@@ -38,7 +34,18 @@ inputs:
default: 'false' default: 'false'
hf_token: hf_token:
required: false required: false
parallel_mode:
description: 'Parallelization mode: auto (use all cores), none/0 (sequential), or a number of workers'
required: false
default: 'auto'
dind_as_sidecar:
description: 'dind runs as a sidecar container (true/false)'
required: false
default: 'false'
cpu_limit:
description: 'Maximum number of cores available to docker'
required: false
default: '10'
runs: runs:
using: "composite" using: "composite"
...@@ -98,7 +105,8 @@ runs: ...@@ -98,7 +105,8 @@ runs:
sleep 1 sleep 1
done done
- name: Run tests - name: Run tests for runner v1
if: inputs.dind_as_sidecar == 'false'
shell: bash shell: bash
env: env:
NUM_CPUS: ${{ inputs.cpu_limit }} NUM_CPUS: ${{ inputs.cpu_limit }}
...@@ -164,6 +172,96 @@ runs: ...@@ -164,6 +172,96 @@ runs:
# Always continue to results processing # Always continue to results processing
exit 0 exit 0
- name: Run tests in dind as sidecar mode
if: inputs.dind_as_sidecar == 'true'
shell: bash
env:
CONTAINER_ID: test_${{ github.run_id }}_${{ github.run_attempt }}_${{ github.job }}
PYTEST_XML_FILE: pytest_test_report.xml
HF_HOME: /runner/_work/_temp
HF_TOKEN: ${{ inputs.hf_token }}
run: |
# Run pytest with detailed output and JUnit XML
set +e # Don't exit on test failures
# Define common docker flags for stability (Shared memory & limits)
# --ipc=host is critical for parallel pytest workers to communicate fast
DOCKER_OPTS="--ipc=host --ulimit memlock=-1 --ulimit stack=67108864"
# Determine docker runtime flags and pytest command based on dry_run mode
if [[ "${{ inputs.dry_run }}" == "true" ]]; then
echo "🔍 Running pytest in dry-run mode (collect-only, no GPU required)"
GPU_FLAGS=""
PYTEST_CMD="pytest -v --collect-only -m \"${{ inputs.pytest_marks }}\""
else
echo "🚀 Running pytest in normal mode"
MYPY_FLAG=""
if [[ "${{ inputs.enable_mypy }}" == "true" ]]; then
echo "🔍 Mypy type checking enabled"
MYPY_FLAG="--mypy"
fi
# Detect GPU availability and conditionally add GPU flags
GPU_FLAGS=""
# We check 'docker info' for the 'nvidia' runtime, which indicates the Daemon can spawn GPU containers.
if docker info 2>/dev/null | grep -i "runtimes" | grep -q "nvidia"; then
echo "✓ Docker Daemon supports Nvidia runtime, enabling GPU flags"
GPU_FLAGS="--gpus all"
else
echo "⚠️ Nvidia runtime not found in Docker Daemon, running in CPU-only mode"
fi
# Determine parallelization based on parallel_mode input
case "${{ inputs.parallel_mode }}" in
"auto")
PARALLEL_OPTS="-n auto"
echo "📊 Parallelization: auto (use all available cores)"
;;
"none"|"0")
PARALLEL_OPTS="-n 0"
echo "📊 Parallelization: disabled (sequential execution) for GPU runs"
;;
*)
PARALLEL_OPTS="-n ${{ inputs.parallel_mode }}"
echo "📊 Parallelization: ${{ inputs.parallel_mode }} workers"
;;
esac
# Construct final command with xdist parallelization (-n) and other options
# --dist=loadscope groups tests by module/class to prevent race conditions in stateful tests
PYTEST_CMD="pytest ${PARALLEL_OPTS} --dist=loadscope --continue-on-collection-errors -v --tb=short --basetemp=/tmp/pytest_temp -o cache_dir=/tmp/.pytest_cache --junitxml=/workspace/test-results/${{ env.PYTEST_XML_FILE }} --durations=10 ${MYPY_FLAG} -m \"${{ inputs.pytest_marks }}\""
fi
# Get absolute path for test-results directory and ensure it has proper permissions
TEST_RESULTS_DIR="$(pwd)/test-results"
chmod 777 "${TEST_RESULTS_DIR}"
echo "📁 Test results will be saved to: ${TEST_RESULTS_DIR}"
echo "▶️ Executing: $PYTEST_CMD"
docker run ${GPU_FLAGS} ${DOCKER_OPTS} --rm -w /workspace \
--network host \
--env HF_TOKEN="${HF_TOKEN}" \
--name ${{ env.CONTAINER_ID }}_pytest \
-v "${TEST_RESULTS_DIR}:/workspace/test-results" \
${{ inputs.image_tag }} \
bash -c "${PYTEST_CMD}"
TEST_EXIT_CODE=$?
echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> $GITHUB_ENV
echo "🧪 Tests completed with exit code: ${TEST_EXIT_CODE}"
# Verify test results were written (only in normal mode)
if [[ "${{ inputs.dry_run }}" != "true" ]]; then
if [[ -f "${TEST_RESULTS_DIR}/${{ env.PYTEST_XML_FILE }}" ]]; then
echo "✅ Test results file found: ${TEST_RESULTS_DIR}/${{ env.PYTEST_XML_FILE }}"
else
echo "⚠️ Test results file not found: ${TEST_RESULTS_DIR}/${{ env.PYTEST_XML_FILE }}"
fi
fi
# Always continue to results processing
exit 0
- name: Process Test Results - name: Process Test Results
shell: bash shell: bash
run: | run: |
......
name: 'Skopeo Copy'
description: 'Copy container images between registries using skopeo'
inputs:
source_registry:
description: 'Source registry hostname (e.g., 123456789.dkr.ecr.us-east-1.amazonaws.com)'
required: true
source_image:
description: 'Source image name (e.g., ai-dynamo/dynamo)'
required: true
source_tag:
description: 'Source image tag'
required: true
target_registry:
description: 'Target registry hostname'
required: true
target_image:
description: 'Target image name (defaults to source_image if not specified)'
required: false
target_tag:
description: 'Target image tag (defaults to source_tag if not specified)'
required: false
# Skopeo Login inputs for source registry
source_aws_default_region:
description: 'AWS Default Region for source ECR'
required: false
source_aws_account_id:
description: 'AWS Account ID for source ECR'
required: false
source_azure_acr_hostname:
description: 'Azure ACR hostname for source registry'
required: false
source_azure_acr_user:
description: 'Azure ACR user for source registry'
required: false
source_azure_acr_password:
description: 'Azure ACR password for source registry'
required: false
# Skopeo Login inputs for target registry
target_aws_default_region:
description: 'AWS Default Region for target ECR'
required: false
target_aws_account_id:
description: 'AWS Account ID for target ECR'
required: false
target_azure_acr_hostname:
description: 'Azure ACR hostname for target registry'
required: false
target_azure_acr_user:
description: 'Azure ACR user for target registry'
required: false
target_azure_acr_password:
description: 'Azure ACR password for target registry'
required: false
outputs:
target_image_ref:
description: 'Full target image reference'
value: ${{ steps.copy.outputs.target_image_ref }}
runs:
using: "composite"
steps:
- name: Login to Source Registry
uses: ./.github/actions/skopeo-login
with:
aws_default_region: ${{ inputs.source_aws_default_region }}
aws_account_id: ${{ inputs.source_aws_account_id }}
azure_acr_hostname: ${{ inputs.source_azure_acr_hostname }}
azure_acr_user: ${{ inputs.source_azure_acr_user }}
azure_acr_password: ${{ inputs.source_azure_acr_password }}
- name: Login to Target Registry
uses: ./.github/actions/skopeo-login
with:
aws_default_region: ${{ inputs.target_aws_default_region }}
aws_account_id: ${{ inputs.target_aws_account_id }}
azure_acr_hostname: ${{ inputs.target_azure_acr_hostname }}
azure_acr_user: ${{ inputs.target_azure_acr_user }}
azure_acr_password: ${{ inputs.target_azure_acr_password }}
# Copies source_registry/source_image:source_tag to the target registry with
# `skopeo copy --all` and publishes the resulting reference as target_image_ref.
- name: Copy Image
  id: copy
  shell: bash
  env:
    # Inputs travel through the environment so their values are never parsed
    # as part of the script.
    SOURCE_REGISTRY: ${{ inputs.source_registry }}
    SOURCE_IMAGE: ${{ inputs.source_image }}
    SOURCE_TAG: ${{ inputs.source_tag }}
    TARGET_REGISTRY: ${{ inputs.target_registry }}
    TARGET_IMAGE_INPUT: ${{ inputs.target_image }}
    TARGET_TAG_INPUT: ${{ inputs.target_tag }}
  run: |
    set -euo pipefail
    # Use source values as defaults if target not specified
    TARGET_IMAGE="${TARGET_IMAGE_INPUT:-$SOURCE_IMAGE}"
    TARGET_TAG="${TARGET_TAG_INPUT:-$SOURCE_TAG}"
    SOURCE_REF="docker://${SOURCE_REGISTRY}/${SOURCE_IMAGE}:${SOURCE_TAG}"
    TARGET_IMAGE_REF="${TARGET_REGISTRY}/${TARGET_IMAGE}:${TARGET_TAG}"
    TARGET_REF="docker://${TARGET_IMAGE_REF}"
    echo "Copying image:"
    echo " Source: ${SOURCE_REF}"
    echo " Target: ${TARGET_REF}"
    skopeo copy --all "${SOURCE_REF}" "${TARGET_REF}"
    echo "target_image_ref=${TARGET_IMAGE_REF}" >> "$GITHUB_OUTPUT"
    echo "✅ Image copied successfully"
name: 'Skopeo Login'
description: 'Login to multiple container registries using skopeo (ECR, ACR)'
inputs:
aws_default_region:
description: 'AWS Default Region'
required: false
aws_account_id:
description: 'AWS Account ID'
required: false
azure_acr_hostname:
description: 'Azure ACR hostname'
required: false
azure_acr_user:
description: 'Azure ACR user'
required: false
azure_acr_password:
description: 'Azure ACR password'
required: false
runs:
using: "composite"
steps:
- name: Install skopeo
shell: bash
run: |
if ! command -v skopeo &> /dev/null; then
echo "Installing skopeo..."
if [ -f /etc/debian_version ]; then
# Added --fix-missing to handle 404s on transient package versions
sudo apt-get update
sudo apt-get install -y --fix-missing skopeo
elif [ -f /etc/redhat-release ]; then
sudo dnf install -y skopeo
else
echo "Unsupported OS for automatic skopeo installation"
exit 1
fi
else
echo "skopeo is already installed"
fi
skopeo --version
# Logs skopeo in to ECR; runs only when both the region and account id are set.
- name: ECR Login
  shell: bash
  if: ${{ inputs.aws_default_region != '' && inputs.aws_account_id != '' }}
  env:
    # Region is passed via the environment and quoted at use, instead of being
    # interpolated unquoted into the command line.
    ECR_REGION: ${{ inputs.aws_default_region }}
    ECR_HOSTNAME: ${{ inputs.aws_account_id }}.dkr.ecr.${{ inputs.aws_default_region }}.amazonaws.com
  run: |
    set -euo pipefail
    aws ecr get-login-password --region "${ECR_REGION}" | skopeo login --username AWS --password-stdin "${ECR_HOSTNAME}"
# Logs skopeo in to ACR; runs only when hostname, user and password are all set.
- name: ACR Login
  shell: bash
  if: ${{ inputs.azure_acr_hostname != '' && inputs.azure_acr_user != '' && inputs.azure_acr_password != '' }}
  env:
    # The credentials are passed through the environment: pasting the password
    # into a double-quoted shell string breaks (or gets expanded) as soon as it
    # contains a quote, `$`, backtick or backslash.
    ACR_HOSTNAME: ${{ inputs.azure_acr_hostname }}
    ACR_USER: ${{ inputs.azure_acr_user }}
    ACR_PASSWORD: ${{ inputs.azure_acr_password }}
  run: |
    set -euo pipefail
    # printf keeps the trailing newline the previous `echo` produced, without
    # echo's option/escape handling of the value.
    printf '%s\n' "${ACR_PASSWORD}" | skopeo login "${ACR_HOSTNAME}" --username "${ACR_USER}" --password-stdin
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# =============================================================================
# route_buildkit.sh - Discover and route BuildKit pods for CI builds
# =============================================================================
#
# ROUTING LOGIC:
# --------------
# Routing is optimized for Docker layer caching based on shared base images:
# - vLLM and SGLang share the same base image (cuda-dl-base) when CUDA versions match
# - TensorRT-LLM uses a different base (pytorch), so it's isolated
# - General builds have no framework, grouped with trtllm for isolation
#
# Flavors are routed to BuildKit pods using modulo 3 on the pod index:
# - Pool 0 (idx % 3 == 0): vllm-cuda12, sglang-cuda12 (share cuda-dl-base + wheel_builder cache)
# - Pool 1 (idx % 3 == 1): vllm-cuda13, sglang-cuda13 (share cuda-dl-base + wheel_builder cache)
# - Pool 2 (idx % 3 == 2): trtllm-cuda13, general (isolated - different/no framework base)
#
# FALLBACK: If no pods match the target pool, the highest available index is used.
#
# EXPECTED ROUTING TABLE (pod indices returned for each flavor):
# +------+-------------+---------------+-------------+---------------+---------------+---------+
# | Pods | vllm-cuda12 | sglang-cuda12 | vllm-cuda13 | sglang-cuda13 | trtllm-cuda13 | general |
# | | (mod 0) | (mod 0) | (mod 1) | (mod 1) | (mod 2) | (mod 2) |
# +------+-------------+---------------+-------------+---------------+---------------+---------+
# | 1 | 0 | 0 | 0 (fb) | 0 (fb) | 0 (fb) | 0 (fb) |
# | 2 | 0 | 0 | 1 | 1 | 1 (fb) | 1 (fb) |
# | 3 | 0 | 0 | 1 | 1 | 2 | 2 |
# | 4 | 0, 3 | 0, 3 | 1 | 1 | 2 | 2 |
# | 5 | 0, 3 | 0, 3 | 1, 4 | 1, 4 | 2 | 2 |
# | 6 | 0, 3 | 0, 3 | 1, 4 | 1, 4 | 2, 5 | 2, 5 |
# +------+-------------+---------------+-------------+---------------+---------------+---------+
# (fb) = fallback - no pods matched target pool, returns max available index
#
# =============================================================================
set -e
# --- ARGUMENT PARSING ---
# Reads --arch, --flavor and --cuda from the command line, validates them, and
# expands "all" into the ARCHS / FLAVORS arrays used by the rest of the script.
ARCH_INPUT=""
FLAVOR_INPUT=""
CUDA_VERSION=""
ALL_FLAVORS=("vllm" "trtllm" "sglang" "general")
USAGE="Use --arch <amd64|arm64|all> --flavor <vllm|trtllm|sglang|general|all> [--cuda <12.9|13.0|13.1>]."

while [[ $# -gt 0 ]]; do
  case $1 in
    --arch|--flavor|--cuda)
      # Every option takes a value. Without this guard `shift 2` fails under
      # `set -e` and the script exits without printing any diagnostic.
      if [[ $# -lt 2 ]]; then
        echo "❌ Error: Missing value for '$1'. ${USAGE}"
        exit 1
      fi
      case $1 in
        --arch) ARCH_INPUT="$2" ;;
        --flavor) FLAVOR_INPUT="$2" ;;
        --cuda) CUDA_VERSION="$2" ;;
      esac
      shift 2
      ;;
    *)
      echo "❌ Error: Unknown argument '$1'. ${USAGE}"
      exit 1
      ;;
  esac
done

if [ -z "$ARCH_INPUT" ]; then
  echo "❌ Error: Must specify --arch <amd64|arm64|all>."
  exit 1
fi

if [ -z "$FLAVOR_INPUT" ]; then
  echo "❌ Error: Must specify --flavor <vllm|trtllm|sglang|general|all>."
  exit 1
fi

# CUDA version is required for all flavors except "general"
if [ -z "$CUDA_VERSION" ] && [ "$FLAVOR_INPUT" != "general" ]; then
  echo "❌ Error: Must specify --cuda <12.9|13.0|13.1> for flavor '$FLAVOR_INPUT'."
  exit 1
fi

# Validate arch input
case $ARCH_INPUT in
  amd64|arm64|all) ;;
  *)
    echo "❌ Error: Invalid arch '$ARCH_INPUT'. Must be amd64, arm64, or all."
    exit 1
    ;;
esac

# Validate flavor input
case $FLAVOR_INPUT in
  vllm|trtllm|sglang|general|all) ;;
  *)
    echo "❌ Error: Invalid flavor '$FLAVOR_INPUT'. Must be vllm, trtllm, sglang, general, or all."
    exit 1
    ;;
esac

# Validate CUDA version input (allow empty for general flavor)
if [ -n "$CUDA_VERSION" ]; then
  case $CUDA_VERSION in
    12.9|13.0|13.1) ;;
    *)
      echo "❌ Error: Invalid CUDA version '$CUDA_VERSION'. Must be 12.9, 13.0, or 13.1."
      exit 1
      ;;
  esac
fi

# Determine architectures to process
if [ "$ARCH_INPUT" = "all" ]; then
  ARCHS=("amd64" "arm64")
else
  ARCHS=("$ARCH_INPUT")
fi

# Determine flavors to process
if [ "$FLAVOR_INPUT" = "all" ]; then
  FLAVORS=("${ALL_FLAVORS[@]}")
else
  FLAVORS=("$FLAVOR_INPUT")
fi
# --- CONFIGURATION ---
# NAMESPACE and PORT are used to build the per-pod DNS names and tcp://
# addresses further down in this script.
NAMESPACE="buildkit"
PORT="1234"
MAX_POD_CHECK=10 # How many pod indices to probe (indices 0 to MAX_POD_CHECK-1, i.e. 0 to 9)
# ---------------------
# Pod discovery relies on nslookup, so fail early when it is not installed.
if ! command -v nslookup &> /dev/null; then
echo "❌ Error: nslookup not found. Please install dnsutils or bind-tools."
exit 1
fi
# --- RETRY CONFIGURATION ---
# Both values can be overridden from the environment; the defaults are
# 8 retries with a 30 second sleep between attempts.
MAX_RETRIES=${MAX_RETRIES:-8}
RETRY_DELAY=${RETRY_DELAY:-30}
# ---------------------------
# List the pod indices that currently resolve in DNS.
# Arguments: $1 = architecture, $2 = headless service name.
# Every index from 0 to MAX_POD_CHECK-1 is probed individually, so gaps are
# tolerated (e.g. pod-0 and pod-2 up while pod-1 is down).
# Prints the resolving indices, space-separated, on a single line.
get_active_indices() {
  local arch=$1
  local service_name=$2
  local -a found=()
  local candidate=0
  local fqdn

  while (( candidate < MAX_POD_CHECK )); do
    fqdn="buildkit-${arch}-${candidate}.${service_name}.${NAMESPACE}.svc.cluster.local"
    # A successful lookup means this specific pod exists.
    if nslookup "$fqdn" >/dev/null 2>&1; then
      found+=("$candidate")
    fi
    candidate=$(( candidate + 1 ))
  done

  echo "${found[@]}"
}
# Pick the pod indices that should serve a flavor, using index modulo 3.
# Arguments: $1 = flavor, $2 = CUDA version (may be empty), $3.. = active indices.
# Prints the matching indices space-separated; when none of the active indices
# falls into the flavor's pool, prints the highest active index instead.
# Prints an empty line when no active indices are given.
get_target_indices() {
  local flavor=$1
  local cuda_version=$2
  # Everything after the first two arguments is the list of usable indices.
  local -a candidates=("${@:3}")

  if (( ${#candidates[@]} == 0 )); then
    echo ""
    return
  fi

  # Only the CUDA major version takes part in the routing key.
  local route_key="${flavor}-cuda${cuda_version%%.*}"

  # Pool 2 is the default: it hosts trtllm-cuda13 and general builds and also
  # catches every combination that is not listed explicitly below.
  local target_mod=2
  case "$route_key" in
    # Pool 0: CUDA 12 builds of vLLM and SGLang
    vllm-cuda12|sglang-cuda12) target_mod=0 ;;
    # Pool 1: CUDA 13 builds of vLLM and SGLang
    vllm-cuda13|sglang-cuda13) target_mod=1 ;;
  esac
  echo " [DEBUG] Routing Key: '$route_key' -> Worker Index Modulo: $target_mod" >&2

  # Keep only the active indices that belong to the selected pool.
  local -a matches=()
  for idx in "${candidates[@]}"; do
    if (( idx % 3 == target_mod )); then
      matches+=("$idx")
    fi
  done

  if (( ${#matches[@]} > 0 )); then
    echo "${matches[@]}"
    return
  fi

  # No pod in the pool: fall back to the highest active index.
  local highest=${candidates[0]}
  for idx in "${candidates[@]}"; do
    if [ "$idx" -gt "$highest" ]; then
      highest=$idx
    fi
  done
  echo "$highest"
}
# Process each architecture: discover the active pods, wait for them if none
# are up yet, then write one "<flavor>_<arch>=<addresses>" line per flavor to
# $GITHUB_OUTPUT.
for ARCH in "${ARCHS[@]}"; do
  SERVICE_NAME="buildkit-${ARCH}-headless"
  POD_PREFIX="buildkit-${ARCH}"
  echo "🔍 Discovering active Buildkit pods for ${ARCH} via DNS (checking indices 0-$((MAX_POD_CHECK-1)))..."

  # Get the actual list of alive indices (e.g., "0 2 5")
  ACTIVE_INDICES=($(get_active_indices "$ARCH" "$SERVICE_NAME"))
  COUNT=${#ACTIVE_INDICES[@]}

  if [ "$COUNT" -eq "0" ]; then
    echo "⚠️ DNS returned 0 records for ${ARCH}. KEDA should be triggering a new buildkit pod."

    # Re-probe up to MAX_RETRIES times, sleeping RETRY_DELAY seconds before each attempt.
    for (( retry=1; retry<=MAX_RETRIES; retry++ )); do
      echo "⏳ Waiting ${RETRY_DELAY}s for BuildKit pods to become available (attempt ${retry}/${MAX_RETRIES})..."
      sleep "$RETRY_DELAY"
      ACTIVE_INDICES=($(get_active_indices "$ARCH" "$SERVICE_NAME"))
      COUNT=${#ACTIVE_INDICES[@]}
      if [ "$COUNT" -gt "0" ]; then
        echo "✅ BuildKit pods for ${ARCH} are now available!"
        break
      fi
    done

    # Give up once the retries are exhausted. The check lives after the loop so
    # that it also triggers when MAX_RETRIES is 0 and the loop body never runs.
    if [ "$COUNT" -eq "0" ]; then
      echo "::warning::No remote BuildKit pods available for ${ARCH} after ${MAX_RETRIES} attempts. Falling back to Kubernetes driver."
      echo "⚠️ Warning: No remote BuildKit pods available for ${ARCH}."
      # Publish empty outputs for this architecture before failing.
      for flavor in "${FLAVORS[@]}"; do
        echo "${flavor}_${ARCH}=" >> "$GITHUB_OUTPUT"
      done
      # NOTE(review): exiting here skips any architectures that have not been
      # processed yet when --arch all is used - confirm that this is intended.
      exit 1
    fi
  fi

  echo "✅ Found $COUNT active pod(s) (Indices: ${ACTIVE_INDICES[*]})."

  # Iterate over flavors and set outputs
  for flavor in "${FLAVORS[@]}"; do
    # Pass the discovered ACTIVE_INDICES to the routing function
    TARGET_INDICES=($(get_target_indices "$flavor" "$CUDA_VERSION" "${ACTIVE_INDICES[@]}"))

    # Build the comma-separated list of tcp:// worker addresses.
    ADDRS=""
    for idx in "${TARGET_INDICES[@]}"; do
      POD_NAME="${POD_PREFIX}-${idx}"
      ADDR="tcp://${POD_NAME}.${SERVICE_NAME}.${NAMESPACE}.svc.cluster.local:${PORT}"
      # Prefix a comma only when ADDRS already holds an address.
      ADDRS="${ADDRS:+${ADDRS},}${ADDR}"
    done

    echo " -> Routing ${flavor}_${ARCH} to pod indices: ${TARGET_INDICES[*]}"
    # Write to GitHub Output
    echo "${flavor}_${ARCH}=$ADDRS" >> "$GITHUB_OUTPUT"
  done
done
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
name: Framework Matrix Pipeline
on:
workflow_call:
inputs:
framework:
description: 'Framework name (vllm, sglang, trtllm)'
required: true
type: string
platforms:
description: 'Platforms to build (JSON array, e.g., ["amd64", "arm64"])'
required: true
type: string
cuda_versions:
description: 'CUDA versions to build (JSON array, e.g., ["12.9", "13.0"])'
required: true
type: string
run_tests:
description: 'Whether to run pytest'
required: false
type: boolean
default: true
copy_to_acr:
description: 'Whether to copy images to ACR'
required: false
type: boolean
default: true
builder_name:
description: 'Buildkit builder name'
required: true
type: string
extra_tags:
description: 'Additional tags (newline-separated, -$platform suffix auto-appended)'
required: false
type: string
default: ''
build_image:
description: 'Whether to build image'
required: false
type: boolean
default: true
no_cache:
description: 'Disable Docker build cache'
required: false
type: boolean
default: false
push_image:
description: 'Push image to registry'
required: false
type: boolean
default: true
secrets:
AWS_DEFAULT_REGION:
required: true
AWS_ACCOUNT_ID:
required: true
AZURE_ACR_HOSTNAME:
required: true
AZURE_ACR_USER:
required: true
AZURE_ACR_PASSWORD:
required: true
CI_TOKEN:
required: false
SCCACHE_S3_BUCKET:
required: false
AWS_ACCESS_KEY_ID:
required: false
AWS_SECRET_ACCESS_KEY:
required: false
HF_TOKEN:
required: false
jobs:
pipeline:
strategy:
fail-fast: false
matrix:
platform: ${{ fromJson(inputs.platforms) }}
cuda_version: ${{ fromJson(inputs.cuda_versions) }}
name: ${{ inputs.framework }}-cuda${{ matrix.cuda_version }}-${{ matrix.platform }}
uses: ./.github/workflows/build-test-distribute-flavor.yml
with:
framework: ${{ inputs.framework }}
platform: ${{ matrix.platform }}
cuda_version: ${{ matrix.cuda_version }}
extra_tags: ${{ inputs.extra_tags }}
no_cache: ${{ inputs.no_cache }}
builder_name: ${{ inputs.builder_name }}
build_image: ${{ inputs.build_image }}
run_tests: ${{ inputs.run_tests && !(inputs.framework == 'trtllm' && matrix.platform == 'arm64') }} # trtllm tests on arm64 are not supported
copy_to_acr: ${{ inputs.copy_to_acr && matrix.platform == 'amd64' }} # no reason to copy ARM images to ACR
push_image: ${{ inputs.push_image }}
secrets: inherit
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
name: Build, Test, and Copy Framework Image
on:
workflow_call:
inputs:
framework:
description: 'Framework name (vllm, sglang, trtllm)'
required: true
type: string
platform:
description: 'Platform to build (amd64 or arm64)'
required: true
type: string
cuda_version:
description: 'CUDA version to build (e.g., 12.9, 13.0)'
required: true
type: string
run_tests:
description: 'Whether to run pytest'
required: false
type: boolean
default: true
copy_to_acr:
description: 'Whether to copy images to ACR'
required: false
type: boolean
default: true
builder_name:
description: 'Buildkit builder name'
required: true
type: string
extra_tags:
description: 'Additional tags (newline-separated, -$platform suffix auto-appended)'
required: false
type: string
default: ''
build_image:
description: 'Whether to build image'
required: false
type: boolean
default: true
no_cache:
description: 'Disable Docker build cache'
required: false
type: boolean
default: false
push_image:
description: 'Push image to registry'
required: false
type: boolean
default: true
no_load:
description: 'Do not load the image into docker (you must have dind installed if you want to load the image)'
required: false
type: boolean
default: true
show_summary:
description: 'Show summary'
required: false
type: boolean
default: false
secrets:
AWS_DEFAULT_REGION:
required: true
AWS_ACCOUNT_ID:
required: true
AZURE_ACR_HOSTNAME:
required: true
AZURE_ACR_USER:
required: true
AZURE_ACR_PASSWORD:
required: true
CI_TOKEN:
required: false
SCCACHE_S3_BUCKET:
required: false
AWS_ACCESS_KEY_ID:
required: false
AWS_SECRET_ACCESS_KEY:
required: false
HF_TOKEN:
required: false
outputs:
image_tag:
description: 'Image tag in ACR'
value: ${{ jobs.copy-to-acr.outputs.target_tag_plain }}-${{ inputs.platform }}
jobs:
# ============================================================================
# BUILD
# ============================================================================
build:
if: inputs.build_image
name: Build ${{ inputs.framework }}-cuda${{ inputs.cuda_version }}-${{ inputs.platform }}
runs-on: prod-builder-v2
outputs:
target_tag_plain: ${{ steps.calculate-target-tag.outputs.target_tag_plain }}
env:
FRAMEWORK: ${{ inputs.framework }}
steps:
- name: Checkout repository
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
with:
lfs: true
- name: Calculate extra tags with platform suffix # will get redundant upon multi arch builds support
id: extra-tags
shell: bash
env:
EXTRA_TAGS: ${{ inputs.extra_tags }}
PLATFORM: ${{ inputs.platform }}
run: |
if [ -n "$EXTRA_TAGS" ]; then
RESULT=""
while IFS= read -r tag; do
if [ -n "$tag" ]; then
RESULT+="${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo:${tag}-${PLATFORM}"$'\n'
fi
done <<< "$EXTRA_TAGS"
echo "tags<<EOF" >> $GITHUB_OUTPUT
echo "$RESULT" >> $GITHUB_OUTPUT
echo "EOF" >> $GITHUB_OUTPUT
else
echo "tags=" >> $GITHUB_OUTPUT
fi
- name: Docker Login
uses: ./.github/actions/docker-login
with:
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
- name: Calculate target tag
id: calculate-target-tag
shell: bash
run: |
CUDA_VERSION_RAW=${{ inputs.cuda_version }}
CUDA_VERSION=${CUDA_VERSION_RAW%%.*}
TARGET_TAG_PLAIN="${{ github.sha }}-${{ inputs.framework }}"
DEFAULT_TARGET_IMAGE_URI="${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo:${TARGET_TAG_PLAIN}-cuda${CUDA_VERSION}-${{ inputs.platform }}"
echo "default_target_image_uri=${DEFAULT_TARGET_IMAGE_URI}" >> $GITHUB_OUTPUT
echo "target_tag_plain=${TARGET_TAG_PLAIN}" >> $GITHUB_OUTPUT
echo "cuda_version_plain=${CUDA_VERSION}" >> $GITHUB_OUTPUT
- name: Initialize Dynamo Builder
uses: ./.github/actions/init-dynamo-builder
with:
builder_name: ${{ inputs.builder_name }}
flavor: ${{ inputs.framework }}
arch: ${{ inputs.platform }}
cuda_version: ${{ inputs.cuda_version }}
- name: Print Build Container inputs
run: |
echo "=== Build Container Inputs ==="
echo "image_tag: ${{ steps.calculate-target-tag.outputs.default_target_image_uri }}"
echo "framework: ${{ inputs.framework }}"
echo "target: runtime"
echo "platform: linux/${{ inputs.platform }}"
echo "cuda_version: ${{ inputs.cuda_version }}"
echo "no_cache: ${{ inputs.no_cache }}"
echo "extra_tags: ${{ steps.extra-tags.outputs.tags }}"
echo "push_image: ${{ inputs.push_image }}"
echo "no_load: ${{ inputs.no_load }}"
- name: Build Container
id: build-image
uses: ./.github/actions/docker-remote-build
with:
image_tag: ${{ steps.calculate-target-tag.outputs.default_target_image_uri }}
framework: ${{ inputs.framework }}
target: runtime
platform: linux/${{ inputs.platform }}
cuda_version: ${{ inputs.cuda_version }}
ci_token: ${{ secrets.CI_TOKEN }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
no_cache: ${{ inputs.no_cache }}
extra_tags: ${{ steps.extra-tags.outputs.tags }}
push_image: ${{ inputs.push_image }}
no_load: ${{ inputs.no_load }}
- name: Show summary
shell: bash
if: ${{ inputs.push_image && inputs.show_summary }}
run: |
echo "### 🐳 ${{ inputs.framework }}-cuda${{ inputs.cuda_version }}-${{ inputs.platform }} Default Image" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "| Image URI |" >> $GITHUB_STEP_SUMMARY
echo "|-----|" >> $GITHUB_STEP_SUMMARY
echo "| \`${{ steps.calculate-target-tag.outputs.default_target_image_uri }}\` |" >> $GITHUB_STEP_SUMMARY
# ============================================================================
# TEST
# ============================================================================
test:
if: inputs.run_tests && inputs.build_image
needs: [build]
name: Test ${{ inputs.framework }}-cuda${{ inputs.cuda_version }}-${{ inputs.platform }}
runs-on: ${{ inputs.platform == 'amd64' && 'prod-tester-amd-gpu-v1' || 'prod-tester-arm-v1' }}
env:
FRAMEWORK: ${{ inputs.framework }}
steps:
- name: Checkout repository
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
- name: Calculate target tag
id: calculate-target-tag
shell: bash
run: |
CUDA_VERSION_RAW=${{ inputs.cuda_version }}
CUDA_VERSION=${CUDA_VERSION_RAW%%.*}
echo "cuda_version_plain=${CUDA_VERSION}" >> $GITHUB_OUTPUT
TEST_IMAGE=${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo:${{ needs.build.outputs.target_tag_plain }}-cuda${CUDA_VERSION}-${{ inputs.platform }}
echo "test_image=${TEST_IMAGE}" >> $GITHUB_OUTPUT
- name: Docker Login
uses: ./.github/actions/docker-login
with:
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
- name: Pull relevant images
shell: bash
run: |
start_time=$(date +%s)
docker pull ${{ steps.calculate-target-tag.outputs.test_image }}
docker pull quay.io/minio/minio
end_time=$(date +%s)
duration=$((end_time - start_time))
echo "⏱️ Image pull duration: ${duration}s"
# Runs deploy/sanity_check.py inside the freshly pulled test image and fails
# the step with the script's exit code when the check does not pass.
- name: Run Sanity Check on Runtime Image
  shell: bash
  run: |
    # The script is located in /workspace/deploy/sanity_check.py in runtime containers
    export WORKSPACE=/workspace
    echo "Running sanity check on image: ${{ steps.calculate-target-tag.outputs.test_image }}"
    # Capture the exit code instead of letting the shell abort on failure, so
    # the explanatory message below is printed first.
    SANITY_CHECK_EXIT_CODE=0
    docker run --rm "${{ steps.calculate-target-tag.outputs.test_image }}" python ${WORKSPACE}/deploy/sanity_check.py --runtime-check --no-gpu-check || SANITY_CHECK_EXIT_CODE=$?
    if [ ${SANITY_CHECK_EXIT_CODE} -eq 0 ]; then
      echo "✅ Sanity check passed"
    else
      echo "ERROR: Sanity check failed - ai-dynamo packages not properly installed"
      exit ${SANITY_CHECK_EXIT_CODE}
    fi
# Run CPU-only tests first (parallelized for speed)
# These are unit tests marked with gpu_0 that don't require GPU hardware
- name: Run CPU-only tests (parallelized)
uses: ./.github/actions/pytest
with:
image_tag: ${{ steps.calculate-target-tag.outputs.test_image }}
pytest_marks: ${{ format('pre_merge and {0} and gpu_0', inputs.framework) }}
framework: ${{ inputs.framework }}
test_type: "pre_merge_cpu"
platform_arch: ${{ inputs.platform }}
enable_mypy: 'true'
hf_token: ${{ secrets.HF_TOKEN }}
parallel_mode: 'auto'
dind_as_sidecar: 'true'
# Run GPU tests sequentially (only on amd64 runners with GPU)
# These are e2e tests marked with gpu_1 that require GPU hardware
- name: Run GPU tests (sequential)
if: ${{ inputs.platform == 'amd64' }} # We only run GPU tests on amd64
uses: ./.github/actions/pytest
with:
image_tag: ${{ steps.calculate-target-tag.outputs.test_image }}
pytest_marks: ${{ format('pre_merge and {0} and gpu_1', inputs.framework) }}
framework: ${{ inputs.framework }}
test_type: "pre_merge_gpu"
platform_arch: ${{ inputs.platform }}
enable_mypy: 'false' # already covered by CPU tests
hf_token: ${{ secrets.HF_TOKEN }}
parallel_mode: 'none'
dind_as_sidecar: 'true'
# ============================================================================
# COPY TO ACR
# ============================================================================
# Copies the built image from the source ECR registry to the target ACR registry
# under the same repository name and tag.
copy-to-acr:
  needs: [build, test]
  # Run if copy_to_acr is true AND build succeeded AND (test succeeded OR test was skipped).
  # always() is required so this condition is still evaluated when `test` was skipped;
  # otherwise a skipped dependency would skip this job as well.
  if: |
    always() &&
    inputs.copy_to_acr &&
    needs.build.result == 'success' &&
    (needs.test.result == 'success' || needs.test.result == 'skipped')
  name: copy ${{ inputs.framework }}-cuda${{ inputs.cuda_version }}-${{ inputs.platform }}
  runs-on: prod-default-small-v2
  outputs:
    target_tag_plain: ${{ needs.build.outputs.target_tag_plain }}
  steps:
    - name: Checkout repository
      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
    # Derives the CUDA major version used in the image tag.
    # Output: cuda_version_plain (e.g. "12" for "12.9").
    - name: Calculate target tag
      id: calculate-target-tag
      shell: bash
      # The input is passed through the environment instead of being spliced into
      # the script text, so it can never be parsed as shell syntax.
      env:
        CUDA_VERSION_RAW: ${{ inputs.cuda_version }}
      run: |
        # Keep only the part before the first dot (major version).
        CUDA_VERSION="${CUDA_VERSION_RAW%%.*}"
        echo "cuda_version_plain=${CUDA_VERSION}" >> "$GITHUB_OUTPUT"
    - name: Copy image to target registry
      uses: ./.github/actions/skopeo-copy
      with:
        source_registry: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
        source_image: ai-dynamo/dynamo
        source_tag: ${{ needs.build.outputs.target_tag_plain }}-cuda${{ steps.calculate-target-tag.outputs.cuda_version_plain }}-${{ inputs.platform }}
        target_registry: ${{ secrets.AZURE_ACR_HOSTNAME }}
        target_image: ai-dynamo/dynamo
        target_tag: ${{ needs.build.outputs.target_tag_plain }}-cuda${{ steps.calculate-target-tag.outputs.cuda_version_plain }}-${{ inputs.platform }}
        source_aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        source_aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
        target_azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
        target_azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
        target_azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# Reusable CI Test Suite Workflow # Reusable CI Test Suite Workflow
......
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
name: Post-Merge CI Pipeline name: Post-Merge CI Pipeline
......
This diff is collapsed.
...@@ -22,6 +22,7 @@ fi ...@@ -22,6 +22,7 @@ fi
set -e set -e
TAG= TAG=
PRIMARY_TAG=
RUN_PREFIX= RUN_PREFIX=
PLATFORM=linux/amd64 PLATFORM=linux/amd64
...@@ -301,7 +302,12 @@ get_options() { ...@@ -301,7 +302,12 @@ get_options() {
;; ;;
--tag) --tag)
if [ "$2" ]; then if [ "$2" ]; then
if [ -z "$TAG" ]; then
TAG="--tag $2" TAG="--tag $2"
PRIMARY_TAG="$2"
else
TAG+=" --tag $2"
fi
shift shift
else else
missing_requirement "$1" missing_requirement "$1"
...@@ -475,8 +481,10 @@ get_options() { ...@@ -475,8 +481,10 @@ get_options() {
if [ -z "$TAG" ]; then if [ -z "$TAG" ]; then
TAG="--tag dynamo:${VERSION}-${FRAMEWORK,,}" TAG="--tag dynamo:${VERSION}-${FRAMEWORK,,}"
PRIMARY_TAG="dynamo:${VERSION}-${FRAMEWORK,,}"
if [ -n "${TARGET}" ] && [ "${TARGET}" != "local-dev" ]; then if [ -n "${TARGET}" ] && [ "${TARGET}" != "local-dev" ]; then
TAG="${TAG}-${TARGET}" TAG="${TAG}-${TARGET}"
PRIMARY_TAG="${PRIMARY_TAG}-${TARGET}"
fi fi
fi fi
...@@ -540,7 +548,7 @@ show_help() { ...@@ -540,7 +548,7 @@ show_help() {
echo " [--build-arg additional build args to pass to docker build]" echo " [--build-arg additional build args to pass to docker build]"
echo " [--cache-from cache location to start from]" echo " [--cache-from cache location to start from]"
echo " [--cache-to location where to cache the build output]" echo " [--cache-to location where to cache the build output]"
echo " [--tag tag for image]" echo " [--tag tag for image (can be specified multiple times)]"
echo " [--uid user ID for local-dev images (only with --target local-dev)]" echo " [--uid user ID for local-dev images (only with --target local-dev)]"
echo " [--gid group ID for local-dev images (only with --target local-dev)]" echo " [--gid group ID for local-dev images (only with --target local-dev)]"
echo " [--no-cache disable docker build cache]" echo " [--no-cache disable docker build cache]"
...@@ -1010,7 +1018,7 @@ if [[ -z "${TARGET:-}" || "${TARGET:-}" == "dev" || "${TARGET:-}" == "local-dev" ...@@ -1010,7 +1018,7 @@ if [[ -z "${TARGET:-}" || "${TARGET:-}" == "dev" || "${TARGET:-}" == "local-dev"
BUILD_ARGS+=" --build-arg FRAMEWORK=${FRAMEWORK,,} " BUILD_ARGS+=" --build-arg FRAMEWORK=${FRAMEWORK,,} "
# Preserve historical tagging behavior for dev/local-dev (build.sh used to delegate out). # Preserve historical tagging behavior for dev/local-dev (build.sh used to delegate out).
base="${TAG#--tag }" base="${PRIMARY_TAG}"
base="${base%-runtime}" base="${base%-runtime}"
base="${base%-local-dev}" base="${base%-local-dev}"
base="${base%-dev}" base="${base%-dev}"
...@@ -1096,7 +1104,7 @@ fi ...@@ -1096,7 +1104,7 @@ fi
# Use BuildKit for enhanced metadata # Use BuildKit for enhanced metadata
if docker buildx version &>/dev/null; then if docker buildx version &>/dev/null; then
$RUN_PREFIX docker buildx build --progress=plain${LOAD_FLAG}${PUSH} -f $DOCKERFILE $TARGET_STR $PLATFORM $BUILD_ARGS $CACHE_FROM $CACHE_TO $TAG $LATEST_TAG $BUILD_CONTEXT_ARG $BUILD_CONTEXT $NO_CACHE 2>&1 | tee "${SINGLE_BUILD_LOG}" $RUN_PREFIX docker buildx build --progress=plain ${LOAD_FLAG} ${PUSH} -f $DOCKERFILE $TARGET_STR $PLATFORM $BUILD_ARGS $CACHE_FROM $CACHE_TO $TAG $LATEST_TAG $BUILD_CONTEXT_ARG $BUILD_CONTEXT $NO_CACHE 2>&1 | tee "${SINGLE_BUILD_LOG}"
BUILD_EXIT_CODE=${PIPESTATUS[0]} BUILD_EXIT_CODE=${PIPESTATUS[0]}
else else
$RUN_PREFIX DOCKER_BUILDKIT=1 docker build --progress=plain -f $DOCKERFILE $TARGET_STR $PLATFORM $BUILD_ARGS $CACHE_FROM $CACHE_TO $TAG $LATEST_TAG $BUILD_CONTEXT_ARG $BUILD_CONTEXT $NO_CACHE 2>&1 | tee "${SINGLE_BUILD_LOG}" $RUN_PREFIX DOCKER_BUILDKIT=1 docker build --progress=plain -f $DOCKERFILE $TARGET_STR $PLATFORM $BUILD_ARGS $CACHE_FROM $CACHE_TO $TAG $LATEST_TAG $BUILD_CONTEXT_ARG $BUILD_CONTEXT $NO_CACHE 2>&1 | tee "${SINGLE_BUILD_LOG}"
...@@ -1110,8 +1118,8 @@ fi ...@@ -1110,8 +1118,8 @@ fi
# Handle --make-efa flag: add AWS EFA layer on top of the built image # Handle --make-efa flag: add AWS EFA layer on top of the built image
# This runs BEFORE local-dev so the flow is: dev -> dev-aws -> local-dev-aws # This runs BEFORE local-dev so the flow is: dev -> dev-aws -> local-dev-aws
if [[ "${MAKE_EFA:-}" == "true" ]]; then if [[ "${MAKE_EFA:-}" == "true" ]]; then
# Get the base image that was just built (dev or runtime) # Get the base image that was just built (use PRIMARY_TAG to avoid parsing issues)
BASE_IMAGE_FOR_EFA=$(echo "$TAG" | sed 's/--tag //') BASE_IMAGE_FOR_EFA="${PRIMARY_TAG}"
# Determine the EFA stage based on the target # Determine the EFA stage based on the target
# runtime target -> runtime-aws stage # runtime target -> runtime-aws stage
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment