"lib/bindings/python/vscode:/vscode.git/clone" did not exist on "15539fd0931c9f122b2967ae789ab421af133a70"
Unverified Commit 9321ae9c authored by Ran Rubin, committed by GitHub
Browse files

ci(build): fix BuildKit connection timeout and extract test image build action (#7398)


Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
parent 65cd7f3b
name: 'Refresh BuildKit Builder'
description: 'Ensure a BuildKit builder is healthy; re-initialize it if the connection has been lost'

# This action guards against the remote BuildKit connection going stale between
# build steps in the same job (e.g. after a long primary build the TCP connection
# to the remote daemon can time out before the test-image build starts).
#
# How it works:
#   1. Runs `docker buildx inspect --bootstrap` to check whether the builder is
#      still reachable. If it succeeds the action exits immediately — no extra
#      work is done.
#   2. If the inspect/bootstrap fails the builder is considered stale and
#      init-dynamo-builder is called with the same flavor/arch/cuda_version that
#      were used when the builder was first created, effectively re-routing
#      workers and re-registering the builder from scratch.
#
# The builder name reaches the shell steps through an environment variable
# rather than being expanded into the script text, so a name containing quotes
# or shell metacharacters cannot alter the commands that run.

inputs:
  builder_name:
    description: 'Name of the buildx builder to check'
    required: true
  flavor:
    description: 'Buildkit flavor used to route workers on re-init (vllm, sglang, trtllm, general)'
    required: true
  arch:
    description: 'Target architecture used on re-init (amd64, arm64)'
    required: false
    default: 'amd64'
  cuda_version:
    description: 'CUDA version used on re-init (e.g. 12.9, 13.0). Leave empty for general flavor.'
    required: false
    default: ''
  all_arch:
    description: 'If true, re-initialize builder for both amd64 and arm64 architectures'
    required: false
    default: 'false'
  # Kubernetes fallback passthrough inputs (forwarded to init-dynamo-builder)
  ephemeral_storage:
    description: 'Ephemeral storage request for Kubernetes fallback driver'
    required: false
    default: '400Gi'
  namespace:
    description: 'Kubernetes namespace for buildkit fallback pods'
    required: false
    default: 'buildkit'
  replicas:
    description: 'Number of buildkit fallback replicas'
    required: false
    default: '1'
  requests_cpu:
    description: 'CPU requests for buildkit fallback pods'
    required: false
    default: '12'
  requests_memory:
    description: 'Memory requests for buildkit fallback pods'
    required: false
    default: '26Gi'
  limits_memory:
    description: 'Memory limits for buildkit fallback pods'
    required: false
    default: '29Gi'
  tolerations:
    description: 'Tolerations for buildkit fallback pods'
    required: false
    default: 'key=buildkit-fallback-worker,value=true,operator=Equal,effect=NoSchedule'

runs:
  using: 'composite'
  steps:
    # A failing inspect must not fail the job: continue-on-error lets the
    # follow-up steps read `outcome == 'failure'` and repair the builder.
    - name: Check builder health
      id: check-health
      continue-on-error: true
      shell: bash
      env:
        BUILDER_NAME: ${{ inputs.builder_name }}
      run: |
        echo "Checking BuildKit builder '${BUILDER_NAME}'..."
        docker buildx inspect "${BUILDER_NAME}" --bootstrap
        echo "Builder is healthy."

    # Removal is best-effort (`|| true`): the builder may already be gone.
    - name: Remove stale builder (if unhealthy)
      if: steps.check-health.outcome == 'failure'
      shell: bash
      env:
        BUILDER_NAME: ${{ inputs.builder_name }}
      run: |
        echo "::warning::Builder '${BUILDER_NAME}' is unhealthy. Removing and re-initializing..."
        docker buildx rm "${BUILDER_NAME}" || true

    # Recreate the builder with exactly the inputs this action received.
    - name: Re-initialize builder (if unhealthy)
      if: steps.check-health.outcome == 'failure'
      uses: ./.github/actions/init-dynamo-builder
      with:
        builder_name: ${{ inputs.builder_name }}
        flavor: ${{ inputs.flavor }}
        arch: ${{ inputs.arch }}
        all_arch: ${{ inputs.all_arch }}
        cuda_version: ${{ inputs.cuda_version }}
        ephemeral_storage: ${{ inputs.ephemeral_storage }}
        namespace: ${{ inputs.namespace }}
        replicas: ${{ inputs.replicas }}
        requests_cpu: ${{ inputs.requests_cpu }}
        requests_memory: ${{ inputs.requests_memory }}
        limits_memory: ${{ inputs.limits_memory }}
        tolerations: ${{ inputs.tolerations }}
name: 'Docker Test Image Build'
description: 'Build and push a Dynamo test image from an existing runtime image base'

# Builds container/Dockerfile.test on top of a pre-built runtime image.
# Unlike docker-remote-build this action has no sccache wiring, no build metrics,
# and always uses the "test-<framework>-..." cache namespace.
#
# Inputs are handed to the script through INPUT_* environment variables instead
# of being expanded into the script text, and the docker command line is built
# as a bash array, so values containing spaces or shell metacharacters are
# passed to docker verbatim rather than being re-parsed by the shell.

inputs:
  image_tag:
    description: 'Full image URI to tag the test image with'
    required: true
  base_image:
    description: 'Full image URI of the runtime image to use as BASE_IMAGE'
    required: true
  framework:
    description: 'Framework name (vllm, sglang, trtllm) used to derive the cache tag'
    required: true
  platform:
    description: 'Target platform architecture (amd64 or arm64)'
    required: true
  cuda_version:
    description: 'CUDA major.minor version (e.g. 12.9); only the major version is used in the cache tag'
    required: true
  aws_account_id:
    description: 'AWS account ID for ECR hostname'
    required: true
  aws_default_region:
    description: 'AWS region for ECR hostname'
    required: true
  push_image:
    description: 'Push the image to the registry'
    required: false
    default: 'false'
  no_load:
    description: 'Do not load the image into the local Docker daemon'
    required: false
    default: 'true'
  no_cache:
    description: 'Disable Docker layer cache'
    required: false
    default: 'false'

runs:
  using: 'composite'
  steps:
    - name: Build and push test image
      shell: bash
      env:
        ECR_HOSTNAME: ${{ inputs.aws_account_id }}.dkr.ecr.${{ inputs.aws_default_region }}.amazonaws.com
        INPUT_IMAGE_TAG: ${{ inputs.image_tag }}
        INPUT_BASE_IMAGE: ${{ inputs.base_image }}
        INPUT_FRAMEWORK: ${{ inputs.framework }}
        INPUT_PLATFORM: ${{ inputs.platform }}
        INPUT_CUDA_VERSION: ${{ inputs.cuda_version }}
        INPUT_PUSH_IMAGE: ${{ inputs.push_image }}
        INPUT_NO_LOAD: ${{ inputs.no_load }}
        INPUT_NO_CACHE: ${{ inputs.no_cache }}
      run: |
        # Keep only the major component of the CUDA version: "12.9" -> "12".
        cuda_major="${INPUT_CUDA_VERSION%%.*}"
        cache_tag="test-${INPUT_FRAMEWORK}-cuda${cuda_major}-${INPUT_PLATFORM}-cache"
        cache_repo="${ECR_HOSTNAME}/ai-dynamo/dynamo"

        build_args=(--progress=plain)

        # --push takes precedence; --load is only added when loading was not disabled.
        if [[ "${INPUT_PUSH_IMAGE}" == "true" ]]; then
          build_args+=(--push)
        elif [[ "${INPUT_NO_LOAD}" == "false" ]]; then
          build_args+=(--load)
        fi

        if [[ "${INPUT_NO_CACHE}" == "true" ]]; then
          build_args+=(--no-cache)
        fi

        build_args+=(
          --platform "linux/${INPUT_PLATFORM}"
          -f container/Dockerfile.test
          --build-arg "BASE_IMAGE=${INPUT_BASE_IMAGE}"
          # Every build may read from both the main and the release cache.
          --cache-from "type=registry,ref=${cache_repo}:main-${cache_tag}"
          --cache-from "type=registry,ref=${cache_repo}:release-${cache_tag}"
        )

        # Only main and release* refs write the cache back; other refs are read-only.
        if [[ "${GITHUB_REF_NAME}" == "main" ]]; then
          build_args+=(--cache-to "type=registry,ref=${cache_repo}:main-${cache_tag},mode=max")
        elif [[ "${GITHUB_REF_NAME}" =~ ^release ]]; then
          build_args+=(--cache-to "type=registry,ref=${cache_repo}:release-${cache_tag},mode=max")
        fi

        build_args+=(-t "${INPUT_IMAGE_TAG}")

        docker buildx build "${build_args[@]}" .
......@@ -284,43 +284,29 @@ jobs:
no_load: ${{ inputs.no_load }}
extra_build_args: |
DYNAMO_COMMIT_SHA=${{ github.sha }}
# Runs the builder-refresher action ahead of the test-image build; skipped for
# the 'dev' target, which has no test-image build to protect.
# Only builder_name, flavor, arch and cuda_version are forwarded here.
# NOTE(review): presumably this resolves to the 'Refresh BuildKit Builder'
# action, in which case all_arch and the Kubernetes fallback inputs fall back to
# that action's defaults — confirm against the action's path.
- name: Refresh BuildKit builder
if: ${{ inputs.target != 'dev' }}
uses: ./.github/actions/builder-refresher
with:
builder_name: ${{ inputs.builder_name }}
flavor: ${{ inputs.framework }}
arch: ${{ inputs.platform }}
cuda_version: ${{ inputs.cuda_version }}
# Builds the test image on top of the runtime image whose URI comes from the
# calculate-target-tag step outputs; skipped for the 'dev' target.
# NOTE(review): this span reads like a rendered diff with both sides kept — the
# inline shell/env/run body and the timeout-minutes/uses/with keys appear in the
# same step. A step can declare `run` or `uses`, not both; confirm which side is
# current (presumably the `uses: ./.github/actions/docker-test-image-build` form).
- name: Build and Push Test Image
if: ${{ inputs.target != 'dev' }} # no need to build a separate test image for dev as its not tested
shell: bash
env:
ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
run: |
CUDA_MAJOR=${{ steps.calculate-target-tag.outputs.cuda_version_plain }}
CACHE_TAG="test-${{ inputs.framework }}-cuda${CUDA_MAJOR}-${{ inputs.platform }}-cache"
CACHE_ARGS="--cache-from type=registry,ref=${ECR_HOSTNAME}/ai-dynamo/dynamo:main-${CACHE_TAG}"
CACHE_ARGS+=" --cache-from type=registry,ref=${ECR_HOSTNAME}/ai-dynamo/dynamo:release-${CACHE_TAG}"
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
CACHE_ARGS+=" --cache-to type=registry,ref=${ECR_HOSTNAME}/ai-dynamo/dynamo:main-${CACHE_TAG},mode=max"
elif [[ "$GITHUB_REF_NAME" =~ ^release ]]; then
CACHE_ARGS+=" --cache-to type=registry,ref=${ECR_HOSTNAME}/ai-dynamo/dynamo:release-${CACHE_TAG},mode=max"
fi
PUSH_ARGS=""
if [ "${{ inputs.push_image }}" == "true" ]; then
PUSH_ARGS="--push"
elif [ "${{ inputs.no_load }}" == "false" ]; then
PUSH_ARGS="--load"
fi
NO_CACHE_ARG=""
if [ "${{ inputs.no_cache }}" == "true" ]; then
NO_CACHE_ARG="--no-cache"
fi
docker buildx build \
--progress=plain \
${PUSH_ARGS} \
${NO_CACHE_ARG} \
--platform linux/${{ inputs.platform }} \
-f container/Dockerfile.test \
--build-arg BASE_IMAGE=${{ steps.calculate-target-tag.outputs.default_target_image_uri }} \
${CACHE_ARGS} \
-t ${{ steps.calculate-target-tag.outputs.test_image_uri }} .
timeout-minutes: ${{ inputs.build_timeout_minutes }}
uses: ./.github/actions/docker-test-image-build
with:
image_tag: ${{ steps.calculate-target-tag.outputs.test_image_uri }}
base_image: ${{ steps.calculate-target-tag.outputs.default_target_image_uri }}
framework: ${{ inputs.framework }}
platform: ${{ inputs.platform }}
cuda_version: ${{ inputs.cuda_version }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
push_image: ${{ inputs.push_image }}
no_load: ${{ inputs.no_load }}
no_cache: ${{ inputs.no_cache }}
- name: Show summary
shell: bash
if: ${{ inputs.push_image && inputs.show_summary }}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment