Unverified commit bcbb4d4c, authored by Dmitry Tokarev, committed by GitHub
Browse files

fix: Added retries for docker pull. Removed unused docker-tag-push GH action (#6804)


Signed-off-by: Dmitry Tokarev <dtokarev@nvidia.com>
parent 0404a9be
# Composite action: re-tags a locally available Docker image and pushes it to
# AWS ECR and/or Azure ACR. Pushes go through `retry_push`, which is loaded
# from retry_push.sh located next to this action's metadata file.
name: 'Docker Tag and Push'
description: 'Tag and Push Docker Images'
inputs:
  local_image:
    description: 'Local Image Name:Tag'
    required: true
  push_tags:
    description: 'Target Name:Tag (newline-separated list for multiple tags)'
    required: true
  # There isn't a clean way to have an additional tag that is conditional.
  # Adding this to handle this use-case (we want multiple tags for main builds).
  conditional_tag:
    description: 'Optional tag for conditionals'
    required: false
  aws_push:
    description: 'Push to AWS Boolean'
    required: false
    default: 'false'
  azure_push:
    description: 'Push to Azure Container Registry (ACR) Boolean'
    required: false
    default: 'false'
  aws_account_id:
    description: 'AWS Account ID'
    required: false
  aws_default_region:
    description: 'AWS Default Region'
    required: false
  azure_acr_hostname:
    description: 'Azure ACR hostname'
    required: false
outputs:
  image_tags:
    description: 'Image Tags'
    value: ${{ inputs.push_tags }}
runs:
  using: "composite"
  steps:
    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v3
    - name: ECR Tag and Push
      shell: bash
      if: ${{ inputs.aws_push == 'true' }}
      env:
        LOCAL_IMAGE: ${{ inputs.local_image }}
        PUSH_TAGS: ${{ inputs.push_tags }}
        CONDITIONAL_TAG: ${{ inputs.conditional_tag }}
        ECR_ACCOUNT_ID: ${{ inputs.aws_account_id }}
        ECR_REGION: ${{ inputs.aws_default_region }}
      run: |
        set -euo pipefail
        # Fail early with a readable message instead of letting `docker tag`
        # reject a malformed registry hostname.
        : "${ECR_ACCOUNT_ID:?aws_account_id input is required when aws_push is 'true'}"
        : "${ECR_REGION:?aws_default_region input is required when aws_push is 'true'}"
        ECR_HOSTNAME="${ECR_ACCOUNT_ID}.dkr.ecr.${ECR_REGION}.amazonaws.com"
        # Use the runner-provided variable rather than interpolating an
        # expression into the script text.
        source "${GITHUB_ACTION_PATH}/retry_push.sh"
        # The conditional tag is optional; push it only when one was supplied.
        if [[ -n "${CONDITIONAL_TAG}" ]]; then
          echo "Tagging and pushing: ${ECR_HOSTNAME}/${CONDITIONAL_TAG}"
          docker tag "${LOCAL_IMAGE}" "${ECR_HOSTNAME}/${CONDITIONAL_TAG}"
          retry_push "${ECR_HOSTNAME}/${CONDITIONAL_TAG}"
        fi
        # push_tags is newline-separated; blank lines are ignored.
        while IFS= read -r TAG; do
          if [ -z "$TAG" ]; then
            continue
          fi
          echo "Tagging and pushing: ${ECR_HOSTNAME}/${TAG}"
          docker tag "${LOCAL_IMAGE}" "${ECR_HOSTNAME}/${TAG}"
          retry_push "${ECR_HOSTNAME}/${TAG}"
        done <<< "$PUSH_TAGS"
    # NOTE(review): unlike the ECR step, this step does not push
    # conditional_tag - confirm that this is intentional.
    - name: ACR Tag and Push
      shell: bash
      if: ${{ inputs.azure_push == 'true' }}
      env:
        LOCAL_IMAGE: ${{ inputs.local_image }}
        PUSH_TAGS: ${{ inputs.push_tags }}
        AZURE_ACR_HOSTNAME: ${{ inputs.azure_acr_hostname }}
      run: |
        set -euo pipefail
        # An empty hostname would yield an image reference starting with "/".
        : "${AZURE_ACR_HOSTNAME:?azure_acr_hostname input is required when azure_push is 'true'}"
        source "${GITHUB_ACTION_PATH}/retry_push.sh"
        # push_tags is newline-separated; blank lines are ignored.
        while IFS= read -r TAG; do
          if [ -z "$TAG" ]; then
            continue
          fi
          echo "Tagging and pushing: ${AZURE_ACR_HOSTNAME}/${TAG}"
          docker tag "${LOCAL_IMAGE}" "${AZURE_ACR_HOSTNAME}/${TAG}"
          retry_push "${AZURE_ACR_HOSTNAME}/${TAG}"
        done <<< "$PUSH_TAGS"
# Retry docker push with exponential backoff. # Retry docker operations with exponential backoff.
# Safe under `set -e`: the `if` conditional context prevents a failed # Safe under `set -e`: the `if` conditional context prevents a failed
# `docker push` from triggering an immediate exit. # `docker <operation>` from triggering an immediate exit.
retry_push() { retry_docker_operation() {
local image="$1" local operation="$1"
local image="$2"
local max_attempts=3 local max_attempts=3
local wait_seconds=10 local wait_seconds=10
local attempt=1 local attempt=1
if [[ "$operation" != "push" && "$operation" != "pull" ]]; then
echo "Unsupported docker operation: $operation (expected: push|pull)" >&2
return 2
fi
while true; do while true; do
if docker push "$image"; then if docker "$operation" "$image"; then
return 0 return 0
fi fi
echo "Push failed for $image (attempt ${attempt}/${max_attempts})." >&2 echo "Docker ${operation} failed for $image (attempt ${attempt}/${max_attempts})." >&2
if (( attempt >= max_attempts )); then if (( attempt >= max_attempts )); then
echo "Push failed after ${max_attempts} attempts: $image" >&2 echo "Docker ${operation} failed after ${max_attempts} attempts: $image" >&2
return 1 return 1
fi fi
echo "Retrying in ${wait_seconds}s..." echo "Retrying docker ${operation} in ${wait_seconds}s..."
sleep "$wait_seconds" sleep "$wait_seconds"
attempt=$((attempt + 1)) attempt=$((attempt + 1))
wait_seconds=$((wait_seconds * 2)) wait_seconds=$((wait_seconds * 2))
...@@ -27,3 +33,13 @@ retry_push() { ...@@ -27,3 +33,13 @@ retry_push() {
fi fi
done done
} }
# Push a Docker image by delegating to retry_docker_operation with the
# "push" operation.
# Arguments:
#   $1 - image reference to push
# Returns the exit status of retry_docker_operation.
retry_push() {
  local target_image="$1"
  retry_docker_operation push "${target_image}"
}
# Pull a Docker image by delegating to retry_docker_operation with the
# "pull" operation.
# Arguments:
#   $1 - image reference to pull
# Returns the exit status of retry_docker_operation.
retry_pull() {
  local source_image="$1"
  retry_docker_operation pull "${source_image}"
}
...@@ -304,9 +304,10 @@ jobs: ...@@ -304,9 +304,10 @@ jobs:
- name: Pull relevant images - name: Pull relevant images
shell: bash shell: bash
run: | run: |
source ./.github/scripts/retry_docker.sh
start_time=$(date +%s) start_time=$(date +%s)
docker pull ${{ steps.calculate-target-tag.outputs.test_image }} retry_pull ${{ steps.calculate-target-tag.outputs.test_image }}
docker pull quay.io/minio/minio retry_pull quay.io/minio/minio
end_time=$(date +%s) end_time=$(date +%s)
duration=$((end_time - start_time)) duration=$((end_time - start_time))
echo "⏱️ Image pull duration: ${duration}s" echo "⏱️ Image pull duration: ${duration}s"
...@@ -402,9 +403,10 @@ jobs: ...@@ -402,9 +403,10 @@ jobs:
- name: Pull relevant images - name: Pull relevant images
shell: bash shell: bash
run: | run: |
source ./.github/scripts/retry_docker.sh
start_time=$(date +%s) start_time=$(date +%s)
docker pull ${{ steps.calculate-target-tag.outputs.test_image }} retry_pull ${{ steps.calculate-target-tag.outputs.test_image }}
docker pull quay.io/minio/minio retry_pull quay.io/minio/minio
end_time=$(date +%s) end_time=$(date +%s)
duration=$((end_time - start_time)) duration=$((end_time - start_time))
echo "⏱️ Image pull duration: ${duration}s" echo "⏱️ Image pull duration: ${duration}s"
......
...@@ -138,7 +138,9 @@ jobs: ...@@ -138,7 +138,9 @@ jobs:
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }} aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }} aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
- name: Pull image - name: Pull image
run: docker pull ${{ env.IMAGE_TAG }} run: |
source ./.github/scripts/retry_docker.sh
retry_pull ${{ env.IMAGE_TAG }}
- name: Run Rust checks (block-manager + media-ffmpeg + integration tests) - name: Run Rust checks (block-manager + media-ffmpeg + integration tests)
run: | run: |
docker run --rm --runtime=nvidia --gpus all --user root -w /workspace/lib/llm \ docker run --rm --runtime=nvidia --gpus all --user root -w /workspace/lib/llm \
...@@ -175,7 +177,9 @@ jobs: ...@@ -175,7 +177,9 @@ jobs:
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }} aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }} aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
- name: Pull image - name: Pull image
run: docker pull ${{ env.IMAGE_TAG }} run: |
source ./.github/scripts/retry_docker.sh
retry_pull ${{ env.IMAGE_TAG }}
- name: Run pytest (parallel tests with xdist) - name: Run pytest (parallel tests with xdist)
uses: ./.github/actions/pytest uses: ./.github/actions/pytest
with: with:
...@@ -205,7 +209,9 @@ jobs: ...@@ -205,7 +209,9 @@ jobs:
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }} aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }} aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
- name: Pull image - name: Pull image
run: docker pull ${{ env.IMAGE_TAG }} run: |
source ./.github/scripts/retry_docker.sh
retry_pull ${{ env.IMAGE_TAG }}
- name: Run pytest (sequential tests) - name: Run pytest (sequential tests)
uses: ./.github/actions/pytest uses: ./.github/actions/pytest
with: with:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment