Unverified commit b6911f78, authored by Ran Rubin and committed by GitHub
Browse files

ci: adding timeouts (#6062)

parent bd344cf9
...@@ -105,7 +105,20 @@ runs: ...@@ -105,7 +105,20 @@ runs:
echo " Source: ${SOURCE_REF}" echo " Source: ${SOURCE_REF}"
echo " Target: ${TARGET_REF}" echo " Target: ${TARGET_REF}"
skopeo copy --all "${SOURCE_REF}" "${TARGET_REF}" MAX_RETRIES=3
RETRY_DELAY=10
for attempt in $(seq 1 $MAX_RETRIES); do
echo "Attempt ${attempt}/${MAX_RETRIES}..."
if skopeo copy --all "${SOURCE_REF}" "${TARGET_REF}"; then
echo "target_image_ref=${{ inputs.target_registry }}/${TARGET_IMAGE}:${TARGET_TAG}" >> $GITHUB_OUTPUT echo "target_image_ref=${{ inputs.target_registry }}/${TARGET_IMAGE}:${TARGET_TAG}" >> $GITHUB_OUTPUT
echo "✅ Image copied successfully" echo "✅ Image copied successfully"
exit 0
fi
if [ "$attempt" -lt "$MAX_RETRIES" ]; then
echo "⚠️ Attempt ${attempt} failed, retrying in ${RETRY_DELAY}s..."
sleep $RETRY_DELAY
RETRY_DELAY=$((RETRY_DELAY * 2))
fi
done
echo "❌ All ${MAX_RETRIES} attempts failed"
exit 1
...@@ -56,6 +56,26 @@ on: ...@@ -56,6 +56,26 @@ on:
required: false required: false
type: boolean type: boolean
default: true default: true
build_timeout_minutes:
description: 'Timeout in minutes for the build step'
required: false
type: number
default: 60
test_gpu_timeout_minutes:
description: 'Timeout in minutes for the GPU test step'
required: false
type: number
default: 30
test_cpu_timeout_minutes:
description: 'Timeout in minutes for the CPU test step'
required: false
type: number
default: 10
copy_timeout_minutes:
description: 'Timeout in minutes for the copy to ACR step'
required: false
type: number
default: 5
secrets: secrets:
AWS_DEFAULT_REGION: AWS_DEFAULT_REGION:
required: true required: true
...@@ -99,4 +119,8 @@ jobs: ...@@ -99,4 +119,8 @@ jobs:
run_tests: ${{ inputs.run_tests && !(inputs.framework == 'trtllm' && matrix.platform == 'arm64') }} # trtllm tests on arm64 are not supported run_tests: ${{ inputs.run_tests && !(inputs.framework == 'trtllm' && matrix.platform == 'arm64') }} # trtllm tests on arm64 are not supported
copy_to_acr: ${{ inputs.copy_to_acr && matrix.platform == 'amd64' }} # no reason to copy ARM images to ACR copy_to_acr: ${{ inputs.copy_to_acr && matrix.platform == 'amd64' }} # no reason to copy ARM images to ACR
push_image: ${{ inputs.push_image }} push_image: ${{ inputs.push_image }}
build_timeout_minutes: ${{ inputs.build_timeout_minutes }}
test_gpu_timeout_minutes: ${{ inputs.test_gpu_timeout_minutes }}
test_cpu_timeout_minutes: ${{ inputs.test_cpu_timeout_minutes }}
copy_timeout_minutes: ${{ inputs.copy_timeout_minutes }}
secrets: inherit secrets: inherit
...@@ -66,6 +66,26 @@ on: ...@@ -66,6 +66,26 @@ on:
required: false required: false
type: boolean type: boolean
default: false default: false
build_timeout_minutes:
description: 'Timeout in minutes for the build step'
required: false
type: number
default: 60
test_gpu_timeout_minutes:
description: 'Timeout in minutes for the GPU test step'
required: false
type: number
default: 30
test_cpu_timeout_minutes:
description: 'Timeout in minutes for the CPU test step'
required: false
type: number
default: 10
copy_timeout_minutes:
description: 'Timeout in minutes for the copy to ACR step'
required: false
type: number
default: 5
secrets: secrets:
AWS_DEFAULT_REGION: AWS_DEFAULT_REGION:
required: true required: true
...@@ -185,6 +205,7 @@ jobs: ...@@ -185,6 +205,7 @@ jobs:
--short-output --short-output
- name: Build Container - name: Build Container
id: build-image id: build-image
timeout-minutes: ${{ inputs.build_timeout_minutes }}
uses: ./.github/actions/docker-remote-build uses: ./.github/actions/docker-remote-build
with: with:
image_tag: ${{ steps.calculate-target-tag.outputs.default_target_image_uri }} image_tag: ${{ steps.calculate-target-tag.outputs.default_target_image_uri }}
...@@ -275,6 +296,7 @@ jobs: ...@@ -275,6 +296,7 @@ jobs:
# Run CPU-only tests first (parallelized for speed) # Run CPU-only tests first (parallelized for speed)
# These are unit tests marked with gpu_0 that don't require GPU hardware # These are unit tests marked with gpu_0 that don't require GPU hardware
- name: Run CPU-only tests (parallelized) - name: Run CPU-only tests (parallelized)
timeout-minutes: ${{ inputs.test_cpu_timeout_minutes }}
uses: ./.github/actions/pytest uses: ./.github/actions/pytest
with: with:
image_tag: ${{ steps.calculate-target-tag.outputs.test_image }} image_tag: ${{ steps.calculate-target-tag.outputs.test_image }}
...@@ -290,6 +312,7 @@ jobs: ...@@ -290,6 +312,7 @@ jobs:
# Run GPU tests sequentially (only on amd64 runners with GPU) # Run GPU tests sequentially (only on amd64 runners with GPU)
# These are e2e tests marked with gpu_1 that require GPU hardware # These are e2e tests marked with gpu_1 that require GPU hardware
- name: Run GPU tests (sequential) - name: Run GPU tests (sequential)
timeout-minutes: ${{ inputs.test_gpu_timeout_minutes }}
if: ${{ inputs.platform == 'amd64' }} # We only run GPU tests on amd64 if: ${{ inputs.platform == 'amd64' }} # We only run GPU tests on amd64
uses: ./.github/actions/pytest uses: ./.github/actions/pytest
with: with:
...@@ -333,6 +356,7 @@ jobs: ...@@ -333,6 +356,7 @@ jobs:
echo "test_image=${TEST_IMAGE}" >> $GITHUB_OUTPUT echo "test_image=${TEST_IMAGE}" >> $GITHUB_OUTPUT
- name: Copy image to target registry - name: Copy image to target registry
timeout-minutes: ${{ inputs.copy_timeout_minutes }}
uses: ./.github/actions/skopeo-copy uses: ./.github/actions/skopeo-copy
with: with:
source_registry: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com source_registry: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
......
...@@ -184,6 +184,7 @@ jobs: ...@@ -184,6 +184,7 @@ jobs:
${{ github.ref_name == 'main' && 'main-vllm' || '' }} ${{ github.ref_name == 'main' && 'main-vllm' || '' }}
${{ github.ref_name == 'main' && format('main-vllm-{0}', github.sha) || '' }} ${{ github.ref_name == 'main' && format('main-vllm-{0}', github.sha) || '' }}
builder_name: ${{ needs.changed-files.outputs.builder_name }} builder_name: ${{ needs.changed-files.outputs.builder_name }}
test_gpu_timeout_minutes: 35
secrets: inherit secrets: inherit
# ============================================================================ # ============================================================================
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment