Unverified commit fcbd79ab, authored by Ran Rubin and committed by GitHub
Browse files

feat(nightly-ci): replace no_cache with fresh_builder for K8s BuildKit isolation (#7678)


Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
parent c43197ee
...@@ -59,6 +59,10 @@ inputs: ...@@ -59,6 +59,10 @@ inputs:
description: 'Skip the bootstrap step (only create the builder)' description: 'Skip the bootstrap step (only create the builder)'
required: false required: false
default: 'false' default: 'false'
fresh_builder:
description: 'Force creation of a new K8s builder even if one already exists. Used by the create-fresh-builder preliminary job.'
required: false
default: 'false'
runs: runs:
using: "composite" using: "composite"
...@@ -83,11 +87,14 @@ runs: ...@@ -83,11 +87,14 @@ runs:
if: inputs.buildkit_worker_addresses == '' if: inputs.buildkit_worker_addresses == ''
shell: bash shell: bash
run: | run: |
if docker buildx inspect ${{ inputs.builder_name }} > /dev/null 2>&1; then if [[ "${{ inputs.fresh_builder }}" != "true" ]] && docker buildx inspect ${{ inputs.builder_name }} > /dev/null 2>&1; then
# If exit code is 0 (success), print the message
echo "✅ Builder '${{ inputs.builder_name }}' already exists. Skipping creation." echo "✅ Builder '${{ inputs.builder_name }}' already exists. Skipping creation."
else else
echo "K8s Builder '${{ inputs.builder_name }}' does not exist. Creating it." if [[ "${{ inputs.fresh_builder }}" == "true" ]] && docker buildx inspect ${{ inputs.builder_name }} > /dev/null 2>&1; then
echo "🔄 Forcing fresh K8s builder: removing existing '${{ inputs.builder_name }}'."
docker buildx rm ${{ inputs.builder_name }} || true
fi
echo "🔨 Creating K8s builder '${{ inputs.builder_name }}'."
docker buildx create --use --name ${{ inputs.builder_name }} --driver kubernetes --platform=linux/amd64 \ docker buildx create --use --name ${{ inputs.builder_name }} --driver kubernetes --platform=linux/amd64 \
'--driver-opt=requests.ephemeral-storage=${{ inputs.ephemeral_storage }}' \ '--driver-opt=requests.ephemeral-storage=${{ inputs.ephemeral_storage }}' \
'--driver-opt=namespace=${{ inputs.namespace }}' \ '--driver-opt=namespace=${{ inputs.namespace }}' \
...@@ -112,7 +119,7 @@ runs: ...@@ -112,7 +119,7 @@ runs:
fi fi
sleep 3 # Give the builders some time to be ready sleep 3 # Give the builders some time to be ready
if [[ "${{ inputs.skip_bootstrap }}" != "true" ]]; then if [[ "${{ inputs.skip_bootstrap }}" != "true" && "${{ inputs.fresh_builder }}" != "true" ]]; then
echo "::warning::Build is using fallback pod. Please alert the ops team." echo "::warning::Build is using fallback pod. Please alert the ops team."
echo "## ⚠️ Fallback Build Warning" >> $GITHUB_STEP_SUMMARY echo "## ⚠️ Fallback Build Warning" >> $GITHUB_STEP_SUMMARY
echo "This build is running on a **fallback pod**. Please alert the ops team." >> $GITHUB_STEP_SUMMARY echo "This build is running on a **fallback pod**. Please alert the ops team." >> $GITHUB_STEP_SUMMARY
......
...@@ -79,6 +79,10 @@ inputs: ...@@ -79,6 +79,10 @@ inputs:
description: 'Additional Docker build args (newline-separated KEY=VALUE pairs) forwarded to docker-remote-build' description: 'Additional Docker build args (newline-separated KEY=VALUE pairs) forwarded to docker-remote-build'
required: false required: false
default: '' default: ''
fresh_builder:
description: 'Always create a fresh K8s BuildKit builder (skip remote worker routing)'
required: false
default: 'false'
outputs: outputs:
target_tag_plain: target_tag_plain:
description: 'Target tag (without registry prefix)' description: 'Target tag (without registry prefix)'
...@@ -127,6 +131,7 @@ runs: ...@@ -127,6 +131,7 @@ runs:
flavor: ${{ inputs.framework }} flavor: ${{ inputs.framework }}
arch: ${{ inputs.platform }} arch: ${{ inputs.platform }}
cuda_version: ${{ inputs.cuda_version }} cuda_version: ${{ inputs.cuda_version }}
fresh_builder: ${{ inputs.fresh_builder }}
- name: Calculate extra tags - name: Calculate extra tags
id: extra-tags id: extra-tags
shell: bash shell: bash
......
...@@ -52,6 +52,10 @@ inputs: ...@@ -52,6 +52,10 @@ inputs:
description: 'CUDA version (12.9, 13.0). Optional for general flavor.' description: 'CUDA version (12.9, 13.0). Optional for general flavor.'
required: false required: false
default: '' default: ''
fresh_builder:
description: 'Skip remote worker routing and always use the K8s driver. Set by callers that want a fresh, isolated builder per run.'
required: false
default: 'false'
# Passthrough inputs for bootstrap-buildkit (kubernetes fallback) # Passthrough inputs for bootstrap-buildkit (kubernetes fallback)
ephemeral_storage: ephemeral_storage:
...@@ -88,6 +92,7 @@ runs: ...@@ -88,6 +92,7 @@ runs:
steps: steps:
- name: Route buildkit workers - name: Route buildkit workers
id: route-buildkit id: route-buildkit
if: inputs.fresh_builder != 'true'
continue-on-error: true continue-on-error: true
shell: bash shell: bash
run: | run: |
...@@ -108,6 +113,7 @@ runs: ...@@ -108,6 +113,7 @@ runs:
- name: Prepare worker addresses and platform - name: Prepare worker addresses and platform
id: prepare id: prepare
if: inputs.fresh_builder != 'true'
shell: bash shell: bash
env: env:
AMD64_ADDRS: ${{ steps.route-buildkit.outputs[format('{0}_amd64', inputs.flavor)] }} AMD64_ADDRS: ${{ steps.route-buildkit.outputs[format('{0}_amd64', inputs.flavor)] }}
...@@ -138,6 +144,7 @@ runs: ...@@ -138,6 +144,7 @@ runs:
with: with:
builder_name: ${{ inputs.builder_name }} builder_name: ${{ inputs.builder_name }}
buildkit_worker_addresses: ${{ steps.prepare.outputs.worker_addresses }} buildkit_worker_addresses: ${{ steps.prepare.outputs.worker_addresses }}
fresh_builder: ${{ inputs.fresh_builder }}
ephemeral_storage: ${{ inputs.ephemeral_storage }} ephemeral_storage: ${{ inputs.ephemeral_storage }}
namespace: ${{ inputs.namespace }} namespace: ${{ inputs.namespace }}
replicas: ${{ inputs.replicas }} replicas: ${{ inputs.replicas }}
......
...@@ -114,6 +114,11 @@ on: ...@@ -114,6 +114,11 @@ on:
required: false required: false
type: boolean type: boolean
default: false default: false
fresh_builder:
description: 'Always create a fresh K8s BuildKit builder (skip remote worker routing)'
required: false
type: boolean
default: false
secrets: secrets:
AWS_DEFAULT_REGION: AWS_DEFAULT_REGION:
required: true required: true
...@@ -151,6 +156,7 @@ jobs: ...@@ -151,6 +156,7 @@ jobs:
cuda_version: ${{ matrix.cuda_version }} cuda_version: ${{ matrix.cuda_version }}
extra_tags: ${{ inputs.extra_tags }} extra_tags: ${{ inputs.extra_tags }}
no_cache: ${{ inputs.no_cache }} no_cache: ${{ inputs.no_cache }}
fresh_builder: ${{ inputs.fresh_builder }}
builder_name: ${{ inputs.builder_name }} builder_name: ${{ inputs.builder_name }}
build_image: ${{ inputs.build_image }} build_image: ${{ inputs.build_image }}
build_timeout_minutes: ${{ inputs.build_timeout_minutes }} build_timeout_minutes: ${{ inputs.build_timeout_minutes }}
......
...@@ -118,6 +118,11 @@ on: ...@@ -118,6 +118,11 @@ on:
required: false required: false
type: boolean type: boolean
default: false default: false
fresh_builder:
description: 'Always create a fresh K8s BuildKit builder (skip remote worker routing)'
required: false
type: boolean
default: false
secrets: secrets:
AWS_DEFAULT_REGION: AWS_DEFAULT_REGION:
required: true required: true
...@@ -207,6 +212,7 @@ jobs: ...@@ -207,6 +212,7 @@ jobs:
no_load: ${{ inputs.no_load }} no_load: ${{ inputs.no_load }}
no_cache: ${{ inputs.no_cache }} no_cache: ${{ inputs.no_cache }}
make_efa: ${{ inputs.make_efa }} make_efa: ${{ inputs.make_efa }}
fresh_builder: ${{ inputs.fresh_builder }}
extra_tags: ${{ inputs.extra_tags }} extra_tags: ${{ inputs.extra_tags }}
show_summary: ${{ inputs.push_image && inputs.show_summary }} show_summary: ${{ inputs.push_image && inputs.show_summary }}
......
...@@ -11,7 +11,37 @@ on: ...@@ -11,7 +11,37 @@ on:
permissions: permissions:
contents: read contents: read
env:
BUILDER_NAME: b-${{ github.run_id }}-${{ github.run_attempt }}
jobs: jobs:
# ============================================================================
# PRE-WARM K8S BUILDER
# ============================================================================
# Preliminary job: creates one K8s BuildKit builder for this workflow run and
# publishes its name as a job output. The framework pipelines list this job in
# `needs` and receive the name via
# `needs.create-fresh-builder.outputs.builder_name`.
create-fresh-builder:
  name: Create fresh K8s builder
  runs-on: prod-default-small-v2
  permissions:
    contents: read
  outputs:
    builder_name: ${{ steps.export-builder-name.outputs.builder_name }}
  steps:
    - name: Checkout repository
      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
    # Re-publish the workflow-level BUILDER_NAME as a step output so it can be
    # forwarded as a job output. NOTE(review): presumably needed because the
    # `env` context is not available in the `with:` of a reusable-workflow
    # call -- confirm against the GitHub Actions context availability table.
    - name: Export builder name
      id: export-builder-name
      # BUILDER_NAME is already exported to the step environment by the
      # workflow-level `env:` block, so read it as a shell variable instead of
      # pasting an expression into the script text, and quote the output path.
      run: |
        echo "builder_name=${BUILDER_NAME}" >> "$GITHUB_OUTPUT"
    # fresh_builder='true' makes bootstrap-buildkit remove any existing builder
    # with this name and create a new one; the empty worker-address list
    # selects the action's K8s creation step.
    - name: Create and bootstrap fresh K8s builder
      uses: ./.github/actions/bootstrap-buildkit
      with:
        builder_name: ${{ steps.export-builder-name.outputs.builder_name }}
        buildkit_worker_addresses: ''
        fresh_builder: 'true'
# ============================================================================ # ============================================================================
# FRAMEWORK PIPELINES (Build → Test → Copy) # FRAMEWORK PIPELINES (Build → Test → Copy)
# ============================================================================ # ============================================================================
...@@ -20,16 +50,17 @@ jobs: ...@@ -20,16 +50,17 @@ jobs:
# ============================================================================ # ============================================================================
vllm-pipeline: vllm-pipeline:
uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
needs: [create-fresh-builder]
with: with:
framework: vllm framework: vllm
target: runtime target: runtime
no_cache: true fresh_builder: true
platform: 'linux/amd64,linux/arm64' platform: 'linux/amd64,linux/arm64'
cuda_versions: '["12.9", "13.0"]' cuda_versions: '["12.9", "13.0"]'
extra_tags: | extra_tags: |
${{ github.ref_name == 'main' && 'main-vllm' || '' }} ${{ github.ref_name == 'main' && 'main-vllm' || '' }}
${{ github.ref_name == 'main' && format('main-vllm-{0}', github.sha) || '' }} ${{ github.ref_name == 'main' && format('main-vllm-{0}', github.sha) || '' }}
builder_name: b-${{ github.run_id }}-${{ github.run_attempt }} builder_name: ${{ needs.create-fresh-builder.outputs.builder_name }}
build_timeout_minutes: 180 build_timeout_minutes: 180
cpu_only_test_markers: 'vllm and gpu_0' cpu_only_test_markers: 'vllm and gpu_0'
single_gpu_test_markers: 'vllm and gpu_1' single_gpu_test_markers: 'vllm and gpu_1'
...@@ -43,16 +74,17 @@ jobs: ...@@ -43,16 +74,17 @@ jobs:
# ============================================================================ # ============================================================================
sglang-pipeline: sglang-pipeline:
uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
needs: [create-fresh-builder]
with: with:
framework: sglang framework: sglang
target: runtime target: runtime
no_cache: true fresh_builder: true
platform: 'linux/amd64,linux/arm64' platform: 'linux/amd64,linux/arm64'
cuda_versions: '["12.9", "13.0"]' cuda_versions: '["12.9", "13.0"]'
extra_tags: | extra_tags: |
${{ github.ref_name == 'main' && 'main-sglang' || '' }} ${{ github.ref_name == 'main' && 'main-sglang' || '' }}
${{ github.ref_name == 'main' && format('main-sglang-{0}', github.sha) || '' }} ${{ github.ref_name == 'main' && format('main-sglang-{0}', github.sha) || '' }}
builder_name: b-${{ github.run_id }}-${{ github.run_attempt }} builder_name: ${{ needs.create-fresh-builder.outputs.builder_name }}
build_timeout_minutes: 180 build_timeout_minutes: 180
cpu_only_test_markers: 'sglang and gpu_0' cpu_only_test_markers: 'sglang and gpu_0'
single_gpu_test_markers: 'sglang and gpu_1' single_gpu_test_markers: 'sglang and gpu_1'
...@@ -66,16 +98,17 @@ jobs: ...@@ -66,16 +98,17 @@ jobs:
# ============================================================================ # ============================================================================
trtllm-pipeline: trtllm-pipeline:
uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
needs: [create-fresh-builder]
with: with:
framework: trtllm framework: trtllm
target: runtime target: runtime
no_cache: true fresh_builder: true
platform: 'linux/amd64,linux/arm64' platform: 'linux/amd64,linux/arm64'
cuda_versions: '["13.1"]' cuda_versions: '["13.1"]'
extra_tags: | extra_tags: |
${{ github.ref_name == 'main' && 'main-trtllm' || '' }} ${{ github.ref_name == 'main' && 'main-trtllm' || '' }}
${{ github.ref_name == 'main' && format('main-trtllm-{0}', github.sha) || '' }} ${{ github.ref_name == 'main' && format('main-trtllm-{0}', github.sha) || '' }}
builder_name: b-${{ github.run_id }}-${{ github.run_attempt }} builder_name: ${{ needs.create-fresh-builder.outputs.builder_name }}
build_timeout_minutes: 180 build_timeout_minutes: 180
cpu_only_test_markers: 'trtllm and gpu_0' cpu_only_test_markers: 'trtllm and gpu_0'
single_gpu_test_markers: 'trtllm and gpu_1' single_gpu_test_markers: 'trtllm and gpu_1'
...@@ -84,10 +117,35 @@ jobs: ...@@ -84,10 +117,35 @@ jobs:
multi_gpu_test_timeout_minutes: 120 multi_gpu_test_timeout_minutes: 120
secrets: inherit secrets: inherit
# ============================================================================
# CLEANUP
# ============================================================================
# Cleanup job: removes the per-run K8s builder. `if: always()` makes it run
# even when the pipelines or the pre-warm job fail or are cancelled.
clean-k8s-builder:
  name: Clean K8s builder if exists
  runs-on: prod-default-small-v2
  if: always()
  # create-fresh-builder is listed only for ordering; the builder name itself
  # is taken from the workflow-level BUILDER_NAME env value (see below).
  needs: [vllm-pipeline, sglang-pipeline, trtllm-pipeline, create-fresh-builder]
  permissions:
    contents: read
  steps:
    - name: Checkout repository
      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
    # Register the builder with buildx on this runner so the removal step can
    # address it by name. Best-effort: a failure here must not block removal.
    - name: Register K8s builder context (skip bootstrap)
      uses: ./.github/actions/bootstrap-buildkit
      continue-on-error: true
      with:
        # Use the same source for the name as the removal step. The job output
        # of create-fresh-builder can be empty if that job failed before its
        # export step, which would make this step and the removal step target
        # different builders.
        builder_name: ${{ env.BUILDER_NAME }}
        buildkit_worker_addresses: ''
        skip_bootstrap: 'true'
    - name: Remove K8s builder
      shell: bash
      # BUILDER_NAME comes from the workflow-level `env:` block; pass it as a
      # quoted shell variable rather than interpolating an expression into the
      # script. `|| true` keeps cleanup best-effort when no builder exists.
      run: |
        docker buildx rm "$BUILDER_NAME" || true
############################## SLACK NOTIFICATION ############################## ############################## SLACK NOTIFICATION ##############################
notify-slack: notify-slack:
name: Notify Slack name: Notify Slack
runs-on: prod-builder-amd-v1 runs-on: ubuntu-slim
if: always() && failure() if: always() && failure()
needs: [ vllm-pipeline, sglang-pipeline, trtllm-pipeline ] needs: [ vllm-pipeline, sglang-pipeline, trtllm-pipeline ]
permissions: permissions:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment