Unverified Commit fcbd79ab authored by Ran Rubin, committed by GitHub
Browse files

feat(nightly-ci): replace no_cache with fresh_builder for K8s BuildKit isolation (#7678)


Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
parent c43197ee
......@@ -59,6 +59,10 @@ inputs:
description: 'Skip the bootstrap step (only create the builder)'
required: false
default: 'false'
fresh_builder:
description: 'Force creation of a new K8s builder even if one already exists. Used by the create-fresh-builder preliminary job.'
required: false
default: 'false'
runs:
using: "composite"
......@@ -83,11 +87,14 @@ runs:
if: inputs.buildkit_worker_addresses == ''
shell: bash
run: |
if docker buildx inspect ${{ inputs.builder_name }} > /dev/null 2>&1; then
# If exit code is 0 (success), print the message
if [[ "${{ inputs.fresh_builder }}" != "true" ]] && docker buildx inspect ${{ inputs.builder_name }} > /dev/null 2>&1; then
echo "✅ Builder '${{ inputs.builder_name }}' already exists. Skipping creation."
else
echo "K8s Builder '${{ inputs.builder_name }}' does not exist. Creating it."
if [[ "${{ inputs.fresh_builder }}" == "true" ]] && docker buildx inspect ${{ inputs.builder_name }} > /dev/null 2>&1; then
echo "🔄 Forcing fresh K8s builder: removing existing '${{ inputs.builder_name }}'."
docker buildx rm ${{ inputs.builder_name }} || true
fi
echo "🔨 Creating K8s builder '${{ inputs.builder_name }}'."
docker buildx create --use --name ${{ inputs.builder_name }} --driver kubernetes --platform=linux/amd64 \
'--driver-opt=requests.ephemeral-storage=${{ inputs.ephemeral_storage }}' \
'--driver-opt=namespace=${{ inputs.namespace }}' \
......@@ -112,7 +119,7 @@ runs:
fi
sleep 3 # Give the builders some time to be ready
if [[ "${{ inputs.skip_bootstrap }}" != "true" ]]; then
if [[ "${{ inputs.skip_bootstrap }}" != "true" && "${{ inputs.fresh_builder }}" != "true" ]]; then
echo "::warning::Build is using fallback pod. Please alert the ops team."
echo "## ⚠️ Fallback Build Warning" >> $GITHUB_STEP_SUMMARY
echo "This build is running on a **fallback pod**. Please alert the ops team." >> $GITHUB_STEP_SUMMARY
......
......@@ -79,6 +79,10 @@ inputs:
description: 'Additional Docker build args (newline-separated KEY=VALUE pairs) forwarded to docker-remote-build'
required: false
default: ''
fresh_builder:
description: 'Always create a fresh K8s BuildKit builder (skip remote worker routing)'
required: false
default: 'false'
outputs:
target_tag_plain:
description: 'Target tag (without registry prefix)'
......@@ -127,6 +131,7 @@ runs:
flavor: ${{ inputs.framework }}
arch: ${{ inputs.platform }}
cuda_version: ${{ inputs.cuda_version }}
fresh_builder: ${{ inputs.fresh_builder }}
- name: Calculate extra tags
id: extra-tags
shell: bash
......
......@@ -52,6 +52,10 @@ inputs:
description: 'CUDA version (12.9, 13.0). Optional for general flavor.'
required: false
default: ''
fresh_builder:
description: 'Skip remote worker routing and always use the K8s driver. Set by callers that want a fresh, isolated builder per run.'
required: false
default: 'false'
# Passthrough inputs for bootstrap-buildkit (kubernetes fallback)
ephemeral_storage:
......@@ -88,6 +92,7 @@ runs:
steps:
- name: Route buildkit workers
id: route-buildkit
if: inputs.fresh_builder != 'true'
continue-on-error: true
shell: bash
run: |
......@@ -108,6 +113,7 @@ runs:
- name: Prepare worker addresses and platform
id: prepare
if: inputs.fresh_builder != 'true'
shell: bash
env:
AMD64_ADDRS: ${{ steps.route-buildkit.outputs[format('{0}_amd64', inputs.flavor)] }}
......@@ -138,6 +144,7 @@ runs:
with:
builder_name: ${{ inputs.builder_name }}
buildkit_worker_addresses: ${{ steps.prepare.outputs.worker_addresses }}
fresh_builder: ${{ inputs.fresh_builder }}
ephemeral_storage: ${{ inputs.ephemeral_storage }}
namespace: ${{ inputs.namespace }}
replicas: ${{ inputs.replicas }}
......
......@@ -114,6 +114,11 @@ on:
required: false
type: boolean
default: false
fresh_builder:
description: 'Always create a fresh K8s BuildKit builder (skip remote worker routing)'
required: false
type: boolean
default: false
secrets:
AWS_DEFAULT_REGION:
required: true
......@@ -151,6 +156,7 @@ jobs:
cuda_version: ${{ matrix.cuda_version }}
extra_tags: ${{ inputs.extra_tags }}
no_cache: ${{ inputs.no_cache }}
fresh_builder: ${{ inputs.fresh_builder }}
builder_name: ${{ inputs.builder_name }}
build_image: ${{ inputs.build_image }}
build_timeout_minutes: ${{ inputs.build_timeout_minutes }}
......
......@@ -118,6 +118,11 @@ on:
required: false
type: boolean
default: false
fresh_builder:
description: 'Always create a fresh K8s BuildKit builder (skip remote worker routing)'
required: false
type: boolean
default: false
secrets:
AWS_DEFAULT_REGION:
required: true
......@@ -207,6 +212,7 @@ jobs:
no_load: ${{ inputs.no_load }}
no_cache: ${{ inputs.no_cache }}
make_efa: ${{ inputs.make_efa }}
fresh_builder: ${{ inputs.fresh_builder }}
extra_tags: ${{ inputs.extra_tags }}
show_summary: ${{ inputs.push_image && inputs.show_summary }}
......
......@@ -11,7 +11,37 @@ on:
permissions:
contents: read
env:
BUILDER_NAME: b-${{ github.run_id }}-${{ github.run_attempt }}
jobs:
# ============================================================================
# PRE-WARM K8S BUILDER
# ============================================================================
# Preliminary job: creates the per-run K8s BuildKit builder once and publishes
# its name as a job output, which the framework pipelines read through `needs`.
create-fresh-builder:
  name: Create fresh K8s builder
  permissions:
    contents: read
  runs-on: prod-default-small-v2
  outputs:
    builder_name: ${{ steps.export-builder-name.outputs.builder_name }}
  steps:
    - name: Checkout repository
      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0

    # BUILDER_NAME is defined in the workflow-level `env` block, so it is
    # already present in the step's shell environment.
    - id: export-builder-name
      name: Export builder name
      run: |
        echo "builder_name=${BUILDER_NAME}" >> "$GITHUB_OUTPUT"

    # Empty worker addresses plus fresh_builder select the K8s-driver path of
    # the bootstrap action and force a new builder under this name.
    - name: Create and bootstrap fresh K8s builder
      uses: ./.github/actions/bootstrap-buildkit
      with:
        fresh_builder: 'true'
        buildkit_worker_addresses: ''
        builder_name: ${{ steps.export-builder-name.outputs.builder_name }}
# ============================================================================
# FRAMEWORK PIPELINES (Build → Test → Copy)
# ============================================================================
......@@ -20,16 +50,17 @@ jobs:
# ============================================================================
vllm-pipeline:
uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
needs: [create-fresh-builder]
with:
framework: vllm
target: runtime
no_cache: true
fresh_builder: true
platform: 'linux/amd64,linux/arm64'
cuda_versions: '["12.9", "13.0"]'
extra_tags: |
${{ github.ref_name == 'main' && 'main-vllm' || '' }}
${{ github.ref_name == 'main' && format('main-vllm-{0}', github.sha) || '' }}
builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
builder_name: ${{ needs.create-fresh-builder.outputs.builder_name }}
build_timeout_minutes: 180
cpu_only_test_markers: 'vllm and gpu_0'
single_gpu_test_markers: 'vllm and gpu_1'
......@@ -43,16 +74,17 @@ jobs:
# ============================================================================
sglang-pipeline:
uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
needs: [create-fresh-builder]
with:
framework: sglang
target: runtime
no_cache: true
fresh_builder: true
platform: 'linux/amd64,linux/arm64'
cuda_versions: '["12.9", "13.0"]'
extra_tags: |
${{ github.ref_name == 'main' && 'main-sglang' || '' }}
${{ github.ref_name == 'main' && format('main-sglang-{0}', github.sha) || '' }}
builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
builder_name: ${{ needs.create-fresh-builder.outputs.builder_name }}
build_timeout_minutes: 180
cpu_only_test_markers: 'sglang and gpu_0'
single_gpu_test_markers: 'sglang and gpu_1'
......@@ -66,16 +98,17 @@ jobs:
# ============================================================================
trtllm-pipeline:
uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
needs: [create-fresh-builder]
with:
framework: trtllm
target: runtime
no_cache: true
fresh_builder: true
platform: 'linux/amd64,linux/arm64'
cuda_versions: '["13.1"]'
extra_tags: |
${{ github.ref_name == 'main' && 'main-trtllm' || '' }}
${{ github.ref_name == 'main' && format('main-trtllm-{0}', github.sha) || '' }}
builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
builder_name: ${{ needs.create-fresh-builder.outputs.builder_name }}
build_timeout_minutes: 180
cpu_only_test_markers: 'trtllm and gpu_0'
single_gpu_test_markers: 'trtllm and gpu_1'
......@@ -84,10 +117,35 @@ jobs:
multi_gpu_test_timeout_minutes: 120
secrets: inherit
# ============================================================================
# CLEANUP
# ============================================================================
# Best-effort teardown of the per-run K8s builder. Runs under `always()`, so it
# executes even when a pipeline or the builder-creation job failed.
clean-k8s-builder:
  name: Clean K8s builder if exists
  runs-on: prod-default-small-v2
  if: always()
  needs: [vllm-pipeline, sglang-pipeline, trtllm-pipeline, create-fresh-builder]
  permissions:
    contents: read
  steps:
    - name: Checkout repository
      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
    - name: Register K8s builder context (skip bootstrap)
      uses: ./.github/actions/bootstrap-buildkit
      # A failed registration must not prevent the removal step from running.
      continue-on-error: true
      with:
        # Take the name from the workflow-level env rather than from the
        # create-fresh-builder job output: that output is empty when the job
        # failed before its export step, and this job still runs in that case.
        # It also keeps this step and the removal step on the same builder name.
        builder_name: ${{ env.BUILDER_NAME }}
        buildkit_worker_addresses: ''
        skip_bootstrap: 'true'
    - name: Remove K8s builder
      shell: bash
      # BUILDER_NAME is read from the environment (workflow-level `env`) and
      # quoted, instead of being interpolated into the script text.
      # `|| true` keeps the cleanup best-effort when the builder is absent.
      run: |
        docker buildx rm "$BUILDER_NAME" || true
############################## SLACK NOTIFICATION ##############################
notify-slack:
name: Notify Slack
runs-on: prod-builder-amd-v1
runs-on: ubuntu-slim
if: always() && failure()
needs: [ vllm-pipeline, sglang-pipeline, trtllm-pipeline ]
permissions:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment