Unverified Commit d5af6040 authored by Ran Rubin's avatar Ran Rubin Committed by GitHub
Browse files

chore: container build maintenance (#6316)

parent 9200ac9b
...@@ -125,4 +125,15 @@ runs: ...@@ -125,4 +125,15 @@ runs:
shell: bash shell: bash
run: | run: |
echo "Bootstrapping buildkit..." echo "Bootstrapping buildkit..."
docker buildx inspect ${{ inputs.builder_name }} --bootstrap for i in 1 2 3; do
if docker buildx inspect ${{ inputs.builder_name }} --bootstrap; then
echo "Bootstrap succeeded on attempt $i"
break
fi
if [ "$i" -eq 3 ]; then
echo "::error::Bootstrap failed after 3 attempts"
exit 1
fi
echo "::warning::Bootstrap attempt $i failed, retrying in 10s..."
sleep 10
done
...@@ -107,8 +107,7 @@ runs: ...@@ -107,8 +107,7 @@ runs:
# TODO: Fix this - Skip cache for frontend target - a different docker driver is used for the EPP build, which causes issues with cache export # TODO: Fix this - Skip cache for frontend target - a different docker driver is used for the EPP build, which causes issues with cache export
CACHE_ARGS="" CACHE_ARGS=""
if [[ "${{ inputs.target }}" != "frontend" ]]; then if [[ "${{ inputs.target }}" != "frontend" ]]; then
CACHE_ARGS="--cache-to type=inline " CACHE_ARGS="--cache-from type=registry,ref=${ECR_HOSTNAME}/ai-dynamo/dynamo:${{ inputs.framework }}-cuda${CUDA_VERSION_MAJOR}-${PLATFORM}-cache "
CACHE_ARGS+="--cache-from type=registry,ref=${ECR_HOSTNAME}/ai-dynamo/dynamo:${{ inputs.framework }}-cuda${CUDA_VERSION_MAJOR}-${PLATFORM}-cache "
CACHE_ARGS+="--cache-from type=registry,ref=${ECR_HOSTNAME}/ai-dynamo/dynamo:main-${{ inputs.framework }}-cuda${CUDA_VERSION_MAJOR}-${PLATFORM} " CACHE_ARGS+="--cache-from type=registry,ref=${ECR_HOSTNAME}/ai-dynamo/dynamo:main-${{ inputs.framework }}-cuda${CUDA_VERSION_MAJOR}-${PLATFORM} "
CACHE_ARGS+="--cache-from type=registry,ref=${ECR_HOSTNAME}/ai-dynamo/dynamo:release-${{ inputs.framework }}-cuda${CUDA_VERSION_MAJOR}-${PLATFORM}-cache " CACHE_ARGS+="--cache-from type=registry,ref=${ECR_HOSTNAME}/ai-dynamo/dynamo:release-${{ inputs.framework }}-cuda${CUDA_VERSION_MAJOR}-${PLATFORM}-cache "
if [[ "$GITHUB_REF_NAME" =~ ^release ]]; then if [[ "$GITHUB_REF_NAME" =~ ^release ]]; then
......
...@@ -20,25 +20,30 @@ ...@@ -20,25 +20,30 @@
# business hours) gets the HEAVIEST workload (vllm/sglang-cuda12). # business hours) gets the HEAVIEST workload (vllm/sglang-cuda12).
# #
# Flavors are routed to BuildKit pods using modulo 3 on the pod index: # Flavors are routed to BuildKit pods using modulo 3 on the pod index:
# - Pool 0 (idx % 3 == 0): trtllm-cuda13, general (lightest - offsets overnight fallback load) # - Pool 0 (idx % 3 == 0): trtllm (any CUDA), general (lightest - offsets overnight fallback load)
# - Pool 1 (idx % 3 == 1): vllm-cuda13, sglang-cuda13 (share cuda-dl-base + wheel_builder cache) # - Pool 1 (idx % 3 == 1): vllm-cuda13, sglang-cuda13 (share cuda-dl-base + wheel_builder cache)
# - Pool 2 (idx % 3 == 2): vllm-cuda12, sglang-cuda12 (heaviest - only active during business hours) # - Pool 2 (idx % 3 == 2): vllm-cuda12, sglang-cuda12 (heaviest - only active during business hours)
# Note: Unrecognized route keys (e.g. trtllm-cuda12) fall through to pool 0 via wildcard.
#
# SELECTION: From the candidate pool, ONE pod is randomly selected and its
# tcp:// address is written to $GITHUB_OUTPUT.
# #
# FALLBACK: If no pods match the target pool, the highest available index is used. # FALLBACK: If no pods match the target pool, the highest available index is used.
# #
# EXPECTED ROUTING TABLE (pod indices returned for each flavor): # CANDIDATE POOL TABLE (one pod is randomly selected from the candidate set):
# +------+---------------+---------+-------------+---------------+-------------+---------------+ # +------+---------------------+---------+---------------+---------------+---------------+---------------+
# | Pods | trtllm-cuda13 | general | vllm-cuda13 | sglang-cuda13 | vllm-cuda12 | sglang-cuda12 | # | Pods | trtllm (any cuda) | general | vllm-cuda13 | sglang-cuda13 | vllm-cuda12 | sglang-cuda12 |
# | | (mod 0) | (mod 0) | (mod 1) | (mod 1) | (mod 2) | (mod 2) | # | | (pool 0, mod 0) | (pool 0)| (pool 1,mod 1)| (pool 1,mod 1)| (pool 2,mod 2)| (pool 2,mod 2)|
# +------+---------------+---------+-------------+---------------+-------------+---------------+ # +------+---------------------+---------+---------------+---------------+---------------+---------------+
# | 1 | 0 | 0 | 0 (fb) | 0 (fb) | 0 (fb) | 0 (fb) | # | 1 | {0} | {0} | {0} (fb) | {0} (fb) | {0} (fb) | {0} (fb) |
# | 2 | 0 | 0 | 1 | 1 | 1 (fb) | 1 (fb) | # | 2 | {0} | {0} | {1} | {1} | {1} (fb) | {1} (fb) |
# | 3 | 0 | 0 | 1 | 1 | 2 | 2 | # | 3 | {0} | {0} | {1} | {1} | {2} | {2} |
# | 4 | 0, 3 | 0, 3 | 1 | 1 | 2 | 2 | # | 4 | {0, 3} | {0, 3} | {1} | {1} | {2} | {2} |
# | 5 | 0, 3 | 0, 3 | 1, 4 | 1, 4 | 2 | 2 | # | 5 | {0, 3} | {0, 3} | {1, 4} | {1, 4} | {2} | {2} |
# | 6 | 0, 3 | 0, 3 | 1, 4 | 1, 4 | 2, 5 | 2, 5 | # | 6 | {0, 3} | {0, 3} | {1, 4} | {1, 4} | {2, 5} | {2, 5} |
# +------+---------------+---------+-------------+---------------+-------------+---------------+ # +------+---------------------+---------+---------------+---------------+---------------+---------------+
# (fb) = fallback - no pods matched target pool, returns max available index # {x, y} = candidate pool; ONE pod is randomly selected from this set
# (fb) = no pods in target pool; falls back to highest available index
# #
# ============================================================================= # =============================================================================
...@@ -142,7 +147,7 @@ if ! command -v nslookup &> /dev/null; then ...@@ -142,7 +147,7 @@ if ! command -v nslookup &> /dev/null; then
fi fi
# --- RETRY CONFIGURATION --- # --- RETRY CONFIGURATION ---
MAX_RETRIES=${MAX_RETRIES:-8} MAX_RETRIES=${MAX_RETRIES:-2}
RETRY_DELAY=${RETRY_DELAY:-30} RETRY_DELAY=${RETRY_DELAY:-30}
# --------------------------- # ---------------------------
......
aiconfigurator @ 7c08d2f2
Subproject commit 7c08d2f2c4e289afe49f48e56d392a7d7221155d
...@@ -84,12 +84,13 @@ RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \ ...@@ -84,12 +84,13 @@ RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \
COPY --chmod=775 --chown=dynamo:0 benchmarks/ /workspace/benchmarks/ COPY --chmod=775 --chown=dynamo:0 benchmarks/ /workspace/benchmarks/
# Install common and test dependencies as root # Install common and test dependencies as root
RUN --mount=type=bind,source=.,target=/mnt/local_src \ RUN --mount=type=bind,source=container/deps/requirements.txt,target=/tmp/deps/requirements.txt \
--mount=type=bind,source=container/deps/requirements.test.txt,target=/tmp/deps/requirements.test.txt \
--mount=type=cache,target=/root/.cache/pip,sharing=locked \ --mount=type=cache,target=/root/.cache/pip,sharing=locked \
export PIP_CACHE_DIR=/root/.cache/pip && \ export PIP_CACHE_DIR=/root/.cache/pip && \
pip install --break-system-packages \ pip install --break-system-packages \
--requirement /mnt/local_src/container/deps/requirements.txt \ --requirement /tmp/deps/requirements.txt \
--requirement /mnt/local_src/container/deps/requirements.test.txt \ --requirement /tmp/deps/requirements.test.txt \
sglang==${SGLANG_VERSION} && \ sglang==${SGLANG_VERSION} && \
cd /workspace/benchmarks && \ cd /workspace/benchmarks && \
pip install --break-system-packages . && \ pip install --break-system-packages . && \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment