Unverified commit a61ac684, authored by Ran Rubin, committed by GitHub
Browse files

ci: modify container build routing (#6185)

parent 3188c70a
...@@ -13,25 +13,31 @@ ...@@ -13,25 +13,31 @@
# - TensorRT-LLM uses a different base (pytorch), so it's isolated # - TensorRT-LLM uses a different base (pytorch), so it's isolated
# - General builds have no framework, grouped with trtllm for isolation # - General builds have no framework, grouped with trtllm for isolation
# #
# Pool assignment is also optimized for uneven uptime: pod 0 is the only pod
# running outside business hours (via KEDA), so it accumulates fallback cache
# for all flavors overnight. To compensate, pool 0 is assigned the LIGHTEST
# daytime workload (trtllm + general), while pool 2 (only active during
# business hours) gets the HEAVIEST workload (vllm/sglang-cuda12).
#
# Flavors are routed to BuildKit pods using modulo 3 on the pod index: # Flavors are routed to BuildKit pods using modulo 3 on the pod index:
# - Pool 0 (idx % 3 == 0): vllm-cuda12, sglang-cuda12 (share cuda-dl-base + wheel_builder cache) # - Pool 0 (idx % 3 == 0): trtllm-cuda13, general (lightest - offsets overnight fallback load)
# - Pool 1 (idx % 3 == 1): vllm-cuda13, sglang-cuda13 (share cuda-dl-base + wheel_builder cache) # - Pool 1 (idx % 3 == 1): vllm-cuda13, sglang-cuda13 (share cuda-dl-base + wheel_builder cache)
# - Pool 2 (idx % 3 == 2): trtllm-cuda13, general (isolated - different/no framework base) # - Pool 2 (idx % 3 == 2): vllm-cuda12, sglang-cuda12 (heaviest - only active during business hours)
# #
# FALLBACK: If no pods match the target pool, the highest available index is used. # FALLBACK: If no pods match the target pool, the highest available index is used.
# #
# EXPECTED ROUTING TABLE (pod indices returned for each flavor): # EXPECTED ROUTING TABLE (pod indices returned for each flavor):
# +------+-------------+---------------+-------------+---------------+---------------+---------+ # +------+---------------+---------+-------------+---------------+-------------+---------------+
# | Pods | vllm-cuda12 | sglang-cuda12 | vllm-cuda13 | sglang-cuda13 | trtllm-cuda13 | general | # | Pods | trtllm-cuda13 | general | vllm-cuda13 | sglang-cuda13 | vllm-cuda12 | sglang-cuda12 |
# | | (mod 0) | (mod 0) | (mod 1) | (mod 1) | (mod 2) | (mod 2) | # | | (mod 0) | (mod 0) | (mod 1) | (mod 1) | (mod 2) | (mod 2) |
# +------+-------------+---------------+-------------+---------------+---------------+---------+ # +------+---------------+---------+-------------+---------------+-------------+---------------+
# | 1 | 0 | 0 | 0 (fb) | 0 (fb) | 0 (fb) | 0 (fb) | # | 1 | 0 | 0 | 0 (fb) | 0 (fb) | 0 (fb) | 0 (fb) |
# | 2 | 0 | 0 | 1 | 1 | 1 (fb) | 1 (fb) | # | 2 | 0 | 0 | 1 | 1 | 1 (fb) | 1 (fb) |
# | 3 | 0 | 0 | 1 | 1 | 2 | 2 | # | 3 | 0 | 0 | 1 | 1 | 2 | 2 |
# | 4 | 0, 3 | 0, 3 | 1 | 1 | 2 | 2 | # | 4 | 0, 3 | 0, 3 | 1 | 1 | 2 | 2 |
# | 5 | 0, 3 | 0, 3 | 1, 4 | 1, 4 | 2 | 2 | # | 5 | 0, 3 | 0, 3 | 1, 4 | 1, 4 | 2 | 2 |
# | 6 | 0, 3 | 0, 3 | 1, 4 | 1, 4 | 2, 5 | 2, 5 | # | 6 | 0, 3 | 0, 3 | 1, 4 | 1, 4 | 2, 5 | 2, 5 |
# +------+-------------+---------------+-------------+---------------+---------------+---------+ # +------+---------------+---------+-------------+---------------+-------------+---------------+
# (fb) = fallback - no pods matched target pool, returns max available index # (fb) = fallback - no pods matched target pool, returns max available index
# #
# ============================================================================= # =============================================================================
...@@ -177,21 +183,21 @@ get_target_indices() { ...@@ -177,21 +183,21 @@ get_target_indices() {
local target_mod local target_mod
case "$route_key" in case "$route_key" in
# --- POOL 0: CUDA 12 builds (vLLM + SGLang share cuda-dl-base:cuda12.9) --- # --- POOL 0: Isolated builds — lightest load offsets overnight fallback accumulation ---
vllm-cuda12|sglang-cuda12) trtllm-cuda13|general-*)
target_mod=0 target_mod=0
;; ;;
# --- POOL 1: CUDA 13 builds (vLLM + SGLang share cuda-dl-base:cuda13.0) --- # --- POOL 1: CUDA 13 builds (vLLM + SGLang share cuda-dl-base:cuda13.0) ---
vllm-cuda13|sglang-cuda13) vllm-cuda13|sglang-cuda13)
target_mod=1 target_mod=1
;; ;;
# --- POOL 2: Isolated builds (TensorRT-LLM uses pytorch base, general has no framework) --- # --- POOL 2: CUDA 12 builds — heaviest load, only active during business hours ---
trtllm-cuda13|general-*) vllm-cuda12|sglang-cuda12)
target_mod=2 target_mod=2
;; ;;
# --- FALLBACK --- # --- FALLBACK ---
*) *)
target_mod=2 target_mod=0
;; ;;
esac esac
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment