Unverified commit a61ac684, authored by Ran Rubin, committed by GitHub
Browse files

ci: modify container build routing (#6185)

parent 3188c70a
......@@ -13,25 +13,31 @@
# - TensorRT-LLM uses a different base (pytorch), so it's isolated
# - General builds have no framework, grouped with trtllm for isolation
#
# Pool assignment is also optimized for uneven uptime: pod 0 is the only pod
# running outside business hours (via KEDA), so it accumulates fallback cache
# for all flavors overnight. To compensate, pool 0 is assigned the LIGHTEST
# daytime workload (trtllm + general), while pool 2 (only active during
# business hours) gets the HEAVIEST workload (vllm/sglang-cuda12).
#
# Flavors are routed to BuildKit pods using modulo 3 on the pod index:
# - Pool 0 (idx % 3 == 0): vllm-cuda12, sglang-cuda12 (share cuda-dl-base + wheel_builder cache)
# - Pool 0 (idx % 3 == 0): trtllm-cuda13, general (lightest - offsets overnight fallback load)
# - Pool 1 (idx % 3 == 1): vllm-cuda13, sglang-cuda13 (share cuda-dl-base + wheel_builder cache)
# - Pool 2 (idx % 3 == 2): trtllm-cuda13, general (isolated - different/no framework base)
# - Pool 2 (idx % 3 == 2): vllm-cuda12, sglang-cuda12 (heaviest - only active during business hours)
#
# FALLBACK: If no pods match the target pool, the highest available index is used.
#
# EXPECTED ROUTING TABLE (pod indices returned for each flavor):
# +------+-------------+---------------+-------------+---------------+---------------+---------+
# | Pods | vllm-cuda12 | sglang-cuda12 | vllm-cuda13 | sglang-cuda13 | trtllm-cuda13 | general |
# +------+---------------+---------+-------------+---------------+-------------+---------------+
# | Pods | trtllm-cuda13 | general | vllm-cuda13 | sglang-cuda13 | vllm-cuda12 | sglang-cuda12 |
# | | (mod 0) | (mod 0) | (mod 1) | (mod 1) | (mod 2) | (mod 2) |
# +------+-------------+---------------+-------------+---------------+---------------+---------+
# +------+---------------+---------+-------------+---------------+-------------+---------------+
# | 1 | 0 | 0 | 0 (fb) | 0 (fb) | 0 (fb) | 0 (fb) |
# | 2 | 0 | 0 | 1 | 1 | 1 (fb) | 1 (fb) |
# | 3 | 0 | 0 | 1 | 1 | 2 | 2 |
# | 4 | 0, 3 | 0, 3 | 1 | 1 | 2 | 2 |
# | 5 | 0, 3 | 0, 3 | 1, 4 | 1, 4 | 2 | 2 |
# | 6 | 0, 3 | 0, 3 | 1, 4 | 1, 4 | 2, 5 | 2, 5 |
# +------+-------------+---------------+-------------+---------------+---------------+---------+
# +------+---------------+---------+-------------+---------------+-------------+---------------+
# (fb) = fallback - no pods matched target pool, returns max available index
#
# =============================================================================
......@@ -177,21 +183,21 @@ get_target_indices() {
local target_mod
case "$route_key" in
# --- POOL 0: CUDA 12 builds (vLLM + SGLang share cuda-dl-base:cuda12.9) ---
vllm-cuda12|sglang-cuda12)
# --- POOL 0: Isolated builds — lightest load offsets overnight fallback accumulation ---
trtllm-cuda13|general-*)
target_mod=0
;;
# --- POOL 1: CUDA 13 builds (vLLM + SGLang share cuda-dl-base:cuda13.0) ---
vllm-cuda13|sglang-cuda13)
target_mod=1
;;
# --- POOL 2: Isolated builds (TensorRT-LLM uses pytorch base, general has no framework) ---
trtllm-cuda13|general-*)
# --- POOL 2: CUDA 12 builds — heaviest load, only active during business hours ---
vllm-cuda12|sglang-cuda12)
target_mod=2
;;
# --- FALLBACK ---
*)
target_mod=2
target_mod=0
;;
esac
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment