ci: modify container build routing (#6185)

a61ac684 · Ran Rubin · GitHub · 3188c70a · a61ac684
Unverified Commit a61ac684 authored Feb 11, 2026 by Ran Rubin Committed by GitHub Feb 11, 2026
Show whitespace changes
Inline Side-by-side

Showing with 24 additions and 18 deletions

.github/scripts/route_buildkit.sh .github/scripts/route_buildkit.sh +24 -18

No files found.
--- a/.github/scripts/route_buildkit.sh
+++ b/.github/scripts/route_buildkit.sh
@@ -13,25 +13,31 @@
 #   - TensorRT-LLM uses a different base (pytorch), so it's isolated
 #   - General builds have no framework, grouped with trtllm for isolation
 #
+# Pool assignment also accounts for uneven uptime: pod 0 (pool 0) is the only
+# pod running outside business hours (via KEDA), so every flavor falls back to
+# it overnight and it accumulates cache for all of them. To compensate, pool 0
+# gets the LIGHTEST daytime workload (trtllm-cuda13 + general), while pool 2
+# (business hours only) gets the HEAVIEST (vllm-cuda12 + sglang-cuda12).
+#
 # Flavors are routed to BuildKit pods using modulo 3 on the pod index:
-#   - Pool 0 (idx % 3 == 0): vllm-cuda12, sglang-cuda12  (share cuda-dl-base + wheel_builder cache)
+#   - Pool 0 (idx % 3 == 0): trtllm-cuda13, general      (lightest - offsets overnight fallback load)
 #   - Pool 1 (idx % 3 == 1): vllm-cuda13, sglang-cuda13  (share cuda-dl-base + wheel_builder cache)
-#   - Pool 2 (idx % 3 == 2): trtllm-cuda13, general      (isolated - different/no framework base)
+#   - Pool 2 (idx % 3 == 2): vllm-cuda12, sglang-cuda12  (heaviest - only active during business hours)
 #
 # FALLBACK: If no pods match the target pool, the highest available index is used.
 #
 # EXPECTED ROUTING TABLE (pod indices returned for each flavor):
-# +------+-------------+---------------+-------------+---------------+---------------+---------+
-# | Pods | vllm-cuda12 | sglang-cuda12 | vllm-cuda13 | sglang-cuda13 | trtllm-cuda13 | general |
+# +------+---------------+---------+-------------+---------------+-------------+---------------+
+# | Pods | trtllm-cuda13 | general | vllm-cuda13 | sglang-cuda13 | vllm-cuda12 | sglang-cuda12 |
 # |      | (mod 0)       | (mod 0) | (mod 1)     | (mod 1)       | (mod 2)     | (mod 2)       |
-# +------+-------------+---------------+-------------+---------------+---------------+---------+
+# +------+---------------+---------+-------------+---------------+-------------+---------------+
 # |  1   | 0             | 0       | 0 (fb)      | 0 (fb)        | 0 (fb)      | 0 (fb)        |
 # |  2   | 0             | 0       | 1           | 1             | 1 (fb)      | 1 (fb)        |
 # |  3   | 0             | 0       | 1           | 1             | 2           | 2             |
 # |  4   | 0, 3          | 0, 3    | 1           | 1             | 2           | 2             |
 # |  5   | 0, 3          | 0, 3    | 1, 4        | 1, 4          | 2           | 2             |
 # |  6   | 0, 3          | 0, 3    | 1, 4        | 1, 4          | 2, 5        | 2, 5          |
-# +------+-------------+---------------+-------------+---------------+---------------+---------+
+# +------+---------------+---------+-------------+---------------+-------------+---------------+
 # (fb) = fallback - no pods matched target pool, returns max available index
 #
 # =============================================================================
@@ -177,21 +183,21 @@ get_target_indices() {
  local target_mod

  case "$route_key" in
-    # --- POOL 0: CUDA 12 builds (vLLM + SGLang share cuda-dl-base:cuda12.9) ---
-    vllm-cuda12|sglang-cuda12)
+    # --- POOL 0: Isolated builds — lightest load offsets overnight fallback accumulation ---
+    trtllm-cuda13|general-*)
      target_mod=0
      ;;
    # --- POOL 1: CUDA 13 builds (vLLM + SGLang share cuda-dl-base:cuda13.0) ---
    vllm-cuda13|sglang-cuda13)
      target_mod=1
      ;;
-    # --- POOL 2: Isolated builds (TensorRT-LLM uses pytorch base, general has no framework) ---
-    trtllm-cuda13|general-*)
+    # --- POOL 2: CUDA 12 builds — heaviest load, only active during business hours ---
+    vllm-cuda12|sglang-cuda12)
      target_mod=2
      ;;
    # --- FALLBACK ---
    *)
-      target_mod=2
+      target_mod=0
      ;;
  esac