Unverified commit 1d086539, authored by Sai Enduri and committed by GitHub
Browse files

[AMD CI] Add image and weights caching. (#11593)

parent a04efc49
...@@ -30,7 +30,7 @@ jobs: ...@@ -30,7 +30,7 @@ jobs:
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
runner: [linux-mi325-gpu-1] runner: [linux-mi300-gpu-1]
runs-on: ${{matrix.runner}} runs-on: ${{matrix.runner}}
steps: steps:
- name: Checkout code - name: Checkout code
...@@ -59,7 +59,7 @@ jobs: ...@@ -59,7 +59,7 @@ jobs:
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
runner: [linux-mi325-gpu-2] runner: [linux-mi300-gpu-2]
runs-on: ${{matrix.runner}} runs-on: ${{matrix.runner}}
steps: steps:
- name: Checkout code - name: Checkout code
...@@ -86,7 +86,7 @@ jobs: ...@@ -86,7 +86,7 @@ jobs:
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
runner: [linux-mi325-gpu-1] runner: [linux-mi300-gpu-1]
runs-on: ${{matrix.runner}} runs-on: ${{matrix.runner}}
steps: steps:
- name: Checkout code - name: Checkout code
...@@ -113,7 +113,7 @@ jobs: ...@@ -113,7 +113,7 @@ jobs:
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
runner: [linux-mi325-gpu-1] runner: [linux-mi300-gpu-1]
runs-on: ${{matrix.runner}} runs-on: ${{matrix.runner}}
steps: steps:
- name: Checkout code - name: Checkout code
...@@ -156,7 +156,7 @@ jobs: ...@@ -156,7 +156,7 @@ jobs:
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
runner: [linux-mi325-gpu-1] runner: [linux-mi300-gpu-1]
runs-on: ${{matrix.runner}} runs-on: ${{matrix.runner}}
steps: steps:
- name: Checkout code - name: Checkout code
...@@ -193,7 +193,7 @@ jobs: ...@@ -193,7 +193,7 @@ jobs:
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
runner: [linux-mi325-gpu-2] runner: [linux-mi300-gpu-2]
runs-on: ${{matrix.runner}} runs-on: ${{matrix.runner}}
steps: steps:
- name: Checkout code - name: Checkout code
...@@ -240,7 +240,7 @@ jobs: ...@@ -240,7 +240,7 @@ jobs:
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
runner: [linux-mi325-gpu-1] runner: [linux-mi300-gpu-1]
part: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] part: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
runs-on: ${{matrix.runner}} runs-on: ${{matrix.runner}}
steps: steps:
...@@ -268,7 +268,7 @@ jobs: ...@@ -268,7 +268,7 @@ jobs:
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
runner: [linux-mi325-gpu-2] runner: [linux-mi300-gpu-2]
runs-on: ${{matrix.runner}} runs-on: ${{matrix.runner}}
steps: steps:
- name: Checkout code - name: Checkout code
...@@ -323,7 +323,7 @@ jobs: ...@@ -323,7 +323,7 @@ jobs:
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
runner: [linux-mi325-gpu-1] runner: [linux-mi300-gpu-1]
runs-on: ${{matrix.runner}} runs-on: ${{matrix.runner}}
steps: steps:
- name: Checkout code - name: Checkout code
......
...@@ -95,6 +95,19 @@ find_latest_image() { ...@@ -95,6 +95,19 @@ find_latest_image() {
*) echo "Error: unsupported GPU architecture '${gpu_arch}'" >&2; return 1 ;; *) echo "Error: unsupported GPU architecture '${gpu_arch}'" >&2; return 1 ;;
esac esac
# First, check local cache
for days_back in {0..6}; do
image_tag="${base_tag}-$(date -d "${days_back} days ago" +%Y%m%d)"
local local_image="rocm/sgl-dev:${image_tag}"
image_id=$(docker images -q "${local_image}")
if [[ -n "$image_id" ]]; then
echo "Found cached image locally: ${local_image}" >&2
echo "${local_image}"
return 0
fi
done
# If not found locally, fall back to pulling from public registry
for days_back in {0..6}; do for days_back in {0..6}; do
image_tag="${base_tag}-$(date -d "${days_back} days ago" +%Y%m%d)" image_tag="${base_tag}-$(date -d "${days_back} days ago" +%Y%m%d)"
echo "Checking for image: rocm/sgl-dev:${image_tag}" >&2 echo "Checking for image: rocm/sgl-dev:${image_tag}" >&2
...@@ -119,13 +132,22 @@ IMAGE=$(find_latest_image "${GPU_ARCH}") ...@@ -119,13 +132,22 @@ IMAGE=$(find_latest_image "${GPU_ARCH}")
echo "Pulling Docker image: ${IMAGE}" echo "Pulling Docker image: ${IMAGE}"
docker pull "${IMAGE}" docker pull "${IMAGE}"
HF_CACHE_HOST=/home/runner/sgl-data/hf-cache
if [[ -d "$HF_CACHE_HOST" ]]; then
CACHE_VOLUME="-v $HF_CACHE_HOST:/hf_home"
else
CACHE_VOLUME=""
fi
echo "Launching container: ci_sglang" echo "Launching container: ci_sglang"
docker run -dt --user root --device=/dev/kfd ${DEVICE_FLAG} \ docker run -dt --user root --device=/dev/kfd ${DEVICE_FLAG} \
-v "${GITHUB_WORKSPACE:-$PWD}:/sglang-checkout" \ -v "${GITHUB_WORKSPACE:-$PWD}:/sglang-checkout" \
$CACHE_VOLUME \
--ipc=host --group-add video \ --ipc=host --group-add video \
--shm-size 32g \ --shm-size 32g \
--cap-add=SYS_PTRACE \ --cap-add=SYS_PTRACE \
-e HF_TOKEN="${HF_TOKEN:-}" \ -e HF_TOKEN="${HF_TOKEN:-}" \
-e HF_HOME=/hf_home \
--security-opt seccomp=unconfined \ --security-opt seccomp=unconfined \
-w /sglang-checkout \ -w /sglang-checkout \
--name ci_sglang \ --name ci_sglang \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment