"vscode:/vscode.git/clone" did not exist on "751b4c269d98525a6c1031f945adfbd2b3445de1"
Unverified Commit 1d086539 authored by Sai Enduri's avatar Sai Enduri Committed by GitHub
Browse files

[AMD CI] Add image and weights caching. (#11593)

parent a04efc49
......@@ -30,7 +30,7 @@ jobs:
strategy:
fail-fast: false
matrix:
runner: [linux-mi325-gpu-1]
runner: [linux-mi300-gpu-1]
runs-on: ${{matrix.runner}}
steps:
- name: Checkout code
......@@ -59,7 +59,7 @@ jobs:
strategy:
fail-fast: false
matrix:
runner: [linux-mi325-gpu-2]
runner: [linux-mi300-gpu-2]
runs-on: ${{matrix.runner}}
steps:
- name: Checkout code
......@@ -86,7 +86,7 @@ jobs:
strategy:
fail-fast: false
matrix:
runner: [linux-mi325-gpu-1]
runner: [linux-mi300-gpu-1]
runs-on: ${{matrix.runner}}
steps:
- name: Checkout code
......@@ -113,7 +113,7 @@ jobs:
strategy:
fail-fast: false
matrix:
runner: [linux-mi325-gpu-1]
runner: [linux-mi300-gpu-1]
runs-on: ${{matrix.runner}}
steps:
- name: Checkout code
......@@ -156,7 +156,7 @@ jobs:
strategy:
fail-fast: false
matrix:
runner: [linux-mi325-gpu-1]
runner: [linux-mi300-gpu-1]
runs-on: ${{matrix.runner}}
steps:
- name: Checkout code
......@@ -193,7 +193,7 @@ jobs:
strategy:
fail-fast: false
matrix:
runner: [linux-mi325-gpu-2]
runner: [linux-mi300-gpu-2]
runs-on: ${{matrix.runner}}
steps:
- name: Checkout code
......@@ -240,7 +240,7 @@ jobs:
strategy:
fail-fast: false
matrix:
runner: [linux-mi325-gpu-1]
runner: [linux-mi300-gpu-1]
part: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
runs-on: ${{matrix.runner}}
steps:
......@@ -268,7 +268,7 @@ jobs:
strategy:
fail-fast: false
matrix:
runner: [linux-mi325-gpu-2]
runner: [linux-mi300-gpu-2]
runs-on: ${{matrix.runner}}
steps:
- name: Checkout code
......@@ -323,7 +323,7 @@ jobs:
strategy:
fail-fast: false
matrix:
runner: [linux-mi325-gpu-1]
runner: [linux-mi300-gpu-1]
runs-on: ${{matrix.runner}}
steps:
- name: Checkout code
......
......@@ -95,6 +95,19 @@ find_latest_image() {
*) echo "Error: unsupported GPU architecture '${gpu_arch}'" >&2; return 1 ;;
esac
# First, check local cache
for days_back in {0..6}; do
image_tag="${base_tag}-$(date -d "${days_back} days ago" +%Y%m%d)"
local local_image="rocm/sgl-dev:${image_tag}"
image_id=$(docker images -q "${local_image}")
if [[ -n "$image_id" ]]; then
echo "Found cached image locally: ${local_image}" >&2
echo "${local_image}"
return 0
fi
done
# If not found locally, fall back to pulling from public registry
for days_back in {0..6}; do
image_tag="${base_tag}-$(date -d "${days_back} days ago" +%Y%m%d)"
echo "Checking for image: rocm/sgl-dev:${image_tag}" >&2
......@@ -119,13 +132,22 @@ IMAGE=$(find_latest_image "${GPU_ARCH}")
echo "Pulling Docker image: ${IMAGE}"
docker pull "${IMAGE}"
# Host-side directory for the Hugging Face cache that is shared with the CI container.
HF_CACHE_HOST=/home/runner/sgl-data/hf-cache
# Bind-mount the cache at /hf_home only when the directory exists on this host.
# Otherwise leave CACHE_VOLUME empty; it is expanded unquoted in the docker run
# command below, so an empty value adds no argument.
if [[ -d "$HF_CACHE_HOST" ]]; then
CACHE_VOLUME="-v $HF_CACHE_HOST:/hf_home"
else
CACHE_VOLUME=""
fi
echo "Launching container: ci_sglang"
docker run -dt --user root --device=/dev/kfd ${DEVICE_FLAG} \
-v "${GITHUB_WORKSPACE:-$PWD}:/sglang-checkout" \
$CACHE_VOLUME \
--ipc=host --group-add video \
--shm-size 32g \
--cap-add=SYS_PTRACE \
-e HF_TOKEN="${HF_TOKEN:-}" \
-e HF_HOME=/hf_home \
--security-opt seccomp=unconfined \
-w /sglang-checkout \
--name ci_sglang \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment