#!/bin/bash
# amd_ci_start_container.sh
#
# Selects a rocm/sgl-dev image that matches the runner's GPU architecture and
# the current SGLang version, pulls it, and launches it as the `ci_sglang`
# container.
#
# Usage: amd_ci_start_container.sh [--mi30x-base-tag TAG] [--mi35x-base-tag TAG]
set -euo pipefail

# Get version from SGLang version.py file
SGLANG_VERSION_FILE="$(dirname -- "$0")/../../python/sglang/version.py"
SGLANG_VERSION="v0.5.0rc0"   # Default version, will be overridden if version.py is found

if [[ -f "$SGLANG_VERSION_FILE" ]]; then
  # Extract the string assigned to __version__ and prefix it with "v".
  # A python3 failure is deliberately swallowed (stderr discarded, empty
  # result) so the script falls back to the default instead of aborting.
  VERSION_FROM_FILE=$(python3 -c '
import re, sys
with open(sys.argv[1], "r") as f:
    content = f.read()
    match = re.search(r"__version__\s*=\s*[\"'"'"'](.*?)[\"'"'"']", content)
    if match:
        print("v" + match.group(1))
' "$SGLANG_VERSION_FILE" 2>/dev/null || echo "")

  if [[ -n "$VERSION_FROM_FILE" ]]; then
      SGLANG_VERSION="$VERSION_FROM_FILE"
      echo "Using SGLang version from version.py: $SGLANG_VERSION"
  else
      echo "Warning: Could not parse version from $SGLANG_VERSION_FILE, using default: $SGLANG_VERSION" >&2
  fi
else
  echo "Warning: version.py not found, using default version: $SGLANG_VERSION" >&2
fi

# Default base tags (can be overridden by command line arguments)
DEFAULT_MI30X_BASE_TAG="${SGLANG_VERSION}-rocm630-mi30x"
DEFAULT_MI35X_BASE_TAG="${SGLANG_VERSION}-rocm700-mi35x"

MI30X_BASE_TAG="${DEFAULT_MI30X_BASE_TAG}"
MI35X_BASE_TAG="${DEFAULT_MI35X_BASE_TAG}"

#######################################
# Parse command line arguments and override the base tags accordingly.
# Globals:   MI30X_BASE_TAG, MI35X_BASE_TAG (written)
# Arguments: the script's command line ("$@")
# Outputs:   usage text on stdout for -h/--help; error messages on stderr
# Returns:   exits 0 after printing help; exits 1 on an unknown option or
#            when an option is missing its TAG value
#######################################
parse_base_tag_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --mi30x-base-tag|--mi35x-base-tag)
        # Check arity explicitly: under `set -u` a bare "$2" would otherwise
        # abort with an unhelpful "unbound variable" message.
        if [[ $# -lt 2 ]]; then
          echo "Error: option $1 requires a TAG argument" >&2
          exit 1
        fi
        if [[ "$1" == "--mi30x-base-tag" ]]; then
          MI30X_BASE_TAG="$2"
        else
          MI35X_BASE_TAG="$2"
        fi
        shift 2
        ;;
      -h|--help)
        echo "Usage: $0 [--mi30x-base-tag TAG] [--mi35x-base-tag TAG]"
        exit 0
        ;;
      *)
        echo "Unknown option $1" >&2
        exit 1
        ;;
    esac
  done
}

parse_base_tag_args "$@"

#######################################
# Detect the GPU architecture from a Kubernetes runner hostname and collapse
# it to one of the image families this script handles (mi30x or mi35x).
# Globals:   GPU_ARCH (written)
# Arguments: $1 - hostname, e.g. linux-mi35x-gpu-1-xxxxx-runner-zzzzz
# Outputs:   progress messages on stdout, warnings on stderr
#######################################
detect_gpu_arch() {
  local host=$1
  local -r host_pattern='^linux-(mi[0-9]+[a-z]*)-gpu-[0-9]+'

  GPU_ARCH="mi30x"   # default
  if [[ "${host}" =~ ${host_pattern} ]]; then
    GPU_ARCH="${BASH_REMATCH[1]}"
    echo "Detected GPU architecture from hostname: ${GPU_ARCH}"
  else
    echo "Warning: could not parse GPU architecture from '${host}', defaulting to ${GPU_ARCH}" >&2
  fi

  # Normalise / collapse architectures we do not yet build specifically for
  case "${GPU_ARCH}" in
    mi35x)
      echo "Runner uses ${GPU_ARCH}; will fetch mi35x image."
      ;;
    mi30x|mi300|mi325)
      echo "Runner uses ${GPU_ARCH}; will fetch mi30x image."
      GPU_ARCH="mi30x"
      ;;
    *)
      echo "Runner architecture '${GPU_ARCH}' unrecognised; defaulting to mi30x image." >&2
      GPU_ARCH="mi30x"
      ;;
  esac
}

# Detect GPU architecture from the Kubernetes runner hostname
HOSTNAME_VALUE=$(hostname)
detect_gpu_arch "${HOSTNAME_VALUE}"


# Set up DEVICE_FLAG based on Kubernetes pod info
# DEVICE_FLAG carries `docker run` arguments and is expanded unquoted by the
# launch command further down, so it is kept as a plain string.
if [[ -f /etc/podinfo/gha-render-devices ]]; then
  DEVICE_FLAG=$(</etc/podinfo/gha-render-devices)
else
  DEVICE_FLAG="--device /dev/dri"
fi

# Find the latest image
#######################################
# Print the most recent rocm/sgl-dev image reference for a GPU architecture.
# Candidate tags are "<base tag>-<YYYYMMDD>" for today and the six previous
# days. The local Docker image cache is consulted first, then the registry
# (via `docker manifest inspect`); if neither has a match, a hard-coded
# fallback image is printed.
# Globals:   MI30X_BASE_TAG, MI35X_BASE_TAG (read)
# Arguments: $1 - GPU architecture, "mi30x" or "mi35x"
# Outputs:   the image reference on stdout; progress and errors on stderr
# Returns:   0 when an image reference was printed, 1 for an unsupported
#            architecture
#######################################
find_latest_image() {
  local gpu_arch=$1
  local base_tag days_back image_ref image_id
  local -a candidates=()

  case "${gpu_arch}" in
      mi30x) base_tag="${MI30X_BASE_TAG}" ;;
      mi35x) base_tag="${MI35X_BASE_TAG}" ;;
      *)     echo "Error: unsupported GPU architecture '${gpu_arch}'" >&2; return 1 ;;
  esac

  # Build the candidate list once, newest first, so both lookups below see
  # the same dates and `date` is not invoked twice per day.
  for days_back in {0..6}; do
    candidates+=("rocm/sgl-dev:${base_tag}-$(date -d "${days_back} days ago" +%Y%m%d)")
  done

  # First, check local cache
  for image_ref in "${candidates[@]}"; do
    # A failing `docker images` is treated as a cache miss.
    image_id=$(docker images -q "${image_ref}") || image_id=""
    if [[ -n "${image_id}" ]]; then
        echo "Found cached image locally: ${image_ref}" >&2
        echo "${image_ref}"
        return 0
    fi
  done

  # If not found locally, fall back to pulling from public registry
  for image_ref in "${candidates[@]}"; do
    echo "Checking for image: ${image_ref}" >&2
    if docker manifest inspect "${image_ref}" >/dev/null 2>&1; then
      echo "Found available image: ${image_ref}" >&2
      echo "${image_ref}"
      return 0
    fi
  done

  echo "Error: no ${gpu_arch} image found in the last 7 days for base ${base_tag}" >&2
  echo "Using hard-coded fallback…" >&2
  if [[ "${gpu_arch}" == "mi35x" ]]; then
    echo "rocm/sgl-dev:v0.5.0rc0-rocm700-mi35x-20250812"
  else
    echo "rocm/sgl-dev:v0.5.0rc0-rocm630-mi30x-20250812"
  fi
}

# Pull and run the latest image
IMAGE=$(find_latest_image "${GPU_ARCH}")
# Fail with a clear message rather than handing docker an empty reference.
if [[ -z "${IMAGE}" ]]; then
  echo "Error: could not determine a Docker image for ${GPU_ARCH}" >&2
  exit 1
fi
echo "Pulling Docker image: ${IMAGE}"
docker pull "${IMAGE}"
# Mount the host Hugging Face cache at /hf_home only when the directory exists.
# CACHE_VOLUME is expanded unquoted by the launch command below, so it is kept
# as a plain string that is either empty or "-v <host>:<container>".
HF_CACHE_HOST=/home/runner/sgl-data/hf-cache
if [[ -d "$HF_CACHE_HOST" ]]; then
    CACHE_VOLUME="-v $HF_CACHE_HOST:/hf_home"
else
    CACHE_VOLUME=""
fi

echo "Launching container: ci_sglang"
# Export HF_TOKEN (empty when unset) so that `-e HF_TOKEN` forwards it from
# the environment; this keeps the secret out of the docker command line.
export HF_TOKEN="${HF_TOKEN:-}"
# DEVICE_FLAG and CACHE_VOLUME are deliberately left unquoted: each holds zero
# or more whitespace-separated docker arguments that must be word-split.
# shellcheck disable=SC2086
docker run -dt --user root --device=/dev/kfd ${DEVICE_FLAG} \
  -v "${GITHUB_WORKSPACE:-$PWD}:/sglang-checkout" \
  ${CACHE_VOLUME} \
  --ipc=host --group-add video \
  --shm-size 32g \
  --cap-add=SYS_PTRACE \
  -e HF_TOKEN \
  -e HF_HOME=/hf_home \
  --security-opt seccomp=unconfined \
  -w /sglang-checkout \
  --name ci_sglang \
  "${IMAGE}"