"vscode:/vscode.git/clone" did not exist on "4c584fc6324b7326cf5fbe4ad248277bdf1c05e3"
amd_ci_start_container.sh 3.98 KB
Newer Older
Sai Enduri's avatar
Sai Enduri committed
1
2
3
#!/bin/bash
set -euo pipefail

# Resolve the SGLang version that is embedded in the image base tags.
# The version is read from python/sglang/version.py, located relative to this
# script. If that file is missing, or no __version__ assignment can be parsed
# from it, the default below is kept and a warning is written to stderr.
SGLANG_VERSION_FILE="$(dirname "$0")/../../python/sglang/version.py"
SGLANG_VERSION="v0.5.0rc0"   # Default version, will be overridden if version.py is found

#######################################
# Print the version declared in a Python version file, prefixed with "v".
# Implemented with the shell's own regex matching so that detection does not
# depend on a python3 interpreter being available where this script runs.
# Arguments: $1 - path to a file containing a line such as
#                 __version__ = "1.2.3"   (single or double quotes)
# Outputs:   "v<version>" on stdout for the first matching line
# Returns:   0 if a version was found, 1 if the file is unreadable or has
#            no matching line
#######################################
read_sglang_version() {
  local -r version_file=$1
  local -r quote_chars="\"'"
  local -r version_re="__version__[[:space:]]*=[[:space:]]*[${quote_chars}]([^${quote_chars}]*)[${quote_chars}]"
  local line

  [[ -r "${version_file}" ]] || return 1

  # The "|| [[ -n ... ]]" clause keeps a final line without trailing newline.
  while IFS= read -r line || [[ -n "${line}" ]]; do
    if [[ "${line}" =~ ${version_re} ]]; then
      printf 'v%s\n' "${BASH_REMATCH[1]}"
      return 0
    fi
  done < "${version_file}"

  return 1
}

if [[ -f "${SGLANG_VERSION_FILE}" ]]; then
  # Parsing is best-effort: on failure VERSION_FROM_FILE is simply empty and
  # the default version stays in effect.
  VERSION_FROM_FILE=$(read_sglang_version "${SGLANG_VERSION_FILE}" || true)

  if [[ -n "${VERSION_FROM_FILE}" ]]; then
    SGLANG_VERSION="${VERSION_FROM_FILE}"
    echo "Using SGLang version from version.py: $SGLANG_VERSION"
  else
    echo "Warning: Could not parse version from $SGLANG_VERSION_FILE, using default: $SGLANG_VERSION" >&2
  fi
else
  echo "Warning: version.py not found, using default version: $SGLANG_VERSION" >&2
fi

28

29
# Default base tags (can be overridden by command line arguments)
DEFAULT_MI30X_BASE_TAG="${SGLANG_VERSION}-rocm630-mi30x"
DEFAULT_MI35X_BASE_TAG="${SGLANG_VERSION}-rocm700-mi35x"

# Effective base tags; start from the defaults and are replaced by
# --mi30x-base-tag / --mi35x-base-tag when given.
MI30X_BASE_TAG="${DEFAULT_MI30X_BASE_TAG}"
MI35X_BASE_TAG="${DEFAULT_MI35X_BASE_TAG}"

#######################################
# Parse the script's command line arguments.
# Globals:   MI30X_BASE_TAG, MI35X_BASE_TAG (written)
# Arguments: the script's positional parameters ("$@")
# Outputs:   usage text on stdout for -h/--help; errors on stderr
# Returns:   0 when all arguments were consumed. Exits the script with 0 for
#            -h/--help and with 1 for an unknown option or an option that is
#            missing its value.
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --mi30x-base-tag)
        # Checked explicitly: with `set -u` a bare "$2" would otherwise abort
        # with an opaque "unbound variable" message.
        if [[ $# -lt 2 ]]; then
          echo "Error: $1 requires a TAG argument" >&2
          exit 1
        fi
        MI30X_BASE_TAG="$2"
        shift 2
        ;;
      --mi35x-base-tag)
        if [[ $# -lt 2 ]]; then
          echo "Error: $1 requires a TAG argument" >&2
          exit 1
        fi
        MI35X_BASE_TAG="$2"
        shift 2
        ;;
      -h|--help)
        echo "Usage: $0 [--mi30x-base-tag TAG] [--mi35x-base-tag TAG]"
        exit 0
        ;;
      *)
        echo "Unknown option $1" >&2
        exit 1
        ;;
    esac
  done
}

# Parse command line arguments
parse_args "$@"

49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78


# Work out which GPU family this runner has by looking at its hostname.
# Runner hosts are expected to be named like:
#   linux-mi35x-gpu-1-xxxxx-runner-zzzzz
# If the hostname does not follow that scheme, the mi30x default is kept.
HOSTNAME_VALUE="$(hostname)"
GPU_ARCH="mi30x"   # default

if ! [[ "${HOSTNAME_VALUE}" =~ ^linux-(mi[0-9]+[a-z]*)-gpu-[0-9]+ ]]; then
  echo "Warning: could not parse GPU architecture from '${HOSTNAME_VALUE}', defaulting to ${GPU_ARCH}"
else
  GPU_ARCH="${BASH_REMATCH[1]}"
  echo "Detected GPU architecture from hostname: ${GPU_ARCH}"
fi

# Map the detected name onto one of the two image families that exist
# (mi30x, mi35x); anything unknown is treated as mi30x.
if [[ "${GPU_ARCH}" == "mi35x" ]]; then
  echo "Runner uses ${GPU_ARCH}; will fetch mi35x image."
elif [[ "${GPU_ARCH}" == "mi30x" || "${GPU_ARCH}" == "mi300" || "${GPU_ARCH}" == "mi325" ]]; then
  echo "Runner uses ${GPU_ARCH}; will fetch mi30x image."
  GPU_ARCH="mi30x"
else
  echo "Runner architecture '${GPU_ARCH}' unrecognised; defaulting to mi30x image." >&2
  GPU_ARCH="mi30x"
fi


Sai Enduri's avatar
Sai Enduri committed
79
# Set up DEVICE_FLAG based on Kubernetes pod info.
# When the pod-info file exists, its contents are used verbatim as the device
# option(s) handed to `docker run`; otherwise the whole /dev/dri directory is
# passed through.
if [[ -f /etc/podinfo/gha-render-devices ]]; then
  DEVICE_FLAG=$(</etc/podinfo/gha-render-devices)
else
  DEVICE_FLAG="--device /dev/dri"
fi

86

87
# Find the latest image
88
89
#######################################
# Pick the newest dated rocm/sgl-dev image for a GPU architecture.
# Candidate tags have the form "<base tag>-<YYYYMMDD>". The registry is
# probed with `docker manifest inspect`, starting with today's date and going
# back one day at a time for seven days in total. If none of those tags
# exists, a hard-coded fallback image is printed instead.
# Globals:   MI30X_BASE_TAG, MI35X_BASE_TAG (read)
# Arguments: $1 - GPU architecture, either "mi30x" or "mi35x"
# Outputs:   the chosen image reference on stdout (and nothing else, so the
#            caller can capture it); progress and diagnostics on stderr
# Returns:   0 when an image reference was printed (found or fallback),
#            1 for an unsupported architecture
#######################################
find_latest_image() {
  local gpu_arch=$1
  local -r repo="rocm/sgl-dev"
  local base_tag fallback_tag days_back image_tag

  case "${gpu_arch}" in
    mi30x)
      base_tag="${MI30X_BASE_TAG}"
      fallback_tag="v0.5.0rc0-rocm630-mi30x-20250812"
      ;;
    mi35x)
      base_tag="${MI35X_BASE_TAG}"
      fallback_tag="v0.5.0rc0-rocm700-mi35x-20250812"
      ;;
    *)
      echo "Error: unsupported GPU architecture '${gpu_arch}'" >&2
      return 1
      ;;
  esac

  for days_back in {0..6}; do
    # "N days ago" is GNU date syntax.
    image_tag="${base_tag}-$(date -d "${days_back} days ago" +%Y%m%d)"
    echo "Checking for image: ${repo}:${image_tag}" >&2
    if docker manifest inspect "${repo}:${image_tag}" >/dev/null 2>&1; then
      echo "Found available image: ${repo}:${image_tag}" >&2
      echo "${repo}:${image_tag}"
      return 0
    fi
  done

  echo "Error: no ${gpu_arch} image found in the last 7 days for base ${base_tag}" >&2
  echo "Using hard-coded fallback…" >&2
  echo "${repo}:${fallback_tag}"
  # Explicit success: the caller captures the output under `set -e`, so the
  # fallback path must not leak a non-zero status.
  return 0
}

117
# Pull and run the latest image
IMAGE=$(find_latest_image "${GPU_ARCH}")
echo "Pulling Docker image: ${IMAGE}"
docker pull "${IMAGE}"

echo "Launching container: ci_sglang"
# The options are collected in an array so that every value stays one word.
# DEVICE_FLAG is the single exception: it holds one or more
# whitespace-separated docker options and is therefore expanded unquoted on
# purpose.
# shellcheck disable=SC2206
DOCKER_RUN_ARGS=(
  -dt
  --user root
  --device=/dev/kfd
  ${DEVICE_FLAG}
  # Mount the checkout (GitHub workspace, or the current directory when
  # GITHUB_WORKSPACE is unset) and use it as the working directory.
  -v "${GITHUB_WORKSPACE:-$PWD}:/sglang-checkout"
  --ipc=host
  --group-add video
  --shm-size 32g
  --cap-add=SYS_PTRACE
  # HF_TOKEN is forwarded as-is; it is empty inside the container if unset here.
  -e HF_TOKEN="${HF_TOKEN:-}"
  --security-opt seccomp=unconfined
  -w /sglang-checkout
  --name ci_sglang
)
docker run "${DOCKER_RUN_ARGS[@]}" "${IMAGE}"