"sgl-kernel/python/vscode:/vscode.git/clone" did not exist on "81eb07da4de0041cb1c49d0039d42a70fc1cccd2"
amd_ci_start_container.sh 4.83 KB
Newer Older
Sai Enduri's avatar
Sai Enduri committed
1
2
3
#!/bin/bash
# Strict mode: exit on any unhandled command failure (-e), treat expansion of
# an unset variable as an error (-u), and make a pipeline fail when any of its
# stages fails (pipefail).
set -euo pipefail

4
5
# Get version from SGLang version.py file
# FALLBACK_SGLANG_VERSION is the version tag used when version.py is missing
# or no __version__ string can be parsed from it.
FALLBACK_SGLANG_VERSION="v0.4.10.post2"
6
# Location of version.py, resolved relative to the directory of this script
# ($0), two levels up and then under python/sglang/.
SGLANG_VERSION_FILE="$(dirname "$0")/../../python/sglang/version.py"
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29

# Resolve SGLANG_VERSION: read __version__ from version.py and prefix it with
# "v"; use FALLBACK_SGLANG_VERSION when the file is missing or cannot be parsed.
if [ -f "$SGLANG_VERSION_FILE" ]; then
  # The `|| SGLANG_VERSION=""` guard matters under `set -e`: without it a
  # failing python3 (interpreter missing, file unreadable, ...) would abort the
  # whole script before the fallback below gets a chance to run.
  SGLANG_VERSION=$(python3 -c '
import re, sys
with open(sys.argv[1], "r") as f:
    content = f.read()
    match = re.search(r"__version__\s*=\s*[\"'"'"'](.*?)[\"'"'"']", content)
    if match:
        print("v" + match.group(1))
' "$SGLANG_VERSION_FILE") || SGLANG_VERSION=""

  # Empty output means either no __version__ match or a python3 failure.
  if [ -z "$SGLANG_VERSION" ]; then
      SGLANG_VERSION="$FALLBACK_SGLANG_VERSION"
      echo "Warning: Could not parse version from $SGLANG_VERSION_FILE, using fallback version: $SGLANG_VERSION" >&2
  fi
else
  # Fallback version if file is not found
  SGLANG_VERSION="$FALLBACK_SGLANG_VERSION"
  echo "Warning: version.py not found, using fallback version: $SGLANG_VERSION" >&2
fi

echo "Using SGLang version: $SGLANG_VERSION"

30
# Default base tags (can be overridden by command line arguments)
31
32
# Each default base tag is the detected SGLang version followed by a fixed
# ROCm/GPU-family suffix (rocm630-mi30x, rocm700-mi35x).
DEFAULT_MI30X_BASE_TAG="${SGLANG_VERSION}-rocm630-mi30x"
DEFAULT_MI35X_BASE_TAG="${SGLANG_VERSION}-rocm700-mi35x"
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61

# Parse command line arguments
MI30X_BASE_TAG="$DEFAULT_MI30X_BASE_TAG"
MI35X_BASE_TAG="$DEFAULT_MI35X_BASE_TAG"

# Parse the script's options, storing overrides in MI30X_BASE_TAG and
# MI35X_BASE_TAG.
# Arguments: the script's command line ("$@")
# Outputs:   usage text on stdout for -h/--help; error messages on stderr
# Exits:     0 after printing usage; 1 on an unknown option or when a tag
#            option is given without its value
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --mi30x-base-tag | --mi35x-base-tag)
        # Check for the value explicitly: under `set -u` a bare "$2" would
        # otherwise die with an opaque "unbound variable" error.
        if [[ $# -lt 2 ]]; then
          echo "Error: $1 requires a TAG argument" >&2
          echo "Use --help for usage information" >&2
          exit 1
        fi
        if [[ "$1" == "--mi30x-base-tag" ]]; then
          MI30X_BASE_TAG="$2"
        else
          MI35X_BASE_TAG="$2"
        fi
        shift 2
        ;;
      -h | --help)
        echo "Usage: $0 [--mi30x-base-tag TAG] [--mi35x-base-tag TAG]"
        echo "  --mi30x-base-tag TAG    Base tag for mi30x images (default: $DEFAULT_MI30X_BASE_TAG)"
        echo "  --mi35x-base-tag TAG    Base tag for mi35x images (default: $DEFAULT_MI35X_BASE_TAG)"
        exit 0
        ;;
      *)
        echo "Unknown option $1" >&2
        echo "Use --help for usage information" >&2
        exit 1
        ;;
    esac
  done
}

parse_args "$@"

Sai Enduri's avatar
Sai Enduri committed
62
63
64
65
66
67
68
# Pick the device arguments later handed to `docker run`: start from the
# generic /dev/dri default and replace it with the contents of the Kubernetes
# pod-info file when that file is present.
podinfo_render_devices="/etc/podinfo/gha-render-devices"
DEVICE_FLAG="--device /dev/dri"
if [[ -f "${podinfo_render_devices}" ]]; then
  DEVICE_FLAG=$(cat "${podinfo_render_devices}")
fi

69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# Function to find latest available image for a given GPU architecture.
# Probes one tag per day, "<base_tag>-YYYYMMDD", starting with today and going
# back up to 29 days, and reports the first tag whose manifest can be inspected.
# Globals:   MI30X_BASE_TAG, MI35X_BASE_TAG (read)
# Arguments: $1 - GPU architecture, "mi30x" or "mi35x"
# Outputs:   the image reference "rocm/sgl-dev:<tag>" on stdout;
#            progress and error messages on stderr
# Returns:   0 when an image is found; 1 for an unsupported architecture,
#            when `date` fails, or when no image exists in the 30-day window
find_latest_image() {
  # ${1:-} keeps `set -u` from aborting on a missing argument; the empty value
  # is then reported as an unsupported architecture below.
  local gpu_arch=${1:-}
  local base_tag

  case "$gpu_arch" in
    mi30x) base_tag="$MI30X_BASE_TAG" ;;
    mi35x) base_tag="$MI35X_BASE_TAG" ;;
    *)
      echo "Error: Unsupported GPU architecture '$gpu_arch'" >&2
      return 1
      ;;
  esac

  local days_back check_date image_tag

  for ((days_back = 0; days_back < 30; days_back++)); do
    # Assigned separately from `local` so a failing `date` is not masked;
    # otherwise the loop would probe 30 malformed "<base_tag>-" tags.
    if ! check_date=$(date -d "$days_back days ago" +%Y%m%d); then
      echo "Error: could not compute the date for $days_back days ago" >&2
      return 1
    fi
    image_tag="${base_tag}-${check_date}"

    echo "Checking for image: rocm/sgl-dev:${image_tag}" >&2

    # Check if the image exists by trying to get its manifest
    if docker manifest inspect "rocm/sgl-dev:${image_tag}" >/dev/null 2>&1; then
      echo "Found available image: rocm/sgl-dev:${image_tag}" >&2
      echo "rocm/sgl-dev:${image_tag}"
      return 0
    fi
  done

  echo "Error: No ${gpu_arch} image found in the last 30 days" >&2
  return 1
}

# Work out which GPU family this runner belongs to and which fallback image to
# use for it. In Kubernetes the hostname starts with the runner name
# (e.g., linux-mi300-gpu-1-bgg8r-runner-vknlb), so the leading
# "linux-mi<N>-gpu-<M>" part is taken as the runner name.
HOSTNAME_VALUE=$(hostname)

if [[ "${HOSTNAME_VALUE}" =~ ^(linux-mi[0-9]+-gpu-[0-9]+) ]]; then
  RUNNER_NAME="${BASH_REMATCH[1]}"
  echo "Extracted runner from hostname: ${RUNNER_NAME}"
else
  RUNNER_NAME="unknown"
  echo "Could not extract runner info from hostname: ${HOSTNAME_VALUE}"
fi
echo "The runner is: ${RUNNER_NAME}"

# mi30x is the default; only mi350/mi355 runners switch to mi35x.
GPU_ARCH="mi30x"
FALLBACK_IMAGE="rocm/sgl-dev:${MI30X_BASE_TAG}-20250715"
FALLBACK_MSG="No mi30x image found in last 30 days, using fallback image"

if [[ "${RUNNER_NAME}" =~ ^linux-mi(350|355)-gpu-[0-9]+$ ]]; then
  # mi350/mi355 runners
  echo "Runner is ${RUNNER_NAME}, will find mi35x image."
  GPU_ARCH="mi35x"
  FALLBACK_IMAGE="rocm/sgl-dev:${MI35X_BASE_TAG}-20250715"
  FALLBACK_MSG="No mi35x image found in last 30 days, using fallback image"
elif [[ "${RUNNER_NAME}" =~ ^linux-mi(300|325)-gpu-[0-9]+$ ]]; then
  # mi300/mi325 runners keep the mi30x defaults set above
  echo "Runner is ${RUNNER_NAME}, will find mi30x image."
else
  # Anything else (including "unknown") also keeps the mi30x defaults
  echo "Runner type not recognized: '${RUNNER_NAME}'"
  echo "Defaulting to find mi30x image"
fi

# Find and pull the latest image
138
# Ask find_latest_image for the newest image of the detected architecture;
# when it reports failure, use the pinned fallback image instead.
if ! IMAGE=$(find_latest_image "${GPU_ARCH}"); then
  echo "$FALLBACK_MSG" >&2
  IMAGE="$FALLBACK_IMAGE"
  echo "Pulling fallback Docker image: $IMAGE"
else
  echo "Pulling Docker image: $IMAGE"
fi
Sai Enduri's avatar
Sai Enduri committed
145
146
147
148
149
150
151
docker pull "$IMAGE"

# Run the container
echo "Starting container: ci_sglang"

# DEVICE_FLAG holds whitespace-separated docker arguments. Split it into an
# array rather than expanding it unquoted, so the words are never subject to
# pathname expansion. `read -d ''` returns non-zero at end of input even though
# the array is filled, hence the `|| true` under `set -e`.
device_args=()
read -r -d '' -a device_args <<< "$DEVICE_FLAG" || true

# ${arr[@]+...} keeps an empty array from tripping `set -u` on older bash.
docker run -dt --user root --device=/dev/kfd \
  ${device_args[@]+"${device_args[@]}"} \
  -v "${GITHUB_WORKSPACE:-$PWD}:/sglang-checkout" \
  --ipc=host --group-add video \
  --shm-size 32g \
  --cap-add=SYS_PTRACE \
  -e HF_TOKEN="${HF_TOKEN:-}" \
  --security-opt seccomp=unconfined \
  -w /sglang-checkout \
  --name ci_sglang \
  "$IMAGE"