Unverified Commit 91b3555d authored by Hubert Lu, committed by GitHub

Add tests to AMD CI for MI35x (#9662)


Co-authored-by: Sai Enduri <saimanas.enduri@amd.com>
parent 9e2f7252
@@ -28,6 +28,7 @@ jobs:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false
strategy:
+ fail-fast: false
matrix:
runner: [linux-mi300-gpu-1, linux-mi325-gpu-1]
runs-on: ${{matrix.runner}}
@@ -54,8 +55,9 @@ jobs:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false
strategy:
+ fail-fast: false
matrix:
- runner: [linux-mi300-gpu-2, linux-mi325-gpu-2]
+ runner: [linux-mi300-gpu-2, linux-mi325-gpu-2, linux-mi35x-gpu-2]
runs-on: ${{matrix.runner}}
steps:
- name: Checkout code
@@ -70,7 +72,7 @@ jobs:
run: bash scripts/ci/amd_ci_install_dependency.sh
- name: Evaluate accuracy (TP=2)
- timeout-minutes: 30
+ timeout-minutes: 60
run: |
bash scripts/ci/amd_ci_exec.sh -e SGLANG_USE_AITER=0 python3 test_moe_eval_accuracy_large.py
@@ -78,6 +80,7 @@ jobs:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false
strategy:
+ fail-fast: false
matrix:
runner: [linux-mi300-gpu-1, linux-mi325-gpu-1]
runs-on: ${{matrix.runner}}
@@ -102,6 +105,7 @@ jobs:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false
strategy:
+ fail-fast: false
matrix:
runner: [linux-mi300-gpu-1, linux-mi325-gpu-1]
runs-on: ${{matrix.runner}}
@@ -142,6 +146,7 @@ jobs:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false
strategy:
+ fail-fast: false
matrix:
runner: [linux-mi300-gpu-1, linux-mi325-gpu-1]
runs-on: ${{matrix.runner}}
@@ -176,6 +181,7 @@ jobs:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false
strategy:
+ fail-fast: false
matrix:
runner: [linux-mi300-gpu-2, linux-mi325-gpu-2]
runs-on: ${{matrix.runner}}
@@ -242,12 +248,13 @@ jobs:
run: |
bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 8
- unit-test-backend-2-gpu-amd:
+ unit-test-backend-1-gpu-amd-mi35x:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false
strategy:
+ fail-fast: false
matrix:
- runner: [linux-mi300-gpu-2, linux-mi325-gpu-2]
+ runner: [linux-mi35x-gpu-1]
runs-on: ${{matrix.runner}}
steps:
- name: Checkout code
@@ -262,16 +269,17 @@ jobs:
run: bash scripts/ci/amd_ci_install_dependency.sh
- name: Run test
- timeout-minutes: 40
+ timeout-minutes: 50
run: |
- bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-2-gpu-amd
+ bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-amd-mi35x
- unit-test-backend-8-gpu-amd:
+ unit-test-backend-2-gpu-amd:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false
strategy:
+ fail-fast: false
matrix:
- runner: [linux-mi300-gpu-8]
+ runner: [linux-mi300-gpu-2, linux-mi325-gpu-2]
runs-on: ${{matrix.runner}}
steps:
- name: Checkout code
@@ -286,14 +294,15 @@ jobs:
run: bash scripts/ci/amd_ci_install_dependency.sh
- name: Run test
- timeout-minutes: 60
+ timeout-minutes: 40
run: |
- bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-8-gpu-amd --timeout-per-file 3600
+ bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-2-gpu-amd
- unit-test-backend-8-gpu-CAR-amd:
+ unit-test-backend-8-gpu-amd:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false
strategy:
+ fail-fast: false
matrix:
runner: [linux-mi300-gpu-8]
runs-on: ${{matrix.runner}}
@@ -309,10 +318,10 @@ jobs:
- name: Install dependencies
run: bash scripts/ci/amd_ci_install_dependency.sh
- - name: Run CustomAllReduce test
- timeout-minutes: 20
+ - name: Run test
+ timeout-minutes: 60
run: |
- bash scripts/ci/amd_ci_exec.sh -e CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 -m unittest test_custom_allreduce.TestCustomAllReduce
+ bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-8-gpu-amd --timeout-per-file 3600
unit-test-sgl-kernel-amd:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
@@ -350,8 +359,8 @@ jobs:
needs: [
accuracy-test-1-gpu-amd, mla-test-1-gpu-amd, bench-test-2-gpu-amd,
accuracy-test-2-gpu-amd, performance-test-1-gpu-part-1-amd, performance-test-1-gpu-part-2-amd,
- unit-test-backend-1-gpu-amd, unit-test-backend-2-gpu-amd, unit-test-backend-8-gpu-amd,
- unit-test-sgl-kernel-amd
+ unit-test-backend-1-gpu-amd, unit-test-backend-1-gpu-amd-mi35x, unit-test-backend-2-gpu-amd,
+ unit-test-backend-8-gpu-amd, unit-test-sgl-kernel-amd
]
runs-on: ubuntu-latest
steps:
......
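For anyone replaying these jobs outside the hosted runners, the two scripts the workflow calls can be chained the same way the new unit-test-backend-1-gpu-amd-mi35x job does. A minimal sketch, assuming a ROCm machine with the ci_sglang container already running; both commands appear verbatim in the job definitions above:

```bash
#!/bin/bash
# Hypothetical local replay of the new MI35x unit-test job.
bash scripts/ci/amd_ci_install_dependency.sh
bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-amd-mi35x
```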
@@ -2027,7 +2027,10 @@ class DeepseekV2DecoderLayer(nn.Module):
quant_format = (
"mxfp4"
if _is_gfx95_supported
- and self.self_attn.fused_qkv_a_proj_with_mqa.weight == torch.uint8
+ and getattr(self.self_attn, "fused_qkv_a_proj_with_mqa", None) is not None
+ and getattr(self.self_attn.fused_qkv_a_proj_with_mqa, "weight", None)
+ is not None
+ and self.self_attn.fused_qkv_a_proj_with_mqa.weight.dtype == torch.uint8
else ""
)
@@ -2582,7 +2585,11 @@ class DeepseekV2ForCausalLM(nn.Module):
0, (-1, self_attn.qk_nope_head_dim + self_attn.v_head_dim)
).split([self_attn.qk_nope_head_dim, self_attn.v_head_dim], dim=1)
- if _use_aiter_gfx95 and self.quant_config.get_name() == "quark":
+ if (
+ _use_aiter_gfx95
+ and self.quant_config is not None
+ and self.quant_config.get_name() == "quark"
+ ):
w_kc, self_attn.w_scale_k, w_vc, self_attn.w_scale_v = (
quark_post_load_weights(self_attn, w, "mxfp4")
)
......
#!/bin/bash
set -euo pipefail
+ # Detect GPU family from hostname (e.g., linux-mi35x-gpu-1-xxxxx-runner-zzzzz)
+ HOSTNAME_VALUE=$(hostname)
+ GPU_FAMILY=""
+ # Host names look like: linux-mi35x-gpu-1-xxxxx-runner-zzzzz
+ if [[ "${HOSTNAME_VALUE}" =~ ^linux-(mi[0-9]+[a-z]*)-gpu-[0-9]+ ]]; then
+ GPU_FAMILY="${BASH_REMATCH[1]}"
+ echo "Detected GPU family from hostname: ${GPU_FAMILY}"
+ else
+ echo "Warning: could not parse GPU family from '${HOSTNAME_VALUE}'"
+ fi
WORKDIR="/sglang-checkout/test/srt"
declare -A ENV_MAP=(
[SGLANG_AMD_CI]=1
@@ -8,6 +20,11 @@ declare -A ENV_MAP=(
[SGLANG_USE_AITER]=1
)
+ # Conditionally add GPU_ARCHS only for mi35x
+ if [[ "${GPU_FAMILY}" == "mi35x" ]]; then
+ ENV_MAP[GPU_ARCHS]="gfx950"
+ fi
# Parse -w/--workdir and -e ENV=VAL
while [[ $# -gt 0 ]]; do
case "$1" in
......
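The hostname regex above does the GPU-family routing for what appears to be scripts/ci/amd_ci_exec.sh, the wrapper invoked throughout the workflow. A quick spot-check of that pattern against sample runner names (the hostnames are made up for illustration):

```bash
#!/bin/bash
# Spot-check of the detection regex used above; hostnames are illustrative.
for h in linux-mi35x-gpu-1-abc12-runner-xyz34 linux-mi300-gpu-2-def56-runner-uvw78 build-box; do
  if [[ "$h" =~ ^linux-(mi[0-9]+[a-z]*)-gpu-[0-9]+ ]]; then
    echo "$h -> ${BASH_REMATCH[1]}"
  else
    echo "$h -> no match (falls through to the warning branch)"
  fi
done
# Prints mi35x, then mi300, then the no-match case.
```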
#!/bin/bash
set -euo pipefail
+ HOSTNAME_VALUE=$(hostname)
+ GPU_ARCH="mi30x" # default
+ # Host names look like: linux-mi35x-gpu-1-xxxxx-runner-zzzzz
+ if [[ "${HOSTNAME_VALUE}" =~ ^linux-(mi[0-9]+[a-z]*)-gpu-[0-9]+ ]]; then
+ GPU_ARCH="${BASH_REMATCH[1]}"
+ echo "Detected GPU architecture from hostname: ${GPU_ARCH}"
+ else
+ echo "Warning: could not parse GPU architecture from '${HOSTNAME_VALUE}', defaulting to ${GPU_ARCH}"
+ fi
# Install the required dependencies in CI.
docker exec ci_sglang pip install --upgrade pip
docker exec ci_sglang pip uninstall sgl-kernel -y || true
docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install"
- docker exec ci_sglang pip install -e "python[dev_hip]"
+ case "${GPU_ARCH}" in
+ mi35x)
+ echo "Runner uses ${GPU_ARCH}; will fetch mi35x image."
+ docker exec ci_sglang pip install -e "python[dev_hip]" --no-deps # TODO: only for mi35x
+ # For lmms_evals evaluating MMMU
+ docker exec -w / ci_sglang git clone --branch v0.3.3 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
+ docker exec -w /lmms-eval ci_sglang pip install -e . --no-deps # TODO: only for mi35x
+ ;;
+ mi30x|mi300|mi325)
+ echo "Runner uses ${GPU_ARCH}; will fetch mi30x image."
+ docker exec ci_sglang pip install -e "python[dev_hip]"
+ # For lmms_evals evaluating MMMU
+ docker exec -w / ci_sglang git clone --branch v0.3.3 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
+ docker exec -w /lmms-eval ci_sglang pip install -e .
+ ;;
+ *)
+ echo "Runner architecture '${GPU_ARCH}' unrecognised;" >&2
+ ;;
+ esac
docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git
docker exec -w /human-eval ci_sglang pip install -e .
- # For lmms_evals evaluating MMMU
- docker exec -w / ci_sglang git clone --branch v0.3.3 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
- docker exec -w /lmms-eval ci_sglang pip install -e .
docker exec -w / ci_sglang mkdir -p /dummy-grok
mkdir -p dummy-grok && wget https://sharkpublic.blob.core.windows.net/sharkpublic/sglang/dummy_grok.json -O dummy-grok/config.json
docker cp ./dummy-grok ci_sglang:/
......
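Since the case statement above is the only arch-dependent part of the install script, the routing can be dry-run in isolation. A sketch that exercises just the mapping, with echoes standing in for the real docker exec steps:

```bash
#!/bin/bash
# Dry-run of the branch selection above; echoes replace the docker exec calls.
for GPU_ARCH in mi35x mi300 mi325 mi30x unknown-arch; do
  case "${GPU_ARCH}" in
    mi35x)             echo "${GPU_ARCH}: pip install --no-deps, lmms-eval --no-deps" ;;
    mi30x|mi300|mi325) echo "${GPU_ARCH}: full pip install, lmms-eval with deps" ;;
    *)                 echo "${GPU_ARCH}: unrecognised" >&2 ;;
  esac
done
```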
@@ -3,7 +3,7 @@ set -euo pipefail
# Get version from SGLang version.py file
SGLANG_VERSION_FILE="$(dirname "$0")/../../python/sglang/version.py"
- SGLANG_VERSION="v0.5.0rc0" # Default version, will be overridden if version.py is found
+ SGLANG_VERSION="v0.5.0rc0"  # Default version, will be overridden if version.py is found
if [ -f "$SGLANG_VERSION_FILE" ]; then
VERSION_FROM_FILE=$(python3 -c '
@@ -25,130 +25,102 @@ else
echo "Warning: version.py not found, using default version: $SGLANG_VERSION" >&2
fi
# Default base tags (can be overridden by command line arguments)
DEFAULT_MI30X_BASE_TAG="${SGLANG_VERSION}-rocm630-mi30x"
DEFAULT_MI35X_BASE_TAG="${SGLANG_VERSION}-rocm700-mi35x"
# Parse command line arguments
- MI30X_BASE_TAG="$DEFAULT_MI30X_BASE_TAG"
- MI35X_BASE_TAG="$DEFAULT_MI35X_BASE_TAG"
+ MI30X_BASE_TAG="${DEFAULT_MI30X_BASE_TAG}"
+ MI35X_BASE_TAG="${DEFAULT_MI35X_BASE_TAG}"
while [[ $# -gt 0 ]]; do
case $1 in
- --mi30x-base-tag)
- MI30X_BASE_TAG="$2"
- shift 2
- ;;
- --mi35x-base-tag)
- MI35X_BASE_TAG="$2"
- shift 2
- ;;
+ --mi30x-base-tag) MI30X_BASE_TAG="$2"; shift 2;;
+ --mi35x-base-tag) MI35X_BASE_TAG="$2"; shift 2;;
-h|--help)
echo "Usage: $0 [--mi30x-base-tag TAG] [--mi35x-base-tag TAG]"
echo " --mi30x-base-tag TAG Base tag for mi30x images (default: $DEFAULT_MI30X_BASE_TAG)"
echo " --mi35x-base-tag TAG Base tag for mi35x images (default: $DEFAULT_MI35X_BASE_TAG)"
exit 0
;;
- *)
- echo "Unknown option $1"
- echo "Use --help for usage information"
- exit 1
- ;;
+ *) echo "Unknown option $1"; exit 1;;
esac
done
+ # Detect GPU architecture from the Kubernetes runner hostname
+ HOSTNAME_VALUE=$(hostname)
+ GPU_ARCH="mi30x" # default
+ # Host names look like: linux-mi35x-gpu-1-xxxxx-runner-zzzzz
+ if [[ "${HOSTNAME_VALUE}" =~ ^linux-(mi[0-9]+[a-z]*)-gpu-[0-9]+ ]]; then
+ GPU_ARCH="${BASH_REMATCH[1]}"
+ echo "Detected GPU architecture from hostname: ${GPU_ARCH}"
+ else
+ echo "Warning: could not parse GPU architecture from '${HOSTNAME_VALUE}', defaulting to ${GPU_ARCH}"
+ fi
+ # Normalise / collapse architectures we don’t yet build specifically for
+ case "${GPU_ARCH}" in
+ mi35x)
+ echo "Runner uses ${GPU_ARCH}; will fetch mi35x image."
+ ;;
+ mi30x|mi300|mi325)
+ echo "Runner uses ${GPU_ARCH}; will fetch mi30x image."
+ GPU_ARCH="mi30x"
+ ;;
+ *)
+ echo "Runner architecture '${GPU_ARCH}' unrecognised; defaulting to mi30x image." >&2
+ GPU_ARCH="mi30x"
+ ;;
+ esac
# Set up DEVICE_FLAG based on Kubernetes pod info
- if [ -f "/etc/podinfo/gha-render-devices" ]; then
+ if [[ -f /etc/podinfo/gha-render-devices ]]; then
DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
else
DEVICE_FLAG="--device /dev/dri"
fi
- # Function to find latest available image for a given GPU architecture
+ # Find the latest image
find_latest_image() {
local gpu_arch=$1
- local base_tag
- if [ "$gpu_arch" == "mi30x" ]; then
- base_tag="$MI30X_BASE_TAG"
- elif [ "$gpu_arch" == "mi35x" ]; then
- base_tag="$MI35X_BASE_TAG"
- else
- echo "Error: Unsupported GPU architecture '$gpu_arch'" >&2
- return 1
- fi
+ local base_tag days_back image_tag
- local days_back=0
- while [ $days_back -lt 7 ]; do
- local check_date=$(date -d "$days_back days ago" +%Y%m%d)
- local image_tag="${base_tag}-${check_date}"
+ case "${gpu_arch}" in
+ mi30x) base_tag="${MI30X_BASE_TAG}" ;;
+ mi35x) base_tag="${MI35X_BASE_TAG}" ;;
+ *) echo "Error: unsupported GPU architecture '${gpu_arch}'" >&2; return 1 ;;
+ esac
+ for days_back in {0..6}; do
+ image_tag="${base_tag}-$(date -d "${days_back} days ago" +%Y%m%d)"
echo "Checking for image: rocm/sgl-dev:${image_tag}" >&2
# Check if the image exists by trying to get its manifest
if docker manifest inspect "rocm/sgl-dev:${image_tag}" >/dev/null 2>&1; then
echo "Found available image: rocm/sgl-dev:${image_tag}" >&2
echo "rocm/sgl-dev:${image_tag}"
return 0
fi
- days_back=$((days_back + 1))
done
echo "Error: No ${gpu_arch} image found in the last 7 days for version ${base_tag}" >&2
# Final fallback to specific hardcoded images
echo "Using final fallback images..." >&2
if [ "$gpu_arch" == "mi30x" ]; then
echo "rocm/sgl-dev:v0.5.0rc0-rocm630-mi30x-20250812"
elif [ "$gpu_arch" == "mi35x" ]; then
echo "Error: no ${gpu_arch} image found in the last 7 days for base ${base_tag}" >&2
echo "Using hard-coded fallback…" >&2
if [[ "${gpu_arch}" == "mi35x" ]]; then
echo "rocm/sgl-dev:v0.5.0rc0-rocm700-mi35x-20250812"
else
echo "rocm/sgl-dev:v0.5.0rc0-rocm630-mi30x-20250812" # Default to mi30x
echo "rocm/sgl-dev:v0.5.0rc0-rocm630-mi30x-20250812"
fi
return 0
}
- # Determine image finder and fallback based on runner
- # In Kubernetes, the hostname contains the GPU type (e.g., linux-mi300-gpu-1-bgg8r-runner-vknlb)
- # Extract the GPU type from hostname
- HOSTNAME_VALUE=$(hostname)
- RUNNER_NAME="unknown"
- if [[ "${HOSTNAME_VALUE}" =~ ^(linux-mi[0-9]+-gpu-[0-9]+) ]]; then
- RUNNER_NAME="${BASH_REMATCH[1]}"
- echo "Extracted runner from hostname: ${RUNNER_NAME}"
- else
- echo "Could not extract runner info from hostname: ${HOSTNAME_VALUE}"
- fi
- echo "The runner is: ${RUNNER_NAME}"
- GPU_ARCH="mi30x"
- # Check for mi350/mi355 runners
- if [[ "${RUNNER_NAME}" =~ ^linux-mi350-gpu-[0-9]+$ ]] || [[ "${RUNNER_NAME}" =~ ^linux-mi355-gpu-[0-9]+$ ]]; then
- echo "Runner is ${RUNNER_NAME}, will find mi35x image."
- GPU_ARCH="mi35x"
- # Check for mi300/mi325 runners
- elif [[ "${RUNNER_NAME}" =~ ^linux-mi300-gpu-[0-9]+$ ]] || [[ "${RUNNER_NAME}" =~ ^linux-mi325-gpu-[0-9]+$ ]]; then
- echo "Runner is ${RUNNER_NAME}, will find mi30x image."
- else
- echo "Runner type not recognized: '${RUNNER_NAME}'"
- echo "Defaulting to find mi30x image"
- fi
- # Find and pull the latest image
+ # Pull and run the latest image
IMAGE=$(find_latest_image "${GPU_ARCH}")
echo "Pulling Docker image: $IMAGE"
docker pull "$IMAGE"
echo "Pulling Docker image: ${IMAGE}"
docker pull "${IMAGE}"
# Run the container
echo "Starting container: ci_sglang"
docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
echo "Launching container: ci_sglang"
docker run -dt --user root --device=/dev/kfd ${DEVICE_FLAG} \
-v "${GITHUB_WORKSPACE:-$PWD}:/sglang-checkout" \
--ipc=host --group-add video \
--shm-size 32g \
@@ -157,4 +129,4 @@ docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
--security-opt seccomp=unconfined \
-w /sglang-checkout \
--name ci_sglang \
"$IMAGE"
"${IMAGE}"
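The core of this launcher is find_latest_image: walk back up to seven days and take the first rocm/sgl-dev tag whose manifest exists, then fall back to a pinned image. A minimal standalone sketch of that probing pattern; the base tag below is only an example value, and GNU date plus a docker CLI are assumed:

```bash
#!/bin/bash
# Sketch of the date-walk tag probe used by find_latest_image above.
BASE="v0.5.0rc0-rocm700-mi35x"   # example base tag, not a guaranteed build
IMAGE=""
for d in {0..6}; do
  tag="${BASE}-$(date -d "${d} days ago" +%Y%m%d)"
  # "docker manifest inspect" succeeds only if the tag exists in the registry.
  if docker manifest inspect "rocm/sgl-dev:${tag}" >/dev/null 2>&1; then
    IMAGE="rocm/sgl-dev:${tag}"
    break
  fi
done
echo "${IMAGE:-no image found in the last 7 days}"
```

The --mi30x-base-tag/--mi35x-base-tag flags parsed earlier feed this probe, so a pinned nightly can be forced without editing the script.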
@@ -243,6 +243,10 @@ suite_amd = {
TestFile("test_wave_attention_kernels.py", 2),
TestFile("test_wave_attention_backend.py", 150),
],
"per-commit-amd-mi35x": [
TestFile("test_mla.py", 242),
TestFile("test_gpt_oss_1gpu.py", 600),
],
"per-commit-2-gpu-amd": [
TestFile("lora/test_lora_tp.py", 116),
TestFile("rl/test_update_weights_from_distributed.py", 103),
......
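With the registry entry above in place, the new suite is selectable by name; the workflow's MI35x job runs exactly this (see the run step earlier in the diff):

```bash
bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-amd-mi35x
```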
import os
from concurrent.futures import ThreadPoolExecutor
from types import SimpleNamespace
from typing import Dict, List, Literal, Optional
- from sglang.srt.utils import kill_process_tree
+ from sglang.srt.utils import is_hip, kill_process_tree
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
@@ -14,6 +15,7 @@ from sglang.test.test_utils import (
)
_base_url = DEFAULT_URL_FOR_TEST
+ _is_hip = is_hip()
class BaseTestGptOss(CustomTestCase):
@@ -36,7 +38,8 @@ class BaseTestGptOss(CustomTestCase):
if model_variant == "20b":
other_args += ["--cuda-graph-max-bs", "600"]
+ if _is_hip:
+ os.environ["SGLANG_USE_AITER"] = "0"
self._run_test_raw(
model=model,
expected_score_of_reasoning_effort=expected_score_of_reasoning_effort,
......
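The test now flips SGLANG_USE_AITER off programmatically on HIP; the same toggle can also be applied from the CI wrapper, as the accuracy job earlier in the diff already does. A hypothetical one-off run, with the test filename taken from the suite list above and -e ENV=VAL consumed by the wrapper's arg parser:

```bash
# -e passes an environment override into the ci_sglang container.
bash scripts/ci/amd_ci_exec.sh -e SGLANG_USE_AITER=0 python3 test_gpt_oss_1gpu.py
```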