Unverified commit c47a51db, authored by Sai Enduri and committed by GitHub

Clean up AMD CI (#6365)

parent 11553c1a
@@ -27,32 +27,15 @@ jobs:
       - name: Setup docker
         run: |
-          # Ensure GPU isolation if pod is part of kubernetes setup with DEVICE_FLAG.
-          if [ -f "/etc/podinfo/gha-render-devices" ]; then
-            DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
-          else
-            DEVICE_FLAG="--device /dev/dri"
-          fi
           touch github_summary.md
-          docker pull lmsysorg/sglang:v0.4.6.post3-rocm630
-          docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
-            -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \
-            --cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \
-            -w /sglang-checkout --name ci_sglang \
-            lmsysorg/sglang:v0.4.6.post3-rocm630
+          bash scripts/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}

       - name: Install dependencies
-        run: |
-          docker exec ci_sglang pip install --upgrade pip
-          docker exec ci_sglang pip uninstall sgl-kernel -y || true
-          docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install"
-          docker exec ci_sglang pip install -e "python[dev_hip]"
-          docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git
-          docker exec -w /human-eval ci_sglang pip install -e .
-          docker exec ci_sglang pip install huggingface_hub[hf_xet]
+        run: bash scripts/amd_ci_install_dependency.sh

       - name: Nightly Test
         run: |
-          docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" ci_sglang python3 run_suite.py --suite nightly-amd --timeout-per-file 7200
+          bash scripts/amd_ci_exec.sh -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 run_suite.py --suite nightly-amd --timeout-per-file 7200
           echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY
@@ -35,37 +35,20 @@ jobs:
       - name: Checkout code
         uses: actions/checkout@v4

-      - name: Setup docker
-        run: |
-          # Ensure GPU isolation if pod is part of kubernetes setup with DEVICE_FLAG.
-          if [ -f "/etc/podinfo/gha-render-devices" ]; then
-            DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
-          else
-            DEVICE_FLAG="--device /dev/dri"
-          fi
-          docker pull lmsysorg/sglang:v0.4.6.post3-rocm630
-          docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
-            -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \
-            --cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \
-            -w /sglang-checkout --name ci_sglang \
-            lmsysorg/sglang:v0.4.6.post3-rocm630
+      - name: Start CI container
+        run: bash scripts/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}

       - name: Install dependencies
-        run: |
-          docker exec ci_sglang pip install --upgrade pip
-          docker exec ci_sglang pip uninstall sgl-kernel -y || true
-          docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install"
-          docker exec ci_sglang pip install -e "python[dev_hip]"
-          docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git
-          docker exec -w /human-eval ci_sglang pip install -e .
+        run: bash scripts/amd_ci_install_dependency.sh

       - name: Evaluate Accuracy
         timeout-minutes: 20
         run: |
-          docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 test_eval_accuracy_large.py
-          docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 test_eval_fp8_accuracy.py
-          docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 models/test_qwen_models.py
+          bash scripts/amd_ci_exec.sh python3 test_eval_accuracy_large.py
+          bash scripts/amd_ci_exec.sh python3 test_eval_fp8_accuracy.py
+          bash scripts/amd_ci_exec.sh python3 models/test_qwen_models.py

   accuracy-test-2-gpu-amd:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
@@ -78,35 +61,18 @@ jobs:
       - name: Checkout code
         uses: actions/checkout@v4

-      - name: Setup docker
-        run: |
-          # Ensure GPU isolation if pod is part of kubernetes setup with DEVICE_FLAG.
-          if [ -f "/etc/podinfo/gha-render-devices" ]; then
-            DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
-          else
-            DEVICE_FLAG="--device /dev/dri"
-          fi
-          docker pull lmsysorg/sglang:v0.4.6.post3-rocm630
-          docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
-            -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \
-            --cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \
-            -w /sglang-checkout --name ci_sglang \
-            lmsysorg/sglang:v0.4.6.post3-rocm630
+      - name: Start CI container
+        run: bash scripts/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}

       - name: Install dependencies
-        run: |
-          docker exec ci_sglang pip install --upgrade pip
-          docker exec ci_sglang pip uninstall sgl-kernel -y || true
-          docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install"
-          docker exec ci_sglang pip install -e "python[dev_hip]"
-          docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git
-          docker exec -w /human-eval ci_sglang pip install -e .
+        run: bash scripts/amd_ci_install_dependency.sh

       - name: Evaluate accuracy (TP=2)
         timeout-minutes: 20
         run: |
-          docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 test_moe_eval_accuracy_large.py
+          bash scripts/amd_ci_exec.sh python3 test_moe_eval_accuracy_large.py

   mla-test-1-gpu-amd:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
@@ -119,35 +85,18 @@ jobs:
       - name: Checkout code
         uses: actions/checkout@v4

-      - name: Setup docker
-        run: |
-          # Ensure GPU isolation if pod is part of kubernetes setup with DEVICE_FLAG.
-          if [ -f "/etc/podinfo/gha-render-devices" ]; then
-            DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
-          else
-            DEVICE_FLAG="--device /dev/dri"
-          fi
-          docker pull lmsysorg/sglang:v0.4.6.post3-rocm630
-          docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
-            -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \
-            --cap-add=SYS_PTRACE -e HF_TOKEN=${{ secrets.AMD_HF_TOKEN }} --security-opt seccomp=unconfined \
-            -w /sglang-checkout --name ci_sglang \
-            lmsysorg/sglang:v0.4.6.post3-rocm630
+      - name: Start CI container
+        run: bash scripts/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}

       - name: Install dependencies
-        run: |
-          docker exec ci_sglang pip install --upgrade pip
-          docker exec ci_sglang pip uninstall sgl-kernel -y || true
-          docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install"
-          docker exec ci_sglang pip install -e "python[dev_hip]"
-          docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git
-          docker exec -w /human-eval ci_sglang pip install -e .
+        run: bash scripts/amd_ci_install_dependency.sh

       - name: MLA TEST
         timeout-minutes: 20
         run: |
-          docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 test_mla.py
+          bash scripts/amd_ci_exec.sh python3 test_mla.py

   performance-test-1-gpu-part-1-amd:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
@@ -160,56 +109,39 @@ jobs:
       - name: Checkout code
         uses: actions/checkout@v4

-      - name: Setup docker
-        run: |
-          # Ensure GPU isolation if pod is part of kubernetes setup with DEVICE_FLAG.
-          if [ -f "/etc/podinfo/gha-render-devices" ]; then
-            DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
-          else
-            DEVICE_FLAG="--device /dev/dri"
-          fi
-          docker pull lmsysorg/sglang:v0.4.6.post3-rocm630
-          docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
-            -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \
-            --cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \
-            -w /sglang-checkout --name ci_sglang \
-            lmsysorg/sglang:v0.4.6.post3-rocm630
+      - name: Start CI container
+        run: bash scripts/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}

       - name: Install dependencies
-        run: |
-          docker exec ci_sglang pip install --upgrade pip
-          docker exec ci_sglang pip uninstall sgl-kernel -y || true
-          docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install"
-          docker exec ci_sglang pip install -e "python[dev_hip]"
-          docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git
-          docker exec -w /human-eval ci_sglang pip install -e .
+        run: bash scripts/amd_ci_install_dependency.sh

       - name: Benchmark single latency
         timeout-minutes: 10
         run: |
-          docker exec -w /sglang-checkout/test/srt -e SGLANG_AMD_CI=1 -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_small
-          docker exec -w /sglang-checkout/test/srt -e SGLANG_AMD_CI=1 -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default
+          bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_small
+          bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default

       - name: Benchmark online latency
         timeout-minutes: 10
         run: |
-          docker exec -w /sglang-checkout/test/srt -e SGLANG_AMD_CI=1 -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default
+          bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default

       - name: Benchmark offline throughput
         timeout-minutes: 10
         run: |
-          docker exec -w /sglang-checkout/test/srt -e SGLANG_AMD_CI=1 -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default
+          bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default

       - name: Benchmark offline throughput (Non-streaming, small batch size)
         timeout-minutes: 10
         run: |
-          docker exec -w /sglang-checkout/test/srt -e SGLANG_AMD_CI=1 -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size
+          bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size

       - name: Benchmark online latency (EAGLE)
         timeout-minutes: 10
         run: |
-          docker exec -w /sglang-checkout/test/srt -e SGLANG_AMD_CI=1 -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_eagle
+          bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_eagle

   performance-test-1-gpu-part-2-amd:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
@@ -222,45 +154,28 @@ jobs:
       - name: Checkout code
         uses: actions/checkout@v4

-      - name: Setup docker
-        run: |
-          # Ensure GPU isolation if pod is part of kubernetes setup with DEVICE_FLAG.
-          if [ -f "/etc/podinfo/gha-render-devices" ]; then
-            DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
-          else
-            DEVICE_FLAG="--device /dev/dri"
-          fi
-          docker pull lmsysorg/sglang:v0.4.6.post3-rocm630
-          docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
-            -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \
-            --cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \
-            -w /sglang-checkout --name ci_sglang \
-            lmsysorg/sglang:v0.4.6.post3-rocm630
+      - name: Start CI container
+        run: bash scripts/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}

       - name: Install dependencies
-        run: |
-          docker exec ci_sglang pip install --upgrade pip
-          docker exec ci_sglang pip uninstall sgl-kernel -y || true
-          docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install"
-          docker exec ci_sglang pip install -e "python[dev_hip]"
-          docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git
-          docker exec -w /human-eval ci_sglang pip install -e .
+        run: bash scripts/amd_ci_install_dependency.sh

       - name: Benchmark offline throughput (w/o RadixAttention)
         timeout-minutes: 10
         run: |
-          docker exec -w /sglang-checkout/test/srt -e SGLANG_AMD_CI=1 -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache
+          bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache

       - name: Benchmark offline throughput (w/ Triton)
         timeout-minutes: 10
         run: |
-          docker exec -w /sglang-checkout/test/srt -e SGLANG_AMD_CI=1 -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend
+          bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend

       - name: Benchmark offline throughput (w/ FP8)
         timeout-minutes: 10
         run: |
-          docker exec -w /sglang-checkout/test/srt -e SGLANG_AMD_CI=1 -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8
+          bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8

   bench-test-2-gpu-amd:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
@@ -273,59 +188,38 @@ jobs:
      - name: Checkout code
         uses: actions/checkout@v4

-      - name: Setup docker
-        run: |
-          # Ensure GPU isolation if pod is part of kubernetes setup with DEVICE_FLAG.
-          if [ -f "/etc/podinfo/gha-render-devices" ]; then
-            DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
-          else
-            DEVICE_FLAG="--device /dev/dri"
-          fi
-          docker pull lmsysorg/sglang:v0.4.6.post3-rocm630
-          docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
-            -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \
-            --cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \
-            -w /sglang-checkout --name ci_sglang \
-            lmsysorg/sglang:v0.4.6.post3-rocm630
+      - name: Start CI container
+        run: bash scripts/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}

       - name: Install dependencies
-        run: |
-          docker exec ci_sglang pip install --upgrade pip
-          docker exec ci_sglang pip uninstall sgl-kernel -y || true
-          docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install"
-          docker exec ci_sglang pip install -e "python[dev_hip]"
-          docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git
-          docker exec -w /human-eval ci_sglang pip install -e .
-          docker exec -w / ci_sglang mkdir -p /dummy-grok
-          mkdir -p dummy-grok && wget https://sharkpublic.blob.core.windows.net/sharkpublic/sglang/dummy_grok.json -O dummy-grok/config.json
-          docker cp ./dummy-grok ci_sglang:/
+        run: bash scripts/amd_ci_install_dependency.sh

       - name: Benchmark dummy grok (TP=2)
         timeout-minutes: 20
         run: |
-          docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 models/test_dummy_grok_models.py
+          bash scripts/amd_ci_exec.sh python3 models/test_dummy_grok_models.py

       - name: Benchmark single latency (TP=2)
         timeout-minutes: 20
         run: |
-          docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 -e SGLANG_AMD_CI=1 ci_sglang python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
+          bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1

       - name: Benchmark single latency + torch.compile (TP=2)
         timeout-minutes: 20
         run: |
-          docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_torch_compile_tp2_bs1
+          bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_torch_compile_tp2_bs1

       - name: Benchmark offline throughput (TP=2)
         timeout-minutes: 20
         run: |
-          docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 -e SGLANG_AMD_CI=1 ci_sglang python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default
+          bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default

       - name: Benchmark offline throughput (w/o RadixAttention) (TP=2)
         timeout-minutes: 20
         run: |
-          docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 -e SGLANG_AMD_CI=1 ci_sglang python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache
+          bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache

   unit-test-backend-1-gpu-amd:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
@@ -338,35 +232,18 @@ jobs:
       - name: Checkout code
         uses: actions/checkout@v4

-      - name: Setup docker
-        run: |
-          # Ensure GPU isolation if pod is part of kubernetes setup with DEVICE_FLAG.
-          if [ -f "/etc/podinfo/gha-render-devices" ]; then
-            DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
-          else
-            DEVICE_FLAG="--device /dev/dri"
-          fi
-          docker pull lmsysorg/sglang:v0.4.6.post3-rocm630
-          docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
-            -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \
-            --cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \
-            -w /sglang-checkout --name ci_sglang \
-            lmsysorg/sglang:v0.4.6.post3-rocm630
+      - name: Start CI container
+        run: bash scripts/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}

       - name: Install dependencies
-        run: |
-          docker exec ci_sglang pip install --upgrade pip
-          docker exec ci_sglang pip uninstall sgl-kernel -y || true
-          docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install"
-          docker exec ci_sglang pip install -e "python[dev_hip]"
-          docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git
-          docker exec -w /human-eval ci_sglang pip install -e .
+        run: bash scripts/amd_ci_install_dependency.sh

       - name: Run test
         timeout-minutes: 30
         run: |
-          docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 -e SGLANG_AMD_CI=1 -e SGLANG_AITER_MOE=1 ci_sglang python3 run_suite.py --suite per-commit-amd
+          bash scripts/amd_ci_exec.sh python3 run_suite.py --suite per-commit-amd

   unit-test-backend-2-gpu-amd:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
@@ -379,35 +256,18 @@ jobs:
       - name: Checkout code
         uses: actions/checkout@v4

-      - name: Setup docker
-        run: |
-          # Ensure GPU isolation if pod is part of kubernetes setup with DEVICE_FLAG.
-          if [ -f "/etc/podinfo/gha-render-devices" ]; then
-            DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
-          else
-            DEVICE_FLAG="--device /dev/dri"
-          fi
-          docker pull lmsysorg/sglang:v0.4.6.post3-rocm630
-          docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
-            -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \
-            --cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \
-            -w /sglang-checkout --name ci_sglang \
-            lmsysorg/sglang:v0.4.6.post3-rocm630
+      - name: Start CI container
+        run: bash scripts/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}

       - name: Install dependencies
-        run: |
-          docker exec ci_sglang pip install --upgrade pip
-          docker exec ci_sglang pip uninstall sgl-kernel -y || true
-          docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install"
-          docker exec ci_sglang pip install -e "python[dev_hip]"
-          docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git
-          docker exec -w /human-eval ci_sglang pip install -e .
+        run: bash scripts/amd_ci_install_dependency.sh

       - name: Run test
         timeout-minutes: 30
         run: |
-          docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 -e SGLANG_AMD_CI=1 -e SGLANG_AITER_MOE=1 ci_sglang python3 run_suite.py --suite per-commit-2-gpu-amd
+          bash scripts/amd_ci_exec.sh python3 run_suite.py --suite per-commit-2-gpu-amd

   unit-test-backend-8-gpu-amd:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
@@ -420,35 +280,18 @@ jobs:
       - name: Checkout code
         uses: actions/checkout@v4

-      - name: Setup docker
-        run: |
-          # Ensure GPU isolation if pod is part of kubernetes setup with DEVICE_FLAG.
-          if [ -f "/etc/podinfo/gha-render-devices" ]; then
-            DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
-          else
-            DEVICE_FLAG="--device /dev/dri"
-          fi
-          docker pull lmsysorg/sglang:v0.4.6.post3-rocm630
-          docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
-            -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \
-            --cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \
-            -w /sglang-checkout --name ci_sglang \
-            lmsysorg/sglang:v0.4.6.post3-rocm630
+      - name: Start CI container
+        run: bash scripts/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}

       - name: Install dependencies
-        run: |
-          docker exec ci_sglang pip install --upgrade pip
-          docker exec ci_sglang pip uninstall sgl-kernel -y || true
-          docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install"
-          docker exec ci_sglang pip install -e "python[dev_hip]"
-          docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git
-          docker exec -w /human-eval ci_sglang pip install -e .
+        run: bash scripts/amd_ci_install_dependency.sh

       - name: Run test
         timeout-minutes: 30
         run: |
-          docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 -e SGLANG_AMD_CI=1 -e SGLANG_AITER_MOE=1 ci_sglang python3 run_suite.py --suite per-commit-8-gpu-amd
+          bash scripts/amd_ci_exec.sh python3 run_suite.py --suite per-commit-8-gpu-amd

   finish:
     if: always()
#!/bin/bash
set -euo pipefail

# Default working directory
WORKDIR="/sglang-checkout/test/srt"

ENV_ARGS=(
  -e SGLANG_AMD_CI=1
  -e SGLANG_IS_IN_CI=1
  -e SGLANG_AITER_MOE=1
)

# Parse optional -w/--workdir and -e ENV=VAL flags
while [[ $# -gt 0 ]]; do
  case "$1" in
    -w|--workdir)
      WORKDIR="$2"
      shift 2
      ;;
    -e)
      ENV_ARGS+=("-e" "$2")
      shift 2
      ;;
    --)
      shift
      break
      ;;
    *)
      break
      ;;
  esac
done

# Run docker exec
docker exec \
  -w "$WORKDIR" \
  "${ENV_ARGS[@]}" \
  ci_sglang "$@"
#!/bin/bash
set -euo pipefail
# Install the required dependencies in CI.
docker exec ci_sglang pip install --upgrade pip
docker exec ci_sglang pip uninstall sgl-kernel -y || true
docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install"
docker exec ci_sglang pip install -e "python[dev_hip]"
docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git
docker exec -w /human-eval ci_sglang pip install -e .
docker exec -w / ci_sglang mkdir -p /dummy-grok
mkdir -p dummy-grok && wget https://sharkpublic.blob.core.windows.net/sharkpublic/sglang/dummy_grok.json -O dummy-grok/config.json
docker cp ./dummy-grok ci_sglang:/
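
The block above is the dependency script (scripts/amd_ci_install_dependency.sh), which consolidates the per-job install steps: the sgl-kernel ROCm build, the python[dev_hip] editable install, human-eval, and the dummy grok config. A quick sanity check one might run afterwards, shown only as a hypothetical verification and not part of the workflow:

# Hypothetical check that the editable installs are importable inside the container
docker exec ci_sglang python3 -c "import sglang; print(sglang.__version__)"
docker exec ci_sglang python3 -c "import human_eval; print('human-eval OK')"
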
#!/bin/bash
set -euo pipefail

# Set up DEVICE_FLAG based on Kubernetes pod info
if [ -f "/etc/podinfo/gha-render-devices" ]; then
  DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
else
  DEVICE_FLAG="--device /dev/dri"
fi

# Pull the image
IMAGE="lmsysorg/sglang:v0.4.6.post3-rocm630"
echo "Pulling Docker image: $IMAGE"
docker pull "$IMAGE"

# Run the container
echo "Starting container: ci_sglang"
docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
  -v "${GITHUB_WORKSPACE:-$PWD}:/sglang-checkout" \
  --ipc=host --group-add video \
  --cap-add=SYS_PTRACE \
  -e HF_TOKEN="${HF_TOKEN:-}" \
  --security-opt seccomp=unconfined \
  -w /sglang-checkout \
  --name ci_sglang \
  "$IMAGE"