Unverified commit 81da16f6, authored by Hank Han, committed by GitHub
Browse files

[CI] add deepseek w4a8 test on h20 ci (#7758)

parent bc938ea1
# PR Test (H20): runs the `per-commit-8-gpu-h20` test suite on the 8-GPU H20
# runner when DeepSeek model code, MoE layers, or this workflow file change.
name: PR Test (H20)

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
  workflow_dispatch:
    inputs:
      version:
        required: true
        type: choice
        default: 'release'
        options:
          - 'release'
          - 'nightly'

# A newer run on the same ref cancels the one still in progress.
concurrency:
  group: pr-test-h20-${{ github.ref }}
  cancel-in-progress: true

jobs:
  # Exposes `src` = 'true' when any path listed in the filter was touched.
  check-changes:
    runs-on: ubuntu-latest
    outputs:
      src: ${{ steps.filter.outputs.src }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Detect file changes
        id: filter
        uses: dorny/paths-filter@v3
        with:
          filters: |
            src:
              - "python/sglang/srt/models/deepseek*"
              - "python/sglang/srt/layers/moe/**"
              - ".github/workflows/pr-test-h20.yml"

  per-commit-8-gpu-h20:
    needs: [check-changes]
    # Run in the upstream repository or on any pull request, never for draft
    # PRs, and only when the path filter above matched.
    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
      github.event.pull_request.draft == false &&
      needs.check-changes.outputs.src == 'true'
    runs-on: 8-gpu-h20
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install dependencies
        run: |
          bash scripts/ci/ci_install_dependency.sh

      - name: Run test
        timeout-minutes: 20
        run: |
          cd test/srt
          python3 run_suite.py --suite per-commit-8-gpu-h20

  # Gate job: fails when any job it depends on failed or was cancelled.
  pr-test-finish:
    needs: [check-changes, per-commit-8-gpu-h20]
    # `always()` is required here: without a status-check function GitHub
    # implicitly applies `success()`, so this job would be skipped (instead of
    # failing) whenever a needed job fails or is cancelled, and the check below
    # could never report the failure.
    if: always() && needs.check-changes.outputs.src == 'true'
    runs-on: ubuntu-latest
    steps:
      - name: Check all dependent job statuses
        run: |
          results=(${{ join(needs.*.result, ' ') }})
          for result in "${results[@]}"; do
            if [ "$result" = "failure" ] || [ "$result" = "cancelled" ]; then
              echo "Job failed with result: $result"
              exit 1
            fi
          done
          echo "All jobs completed successfully"
          exit 0
...@@ -78,6 +78,7 @@ DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = ( ...@@ -78,6 +78,7 @@ DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = (
"hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4" "hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
) )
DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST = "Qwen/Qwen3-30B-A3B" DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST = "Qwen/Qwen3-30B-A3B"
DEFAULT_DEEPSEEK_W4AFP8_MODEL_FOR_TEST = "Barrrrry/DeepSeek-R1-W4AFP8"
# Nightly tests # Nightly tests
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it" DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
......
import unittest
from types import SimpleNamespace
import requests
from sglang.srt.utils import kill_process_tree
from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
from sglang.test.test_utils import (
DEFAULT_DEEPSEEK_W4AFP8_MODEL_FOR_TEST,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
is_in_amd_ci,
is_in_ci,
popen_launch_server,
try_cached_model,
write_github_step_summary,
)
class TestDeepseekV3W4afp8(CustomTestCase):
    """GSM8K accuracy check against a server running the W4AFP8 DeepSeek model.

    The server is started once for the whole class with ``--tp 8`` and
    ``--ep-size 8`` and is killed again in ``tearDownClass``.
    """

    @classmethod
    def setUpClass(cls):
        """Resolve the model, then start the server process used by the tests."""
        cls.model = try_cached_model(DEFAULT_DEEPSEEK_W4AFP8_MODEL_FOR_TEST)
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=["--trust-remote-code", "--tp", "8", "--ep-size", "8"],
        )

    @classmethod
    def tearDownClass(cls):
        """Kill the server process tree started in ``setUpClass``."""
        kill_process_tree(cls.process.pid)

    def test_gsm8k(self):
        """Run 5-shot GSM8K on 1200 questions and require accuracy above 0.92."""
        # The evaluator takes host and port separately; the port is the last
        # ":"-separated component of the base URL.
        server_port = int(self.base_url.split(":")[-1])
        eval_settings = {
            "num_shots": 5,
            "data_path": None,
            "num_questions": 1200,
            "parallel": 1200,
            "max_new_tokens": 512,
            "host": "http://127.0.0.1",
            "port": server_port,
        }
        metrics = run_eval_few_shot_gsm8k(SimpleNamespace(**eval_settings))
        print(f"Eval accuracy of GSM8K: {metrics=}")

        self.assertGreater(metrics["accuracy"], 0.92)
class TestDeepseekV3W4Afp8Mtp(CustomTestCase):
    """GSM8K check for the W4AFP8 DeepSeek model with EAGLE speculative decoding.

    Besides accuracy, the test reads ``avg_spec_accept_length`` from the
    server's ``/get_server_info`` endpoint and asserts a lower bound on it.
    """

    @classmethod
    def setUpClass(cls):
        """Start the server with TP=8 / EP=8 and the speculative-decoding flags."""
        cls.model = try_cached_model(DEFAULT_DEEPSEEK_W4AFP8_MODEL_FOR_TEST)
        cls.base_url = DEFAULT_URL_FOR_TEST
        other_args = [
            "--tp",
            "8",
            "--trust-remote-code",
            "--ep-size",
            "8",
            "--cuda-graph-bs",
            "256",
            "--disable-radix-cache",
            "--speculative-algorithm",
            "EAGLE",
            "--speculative-num-steps",
            "3",
            "--speculative-eagle-topk",
            "2",
            "--speculative-num-draft-tokens",
            "4",
        ]
        # The memory-fraction override is applied everywhere except AMD CI.
        if not is_in_amd_ci():
            other_args += ["--mem-frac", "0.7"]
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=other_args,
        )

    @classmethod
    def tearDownClass(cls):
        """Kill the server process tree started in ``setUpClass``."""
        kill_process_tree(cls.process.pid)

    def test_gsm8k(
        self,
    ):
        """Run 5-shot GSM8K on 200 questions; check accuracy and accept length."""
        args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            port=int(self.base_url.split(":")[-1]),
        )
        metrics = run_eval_few_shot_gsm8k(args)
        print(f"{metrics=}")

        # Bound the request so an unresponsive server fails this test promptly
        # instead of blocking until the CI step timeout, and surface HTTP
        # errors directly rather than as a JSON/KeyError further down.
        server_info = requests.get(self.base_url + "/get_server_info", timeout=60)
        server_info.raise_for_status()
        avg_spec_accept_length = server_info.json()["internal_states"][0][
            "avg_spec_accept_length"
        ]
        print(f"{avg_spec_accept_length=}")

        if is_in_ci():
            write_github_step_summary(
                f"### test_gsm8k (deepseek-v3 mtp)\n"
                f'{metrics["accuracy"]=:.3f}\n'
                f"{avg_spec_accept_length=:.2f}\n"
            )
            self.assertGreater(metrics["accuracy"], 0.935)
            self.assertGreater(avg_spec_accept_length, 2.9)
# Run the test cases in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
...@@ -143,6 +143,9 @@ suites = { ...@@ -143,6 +143,9 @@ suites = {
"per-commit-8-gpu-deepep": [ "per-commit-8-gpu-deepep": [
TestFile("ep/test_deepep_large.py", 338), TestFile("ep/test_deepep_large.py", 338),
], ],
"per-commit-8-gpu-h20": [
TestFile("quant/test_w4a8_deepseek_v3.py", 371),
],
"nightly": [ "nightly": [
TestFile("test_nightly_gsm8k_eval.py"), TestFile("test_nightly_gsm8k_eval.py"),
], ],
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment