Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
fee0ab0f
Unverified
Commit
fee0ab0f
authored
Aug 04, 2025
by
Even Zhou
Committed by
GitHub
Aug 03, 2025
Browse files
[CI] Ascend NPU CI enhancement (#8294)
Co-authored-by:
ronnie_zheng
<
zl19940307@163.com
>
parent
f57d2dc1
Changes
9
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
415 additions
and
189 deletions
+415
-189
.github/workflows/pr-test-npu.yml
.github/workflows/pr-test-npu.yml
+67
-3
python/sglang/srt/layers/moe/topk.py
python/sglang/srt/layers/moe/topk.py
+10
-2
scripts/npu_ci_install_dependency.sh
scripts/npu_ci_install_dependency.sh
+36
-24
test/srt/run_suite.py
test/srt/run_suite.py
+8
-2
test/srt/test_ascend_attention_backend.py
test/srt/test_ascend_attention_backend.py
+0
-62
test/srt/test_ascend_mla_backend.py
test/srt/test_ascend_mla_backend.py
+0
-96
test/srt/test_ascend_mla_w8a8int8.py
test/srt/test_ascend_mla_w8a8int8.py
+100
-0
test/srt/test_ascend_tp1_bf16.py
test/srt/test_ascend_tp1_bf16.py
+96
-0
test/srt/test_ascend_tp2_bf16.py
test/srt/test_ascend_tp2_bf16.py
+98
-0
No files found.
.github/workflows/pr-test-npu.yml
View file @
fee0ab0f
...
@@ -22,7 +22,7 @@ concurrency:
...
@@ -22,7 +22,7 @@ concurrency:
cancel-in-progress
:
true
cancel-in-progress
:
true
jobs
:
jobs
:
unit-test-basic
:
per-commit-1-ascend-npu
:
if
:
(github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
if
:
(github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft ==
false
github.event.pull_request.draft ==
false
runs-on
:
linux-arm64-npu-1
runs-on
:
linux-arm64-npu-1
...
@@ -44,13 +44,77 @@ jobs:
...
@@ -44,13 +44,77 @@ jobs:
timeout-minutes
:
30
timeout-minutes
:
30
env
:
env
:
SGLANG_USE_MODELSCOPE
:
true
SGLANG_USE_MODELSCOPE
:
true
SGLANG_IS_IN_CI
:
true
HF_ENDPOINT
:
https://hf-mirror.com
HF_ENDPOINT
:
https://hf-mirror.com
TORCH_EXTENSIONS_DIR
:
/tmp/torch_extensions
run
:
|
run
:
|
cd test/srt
cd test/srt
python3 run_suite.py --suite per-commit-npu
python3 run_suite.py --suite per-commit-1-ascend-npu
per-commit-2-ascend-npu
:
if
:
(github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft ==
false
runs-on
:
linux-arm64-npu-2
container
:
image
:
swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1.alpha003-910b-ubuntu22.04-py3.11
steps
:
-
name
:
Checkout code
uses
:
actions/checkout@v4
-
name
:
Install dependencies
run
:
|
bash scripts/npu_ci_install_dependency.sh
# copy required file from our daily cache
cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
# copy download through proxy
curl -o /tmp/test.jsonl -L https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
-
name
:
Run test
timeout-minutes
:
30
env
:
SGLANG_USE_MODELSCOPE
:
true
SGLANG_IS_IN_CI
:
true
HF_ENDPOINT
:
https://hf-mirror.com
TORCH_EXTENSIONS_DIR
:
/tmp/torch_extensions
run
:
|
cd test/srt
python3 run_suite.py --suite per-commit-2-ascend-npu
per-commit-4-ascend-npu
:
if
:
(github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft ==
false
runs-on
:
linux-arm64-npu-4
container
:
image
:
swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1.alpha003-910b-ubuntu22.04-py3.11
steps
:
-
name
:
Checkout code
uses
:
actions/checkout@v4
-
name
:
Install dependencies
run
:
|
bash scripts/npu_ci_install_dependency.sh
# copy required file from our daily cache
cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
# copy download through proxy
curl -o /tmp/test.jsonl -L https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
-
name
:
Run test
timeout-minutes
:
30
env
:
SGLANG_USE_MODELSCOPE
:
true
SGLANG_IS_IN_CI
:
true
HF_ENDPOINT
:
https://hf-mirror.com
TORCH_EXTENSIONS_DIR
:
/tmp/torch_extensions
run
:
|
cd test/srt
python3 run_suite.py --suite per-commit-4-ascend-npu --timeout-per-file 3600
finish
:
finish
:
if
:
always()
if
:
always()
needs
:
[
unit-test-basic
]
needs
:
-
per-commit-1-ascend-npu
-
per-commit-2-ascend-npu
-
per-commit-4-ascend-npu
runs-on
:
ubuntu-latest
runs-on
:
ubuntu-latest
steps
:
steps
:
-
name
:
Check all dependent job statuses
-
name
:
Check all dependent job statuses
...
...
python/sglang/srt/layers/moe/topk.py
View file @
fee0ab0f
...
@@ -398,8 +398,12 @@ def grouped_topk_gpu(
...
@@ -398,8 +398,12 @@ def grouped_topk_gpu(
.
reshape
(
num_token
,
-
1
)
.
reshape
(
num_token
,
-
1
)
)
# [n, e]
)
# [n, e]
tmp_scores
=
scores
.
masked_fill
(
~
score_mask
.
bool
(),
0.0
)
# [n, e]
tmp_scores
=
scores
.
masked_fill
(
~
score_mask
.
bool
(),
0.0
)
# [n, e]
# TODO: NPU can't support directly evaluating a comparison for now
topk_weights
,
topk_ids
=
torch
.
topk
(
topk_weights
,
topk_ids
=
torch
.
topk
(
tmp_scores
,
k
=
topk
,
dim
=-
1
,
sorted
=
num_fused_shared_experts
>
0
tmp_scores
,
k
=
topk
,
dim
=-
1
,
sorted
=
(
True
if
num_fused_shared_experts
>
0
else
False
),
)
)
if
num_fused_shared_experts
:
if
num_fused_shared_experts
:
topk_ids
[:,
-
1
]
=
torch
.
randint
(
topk_ids
[:,
-
1
]
=
torch
.
randint
(
...
@@ -489,8 +493,12 @@ def biased_grouped_topk_impl(
...
@@ -489,8 +493,12 @@ def biased_grouped_topk_impl(
tmp_scores
=
scores_for_choice
.
masked_fill
(
tmp_scores
=
scores_for_choice
.
masked_fill
(
~
score_mask
.
bool
(),
float
(
"-inf"
)
~
score_mask
.
bool
(),
float
(
"-inf"
)
)
# [n, e]
)
# [n, e]
# TODO: NPU can't support directly evaluating a comparison for now
_
,
topk_ids
=
torch
.
topk
(
_
,
topk_ids
=
torch
.
topk
(
tmp_scores
,
k
=
topk
,
dim
=-
1
,
sorted
=
num_fused_shared_experts
>
0
tmp_scores
,
k
=
topk
,
dim
=-
1
,
sorted
=
(
True
if
num_fused_shared_experts
>
0
else
False
),
)
)
topk_weights
=
scores
.
gather
(
1
,
topk_ids
)
topk_weights
=
scores
.
gather
(
1
,
topk_ids
)
...
...
scripts/npu_ci_install_dependency.sh
View file @
fee0ab0f
#!/bin/bash
#!/bin/bash
set
-euo
pipefail
set
-euo
pipefail
# Install the required dependencies from cache
CACHING_URL
=
"cache-service.nginx-pypi-cache.svc.cluster.local"
sed
-Ei
's@(ports|archive).ubuntu.com@cache-service.nginx-pypi-cache.svc.cluster.local:8081@g'
/etc/apt/sources.list
PIP_INSTALL
=
"pip install --no-cache-dir"
apt update
-y
apt
install
-y
build-essential cmake python3-pip python3-dev wget net-tools zlib1g-dev lld clang software-properties-common curl
# Setup pip cache
pip config
set
global.index-url http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple
# Update apt & pip sources
pip config
set
global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local
sed
-Ei
"s@(ports|archive).ubuntu.com@
${
CACHING_URL
}
:8081@g"
/etc/apt/sources.list
python3
-m
pip
install
--upgrade
pip
pip config
set
global.index-url http://
${
CACHING_URL
}
/pypi/simple
pip uninstall sgl-kernel
-y
||
true
pip config
set
global.trusted-host
${
CACHING_URL
}
# Install the required dependencies in CI.
apt update
-y
&&
apt
install
-y
\
build-essential
\
cmake
\
wget
\
curl
\
net-tools
\
zlib1g-dev
\
lld
\
clang
\
locales
\
ccache
\
ca-certificates
update-ca-certificates
python3
-m
${
PIP_INSTALL
}
--upgrade
pip
### Download MemFabricV2
### Download MemFabricV2
MF_WHL_NAME
=
"mf_adapter-1.0.0-cp311-cp311-linux_aarch64.whl"
MF_WHL_NAME
=
"mf_adapter-1.0.0-cp311-cp311-linux_aarch64.whl"
MEMFABRIC_URL
=
"https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com
:443
/sglang/
${
MF_WHL_NAME
}
"
MEMFABRIC_URL
=
"https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/sglang/
${
MF_WHL_NAME
}
"
wget
"
${
MEMFABRIC_URL
}
"
&&
pip
install
"./
${
MF_WHL_NAME
}
"
wget
"
${
MEMFABRIC_URL
}
"
&&
${
PIP_INSTALL
}
"./
${
MF_WHL_NAME
}
"
### Install vLLM
### Install vLLM
VLLM_TAG
=
v0.8.5
VLLM_TAG
=
v0.8.5
git clone
--depth
1 https://github.com/vllm-project/vllm.git
--branch
$VLLM_TAG
git clone
--depth
1 https://github.com/vllm-project/vllm.git
--branch
$VLLM_TAG
(
cd
vllm
&&
VLLM_TARGET_DEVICE
=
"empty"
pip
install
-v
-e
.
)
(
cd
vllm
&&
VLLM_TARGET_DEVICE
=
"empty"
${
PIP_INSTALL
}
-v
-e
.
)
### Install PyTorch and PTA
### Install PyTorch and PTA
PYTORCH_VERSION
=
2.6.0
PYTORCH_VERSION
=
2.6.0
TORCHVISION_VERSION
=
0.21.0
TORCHVISION_VERSION
=
0.21.0
PTA_VERSION
=
2.6.0
rc1
PTA_VERSION
=
2.6.0
pip
install
torch
==
$PYTORCH_VERSION
torchvision
==
$TORCHVISION_VERSION
--index-url
https://download.pytorch.org/whl/cpu
${
PIP_INSTALL
}
torch
==
$PYTORCH_VERSION
torchvision
==
$TORCHVISION_VERSION
--index-url
https://download.pytorch.org/whl/cpu
pip
install
torch_npu
==
$PTA_VERSION
${
PIP_INSTALL
}
torch_npu
==
$PTA_VERSION
### Install Triton-Ascend
### Install Triton-Ascend
TRITON_ASCEND_VERSION
=
3.2.0rc2
TRITON_ASCEND_NAME
=
"triton_ascend-3.2.0.dev20250729-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl"
pip
install
attrs
==
24.2.0
numpy
==
1.26.4
scipy
==
1.13.1
decorator
==
5.1.1
psutil
==
6.0.0
pytest
==
8.3.2 pytest-xdist
==
3.6.1 pyyaml pybind11
TRITON_ASCEND_URL
=
"https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/sglang/
${
TRITON_ASCEND_NAME
}
"
pip
install
triton-ascend
==
$TRITON_ASCEND_VERSION
${
PIP_INSTALL
}
attrs
==
24.2.0
numpy
==
1.26.4
scipy
==
1.13.1
decorator
==
5.1.1
psutil
==
6.0.0
pytest
==
8.3.2 pytest-xdist
==
3.6.1 pyyaml pybind11
wget
"
${
TRITON_ASCEND_URL
}
"
&&
${
PIP_INSTALL
}
"./
${
TRITON_ASCEND_NAME
}
"
pip
install
-e
"python[srt_npu]"
### Modify PyTorch TODO: to be removed later
### Install SGLang
TORCH_LOCATION
=
$(
python3
-c
'import torch; print(torch.__path__[0])'
)
${
PIP_INSTALL
}
-v
-e
"python[srt_npu]"
sed
-i
's/from triton.runtime.autotuner import OutOfResources/from triton.runtime.errors import OutOfResources/'
"
${
TORCH_LOCATION
}
/_inductor/runtime/triton_heuristics.py"
test/srt/run_suite.py
View file @
fee0ab0f
...
@@ -154,8 +154,14 @@ suites = {
...
@@ -154,8 +154,14 @@ suites = {
TestFile
(
"test_rope_rocm.py"
,
3
),
TestFile
(
"test_rope_rocm.py"
,
3
),
TestFile
(
"test_awq_dequant.py"
,
2
),
TestFile
(
"test_awq_dequant.py"
,
2
),
],
],
"per-commit-npu"
:
[
"per-commit-1-ascend-npu"
:
[
TestFile
(
"test_ascend_attention_backend.py"
,
400
),
TestFile
(
"test_ascend_tp1_bf16.py"
,
400
),
],
"per-commit-2-ascend-npu"
:
[
TestFile
(
"test_ascend_tp2_bf16.py"
,
400
),
],
"per-commit-4-ascend-npu"
:
[
TestFile
(
"test_ascend_mla_w8a8int8.py"
,
400
),
],
],
"per-commit-2-gpu"
:
[
"per-commit-2-gpu"
:
[
TestFile
(
"models/lora/test_lora_tp.py"
,
116
),
TestFile
(
"models/lora/test_lora_tp.py"
,
116
),
...
...
test/srt/test_ascend_attention_backend.py
deleted
100644 → 0
View file @
f57d2dc1
"""
Usage:
python3 -m unittest test_ascend_attention_backend.TestAscendAttnBackend.test_gsm8k
"""
import
unittest
from
types
import
SimpleNamespace
from
urllib.parse
import
urlparse
from
sglang.srt.utils
import
kill_process_tree
from
sglang.test.few_shot_gsm8k
import
run_eval
as
run_eval_few_shot_gsm8k
from
sglang.test.run_eval
import
run_eval
from
sglang.test.test_utils
import
(
DEFAULT_MODEL_NAME_FOR_TEST
,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
DEFAULT_URL_FOR_TEST
,
CustomTestCase
,
is_in_ci
,
popen_launch_server
,
run_bench_offline_throughput
,
)
DEFAULT_MODEL_NAME_FOR_TEST
=
"Qwen/Qwen2.5-7B-Instruct"
class
TestAscendAttnBackend
(
CustomTestCase
):
def
test_gsm8k
(
self
):
model
=
DEFAULT_MODEL_NAME_FOR_TEST
base_url
=
DEFAULT_URL_FOR_TEST
url
=
urlparse
(
base_url
)
process
=
popen_launch_server
(
model
,
base_url
,
timeout
=
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
other_args
=
[
"--attention-backend"
,
"ascend"
,
"--mem-fraction-static"
,
0.8
,
],
)
try
:
args
=
SimpleNamespace
(
num_shots
=
5
,
data_path
=
None
,
num_questions
=
1319
,
max_new_tokens
=
512
,
parallel
=
128
,
host
=
f
"http://
{
url
.
hostname
}
"
,
port
=
int
(
url
.
port
),
)
metrics
=
run_eval_few_shot_gsm8k
(
args
)
self
.
assertGreaterEqual
(
metrics
[
"accuracy"
],
0.62
)
self
.
assertLessEqual
(
metrics
[
"latency"
],
150
)
finally
:
kill_process_tree
(
process
.
pid
)
if
__name__
==
"__main__"
:
unittest
.
main
()
test/srt/test_ascend_mla_backend.py
deleted
100644 → 0
View file @
f57d2dc1
"""
Usage:
python3 -m unittest test_ascend_mla_backend.TestAscendMLABackend.test_gsm8k
"""
import
os
import
unittest
from
types
import
SimpleNamespace
from
urllib.parse
import
urlparse
from
sglang.srt.utils
import
kill_process_tree
from
sglang.test.few_shot_gsm8k
import
run_eval
as
run_eval_few_shot_gsm8k
from
sglang.test.run_eval
import
run_eval
from
sglang.test.test_utils
import
(
DEFAULT_MLA_MODEL_NAME_FOR_TEST
,
DEFAULT_MODEL_NAME_FOR_TEST
,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
DEFAULT_URL_FOR_TEST
,
CustomTestCase
,
is_in_ci
,
popen_launch_server
,
run_bench_offline_throughput
,
)
if
"ASCEND_RT_VISIBLE_DEVICES"
not
in
os
.
environ
:
os
.
environ
[
"ASCEND_RT_VISIBLE_DEVICES"
]
=
"0,1,2,3"
DEFAULT_PORT_FOR_SRT_TEST_RUNNER
=
(
7000
+
int
(
os
.
environ
.
get
(
"ASCEND_RT_VISIBLE_DEVICES"
,
"0"
)[
0
])
*
100
)
DEFAULT_URL_FOR_TEST
=
f
"http://127.0.0.1:
{
DEFAULT_PORT_FOR_SRT_TEST_RUNNER
+
1000
}
"
DEFAULT_MODEL_NAME_FOR_TEST
=
"/models/DeepSeek-V2-Lite-Chat"
if
not
os
.
path
.
exists
(
DEFAULT_MODEL_NAME_FOR_TEST
):
DEFAULT_MODEL_NAME_FOR_TEST
=
DEFAULT_MLA_MODEL_NAME_FOR_TEST
class
TestAscendMLABackend
(
CustomTestCase
):
def
test_latency
(
self
):
output_throughput
=
run_bench_offline_throughput
(
DEFAULT_MODEL_NAME_FOR_TEST
,
[
"--attention-backend"
,
"ascend"
,
"--mem-fraction-static"
,
0.7
,
"--tp-size"
,
"4"
,
"--trust-remote-code"
,
"--disable-cuda-graph"
,
],
)
print
(
f
"
{
output_throughput
=
}
"
)
if
is_in_ci
():
self
.
assertGreater
(
output_throughput
,
18
)
def
test_gsm8k
(
self
):
model
=
DEFAULT_MODEL_NAME_FOR_TEST
base_url
=
DEFAULT_URL_FOR_TEST
url
=
urlparse
(
base_url
)
process
=
popen_launch_server
(
model
,
base_url
,
timeout
=
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
other_args
=
[
"--attention-backend"
,
"ascend"
,
"--mem-fraction-static"
,
0.7
,
"--tp-size"
,
"4"
,
"--trust-remote-code"
,
"--disable-cuda-graph"
,
],
)
try
:
args
=
SimpleNamespace
(
num_shots
=
5
,
data_path
=
None
,
num_questions
=
128
,
max_new_tokens
=
512
,
parallel
=
128
,
host
=
f
"http://
{
url
.
hostname
}
"
,
port
=
int
(
url
.
port
),
)
metrics
=
run_eval_few_shot_gsm8k
(
args
)
self
.
assertGreaterEqual
(
metrics
[
"accuracy"
],
0.62
)
self
.
assertGreaterEqual
(
metrics
[
"output_throughput"
],
50
)
finally
:
kill_process_tree
(
process
.
pid
)
if
__name__
==
"__main__"
:
unittest
.
main
()
test/srt/test_ascend_mla_w8a8int8.py
0 → 100644
View file @
fee0ab0f
import
unittest
from
types
import
SimpleNamespace
from
urllib.parse
import
urlparse
from
sglang.srt.utils
import
kill_process_tree
from
sglang.test.few_shot_gsm8k
import
run_eval
as
run_eval_few_shot_gsm8k
from
sglang.test.test_utils
import
(
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
DEFAULT_URL_FOR_TEST
,
CustomTestCase
,
is_in_ci
,
popen_launch_server
,
run_bench_offline_throughput
,
)
TEST_MODEL_MATRIX
=
{
"/root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V2-Lite-W8A8"
:
{
"accuracy"
:
0.34
,
"latency"
:
1000
,
"output_throughput"
:
6
,
},
}
class
TestAscendMlaW8A8Int8
(
CustomTestCase
):
@
classmethod
def
setUpClass
(
cls
):
cls
.
models
=
TEST_MODEL_MATRIX
.
keys
()
cls
.
base_url
=
DEFAULT_URL_FOR_TEST
cls
.
url
=
urlparse
(
DEFAULT_URL_FOR_TEST
)
cls
.
common_args
=
[
"--trust-remote-code"
,
"--disable-cuda-graph"
,
"--mem-fraction-static"
,
0.8
,
"--attention-backend"
,
"ascend"
,
"--quantization"
,
"w8a8_int8"
,
"--tp-size"
,
4
,
]
def
test_a_gsm8k
(
self
):
for
model
in
self
.
models
:
with
self
.
subTest
(
model
=
model
):
print
(
f
"##=== Testing accuracy:
{
model
}
===##"
)
process
=
popen_launch_server
(
model
,
self
.
base_url
,
timeout
=
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
other_args
=
[
*
self
.
common_args
,
],
)
try
:
args
=
SimpleNamespace
(
num_shots
=
5
,
data_path
=
None
,
num_questions
=
1319
,
max_new_tokens
=
512
,
parallel
=
128
,
host
=
f
"http://
{
self
.
url
.
hostname
}
"
,
port
=
int
(
self
.
url
.
port
),
)
metrics
=
run_eval_few_shot_gsm8k
(
args
)
self
.
assertGreaterEqual
(
metrics
[
"accuracy"
],
TEST_MODEL_MATRIX
[
model
][
"accuracy"
],
)
finally
:
kill_process_tree
(
process
.
pid
)
def
test_b_throughput
(
self
):
for
model
in
self
.
models
:
with
self
.
subTest
(
model
=
model
):
print
(
f
"##=== Testing throughput:
{
model
}
===##"
)
output_throughput
=
run_bench_offline_throughput
(
model
,
[
*
self
.
common_args
,
],
)
print
(
f
"##===
{
model
}
throughput:
{
output_throughput
}
===##"
)
if
is_in_ci
():
self
.
assertGreater
(
output_throughput
,
TEST_MODEL_MATRIX
[
model
][
"output_throughput"
],
)
if
__name__
==
"__main__"
:
unittest
.
main
()
test/srt/test_ascend_tp1_bf16.py
0 → 100644
View file @
fee0ab0f
import
unittest
from
types
import
SimpleNamespace
from
urllib.parse
import
urlparse
from
sglang.srt.utils
import
kill_process_tree
from
sglang.test.few_shot_gsm8k
import
run_eval
as
run_eval_few_shot_gsm8k
from
sglang.test.test_utils
import
(
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
DEFAULT_URL_FOR_TEST
,
CustomTestCase
,
is_in_ci
,
popen_launch_server
,
run_bench_offline_throughput
,
)
TEST_MODEL_MATRIX
=
{
"Qwen/Qwen2.5-7B-Instruct"
:
{
"accuracy"
:
0.85
,
"latency"
:
150
,
"output_throughput"
:
30
,
},
}
class
TestAscendTp1Bf16
(
CustomTestCase
):
@
classmethod
def
setUpClass
(
cls
):
cls
.
models
=
TEST_MODEL_MATRIX
.
keys
()
cls
.
base_url
=
DEFAULT_URL_FOR_TEST
cls
.
url
=
urlparse
(
DEFAULT_URL_FOR_TEST
)
cls
.
common_args
=
[
"--trust-remote-code"
,
"--disable-cuda-graph"
,
"--mem-fraction-static"
,
0.8
,
"--attention-backend"
,
"ascend"
,
]
def
test_a_gsm8k
(
self
):
for
model
in
self
.
models
:
with
self
.
subTest
(
model
=
model
):
print
(
f
"##=== Testing accuracy:
{
model
}
===##"
)
process
=
popen_launch_server
(
model
,
self
.
base_url
,
timeout
=
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
other_args
=
[
*
self
.
common_args
,
],
)
try
:
args
=
SimpleNamespace
(
num_shots
=
5
,
data_path
=
None
,
num_questions
=
1319
,
max_new_tokens
=
512
,
parallel
=
128
,
host
=
f
"http://
{
self
.
url
.
hostname
}
"
,
port
=
int
(
self
.
url
.
port
),
)
metrics
=
run_eval_few_shot_gsm8k
(
args
)
self
.
assertGreaterEqual
(
metrics
[
"accuracy"
],
TEST_MODEL_MATRIX
[
model
][
"accuracy"
],
)
finally
:
kill_process_tree
(
process
.
pid
)
def
test_b_throughput
(
self
):
for
model
in
self
.
models
:
with
self
.
subTest
(
model
=
model
):
print
(
f
"##=== Testing throughput:
{
model
}
===##"
)
output_throughput
=
run_bench_offline_throughput
(
model
,
[
*
self
.
common_args
,
],
)
print
(
f
"##===
{
model
}
throughput:
{
output_throughput
}
===##"
)
if
is_in_ci
():
self
.
assertGreater
(
output_throughput
,
TEST_MODEL_MATRIX
[
model
][
"output_throughput"
],
)
if
__name__
==
"__main__"
:
unittest
.
main
()
test/srt/test_ascend_tp2_bf16.py
0 → 100644
View file @
fee0ab0f
import
unittest
from
types
import
SimpleNamespace
from
urllib.parse
import
urlparse
from
sglang.srt.utils
import
kill_process_tree
from
sglang.test.few_shot_gsm8k
import
run_eval
as
run_eval_few_shot_gsm8k
from
sglang.test.test_utils
import
(
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
DEFAULT_URL_FOR_TEST
,
CustomTestCase
,
is_in_ci
,
popen_launch_server
,
run_bench_offline_throughput
,
)
TEST_MODEL_MATRIX
=
{
"Qwen/Qwen2.5-7B-Instruct"
:
{
"accuracy"
:
0.85
,
"latency"
:
180
,
"output_throughput"
:
20
,
},
}
class
TestAscendTp2Bf16
(
CustomTestCase
):
@
classmethod
def
setUpClass
(
cls
):
cls
.
models
=
TEST_MODEL_MATRIX
.
keys
()
cls
.
base_url
=
DEFAULT_URL_FOR_TEST
cls
.
url
=
urlparse
(
DEFAULT_URL_FOR_TEST
)
cls
.
common_args
=
[
"--trust-remote-code"
,
"--disable-cuda-graph"
,
"--mem-fraction-static"
,
0.8
,
"--attention-backend"
,
"ascend"
,
"--tp-size"
,
2
,
]
def
test_a_gsm8k
(
self
):
for
model
in
self
.
models
:
with
self
.
subTest
(
model
=
model
):
print
(
f
"##=== Testing accuracy:
{
model
}
===##"
)
process
=
popen_launch_server
(
model
,
self
.
base_url
,
timeout
=
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
other_args
=
[
*
self
.
common_args
,
],
)
try
:
args
=
SimpleNamespace
(
num_shots
=
5
,
data_path
=
None
,
num_questions
=
1319
,
max_new_tokens
=
512
,
parallel
=
128
,
host
=
f
"http://
{
self
.
url
.
hostname
}
"
,
port
=
int
(
self
.
url
.
port
),
)
metrics
=
run_eval_few_shot_gsm8k
(
args
)
self
.
assertGreaterEqual
(
metrics
[
"accuracy"
],
TEST_MODEL_MATRIX
[
model
][
"accuracy"
],
)
finally
:
kill_process_tree
(
process
.
pid
)
def
test_b_throughput
(
self
):
for
model
in
self
.
models
:
with
self
.
subTest
(
model
=
model
):
print
(
f
"##=== Testing throughput:
{
model
}
===##"
)
output_throughput
=
run_bench_offline_throughput
(
model
,
[
*
self
.
common_args
,
],
)
print
(
f
"##===
{
model
}
throughput:
{
output_throughput
}
===##"
)
if
is_in_ci
():
self
.
assertGreater
(
output_throughput
,
TEST_MODEL_MATRIX
[
model
][
"output_throughput"
],
)
if
__name__
==
"__main__"
:
unittest
.
main
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment