sglang · commit 3cceaa38 (unverified)
Authored Oct 16, 2025 by Even Zhou; committed via GitHub on Oct 16, 2025

[Bugfix] Fix Qwen3/DSV3/DSV3.2 model support (#11510)

Parent: b0d20cde
Showing 12 changed files with 102 additions and 33 deletions (+102 -33)
.github/workflows/pr-test-npu.yml                     +33  -13
.github/workflows/release-docker-npu-nightly.yml       +1   -1
.github/workflows/release-docker-npu.yml               +1   -1
docker/Dockerfile.npu                                 +10   -2
python/sglang/srt/layers/attention/ascend_backend.py  +17   -0
python/sglang/srt/mem_cache/allocator_ascend.py        +1   -1
python/sglang/srt/mem_cache/common.py                  +1   -5
python/sglang/srt/model_executor/npu_graph_runner.py   +2   -2
python/sglang/srt/models/deepseek_v2.py                +1   -0
python/sglang/srt/server_args.py                      +20   -0
scripts/ci/npu_ci_install_dependency.sh               +13   -3
test/srt/ascend/test_ascend_deepep.py                  +2   -5
.github/workflows/pr-test-npu.yml
@@ -38,9 +38,10 @@ jobs:
           CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
           sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
           pip config set global.index-url http://${CACHING_URL}/pypi/simple
-          pip config set global.trusted-host ${CACHING_URL}
-          bash scripts/ci/npu_ci_install_dependency.sh
+          pip config set global.extra-index-url "https://pypi.tuna.tsinghua.edu.cn/simple https://mirrors.aliyun.com/pypi/simple/"
+          pip config set global.trusted-host "${CACHING_URL} pypi.tuna.tsinghua.edu.cn mirrors.aliyun.com"
+          bash scripts/ci/npu_ci_install_dependency.sh 910b
           # copy required file from our daily cache
           cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
           # copy download through proxy
@@ -53,13 +54,20 @@ jobs:
           SGLANG_IS_IN_CI: true
           HF_ENDPOINT: https://hf-mirror.com
           TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
+          PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
+          STREAMS_PER_DEVICE: 32
         run: |
+          export PATH="/usr/local/Ascend/8.3.RC1/compiler/bishengir/bin:${PATH}"
           cd test/srt
           python3 run_suite.py --suite per-commit-1-ascend-npu
   per-commit-2-ascend-npu:
     if: github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-ci')
     runs-on: linux-arm64-npu-2
+    strategy:
+      fail-fast: false
+      matrix:
+        part: [0, 1, 2]
     container:
       image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
     steps:
@@ -72,24 +80,28 @@ jobs:
           CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
           sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
           pip config set global.index-url http://${CACHING_URL}/pypi/simple
-          pip config set global.trusted-host ${CACHING_URL}
-          bash scripts/ci/npu_ci_install_dependency.sh
+          pip config set global.extra-index-url "https://pypi.tuna.tsinghua.edu.cn/simple https://mirrors.aliyun.com/pypi/simple/"
+          pip config set global.trusted-host "${CACHING_URL} pypi.tuna.tsinghua.edu.cn mirrors.aliyun.com"
+          bash scripts/ci/npu_ci_install_dependency.sh 910b
           # copy required file from our daily cache
           cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
           # copy download through proxy
           curl -o /tmp/test.jsonl -L https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
       - name: Run test
-        timeout-minutes: 90
+        timeout-minutes: 60
         env:
           SGLANG_USE_MODELSCOPE: true
           SGLANG_IS_IN_CI: true
           HF_ENDPOINT: https://hf-mirror.com
           TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
+          PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
+          STREAMS_PER_DEVICE: 32
         run: |
+          export PATH="/usr/local/Ascend/8.3.RC1/compiler/bishengir/bin:${PATH}"
           cd test/srt
-          python3 run_suite.py --suite per-commit-2-ascend-npu
+          python3 run_suite.py --suite per-commit-2-ascend-npu --auto-partition-id ${{ matrix.part }} --auto-partition-size 3
   per-commit-4-ascend-npu:
     if: github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-ci')
@@ -106,22 +118,26 @@ jobs:
           CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
           sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
           pip config set global.index-url http://${CACHING_URL}/pypi/simple
-          pip config set global.trusted-host ${CACHING_URL}
-          bash scripts/ci/npu_ci_install_dependency.sh
+          pip config set global.extra-index-url "https://pypi.tuna.tsinghua.edu.cn/simple https://mirrors.aliyun.com/pypi/simple/"
+          pip config set global.trusted-host "${CACHING_URL} pypi.tuna.tsinghua.edu.cn mirrors.aliyun.com"
+          bash scripts/ci/npu_ci_install_dependency.sh 910b
           # copy required file from our daily cache
           cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
           # copy download through proxy
           curl -o /tmp/test.jsonl -L https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
       - name: Run test
-        timeout-minutes: 120
+        timeout-minutes: 60
         env:
           SGLANG_USE_MODELSCOPE: true
           SGLANG_IS_IN_CI: true
           HF_ENDPOINT: https://hf-mirror.com
           TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
+          PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
+          STREAMS_PER_DEVICE: 32
         run: |
+          export PATH="/usr/local/Ascend/8.3.RC1/compiler/bishengir/bin:${PATH}"
           cd test/srt
           python3 run_suite.py --suite per-commit-4-ascend-npu --timeout-per-file 3600
@@ -140,21 +156,25 @@ jobs:
           CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
           sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
           pip config set global.index-url http://${CACHING_URL}/pypi/simple
-          pip config set global.trusted-host ${CACHING_URL}
-          bash scripts/ci/npu_ci_install_dependency.sh
+          pip config set global.extra-index-url "https://pypi.tuna.tsinghua.edu.cn/simple https://mirrors.aliyun.com/pypi/simple/"
+          pip config set global.trusted-host "${CACHING_URL} pypi.tuna.tsinghua.edu.cn mirrors.aliyun.com"
+          bash scripts/ci/npu_ci_install_dependency.sh a3
           # copy required file from our daily cache
           cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
           # copy download through proxy
           curl -o /tmp/test.jsonl -L https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
       - name: Run test
-        timeout-minutes: 90
+        timeout-minutes: 60
         env:
           SGLANG_USE_MODELSCOPE: true
           SGLANG_IS_IN_CI: true
           HF_ENDPOINT: https://hf-mirror.com
           TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
+          PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
+          STREAMS_PER_DEVICE: 32
         run: |
+          export PATH="/usr/local/Ascend/8.3.RC1/compiler/bishengir/bin:${PATH}"
           cd test/srt
-          python3 run_suite.py --suite per-commit-16-ascend-a3 --timeout-per-file 5400
+          python3 run_suite.py --suite per-commit-16-ascend-a3 --timeout-per-file 3600
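The per-commit-2 job now fans its suite out over a three-way matrix via --auto-partition-id ${{ matrix.part }} --auto-partition-size 3. A minimal sketch of the sharding idea, assuming a simple round-robin split over the suite's test files (the real selection lives in run_suite.py and may balance by estimated runtime instead):

# Illustrative sketch of CI auto-partitioning; not the actual run_suite.py logic.
from typing import List


def auto_partition(files: List[str], partition_id: int, partition_size: int) -> List[str]:
    """Return the shard of test files this matrix part should run."""
    assert 0 <= partition_id < partition_size
    # Round-robin assignment keeps the shards close in size.
    return [f for i, f in enumerate(files) if i % partition_size == partition_id]


suite = ["test_a.py", "test_b.py", "test_c.py", "test_d.py"]
print(auto_partition(suite, partition_id=0, partition_size=3))  # ['test_a.py', 'test_d.py']

With fail-fast disabled, each of parts 0, 1, and 2 runs its shard to completion even if a sibling shard fails.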
.github/workflows/release-docker-npu-nightly.yml
@@ -73,6 +73,6 @@ jobs:
           push: ${{ github.repository == 'sgl-project/sglang' && github.event_name != 'pull_request' }}
           provenance: false
           build-args: |
-            SGLANG_KERNEL_NPU_TAG=20250913
+            SGLANG_KERNEL_NPU_TAG=20250926
             CANN_VERSION=${{ matrix.cann_version }}
             DEVICE_TYPE=${{ matrix.device_type }}
.github/workflows/release-docker-npu.yml
@@ -69,6 +69,6 @@ jobs:
           push: ${{ github.repository == 'sgl-project/sglang' && github.event_name != 'pull_request' }}
           provenance: false
           build-args: |
-            SGLANG_KERNEL_NPU_TAG=20250913
+            SGLANG_KERNEL_NPU_TAG=20250926
             CANN_VERSION=${{ matrix.cann_version }}
             DEVICE_TYPE=${{ matrix.device_type }}
docker/Dockerfile.npu
@@ -6,12 +6,13 @@ ARG PYTHON_VERSION=py3.11
 FROM quay.io/ascend/cann:$CANN_VERSION-$DEVICE_TYPE-$OS-$PYTHON_VERSION
 
 # Update pip & apt sources
+ARG DEVICE_TYPE
 ARG PIP_INDEX_URL="https://pypi.org/simple/"
 ARG APTMIRROR=""
 ARG MEMFABRIC_URL=https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/sglang/mf_adapter-1.0.0-cp311-cp311-linux_aarch64.whl
 ARG PYTORCH_VERSION=2.6.0
 ARG TORCHVISION_VERSION=0.21.0
-ARG PTA_URL="https://gitee.com/ascend/pytorch/releases/download/v7.1.0.1-pytorch2.6.0/torch_npu-2.6.0.post1-cp311-cp311-manylinux_2_28_aarch64.whl"
+ARG PTA_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/ops/torch_npu-2.6.0.post2%2Bgit95d6260-cp311-cp311-linux_aarch64.whl"
 ARG VLLM_TAG=v0.8.5
 ARG TRITON_ASCEND_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/sglang/triton_ascend-3.2.0%2Bgitb0ea0850-cp311-cp311-linux_aarch64.whl"
 ARG BISHENG_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/sglang/Ascend-BiSheng-toolkit_aarch64.run"
@@ -71,7 +72,7 @@ RUN git clone --depth 1 https://github.com/vllm-project/vllm.git --branch $VLLM_
 # TODO: install from pypi released triton-ascend
 RUN pip install torch==$PYTORCH_VERSION torchvision==$TORCHVISION_VERSION --index-url https://download.pytorch.org/whl/cpu --no-cache-dir \
-    && wget ${PTA_URL} && pip install "./torch_npu-2.6.0.post1-cp311-cp311-manylinux_2_28_aarch64.whl" --no-cache-dir \
+    && wget ${PTA_URL} && pip install "./torch_npu-2.6.0.post2+git95d6260-cp311-cp311-linux_aarch64.whl" --no-cache-dir \
     && python3 -m pip install --no-cache-dir attrs==24.2.0 numpy==1.26.4 scipy==1.13.1 decorator==5.1.1 psutil==6.0.0 pytest==8.3.2 pytest-xdist==3.6.1 pyyaml pybind11 \
     && pip install ${TRITON_ASCEND_URL} --no-cache-dir
@@ -92,6 +93,13 @@ RUN pip install wheel==0.45.1 && git clone --branch $SGLANG_KERNEL_NPU_TAG http
     && cd .. && rm -rf sgl-kernel-npu \
     && cd "$(pip show deep-ep | awk '/^Location:/ {print $2}')" && ln -s deep_ep/deep_ep_cpp*.so
 
+# Install CustomOps
+RUN wget https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/ops/CANN-custom_ops-8.2.0.0-$DEVICE_TYPE-linux.aarch64.run && \
+    chmod a+x ./CANN-custom_ops-8.2.0.0-$DEVICE_TYPE-linux.aarch64.run && \
+    ./CANN-custom_ops-8.2.0.0-$DEVICE_TYPE-linux.aarch64.run --quiet --install-path=/usr/local/Ascend/ascend-toolkit/latest/opp && \
+    wget https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/ops/custom_ops-1.0.$DEVICE_TYPE-cp311-cp311-linux_aarch64.whl && \
+    pip install ./custom_ops-1.0.$DEVICE_TYPE-cp311-cp311-linux_aarch64.whl
+
 # Install Bisheng
 RUN wget ${BISHENG_URL} && chmod a+x Ascend-BiSheng-toolkit_aarch64.run && ./Ascend-BiSheng-toolkit_aarch64.run --install && rm Ascend-BiSheng-toolkit_aarch64.run
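One detail worth noting: the + in the torch_npu wheel's local-version tag is percent-encoded as %2B inside PTA_URL, while the pip install path in the same file spells it literally. A quick standard-library check of the correspondence (the filenames are taken from the diff above):

from urllib.parse import unquote

url_basename = "torch_npu-2.6.0.post2%2Bgit95d6260-cp311-cp311-linux_aarch64.whl"
# %2B decodes to '+'; wget normally unescapes %XX when naming the local file,
# which is why the subsequent pip install path uses the literal '+'.
assert unquote(url_basename) == "torch_npu-2.6.0.post2+git95d6260-cp311-cp311-linux_aarch64.whl"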
python/sglang/srt/layers/attention/ascend_backend.py
@@ -356,6 +356,11 @@ class AscendAttnBackend(AttentionBackend):
         assert (
             layer.qk_head_dim != layer.v_head_dim
         ), "FIA only supports qk_head_dim != v_head_dim"
+        num_token_padding = q.shape[0]
+        q, k, v = [
+            data[: forward_batch.num_token_non_padded_cpu] for data in [q, k, v]
+        ]
         q_nope, q_rope = q.split([layer.v_head_dim, self.qk_rope_head_dim], dim=-1)
         k_nope, k_rope = k.split([layer.v_head_dim, self.qk_rope_head_dim], dim=-1)
@@ -375,6 +380,18 @@ class AscendAttnBackend(AttentionBackend):
             next_tokens=0,
         )
+        attn_output = attn_output.reshape(-1, layer.tp_q_head_num, layer.v_head_dim)
+        if num_token_padding != forward_batch.num_token_non_padded_cpu:
+            attn_output = torch.cat(
+                [
+                    attn_output,
+                    attn_output.new_zeros(
+                        num_token_padding - attn_output.shape[0],
+                        *attn_output.shape[1:],
+                    ),
+                ],
+                dim=0,
+            )
         return attn_output
 
     def forward_decode_graph(
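The added lines trim q, k, and v down to the batch's real token count before the fused attention call, then zero-pad the output back to the original padded length, so callers that captured fixed shapes (e.g. graph mode) still see a full-size tensor. A self-contained sketch of this trim-and-repad pattern, with a stand-in for the attention kernel and illustrative shapes:

import torch


def trim_and_repad(q: torch.Tensor, num_non_padded: int) -> torch.Tensor:
    """Run an op on the valid prefix only, then restore the padded length."""
    num_token_padding = q.shape[0]
    out = q[:num_non_padded] * 2  # stand-in for the real attention kernel
    if num_token_padding != num_non_padded:
        pad = out.new_zeros(num_token_padding - out.shape[0], *out.shape[1:])
        out = torch.cat([out, pad], dim=0)
    return out


x = torch.ones(8, 4)                    # 8 padded tokens, only 5 are real
y = trim_and_repad(x, num_non_padded=5)
assert y.shape == x.shape and torch.all(y[5:] == 0)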
python/sglang/srt/mem_cache/allocator_ascend.py
@@ -119,7 +119,7 @@ class AscendPagedTokenToKVPoolAllocator(PagedTokenToKVPoolAllocator):
         assert len(torch.unique(out_indices)) == len(out_indices)
         self.free_pages = self.free_pages[num_new_pages_item:]
-        return out_indices
+        return out_indices.int()
 
     def alloc_decode(
         self,
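The only functional change here is the .int() on the returned page indices, i.e. a cast to torch.int32; presumably the Ascend-side consumers of these indices expect 32-bit values rather than PyTorch's default int64 (an assumption on my part, the diff does not say why). The cast itself:

import torch

out_indices = torch.arange(4)                  # index tensors default to int64
assert out_indices.int().dtype == torch.int32  # .int() is shorthand for int32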
python/sglang/srt/mem_cache/common.py
@@ -347,11 +347,7 @@ def alloc_for_extend(
     else:
         # Paged allocation - build last_loc
         last_loc = [
-            (
-                t[-1:]
-                if len(t) > 0
-                else torch.tensor([-1], device=batch.device)
-            )
+            t[-1:] if len(t) > 0 else torch.tensor([-1], device=batch.tree_cache.device)
             for t in prefix_tensors
         ]
         out_cache_loc = alloc_paged_token_slots_extend(
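The comprehension collapses to one line and, more importantly, the -1 sentinel tensor now lands on batch.tree_cache.device rather than batch.device, keeping it on the same device as the other prefix locations. A self-contained sketch of the same construction (the prefix tensors here are made up):

import torch

device = torch.device("cpu")  # stands in for batch.tree_cache.device
prefix_tensors = [torch.tensor([3, 7]), torch.empty(0, dtype=torch.long)]

# One entry per request: the last cached slot, or -1 when there is no prefix.
last_loc = [
    t[-1:] if len(t) > 0 else torch.tensor([-1], device=device)
    for t in prefix_tensors
]
assert torch.cat(last_loc).tolist() == [7, -1]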
python/sglang/srt/model_executor/npu_graph_runner.py
@@ -22,7 +22,7 @@ from typing import TYPE_CHECKING, Optional, Union
 import numpy as np
 import torch
 
-from sglang.srt.configs.model_config import AttentionArch
+from sglang.srt.configs.model_config import AttentionArch, is_deepseek_nsa
 from sglang.srt.model_executor.cuda_graph_runner import CudaGraphRunner
 
 logger = logging.getLogger(__name__)
@@ -75,7 +75,7 @@ class NPUGraphRunner(CudaGraphRunner):
         self.positions[: self.raw_num_token].copy_(forward_batch.positions)
 
         # Replay
-        if self.model_runner.model_config.index_head_dim is None:
+        if not is_deepseek_nsa(self.model_runner.model_config.hf_config):
             seq_lens = forward_batch.seq_lens.cpu().tolist() + [0] * (
                 self.bs - self.raw_bs
             )
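Detection of the NSA (DeepSeek sparse attention) case now goes through is_deepseek_nsa(hf_config) instead of probing the derived index_head_dim field. I have not reproduced the real helper here; a plausible shape for such a predicate, assuming NSA configs expose index_head_dim, would be:

# Hypothetical sketch; the actual is_deepseek_nsa in
# sglang.srt.configs.model_config may inspect different config fields.
def is_deepseek_nsa(hf_config) -> bool:
    """True if the HF config describes a DeepSeek NSA (sparse attention) model."""
    return getattr(hf_config, "index_head_dim", None) is not None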
python/sglang/srt/models/deepseek_v2.py
@@ -1357,6 +1357,7 @@ class DeepseekV2AttentionMLA(nn.Module):
             inner_state = self.mla_preprocess.forward(
                 positions, hidden_states, forward_batch, zero_allocator
             )
+            inner_state = (*inner_state, None)  # add a position for topk_indices
         elif attn_forward_method == AttnForwardMethod.NPU_MLA_SPARSE:
             inner_state = self.forward_npu_sparse_prepare(
                 positions, hidden_states, forward_batch, zero_allocator
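Appending None pads the dense MLA path's state tuple to the same arity as the sparse path's (which carries topk_indices), so whatever unpacks inner_state downstream needs no branching. The trick in miniature (names here are illustrative, not sglang's):

def attend(q, kv, topk_indices):
    # Sparse path passes real indices; dense path passes None.
    return (q, kv) if topk_indices is None else (q, kv, topk_indices)

dense_state = ("q", "kv")
dense_state = (*dense_state, None)  # add a position for topk_indices
sparse_state = ("q", "kv", [0, 2])

for state in (dense_state, sparse_state):
    attend(*state)  # one calling convention for both paths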
python/sglang/srt/server_args.py
@@ -628,6 +628,16 @@ class ServerArgs:
                 self.chunked_prefill_size = 2048
             if self.cuda_graph_max_bs is None:
                 self.cuda_graph_max_bs = 8
+        elif is_npu() and gpu_mem < 32 * 1024:
+            # Atlas A2B4
+            # (chunked_prefill_size 32k, cuda_graph_max_bs 16 if tp < 4 else 64)
+            if self.chunked_prefill_size is None:
+                self.chunked_prefill_size = 32768
+            if self.cuda_graph_max_bs is None:
+                if self.tp_size < 4:
+                    self.cuda_graph_max_bs = 16
+                else:
+                    self.cuda_graph_max_bs = 64
         elif gpu_mem < 35 * 1024:
             # A10, 4090, 5090
             # (chunked_prefill_size 2k, cuda_graph_max_bs 16 if tp < 4 else 80)
@@ -651,6 +661,16 @@ class ServerArgs:
                     self.cuda_graph_max_bs = 32
                 else:
                     self.cuda_graph_max_bs = 160
+        elif is_npu() and gpu_mem < 64 * 1024:
+            # Atlas A2 and Atlas A3
+            # (chunked_prefill_size 32k, cuda_graph_max_bs 64 if tp < 4 else 128)
+            if self.chunked_prefill_size is None:
+                self.chunked_prefill_size = 32768
+            if self.cuda_graph_max_bs is None:
+                if self.tp_size < 4:
+                    self.cuda_graph_max_bs = 64
+                else:
+                    self.cuda_graph_max_bs = 128
         elif gpu_mem < 90 * 1024:
             # H100, A100
             # (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)
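The two new branches give NPUs memory-tiered defaults of their own: below 32 GiB (Atlas A2B4) and below 64 GiB (Atlas A2/A3), both with a 32k chunked prefill and a graph batch size that steps up when tp_size reaches 4. A condensed restatement of just the NPU tiers (illustrative helper; gpu_mem is in MiB, as in the surrounding code):

def npu_defaults(gpu_mem: float, tp_size: int) -> tuple:
    """(chunked_prefill_size, cuda_graph_max_bs) per the tiers added here."""
    if gpu_mem < 32 * 1024:   # Atlas A2B4
        return 32768, 16 if tp_size < 4 else 64
    if gpu_mem < 64 * 1024:   # Atlas A2 and Atlas A3
        return 32768, 64 if tp_size < 4 else 128
    raise ValueError("larger devices fall through to the generic GPU tiers")

assert npu_defaults(30 * 1024, tp_size=8) == (32768, 64)
assert npu_defaults(60 * 1024, tp_size=2) == (32768, 64)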
scripts/ci/npu_ci_install_dependency.sh
@@ -2,6 +2,7 @@
 set -euo pipefail
 
 PIP_INSTALL="pip install --no-cache-dir"
+DEVICE_TYPE=$1
 
 # Install the required dependencies in CI.
@@ -39,8 +40,8 @@ TORCHVISION_VERSION=0.21.0
 ${PIP_INSTALL} torch==$PYTORCH_VERSION torchvision==$TORCHVISION_VERSION --index-url https://download.pytorch.org/whl/cpu
 
 PTA_VERSION="v7.1.0.1-pytorch2.6.0"
-PTA_NAME="torch_npu-2.6.0.post1-cp311-cp311-manylinux_2_28_aarch64.whl"
-PTA_URL="https://gitee.com/ascend/pytorch/releases/download/${PTA_VERSION}/${PTA_NAME}"
+PTA_NAME="torch_npu-2.6.0.post2+git95d6260-cp311-cp311-linux_aarch64.whl"
+PTA_URL="https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/ops/torch_npu-2.6.0.post2%2Bgit95d6260-cp311-cp311-linux_aarch64.whl"
 wget -O "${PTA_NAME}" "${PTA_URL}" && ${PIP_INSTALL} "./${PTA_NAME}"
@@ -58,11 +59,20 @@ wget -O "${BISHENG_NAME}" "${BISHENG_URL}" && chmod a+x "${BISHENG_NAME}" && "./
 ### Install sgl-kernel-npu
-SGL_KERNEL_NPU_TAG="20250913"
+SGL_KERNEL_NPU_TAG="20250926"
 git clone --depth 1 https://github.com/sgl-project/sgl-kernel-npu.git --branch ${SGL_KERNEL_NPU_TAG}
+# pin wheel to 0.45.1 ref: https://github.com/pypa/wheel/issues/662
+pip install wheel==0.45.1
 (cd sgl-kernel-npu && bash ./build.sh && pip install output/deep_ep*.whl output/sgl_kernel_npu*.whl && cd "$(pip show deep-ep | grep -E '^Location:' | awk '{print $2}')" && ln -s deep_ep/deep_ep_cpp*.so)
 
+### Install CustomOps (TODO: to be removed once merged into sgl-kernel-npu)
+wget https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/ops/CANN-custom_ops-8.2.0.0-$DEVICE_TYPE-linux.aarch64.run
+chmod a+x ./CANN-custom_ops-8.2.0.0-$DEVICE_TYPE-linux.aarch64.run
+./CANN-custom_ops-8.2.0.0-$DEVICE_TYPE-linux.aarch64.run --quiet --install-path=/usr/local/Ascend/ascend-toolkit/latest/opp
+wget https://sglang-ascend.obs.cn-east-3.myhuaweicloud.com/ops/custom_ops-1.0.$DEVICE_TYPE-cp311-cp311-linux_aarch64.whl
+pip install ./custom_ops-1.0.$DEVICE_TYPE-cp311-cp311-linux_aarch64.whl
+
 ### Install SGLang
 rm -rf python/pyproject.toml && mv python/pyproject_other.toml python/pyproject.toml
 ${PIP_INSTALL} -v -e "python[srt_npu]"
test/srt/ascend/test_ascend_deepep.py
@@ -38,13 +38,10 @@ class TestAscendDeepEP(CustomTestCase):
             "--quantization",
             "w8a8_int8",
             "--mem-fraction-static",
-            0.9,
-            "--max-running-requests",
-            32,
+            0.8,
             "--disable-radix-cache",
             "--chunked-prefill-size",
             32768,
-            "--disable-cuda-graph",
             "--tp-size",
             16,
             "--dp-size",
@@ -58,7 +55,7 @@ class TestAscendDeepEP(CustomTestCase):
         ]
         cls.extra_envs = {
-            "HCCL_BUFFSIZE": "500",
+            "HCCL_BUFFSIZE": "1000",
             "SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK": "32",
             "SGLANG_NPU_USE_MLAPO": "1",
         }