Project: change / sglang
Commit: a39d9287 (unverified)

support qwen2 running on ascend npu device (#7022)

Authored by Yijie Zhu on Jun 18, 2025
Committed by GitHub on Jun 17, 2025
Co-authored-by: 刁莹煜 <diaoyingyu1@hisilicon.com>
Parent: 10d60cd4
Changes: 9 files changed, 63 insertions(+), 16 deletions(-)
Files changed:

docs/backend/native_api.ipynb (+4, -0)
python/sglang/srt/_custom_ops.py (+2, -2)
python/sglang/srt/layers/activation.py (+3, -2)
python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py (+3, -2)
python/sglang/srt/layers/layernorm.py (+3, -2)
python/sglang/srt/layers/quantization/fp8.py (+3, -1)
python/sglang/srt/layers/quantization/utils.py (+3, -2)
python/sglang/srt/layers/rotary_embedding.py (+3, -2)
python/sglang/srt/utils.py (+39, -3)
docs/backend/native_api.ipynb (+4, -0)

@@ -51,6 +51,10 @@
     "server_process, port = launch_server_cmd(\n",
     "    \"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0\"\n",
     ")\n",
+    "## To run qwen2.5-0.5b-instruct model on the Ascend-Npu, you can execute the following command:\n",
+    "# server_process, port = launch_server_cmd(\n",
+    "#     \"python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0 --device npu --tp 2 --attention-backend torch_native\"\n",
+    "# )\n",
     "\n",
     "wait_for_server(f\"http://localhost:{port}\")"
 ]
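Once the server is up (the commented-out NPU launch works the same way as the default one), the notebook exercises the native API over HTTP. A minimal sketch of such a request, assuming the server launched above is listening on localhost:{port} and accepts the standard /generate payload shape:

import requests

# Sketch only: `port` comes from launch_server_cmd(...) in the notebook cell above.
response = requests.post(
    f"http://localhost:{port}/generate",
    json={
        "text": "The capital of France is",
        "sampling_params": {"temperature": 0, "max_new_tokens": 32},
    },
)
print(response.json())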
python/sglang/srt/_custom_ops.py (+2, -2)

@@ -4,7 +4,7 @@ from typing import List, Tuple
 import torch
 
-from sglang.srt.utils import get_bool_env_var, is_hip, is_hpu
+from sglang.srt.utils import get_bool_env_var, is_hip, is_hpu, is_npu
 
 logger = logging.getLogger(__name__)
 
 use_vllm_custom_allreduce = get_bool_env_var(
@@ -25,7 +25,7 @@ if not is_hpu():
         logger.warning("Failed to import from custom_ar with %r", e)
 
-if not is_hip():
+if not is_hip() and not is_npu():
     if use_vllm_custom_allreduce:
         custom_op = torch.ops._C_custom_ar
     else:
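The is_npu() helper that this file (and each file below) now imports is defined in sglang/srt/utils.py and is not part of this diff. A plausible sketch, assuming it mirrors the hasattr(torch, "npu") check that get_device() uses later in this commit:

import torch

def is_npu() -> bool:
    # True when the torch_npu plugin has registered torch.npu
    # and at least one Ascend NPU device is visible.
    return hasattr(torch, "npu") and torch.npu.is_available()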
python/sglang/srt/layers/activation.py (+3, -2)

@@ -29,10 +29,11 @@ from sglang.srt.distributed import (
     get_tensor_model_parallel_world_size,
 )
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
-from sglang.srt.utils import is_cuda, set_weight_attrs
+from sglang.srt.utils import is_cuda, is_npu, set_weight_attrs
 from sglang.utils import resolve_obj_by_qualname
 
 _is_cuda = is_cuda()
+_is_npu = is_npu()
 
 if _is_cuda:
     from sgl_kernel import gelu_and_mul, gelu_tanh_and_mul, silu_and_mul
@@ -184,7 +185,7 @@ def get_cross_encoder_activation_function(config: PretrainedConfig):
     return nn.Identity()
 
-if not _is_cuda:
+if not _is_cuda and not _is_npu:
     logger.info(
         "sgl-kernel is not available on Non-NV platforms. Fallback to other kernel libraries."
     )
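The gate above decides whether fused activation kernels come from sgl_kernel or a fallback library. For intuition, the fused silu_and_mul op reduces to a few torch-native lines; this sketch shows the semantics only, not the exact code path sglang takes on NPU:

import torch
import torch.nn.functional as F

def silu_and_mul_ref(x: torch.Tensor) -> torch.Tensor:
    # x packs the [gate, up] halves along the last dimension;
    # the fused op computes SiLU(gate) * up.
    gate, up = x.chunk(2, dim=-1)
    return F.silu(gate) * up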
python/sglang/srt/layers/layernorm.py (+3, -2)

@@ -20,10 +20,11 @@ import torch
 import torch.nn as nn
 
 from sglang.srt.custom_op import CustomOp
-from sglang.srt.utils import get_bool_env_var, is_cuda, is_hip
+from sglang.srt.utils import get_bool_env_var, is_cuda, is_hip, is_npu
 
 _is_cuda = is_cuda()
 _is_hip = is_hip()
+_is_npu = is_npu()
 _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 
 if _is_cuda:
@@ -187,7 +188,7 @@ class Gemma3RMSNorm(nn.Module):
         return f"{tuple(self.weight.shape)}, eps={self.eps}"
 
-if not (_is_cuda or _is_hip):
+if not (_is_cuda or _is_hip or _is_npu):
     logger.info(
         "sgl-kernel layernorm implementation is not available on current platform. Fallback to other kernel libraries."
     )
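With this change, NPU joins CUDA and HIP in suppressing the fallback info message. For reference, the RMSNorm these kernels accelerate is only a few tensor ops; a minimal sketch of the math, not sglang's actual fallback code:

import torch

def rms_norm_ref(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # y = x / sqrt(mean(x^2) + eps) * weight, computed in float32 for stability.
    variance = x.float().pow(2).mean(dim=-1, keepdim=True)
    return (x.float() * torch.rsqrt(variance + eps) * weight.float()).to(x.dtype)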
python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py (+3, -2)

@@ -17,11 +17,12 @@ from sglang.srt.layers.quantization.utils import (
     per_tensor_dequantize,
     replace_parameter,
 )
-from sglang.srt.utils import is_cuda, set_weight_attrs
+from sglang.srt.utils import is_cuda, is_npu, set_weight_attrs
 
 _is_cuda = is_cuda()
+_is_npu = is_npu()
 
-if not _is_cuda:
+if not _is_cuda and not _is_npu:
     from vllm import _custom_ops as vllm_ops
     from vllm._custom_ops import scaled_fp8_quant
python/sglang/srt/layers/quantization/fp8.py (+3, -1)

@@ -67,6 +67,7 @@ from sglang.srt.utils import (
     get_bool_env_var,
     is_cuda,
     is_hip,
+    is_npu,
     log_info_on_rank0,
     print_warning_once,
     set_weight_attrs,
@@ -74,6 +75,7 @@ from sglang.srt.utils import (
 _is_hip = is_hip()
 _is_cuda = is_cuda()
+_is_npu = is_npu()
 _is_fp8_fnuz = is_fp8_fnuz()
@@ -86,7 +88,7 @@ if _is_hip:
     from aiter.fused_moe_bf16_asm import asm_moe, ck_moe_2stages
     from aiter.ops.shuffle import shuffle_weight
 
-if not _is_cuda:
+if not _is_cuda and not _is_npu:
     from vllm._custom_ops import scaled_fp8_quant
python/sglang/srt/layers/quantization/utils.py (+3, -2)

@@ -6,11 +6,12 @@ from typing import List, Mapping, Tuple, Union
 import torch
 
 from sglang.srt.layers.quantization.fp8_kernel import scaled_fp8_quant
-from sglang.srt.utils import is_cuda
+from sglang.srt.utils import is_cuda, is_npu
 
 _is_cuda = is_cuda()
+_is_npu = is_npu()
 
-if not _is_cuda:
+if not _is_cuda and not _is_npu:
     from vllm._custom_ops import scaled_fp8_quant
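This file, fp8.py, and compressed_tensors_moe.py all stop importing vllm's scaled_fp8_quant on NPU; note that sglang's own version is already imported here from fp8_kernel. For intuition, per-tensor fp8 quantization boils down to the following reference math; a sketch, not either library's kernel:

import torch

def scaled_fp8_quant_ref(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    # Choose a scale so the largest |value| maps to the fp8 maximum, then cast.
    finfo = torch.finfo(torch.float8_e4m3fn)
    scale = x.abs().max().clamp(min=1e-12) / finfo.max
    q = (x / scale).clamp(finfo.min, finfo.max).to(torch.float8_e4m3fn)
    return q, scale  # dequantize with q.float() * scale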
python/sglang/srt/layers/rotary_embedding.py (+3, -2)

@@ -8,10 +8,11 @@ import torch
 import torch.nn as nn
 
 from sglang.srt.custom_op import CustomOp
-from sglang.srt.utils import is_cuda, is_hip
+from sglang.srt.utils import is_cuda, is_hip, is_npu
 
 _is_cuda = is_cuda()
 _is_hip = is_hip()
+_is_npu = is_npu()
 
 if _is_cuda:
     from sgl_kernel import apply_rope_with_cos_sin_cache_inplace
@@ -84,7 +85,7 @@ class RotaryEmbedding(CustomOp):
         if not _is_cuda:
             cache = cache.to(dtype)
 
-        if not _is_cuda or self.head_size not in [64, 128, 256, 512]:
+        if not (_is_cuda or _is_npu) or self.head_size not in [64, 128, 256, 512]:
            from vllm._custom_ops import rotary_embedding
 
            self.vllm_rotary_embedding = rotary_embedding
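The widened condition keeps NPU, like CUDA, away from vllm's rotary_embedding op for the supported head sizes. As a reminder of what that op computes, here is a torch-native sketch of a rotary position embedding in the rotate-half (NeoX-style) layout; real kernels differ in layout and in-place semantics:

import torch

def apply_rope_ref(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
    # x: [..., head_size]; cos/sin broadcast to [..., head_size // 2].
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((x1 * cos - x2 * sin, x2 * cos + x1 * sin), dim=-1)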
python/sglang/srt/utils.py (+39, -3)

@@ -1291,6 +1291,15 @@ def get_hpu_memory_capacity():
     )
 
 
+def get_npu_memory_capacity():
+    try:
+        import torch_npu
+
+        return torch.npu.mem_get_info()[1] // 1024 // 1024  # unit: MB
+    except ImportError as e:
+        raise ImportError("torch_npu is required when run on npu device.")
+
+
 def get_device_memory_capacity(device: str = None):
     if is_cuda():
         gpu_mem = get_nvgpu_memory_capacity()
@@ -1298,6 +1307,8 @@ def get_device_memory_capacity(device: str = None):
         gpu_mem = get_amdgpu_memory_capacity()
     elif device == "hpu":
         gpu_mem = get_hpu_memory_capacity()
+    elif device == "npu":
+        gpu_mem = get_npu_memory_capacity()
     else:
         # GPU memory is not known yet or no GPU is available.
         gpu_mem = None
@@ -1423,6 +1434,11 @@ def get_device(device_id: Optional[int] = None) -> str:
             return "xpu"
         return "xpu:{}".format(device_id)
 
+    if hasattr(torch, "npu") and torch.npu.is_available():
+        if device_id == None:
+            return "npu"
+        return "npu:{}".format(device_id)
+
     if is_habana_available():
         try:
             import habana_frameworks.torch.hpu
@@ -1497,15 +1513,35 @@ def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
     return major, minor
 
 
+def get_npu_compiler_config():
+    config = {
+        "frozen_parameter": True,
+        "tiling_schedule_optimize": True,
+        "topology_sorting_strategy": "StableRDFS",
+    }
+    return config
+
+
 def get_compiler_backend() -> str:
     if hasattr(torch, "hpu") and torch.hpu.is_available():
         return "hpu_backend"
 
     if hasattr(torch, "npu") and torch.npu.is_available():
-        import torchair
+        try:
+            import torchair
+            import torchair.ge_concrete_graph.ge_converter.experimental.patch_for_hcom_allreduce
+            from torchair.configs.compiler_config import CompilerConfig
+        except ImportError as e:
+            raise ImportError(
+                "NPU detected, but torchair package is not installed. "
+                "Please install torchair for torch.compile support on NPU."
+            )
 
-        config = torchair.CompilerConfig()
-        npu_backend = torchair.get_npu_backend(compiler_config=config)
+        compiler_config = CompilerConfig()
+        predefined_config = get_npu_compiler_config()
+        for k, v in predefined_config.items():
+            setattr(compiler_config.experimental_config, k, v)
+
+        npu_backend = torchair.get_npu_backend(compiler_config=compiler_config)
         return npu_backend
 
     return "inductor"
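Despite the -> str annotation, get_compiler_backend() returns a torchair backend callable on NPU; torch.compile accepts either a backend name or a callable. A usage sketch, with a placeholder module standing in for the model runner sglang compiles internally:

import torch
import torch.nn as nn
from sglang.srt.utils import get_compiler_backend

model = nn.Linear(8, 8)  # placeholder; sglang compiles its own model runner
backend = get_compiler_backend()  # "inductor", "hpu_backend", or a torchair backend
compiled_model = torch.compile(model, backend=backend)
out = compiled_model(torch.randn(2, 8))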