Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
c550ab91
Unverified
Commit
c550ab91
authored
Nov 02, 2025
by
Zhihao Lyu
Committed by
GitHub
Nov 01, 2025
Browse files
[Ascend] Add Ascend NPU support for sglang.check_env & rework proposal (#11052)
Co-authored-by:
ronnie_zheng
<
zl19940307@163.com
>
parent
086f0b79
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
286 additions
and
166 deletions
+286
-166
python/sglang/check_env.py
python/sglang/check_env.py
+286
-166
No files found.
python/sglang/check_env.py
View file @
c550ab91
...
@@ -5,11 +5,12 @@ import os
...
@@ -5,11 +5,12 @@ import os
import
resource
import
resource
import
subprocess
import
subprocess
import
sys
import
sys
from
abc
import
abstractmethod
from
collections
import
OrderedDict
,
defaultdict
from
collections
import
OrderedDict
,
defaultdict
import
torch
import
torch
from
sglang.srt.utils
import
is_hip
from
sglang.srt.utils
import
is_hip
,
is_npu
def
is_cuda_v2
():
def
is_cuda_v2
():
...
@@ -51,104 +52,124 @@ PACKAGE_LIST = [
...
@@ -51,104 +52,124 @@ PACKAGE_LIST = [
]
]
def
get_package_versions
(
packages
):
class
BaseEnv
:
"""
"""Base class for environment check"""
Get versions of specified packages.
"""
def
__init__
(
self
):
versions
=
{}
self
.
package_list
=
PACKAGE_LIST
for
package
in
packages
:
package_name
=
package
.
split
(
"=="
)[
0
].
split
(
">="
)[
0
].
split
(
"<="
)[
0
]
@
abstractmethod
try
:
def
get_info
(
self
)
->
dict
:
version
=
importlib
.
metadata
.
version
(
package_name
)
"""
versions
[
package_name
]
=
version
Get CUDA-related information if available.
except
ModuleNotFoundError
:
"""
versions
[
package_name
]
=
"Module Not Found"
raise
NotImplementedError
return
versions
@
abstractmethod
def
get_topology
(
self
)
->
dict
:
raise
NotImplementedError
def get_package_versions(self) -> dict:
    """
    Look up the installed version of every entry in ``self.package_list``.

    An entry may carry a ``==``, ``>=`` or ``<=`` pin; only the text before
    the first such operator is used as the distribution name. Distributions
    that cannot be found are reported as "Module Not Found".
    """
    found = {}
    for requirement in self.package_list:
        # Strip the version pin, one operator at a time.
        dist_name = requirement
        for operator in ("==", ">=", "<="):
            dist_name = dist_name.partition(operator)[0]
        try:
            found[dist_name] = importlib.metadata.version(dist_name)
        except ModuleNotFoundError:
            found[dist_name] = "Module Not Found"
    return found
def get_device_info(self):
    """
    Summarize the devices visible through ``torch.cuda``.

    Returns a dict with one "GPU <ids>" entry per distinct device name and
    one "GPU <ids> Compute Capability" entry per distinct "major.minor"
    capability, where <ids> is the comma-joined list of device indices that
    share that value.
    """
    ids_by_name = defaultdict(list)
    ids_by_capability = defaultdict(list)
    for index in range(torch.cuda.device_count()):
        device_id = str(index)
        ids_by_name[torch.cuda.get_device_name(index)].append(device_id)
        cap = torch.cuda.get_device_capability(index)
        ids_by_capability[f"{cap[0]}.{cap[1]}"].append(device_id)

    info = {f"GPU {','.join(ids)}": name for name, ids in ids_by_name.items()}
    # One entry per distinct capability; identical hardware yields one entry.
    info.update(
        (f"GPU {','.join(ids)} Compute Capability", capability)
        for capability, ids in ids_by_capability.items()
    )
    return info
def
get_cuda_info
():
def
get_hypervisor_vendor
(
self
)
->
dict
:
"""
try
:
Get CUDA-related information if available.
output
=
subprocess
.
check_output
([
"lscpu"
],
text
=
True
)
"""
for
line
in
output
.
split
(
"
\n
"
):
if
is_cuda_v2
():
if
"Hypervisor vendor:"
in
line
:
return
{
"Hypervisor vendor:"
:
line
.
split
(
":"
)[
1
].
strip
()}
return
{}
except
:
return
{}
def get_ulimit_soft(self) -> dict:
    """Report the soft limit on open file descriptors (RLIMIT_NOFILE)."""
    limits = resource.getrlimit(resource.RLIMIT_NOFILE)
    # getrlimit returns (soft, hard); only the soft limit is reported.
    return {"ulimit soft": limits[0]}
def check_env(self):
    """
    Collect environment information and print it as "key: value" lines.

    Sections are gathered in this order: Python version, device info from
    ``get_info``, PyTorch version, package versions, topology, hypervisor
    vendor, and the soft open-file limit.
    """
    report = OrderedDict()
    report["Python"] = sys.version.replace("\n", "")
    report.update(self.get_info())
    report["PyTorch"] = torch.__version__
    for collect in (
        self.get_package_versions,
        self.get_topology,
        self.get_hypervisor_vendor,
        self.get_ulimit_soft,
    ):
        report.update(collect())

    for key, value in report.items():
        print(f"{key}: {value}")
class
GPUEnv
(
BaseEnv
):
"""Environment checker for Nvidia GPU"""
def
get_info
(
self
):
cuda_info
=
{
"CUDA available"
:
torch
.
cuda
.
is_available
()}
cuda_info
=
{
"CUDA available"
:
torch
.
cuda
.
is_available
()}
if
cuda_info
[
"CUDA available"
]:
if
cuda_info
[
"CUDA available"
]:
cuda_info
.
update
(
_get_gpu_info
())
cuda_info
.
update
(
self
.
get_device_info
())
cuda_info
.
update
(
_get_cuda_version_info
())
cuda_info
.
update
(
self
.
_get_cuda_version_info
())
return
cuda_info
elif
is_hip
():
cuda_info
=
{
"ROCM available"
:
torch
.
cuda
.
is_available
()}
if
cuda_info
[
"ROCM available"
]:
cuda_info
.
update
(
_get_gpu_info
())
cuda_info
.
update
(
_get_cuda_version_info
())
return
cuda_info
return
cuda_info
def
_get_cuda_version_info
(
self
):
def
_get_gpu_info
():
"""
"""
Get CUDA version information.
Get information about available GPUs.
"""
"""
devices
=
defaultdict
(
list
)
capabilities
=
defaultdict
(
list
)
for
k
in
range
(
torch
.
cuda
.
device_count
()):
devices
[
torch
.
cuda
.
get_device_name
(
k
)].
append
(
str
(
k
))
capability
=
torch
.
cuda
.
get_device_capability
(
k
)
capabilities
[
f
"
{
capability
[
0
]
}
.
{
capability
[
1
]
}
"
].
append
(
str
(
k
))
gpu_info
=
{}
for
name
,
device_ids
in
devices
.
items
():
gpu_info
[
f
"GPU
{
','
.
join
(
device_ids
)
}
"
]
=
name
if
len
(
capabilities
)
==
1
:
# All GPUs have the same compute capability
cap
,
gpu_ids
=
list
(
capabilities
.
items
())[
0
]
gpu_info
[
f
"GPU
{
','
.
join
(
gpu_ids
)
}
Compute Capability"
]
=
cap
else
:
# GPUs have different compute capabilities
for
cap
,
gpu_ids
in
capabilities
.
items
():
gpu_info
[
f
"GPU
{
','
.
join
(
gpu_ids
)
}
Compute Capability"
]
=
cap
return
gpu_info
def
_get_cuda_version_info
():
"""
Get CUDA version information.
"""
if
is_cuda_v2
():
from
torch.utils.cpp_extension
import
CUDA_HOME
from
torch.utils.cpp_extension
import
CUDA_HOME
cuda_info
=
{
"CUDA_HOME"
:
CUDA_HOME
}
cuda_info
=
{
"CUDA_HOME"
:
CUDA_HOME
}
if
CUDA_HOME
and
os
.
path
.
isdir
(
CUDA_HOME
):
if
CUDA_HOME
and
os
.
path
.
isdir
(
CUDA_HOME
):
cuda_info
.
update
(
_get_nvcc_info
())
cuda_info
.
update
(
self
.
_get_nvcc_info
())
cuda_info
.
update
(
_get_cuda_driver_version
())
cuda_info
.
update
(
self
.
_get_cuda_driver_version
())
return
cuda_info
return
cuda_info
elif
is_hip
():
from
torch.utils.cpp_extension
import
ROCM_HOME
as
ROCM_HOME
cuda_info
=
{
"ROCM_HOME"
:
ROCM_HOME
}
if
ROCM_HOME
and
os
.
path
.
isdir
(
ROCM_HOME
):
def
_get_nvcc_info
(
self
):
cuda_info
.
update
(
_get_nvcc_info
())
"""
cuda_info
.
update
(
_get_cuda_driver_version
())
Get NVCC version information.
"""
return
cuda_info
else
:
cuda_info
=
{
"CUDA_HOME"
:
""
}
return
cuda_info
def
_get_nvcc_info
():
"""
Get NVCC version information.
"""
if
is_cuda_v2
():
from
torch.utils.cpp_extension
import
CUDA_HOME
from
torch.utils.cpp_extension
import
CUDA_HOME
try
:
try
:
...
@@ -167,7 +188,73 @@ def _get_nvcc_info():
...
@@ -167,7 +188,73 @@ def _get_nvcc_info():
}
}
except
subprocess
.
SubprocessError
:
except
subprocess
.
SubprocessError
:
return
{
"NVCC"
:
"Not Available"
}
return
{
"NVCC"
:
"Not Available"
}
elif
is_hip
():
def
_get_cuda_driver_version
(
self
):
"""
Get CUDA driver version.
"""
versions
=
set
()
try
:
output
=
subprocess
.
check_output
(
[
"nvidia-smi"
,
"--query-gpu=driver_version"
,
"--format=csv,noheader,nounits"
,
]
)
versions
=
set
(
output
.
decode
().
strip
().
split
(
"
\n
"
))
if
len
(
versions
)
==
1
:
return
{
"CUDA Driver Version"
:
versions
.
pop
()}
else
:
return
{
"CUDA Driver Versions"
:
", "
.
join
(
sorted
(
versions
))}
except
subprocess
.
SubprocessError
:
return
{
"CUDA Driver Version"
:
"Not Available"
}
def get_topology(self):
    """
    Get GPU topology information.

    Runs ``nvidia-smi topo -m``.

    Returns:
        {"NVIDIA Topology": text} where text is the command's stdout
        prefixed with a newline, or an empty dict when the command fails
        or ``nvidia-smi`` cannot be launched.
    """
    try:
        result = subprocess.run(
            ["nvidia-smi", "topo", "-m"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=True,
        )
    except (subprocess.SubprocessError, OSError):
        # OSError covers a missing nvidia-smi binary (FileNotFoundError).
        return {}
    # check=True already raised on a non-zero exit, so stdout is usable here.
    return {"NVIDIA Topology": "\n" + result.stdout}
class
HIPEnv
(
BaseEnv
):
"""Environment checker for ROCm/HIP"""
def get_info(self):
    """
    Report ROCm availability and, when available, device and ROCm details.

    Availability is read from ``torch.cuda.is_available()``; the extra
    details come from ``get_device_info`` and ``_get_cuda_version_info``.
    """
    rocm_available = torch.cuda.is_available()
    info = {"ROCM available": rocm_available}
    if not rocm_available:
        return info
    info.update(self.get_device_info())
    info.update(self._get_cuda_version_info())
    return info
def
_get_cuda_version_info
(
self
):
from
torch.utils.cpp_extension
import
ROCM_HOME
as
ROCM_HOME
cuda_info
=
{
"ROCM_HOME"
:
ROCM_HOME
}
if
ROCM_HOME
and
os
.
path
.
isdir
(
ROCM_HOME
):
cuda_info
.
update
(
self
.
_get_hipcc_info
())
cuda_info
.
update
(
self
.
_get_rocm_driver_version
())
return
cuda_info
def
_get_hipcc_info
(
self
):
from
torch.utils.cpp_extension
import
ROCM_HOME
from
torch.utils.cpp_extension
import
ROCM_HOME
try
:
try
:
...
@@ -184,32 +271,8 @@ def _get_nvcc_info():
...
@@ -184,32 +271,8 @@ def _get_nvcc_info():
}
}
except
subprocess
.
SubprocessError
:
except
subprocess
.
SubprocessError
:
return
{
"HIPCC"
:
"Not Available"
}
return
{
"HIPCC"
:
"Not Available"
}
else
:
return
{
"NVCC"
:
"Not Available"
}
def
_get_cuda_driver_version
():
def
_get_rocm_driver_version
(
self
):
"""
Get CUDA driver version.
"""
versions
=
set
()
if
is_cuda_v2
():
try
:
output
=
subprocess
.
check_output
(
[
"nvidia-smi"
,
"--query-gpu=driver_version"
,
"--format=csv,noheader,nounits"
,
]
)
versions
=
set
(
output
.
decode
().
strip
().
split
(
"
\n
"
))
if
len
(
versions
)
==
1
:
return
{
"CUDA Driver Version"
:
versions
.
pop
()}
else
:
return
{
"CUDA Driver Versions"
:
", "
.
join
(
sorted
(
versions
))}
except
subprocess
.
SubprocessError
:
return
{
"CUDA Driver Version"
:
"Not Available"
}
elif
is_hip
():
try
:
try
:
output
=
subprocess
.
check_output
(
output
=
subprocess
.
check_output
(
[
[
...
@@ -226,80 +289,137 @@ def _get_cuda_driver_version():
...
@@ -226,80 +289,137 @@ def _get_cuda_driver_version():
return
{
"ROCM Driver Version"
:
ver
}
return
{
"ROCM Driver Version"
:
ver
}
except
subprocess
.
SubprocessError
:
except
subprocess
.
SubprocessError
:
return
{
"ROCM Driver Version"
:
"Not Available"
}
return
{
"ROCM Driver Version"
:
"Not Available"
}
else
:
return
{
"CUDA Driver Version"
:
"Not Available"
}
def
get_gpu_topology
():
def
get_topology
(
self
):
"""
Get GPU topology information.
"""
if
is_cuda_v2
():
try
:
try
:
result
=
subprocess
.
run
(
result
=
subprocess
.
run
(
[
"
nvidia
-smi"
,
"
topo"
,
"-m
"
],
[
"
rocm
-smi"
,
"
--showtopotype
"
],
stdout
=
subprocess
.
PIPE
,
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
,
text
=
True
,
text
=
True
,
check
=
True
,
check
=
True
,
)
)
return
"
\n
"
+
result
.
stdout
if
result
.
returncode
==
0
else
None
return
{
"AMD Topology"
:
"
\n
"
+
result
.
stdout
if
result
.
returncode
==
0
else
None
}
except
subprocess
.
SubprocessError
:
except
subprocess
.
SubprocessError
:
return
None
return
{}
elif
is_hip
():
class
NPUEnv
(
BaseEnv
):
"""Environment checker for Ascend NPU"""
def __init__(self):
    """Initialize the checker and put the NPU packages first in the list."""
    super().__init__()
    # NPU-specific distributions are reported before the common ones.
    self.package_list = ["torch_npu", "sgl-kernel-npu", *self.package_list]
def get_info(self):
    """
    Report NPU availability and, when available, device and CANN details.

    Availability is read from ``torch.npu.is_available()``; the extra
    details come from ``get_device_info`` and ``_get_cann_version_info``.
    """
    npu_available = torch.npu.is_available()
    info = {"NPU available": npu_available}
    if not npu_available:
        return info
    info.update(self.get_device_info())
    info.update(self._get_cann_version_info())
    return info
def get_device_info(self):
    """
    Get information about available NPUs.

    Overridden because the torch_npu interface differs from the CUDA one.
    Returns one "NPU <ids>" entry per distinct device name, where <ids> is
    the comma-joined list of device indices with that name.
    """
    ids_by_name = defaultdict(list)
    for index in range(torch.npu.device_count()):
        ids_by_name[torch.npu.get_device_name(index)].append(str(index))
    return {f"NPU {','.join(ids)}": name for name, ids in ids_by_name.items()}
def
_get_cann_version_info
(
self
):
cann_envs
=
[
"ASCEND_TOOLKIT_HOME"
,
"ASCEND_INSTALL_PATH"
]
for
var
in
cann_envs
:
path
=
os
.
environ
.
get
(
var
)
if
path
and
os
.
path
.
exists
(
path
):
CANN_HOME
=
path
break
else
:
default_path
=
"/usr/local/Ascend/ascend-toolkit/latest"
CANN_HOME
=
default_path
if
os
.
path
.
exists
(
default_path
)
else
None
if
CANN_HOME
:
npu_info
=
{
"CANN_HOME"
:
CANN_HOME
}
npu_info
.
update
(
self
.
_get_cann_info
(
CANN_HOME
))
npu_info
.
update
(
self
.
_get_ascend_driver_version
())
return
npu_info
else
:
return
{
"CANN_HOME"
:
"Not found"
}
def
_get_cann_info
(
self
,
CANN_HOME
:
str
):
cann_info
=
{}
cann_version_file
=
os
.
path
.
join
(
CANN_HOME
,
"version.cfg"
)
if
os
.
path
.
exists
(
cann_version_file
):
with
open
(
cann_version_file
,
"r"
,
encoding
=
"utf-8"
)
as
f
:
f
.
readline
()
# discard first line comment in version.cfg
cann_info
[
"CANN"
]
=
f
.
readline
().
split
(
"["
)[
1
].
split
(
"]"
)[
0
]
else
:
cann_info
[
"CANN"
]
=
"Not Available"
try
:
bisheng
=
os
.
path
.
join
(
CANN_HOME
,
"compiler/ccec_compiler/bin/bisheng"
)
bisheng_output
=
(
subprocess
.
check_output
([
bisheng
,
"--version"
]).
decode
(
"utf-8"
).
strip
()
)
cann_info
[
"BiSheng"
]
=
bisheng_output
.
split
(
"
\n
"
)[
0
].
strip
()
except
subprocess
.
SubprocessError
:
cann_info
[
"BiSheng"
]
=
"Not Available"
return
cann_info
def
_get_ascend_driver_version
(
self
):
try
:
output
=
subprocess
.
check_output
(
[
"npu-smi"
,
"info"
,
"-t"
,
"board"
,
"-i"
,
"0"
,
]
)
for
line
in
output
.
decode
().
strip
().
split
(
"
\n
"
):
if
"Software Version"
in
line
:
version
=
line
.
split
(
":"
)[
-
1
].
strip
()
break
else
:
version
=
"Not Available"
return
{
"Ascend Driver Version"
:
version
}
except
subprocess
.
SubprocessError
:
return
{
"Ascend Driver Version"
:
"Not Available"
}
def
get_topology
(
self
):
try
:
try
:
result
=
subprocess
.
run
(
result
=
subprocess
.
run
(
[
"
rocm-smi"
,
"--showtopotype
"
],
[
"
npu-smi"
,
"info"
,
"-t"
,
"topo
"
],
stdout
=
subprocess
.
PIPE
,
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
,
text
=
True
,
text
=
True
,
check
=
True
,
check
=
True
,
)
)
return
"
\n
"
+
result
.
stdout
if
result
.
returncode
==
0
else
None
return
{
"Ascend Topology"
:
(
"
\n
"
+
result
.
stdout
if
result
.
returncode
==
0
else
None
)
}
except
subprocess
.
SubprocessError
:
except
subprocess
.
SubprocessError
:
return
None
return
{}
else
:
return
None
def get_hypervisor_vendor():
    """
    Return the hypervisor vendor reported by ``lscpu``, if any.

    Returns:
        The text after "Hypervisor vendor:" on the first matching line of
        the ``lscpu`` output, or None when there is no such line or
        ``lscpu`` cannot be run.
    """
    try:
        output = subprocess.check_output(["lscpu"], text=True)
    except (OSError, subprocess.SubprocessError, UnicodeError):
        # Best effort: a missing, failing or undecodable lscpu means "unknown".
        # Named exceptions let KeyboardInterrupt and SystemExit propagate.
        return None

    for line in output.split("\n"):
        if "Hypervisor vendor:" in line:
            return line.split(":")[1].strip()
    return None
def check_env():
    """
    Check and print environment information.

    Gathers the Python version, CUDA/ROCm info, PyTorch version, package
    versions, GPU topology, hypervisor vendor and the soft open-file limit,
    then prints each entry as "key: value".
    """
    report = OrderedDict()
    report["Python"] = sys.version.replace("\n", "")
    report.update(get_cuda_info())
    report["PyTorch"] = torch.__version__
    report.update(get_package_versions(PACKAGE_LIST))

    topology = get_gpu_topology()
    if topology:
        # The topology is labelled by the active backend; otherwise omitted.
        if is_cuda_v2():
            report["NVIDIA Topology"] = topology
        elif is_hip():
            report["AMD Topology"] = topology

    vendor = get_hypervisor_vendor()
    if vendor:
        report["Hypervisor vendor"] = vendor

    # getrlimit returns (soft, hard); only the soft limit is reported.
    report["ulimit soft"] = resource.getrlimit(resource.RLIMIT_NOFILE)[0]

    for key, value in report.items():
        print(f"{key}: {value}")
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
check_env
()
if
is_cuda_v2
():
env
=
GPUEnv
()
elif
is_hip
():
env
=
HIPEnv
()
elif
is_npu
():
env
=
NPUEnv
()
env
.
check_env
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment