[Ascend] Add Ascend NPU support for sglang.check_env & rework proposal (#11052)

Co-authored-by: ronnie_zheng <zl19940307@163.com>

[Ascend] Add Ascend NPU support for sglang.check_env & rework proposal (#11052)
Co-authored-by: ronnie_zheng <zl19940307@163.com>
c550ab91 · Zhihao Lyu · GitHub · 086f0b79 · c550ab91
Unverified Commit c550ab91 authored Nov 02, 2025 by Zhihao Lyu Committed by GitHub Nov 01, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 286 additions and 166 deletions

python/sglang/check_env.py python/sglang/check_env.py +286 -166

No files found.
--- a/python/sglang/check_env.py
+++ b/python/sglang/check_env.py
@@ -5,11 +5,12 @@ import os
 import resource
 import subprocess
 import sys
+from abc import abstractmethod
 from collections import OrderedDict, defaultdict

 import torch

-from sglang.srt.utils import is_hip
+from sglang.srt.utils import is_hip, is_npu


 def is_cuda_v2():
@@ -51,104 +52,124 @@ PACKAGE_LIST = [
 ]


-def get_package_versions(packages):
-    """
-    Get versions of specified packages.
-    """
-    versions = {}
-    for package in packages:
-        package_name = package.split("==")[0].split(">=")[0].split("<=")[0]
-        try:
-            version = importlib.metadata.version(package_name)
-            versions[package_name] = version
-        except ModuleNotFoundError:
-            versions[package_name] = "Module Not Found"
-    return versions
+class BaseEnv:
+    """Base class for environment check"""
+
+    def __init__(self):
+        self.package_list = PACKAGE_LIST
+
+    @abstractmethod
+    def get_info(self) -> dict:
+        """
+        Get device-related information (CUDA, ROCm or NPU) if available.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_topology(self) -> dict:
+        raise NotImplementedError
+
+    def get_package_versions(self) -> dict:
+        """
+        Get versions of specified packages.
+        """
+        versions = {}
+        for package in self.package_list:
+            package_name = package.split("==")[0].split(">=")[0].split("<=")[0]
+            try:
+                version = importlib.metadata.version(package_name)
+                versions[package_name] = version
+            except ModuleNotFoundError:
+                versions[package_name] = "Module Not Found"
+        return versions
+
+    def get_device_info(self):
+        """
+        Get information about available GPU devices.
+        """
+        devices = defaultdict(list)
+        capabilities = defaultdict(list)
+        for k in range(torch.cuda.device_count()):
+            devices[torch.cuda.get_device_name(k)].append(str(k))
+            capability = torch.cuda.get_device_capability(k)
+            capabilities[f"{capability[0]}.{capability[1]}"].append(str(k))
+
+        gpu_info = {}
+        for name, device_ids in devices.items():
+            gpu_info[f"GPU {','.join(device_ids)}"] = name
+
+        if len(capabilities) == 1:
+            # All GPUs have the same compute capability
+            cap, gpu_ids = list(capabilities.items())[0]
+            gpu_info[f"GPU {','.join(gpu_ids)} Compute Capability"] = cap
+        else:
+            # GPUs have different compute capabilities
+            for cap, gpu_ids in capabilities.items():
+                gpu_info[f"GPU {','.join(gpu_ids)} Compute Capability"] = cap

+        return gpu_info

-def get_cuda_info():
-    """
-    Get CUDA-related information if available.
-    """
-    if is_cuda_v2():
+    def get_hypervisor_vendor(self) -> dict:
+        try:
+            output = subprocess.check_output(["lscpu"], text=True)
+            for line in output.split("\n"):
+                if "Hypervisor vendor:" in line:
+                    return {"Hypervisor vendor:": line.split(":")[1].strip()}
+            return {}
+        except:
+            return {}
+
+    def get_ulimit_soft(self) -> dict:
+        ulimit_soft, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
+        return {"ulimit soft": ulimit_soft}
+
+    def check_env(self):
+        """
+        Check and print environment information.
+        """
+        env_info = OrderedDict()
+        env_info["Python"] = sys.version.replace("\n", "")
+        env_info.update(self.get_info())
+        env_info["PyTorch"] = torch.__version__
+        env_info.update(self.get_package_versions())
+        env_info.update(self.get_topology())
+        env_info.update(self.get_hypervisor_vendor())
+        env_info.update(self.get_ulimit_soft())
+
+        for k, v in env_info.items():
+            print(f"{k}: {v}")
+
+
+class GPUEnv(BaseEnv):
+    """Environment checker for Nvidia GPU"""
+
+    def get_info(self):
        cuda_info = {"CUDA available": torch.cuda.is_available()}

        if cuda_info["CUDA available"]:
-            cuda_info.update(_get_gpu_info())
-            cuda_info.update(_get_cuda_version_info())
-
-        return cuda_info
-    elif is_hip():
-        cuda_info = {"ROCM available": torch.cuda.is_available()}
-
-        if cuda_info["ROCM available"]:
-            cuda_info.update(_get_gpu_info())
-            cuda_info.update(_get_cuda_version_info())
+            cuda_info.update(self.get_device_info())
+            cuda_info.update(self._get_cuda_version_info())

        return cuda_info

-
-def _get_gpu_info():
-    """
-    Get information about available GPUs.
-    """
-    devices = defaultdict(list)
-    capabilities = defaultdict(list)
-    for k in range(torch.cuda.device_count()):
-        devices[torch.cuda.get_device_name(k)].append(str(k))
-        capability = torch.cuda.get_device_capability(k)
-        capabilities[f"{capability[0]}.{capability[1]}"].append(str(k))
-
-    gpu_info = {}
-    for name, device_ids in devices.items():
-        gpu_info[f"GPU {','.join(device_ids)}"] = name
-
-    if len(capabilities) == 1:
-        # All GPUs have the same compute capability
-        cap, gpu_ids = list(capabilities.items())[0]
-        gpu_info[f"GPU {','.join(gpu_ids)} Compute Capability"] = cap
-    else:
-        # GPUs have different compute capabilities
-        for cap, gpu_ids in capabilities.items():
-            gpu_info[f"GPU {','.join(gpu_ids)} Compute Capability"] = cap
-
-    return gpu_info
-
-
-def _get_cuda_version_info():
-    """
-    Get CUDA version information.
-    """
-    if is_cuda_v2():
+    def _get_cuda_version_info(self):
+        """
+        Get CUDA version information.
+        """
        from torch.utils.cpp_extension import CUDA_HOME

        cuda_info = {"CUDA_HOME": CUDA_HOME}

        if CUDA_HOME and os.path.isdir(CUDA_HOME):
-            cuda_info.update(_get_nvcc_info())
-            cuda_info.update(_get_cuda_driver_version())
+            cuda_info.update(self._get_nvcc_info())
+            cuda_info.update(self._get_cuda_driver_version())

        return cuda_info
-    elif is_hip():
-        from torch.utils.cpp_extension import ROCM_HOME as ROCM_HOME
-
-        cuda_info = {"ROCM_HOME": ROCM_HOME}

-        if ROCM_HOME and os.path.isdir(ROCM_HOME):
-            cuda_info.update(_get_nvcc_info())
-            cuda_info.update(_get_cuda_driver_version())
-
-        return cuda_info
-    else:
-        cuda_info = {"CUDA_HOME": ""}
-        return cuda_info
-
-
-def _get_nvcc_info():
-    """
-    Get NVCC version information.
-    """
-    if is_cuda_v2():
+    def _get_nvcc_info(self):
+        """
+        Get NVCC version information.
+        """
        from torch.utils.cpp_extension import CUDA_HOME

        try:
@@ -167,7 +188,73 @@ def _get_nvcc_info():
            }
        except subprocess.SubprocessError:
            return {"NVCC": "Not Available"}
-    elif is_hip():
+
+    def _get_cuda_driver_version(self):
+        """
+        Get CUDA driver version.
+        """
+        versions = set()
+        try:
+            output = subprocess.check_output(
+                [
+                    "nvidia-smi",
+                    "--query-gpu=driver_version",
+                    "--format=csv,noheader,nounits",
+                ]
+            )
+            versions = set(output.decode().strip().split("\n"))
+            if len(versions) == 1:
+                return {"CUDA Driver Version": versions.pop()}
+            else:
+                return {"CUDA Driver Versions": ", ".join(sorted(versions))}
+        except subprocess.SubprocessError:
+            return {"CUDA Driver Version": "Not Available"}
+
+    def get_topology(self):
+        """
+        Get GPU topology information.
+        """
+        try:
+            result = subprocess.run(
+                ["nvidia-smi", "topo", "-m"],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                check=True,
+            )
+            return {
+                "NVIDIA Topology": (
+                    "\n" + result.stdout if result.returncode == 0 else None
+                )
+            }
+        except subprocess.SubprocessError:
+            return {}
+
+
+class HIPEnv(BaseEnv):
+    """Environment checker for ROCm/HIP"""
+
+    def get_info(self):
+        cuda_info = {"ROCM available": torch.cuda.is_available()}
+
+        if cuda_info["ROCM available"]:
+            cuda_info.update(self.get_device_info())
+            cuda_info.update(self._get_cuda_version_info())
+
+        return cuda_info
+
+    def _get_cuda_version_info(self):
+        from torch.utils.cpp_extension import ROCM_HOME as ROCM_HOME
+
+        cuda_info = {"ROCM_HOME": ROCM_HOME}
+
+        if ROCM_HOME and os.path.isdir(ROCM_HOME):
+            cuda_info.update(self._get_hipcc_info())
+            cuda_info.update(self._get_rocm_driver_version())
+
+        return cuda_info
+
+    def _get_hipcc_info(self):
        from torch.utils.cpp_extension import ROCM_HOME

        try:
@@ -184,32 +271,8 @@ def _get_nvcc_info():
            }
        except subprocess.SubprocessError:
            return {"HIPCC": "Not Available"}
-    else:
-        return {"NVCC": "Not Available"}
-

-def _get_cuda_driver_version():
-    """
-    Get CUDA driver version.
-    """
-    versions = set()
-    if is_cuda_v2():
-        try:
-            output = subprocess.check_output(
-                [
-                    "nvidia-smi",
-                    "--query-gpu=driver_version",
-                    "--format=csv,noheader,nounits",
-                ]
-            )
-            versions = set(output.decode().strip().split("\n"))
-            if len(versions) == 1:
-                return {"CUDA Driver Version": versions.pop()}
-            else:
-                return {"CUDA Driver Versions": ", ".join(sorted(versions))}
-        except subprocess.SubprocessError:
-            return {"CUDA Driver Version": "Not Available"}
-    elif is_hip():
+    def _get_rocm_driver_version(self):
        try:
            output = subprocess.check_output(
                [
@@ -226,80 +289,137 @@ def _get_cuda_driver_version():
            return {"ROCM Driver Version": ver}
        except subprocess.SubprocessError:
            return {"ROCM Driver Version": "Not Available"}
-    else:
-        return {"CUDA Driver Version": "Not Available"}
-

-def get_gpu_topology():
-    """
-    Get GPU topology information.
-    """
-    if is_cuda_v2():
+    def get_topology(self):
        try:
            result = subprocess.run(
-                ["nvidia-smi", "topo", "-m"],
+                ["rocm-smi", "--showtopotype"],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                check=True,
            )
-            return "\n" + result.stdout if result.returncode == 0 else None
+            return {
+                "AMD Topology": "\n" + result.stdout if result.returncode == 0 else None
+            }
        except subprocess.SubprocessError:
-            return None
-    elif is_hip():
+            return {}
+
+
+class NPUEnv(BaseEnv):
+    """Environment checker for Ascend NPU"""
+
+    def __init__(self):
+        super().__init__()
+        self.package_list = ["torch_npu", "sgl-kernel-npu"] + self.package_list
+
+    def get_info(self):
+        cuda_info = {"NPU available": torch.npu.is_available()}
+        if cuda_info["NPU available"]:
+            cuda_info.update(self.get_device_info())
+            cuda_info.update(self._get_cann_version_info())
+
+        return cuda_info
+
+    def get_device_info(self):
+        """
+        Get information about available NPUs.
+        Need to override due to torch_npu interface differences.
+        """
+        devices = defaultdict(list)
+        for k in range(torch.npu.device_count()):
+            devices[torch.npu.get_device_name(k)].append(str(k))
+
+        npu_info = {}
+        for name, device_ids in devices.items():
+            npu_info[f"NPU {','.join(device_ids)}"] = name
+
+        return npu_info
+
+    def _get_cann_version_info(self):
+        cann_envs = ["ASCEND_TOOLKIT_HOME", "ASCEND_INSTALL_PATH"]
+        for var in cann_envs:
+            path = os.environ.get(var)
+            if path and os.path.exists(path):
+                CANN_HOME = path
+                break
+        else:
+            default_path = "/usr/local/Ascend/ascend-toolkit/latest"
+            CANN_HOME = default_path if os.path.exists(default_path) else None
+
+        if CANN_HOME:
+            npu_info = {"CANN_HOME": CANN_HOME}
+            npu_info.update(self._get_cann_info(CANN_HOME))
+            npu_info.update(self._get_ascend_driver_version())
+            return npu_info
+        else:
+            return {"CANN_HOME": "Not found"}
+
+    def _get_cann_info(self, CANN_HOME: str):
+        cann_info = {}
+        cann_version_file = os.path.join(CANN_HOME, "version.cfg")
+        if os.path.exists(cann_version_file):
+            with open(cann_version_file, "r", encoding="utf-8") as f:
+                f.readline()  # discard first line comment in version.cfg
+                cann_info["CANN"] = f.readline().split("[")[1].split("]")[0]
+        else:
+            cann_info["CANN"] = "Not Available"
+        try:
+            bisheng = os.path.join(CANN_HOME, "compiler/ccec_compiler/bin/bisheng")
+            bisheng_output = (
+                subprocess.check_output([bisheng, "--version"]).decode("utf-8").strip()
+            )
+            cann_info["BiSheng"] = bisheng_output.split("\n")[0].strip()
+        except subprocess.SubprocessError:
+            cann_info["BiSheng"] = "Not Available"
+        return cann_info
+
+    def _get_ascend_driver_version(self):
+        try:
+            output = subprocess.check_output(
+                [
+                    "npu-smi",
+                    "info",
+                    "-t",
+                    "board",
+                    "-i",
+                    "0",
+                ]
+            )
+            for line in output.decode().strip().split("\n"):
+                if "Software Version" in line:
+                    version = line.split(":")[-1].strip()
+                    break
+            else:
+                version = "Not Available"
+
+            return {"Ascend Driver Version": version}
+        except subprocess.SubprocessError:
+            return {"Ascend Driver Version": "Not Available"}
+
+    def get_topology(self):
        try:
            result = subprocess.run(
-                ["rocm-smi", "--showtopotype"],
+                ["npu-smi", "info", "-t", "topo"],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                check=True,
            )
-            return "\n" + result.stdout if result.returncode == 0 else None
+            return {
+                "Ascend Topology": (
+                    "\n" + result.stdout if result.returncode == 0 else None
+                )
+            }
        except subprocess.SubprocessError:
-            return None
-    else:
-        return None
-
-
-def get_hypervisor_vendor():
-    try:
-        output = subprocess.check_output(["lscpu"], text=True)
-        for line in output.split("\n"):
-            if "Hypervisor vendor:" in line:
-                return line.split(":")[1].strip()
-        return None
-    except:
-        return None
-
-
-def check_env():
-    """
-    Check and print environment information.
-    """
-    env_info = OrderedDict()
-    env_info["Python"] = sys.version.replace("\n", "")
-    env_info.update(get_cuda_info())
-    env_info["PyTorch"] = torch.__version__
-    env_info.update(get_package_versions(PACKAGE_LIST))
-
-    gpu_topo = get_gpu_topology()
-    if gpu_topo:
-        if is_cuda_v2():
-            env_info["NVIDIA Topology"] = gpu_topo
-        elif is_hip():
-            env_info["AMD Topology"] = gpu_topo
-
-    hypervisor_vendor = get_hypervisor_vendor()
-    if hypervisor_vendor:
-        env_info["Hypervisor vendor"] = hypervisor_vendor
-
-    ulimit_soft, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
-    env_info["ulimit soft"] = ulimit_soft
-
-    for k, v in env_info.items():
-        print(f"{k}: {v}")
+            return {}


 if __name__ == "__main__":
-    check_env()
+    if is_cuda_v2():
+        env = GPUEnv()
+    elif is_hip():
+        env = HIPEnv()
+    elif is_npu():
+        env = NPUEnv()
+    env.check_env()