Unverified Commit c550ab91 authored by Zhihao Lyu, committed by GitHub

[Ascend] Add Ascend NPU support for sglang.check_env & rework proposal (#11052)


Co-authored-by: ronnie_zheng <zl19940307@163.com>
parent 086f0b79
@@ -5,11 +5,12 @@ import os
import resource
import subprocess
import sys
from abc import abstractmethod
from collections import OrderedDict, defaultdict

import torch

from sglang.srt.utils import is_hip, is_npu


def is_cuda_v2():
@@ -51,104 +52,124 @@ PACKAGE_LIST = [
]

class BaseEnv:
    """Base class for environment check"""

    def __init__(self):
        self.package_list = PACKAGE_LIST

    @abstractmethod
    def get_info(self) -> dict:
        """
        Get device platform information if available.
        """
        raise NotImplementedError

    @abstractmethod
    def get_topology(self) -> dict:
        raise NotImplementedError

    def get_package_versions(self) -> dict:
        """
        Get versions of specified packages.
        """
        versions = {}
        for package in self.package_list:
            package_name = package.split("==")[0].split(">=")[0].split("<=")[0]
            try:
                version = importlib.metadata.version(package_name)
                versions[package_name] = version
            except ModuleNotFoundError:
                versions[package_name] = "Module Not Found"
        return versions
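
    # Illustration (requirement strings invented for the example): the
    # chained splits strip the first version specifier, e.g.
    # "torch==2.3.0" -> "torch" and "numpy>=1.26" -> "numpy"; other
    # operators such as ~= or != would not be stripped.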

    def get_device_info(self):
        """
        Get information about available GPU devices.
        """
        devices = defaultdict(list)
        capabilities = defaultdict(list)
        for k in range(torch.cuda.device_count()):
            devices[torch.cuda.get_device_name(k)].append(str(k))
            capability = torch.cuda.get_device_capability(k)
            capabilities[f"{capability[0]}.{capability[1]}"].append(str(k))

        gpu_info = {}
        for name, device_ids in devices.items():
            gpu_info[f"GPU {','.join(device_ids)}"] = name
        if len(capabilities) == 1:
            # All GPUs share one compute capability
            cap, gpu_ids = next(iter(capabilities.items()))
            gpu_info[f"GPU {','.join(gpu_ids)} Compute Capability"] = cap
        else:
            # GPUs have different compute capabilities
            for cap, gpu_ids in capabilities.items():
                gpu_info[f"GPU {','.join(gpu_ids)} Compute Capability"] = cap
        return gpu_info
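
    # Example result shape for two identical cards (device name and
    # capability invented for illustration):
    #     {"GPU 0,1": "NVIDIA H100", "GPU 0,1 Compute Capability": "9.0"}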

    def get_hypervisor_vendor(self) -> dict:
        try:
            output = subprocess.check_output(["lscpu"], text=True)
            for line in output.split("\n"):
                if "Hypervisor vendor:" in line:
                    return {"Hypervisor vendor": line.split(":")[1].strip()}
            return {}
        except (subprocess.SubprocessError, OSError):
            return {}

    def get_ulimit_soft(self) -> dict:
        ulimit_soft, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
        return {"ulimit soft": ulimit_soft}

    def check_env(self):
        """
        Check and print environment information.
        """
        env_info = OrderedDict()
        env_info["Python"] = sys.version.replace("\n", "")
        env_info.update(self.get_info())
        env_info["PyTorch"] = torch.__version__
        env_info.update(self.get_package_versions())
        env_info.update(self.get_topology())
        env_info.update(self.get_hypervisor_vendor())
        env_info.update(self.get_ulimit_soft())

        for k, v in env_info.items():
            print(f"{k}: {v}")

class GPUEnv(BaseEnv):
    """Environment checker for Nvidia GPU"""

    def get_info(self):
        cuda_info = {"CUDA available": torch.cuda.is_available()}
        if cuda_info["CUDA available"]:
            cuda_info.update(self.get_device_info())
            cuda_info.update(self._get_cuda_version_info())
        return cuda_info

    def _get_cuda_version_info(self):
        """
        Get CUDA version information.
        """
        from torch.utils.cpp_extension import CUDA_HOME

        cuda_info = {"CUDA_HOME": CUDA_HOME}
        if CUDA_HOME and os.path.isdir(CUDA_HOME):
            cuda_info.update(self._get_nvcc_info())
            cuda_info.update(self._get_cuda_driver_version())
        return cuda_info

    def _get_nvcc_info(self):
        """
        Get NVCC version information.
        """
        from torch.utils.cpp_extension import CUDA_HOME

        try:
@@ -167,7 +188,73 @@ def _get_nvcc_info():
            }
        except subprocess.SubprocessError:
            return {"NVCC": "Not Available"}

    def _get_cuda_driver_version(self):
        """
        Get CUDA driver version.
        """
        try:
            output = subprocess.check_output(
                [
                    "nvidia-smi",
                    "--query-gpu=driver_version",
                    "--format=csv,noheader,nounits",
                ]
            )
            versions = set(output.decode().strip().split("\n"))
            if len(versions) == 1:
                return {"CUDA Driver Version": versions.pop()}
            else:
                return {"CUDA Driver Versions": ", ".join(sorted(versions))}
        except subprocess.SubprocessError:
            return {"CUDA Driver Version": "Not Available"}

    def get_topology(self):
        """
        Get GPU topology information.
        """
        try:
            result = subprocess.run(
                ["nvidia-smi", "topo", "-m"],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                check=True,
            )
            return {
                "NVIDIA Topology": (
                    "\n" + result.stdout if result.returncode == 0 else None
                )
            }
        except subprocess.SubprocessError:
            return {}
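
The driver query dedupes the per-GPU versions through a set, so a homogeneous machine reports a single value while mixed drivers are listed together. A minimal illustration of that logic, with the nvidia-smi output lines invented for the example:

sample = "535.154.05\n535.154.05"  # invented per-GPU driver lines
versions = set(sample.strip().split("\n"))
label = "CUDA Driver Version" if len(versions) == 1 else "CUDA Driver Versions"
print({label: ", ".join(sorted(versions))})  # {'CUDA Driver Version': '535.154.05'}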

class HIPEnv(BaseEnv):
    """Environment checker for ROCm/HIP"""

    def get_info(self):
        cuda_info = {"ROCM available": torch.cuda.is_available()}
        if cuda_info["ROCM available"]:
            cuda_info.update(self.get_device_info())
            cuda_info.update(self._get_cuda_version_info())
        return cuda_info

    def _get_cuda_version_info(self):
        from torch.utils.cpp_extension import ROCM_HOME

        cuda_info = {"ROCM_HOME": ROCM_HOME}
        if ROCM_HOME and os.path.isdir(ROCM_HOME):
            cuda_info.update(self._get_hipcc_info())
            cuda_info.update(self._get_rocm_driver_version())
        return cuda_info

    def _get_hipcc_info(self):
        from torch.utils.cpp_extension import ROCM_HOME

        try:
@@ -184,32 +271,8 @@ def _get_nvcc_info():
            }
        except subprocess.SubprocessError:
            return {"HIPCC": "Not Available"}

    def _get_rocm_driver_version(self):
        try:
            output = subprocess.check_output(
                [
@@ -226,80 +289,137 @@ def _get_cuda_driver_version():
            return {"ROCM Driver Version": ver}
        except subprocess.SubprocessError:
            return {"ROCM Driver Version": "Not Available"}

    def get_topology(self):
        try:
            result = subprocess.run(
                ["rocm-smi", "--showtopotype"],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                check=True,
            )
            return {
                "AMD Topology": (
                    "\n" + result.stdout if result.returncode == 0 else None
                )
            }
        except subprocess.SubprocessError:
            return {}

class NPUEnv(BaseEnv):
    """Environment checker for Ascend NPU"""

    def __init__(self):
        super().__init__()
        self.package_list = ["torch_npu", "sgl-kernel-npu"] + self.package_list

    def get_info(self):
        npu_info = {"NPU available": torch.npu.is_available()}
        if npu_info["NPU available"]:
            npu_info.update(self.get_device_info())
            npu_info.update(self._get_cann_version_info())
        return npu_info

    def get_device_info(self):
        """
        Get information about available NPUs.
        Overridden because the torch_npu interface differs from torch.cuda.
        """
        devices = defaultdict(list)
        for k in range(torch.npu.device_count()):
            devices[torch.npu.get_device_name(k)].append(str(k))
        npu_info = {}
        for name, device_ids in devices.items():
            npu_info[f"NPU {','.join(device_ids)}"] = name
        return npu_info
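
    # Note (assumption, not verified in this diff): torch.npu becomes
    # available only after `import torch_npu` has patched it onto torch,
    # so is_npu() presumably guarantees that import before NPUEnv runs.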

    def _get_cann_version_info(self):
        cann_envs = ["ASCEND_TOOLKIT_HOME", "ASCEND_INSTALL_PATH"]
        for var in cann_envs:
            path = os.environ.get(var)
            if path and os.path.exists(path):
                CANN_HOME = path
                break
        else:
            # for/else: runs when no env var points at an existing install
            default_path = "/usr/local/Ascend/ascend-toolkit/latest"
            CANN_HOME = default_path if os.path.exists(default_path) else None

        if CANN_HOME:
            npu_info = {"CANN_HOME": CANN_HOME}
            npu_info.update(self._get_cann_info(CANN_HOME))
            npu_info.update(self._get_ascend_driver_version())
            return npu_info
        else:
            return {"CANN_HOME": "Not found"}

    def _get_cann_info(self, CANN_HOME: str):
        cann_info = {}
        cann_version_file = os.path.join(CANN_HOME, "version.cfg")
        if os.path.exists(cann_version_file):
            with open(cann_version_file, "r", encoding="utf-8") as f:
                f.readline()  # discard the comment on the first line of version.cfg
                cann_info["CANN"] = f.readline().split("[")[1].split("]")[0]
        else:
            cann_info["CANN"] = "Not Available"

        try:
            bisheng = os.path.join(CANN_HOME, "compiler/ccec_compiler/bin/bisheng")
            bisheng_output = (
                subprocess.check_output([bisheng, "--version"]).decode("utf-8").strip()
            )
            cann_info["BiSheng"] = bisheng_output.split("\n")[0].strip()
        except (subprocess.SubprocessError, OSError):
            # OSError also covers a missing bisheng binary (FileNotFoundError)
            cann_info["BiSheng"] = "Not Available"
        return cann_info
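
    # The parser above assumes a version.cfg shaped like (contents
    # invented for illustration):
    #     # some comment line
    #     some_key=[8.0.RC1]
    # i.e. the version string sits in square brackets on the second line.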

    def _get_ascend_driver_version(self):
        try:
            output = subprocess.check_output(
                ["npu-smi", "info", "-t", "board", "-i", "0"]
            )
            for line in output.decode().strip().split("\n"):
                if "Software Version" in line:
                    version = line.split(":")[-1].strip()
                    break
            else:
                version = "Not Available"
            return {"Ascend Driver Version": version}
        except (subprocess.SubprocessError, OSError):
            return {"Ascend Driver Version": "Not Available"}

    def get_topology(self):
        try:
            result = subprocess.run(
                ["npu-smi", "info", "-t", "topo"],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                check=True,
            )
            return {
                "Ascend Topology": (
                    "\n" + result.stdout if result.returncode == 0 else None
                )
            }
        except (subprocess.SubprocessError, OSError):
            return {}


if __name__ == "__main__":
    if is_cuda_v2():
        env = GPUEnv()
    elif is_hip():
        env = HIPEnv()
    elif is_npu():
        env = NPUEnv()
    else:
        # avoid a NameError on hosts with no supported accelerator
        raise RuntimeError("No supported device backend (CUDA/ROCm/NPU) found")
    env.check_env()
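
Assuming the module path implied by the commit title (sglang.check_env), the checker runs the same way after the rework, and a backend can also be driven programmatically:

# From the shell:
#     python3 -m sglang.check_env
# Or programmatically (import path assumed from the commit title):
from sglang.check_env import GPUEnv

GPUEnv().check_env()  # prints one "key: value" line per environment field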