"benchmark/vscode:/vscode.git/clone" did not exist on "e2574ee986d2bdd3d7a1dd1561d69920a6581cd8"
Unverified Commit c550ab91 authored by Zhihao Lyu, committed by GitHub
Browse files

[Ascend] Add Ascend NPU support for sglang.check_env & rework proposal (#11052)


Co-authored-by: ronnie_zheng <zl19940307@163.com>
parent 086f0b79
......@@ -5,11 +5,12 @@ import os
import resource
import subprocess
import sys
from abc import abstractmethod
from collections import OrderedDict, defaultdict
import torch
from sglang.srt.utils import is_hip
from sglang.srt.utils import is_hip, is_npu
def is_cuda_v2():
......@@ -51,104 +52,124 @@ PACKAGE_LIST = [
]
def get_package_versions(packages):
    """
    Get versions of specified packages.

    Args:
        packages: Iterable of requirement strings. Each entry may be a bare
            distribution name or a name followed by extras, a version
            specifier, a URL or an environment marker (e.g. ``"foo"``,
            ``"foo==1.0"``, ``"foo~=1.0"``, ``"foo[extra] >= 2"``).

    Returns:
        dict mapping each bare distribution name to its installed version
        string, or to ``"Module Not Found"`` when it is not installed.
    """
    import re  # local import so this function does not depend on the file header

    versions = {}
    for package in packages:
        # Keep only the leading name: cut at the first character that can
        # begin extras, a version specifier, a URL reference or a marker.
        package_name = re.split(r"[\s<>=!~;\[(@]", package.strip(), maxsplit=1)[0]
        try:
            versions[package_name] = importlib.metadata.version(package_name)
        except ModuleNotFoundError:
            # importlib.metadata.PackageNotFoundError derives from
            # ModuleNotFoundError, so this covers "not installed".
            versions[package_name] = "Module Not Found"
    return versions
class BaseEnv:
"""Base class for environment check"""
def __init__(self):
    """Use the module-level ``PACKAGE_LIST`` as the packages to report on."""
    self.package_list = PACKAGE_LIST
@abstractmethod
def get_info(self) -> dict:
"""
Get CUDA-related information if available.
"""
raise NotImplementedError
@abstractmethod
def get_topology(self) -> dict:
raise NotImplementedError
def get_package_versions(self) -> dict:
"""
Get versions of specified packages.
"""
versions = {}
for package in self.package_list:
package_name = package.split("==")[0].split(">=")[0].split("<=")[0]
try:
version = importlib.metadata.version(package_name)
versions[package_name] = version
except ModuleNotFoundError:
versions[package_name] = "Module Not Found"
return versions
def get_device_info(self):
"""
Get information about available GPU devices.
"""
devices = defaultdict(list)
capabilities = defaultdict(list)
for k in range(torch.cuda.device_count()):
devices[torch.cuda.get_device_name(k)].append(str(k))
capability = torch.cuda.get_device_capability(k)
capabilities[f"{capability[0]}.{capability[1]}"].append(str(k))
gpu_info = {}
for name, device_ids in devices.items():
gpu_info[f"GPU {','.join(device_ids)}"] = name
if len(capabilities) == 1:
# All GPUs have the same compute capability
cap, gpu_ids = list(capabilities.items())[0]
gpu_info[f"GPU {','.join(gpu_ids)} Compute Capability"] = cap
else:
# GPUs have different compute capabilities
for cap, gpu_ids in capabilities.items():
gpu_info[f"GPU {','.join(gpu_ids)} Compute Capability"] = cap
return gpu_info
def get_cuda_info():
"""
Get CUDA-related information if available.
"""
if is_cuda_v2():
def get_hypervisor_vendor(self) -> dict:
try:
output = subprocess.check_output(["lscpu"], text=True)
for line in output.split("\n"):
if "Hypervisor vendor:" in line:
return {"Hypervisor vendor:": line.split(":")[1].strip()}
return {}
except:
return {}
def get_ulimit_soft(self) -> dict:
ulimit_soft, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
return {"ulimit soft": ulimit_soft}
def check_env(self):
"""
Check and print environment information.
"""
env_info = OrderedDict()
env_info["Python"] = sys.version.replace("\n", "")
env_info.update(self.get_info())
env_info["PyTorch"] = torch.__version__
env_info.update(self.get_package_versions())
env_info.update(self.get_topology())
env_info.update(self.get_hypervisor_vendor())
env_info.update(self.get_ulimit_soft())
for k, v in env_info.items():
print(f"{k}: {v}")
class GPUEnv(BaseEnv):
"""Environment checker for Nvidia GPU"""
def get_info(self):
cuda_info = {"CUDA available": torch.cuda.is_available()}
if cuda_info["CUDA available"]:
cuda_info.update(_get_gpu_info())
cuda_info.update(_get_cuda_version_info())
return cuda_info
elif is_hip():
cuda_info = {"ROCM available": torch.cuda.is_available()}
if cuda_info["ROCM available"]:
cuda_info.update(_get_gpu_info())
cuda_info.update(_get_cuda_version_info())
cuda_info.update(self.get_device_info())
cuda_info.update(self._get_cuda_version_info())
return cuda_info
def _get_gpu_info():
    """
    Get information about available GPUs.

    Returns:
        dict with one ``"GPU <ids>"`` entry per distinct device name and one
        ``"GPU <ids> Compute Capability"`` entry per distinct compute
        capability, where ``<ids>`` is a comma-joined list of the device
        indices reported by ``torch.cuda``.
    """
    name_groups = defaultdict(list)
    capability_groups = defaultdict(list)
    for device_index in range(torch.cuda.device_count()):
        label = str(device_index)
        name_groups[torch.cuda.get_device_name(device_index)].append(label)
        cap = torch.cuda.get_device_capability(device_index)
        capability_groups[f"{cap[0]}.{cap[1]}"].append(label)

    gpu_info = {}
    for device_name, labels in name_groups.items():
        gpu_info[f"GPU {','.join(labels)}"] = device_name
    # A single shared capability and several distinct ones are reported the
    # same way: one entry per capability group.
    for capability, labels in capability_groups.items():
        gpu_info[f"GPU {','.join(labels)} Compute Capability"] = capability
    return gpu_info
def _get_cuda_version_info():
"""
Get CUDA version information.
"""
if is_cuda_v2():
def _get_cuda_version_info(self):
"""
Get CUDA version information.
"""
from torch.utils.cpp_extension import CUDA_HOME
cuda_info = {"CUDA_HOME": CUDA_HOME}
if CUDA_HOME and os.path.isdir(CUDA_HOME):
cuda_info.update(_get_nvcc_info())
cuda_info.update(_get_cuda_driver_version())
cuda_info.update(self._get_nvcc_info())
cuda_info.update(self._get_cuda_driver_version())
return cuda_info
elif is_hip():
from torch.utils.cpp_extension import ROCM_HOME as ROCM_HOME
cuda_info = {"ROCM_HOME": ROCM_HOME}
if ROCM_HOME and os.path.isdir(ROCM_HOME):
cuda_info.update(_get_nvcc_info())
cuda_info.update(_get_cuda_driver_version())
return cuda_info
else:
cuda_info = {"CUDA_HOME": ""}
return cuda_info
def _get_nvcc_info():
"""
Get NVCC version information.
"""
if is_cuda_v2():
def _get_nvcc_info(self):
"""
Get NVCC version information.
"""
from torch.utils.cpp_extension import CUDA_HOME
try:
......@@ -167,7 +188,73 @@ def _get_nvcc_info():
}
except subprocess.SubprocessError:
return {"NVCC": "Not Available"}
elif is_hip():
def _get_cuda_driver_version(self):
"""
Get CUDA driver version.
"""
versions = set()
try:
output = subprocess.check_output(
[
"nvidia-smi",
"--query-gpu=driver_version",
"--format=csv,noheader,nounits",
]
)
versions = set(output.decode().strip().split("\n"))
if len(versions) == 1:
return {"CUDA Driver Version": versions.pop()}
else:
return {"CUDA Driver Versions": ", ".join(sorted(versions))}
except subprocess.SubprocessError:
return {"CUDA Driver Version": "Not Available"}
def get_topology(self):
"""
Get GPU topology information.
"""
try:
result = subprocess.run(
["nvidia-smi", "topo", "-m"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
check=True,
)
return {
"NVIDIA Topology": (
"\n" + result.stdout if result.returncode == 0 else None
)
}
except subprocess.SubprocessError:
return {}
class HIPEnv(BaseEnv):
"""Environment checker for ROCm/HIP"""
def get_info(self):
cuda_info = {"ROCM available": torch.cuda.is_available()}
if cuda_info["ROCM available"]:
cuda_info.update(self.get_device_info())
cuda_info.update(self._get_cuda_version_info())
return cuda_info
def _get_cuda_version_info(self):
from torch.utils.cpp_extension import ROCM_HOME as ROCM_HOME
cuda_info = {"ROCM_HOME": ROCM_HOME}
if ROCM_HOME and os.path.isdir(ROCM_HOME):
cuda_info.update(self._get_hipcc_info())
cuda_info.update(self._get_rocm_driver_version())
return cuda_info
def _get_hipcc_info(self):
from torch.utils.cpp_extension import ROCM_HOME
try:
......@@ -184,32 +271,8 @@ def _get_nvcc_info():
}
except subprocess.SubprocessError:
return {"HIPCC": "Not Available"}
else:
return {"NVCC": "Not Available"}
def _get_cuda_driver_version():
"""
Get CUDA driver version.
"""
versions = set()
if is_cuda_v2():
try:
output = subprocess.check_output(
[
"nvidia-smi",
"--query-gpu=driver_version",
"--format=csv,noheader,nounits",
]
)
versions = set(output.decode().strip().split("\n"))
if len(versions) == 1:
return {"CUDA Driver Version": versions.pop()}
else:
return {"CUDA Driver Versions": ", ".join(sorted(versions))}
except subprocess.SubprocessError:
return {"CUDA Driver Version": "Not Available"}
elif is_hip():
def _get_rocm_driver_version(self):
try:
output = subprocess.check_output(
[
......@@ -226,80 +289,137 @@ def _get_cuda_driver_version():
return {"ROCM Driver Version": ver}
except subprocess.SubprocessError:
return {"ROCM Driver Version": "Not Available"}
else:
return {"CUDA Driver Version": "Not Available"}
def get_gpu_topology():
"""
Get GPU topology information.
"""
if is_cuda_v2():
def get_topology(self):
try:
result = subprocess.run(
["nvidia-smi", "topo", "-m"],
["rocm-smi", "--showtopotype"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
check=True,
)
return "\n" + result.stdout if result.returncode == 0 else None
return {
"AMD Topology": "\n" + result.stdout if result.returncode == 0 else None
}
except subprocess.SubprocessError:
return None
elif is_hip():
return {}
class NPUEnv(BaseEnv):
"""Environment checker for Ascend NPU"""
def __init__(self):
    """Put the Ascend-specific packages in front of the inherited package list."""
    super().__init__()
    inherited_packages = self.package_list
    # Builds a new list, so the list set up by the base class is not mutated.
    self.package_list = ["torch_npu", "sgl-kernel-npu"] + inherited_packages
def get_info(self):
    """
    Get NPU-related information if available.

    Returns:
        dict whose first entry, ``"NPU available"``, is the result of
        ``torch.npu.is_available()``; when that is true the device info and
        the CANN version info are merged in as well.
    """
    npu_available = torch.npu.is_available()
    info = {"NPU available": npu_available}
    if not npu_available:
        return info
    info.update(self.get_device_info())
    info.update(self._get_cann_version_info())
    return info
def get_device_info(self):
    """
    Get information about available NPUs.
    Need to override due to torch_npu interface differences.

    Returns:
        dict with one ``"NPU <ids>"`` entry per distinct device name, where
        ``<ids>`` is a comma-joined list of device indices.
    """
    ids_by_name = defaultdict(list)
    for index in range(torch.npu.device_count()):
        ids_by_name[torch.npu.get_device_name(index)].append(str(index))

    return {f"NPU {','.join(ids)}": name for name, ids in ids_by_name.items()}
def _get_cann_version_info(self):
cann_envs = ["ASCEND_TOOLKIT_HOME", "ASCEND_INSTALL_PATH"]
for var in cann_envs:
path = os.environ.get(var)
if path and os.path.exists(path):
CANN_HOME = path
break
else:
default_path = "/usr/local/Ascend/ascend-toolkit/latest"
CANN_HOME = default_path if os.path.exists(default_path) else None
if CANN_HOME:
npu_info = {"CANN_HOME": CANN_HOME}
npu_info.update(self._get_cann_info(CANN_HOME))
npu_info.update(self._get_ascend_driver_version())
return npu_info
else:
return {"CANN_HOME": "Not found"}
def _get_cann_info(self, CANN_HOME: str):
cann_info = {}
cann_version_file = os.path.join(CANN_HOME, "version.cfg")
if os.path.exists(cann_version_file):
with open(cann_version_file, "r", encoding="utf-8") as f:
f.readline() # discard first line comment in version.cfg
cann_info["CANN"] = f.readline().split("[")[1].split("]")[0]
else:
cann_info["CANN"] = "Not Available"
try:
bisheng = os.path.join(CANN_HOME, "compiler/ccec_compiler/bin/bisheng")
bisheng_output = (
subprocess.check_output([bisheng, "--version"]).decode("utf-8").strip()
)
cann_info["BiSheng"] = bisheng_output.split("\n")[0].strip()
except subprocess.SubprocessError:
cann_info["BiSheng"] = "Not Available"
return cann_info
def _get_ascend_driver_version(self):
try:
output = subprocess.check_output(
[
"npu-smi",
"info",
"-t",
"board",
"-i",
"0",
]
)
for line in output.decode().strip().split("\n"):
if "Software Version" in line:
version = line.split(":")[-1].strip()
break
else:
version = "Not Available"
return {"Ascend Driver Version": version}
except subprocess.SubprocessError:
return {"Ascend Driver Version": "Not Available"}
def get_topology(self):
try:
result = subprocess.run(
["rocm-smi", "--showtopotype"],
["npu-smi", "info", "-t", "topo"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
check=True,
)
return "\n" + result.stdout if result.returncode == 0 else None
return {
"Ascend Topology": (
"\n" + result.stdout if result.returncode == 0 else None
)
}
except subprocess.SubprocessError:
return None
else:
return None
def get_hypervisor_vendor():
    """
    Return the hypervisor vendor listed by ``lscpu``, if any.

    Returns:
        The text after "Hypervisor vendor:" on the first matching ``lscpu``
        output line, or ``None`` when there is no such line or ``lscpu`` is
        missing, fails, or produces undecodable output.
    """
    try:
        output = subprocess.check_output(["lscpu"], text=True)
    except (OSError, subprocess.SubprocessError, UnicodeError):
        # Best effort: without usable lscpu output there is nothing to
        # report. Unlike a bare ``except``, this lets KeyboardInterrupt and
        # SystemExit propagate.
        return None
    for line in output.splitlines():
        if "Hypervisor vendor:" in line:
            return line.split(":", 1)[1].strip()
    return None
def check_env():
    """
    Check and print environment information.

    Builds an ordered report (Python version, CUDA/ROCm info, PyTorch
    version, package versions, then the optional GPU topology and hypervisor
    vendor, and finally the soft open-file limit) and prints every entry as
    ``"key: value"``.
    """
    report = OrderedDict()
    report["Python"] = sys.version.replace("\n", "")
    report.update(get_cuda_info())
    report["PyTorch"] = torch.__version__
    report.update(get_package_versions(PACKAGE_LIST))

    # The topology is labelled according to the detected backend; it is
    # omitted when empty or when neither backend is detected.
    topology = get_gpu_topology()
    if topology:
        if is_cuda_v2():
            report["NVIDIA Topology"] = topology
        elif is_hip():
            report["AMD Topology"] = topology

    vendor = get_hypervisor_vendor()
    if vendor:
        report["Hypervisor vendor"] = vendor

    # getrlimit returns a (soft, hard) pair; only the soft limit is reported.
    report["ulimit soft"] = resource.getrlimit(resource.RLIMIT_NOFILE)[0]

    for key, value in report.items():
        print(f"{key}: {value}")
return {}
if __name__ == "__main__":
check_env()
if is_cuda_v2():
env = GPUEnv()
elif is_hip():
env = HIPEnv()
elif is_npu():
env = NPUEnv()
env.check_env()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment