Unverified commit 5ce9daea authored by Hui Liu, committed by GitHub

ROCm support for sglang.check_env (#2426)

parent ce094a5d
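
This commit extends sglang's environment report so that it also works on ROCm installs of PyTorch. For orientation, the report below is what the check_env module prints; a minimal sketch of invoking it (assuming sglang is installed) is:

# Minimal sketch, assuming an installed sglang; the module is normally run as
#   python3 -m sglang.check_env
from sglang.check_env import check_env

check_env()  # prints package versions, CUDA/ROCm info, GPU topology, hypervisor vendor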
@@ -9,6 +9,13 @@ from collections import OrderedDict, defaultdict
import torch

from sglang.srt.utils import is_hip


def is_cuda_v2():
    return torch.version.cuda is not None


# List of packages to check versions
PACKAGE_LIST = [
    "sglang",
@@ -63,6 +70,7 @@ def get_cuda_info():
    """
    Get CUDA-related information if available.
    """
    if is_cuda_v2():
        cuda_info = {"CUDA available": torch.cuda.is_available()}

        if cuda_info["CUDA available"]:
@@ -70,6 +78,14 @@ def get_cuda_info():
            cuda_info.update(_get_cuda_version_info())

        return cuda_info
    elif is_hip():
        cuda_info = {"ROCM available": torch.cuda.is_available()}

        if cuda_info["ROCM available"]:
            cuda_info.update(_get_gpu_info())
            cuda_info.update(_get_cuda_version_info())

        return cuda_info


def _get_gpu_info():
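
Both branches of get_cuda_info() key off the PyTorch build rather than the hardware: CUDA wheels populate torch.version.cuda (which is what is_cuda_v2() tests), while ROCm wheels leave it as None, set torch.version.hip instead, and expose AMD GPUs through the same torch.cuda API, so torch.cuda.is_available() still answers the question. A small illustrative check along those lines (not part of this commit, and it assumes is_hip() from sglang.srt.utils tests torch.version.hip):

import torch

# Illustrative only: mirrors what is_cuda_v2()/is_hip() are assumed to test.
if torch.version.cuda is not None:
    print("CUDA available:", torch.cuda.is_available())
elif torch.version.hip is not None:
    # ROCm builds of PyTorch reuse the torch.cuda namespace, so the same call reports AMD GPUs.
    print("ROCM available:", torch.cuda.is_available())
else:
    print("CPU-only build of PyTorch")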
@@ -103,6 +119,7 @@ def _get_cuda_version_info():
    """
    Get CUDA version information.
    """
    if is_cuda_v2():
        from torch.utils.cpp_extension import CUDA_HOME

        cuda_info = {"CUDA_HOME": CUDA_HOME}
@@ -112,26 +129,63 @@ def _get_cuda_version_info():
            cuda_info.update(_get_cuda_driver_version())

        return cuda_info
    elif is_hip():
        from torch.utils.cpp_extension import ROCM_HOME as ROCM_HOME

        cuda_info = {"ROCM_HOME": ROCM_HOME}

        if ROCM_HOME and os.path.isdir(ROCM_HOME):
            cuda_info.update(_get_nvcc_info())
            cuda_info.update(_get_cuda_driver_version())

        return cuda_info
    else:
        cuda_info = {"CUDA_HOME": ""}
        return cuda_info
def _get_nvcc_info():
    """
    Get NVCC version information.
    """
    if is_cuda_v2():
        from torch.utils.cpp_extension import CUDA_HOME

        try:
            nvcc = os.path.join(CUDA_HOME, "bin/nvcc")
            nvcc_output = (
                subprocess.check_output(f'"{nvcc}" -V', shell=True)
                .decode("utf-8")
                .strip()
            )
            return {
                "NVCC": nvcc_output[
                    nvcc_output.rfind("Cuda compilation tools") : nvcc_output.rfind(
                        "Build"
                    )
                ].strip()
            }
        except subprocess.SubprocessError:
            return {"NVCC": "Not Available"}
    elif is_hip():
        from torch.utils.cpp_extension import ROCM_HOME

        try:
            hipcc = os.path.join(ROCM_HOME, "bin/hipcc")
            hipcc_output = (
                subprocess.check_output(f'"{hipcc}" --version', shell=True)
                .decode("utf-8")
                .strip()
            )
            return {
                "HIPCC": hipcc_output[
                    hipcc_output.rfind("HIP version") : hipcc_output.rfind("AMD clang")
                ].strip()
            }
        except subprocess.SubprocessError:
            return {"HIPCC": "Not Available"}
    else:
        return {"NVCC": "Not Available"}
def _get_cuda_driver_version():
@@ -139,6 +193,7 @@ def _get_cuda_driver_version():
    Get CUDA driver version.
    """
    versions = set()
    if is_cuda_v2():
        try:
            output = subprocess.check_output(
                [
@@ -154,12 +209,32 @@ def _get_cuda_driver_version():
            return {"CUDA Driver Versions": ", ".join(sorted(versions))}
        except subprocess.SubprocessError:
            return {"CUDA Driver Version": "Not Available"}
    elif is_hip():
        try:
            output = subprocess.check_output(
                [
                    "rocm-smi",
                    "--showdriverversion",
                    "--csv",
                ]
            )
            versions = set(output.decode().strip().split("\n"))
            versions.discard("name, value")
            ver = versions.pop()
            ver = ver.replace('"Driver version", ', "").replace('"', "")
            return {"ROCM Driver Version": ver}
        except subprocess.SubprocessError:
            return {"ROCM Driver Version": "Not Available"}
    else:
        return {"CUDA Driver Version": "Not Available"}
def get_gpu_topology():
    """
    Get GPU topology information.
    """
    if is_cuda_v2():
        try:
            result = subprocess.run(
                ["nvidia-smi", "topo", "-m"],
@@ -171,6 +246,20 @@ def get_gpu_topology():
            return "\n" + result.stdout if result.returncode == 0 else None
        except subprocess.SubprocessError:
            return None
    elif is_hip():
        try:
            result = subprocess.run(
                ["rocm-smi", "--showtopotype"],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                check=True,
            )
            return "\n" + result.stdout if result.returncode == 0 else None
        except subprocess.SubprocessError:
            return None
    else:
        return None
def get_hypervisor_vendor():
@@ -196,7 +285,10 @@ def check_env():
    gpu_topo = get_gpu_topology()
    if gpu_topo:
        if is_cuda_v2():
            env_info["NVIDIA Topology"] = gpu_topo
        elif is_hip():
            env_info["AMD Topology"] = gpu_topo

    hypervisor_vendor = get_hypervisor_vendor()
    if hypervisor_vendor:
...