Unverified commit dfbcdbe0, authored by William Arnold, committed by GitHub
Browse files

feat: add trtllm env variables + gpu part numbers to config dump (#3912)


Signed-off-by: William Arnold <7565007+Aphoh@users.noreply.github.com>
parent 21c4739b
...@@ -21,6 +21,20 @@ DEFAULT_ENV_PREFIXES = [ ...@@ -21,6 +21,20 @@ DEFAULT_ENV_PREFIXES = [
"UCX_", # UCX "UCX_", # UCX
"NIXL_", # NIXL "NIXL_", # NIXL
"OMPI_", # OpenMPI "OMPI_", # OpenMPI
"LLM_", # Misc trtllm variables
"TLLM_",
"TRT_LLM_",
"TRTLLM_",
"NVIDIA_",
"NSYS_",
"GENERATE_CU_",
"OVERRIDE_",
"TOKENIZERS_",
"DISABLE_TORCH_",
"PYTORCH_",
"ENABLE_PERFECT_ROUTER",
"FLA_",
"NEMOTRON_",
] ]
# Sensitive variable patterns to redact (case-insensitive) # Sensitive variable patterns to redact (case-insensitive)
......
...@@ -101,40 +101,75 @@ def get_gpu_info() -> Optional[Dict[str, Any]]: ...@@ -101,40 +101,75 @@ def get_gpu_info() -> Optional[Dict[str, Any]]:
Returns: Returns:
Dictionary containing GPU details if available, None otherwise. Dictionary containing GPU details if available, None otherwise.
Attempts to use nvidia-smi via subprocess. Attempts to use nvidia-smi via subprocess with XML output format.
Note: Note:
This is a best-effort function and returns None if GPU info cannot be obtained. This is a best-effort function and returns None if GPU info cannot be obtained.
""" """
try: try:
import subprocess import subprocess
import xml.etree.ElementTree as ET
result = subprocess.run( result = subprocess.run(
[ [
"nvidia-smi", "nvidia-smi",
"--query-gpu=name,driver_version,memory.total", "-q",
"--format=csv,noheader", "-x",
], ],
capture_output=True, capture_output=True,
text=True, text=True,
timeout=5, timeout=5,
) )
if result.returncode == 0: if result.returncode == 0:
gpu_lines = result.stdout.strip().split("\n") root = ET.fromstring(result.stdout)
# Get driver version from root level
driver_version_elem = root.find("driver_version")
driver_version = (
driver_version_elem.text
if driver_version_elem is not None
else "unknown"
)
gpus = [] gpus = []
for line in gpu_lines:
if line: # Parse each GPU element
parts = [p.strip() for p in line.split(",")] for gpu_elem in root.findall("gpu"):
if len(parts) >= 3: gpu_info = {}
gpus.append(
{ # Extract product name
"name": parts[0], product_name = gpu_elem.find("product_name")
"driver_version": parts[1], if product_name is not None:
"memory_total": parts[2], gpu_info["name"] = product_name.text
}
) # Extract driver version
gpu_info["driver_version"] = driver_version
# Extract memory total
fb_memory = gpu_elem.find("fb_memory_usage/total")
if fb_memory is not None:
gpu_info["memory_total"] = fb_memory.text
# Extract board part number
board_part = gpu_elem.find("board_part_number")
if board_part is not None:
gpu_info["board_part_number"] = board_part.text
# Extract GPU part number
gpu_part = gpu_elem.find("gpu_part_number")
if gpu_part is not None:
gpu_info["gpu_part_number"] = gpu_part.text
if gpu_info:
gpus.append(gpu_info)
return {"gpus": gpus, "count": len(gpus)} if gpus else None return {"gpus": gpus, "count": len(gpus)} if gpus else None
except (FileNotFoundError, subprocess.TimeoutExpired, Exception) as e: except (
FileNotFoundError,
subprocess.TimeoutExpired,
ET.ParseError,
Exception,
) as e:
logger.debug(f"Failed to get GPU info: {e}") logger.debug(f"Failed to get GPU info: {e}")
return None return None
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment