Unverified commit 60468da4, authored by Garry Fang and committed by GitHub
Browse files

bugfix: fix sglang crash in NVIDIA MIG container (#8167)


Signed-off-by: Garrybest <garrybest@foxmail.com>
parent 41d33e47
......@@ -1422,6 +1422,13 @@ def get_nvgpu_memory_capacity():
]
if not memory_values:
# Fallback to torch.cuda.mem_get_info() when failed to get memory capacity from nvidia-smi,
# typically in NVIDIA MIG mode.
if torch.cuda.is_available():
logger.warning(
"Failed to get GPU memory capacity from nvidia-smi, falling back to torch.cuda.mem_get_info()."
)
return torch.cuda.mem_get_info()[1] // 1024 // 1024 # unit: MB
raise ValueError("No GPU memory values found.")
# Return the minimum memory value
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment