"docs/vscode:/vscode.git/clone" did not exist on "e9cdf6225fa73484c0c09b7aaec1e8acbda729cc"
Unverified commit 27e9538a, authored by mlmz, committed by GitHub
Browse files

Fix: fix the exception 'the memory capacity is unbalanced. Some GPUs may be occupied by other processes' (#5426)


Co-authored-by: ocss884 <ocss.lin@gmail.com>
parent 211c7b31
...@@ -73,6 +73,7 @@ from sglang.srt.utils import ( ...@@ -73,6 +73,7 @@ from sglang.srt.utils import (
MultiprocessingSerializer, MultiprocessingSerializer,
enable_show_time_cost, enable_show_time_cost,
get_available_gpu_memory, get_available_gpu_memory,
get_bool_env_var,
init_custom_process_group, init_custom_process_group,
is_cuda, is_cuda,
is_fa3_default_architecture, is_fa3_default_architecture,
...@@ -378,6 +379,12 @@ class ModelRunner: ...@@ -378,6 +379,12 @@ class ModelRunner:
local_gpu_memory = get_available_gpu_memory(self.device, self.gpu_id) local_gpu_memory = get_available_gpu_memory(self.device, self.gpu_id)
if self.tp_size > 1: if self.tp_size > 1:
if min_per_gpu_memory < local_gpu_memory * 0.9: if min_per_gpu_memory < local_gpu_memory * 0.9:
if get_bool_env_var("SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK"):
logger.warning(
"The memory capacity is unbalanced. Some GPUs may be occupied by other processes. "
f"{min_per_gpu_memory=}, {local_gpu_memory=}, {local_gpu_memory * 0.9=}"
)
else:
raise ValueError( raise ValueError(
"The memory capacity is unbalanced. Some GPUs may be occupied by other processes. " "The memory capacity is unbalanced. Some GPUs may be occupied by other processes. "
f"{min_per_gpu_memory=}, {local_gpu_memory=}, {local_gpu_memory * 0.9=}" f"{min_per_gpu_memory=}, {local_gpu_memory=}, {local_gpu_memory * 0.9=}"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment