Unverified Commit 4ed51308 authored by Andreas Karatzas's avatar Andreas Karatzas Committed by GitHub
Browse files

[CI] Fix GPU memory leak when RemoteOpenAIServer fails to start in __init__ (#37230)


Signed-off-by: default avatarAndreas Karatzas <akaratza@amd.com>
parent c781fbba
...@@ -225,13 +225,31 @@ class RemoteVLLMServer: ...@@ -225,13 +225,31 @@ class RemoteVLLMServer:
) )
self._start_server(model, vllm_serve_args, env_dict) self._start_server(model, vllm_serve_args, env_dict)
max_wait_seconds = max_wait_seconds or 360 max_wait_seconds = max_wait_seconds or 480
self._wait_for_server(url=self.url_for("health"), timeout=max_wait_seconds) try:
self._wait_for_server(url=self.url_for("health"), timeout=max_wait_seconds)
except Exception:
# If the server never became healthy, we must still clean up
# the subprocess tree. Without this, a timeout in __init__
# leaks the server + EngineCore processes (and their GPU
# memory), because __exit__ is never called when __init__
# raises inside a ``with`` statement.
self._shutdown()
raise
def __enter__(self): def __enter__(self):
return self return self
def __exit__(self, exc_type, exc_value, traceback): def __exit__(self, exc_type, exc_value, traceback):
self._shutdown()
def _shutdown(self) -> None:
"""Kill the server process tree and wait for GPU memory release.
Called from both ``__exit__`` (normal path) and ``__init__``
(when the server fails to start). Must be safe to call even if
the process is already dead.
"""
pid = self.proc.pid pid = self.proc.pid
# Get the process group ID. Because we used # Get the process group ID. Because we used
...@@ -265,33 +283,92 @@ class RemoteVLLMServer: ...@@ -265,33 +283,92 @@ class RemoteVLLMServer:
self.proc.wait(timeout=10) self.proc.wait(timeout=10)
print(f"[RemoteOpenAIServer] Server {pid} killed") print(f"[RemoteOpenAIServer] Server {pid} killed")
except subprocess.TimeoutExpired: except subprocess.TimeoutExpired:
# Phase 3: last resort - find and kill any orphaned children pass
self._kill_orphaned_children(pid)
# Wait for GPU memory to actually be *freed*, not just # After killing the root process, ensure all children in the
# process group (e.g. EngineCore workers) are also dead.
# On ROCm especially, surviving children hold GPU contexts and
# prevent VRAM from being reclaimed by the driver.
self._kill_process_group_survivors(pgid)
# Wait for GPU memory to actually be freed, not just
# "stabilized at whatever level it's at". # "stabilized at whatever level it's at".
self._wait_for_gpu_memory_release() self._wait_for_gpu_memory_release()
def _kill_orphaned_children(self, parent_pid: int) -> None: def _kill_process_group_survivors(
"""Best-effort cleanup of any lingering child processes.""" self, pgid: int | None, timeout: float = 15.0
try: ) -> None:
import psutil """SIGKILL any processes still in the server's process group
and wait for them to exit.
parent = psutil.Process(parent_pid) Because the server is launched with ``start_new_session=True``,
children = parent.children(recursive=True) all its children (EngineCore, workers, etc.) share the same
for child in children: pgid. After the root process is killed, stragglers -- especially
print( on ROCm where GPU contexts linger until the *process* exits --
f"[RemoteOpenAIServer] Killing orphaned child " must be reaped explicitly.
f"pid={child.pid} name={child.name()}"
) Uses ``/proc`` to scan for pgid members so this works even after
child.kill() the parent has been reaped (unlike ``psutil.Process.children``).
psutil.wait_procs(children, timeout=5) """
except Exception as e: if pgid is None:
# psutil may not be installed, or processes already gone return
print(f"[RemoteOpenAIServer] Orphan cleanup failed: {e}")
# Fallback: try to kill by pgid one more time # Send SIGKILL to the entire process group one more time.
with contextlib.suppress(ProcessLookupError, OSError): # This is cheap and harmless if everyone is already dead.
os.killpg(parent_pid, signal.SIGKILL) with contextlib.suppress(ProcessLookupError, OSError):
os.killpg(pgid, signal.SIGKILL)
# Collect surviving PIDs by scanning /proc for matching pgid.
# This works on Linux even after the parent has been waited on
# and is more reliable than psutil.Process(parent).children().
survivor_pids = self._find_pgid_members(pgid)
if not survivor_pids:
return
print(
f"[RemoteOpenAIServer] {len(survivor_pids)} process(es) still "
f"in pgid {pgid} after SIGKILL: {survivor_pids}"
)
# Wait for each survivor to actually exit so the GPU driver
# releases its VRAM.
deadline = time.time() + timeout
while survivor_pids and time.time() < deadline:
still_alive = []
for spid in survivor_pids:
try:
os.kill(spid, 0) # Check if still alive
still_alive.append(spid)
except (ProcessLookupError, OSError):
pass
survivor_pids = still_alive
if survivor_pids:
time.sleep(0.5)
if survivor_pids:
print(
f"[RemoteOpenAIServer] WARNING: processes {survivor_pids} "
f"in pgid {pgid} could not be killed within {timeout}s"
)
@staticmethod
def _find_pgid_members(pgid: int) -> list[int]:
"""Return PIDs of all living processes whose pgid matches."""
members: list[int] = []
proc_path = Path("/proc")
if not proc_path.is_dir():
return members
for entry in proc_path.iterdir():
if not entry.name.isdigit():
continue
pid = int(entry.name)
try:
if os.getpgid(pid) == pgid:
members.append(pid)
except OSError:
continue
return members
def _get_gpu_memory_used(self) -> float | None: def _get_gpu_memory_used(self) -> float | None:
"""Get total GPU memory used across all visible devices in bytes.""" """Get total GPU memory used across all visible devices in bytes."""
...@@ -318,13 +395,16 @@ class RemoteVLLMServer: ...@@ -318,13 +395,16 @@ class RemoteVLLMServer:
return None return None
return None return None
def _wait_for_gpu_memory_release(self, timeout: float = 60.0): def _wait_for_gpu_memory_release(
self, timeout: float = 120.0, log_interval: float = 10.0
):
"""Wait for GPU memory to drop back toward pre-server levels. """Wait for GPU memory to drop back toward pre-server levels.
Two-phase strategy: Waits the full timeout for memory to return close to the
1. Try to wait for memory to return close to pre-server baseline. pre-server baseline. Does NOT fall back to a "stabilization"
2. If that doesn't happen, fall back to waiting for stabilization heuristic -- if memory is still held when the timeout expires,
and log a warning (the next server might still OOM). the test fails so the problem is surfaced immediately rather
than causing cascading OOM failures in every subsequent test.
""" """
baseline = self._pre_server_gpu_memory baseline = self._pre_server_gpu_memory
if baseline is None: if baseline is None:
...@@ -337,8 +417,7 @@ class RemoteVLLMServer: ...@@ -337,8 +417,7 @@ class RemoteVLLMServer:
target = baseline + headroom_bytes target = baseline + headroom_bytes
start = time.time() start = time.time()
last_used: float | None = None next_log_time = start + log_interval
stable_count = 0
while time.time() - start < timeout: while time.time() - start < timeout:
used = self._get_gpu_memory_used() used = self._get_gpu_memory_used()
...@@ -350,7 +429,6 @@ class RemoteVLLMServer: ...@@ -350,7 +429,6 @@ class RemoteVLLMServer:
target_gb = target / 1e9 target_gb = target / 1e9
elapsed = time.time() - start elapsed = time.time() - start
# Phase 1: memory dropped to near baseline - we're done.
if used <= target: if used <= target:
print( print(
f"[RemoteOpenAIServer] GPU memory released to " f"[RemoteOpenAIServer] GPU memory released to "
...@@ -359,28 +437,19 @@ class RemoteVLLMServer: ...@@ -359,28 +437,19 @@ class RemoteVLLMServer:
) )
return return
# Phase 2 (after 40s): fall back to stabilization check. now = time.time()
# This handles cases where another process is using GPU memory if now >= next_log_time:
# and we'll never reach baseline. print(
if elapsed > 40.0 and last_used is not None: f"[RemoteOpenAIServer] Waiting for GPU memory release: "
delta = abs(used - last_used) f"{used_gb:.2f} GB (target: {target_gb:.2f} GB) "
if delta < 200 * 1024 * 1024: # 200 MB f"[{elapsed:.0f}s/{timeout:.0f}s]"
stable_count += 1 )
if stable_count >= 3: next_log_time = now + log_interval
print(
f"[RemoteOpenAIServer] WARNING: GPU memory "
f"stabilized at {used_gb:.2f} GB "
f"(target was {target_gb:.2f} GB). "
f"Proceeding - next server may OOM."
)
return
else:
stable_count = 0
last_used = used
time.sleep(1.0) time.sleep(1.0)
# Timeout - log clearly so CI failures are diagnosable # Timeout -- raise so the current test fails with a clear
# message instead of silently poisoning subsequent tests.
final_used = self._get_gpu_memory_used() final_used = self._get_gpu_memory_used()
final_gb = final_used / 1e9 if final_used else 0.0 final_gb = final_used / 1e9 if final_used else 0.0
raise RuntimeError( raise RuntimeError(
...@@ -534,7 +603,9 @@ class RemoteLaunchRenderServer(RemoteVLLMServer): ...@@ -534,7 +603,9 @@ class RemoteLaunchRenderServer(RemoteVLLMServer):
revision=model_config.tokenizer_revision, revision=model_config.tokenizer_revision,
) )
def _wait_for_gpu_memory_release(self, timeout: float = 30.0): def _wait_for_gpu_memory_release(
self, timeout: float = 30.0, log_interval: float = 10.0
):
pass # No GPU used pass # No GPU used
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment