[Bugfix] Fix workspace resize leaking reserved GPU memory (#39226)

Signed-off-by: root <conway.zhu@cohere.com>
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>

[Bugfix] Fix workspace resize leaking reserved GPU memory (#39226)
Signed-off-by: root <conway.zhu@cohere.com>
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
7ff65b19 · czhu-cohere · GitHub · 7f95a66c · 7ff65b19
Unverified Commit 7ff65b19 authored Apr 23, 2026 by czhu-cohere Committed by GitHub Apr 23, 2026
Show whitespace changes
Inline Side-by-side

Showing with 18 additions and 21 deletions

vllm/v1/worker/workspace.py vllm/v1/worker/workspace.py +18 -21

No files found.
--- a/vllm/v1/worker/workspace.py
+++ b/vllm/v1/worker/workspace.py
@@ -161,36 +161,33 @@ class WorkspaceManager:
                    "Workspace growth is not allowed after locking."
                )

-            for ubatch_id in range(self._num_ubatches):
-                current_workspace = self._current_workspaces[ubatch_id]
-                if (
-                    current_workspace is None
-                    or self._workspace_size_bytes(current_workspace) < required_bytes
-                ):
-                    # Delete old tensor before allocating new one to avoid
-                    # memory spike from resize_(). resize_() allocates new
-                    # memory before freeing old, which can cause OOM.
-                    # Must clear the list reference first since local var
-                    # is just a copy of the reference.
+            # Only resize the requesting ubatch's workspace.  Other
+            # ubatches resize lazily on their next get_simultaneous call.
+            # Resizing all ubatches here would orphan the other ubatch's
+            # old tensor when it still holds views into it (DBO leak).
            self._current_workspaces[ubatch_id] = None
            del current_workspace
+            # Release the freed segment back to CUDA so the caching
+            # allocator can reuse the GPU memory for the larger
+            # allocation below. Without this, each resize may leave a
+            # dead segment in reserved memory which can cause higher peak
+            # memory usage.
+            torch.accelerator.empty_cache()
            self._current_workspaces[ubatch_id] = torch.empty(
                (required_bytes,), dtype=torch.uint8, device=self._device
            )
+            current_workspace = self._current_workspaces[ubatch_id]

            if envs.VLLM_DEBUG_WORKSPACE:
                logger.info(
                    "[WORKSPACE DEBUG] Resized workspace from '%s': %.2f MB -> "
-                    "%.2f MB (%d ubatches, total memory %.2f MB)",
+                    "%.2f MB (ubatch %d)",
                    get_caller_info(),
                    current_size / _MB,
                    required_bytes / _MB,
-                    self._num_ubatches,
-                    required_bytes * self._num_ubatches / _MB,
+                    ubatch_id,
                )

-            current_workspace = self._current_workspaces[dbo_current_ubatch_id()]
-
        return current_workspace