[Perf] Optimize async scheduling placeholder using empty (#32056)

Signed-off-by: yewentao256 <zhyanwentao@126.com>

[Perf] Optimize async scheduling placeholder using empty (#32056)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
e18464a5 · Wentao Ye · GitHub · 1963245e · e18464a5
Unverified commit e18464a5, authored Jan 09, 2026 by Wentao Ye; committed by GitHub on Jan 10, 2026
Hide whitespace changes
Inline Side-by-side

Showing with 4 additions and 1 deletion

vllm/v1/engine/output_processor.py vllm/v1/engine/output_processor.py +4 -1

No files found.
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -31,6 +31,9 @@ from vllm.v1.metrics.stats import (
    SchedulerStats,
 )

+# Shared zero-element CPU tensor, created once at module level and reused as a placeholder pooling output
+EMPTY_CPU_TENSOR = torch.empty(0, device="cpu")
+

 class RequestOutputCollector:
    """
@@ -426,7 +429,7 @@ class OutputProcessor:
                        new_token_ids=[],
                        # Set pooling_output is not None to
                        # correctly enter the abort pooling branch
-                        pooling_output=torch.randn(0, device="cpu")
+                        pooling_output=EMPTY_CPU_TENSOR
                        if req_state.detokenizer is None
                        else None,
                        finish_reason=FinishReason.ABORT,