[CI] Stabilize test_cpu_offloading by waiting for async offload before cache reset (#37335)

Signed-off-by: Andreas Karatzas <akaratza@amd.com>

[CI] Stabilize test_cpu_offloading by waiting for async offload before cache reset (#37335)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
ce2ef42f · Andreas Karatzas · GitHub · 8b632575 · ce2ef42f
Unverified Commit ce2ef42f authored Mar 18, 2026 by Andreas Karatzas Committed by GitHub Mar 18, 2026
Show whitespace changes
Inline Side-by-side

Showing with 46 additions and 7 deletions

tests/v1/kv_offload/test_cpu_offloading.py tests/v1/kv_offload/test_cpu_offloading.py +46 -7

No files found.
--- a/tests/v1/kv_offload/test_cpu_offloading.py
+++ b/tests/v1/kv_offload/test_cpu_offloading.py
@@ -22,6 +22,17 @@ if current_platform.is_cuda():
 elif current_platform.is_rocm():
    ATTN_BACKENDS = ["TRITON_ATTN"]
+# Maximum time (seconds) to wait for the async CPU offload transfer
+# to complete before giving up.
+_RESET_CACHE_TIMEOUT = 30 if current_platform.is_rocm() else 10
+# ZMQ poll timeout (ms) for the first event.
+_FIRST_EVENT_POLL_MS = 10_000 if current_platform.is_rocm() else 1000
+# Hard ceiling (seconds) on how long get_new_cpu_stored_events may loop,
+# to prevent hangs if non-CPU events keep arriving indefinitely.
+_EVENT_DRAIN_TIMEOUT = 60
 class MockSubscriber:
    """Helper class to receive and verify published events"""
@@ -47,9 +58,10 @@ class MockSubscriber:
        poller = zmq.Poller()
        poller.register(self.sub, zmq.POLLIN)
-        timeout = 1000  # 1 second
+        poll_ms = _FIRST_EVENT_POLL_MS
-        while True:
+        deadline = time.monotonic() + _EVENT_DRAIN_TIMEOUT
-            events = dict(poller.poll(timeout))
+        while time.monotonic() < deadline:
+            events = dict(poller.poll(poll_ms))
            if events.get(self.sub) != zmq.POLLIN:
                return cpu_stored_events
@@ -63,13 +75,32 @@ class MockSubscriber:
            for event in event_batch.events:
                if isinstance(event, BlockStored) and event.medium == "CPU":
                    cpu_stored_events.append(event)
-                    timeout = 100
+                    poll_ms = 100
+        return cpu_stored_events
    def close(self):
        """Clean up resources"""
        self.sub.close()
+def _wait_for_prefix_cache_reset(llm: LLM) -> None:
+    """Wait for async offload transfers to finish so prefix cache can reset.
+    The GPU-to-CPU offload runs on a CUDA stream asynchronously.  While blocks
+    are still held by the offload worker, ``reset_prefix_cache`` returns
+    ``False``.  Retry with a short sleep until it succeeds or we time out.
+    """
+    deadline = time.monotonic() + _RESET_CACHE_TIMEOUT
+    while not llm.reset_prefix_cache():
+        if time.monotonic() > deadline:
+            raise TimeoutError(
+                "reset_prefix_cache did not succeed within "
+                f"{_RESET_CACHE_TIMEOUT}s - async offload may be stuck"
+            )
+        time.sleep(0.1)
 def _latency_test(llm: LLM, subscriber: MockSubscriber):
    sampling_params = SamplingParams(max_tokens=1)
@@ -95,10 +126,16 @@ def _latency_test(llm: LLM, subscriber: MockSubscriber):
        gpu_hit_time = time.time() - start_time
        total_gpu_hit_time += gpu_hit_time
-        # reset prefix cache to avoid GPU hit.
+        # Wait for the async CPU offload to finish, then reset prefix cache
-        llm.reset_prefix_cache()
+        # so the next generate() must reload from CPU rather than GPU.
+        _wait_for_prefix_cache_reset(llm)
-        assert subscriber.get_new_cpu_stored_events()
+        # Verify CPU stored events arrived (offload is done before we
+        # attempt to load from CPU).
+        assert subscriber.get_new_cpu_stored_events(), (
+            f"No CPU stored events received on iteration {i}; "
+            "async offload may not have completed in time"
+        )
        # run generation again - this should trigger loading from CPU
        start_time = time.time()
@@ -185,6 +222,8 @@ def test_cpu_offloading(cpu_block_size: int, attn_backend: str) -> None:
        kv_events_config=kv_events_config,
        kv_transfer_config=kv_transfer_config,
        attention_config={"backend": attn_backend},
+        # ROCm: batch size 1 to reduce variability
+        **({"max_num_seqs": 1} if current_platform.is_rocm() else {}),
    )
    events_endpoint = events_endpoint.replace("*", "127.0.0.1")