"csrc/vscode:/vscode.git/clone" did not exist on "04149cce2775a5d7ba4ec5dec693003f81e58cba"
Unverified commit 7c139ab2, authored by Ronen Schaffer, committed by GitHub
Browse files

[KV Offload] Clean up ARC/LRU refactoring leftovers: group ARC tests and fix stale comment (#38217)


Signed-off-by: Ronen Schaffer <ronen.schaffer@ibm.com>
parent 0be9516e
...@@ -15,6 +15,7 @@ from vllm.v1.kv_offload.abstract import ( ...@@ -15,6 +15,7 @@ from vllm.v1.kv_offload.abstract import (
from vllm.v1.kv_offload.cpu.manager import CPUOffloadingManager from vllm.v1.kv_offload.cpu.manager import CPUOffloadingManager
from vllm.v1.kv_offload.cpu.policies.arc import ARCCachePolicy from vllm.v1.kv_offload.cpu.policies.arc import ARCCachePolicy
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec from vllm.v1.kv_offload.mediums import CPULoadStoreSpec
from vllm.v1.kv_offload.reuse_manager import FilterReusedOffloadingManager
@dataclass @dataclass
...@@ -243,335 +244,300 @@ def test_cpu_manager(): ...@@ -243,335 +244,300 @@ def test_cpu_manager():
) )
def test_arc_manager_basic(): class TestARCPolicy:
""" """Unit tests for CPUOffloadingManager with ARC eviction policy."""
Tests CPUOffloadingManager with arc policy.
Verifies that ARC handles store, load, and lookup operations correctly. def _make_manager(
""" self, num_blocks: int = 4, enable_events: bool = True
block_size = 256 ) -> tuple[CPUOffloadingManager, ARCCachePolicy]:
arc_manager = CPUOffloadingManager( manager = CPUOffloadingManager(
block_size=block_size, num_blocks=4, cache_policy="arc", enable_events=True block_size=256,
) num_blocks=num_blocks,
arc_policy = arc_manager._policy cache_policy="arc",
assert isinstance(arc_policy, ARCCachePolicy) enable_events=enable_events,
)
# prepare store [1, 2] policy = manager._policy
prepare_store_output = arc_manager.prepare_store(to_hashes([1, 2])) assert isinstance(policy, ARCCachePolicy)
verify_store_output( return manager, policy
prepare_store_output,
ExpectedPrepareStoreOutput( def test_basic(self):
block_hashes_to_store=[1, 2], """
store_block_ids=[0, 1], Tests CPUOffloadingManager with arc policy.
block_hashes_evicted=[], Verifies that ARC handles store, load, and lookup operations correctly.
), """
) cpu_manager, arc_policy = self._make_manager()
# lookup [1, 2] -> not ready # prepare store [1, 2]
assert arc_manager.lookup(to_hashes([1, 2])) == 0 prepare_store_output = cpu_manager.prepare_store(to_hashes([1, 2]))
verify_store_output(
# no events so far prepare_store_output,
assert list(arc_manager.take_events()) == [] ExpectedPrepareStoreOutput(
block_hashes_to_store=[1, 2],
# complete store [1, 2] store_block_ids=[0, 1],
arc_manager.complete_store(to_hashes([1, 2])) block_hashes_evicted=[],
verify_events( ),
arc_manager.take_events(), block_size=block_size, expected_stores=({1, 2},) )
)
# lookup [1, 2] -> not ready
# lookup [1, 2] assert cpu_manager.lookup(to_hashes([1, 2])) == 0
assert arc_manager.lookup(to_hashes([1])) == 1
assert arc_manager.lookup(to_hashes([1, 2])) == 2 # no events so far
assert arc_manager.lookup(to_hashes([1, 2, 3])) == 2 assert list(cpu_manager.take_events()) == []
# blocks should be in T1 (recent) # complete store [1, 2]
assert len(arc_policy.t1) == 2 cpu_manager.complete_store(to_hashes([1, 2]))
assert len(arc_policy.t2) == 0 verify_events(
cpu_manager.take_events(), block_size=256, expected_stores=({1, 2},)
)
def test_arc_manager_t1_to_t2_promotion():
""" # lookup [1, 2]
Tests that accessing a block in T1 promotes it to T2 (frequent). assert cpu_manager.lookup(to_hashes([1])) == 1
This is a key feature of ARC's adaptive behavior. assert cpu_manager.lookup(to_hashes([1, 2])) == 2
""" assert cpu_manager.lookup(to_hashes([1, 2, 3])) == 2
block_size = 256
arc_manager = CPUOffloadingManager( # blocks should be in T1 (recent)
block_size=block_size, num_blocks=4, cache_policy="arc", enable_events=False assert len(arc_policy.t1) == 2
) assert len(arc_policy.t2) == 0
arc_policy = arc_manager._policy
assert isinstance(arc_policy, ARCCachePolicy) def test_t1_to_t2_promotion(self):
"""
# store and complete block 1 Tests that accessing a block in T1 promotes it to T2 (frequent).
arc_manager.prepare_store(to_hashes([1])) This is a key feature of ARC's adaptive behavior.
arc_manager.complete_store(to_hashes([1])) """
cpu_manager, arc_policy = self._make_manager(enable_events=False)
# block 1 starts in T1 (recent)
assert to_hashes([1])[0] in arc_policy.t1 # store and complete block 1
assert to_hashes([1])[0] not in arc_policy.t2 cpu_manager.prepare_store(to_hashes([1]))
cpu_manager.complete_store(to_hashes([1]))
# touch block 1 (simulate second access)
arc_manager.touch(to_hashes([1])) # block 1 starts in T1 (recent)
assert to_hashes([1])[0] in arc_policy.t1
# block 1 should now be in T2 (frequent) assert to_hashes([1])[0] not in arc_policy.t2
assert to_hashes([1])[0] not in arc_policy.t1
assert to_hashes([1])[0] in arc_policy.t2 # touch block 1 (simulate second access)
cpu_manager.touch(to_hashes([1]))
def test_arc_manager_eviction_with_load(): # block 1 should now be in T2 (frequent)
""" assert to_hashes([1])[0] not in arc_policy.t1
Tests ARC eviction behavior similar to LRU test. assert to_hashes([1])[0] in arc_policy.t2
Verifies that blocks being loaded (ref_cnt > 0) cannot be evicted.
""" def test_eviction_with_load(self):
block_size = 256 """
arc_manager = CPUOffloadingManager( Tests ARC eviction behavior similar to LRU test.
block_size=block_size, num_blocks=4, cache_policy="arc", enable_events=True Verifies that blocks being loaded (ref_cnt > 0) cannot be evicted.
) """
cpu_manager, _ = self._make_manager()
# prepare and complete store [1, 2, 3, 4]
prepare_store_output = arc_manager.prepare_store(to_hashes([1, 2, 3, 4])) # prepare and complete store [1, 2, 3, 4]
verify_store_output( prepare_store_output = cpu_manager.prepare_store(to_hashes([1, 2, 3, 4]))
prepare_store_output, verify_store_output(
ExpectedPrepareStoreOutput( prepare_store_output,
block_hashes_to_store=[1, 2, 3, 4], ExpectedPrepareStoreOutput(
store_block_ids=[0, 1, 2, 3], block_hashes_to_store=[1, 2, 3, 4],
block_hashes_evicted=[], store_block_ids=[0, 1, 2, 3],
), block_hashes_evicted=[],
) ),
arc_manager.complete_store(to_hashes([1, 2, 3, 4])) )
cpu_manager.complete_store(to_hashes([1, 2, 3, 4]))
# prepare load [2, 3] (increases ref_cnt)
prepare_load_output = arc_manager.prepare_load(to_hashes([2, 3])) # prepare load [2, 3] (increases ref_cnt)
verify_load_output(prepare_load_output, [1, 2]) prepare_load_output = cpu_manager.prepare_load(to_hashes([2, 3]))
verify_load_output(prepare_load_output, [1, 2])
# prepare store [5, 6, 7] with [2, 3] being loaded
# should fail because [2, 3] have ref_cnt > 0 # prepare store [5, 6, 7] with [2, 3] being loaded
assert arc_manager.prepare_store(to_hashes([5, 6, 7])) is None # should fail because [2, 3] have ref_cnt > 0
assert cpu_manager.prepare_store(to_hashes([5, 6, 7])) is None
# complete load [2, 3]
arc_manager.complete_load(to_hashes([2, 3])) # complete load [2, 3]
cpu_manager.complete_load(to_hashes([2, 3]))
# now prepare store [5, 6, 7] should succeed
# ARC will evict blocks one at a time from T1 as needed # now prepare store [5, 6, 7] should succeed
prepare_store_output = arc_manager.prepare_store(to_hashes([5, 6, 7])) # ARC will evict blocks one at a time from T1 as needed
assert prepare_store_output is not None prepare_store_output = cpu_manager.prepare_store(to_hashes([5, 6, 7]))
# Should successfully evict enough blocks to make room (at least 1) assert prepare_store_output is not None
assert len(prepare_store_output.block_hashes_evicted) >= 1 # Should successfully evict enough blocks to make room (at least 1)
assert len(prepare_store_output.block_hashes_evicted) >= 1
def test_arc_manager_adaptive_target(): def test_adaptive_target(self):
""" """
Tests ARC's adaptive target adjustment via ghost lists. Tests ARC's adaptive target adjustment via ghost lists.
When a block in B1 (ghost list) is accessed, target_t1_size increases. When a block in B1 (ghost list) is accessed, target_t1_size increases.
When a block in B2 is accessed, target_t1_size decreases. When a block in B2 is accessed, target_t1_size decreases.
""" """
block_size = 256 cpu_manager, arc_policy = self._make_manager(num_blocks=2, enable_events=False)
arc_manager = CPUOffloadingManager(
block_size=block_size, num_blocks=2, cache_policy="arc", enable_events=False # store blocks 1, 2 (fills cache)
) cpu_manager.prepare_store(to_hashes([1, 2]))
arc_policy = arc_manager._policy cpu_manager.complete_store(to_hashes([1, 2]))
assert isinstance(arc_policy, ARCCachePolicy)
initial_target = arc_policy.target_t1_size
# store blocks 1, 2 (fills cache)
arc_manager.prepare_store(to_hashes([1, 2])) # store block 3, evicting block 1 (moves to B1 ghost list)
arc_manager.complete_store(to_hashes([1, 2])) cpu_manager.prepare_store(to_hashes([3]))
cpu_manager.complete_store(to_hashes([3]))
initial_target = arc_policy.target_t1_size
# block 1 should be in B1 (ghost list)
# store block 3, evicting block 1 (moves to B1 ghost list) assert to_hashes([1])[0] in arc_policy.b1
arc_manager.prepare_store(to_hashes([3]))
arc_manager.complete_store(to_hashes([3])) # touch block 1 (cache miss, but in B1)
# this should increase target_t1_size (favor recency)
# block 1 should be in B1 (ghost list) cpu_manager.touch(to_hashes([1]))
assert to_hashes([1])[0] in arc_policy.b1
# target should have increased
# touch block 1 (cache miss, but in B1) assert arc_policy.target_t1_size > initial_target
# this should increase target_t1_size (favor recency)
arc_manager.touch(to_hashes([1])) def test_t1_t2_eviction_policy(self):
"""
# target should have increased Tests that ARC evicts from T1 or T2 based on target_t1_size.
assert arc_policy.target_t1_size > initial_target If |T1| >= target_t1_size, evict from T1, otherwise from T2.
"""
cpu_manager, arc_policy = self._make_manager(enable_events=False)
def test_arc_manager_t1_t2_eviction_policy():
""" # store blocks 1, 2, 3, 4
Tests that ARC evicts from T1 or T2 based on target_t1_size. cpu_manager.prepare_store(to_hashes([1, 2, 3, 4]))
If |T1| >= target_t1_size, evict from T1, otherwise from T2. cpu_manager.complete_store(to_hashes([1, 2, 3, 4]))
"""
block_size = 256 # promote blocks 3, 4 to T2 by touching them
arc_manager = CPUOffloadingManager( cpu_manager.touch(to_hashes([3, 4]))
block_size=block_size, num_blocks=4, cache_policy="arc", enable_events=False
) # now: T1 = {1, 2}, T2 = {3, 4}
arc_policy = arc_manager._policy assert len(arc_policy.t1) == 2
assert isinstance(arc_policy, ARCCachePolicy) assert len(arc_policy.t2) == 2
# store blocks 1, 2, 3, 4 # set target_t1_size to prefer evicting from T1
arc_manager.prepare_store(to_hashes([1, 2, 3, 4])) # (when |T1| >= target, evict from T1)
arc_manager.complete_store(to_hashes([1, 2, 3, 4])) arc_policy.target_t1_size = 1
# promote blocks 3, 4 to T2 by touching them # store block 5, should evict from T1 (block 1, LRU in T1)
arc_manager.touch(to_hashes([3, 4])) output = cpu_manager.prepare_store(to_hashes([5]))
assert output is not None
# now: T1 = {1, 2}, T2 = {3, 4} assert to_hashes([1]) == output.block_hashes_evicted
assert len(arc_policy.t1) == 2
assert len(arc_policy.t2) == 2 cpu_manager.complete_store(to_hashes([5]))
# set target_t1_size to prefer evicting from T1 # block 1 should be in B1 (ghost list)
# (when |T1| >= target, evict from T1) assert to_hashes([1])[0] in arc_policy.b1
arc_policy.target_t1_size = 1 # block 5 should be in T1
assert to_hashes([5])[0] in arc_policy.t1
# store block 5, should evict from T1 (block 1, LRU in T1)
output = arc_manager.prepare_store(to_hashes([5])) def test_ghost_list_bounds(self):
assert output is not None """
assert to_hashes([1]) == output.block_hashes_evicted Tests that ghost lists (B1, B2) don't grow unbounded.
They should be capped at cache_capacity.
arc_manager.complete_store(to_hashes([5])) """
cpu_manager, arc_policy = self._make_manager(num_blocks=2, enable_events=False)
# block 1 should be in B1 (ghost list)
assert to_hashes([1])[0] in arc_policy.b1 # fill cache with blocks 1, 2
# block 5 should be in T1 cpu_manager.prepare_store(to_hashes([1, 2]))
assert to_hashes([5])[0] in arc_policy.t1 cpu_manager.complete_store(to_hashes([1, 2]))
# store many blocks to fill ghost lists
def test_arc_manager_ghost_list_bounds(): for i in range(3, 20):
""" cpu_manager.prepare_store(to_hashes([i]))
Tests that ghost lists (B1, B2) don't grow unbounded. cpu_manager.complete_store(to_hashes([i]))
They should be capped at cache_capacity.
""" # ghost lists should not exceed cache_capacity
block_size = 256 assert len(arc_policy.b1) <= arc_policy.cache_capacity
arc_manager = CPUOffloadingManager( assert len(arc_policy.b2) <= arc_policy.cache_capacity
block_size=block_size, num_blocks=2, cache_policy="arc", enable_events=False
) def test_touch_ordering(self):
arc_policy = arc_manager._policy """
assert isinstance(arc_policy, ARCCachePolicy) Tests that touch() correctly updates access patterns.
Similar to LRU test but verifies T1/T2 ordering.
# fill cache with blocks 1, 2 """
arc_manager.prepare_store(to_hashes([1, 2])) cpu_manager, arc_policy = self._make_manager()
arc_manager.complete_store(to_hashes([1, 2]))
# store blocks 1, 2, 3, 4
# store many blocks to fill ghost lists cpu_manager.prepare_store(to_hashes([1, 2, 3, 4]))
for i in range(3, 20): cpu_manager.complete_store(to_hashes([1, 2, 3, 4]))
arc_manager.prepare_store(to_hashes([i]))
arc_manager.complete_store(to_hashes([i])) # promote 3, 4 to T2
cpu_manager.touch(to_hashes([3, 4]))
# ghost lists should not exceed cache_capacity
assert len(arc_policy.b1) <= arc_policy.cache_capacity # T1 = {1, 2}, T2 = {3, 4}
assert len(arc_policy.b2) <= arc_policy.cache_capacity # touch [1, 3, 4] - should promote 1 to T2, and move 3,4 to end of T2
cpu_manager.touch(to_hashes([1, 3, 4]))
def test_arc_manager_touch_ordering(): # T1 = {2}, T2 = {1, 3, 4} (in that order, with 4 most recent)
""" assert len(arc_policy.t1) == 1
Tests that touch() correctly updates access patterns. assert len(arc_policy.t2) == 3
Similar to LRU test but verifies T1/T2 ordering.
""" # store block 5, should evict from T1 (block 2, only one in T1)
block_size = 256 prepare_store_output = cpu_manager.prepare_store(to_hashes([5]))
arc_manager = CPUOffloadingManager( verify_store_output(
block_size=block_size, num_blocks=4, cache_policy="arc", enable_events=True prepare_store_output,
) ExpectedPrepareStoreOutput(
arc_policy = arc_manager._policy block_hashes_to_store=[5],
assert isinstance(arc_policy, ARCCachePolicy) store_block_ids=[1], # reuses block 2's storage
block_hashes_evicted=[2],
# store blocks 1, 2, 3, 4 ),
arc_manager.prepare_store(to_hashes([1, 2, 3, 4])) )
arc_manager.complete_store(to_hashes([1, 2, 3, 4]))
def test_failed_store(self):
# promote 3, 4 to T2 """
arc_manager.touch(to_hashes([3, 4])) Tests that failed store operations clean up correctly.
Similar to LRU test but for ARC.
# T1 = {1, 2}, T2 = {3, 4} """
# touch [1, 3, 4] - should promote 1 to T2, and move 3,4 to end of T2 cpu_manager, arc_policy = self._make_manager()
arc_manager.touch(to_hashes([1, 3, 4]))
# store blocks 1, 2, 3, 4
# T1 = {2}, T2 = {1, 3, 4} (in that order, with 4 most recent) cpu_manager.prepare_store(to_hashes([1, 2, 3, 4]))
assert len(arc_policy.t1) == 1 cpu_manager.complete_store(to_hashes([1, 2, 3, 4]))
assert len(arc_policy.t2) == 3
# prepare store block 5 (will evict block 1)
# store block 5, should evict from T1 (block 2, only one in T1) prepare_store_output = cpu_manager.prepare_store(to_hashes([5]))
prepare_store_output = arc_manager.prepare_store(to_hashes([5])) assert prepare_store_output is not None
verify_store_output( assert len(prepare_store_output.block_hashes_evicted) == 1
prepare_store_output,
ExpectedPrepareStoreOutput( # complete store with failure
block_hashes_to_store=[5], cpu_manager.complete_store(to_hashes([5]), success=False)
store_block_ids=[1], # reuses block 2's storage
block_hashes_evicted=[2], # block 5 should not be in cache
), assert cpu_manager.lookup(to_hashes([5])) == 0
) # block 5 should not be in T1 or T2
assert to_hashes([5])[0] not in arc_policy.t1
assert to_hashes([5])[0] not in arc_policy.t2
def test_arc_manager_failed_store():
""" # evicted block should still be gone (in B1 ghost list)
Tests that failed store operations clean up correctly. evicted_hash = prepare_store_output.block_hashes_evicted[0]
Similar to LRU test but for ARC. assert evicted_hash in arc_policy.b1
"""
block_size = 256 def test_full_scenario(self):
arc_manager = CPUOffloadingManager( """
block_size=block_size, num_blocks=4, cache_policy="arc", enable_events=True Comprehensive test covering multiple ARC operations in sequence.
) Similar to the full LRU test but adapted for ARC behavior.
arc_policy = arc_manager._policy """
assert isinstance(arc_policy, ARCCachePolicy) cpu_manager, arc_policy = self._make_manager()
# store blocks 1, 2, 3, 4 # store [1, 2]
arc_manager.prepare_store(to_hashes([1, 2, 3, 4])) cpu_manager.prepare_store(to_hashes([1, 2]))
arc_manager.complete_store(to_hashes([1, 2, 3, 4])) cpu_manager.complete_store(to_hashes([1, 2]))
# prepare store block 5 (will evict block 1) # store [3, 4, 5] -> evicts [1]
prepare_store_output = arc_manager.prepare_store(to_hashes([5])) prepare_store_output = cpu_manager.prepare_store(to_hashes([3, 4, 5]))
assert prepare_store_output is not None assert prepare_store_output is not None
assert len(prepare_store_output.block_hashes_evicted) == 1 assert len(prepare_store_output.block_hashes_evicted) == 1
cpu_manager.complete_store(to_hashes([3, 4, 5]))
# complete store with failure
arc_manager.complete_store(to_hashes([5]), success=False) # promote some blocks to T2
cpu_manager.touch(to_hashes([2, 3]))
# block 5 should not be in cache
assert arc_manager.lookup(to_hashes([5])) == 0 # T1 has {4, 5}, T2 has {2, 3}
# block 5 should not be in T1 or T2 assert len(arc_policy.t1) == 2
assert to_hashes([5])[0] not in arc_policy.t1 assert len(arc_policy.t2) == 2
assert to_hashes([5])[0] not in arc_policy.t2
# store [6] -> should evict from T1 (4 is oldest in T1)
# evicted block should still be gone (in B1 ghost list) prepare_store_output = cpu_manager.prepare_store(to_hashes([6]))
evicted_hash = prepare_store_output.block_hashes_evicted[0] assert prepare_store_output is not None
assert evicted_hash in arc_policy.b1 cpu_manager.complete_store(to_hashes([6]))
# verify blocks 2, 3 (in T2) are still present
def test_arc_manager_full_scenario(): assert cpu_manager.lookup(to_hashes([2])) == 1
""" assert cpu_manager.lookup(to_hashes([3])) == 1
Comprehensive test covering multiple ARC operations in sequence.
Similar to the full LRU test but adapted for ARC behavior. # verify events
""" events = list(cpu_manager.take_events())
block_size = 256 assert len(events) > 0 # should have store and eviction events
arc_manager = CPUOffloadingManager(
block_size=block_size, num_blocks=4, cache_policy="arc", enable_events=True
)
arc_policy = arc_manager._policy
assert isinstance(arc_policy, ARCCachePolicy)
# store [1, 2]
arc_manager.prepare_store(to_hashes([1, 2]))
arc_manager.complete_store(to_hashes([1, 2]))
# store [3, 4, 5] -> evicts [1]
prepare_store_output = arc_manager.prepare_store(to_hashes([3, 4, 5]))
assert prepare_store_output is not None
assert len(prepare_store_output.block_hashes_evicted) == 1
arc_manager.complete_store(to_hashes([3, 4, 5]))
# promote some blocks to T2
arc_manager.touch(to_hashes([2, 3]))
# T1 has {4, 5}, T2 has {2, 3}
assert len(arc_policy.t1) == 2
assert len(arc_policy.t2) == 2
# store [6] -> should evict from T1 (4 is oldest in T1)
prepare_store_output = arc_manager.prepare_store(to_hashes([6]))
assert prepare_store_output is not None
arc_manager.complete_store(to_hashes([6]))
# verify blocks 2, 3 (in T2) are still present
assert arc_manager.lookup(to_hashes([2])) == 1
assert arc_manager.lookup(to_hashes([3])) == 1
# verify events
events = list(arc_manager.take_events())
assert len(events) > 0 # should have store and eviction events
def test_filter_reused_manager(): def test_filter_reused_manager():
...@@ -583,8 +549,6 @@ def test_filter_reused_manager(): ...@@ -583,8 +549,6 @@ def test_filter_reused_manager():
block_size=block_size, num_blocks=4, cache_policy="lru", enable_events=True block_size=block_size, num_blocks=4, cache_policy="lru", enable_events=True
) )
from vllm.v1.kv_offload.reuse_manager import FilterReusedOffloadingManager
manager = FilterReusedOffloadingManager( manager = FilterReusedOffloadingManager(
backing=lru_manager, store_threshold=2, max_tracker_size=3 backing=lru_manager, store_threshold=2, max_tracker_size=3
) )
......
...@@ -93,9 +93,8 @@ class FilterReusedOffloadingManager(OffloadingManager): ...@@ -93,9 +93,8 @@ class FilterReusedOffloadingManager(OffloadingManager):
] ]
# Delegate to the backing manager with only the eligible hashes. # Delegate to the backing manager with only the eligible hashes.
# Passing an empty list is intentional and safe — both # Passing an empty list is intentional and safe — CPUOffloadingManager
# LRUOffloadingManager and ARCOffloadingManager handle it correctly, # handles it correctly, returning a PrepareStoreOutput with empty lists.
# returning a PrepareStoreOutput with empty lists.
return self._backing.prepare_store(eligible) return self._backing.prepare_store(eligible)
# ------------------------------------------------------------------ # ------------------------------------------------------------------
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment