# test_cpu_manager.py
1
2
3
4
5
6
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable
from dataclasses import dataclass

import numpy as np
import pytest

from vllm.v1.kv_offload.abstract import (
    LoadStoreSpec,
    OffloadingEvent,
    OffloadKey,
    PrepareStoreOutput,
    make_offload_key,
)
from vllm.v1.kv_offload.cpu.manager import CPUOffloadingManager
from vllm.v1.kv_offload.cpu.policies.arc import ARCCachePolicy
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec
from vllm.v1.kv_offload.reuse_manager import FilterReusedOffloadingManager
20
21
22
23


@dataclass
class ExpectedPrepareStoreOutput:
    """Expected result of a ``prepare_store()`` call, written with int IDs.

    The integer IDs are converted to offload keys (via ``to_keys``) by
    ``verify_store_output`` before being compared with the real output.
    """

    # integer IDs of the keys expected in ``keys_to_store``
    keys_to_store: list[int]
    # block IDs expected in the returned store spec
    store_block_ids: list[int]
    # integer IDs of the keys expected in ``evicted_keys``
    evicted_keys: list[int]
27
28


29
30
def to_keys(int_ids: list[int]) -> list[OffloadKey]:
    """Convert integer test IDs into offload keys.

    Each ID is rendered as its decimal string, encoded to bytes, and passed
    to ``make_offload_key`` together with a constant second argument of 0.
    """
    keys: list[OffloadKey] = []
    for int_id in int_ids:
        encoded_id = str(int_id).encode()
        keys.append(make_offload_key(encoded_id, 0))
    return keys
31
32
33


def verify_store_output(
    prepare_store_output: PrepareStoreOutput | None,
    expected_prepare_store_output: ExpectedPrepareStoreOutput,
) -> None:
    """Assert that a ``prepare_store()`` result matches the expectation.

    Checks, in order:
      - the output is not None,
      - ``keys_to_store`` and ``evicted_keys`` equal the expected int IDs
        converted with ``to_keys``,
      - the store spec is a ``CPULoadStoreSpec`` whose ``block_ids`` equal
        the expected block IDs.
    """
    assert prepare_store_output is not None
    assert prepare_store_output.keys_to_store == to_keys(
        expected_prepare_store_output.keys_to_store
    )
    assert prepare_store_output.evicted_keys == to_keys(
        expected_prepare_store_output.evicted_keys
    )

    store_spec = prepare_store_output.store_spec
    assert isinstance(store_spec, CPULoadStoreSpec)

    # compare block IDs as an int64 array
    expected_array = np.array(
        expected_prepare_store_output.store_block_ids, dtype=np.int64
    )
    assert np.array_equal(expected_array, store_spec.block_ids)


52
53
54
def verify_load_output(
    prepare_load_output: LoadStoreSpec, expected_prepare_load_output: list[int]
) -> None:
    """Assert that a ``prepare_load()`` result targets the expected blocks.

    The output must be a ``CPULoadStoreSpec`` whose ``block_ids`` equal
    ``expected_prepare_load_output`` (compared as an int64 array).
    """
    assert isinstance(prepare_load_output, CPULoadStoreSpec)
    expected_array = np.array(expected_prepare_load_output, dtype=np.int64)
    assert np.array_equal(expected_array, prepare_load_output.block_ids)


60
61
62
63
64
def verify_events(
    events: Iterable[OffloadingEvent],
    expected_stores: tuple[set[int], ...] = (),
    expected_evictions: tuple[set[int], ...] = (),
) -> None:
    """Assert that offloading events match the expected stores and evictions.

    Every event must report the CPU medium. Events with ``removed`` set are
    collected as evictions, all others as stores. Each event's keys are
    compared as a set (key order within an event is ignored), while the
    sequence of store events and the sequence of eviction events must each
    match the expected tuples in order.
    """
    stores: list[set[OffloadKey]] = []
    evictions: list[set[OffloadKey]] = []
    for event in events:
        assert event.medium == CPULoadStoreSpec.medium()
        if event.removed:
            evictions.append(set(event.keys))
        else:
            stores.append(set(event.keys))

    def to_key_sets(
        int_sets: tuple[set[int], ...],
    ) -> tuple[set[OffloadKey], ...]:
        # convert each set of int IDs into the matching set of offload keys
        return tuple(set(to_keys(list(int_set))) for int_set in int_sets)

    assert tuple(evictions) == to_key_sets(expected_evictions)
    assert tuple(stores) == to_key_sets(expected_stores)
81
82


83
84
@pytest.mark.parametrize("eviction_policy", ["lru", "arc"])
def test_already_stored_block_not_evicted_during_prepare_store(eviction_policy):
    """
    Regression test: a block that is already stored must not be evicted
    by prepare_store() when it needs to make room for new blocks.
    Applies to both lru and arc policies.

    Scenario:
        - Store blocks [1, 2] and complete.
        - touch([1]) makes block 2 the LRU candidate.
        - prepare_store([2, 3, 4, 5]):
            * block 2 is filtered out as "already stored"
            * but without the fix, block 2 would be evicted as the LRU
              candidate to make room for [3, 4, 5]
        - After complete_store([2, 3, 4, 5]), block 2 must still be present.
    """
    manager = CPUOffloadingManager(
        num_blocks=4,
        cache_policy=eviction_policy,
        enable_events=True,
    )

    # store [1, 2] and complete
    manager.prepare_store(to_keys([1, 2]))
    manager.complete_store(to_keys([1, 2]))

    # touch [1] to make block 2 the LRU candidate
    manager.touch(to_keys([1]))

    # prepare_store([2, 3, 4, 5]):
    #   - block 2 is already stored -> filtered out of keys_to_store
    #   - block 2 must NOT be evicted even though it is the LRU candidate
    #   - block 1 (ID 0) is evicted instead; new blocks [3,4,5] get IDs 2,3,0
    prepare_store_output = manager.prepare_store(to_keys([2, 3, 4, 5]))
    verify_store_output(
        prepare_store_output,
        ExpectedPrepareStoreOutput(
            keys_to_store=[3, 4, 5],
            store_block_ids=[2, 3, 0],
            evicted_keys=[1],  # block 1 evicted, not block 2
        ),
    )

    # complete_store must not silently drop block 2
    manager.complete_store(to_keys([2, 3, 4, 5]))

    # block 2 must still be present in the cache
    assert manager.lookup(to_keys([2])) == 1
131
132


133
134
def test_cpu_manager():
    """
    Tests CPUOffloadingManager with lru policy.

    Walks a 4-block manager through store, lookup, load, eviction, touch
    and a failed store, checking the returned outputs and emitted events.
    """
    # initialize a CPU manager with a capacity of 4 blocks
    cpu_manager = CPUOffloadingManager(
        num_blocks=4, cache_policy="lru", enable_events=True
    )

    # prepare store [1, 2]
    prepare_store_output = cpu_manager.prepare_store(to_keys([1, 2]))
    verify_store_output(
        prepare_store_output,
        ExpectedPrepareStoreOutput(
            keys_to_store=[1, 2],
            store_block_ids=[0, 1],
            evicted_keys=[],
        ),
    )

    # lookup [1, 2] -> not ready
    assert cpu_manager.lookup(to_keys([1, 2])) == 0

    # no events so far
    assert list(cpu_manager.take_events()) == []

    # complete store [1, 2]
    cpu_manager.complete_store(to_keys([1, 2]))
    verify_events(cpu_manager.take_events(), expected_stores=({1, 2},))

    # lookup [1, 2]
    assert cpu_manager.lookup(to_keys([1])) == 1
    assert cpu_manager.lookup(to_keys([1, 2])) == 2
    assert cpu_manager.lookup(to_keys([1, 2, 3])) == 2

    # prepare store [2, 3, 4, 5] -> evicts [1]
    prepare_store_output = cpu_manager.prepare_store(to_keys([2, 3, 4, 5]))
    verify_store_output(
        prepare_store_output,
        ExpectedPrepareStoreOutput(
            keys_to_store=[3, 4, 5],
            store_block_ids=[2, 3, 0],
            evicted_keys=[1],
        ),
    )

    # verify eviction event
    verify_events(cpu_manager.take_events(), expected_evictions=({1},))

    # prepare store with no space
    assert cpu_manager.prepare_store(to_keys([1, 6])) is None

    # complete store [2, 3, 4, 5]
    cpu_manager.complete_store(to_keys([2, 3, 4, 5]))

    # prepare load [2, 3]
    prepare_load_output = cpu_manager.prepare_load(to_keys([2, 3]))
    verify_load_output(prepare_load_output, [1, 2])

    # prepare store with no space ([2, 3] is being loaded)
    assert cpu_manager.prepare_store(to_keys([6, 7, 8])) is None

    # complete load [2, 3]
    cpu_manager.complete_load(to_keys([2, 3]))

    # prepare store [6, 7, 8] -> evicts [2, 3, 4] (oldest)
    prepare_store_output = cpu_manager.prepare_store(to_keys([6, 7, 8]))
    verify_store_output(
        prepare_store_output,
        ExpectedPrepareStoreOutput(
            keys_to_store=[6, 7, 8],
            store_block_ids=[3, 2, 1],
            evicted_keys=[2, 3, 4],
        ),
    )

    # complete store [6, 7, 8]
    cpu_manager.complete_store(to_keys([6, 7, 8]))

    # touch [5, 6, 7] (move to end of LRU order)
    cpu_manager.touch(to_keys([5, 6, 7]))

    # prepare store [9] -> evicts [8] (oldest following previous touch)
    prepare_store_output = cpu_manager.prepare_store(to_keys([9]))
    verify_store_output(
        prepare_store_output,
        ExpectedPrepareStoreOutput(
            keys_to_store=[9],
            store_block_ids=[1],
            evicted_keys=[8],
        ),
    )

    # complete store [7, 9] with failure
    # (7 was already stored before this call; only 9 was pending)
    cpu_manager.complete_store(to_keys([7, 9]), success=False)

    # assert [7] is still stored, but [9] is not
    assert cpu_manager.lookup(to_keys([7])) == 1
    assert cpu_manager.lookup(to_keys([9])) == 0

    # events accumulated since the last take_events() call
    verify_events(
        cpu_manager.take_events(),
        expected_stores=({3, 4, 5}, {6, 7, 8}),
        expected_evictions=({2, 3, 4}, {8}),
    )
238
239


240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
class TestARCPolicy:
    """Unit tests for CPUOffloadingManager with ARC eviction policy."""

    def _make_manager(
        self, num_blocks: int = 4, enable_events: bool = True
    ) -> tuple[CPUOffloadingManager, ARCCachePolicy]:
        """Build an ARC-backed manager and return it with its policy object.

        The policy is read from the manager's private ``_policy`` attribute
        so tests can inspect the ARC lists (t1, t2, b1, b2) directly.
        """
        manager = CPUOffloadingManager(
            num_blocks=num_blocks,
            cache_policy="arc",
            enable_events=enable_events,
        )
        policy = manager._policy
        assert isinstance(policy, ARCCachePolicy)
        return manager, policy

    def test_basic(self):
        """
        Tests CPUOffloadingManager with arc policy.
        Verifies that ARC handles store, load, and lookup operations correctly.
        """
        cpu_manager, arc_policy = self._make_manager()

        # prepare store [1, 2]
        prepare_store_output = cpu_manager.prepare_store(to_keys([1, 2]))
        verify_store_output(
            prepare_store_output,
            ExpectedPrepareStoreOutput(
                keys_to_store=[1, 2],
                store_block_ids=[0, 1],
                evicted_keys=[],
            ),
        )

        # lookup [1, 2] -> not ready
        assert cpu_manager.lookup(to_keys([1, 2])) == 0

        # no events so far
        assert list(cpu_manager.take_events()) == []

        # complete store [1, 2]
        cpu_manager.complete_store(to_keys([1, 2]))
        verify_events(cpu_manager.take_events(), expected_stores=({1, 2},))

        # lookup [1, 2]
        assert cpu_manager.lookup(to_keys([1])) == 1
        assert cpu_manager.lookup(to_keys([1, 2])) == 2
        assert cpu_manager.lookup(to_keys([1, 2, 3])) == 2

        # blocks should be in T1 (recent)
        assert len(arc_policy.t1) == 2
        assert len(arc_policy.t2) == 0

    def test_t1_to_t2_promotion(self):
        """
        Tests that accessing a block in T1 promotes it to T2 (frequent).
        This is a key feature of ARC's adaptive behavior.
        """
        cpu_manager, arc_policy = self._make_manager(enable_events=False)

        # store and complete block 1
        cpu_manager.prepare_store(to_keys([1]))
        cpu_manager.complete_store(to_keys([1]))

        # block 1 starts in T1 (recent)
        assert to_keys([1])[0] in arc_policy.t1
        assert to_keys([1])[0] not in arc_policy.t2

        # touch block 1 (simulate second access)
        cpu_manager.touch(to_keys([1]))

        # block 1 should now be in T2 (frequent)
        assert to_keys([1])[0] not in arc_policy.t1
        assert to_keys([1])[0] in arc_policy.t2

    def test_eviction_with_load(self):
        """
        Tests ARC eviction behavior similar to LRU test.
        Verifies that blocks being loaded (ref_cnt > 0) cannot be evicted.
        """
        cpu_manager, _ = self._make_manager()

        # prepare and complete store [1, 2, 3, 4]
        prepare_store_output = cpu_manager.prepare_store(to_keys([1, 2, 3, 4]))
        verify_store_output(
            prepare_store_output,
            ExpectedPrepareStoreOutput(
                keys_to_store=[1, 2, 3, 4],
                store_block_ids=[0, 1, 2, 3],
                evicted_keys=[],
            ),
        )
        cpu_manager.complete_store(to_keys([1, 2, 3, 4]))

        # prepare load [2, 3] (increases ref_cnt)
        prepare_load_output = cpu_manager.prepare_load(to_keys([2, 3]))
        verify_load_output(prepare_load_output, [1, 2])

        # prepare store [5, 6, 7] with [2, 3] being loaded
        # should fail because [2, 3] have ref_cnt > 0
        assert cpu_manager.prepare_store(to_keys([5, 6, 7])) is None

        # complete load [2, 3]
        cpu_manager.complete_load(to_keys([2, 3]))

        # now prepare store [5, 6, 7] should succeed
        # ARC will evict blocks one at a time from T1 as needed
        prepare_store_output = cpu_manager.prepare_store(to_keys([5, 6, 7]))
        assert prepare_store_output is not None
        # Should successfully evict enough blocks to make room (at least 1)
        assert len(prepare_store_output.evicted_keys) >= 1

    def test_adaptive_target(self):
        """
        Tests ARC's adaptive target adjustment via ghost lists.
        When a block in B1 (ghost list) is accessed, target_t1_size increases.
        When a block in B2 is accessed, target_t1_size decreases.
        """
        cpu_manager, arc_policy = self._make_manager(num_blocks=2, enable_events=False)

        # store blocks 1, 2 (fills cache)
        cpu_manager.prepare_store(to_keys([1, 2]))
        cpu_manager.complete_store(to_keys([1, 2]))

        initial_target = arc_policy.target_t1_size

        # store block 3, evicting block 1 (moves to B1 ghost list)
        cpu_manager.prepare_store(to_keys([3]))
        cpu_manager.complete_store(to_keys([3]))

        # block 1 should be in B1 (ghost list)
        assert to_keys([1])[0] in arc_policy.b1

        # touch block 1 (cache miss, but in B1)
        # this should increase target_t1_size (favor recency)
        cpu_manager.touch(to_keys([1]))

        # target should have increased
        assert arc_policy.target_t1_size > initial_target

    def test_t1_t2_eviction_policy(self):
        """
        Tests that ARC evicts from T1 or T2 based on target_t1_size.
        If |T1| >= target_t1_size, evict from T1, otherwise from T2.
        """
        cpu_manager, arc_policy = self._make_manager(enable_events=False)

        # store blocks 1, 2, 3, 4
        cpu_manager.prepare_store(to_keys([1, 2, 3, 4]))
        cpu_manager.complete_store(to_keys([1, 2, 3, 4]))

        # promote blocks 3, 4 to T2 by touching them
        cpu_manager.touch(to_keys([3, 4]))

        # now: T1 = {1, 2}, T2 = {3, 4}
        assert len(arc_policy.t1) == 2
        assert len(arc_policy.t2) == 2

        # set target_t1_size to prefer evicting from T1
        # (when |T1| >= target, evict from T1)
        arc_policy.target_t1_size = 1

        # store block 5, should evict from T1 (block 1, LRU in T1)
        output = cpu_manager.prepare_store(to_keys([5]))
        assert output is not None
        assert to_keys([1]) == output.evicted_keys

        cpu_manager.complete_store(to_keys([5]))

        # block 1 should be in B1 (ghost list)
        assert to_keys([1])[0] in arc_policy.b1
        # block 5 should be in T1
        assert to_keys([5])[0] in arc_policy.t1

    def test_ghost_list_bounds(self):
        """
        Tests that ghost lists (B1, B2) don't grow unbounded.
        They should be capped at cache_capacity.
        """
        cpu_manager, arc_policy = self._make_manager(num_blocks=2, enable_events=False)

        # fill cache with blocks 1, 2
        cpu_manager.prepare_store(to_keys([1, 2]))
        cpu_manager.complete_store(to_keys([1, 2]))

        # store many blocks to fill ghost lists
        for i in range(3, 20):
            cpu_manager.prepare_store(to_keys([i]))
            cpu_manager.complete_store(to_keys([i]))

        # ghost lists should not exceed cache_capacity
        assert len(arc_policy.b1) <= arc_policy.cache_capacity
        assert len(arc_policy.b2) <= arc_policy.cache_capacity

    def test_touch_ordering(self):
        """
        Tests that touch() correctly updates access patterns.
        Similar to LRU test but verifies T1/T2 ordering.
        """
        cpu_manager, arc_policy = self._make_manager()

        # store blocks 1, 2, 3, 4
        cpu_manager.prepare_store(to_keys([1, 2, 3, 4]))
        cpu_manager.complete_store(to_keys([1, 2, 3, 4]))

        # promote 3, 4 to T2
        cpu_manager.touch(to_keys([3, 4]))

        # T1 = {1, 2}, T2 = {3, 4}
        # touch [1, 3, 4] - should promote 1 to T2, and move 3,4 to end of T2
        cpu_manager.touch(to_keys([1, 3, 4]))

        # T1 = {2}, T2 = {1, 3, 4} (in that order, with 4 most recent)
        assert len(arc_policy.t1) == 1
        assert len(arc_policy.t2) == 3

        # store block 5, should evict from T1 (block 2, only one in T1)
        prepare_store_output = cpu_manager.prepare_store(to_keys([5]))
        verify_store_output(
            prepare_store_output,
            ExpectedPrepareStoreOutput(
                keys_to_store=[5],
                store_block_ids=[1],  # reuses block 2's storage
                evicted_keys=[2],
            ),
        )

    def test_failed_store(self):
        """
        Tests that failed store operations clean up correctly.
        Similar to LRU test but for ARC.
        """
        cpu_manager, arc_policy = self._make_manager()

        # store blocks 1, 2, 3, 4
        cpu_manager.prepare_store(to_keys([1, 2, 3, 4]))
        cpu_manager.complete_store(to_keys([1, 2, 3, 4]))

        # prepare store block 5 (will evict block 1)
        prepare_store_output = cpu_manager.prepare_store(to_keys([5]))
        assert prepare_store_output is not None
        assert len(prepare_store_output.evicted_keys) == 1

        # complete store with failure
        cpu_manager.complete_store(to_keys([5]), success=False)

        # block 5 should not be in cache
        assert cpu_manager.lookup(to_keys([5])) == 0
        # block 5 should not be in T1 or T2
        assert to_keys([5])[0] not in arc_policy.t1
        assert to_keys([5])[0] not in arc_policy.t2

        # evicted block should still be gone (in B1 ghost list)
        evicted_key = prepare_store_output.evicted_keys[0]
        assert evicted_key in arc_policy.b1

    def test_full_scenario(self):
        """
        Comprehensive test covering multiple ARC operations in sequence.
        Similar to the full LRU test but adapted for ARC behavior.
        """
        cpu_manager, arc_policy = self._make_manager()

        # store [1, 2]
        cpu_manager.prepare_store(to_keys([1, 2]))
        cpu_manager.complete_store(to_keys([1, 2]))

        # store [3, 4, 5] -> evicts [1]
        prepare_store_output = cpu_manager.prepare_store(to_keys([3, 4, 5]))
        assert prepare_store_output is not None
        assert len(prepare_store_output.evicted_keys) == 1
        cpu_manager.complete_store(to_keys([3, 4, 5]))

        # promote some blocks to T2
        cpu_manager.touch(to_keys([2, 3]))

        # T1 has {4, 5}, T2 has {2, 3}
        assert len(arc_policy.t1) == 2
        assert len(arc_policy.t2) == 2

        # store [6] -> should evict from T1 (4 is oldest in T1)
        prepare_store_output = cpu_manager.prepare_store(to_keys([6]))
        assert prepare_store_output is not None
        cpu_manager.complete_store(to_keys([6]))

        # verify blocks 2, 3 (in T2) are still present
        assert cpu_manager.lookup(to_keys([2])) == 1
        assert cpu_manager.lookup(to_keys([3])) == 1

        # verify events
        events = list(cpu_manager.take_events())
        assert len(events) > 0  # should have store and eviction events
531
532
533
534


def test_filter_reused_manager():
    """
    Tests FilterReusedOffloadingManager with a CPUOffloadingManager.

    The wrapper is configured with store_threshold=2 and max_tracker_size=3;
    the test checks which keys pass through to ``keys_to_store`` as lookups
    accumulate and as the tracker evicts entries.
    """
    lru_manager = CPUOffloadingManager(
        num_blocks=4, cache_policy="lru", enable_events=True
    )

    manager = FilterReusedOffloadingManager(
        backing=lru_manager, store_threshold=2, max_tracker_size=3
    )

    # Lookup [1, 2] -> 1st time, added to tracker but not eligible for store yet
    assert manager.lookup(to_keys([1, 2])) == 0

    # prepare store [1, 2] -> should be filtered
    prepare_store_output = manager.prepare_store(to_keys([1, 2]))
    assert prepare_store_output is not None
    assert prepare_store_output.keys_to_store == []

    # Lookup [1] -> 2nd time, eligible now
    assert manager.lookup(to_keys([1])) == 0

    # prepare store [1, 2] -> [1] should be eligible, [2] should be filtered
    prepare_store_output = manager.prepare_store(to_keys([1, 2]))
    assert prepare_store_output is not None
    assert prepare_store_output.keys_to_store == to_keys([1])

    # Lookup [3, 4] -> 1st time
    # (evicts [2] from tracker since max_size is 3 and tracker has [1])
    assert manager.lookup(to_keys([3, 4])) == 0
    # Verify [2] was evicted from the tracker (tracker now has: [1], [3], [4])
    assert to_keys([2])[0] not in manager.counts

    # Lookup [2] again -> (this adds [2] back to the tracker as 1st time)
    assert manager.lookup(to_keys([2])) == 0
    # Verify [2] was re-added with count=1 (not eligible yet)
    assert manager.counts.get(to_keys([2])[0]) == 1

    # prepare store [2] -> should still be filtered out since count was reset
    prepare_store_output = manager.prepare_store(to_keys([2]))
    assert prepare_store_output is not None
    assert prepare_store_output.keys_to_store == []

    # complete the store of [1] that was prepared above
    manager.complete_store(to_keys([1]))