# test_cpu_manager.py (file-viewer header and line-number gutter residue removed)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable
from dataclasses import dataclass

import numpy as np
import pytest

from vllm.v1.kv_offload.abstract import (
    LoadStoreSpec,
    OffloadingEvent,
    OffloadKey,
    PrepareStoreOutput,
    make_offload_key,
)
from vllm.v1.kv_offload.cpu.manager import CPUOffloadingManager
from vllm.v1.kv_offload.cpu.policies.arc import ARCCachePolicy
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec
from vllm.v1.kv_offload.reuse_manager import FilterReusedOffloadingManager
20
21
22
23


@dataclass
class ExpectedPrepareStoreOutput:
    """Expected result of a ``prepare_store()`` call, written with plain ints.

    ``verify_store_output()`` converts the integer key IDs with ``to_keys()``
    and the block IDs to an ``int64`` array before comparing them against the
    actual output.
    """

    # integer IDs of the keys expected in ``keys_to_store``
    keys_to_store: list[int]
    # block IDs expected in the returned store spec
    store_block_ids: list[int]
    # integer IDs of the keys expected in ``evicted_keys``
    evicted_keys: list[int]
27
28


29
30
def to_keys(int_ids: list[int]) -> list[OffloadKey]:
    """Convert integer IDs into offload keys, preserving order.

    Each ID is rendered as its decimal string, encoded to bytes, and passed
    to ``make_offload_key`` together with a second argument of ``0``.
    """
    keys: list[OffloadKey] = []
    for int_id in int_ids:
        encoded_id = str(int_id).encode()
        keys.append(make_offload_key(encoded_id, 0))
    return keys
31
32
33


def verify_store_output(
    prepare_store_output: PrepareStoreOutput | None,
    expected_prepare_store_output: ExpectedPrepareStoreOutput,
):
    """Assert that a ``prepare_store()`` result matches the expectation.

    Checks, in order: the output is not ``None``; ``keys_to_store`` and
    ``evicted_keys`` equal the expected integer IDs converted via
    ``to_keys()``; the store spec is a ``CPULoadStoreSpec`` whose
    ``block_ids`` equal the expected block IDs.

    Args:
        prepare_store_output: value returned by ``prepare_store()``.
        expected_prepare_store_output: expectation expressed with plain ints.

    Raises:
        AssertionError: if any of the checks fails.
    """
    assert prepare_store_output is not None
    assert prepare_store_output.keys_to_store == to_keys(
        expected_prepare_store_output.keys_to_store
    )
    assert prepare_store_output.evicted_keys == to_keys(
        expected_prepare_store_output.evicted_keys
    )

    store_spec = prepare_store_output.store_spec
    assert isinstance(store_spec, CPULoadStoreSpec)

    # compare as an int64 array so the check is element-wise over block IDs
    expected_array = np.array(
        expected_prepare_store_output.store_block_ids, dtype=np.int64
    )
    assert np.array_equal(expected_array, store_spec.block_ids)


52
53
54
def verify_load_output(
    prepare_load_output: LoadStoreSpec, expected_prepare_load_output: list[int]
):
    """Assert that a ``prepare_load()`` result refers to the expected blocks.

    Args:
        prepare_load_output: spec returned by ``prepare_load()``; must be a
            ``CPULoadStoreSpec``.
        expected_prepare_load_output: expected block IDs, in order.

    Raises:
        AssertionError: if the spec has the wrong type or its ``block_ids``
            differ from the expected IDs.
    """
    assert isinstance(prepare_load_output, CPULoadStoreSpec)
    expected_array = np.array(expected_prepare_load_output, dtype=np.int64)
    assert np.array_equal(expected_array, prepare_load_output.block_ids)


60
61
62
63
64
65
def verify_events(
    events: Iterable[OffloadingEvent],
    block_size: int,
    expected_stores: tuple[set[int], ...] = (),
    expected_evictions: tuple[set[int], ...] = (),
):
    """Assert that offloading events match the expected stores and evictions.

    Every event must carry the CPU medium and the given ``block_size``.
    Events with ``removed`` set are collected as evictions, all others as
    stores. Each group is compared, in event order, against the expected
    tuples; keys within a single event are compared as sets, so their order
    inside an event does not matter.

    Args:
        events: events to check (e.g. the result of ``take_events()``).
        block_size: block size every event is expected to report.
        expected_stores: one set of integer key IDs per expected store event.
        expected_evictions: one set of integer key IDs per expected
            eviction event.

    Raises:
        AssertionError: if any event or group does not match.
    """
    stores: list[set[OffloadKey]] = []
    evictions: list[set[OffloadKey]] = []
    for event in events:
        assert event.medium == CPULoadStoreSpec.medium()
        assert event.block_size == block_size
        if event.removed:
            evictions.append(set(event.keys))
        else:
            stores.append(set(event.keys))

    def to_key_sets(
        int_sets: tuple[set[int], ...],
    ) -> tuple[set[OffloadKey], ...]:
        # convert each set of integer IDs into a set of offload keys
        return tuple(set(to_keys(list(int_set))) for int_set in int_sets)

    assert tuple(evictions) == to_key_sets(expected_evictions)
    assert tuple(stores) == to_key_sets(expected_stores)
83
84


85
86
@pytest.mark.parametrize("eviction_policy", ["lru", "arc"])
def test_already_stored_block_not_evicted_during_prepare_store(eviction_policy):
    """
    Regression test: a block that is already stored must not be evicted
    by prepare_store() when it needs to make room for new blocks.
    Applies to both lru and arc policies.

    Scenario:
        - Store blocks [1, 2] and complete.
        - touch([1]) makes block 2 the LRU candidate.
        - prepare_store([2, 3, 4, 5]):
            * block 2 is filtered out as "already stored"
            * but without the fix, block 2 would be evicted as the LRU
              candidate to make room for [3, 4, 5]
        - After complete_store([2, 3, 4, 5]), block 2 must still be present.
    """
    block_size = 256
    manager = CPUOffloadingManager(
        block_size=block_size,
        num_blocks=4,
        cache_policy=eviction_policy,
        enable_events=True,
    )

    # store [1, 2] and complete
    manager.prepare_store(to_keys([1, 2]))
    manager.complete_store(to_keys([1, 2]))

    # touch [1] to make block 2 the LRU candidate
    manager.touch(to_keys([1]))

    # prepare_store([2, 3, 4, 5]):
    #   - block 2 is already stored -> filtered out of keys_to_store
    #   - block 2 must NOT be evicted even though it is the LRU candidate
    #   - block 1 (ID 0) is evicted instead; new blocks [3,4,5] get IDs 2,3,0
    prepare_store_output = manager.prepare_store(to_keys([2, 3, 4, 5]))
    verify_store_output(
        prepare_store_output,
        ExpectedPrepareStoreOutput(
            keys_to_store=[3, 4, 5],
            store_block_ids=[2, 3, 0],
            evicted_keys=[1],  # block 1 evicted, not block 2
        ),
    )

    # complete_store must not silently drop block 2
    manager.complete_store(to_keys([2, 3, 4, 5]))

    # block 2 must still be present in the cache
    assert manager.lookup(to_keys([2])) == 1
135
136


137
138
def test_cpu_manager():
    """
    Tests CPUOffloadingManager with lru policy.

    Walks through store, lookup, load, eviction, touch and failed-store
    operations on a 4-block cache, checking the returned outputs and the
    emitted events along the way.
    """
    # initialize a CPU backend with a capacity of 4 blocks
    block_size = 256
    cpu_manager = CPUOffloadingManager(
        block_size=block_size, num_blocks=4, cache_policy="lru", enable_events=True
    )

    # prepare store [1, 2]
    prepare_store_output = cpu_manager.prepare_store(to_keys([1, 2]))
    verify_store_output(
        prepare_store_output,
        ExpectedPrepareStoreOutput(
            keys_to_store=[1, 2],
            store_block_ids=[0, 1],
            evicted_keys=[],
        ),
    )

    # lookup [1, 2] -> not ready
    assert cpu_manager.lookup(to_keys([1, 2])) == 0

    # no events so far
    assert list(cpu_manager.take_events()) == []

    # complete store [1, 2]
    cpu_manager.complete_store(to_keys([1, 2]))
    verify_events(
        cpu_manager.take_events(), block_size=block_size, expected_stores=({1, 2},)
    )

    # lookup [1, 2]
    assert cpu_manager.lookup(to_keys([1])) == 1
    assert cpu_manager.lookup(to_keys([1, 2])) == 2
    assert cpu_manager.lookup(to_keys([1, 2, 3])) == 2

    # prepare store [2, 3, 4, 5] -> evicts [1]
    prepare_store_output = cpu_manager.prepare_store(to_keys([2, 3, 4, 5]))
    verify_store_output(
        prepare_store_output,
        ExpectedPrepareStoreOutput(
            keys_to_store=[3, 4, 5],
            store_block_ids=[2, 3, 0],
            evicted_keys=[1],
        ),
    )

    # verify eviction event
    verify_events(
        cpu_manager.take_events(), block_size=block_size, expected_evictions=({1},)
    )

    # prepare store with no space
    assert cpu_manager.prepare_store(to_keys([1, 6])) is None

    # complete store [2, 3, 4, 5]
    cpu_manager.complete_store(to_keys([2, 3, 4, 5]))

    # prepare load [2, 3]
    prepare_load_output = cpu_manager.prepare_load(to_keys([2, 3]))
    verify_load_output(prepare_load_output, [1, 2])

    # prepare store with no space ([2, 3] is being loaded)
    assert cpu_manager.prepare_store(to_keys([6, 7, 8])) is None

    # complete load [2, 3]
    cpu_manager.complete_load(to_keys([2, 3]))

    # prepare store [6, 7, 8] -> evicts [2, 3, 4] (oldest)
    prepare_store_output = cpu_manager.prepare_store(to_keys([6, 7, 8]))
    verify_store_output(
        prepare_store_output,
        ExpectedPrepareStoreOutput(
            keys_to_store=[6, 7, 8],
            store_block_ids=[3, 2, 1],
            evicted_keys=[2, 3, 4],
        ),
    )

    # complete store [6, 7, 8]
    cpu_manager.complete_store(to_keys([6, 7, 8]))

    # touch [5, 6, 7] (move to end of LRU order)
    cpu_manager.touch(to_keys([5, 6, 7]))

    # prepare store [9] -> evicts [8] (oldest following previous touch)
    prepare_store_output = cpu_manager.prepare_store(to_keys([9]))
    verify_store_output(
        prepare_store_output,
        ExpectedPrepareStoreOutput(
            keys_to_store=[9],
            store_block_ids=[1],
            evicted_keys=[8],
        ),
    )

    # complete store with failure, passing [7, 9]: [7] is already stored,
    # [9] is the pending store prepared above
    cpu_manager.complete_store(to_keys([7, 9]), success=False)

    # assert [7] is still stored, but [9] is not
    assert cpu_manager.lookup(to_keys([7])) == 1
    assert cpu_manager.lookup(to_keys([9])) == 0

    # events accumulated since the last take_events() call
    verify_events(
        cpu_manager.take_events(),
        block_size=block_size,
        expected_stores=({3, 4, 5}, {6, 7, 8}),
        expected_evictions=({2, 3, 4}, {8}),
    )
248
249


250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
class TestARCPolicy:
    """Unit tests for CPUOffloadingManager with ARC eviction policy."""

    def _make_manager(
        self, num_blocks: int = 4, enable_events: bool = True
    ) -> tuple[CPUOffloadingManager, ARCCachePolicy]:
        """Create an ARC-backed manager (block size 256) and return it
        together with its policy object for white-box assertions."""
        manager = CPUOffloadingManager(
            block_size=256,
            num_blocks=num_blocks,
            cache_policy="arc",
            enable_events=enable_events,
        )
        # the tests inspect the policy's internal lists (t1, t2, b1, b2)
        policy = manager._policy
        assert isinstance(policy, ARCCachePolicy)
        return manager, policy

    def test_basic(self):
        """
        Tests CPUOffloadingManager with arc policy.
        Verifies that ARC handles store, load, and lookup operations correctly.
        """
        cpu_manager, arc_policy = self._make_manager()

        # prepare store [1, 2]
        prepare_store_output = cpu_manager.prepare_store(to_keys([1, 2]))
        verify_store_output(
            prepare_store_output,
            ExpectedPrepareStoreOutput(
                keys_to_store=[1, 2],
                store_block_ids=[0, 1],
                evicted_keys=[],
            ),
        )

        # lookup [1, 2] -> not ready
        assert cpu_manager.lookup(to_keys([1, 2])) == 0

        # no events so far
        assert list(cpu_manager.take_events()) == []

        # complete store [1, 2]
        cpu_manager.complete_store(to_keys([1, 2]))
        verify_events(
            cpu_manager.take_events(), block_size=256, expected_stores=({1, 2},)
        )

        # lookup [1, 2]
        assert cpu_manager.lookup(to_keys([1])) == 1
        assert cpu_manager.lookup(to_keys([1, 2])) == 2
        assert cpu_manager.lookup(to_keys([1, 2, 3])) == 2

        # blocks should be in T1 (recent)
        assert len(arc_policy.t1) == 2
        assert len(arc_policy.t2) == 0

    def test_t1_to_t2_promotion(self):
        """
        Tests that accessing a block in T1 promotes it to T2 (frequent).
        This is a key feature of ARC's adaptive behavior.
        """
        cpu_manager, arc_policy = self._make_manager(enable_events=False)

        # store and complete block 1
        cpu_manager.prepare_store(to_keys([1]))
        cpu_manager.complete_store(to_keys([1]))

        # block 1 starts in T1 (recent)
        assert to_keys([1])[0] in arc_policy.t1
        assert to_keys([1])[0] not in arc_policy.t2

        # touch block 1 (simulate second access)
        cpu_manager.touch(to_keys([1]))

        # block 1 should now be in T2 (frequent)
        assert to_keys([1])[0] not in arc_policy.t1
        assert to_keys([1])[0] in arc_policy.t2

    def test_eviction_with_load(self):
        """
        Tests ARC eviction behavior similar to LRU test.
        Verifies that blocks being loaded (ref_cnt > 0) cannot be evicted.
        """
        cpu_manager, _ = self._make_manager()

        # prepare and complete store [1, 2, 3, 4]
        prepare_store_output = cpu_manager.prepare_store(to_keys([1, 2, 3, 4]))
        verify_store_output(
            prepare_store_output,
            ExpectedPrepareStoreOutput(
                keys_to_store=[1, 2, 3, 4],
                store_block_ids=[0, 1, 2, 3],
                evicted_keys=[],
            ),
        )
        cpu_manager.complete_store(to_keys([1, 2, 3, 4]))

        # prepare load [2, 3] (increases ref_cnt)
        prepare_load_output = cpu_manager.prepare_load(to_keys([2, 3]))
        verify_load_output(prepare_load_output, [1, 2])

        # prepare store [5, 6, 7] with [2, 3] being loaded
        # should fail because [2, 3] have ref_cnt > 0
        assert cpu_manager.prepare_store(to_keys([5, 6, 7])) is None

        # complete load [2, 3]
        cpu_manager.complete_load(to_keys([2, 3]))

        # now prepare store [5, 6, 7] should succeed
        # ARC will evict blocks one at a time from T1 as needed
        prepare_store_output = cpu_manager.prepare_store(to_keys([5, 6, 7]))
        assert prepare_store_output is not None
        # Should successfully evict enough blocks to make room (at least 1)
        assert len(prepare_store_output.evicted_keys) >= 1

    def test_adaptive_target(self):
        """
        Tests ARC's adaptive target adjustment via ghost lists.
        When a block in B1 (ghost list) is accessed, target_t1_size increases.
        When a block in B2 is accessed, target_t1_size decreases.
        """
        cpu_manager, arc_policy = self._make_manager(num_blocks=2, enable_events=False)

        # store blocks 1, 2 (fills cache)
        cpu_manager.prepare_store(to_keys([1, 2]))
        cpu_manager.complete_store(to_keys([1, 2]))

        initial_target = arc_policy.target_t1_size

        # store block 3, evicting block 1 (moves to B1 ghost list)
        cpu_manager.prepare_store(to_keys([3]))
        cpu_manager.complete_store(to_keys([3]))

        # block 1 should be in B1 (ghost list)
        assert to_keys([1])[0] in arc_policy.b1

        # touch block 1 (cache miss, but in B1)
        # this should increase target_t1_size (favor recency)
        cpu_manager.touch(to_keys([1]))

        # target should have increased
        assert arc_policy.target_t1_size > initial_target

    def test_t1_t2_eviction_policy(self):
        """
        Tests that ARC evicts from T1 or T2 based on target_t1_size.
        If |T1| >= target_t1_size, evict from T1, otherwise from T2.
        """
        cpu_manager, arc_policy = self._make_manager(enable_events=False)

        # store blocks 1, 2, 3, 4
        cpu_manager.prepare_store(to_keys([1, 2, 3, 4]))
        cpu_manager.complete_store(to_keys([1, 2, 3, 4]))

        # promote blocks 3, 4 to T2 by touching them
        cpu_manager.touch(to_keys([3, 4]))

        # now: T1 = {1, 2}, T2 = {3, 4}
        assert len(arc_policy.t1) == 2
        assert len(arc_policy.t2) == 2

        # set target_t1_size to prefer evicting from T1
        # (when |T1| >= target, evict from T1)
        arc_policy.target_t1_size = 1

        # store block 5, should evict from T1 (block 1, LRU in T1)
        output = cpu_manager.prepare_store(to_keys([5]))
        assert output is not None
        assert to_keys([1]) == output.evicted_keys

        cpu_manager.complete_store(to_keys([5]))

        # block 1 should be in B1 (ghost list)
        assert to_keys([1])[0] in arc_policy.b1
        # block 5 should be in T1
        assert to_keys([5])[0] in arc_policy.t1

    def test_ghost_list_bounds(self):
        """
        Tests that ghost lists (B1, B2) don't grow unbounded.
        They should be capped at cache_capacity.
        """
        cpu_manager, arc_policy = self._make_manager(num_blocks=2, enable_events=False)

        # fill cache with blocks 1, 2
        cpu_manager.prepare_store(to_keys([1, 2]))
        cpu_manager.complete_store(to_keys([1, 2]))

        # store many blocks to fill ghost lists
        for i in range(3, 20):
            cpu_manager.prepare_store(to_keys([i]))
            cpu_manager.complete_store(to_keys([i]))

        # ghost lists should not exceed cache_capacity
        assert len(arc_policy.b1) <= arc_policy.cache_capacity
        assert len(arc_policy.b2) <= arc_policy.cache_capacity

    def test_touch_ordering(self):
        """
        Tests that touch() correctly updates access patterns.
        Similar to LRU test but verifies T1/T2 ordering.
        """
        cpu_manager, arc_policy = self._make_manager()

        # store blocks 1, 2, 3, 4
        cpu_manager.prepare_store(to_keys([1, 2, 3, 4]))
        cpu_manager.complete_store(to_keys([1, 2, 3, 4]))

        # promote 3, 4 to T2
        cpu_manager.touch(to_keys([3, 4]))

        # T1 = {1, 2}, T2 = {3, 4}
        # touch [1, 3, 4] - should promote 1 to T2, and move 3,4 to end of T2
        cpu_manager.touch(to_keys([1, 3, 4]))

        # T1 = {2}, T2 = {1, 3, 4} (in that order, with 4 most recent)
        assert len(arc_policy.t1) == 1
        assert len(arc_policy.t2) == 3

        # store block 5, should evict from T1 (block 2, only one in T1)
        prepare_store_output = cpu_manager.prepare_store(to_keys([5]))
        verify_store_output(
            prepare_store_output,
            ExpectedPrepareStoreOutput(
                keys_to_store=[5],
                store_block_ids=[1],  # reuses block 2's storage
                evicted_keys=[2],
            ),
        )

    def test_failed_store(self):
        """
        Tests that failed store operations clean up correctly.
        Similar to LRU test but for ARC.
        """
        cpu_manager, arc_policy = self._make_manager()

        # store blocks 1, 2, 3, 4
        cpu_manager.prepare_store(to_keys([1, 2, 3, 4]))
        cpu_manager.complete_store(to_keys([1, 2, 3, 4]))

        # prepare store block 5 (will evict block 1)
        prepare_store_output = cpu_manager.prepare_store(to_keys([5]))
        assert prepare_store_output is not None
        assert len(prepare_store_output.evicted_keys) == 1

        # complete store with failure
        cpu_manager.complete_store(to_keys([5]), success=False)

        # block 5 should not be in cache
        assert cpu_manager.lookup(to_keys([5])) == 0
        # block 5 should not be in T1 or T2
        assert to_keys([5])[0] not in arc_policy.t1
        assert to_keys([5])[0] not in arc_policy.t2

        # evicted block should still be gone (in B1 ghost list)
        evicted_hash = prepare_store_output.evicted_keys[0]
        assert evicted_hash in arc_policy.b1

    def test_full_scenario(self):
        """
        Comprehensive test covering multiple ARC operations in sequence.
        Similar to the full LRU test but adapted for ARC behavior.
        """
        cpu_manager, arc_policy = self._make_manager()

        # store [1, 2]
        cpu_manager.prepare_store(to_keys([1, 2]))
        cpu_manager.complete_store(to_keys([1, 2]))

        # store [3, 4, 5] -> evicts [1]
        prepare_store_output = cpu_manager.prepare_store(to_keys([3, 4, 5]))
        assert prepare_store_output is not None
        assert len(prepare_store_output.evicted_keys) == 1
        cpu_manager.complete_store(to_keys([3, 4, 5]))

        # promote some blocks to T2
        cpu_manager.touch(to_keys([2, 3]))

        # T1 has {4, 5}, T2 has {2, 3}
        assert len(arc_policy.t1) == 2
        assert len(arc_policy.t2) == 2

        # store [6] -> should evict from T1 (4 is oldest in T1)
        prepare_store_output = cpu_manager.prepare_store(to_keys([6]))
        assert prepare_store_output is not None
        cpu_manager.complete_store(to_keys([6]))

        # verify blocks 2, 3 (in T2) are still present
        assert cpu_manager.lookup(to_keys([2])) == 1
        assert cpu_manager.lookup(to_keys([3])) == 1

        # verify events
        events = list(cpu_manager.take_events())
        assert len(events) > 0  # should have store and eviction events
544
545
546
547


def test_filter_reused_manager():
    """
    Tests FilterReusedOffloadingManager with a CPUOffloadingManager.

    The wrapper is configured with store_threshold=2 and max_tracker_size=3;
    the test checks which keys pass the filter in prepare_store() after a
    given number of lookups, and that the tracker drops entries once full.
    """
    block_size = 256
    lru_manager = CPUOffloadingManager(
        block_size=block_size, num_blocks=4, cache_policy="lru", enable_events=True
    )

    manager = FilterReusedOffloadingManager(
        backing=lru_manager, store_threshold=2, max_tracker_size=3
    )

    # Lookup [1, 2] -> 1st time, added to tracker but not eligible for store yet
    assert manager.lookup(to_keys([1, 2])) == 0

    # prepare store [1, 2] -> should be filtered
    prepare_store_output = manager.prepare_store(to_keys([1, 2]))
    assert prepare_store_output is not None
    assert prepare_store_output.keys_to_store == []

    # Lookup [1] -> 2nd time, eligible now
    assert manager.lookup(to_keys([1])) == 0

    # prepare store [1, 2] -> [1] should be eligible, [2] should be filtered
    prepare_store_output = manager.prepare_store(to_keys([1, 2]))
    assert prepare_store_output is not None
    assert prepare_store_output.keys_to_store == to_keys([1])

    # Lookup [3, 4] -> 1st time
    # (evicts [2] from tracker since max_size is 3 and tracker has [1])
    assert manager.lookup(to_keys([3, 4])) == 0
    # Verify [2] was evicted from the tracker (tracker now has: [1], [3], [4])
    assert to_keys([2])[0] not in manager.counts

    # Lookup [2] again -> (this adds [2] back to the tracker as 1st time)
    assert manager.lookup(to_keys([2])) == 0
    # Verify [2] was re-added with count=1 (not eligible yet)
    assert manager.counts.get(to_keys([2])[0]) == 1

    # prepare store [2] -> should still be filtered out since count was reset
    prepare_store_output = manager.prepare_store(to_keys([2]))
    assert prepare_store_output is not None
    assert prepare_store_output.keys_to_store == []

    # complete the store of [1] that was prepared earlier
    manager.complete_store(to_keys([1]))