"lib/llm/benches/tokenizer.rs" did not exist on "f242b4552b8ae37d0a3c2a4f0438e57d6f4240f3"
test_cpu_gpu.py 5.92 KB
Newer Older
1
2
3
4
5
6
7
8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
import time

import pytest
import torch

9
from vllm.utils.torch_utils import set_random_seed
10
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
11
12
13
14
15
from vllm.v1.kv_offload.spec import (
    CanonicalKVCacheRef,
    CanonicalKVCaches,
    CanonicalKVCacheTensor,
)
16
from vllm.v1.kv_offload.worker.cpu_gpu import CpuGpuOffloadingHandlers
17
18
19

# Parametrization values for test_transfer. Each list is fed to one
# @pytest.mark.parametrize decorator, so the test runs over their product.

# number of blocks in each GPU tensor
NUM_GPU_BLOCKS = [64]
# number of blocks handed to the CPU side of the offloading handlers
NUM_CPU_BLOCKS = [256]
# size in bytes of a single GPU page (one row of each GPU tensor)
GPU_PAGE_SIZES = [512, 1024]
# number of GPU-page-sized sub-blocks that make up one CPU block
BLOCK_SIZE_FACTORS = [1, 3]
# number of GPU KV-cache tensors built for the test
NUM_TENSORS = [4]
# seeds passed to set_random_seed
SEEDS = [0]
# devices on which the GPU tensors are allocated
CUDA_DEVICES = ["cuda:0"]
# number of CPU blocks selected for a single transfer
NUM_MAPPINGS = [3]


@pytest.mark.parametrize("gpu_to_cpu", [True, False])
@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
@pytest.mark.parametrize("gpu_page_size_bytes", GPU_PAGE_SIZES)
@pytest.mark.parametrize("block_size_factor", BLOCK_SIZE_FACTORS)
@pytest.mark.parametrize("num_gpu_blocks", NUM_GPU_BLOCKS)
@pytest.mark.parametrize("num_cpu_blocks", NUM_CPU_BLOCKS)
@pytest.mark.parametrize("num_tensors", NUM_TENSORS)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_transfer(
    default_vllm_config,
    gpu_to_cpu: bool,
    num_mappings: int,
    gpu_page_size_bytes: int,
    block_size_factor: int,
    num_gpu_blocks: int,
    num_cpu_blocks: int,
    num_tensors: int,
    seed: int,
    device: str,
) -> None:
    """Submit one CPU<->GPU offloading transfer and verify what it copied.

    The test builds ``num_tensors`` random int8 GPU tensors, wraps them in a
    ``CanonicalKVCaches`` with a single group, creates
    ``CpuGpuOffloadingHandlers`` on top of it and submits a single transfer
    job (job id 1) in the direction selected by ``gpu_to_cpu``. After the job
    is reported as finished it checks that:

    * the reported result fields (job id, success, transfer type, size and
      time) match the request,
    * the source tensors were not modified,
    * every mapped destination sub-block equals its source sub-block and
      every unmapped destination sub-block kept its previous content.

    Args:
        default_vllm_config: pytest fixture requested by name; not used
            directly in the body.
        gpu_to_cpu: True to transfer GPU -> CPU, False for CPU -> GPU.
        num_mappings: number of CPU blocks taking part in the transfer.
        gpu_page_size_bytes: size in bytes of one GPU page (tensor row).
        block_size_factor: number of GPU pages per CPU block.
        num_gpu_blocks: number of rows (blocks) in each GPU tensor.
        num_cpu_blocks: number of CPU blocks given to the handlers.
        num_tensors: number of GPU tensors to create.
        seed: value passed to ``set_random_seed``.
        device: torch device string for the GPU tensors.
    """
    set_random_seed(seed)

    # build CanonicalKVCacheTensor list: one per tensor
    kv_cache_tensors: list[CanonicalKVCacheTensor] = [
        CanonicalKVCacheTensor(
            tensor=torch.randint(
                -128,
                127,
                (num_gpu_blocks, gpu_page_size_bytes),
                dtype=torch.int8,
                device=device,
            ),
            page_size_bytes=gpu_page_size_bytes,
        )
        for _ in range(num_tensors)
    ]

    # one group containing all tensors, one data ref per tensor
    kv_cache_groups_data_refs: list[list[CanonicalKVCacheRef]] = [
        [
            CanonicalKVCacheRef(
                tensor_idx=i,
                page_size_bytes=gpu_page_size_bytes,
            )
            for i in range(num_tensors)
        ]
    ]

    kv_caches = CanonicalKVCaches(
        tensors=kv_cache_tensors,
        group_data_refs=kv_cache_groups_data_refs,
    )
    handlers = CpuGpuOffloadingHandlers(
        kv_caches=kv_caches,
        block_size_factor=block_size_factor,
        num_cpu_blocks=num_cpu_blocks,
    )

    # select block mappings
    gpu_blocks = random.sample(range(num_gpu_blocks), num_mappings * block_size_factor)
    cpu_blocks = random.sample(range(num_cpu_blocks), num_mappings)

    # expand cpu blocks to gpu-page granularity for uniform comparison:
    # each cpu block maps to block_size_factor consecutive sub-blocks
    cpu_blocks_expanded = [
        cpu_block * block_size_factor + j
        for cpu_block in cpu_blocks
        for j in range(block_size_factor)
    ]

    # maybe skip some GPU blocks to test reading from the middle of a CPU block
    if not gpu_to_cpu:
        blocks_to_skip = block_size_factor - 1
        gpu_blocks = gpu_blocks[blocks_to_skip:]
        cpu_blocks_expanded = cpu_blocks_expanded[blocks_to_skip:]

    # set transfer direction
    if gpu_to_cpu:
        handler = handlers.gpu_to_cpu_handler
        src_spec = GPULoadStoreSpec(gpu_blocks, group_sizes=(len(gpu_blocks),))
        dst_spec = CPULoadStoreSpec(cpu_blocks)
        dst_to_src = dict(zip(cpu_blocks_expanded, gpu_blocks))
        num_dst_sub_blocks = num_cpu_blocks * block_size_factor
        expected_transfer_type = ("GPU", "CPU")
    else:
        handler = handlers.cpu_to_gpu_handler
        src_spec = CPULoadStoreSpec(cpu_blocks)
        dst_spec = GPULoadStoreSpec(gpu_blocks, group_sizes=(len(gpu_blocks),))
        dst_to_src = dict(zip(gpu_blocks, cpu_blocks_expanded))
        num_dst_sub_blocks = num_gpu_blocks
        expected_transfer_type = ("CPU", "GPU")

    # clone src and dst tensors before transfer
    orig_src_tensors = [x.clone() for x in handler.src_tensors]
    orig_dst_tensors = [x.clone() for x in handler.dst_tensors]

    # call transfer function
    start_time = time.time()
    assert handler.transfer_async(1, (src_spec, dst_spec))
    assert {x.job_id for x in handler._transfers} == {1}

    # wait for transfer to complete
    end_time = time.time() + 10
    while time.time() < end_time:
        finished = handler.get_finished()
        if finished:
            result = finished[0]
            assert result.job_id == 1
            assert result.success
            # Compare against a precomputed value: writing the conditional
            # inline (`a == x if cond else y`) would assert the bare tuple
            # `y` in the CPU->GPU case, which is always truthy.
            assert result.transfer_type == expected_transfer_type
            assert result.transfer_size == (
                len(gpu_blocks) * handler.group_block_size_in_bytes[0]
            )
            assert result.transfer_time > 0
            assert result.transfer_time < (time.time() - start_time)
            break
        time.sleep(0.1)
    else:
        # the loop ended without `break`: no finished transfer was reported
        pytest.fail("transfer did not complete within 10 seconds")

    # verify src tensors did not change
    for orig_tensor, tensor in zip(orig_src_tensors, handler.src_tensors):
        assert torch.equal(orig_tensor, tensor)

    # verify dst tensors at gpu-page granularity.
    for src_tensor, dst_tensor, orig_dst_tensor in zip(
        handler.src_tensors,
        handler.dst_tensors,
        orig_dst_tensors,
    ):
        # view both GPU and CPU tensors as (n, gpu_page_size_bytes) for comparison.
        src_view = src_tensor.view(-1, gpu_page_size_bytes)
        dst_view = dst_tensor.view(-1, gpu_page_size_bytes)
        orig_dst_view = orig_dst_tensor.view(-1, gpu_page_size_bytes)
        for dst_sub_block in range(num_dst_sub_blocks):
            src_sub_block = dst_to_src.get(dst_sub_block)
            if src_sub_block is not None:
                # mapped sub-block: must now hold the source data
                expected = src_view[src_sub_block]
            else:
                # unmapped sub-block: must be untouched by the transfer
                expected = orig_dst_view[dst_sub_block]
            torch.testing.assert_close(dst_view[dst_sub_block].cpu(), expected.cpu())