test_cache.py 11 KB
Newer Older
Woosuk Kwon's avatar
Woosuk Kwon committed
1
import random
2
from typing import Tuple
Woosuk Kwon's avatar
Woosuk Kwon committed
3

4
import pytest
Woosuk Kwon's avatar
Woosuk Kwon committed
5
6
import torch

7
from vllm import _custom_ops as ops
8
from vllm.utils import is_hip
Woosuk Kwon's avatar
Woosuk Kwon committed
9

Vladimir's avatar
Vladimir committed
10
# Parameter grids consumed by the pytest.mark.parametrize decorators below.
COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')]
DTYPES = [torch.half, torch.bfloat16, torch.float]
NUM_TOKENS = [42]  # Arbitrary values for testing
NUM_LAYERS = [1]  # Arbitrary values for testing
NUM_HEADS = [8]  # Arbitrary values for testing
HEAD_SIZES = [64, 80, 96, 112, 128, 256]
BLOCK_SIZES = [8, 16, 32]

# Arbitrary values for testing
# don't make it too large. e.g. [1024, 36000] will OOM
NUM_BLOCKS = [1024, 10000]

NUM_MAPPINGS = [256]  # Arbitrary values for testing
SEEDS = [0]
# "cuda:0" only when exactly one device is reported; otherwise "cuda:0" and
# "cuda:1".
CUDA_DEVICES = [
    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
]
# Merge conflict resolved in favour of the "fp8" spelling: the tests in this
# file compare kv_cache_dtype against "fp8", so the "fp8_e5m2" spelling would
# never reach their fp8-specific branches.
KV_CACHE_DTYPE = ["auto", "fp8"]
32
33
34
35
36
37
38
39
40
41


@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
@pytest.mark.parametrize("num_layers", NUM_LAYERS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
@torch.inference_mode()
def test_copy_blocks(
    kv_cache_factory,
    num_mappings: int,
    num_layers: int,
    num_heads: int,
    head_size: int,
    block_size: int,
    num_blocks: int,
    dtype: torch.dtype,
    seed: int,
    kv_cache_dtype: str,
    device: str,
) -> None:
    """Compare ``ops.copy_blocks`` with a block-by-block ``copy_`` reference.

    Every source block is mapped to two destination blocks. The kernel runs
    on the caches returned by ``kv_cache_factory`` while the reference copy
    runs on clones taken beforehand; the two must then match.
    """
    random.seed(seed)
    torch.random.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    torch.set_default_device(device)

    # Generate random block mappings where each source block is mapped to two
    # destination blocks. Sources take num_mappings blocks and destinations
    # need 2 * num_mappings more from the remainder, so 3 * num_mappings
    # distinct blocks are required in total.
    assert 3 * num_mappings <= num_blocks
    src_blocks = random.sample(range(num_blocks), num_mappings)
    remaining_blocks = list(set(range(num_blocks)) - set(src_blocks))
    dst_blocks = random.sample(remaining_blocks, 2 * num_mappings)
    block_mapping = {
        src: [dst_blocks[2 * i], dst_blocks[2 * i + 1]]
        for i, src in enumerate(src_blocks)
    }

    # Create the KV caches.
    key_caches, value_caches = kv_cache_factory(num_blocks, block_size,
                                                num_layers, num_heads,
                                                head_size, kv_cache_dtype,
                                                dtype, seed, device)

    # Clone the KV caches.
    cloned_key_caches = [key_cache.clone() for key_cache in key_caches]
    cloned_value_caches = [value_cache.clone() for value_cache in value_caches]

    # Call the copy blocks kernel.
    ops.copy_blocks(key_caches, value_caches, block_mapping)

    # Run the reference implementation.
    for src, dsts in block_mapping.items():
        for dst in dsts:
            for cloned_key_cache in cloned_key_caches:
                cloned_key_cache[dst].copy_(cloned_key_cache[src])
            for cloned_value_cache in cloned_value_caches:
                cloned_value_cache[dst].copy_(cloned_value_cache[src])

    # Compare the results.
    for key_cache, cloned_key_cache in zip(key_caches, cloned_key_caches):
        assert torch.allclose(key_cache, cloned_key_cache)
    for value_cache, cloned_value_cache in zip(value_caches,
                                               cloned_value_caches):
        assert torch.allclose(value_cache, cloned_value_cache)


105
106
107
108
109
110
111
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
@torch.inference_mode()
def test_reshape_and_cache(
    kv_cache_factory,
    num_tokens: int,
    num_heads: int,
    head_size: int,
    block_size: int,
    num_blocks: int,
    dtype: torch.dtype,
    seed: int,
    device: str,
    kv_cache_dtype: str,
) -> None:
    """Compare ``ops.reshape_and_cache`` with a per-token reference write.

    Random keys and values are written into randomly chosen, distinct slots.
    The reference writes the same tokens into clones of the caches using
    plain indexing. For ``"fp8"`` both sides are first converted to float16
    with ``ops.convert_fp8`` and compared with a loose tolerance.
    """
    if not is_hip() and kv_cache_dtype == "fp8":
        pytest.skip("This test is not tuned for e5m2 cuda precision")
    random.seed(seed)
    torch.random.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    torch.set_default_device(device)

    # Create a random slot mapping.
    num_slots = block_size * num_blocks
    slot_mapping = random.sample(range(num_slots), num_tokens)
    slot_mapping = torch.tensor(slot_mapping, dtype=torch.long)

    qkv = torch.randn(num_tokens, 3, num_heads, head_size, dtype=dtype)
    _, key, value = qkv.unbind(dim=1)

    # Create the KV caches.
    key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 1,
                                                num_heads, head_size,
                                                kv_cache_dtype, dtype, seed,
                                                device)
    key_cache, value_cache = key_caches[0], value_caches[0]

    # Clone the KV caches.
    if kv_cache_dtype == "fp8":
        cloned_key_cache = torch.empty_like(key_cache, dtype=torch.float16)
        ops.convert_fp8(key_cache, cloned_key_cache)
        cloned_value_cache = torch.empty_like(value_cache, dtype=torch.float16)
        ops.convert_fp8(value_cache, cloned_value_cache)
    else:
        cloned_key_cache = key_cache.clone()
        cloned_value_cache = value_cache.clone()

    # Using default kv_scale
    kv_scale = 1.0

    # Call the reshape_and_cache kernel.
    ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping,
                          kv_cache_dtype, kv_scale)

    if kv_cache_dtype == "fp8":
        result_key_cache = torch.empty_like(key_cache, dtype=torch.float16)
        ops.convert_fp8(key_cache, result_key_cache)
        result_value_cache = torch.empty_like(value_cache, dtype=torch.float16)
        ops.convert_fp8(value_cache, result_value_cache)

    # Run the reference implementation.
    reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape)
    block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor")
    block_indices = block_indices.cpu().tolist()
    block_offsets = slot_mapping % block_size
    block_offsets = block_offsets.cpu().tolist()
    for i in range(num_tokens):
        block_idx = block_indices[i]
        block_offset = block_offsets[i]
        cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i]
        cloned_value_cache[block_idx, :, :, block_offset] = value[i]

    if kv_cache_dtype == "fp8":
        assert torch.allclose(result_key_cache,
                              cloned_key_cache,
                              atol=0.001,
                              rtol=0.1)
        assert torch.allclose(result_value_cache,
                              cloned_value_cache,
                              atol=0.001,
                              rtol=0.1)
    else:
        assert torch.allclose(key_cache, cloned_key_cache)
        assert torch.allclose(value_cache, cloned_value_cache)
Vladimir's avatar
Vladimir committed
196
197
198
199
200
201
202
203
204
205


@pytest.mark.parametrize("direction", COPYING_DIRECTION)
@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
@torch.inference_mode()
def test_swap_blocks(
    kv_cache_factory,
    direction: Tuple[str, str],
    num_mappings: int,
    num_heads: int,
    head_size: int,
    block_size: int,
    num_blocks: int,
    dtype: torch.dtype,
    seed: int,
    device: str,
    kv_cache_dtype: str,
) -> None:
    """Check that ``ops.swap_blocks`` copies each mapped source block.

    ``direction`` is a ``(source, destination)`` pair of ``"cuda"`` /
    ``"cpu"``; ``"cuda"`` is replaced by the parametrized ``device``. After
    the swap, every destination block must equal a clone of its source block
    taken before the call.
    """
    if kv_cache_dtype == "fp8" and "cpu" in direction:
        pytest.skip("fp8 KV cache is not tested with a CPU endpoint")
    if not is_hip() and kv_cache_dtype == "fp8":
        pytest.skip("This test is not tuned for e5m2 cuda precision")
    random.seed(seed)
    torch.random.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

    src_device = device if direction[0] == "cuda" else 'cpu'
    dst_device = device if direction[1] == "cuda" else 'cpu'

    src_blocks = random.sample(range(num_blocks), num_mappings)
    # For the same device, mapping must not overlap
    if src_device == dst_device:
        remaining_blocks = list(set(range(num_blocks)) - set(src_blocks))
        dst_blocks = random.sample(remaining_blocks, num_mappings)
    else:
        dst_blocks = random.sample(range(num_blocks), num_mappings)

    block_mapping = dict(zip(src_blocks, dst_blocks))

    # Create the KV caches on the first device.
    src_key_caches, src_value_caches = kv_cache_factory(
        num_blocks, block_size, 1, num_heads, head_size, kv_cache_dtype, dtype,
        seed, src_device)

    # Create the KV caches on the second device.
    dst_key_caches, dst_value_caches = kv_cache_factory(
        num_blocks, block_size, 1, num_heads, head_size, kv_cache_dtype, dtype,
        seed, dst_device)

    # Snapshot the sources before the kernel runs.
    src_key_caches_clone = src_key_caches[0].clone()
    src_value_caches_clone = src_value_caches[0].clone()

    # Call the swap_blocks kernel.
    ops.swap_blocks(src_key_caches[0], dst_key_caches[0], block_mapping)
    ops.swap_blocks(src_value_caches[0], dst_value_caches[0], block_mapping)

    # Compare on the CPU so both operands are on the same device.
    for src, dst in block_mapping.items():
        assert torch.allclose(src_key_caches_clone[src].cpu(),
                              dst_key_caches[0][dst].cpu())
        assert torch.allclose(src_value_caches_clone[src].cpu(),
                              dst_value_caches[0][dst].cpu())
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296


@pytest.mark.skipif(not is_hip(), reason="FP8 conversion test requires e4m3")
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_fp8_conversion(
    num_heads: int,
    head_size: int,
    block_size: int,
    num_blocks: int,
    dtype: torch.dtype,
    seed: int,
    device: str,
) -> None:
    """Round-trip a random cache through ``ops.convert_fp8`` and compare.

    A tensor of shape ``(num_blocks, num_heads, head_size, block_size)`` is
    filled uniformly from [-224, 224], converted into a uint8 buffer, then
    converted back into a tensor of the original dtype. The result must match
    the input within ``atol=0.001`` / ``rtol=0.1``.
    """
    random.seed(seed)
    torch.random.manual_seed(seed)
    # Guarded like the other tests in this file.
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

    low = -224.0
    high = 224.0
    shape = (num_blocks, num_heads, head_size, block_size)
    cache = torch.empty(shape, dtype=dtype, device=device)
    cache.uniform_(low, high)

    # Original dtype -> uint8 buffer.
    cache_fp8 = torch.empty_like(cache, dtype=torch.uint8)
    ops.convert_fp8(cache, cache_fp8)

    # uint8 buffer -> original dtype.
    converted_cache = torch.empty_like(cache)
    ops.convert_fp8(cache_fp8, converted_cache)

    assert torch.allclose(cache, converted_cache, atol=0.001, rtol=0.1)