test_utils.py 20.5 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
# ruff: noqa
4

5
import hashlib
6
import json
7
import os
8
import pickle
9
10
import tempfile
from pathlib import Path
11
from unittest.mock import patch
12

13
import pytest
14
import torch
15
import yaml
16
from transformers import AutoTokenizer
17
from vllm_test_utils.monitor import monitor
18

19
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
20
from vllm.transformers_utils.detokenizer_utils import convert_ids_list_to_tokens
21

22
from vllm.utils import (
23
24
25
26
    FlexibleArgumentParser,
    bind_kv_cache,
    unique_filepath,
)
27
from vllm.utils.hashing import sha256
28
29
30
31
32
from vllm.utils.torch_utils import (
    common_broadcastable_dtype,
    current_stream,
    is_lossless_cast,
)
33
from vllm.utils.mem_utils import MemorySnapshot, memory_profiling
34
from ..utils import create_new_process_for_each_test, flat_product
35

36

37
38
39
40
# Tests for FlexibleArgumentParser
@pytest.fixture
def parser():
    """Build a ``FlexibleArgumentParser`` with the options used by the CLI tests.

    The registered options cover a choice option, a plain string, an int,
    a boolean flag, and two JSON-typed options (one with a short ``-O`` alias).
    """
    option_specs = (
        (("--image-input-type",), {"choices": ["pixel_values", "image_features"]}),
        (("--model-name",), {}),
        (("--batch-size",), {"type": int}),
        (("--enable-feature",), {"action": "store_true"}),
        (("--hf-overrides",), {"type": json.loads}),
        (("-O", "--compilation-config"), {"type": json.loads}),
    )

    arg_parser = FlexibleArgumentParser()
    for flags, options in option_specs:
        arg_parser.add_argument(*flags, **options)
    return arg_parser


52
53
54
@pytest.fixture
def parser_with_config():
    """Build a ``FlexibleArgumentParser`` shaped like the ``serve`` command line.

    It registers the ``serve`` positional, an optional ``model_tag``
    positional, and the options the config-file tests read back.
    """
    argument_specs = (
        (("serve",), {}),
        (("model_tag",), {"nargs": "?"}),
        (("--model",), {"type": str}),
        (("--served-model-name",), {"type": str}),
        (("--config",), {"type": str}),
        (("--port",), {"type": int}),
        (("--tensor-parallel-size",), {"type": int}),
        (("--trust-remote-code",), {"action": "store_true"}),
    )

    arg_parser = FlexibleArgumentParser()
    for names, options in argument_specs:
        arg_parser.add_argument(*names, **options)
    return arg_parser


66
def test_underscore_to_dash(parser):
    """An option declared with dashes can be passed spelled with underscores."""
    argv = ["--image_input_type", "pixel_values"]
    namespace = parser.parse_args(argv)
    assert namespace.image_input_type == "pixel_values"
69
70
71


def test_mixed_usage(parser):
    """Underscore and dash spellings may be mixed on one command line."""
    argv = [
        "--image_input_type",
        "image_features",
        "--model-name",
        "facebook/opt-125m",
    ]
    namespace = parser.parse_args(argv)

    assert namespace.image_input_type == "image_features"
    assert namespace.model_name == "facebook/opt-125m"
77
78
79
80


def test_with_equals_sign(parser):
    """``--flag=value`` syntax works for both underscore and dash spellings."""
    argv = [
        "--image_input_type=pixel_values",
        "--model-name=facebook/opt-125m",
    ]
    namespace = parser.parse_args(argv)

    assert namespace.image_input_type == "pixel_values"
    assert namespace.model_name == "facebook/opt-125m"
85
86
87


def test_with_int_value(parser):
    """An int-typed option is converted regardless of flag spelling."""
    # Underscore spelling first, then the canonical dash spelling.
    for flag in ("--batch_size", "--batch-size"):
        namespace = parser.parse_args([flag, "32"])
        assert namespace.batch_size == 32


def test_with_bool_flag(parser):
    """A ``store_true`` flag is set by either the underscore or dash spelling."""
    for flag in ("--enable_feature", "--enable-feature"):
        namespace = parser.parse_args([flag])
        assert namespace.enable_feature is True


def test_invalid_choice(parser):
    """A value outside the declared ``choices`` makes the parser exit."""
    bad_argv = ["--image_input_type", "invalid_choice"]
    with pytest.raises(SystemExit):
        parser.parse_args(bad_argv)
104
105
106


def test_missing_required_argument(parser):
    """Parsing an empty command line exits when a required option is missing."""
    # Register the required option on the fixture's parser before parsing.
    parser.add_argument("--required-arg", required=True)

    empty_argv = []
    with pytest.raises(SystemExit):
        parser.parse_args(empty_argv)
110
111


112
def test_cli_override_to_config(parser_with_config, cli_config_file):
    """Explicit CLI options win over the config file, in any argument order."""
    positionals = ["serve", "mymodel"]
    config_opt = ["--config", cli_config_file]
    tp_opt = ["--tensor-parallel-size", "3"]

    # --config before the CLI override.
    namespace = parser_with_config.parse_args(positionals + config_opt + tp_opt)
    assert namespace.tensor_parallel_size == 3

    # --config after the CLI override; the port is not given on the CLI, so
    # 12312 is presumably the value supplied by the config file fixture.
    namespace = parser_with_config.parse_args(positionals + tp_opt + config_opt)
    assert namespace.tensor_parallel_size == 3
    assert namespace.port == 12312

    # A CLI option placed after --config also takes effect.
    namespace = parser_with_config.parse_args(
        positionals + tp_opt + config_opt + ["--port", "666"]
    )
    assert namespace.tensor_parallel_size == 3
    assert namespace.port == 666
136
137


138
def test_config_args(parser_with_config, cli_config_file):
    """Values not given on the CLI are populated when ``--config`` is passed."""
    argv = ["serve", "mymodel", "--config", cli_config_file]
    namespace = parser_with_config.parse_args(argv)

    assert namespace.tensor_parallel_size == 2
    assert namespace.trust_remote_code
144
145
146
147


def test_config_file(parser_with_config):
    """Invalid ``--config`` usages raise the expected exception types."""
    failing_cases = (
        # A YAML path that does not exist.
        (FileNotFoundError, ["serve", "mymodel", "--config", "test_config.yml"]),
        # A .json path is rejected (presumably an unsupported extension).
        (ValueError, ["serve", "mymodel", "--config", "./data/test_config.json"]),
        # --config immediately followed by another flag instead of a path.
        (
            ValueError,
            [
                "serve",
                "mymodel",
                "--tensor-parallel-size",
                "3",
                "--config",
                "--batch-size",
                "32",
            ],
        ),
    )

    for expected_error, argv in failing_cases:
        with pytest.raises(expected_error):
            parser_with_config.parse_args(argv)
169
170


171
def test_no_model_tag(parser_with_config, cli_config_file):
    """``serve`` with a config file but no model tag raises ``ValueError``."""
    argv = ["serve", "--config", cli_config_file]
    with pytest.raises(ValueError):
        parser_with_config.parse_args(argv)
174
175


176
177
178
179
180
def test_dict_args(parser):
    """Dotted ``--opt.key value`` arguments are collected into nested dicts."""
    # fmt: off
    argv = [
        "--model-name=something.something",
        "--hf-overrides.key1", "val1",
        # Nested keys
        "--hf-overrides.key2.key3", "val2",
        "--hf-overrides.key2.key4", "val3",
        # Compilation config entries plus the -O<N> mode shorthand
        "-O.use_inductor=true",
        "-O.backend", "custom",
        "-O1",
        # "=" between key and value
        "--hf-overrides.key5=val4",
        # Underscore spelling of the option name
        "--hf_overrides.key_6", "val5",
        "--hf_overrides.key-7.key_8", "val6",
        # Value type detection
        "--hf_overrides.key9", "100",
        "--hf_overrides.key10", "100.0",
        "--hf_overrides.key11", "true",
        "--hf_overrides.key12.key13", "null",
        # "-" and "." inside a value
        "--hf_overrides.key14.key15", "-minus.and.dot",
        # List-valued entries via the "+" suffix
        "-O.custom_ops+", "-quant_fp8",
        "-O.custom_ops+=+silu_mul,-rms_norm",
    ]
    # fmt: on

    expected_hf_overrides = {
        "key1": "val1",
        "key2": {"key3": "val2", "key4": "val3"},
        "key5": "val4",
        "key_6": "val5",
        "key-7": {"key_8": "val6"},
        "key9": 100,
        "key10": 100.0,
        "key11": True,
        "key12": {"key13": None},
        "key14": {"key15": "-minus.and.dot"},
    }
    expected_compilation_config = {
        "mode": 1,
        "use_inductor": True,
        "backend": "custom",
        "custom_ops": ["-quant_fp8", "+silu_mul", "-rms_norm"],
    }

    namespace = parser.parse_args(argv)

    assert namespace.model_name == "something.something"
    assert namespace.hf_overrides == expected_hf_overrides
    assert namespace.compilation_config == expected_compilation_config


def test_duplicate_dict_args(caplog_vllm, parser):
    """Repeated dict keys keep the last value and log one duplicate message."""
    # fmt: off
    argv = [
        "--model-name=something.something",
        "--hf-overrides.key1", "val1",
        "--hf-overrides.key1", "val2",
        "-O1",
        "-O.mode", "2",
        "-O3",
    ]
    # fmt: on

    namespace = parser.parse_args(argv)

    # The final occurrence of each key is the one that sticks.
    assert namespace.hf_overrides == {"key1": "val2"}
    assert namespace.compilation_config == {"mode": 3}

    # Exactly one log record, mentioning both duplicated keys.
    assert len(caplog_vllm.records) == 1
    logged_text = caplog_vllm.text
    assert "duplicate" in logged_text
    assert "--hf-overrides.key1" in logged_text
    assert "-O.mode" in logged_text
268
269


270
@create_new_process_for_each_test()
def test_memory_profiling():
    """Check ``memory_profiling`` against a scripted sequence of allocations.

    The test fakes model loading and inference memory usage: raw CUDA
    allocations stand in for non-torch memory, torch tensors for weights and
    an activation spike. Memory used outside of torch shows up as CUDA usage
    that torch does not account for.
    """
    from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary

    mib = 1024 * 1024

    cudart = CudaRTLibrary()
    # 512 MiB allocated outside of torch, before the baseline is captured.
    external_handle = cudart.cudaMalloc(512 * mib)

    baseline_snapshot = MemorySnapshot()

    # "Load weights": 128 * 1024 * 1024 float32 values.
    weights = torch.randn(128, 1024, 1024, device="cuda", dtype=torch.float32)
    weights_memory = 128 * 1024 * 1024 * 4  # 512 MiB

    def measure_current_non_torch():
        # Device memory in use minus what torch has reserved.
        free_bytes, total_bytes = torch.cuda.mem_get_info()
        return (total_bytes - free_bytes) - torch.cuda.memory_reserved()

    with (
        memory_profiling(
            baseline_snapshot=baseline_snapshot, weights_memory=weights_memory
        ) as result,
        monitor(measure_current_non_torch) as monitored_values,
    ):
        # Create and release a 1 GiB torch memory spike.
        spike = torch.randn(256, 1024, 1024, device="cuda", dtype=torch.float32)
        del spike

        # Extra 256 MiB of non-torch memory (simulating NCCL).
        nccl_like_handle = cudart.cudaMalloc(256 * mib)

    # Analytic value, expected to be exact: the only non-torch growth inside
    # the profiled region is the 256 MiB allocation above.
    samples = monitored_values.values
    measured_diff = samples[-1] - samples[0]
    assert measured_diff == 256 * mib

    # The profiler's own figure is allowed a 5% tolerance: the CUDA runtime
    # cannot be controlled at byte granularity, which introduces a small
    # error (<10 MiB in practice).
    non_torch_ratio = result.non_torch_increase / (256 * mib)
    assert abs(non_torch_ratio - 1) <= 0.05
    assert result.torch_peak_increase == 1024 * mib

    del weights
    cudart.cudaFree(external_handle)
    cudart.cudaFree(nccl_like_handle)
323
324


325
326
327
328
def test_bind_kv_cache():
    """Each attention layer is bound to the cache tensor at its own index."""
    from vllm.attention import Attention

    num_layers = 4
    ctx = {
        f"layers.{idx}.self_attn": Attention(32, 128, 0.1)
        for idx in range(num_layers)
    }
    kv_cache = [torch.zeros((1,)) for _ in range(num_layers)]

    bind_kv_cache(ctx, [kv_cache])

    for idx in range(num_layers):
        assert ctx[f"layers.{idx}.self_attn"].kv_cache[0] is kv_cache[idx]

346

347
348
349
350
def test_bind_kv_cache_kv_sharing():
    """Layers that share a KV cache are bound to their target layer's tensor."""
    from vllm.attention import Attention

    num_layers = 4
    ctx = {
        f"layers.{idx}.self_attn": Attention(32, 128, 0.1)
        for idx in range(num_layers)
    }
    kv_cache = [torch.zeros((1,)) for _ in range(num_layers)]

    # Layer 2 reuses layer 1's cache; layer 3 reuses layer 0's.
    shared_kv_cache_layers = {
        "layers.2.self_attn": "layers.1.self_attn",
        "layers.3.self_attn": "layers.0.self_attn",
    }

    bind_kv_cache(ctx, [kv_cache], shared_kv_cache_layers)

    # Layer index -> index of the cache tensor it must end up holding.
    expected_cache_index = {0: 0, 1: 1, 2: 1, 3: 0}
    for layer_idx, cache_idx in expected_cache_index.items():
        bound = ctx[f"layers.{layer_idx}.self_attn"].kv_cache[0]
        assert bound is kv_cache[cache_idx]

372

373
374
375
376
377
def test_bind_kv_cache_non_attention():
    """Non-contiguous attention layer indices bind to consecutive cache slots."""
    from vllm.attention import Attention

    # Example taken from Jamba with PP=2: only layers 20 and 28 are attention.
    layer_names = ("model.layers.20.attn", "model.layers.28.attn")
    ctx = {name: Attention(32, 128, 0.1) for name in layer_names}
    kv_cache = [torch.zeros((1,)) for _ in layer_names]

    bind_kv_cache(ctx, [kv_cache])

    for slot, name in enumerate(layer_names):
        assert ctx[name].kv_cache[0] is kv_cache[slot]
388
389
390


def test_bind_kv_cache_pp():
    """With pipeline parallel size 2, a layer gets one cache entry per engine."""
    # This test runs with 1 GPU, but we simulate 2 GPUs while building the
    # config.
    with patch("vllm.utils.torch_utils.cuda_device_count_stateless", lambda: 2):
        parallel_config = ParallelConfig(pipeline_parallel_size=2)
        cfg = VllmConfig(parallel_config=parallel_config)

    with set_current_vllm_config(cfg):
        from vllm.attention import Attention

        layer_name = "layers.0.self_attn"
        ctx = {layer_name: Attention(32, 128, 0.1)}
        kv_cache = [[torch.zeros((1,))], [torch.zeros((1,))]]

        bind_kv_cache(ctx, kv_cache)

        bound = ctx[layer_name].kv_cache
        assert bound[0] is kv_cache[0][0]
        assert bound[1] is kv_cache[1][0]
404
405


406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
@pytest.mark.parametrize(
    ("src_dtype", "tgt_dtype", "expected_result"),
    [
        # Casts across different precision levels
        (torch.bool, torch.int8, True),
        (torch.bool, torch.float16, True),
        (torch.bool, torch.complex32, True),
        (torch.int64, torch.bool, False),
        (torch.int64, torch.float16, True),
        (torch.int64, torch.complex32, True),
        (torch.float64, torch.bool, False),
        (torch.float64, torch.int8, False),
        (torch.float64, torch.complex32, True),
        (torch.complex128, torch.bool, False),
        (torch.complex128, torch.int8, False),
        (torch.complex128, torch.float16, False),
        # Within precision_level=0 (bool)
        (torch.bool, torch.bool, True),
        # Within precision_level=1 (integers)
        (torch.int8, torch.int16, True),
        (torch.int16, torch.int8, False),
        (torch.uint8, torch.int8, False),
        (torch.int8, torch.uint8, False),
        # Within precision_level=2 (floats)
        (torch.float16, torch.float32, True),
        (torch.float32, torch.float16, False),
        (torch.bfloat16, torch.float32, True),
        (torch.float32, torch.bfloat16, False),
        # Within precision_level=3 (complex)
        (torch.complex32, torch.complex64, True),
        (torch.complex64, torch.complex32, False),
    ],
)
def test_is_lossless_cast(src_dtype, tgt_dtype, expected_result):
    """``is_lossless_cast`` returns the expected verdict for each dtype pair."""
    verdict = is_lossless_cast(src_dtype, tgt_dtype)
    assert verdict == expected_result


@pytest.mark.parametrize(
    ("dtypes", "expected_result"),
    [
        ([torch.bool], torch.bool),
        ([torch.bool, torch.int8], torch.int8),
        ([torch.bool, torch.int8, torch.float16], torch.float16),
        (
            [torch.bool, torch.int8, torch.float16, torch.complex32],
            torch.complex32,
        ),
    ],
)
def test_common_broadcastable_dtype(dtypes, expected_result):
    """``common_broadcastable_dtype`` returns the expected dtype for each list."""
    chosen_dtype = common_broadcastable_dtype(dtypes)
    assert chosen_dtype == expected_result


456
457
458
def test_model_specification(
    parser_with_config, cli_config_file, cli_config_file_with_model
):
    """Exercise how the model is resolved from positional, config and --model."""
    parse = parser_with_config.parse_args

    # A model given on the CLI takes precedence over the config file.
    namespace = parse(["serve", "cli-model", "--config", cli_config_file_with_model])
    assert namespace.model_tag == "cli-model"
    assert namespace.served_model_name == "mymodel"

    # A model coming only from the config file works.
    namespace = parse(["serve", "--config", cli_config_file_with_model])
    assert namespace.model == "config-model"
    assert namespace.served_model_name == "mymodel"

    # No model specified anywhere raises an error.
    with pytest.raises(ValueError, match="No model specified!"):
        parse(["serve", "--config", cli_config_file])

    # Test using --model option raises error
    # with pytest.raises(
    #         ValueError,
    #         match=
    #     ("With `vllm serve`, you should provide the model as a positional "
    #      "argument or in a config file instead of via the `--model` option."),
    # ):
    #     parser_with_config.parse_args(['serve', '--model', 'my-model'])

    # Back-compatibility for the --model option
    # (when back-compatibility ends, the commented-out test above should be
    # enabled and the check below removed). Both the space-separated and the
    # "=" forms are covered.
    back_compat_argvs = (
        [
            "serve",
            "--tensor-parallel-size",
            "2",
            "--model",
            "my-model",
            "--trust-remote-code",
            "--port",
            "8001",
        ],
        [
            "serve",
            "--tensor-parallel-size=2",
            "--model=my-model",
            "--trust-remote-code",
            "--port=8001",
        ],
    )
    for argv in back_compat_argvs:
        namespace = parse(argv)
        assert namespace.model is None
        assert namespace.tensor_parallel_size == 2
        assert namespace.trust_remote_code is True
        assert namespace.port == 8001

    # Other config values are preserved when the model comes from the CLI.
    namespace = parse(["serve", "cli-model", "--config", cli_config_file_with_model])
    assert namespace.tensor_parallel_size == 2
    assert namespace.trust_remote_code is True
    assert namespace.port == 12312


538
@pytest.mark.parametrize("input", [(), ("abc",), (None,), (None, bool, [1, 2, 3])])
def test_sha256(input: tuple):
    """``sha256`` yields the SHA-256 digest of the pickled input, as bytes."""
    actual = sha256(input)

    # Basic shape of the result: a non-empty bytes object.
    assert actual is not None
    assert isinstance(actual, bytes)
    assert actual != b""

    # The digest matches hashing the highest-protocol pickle of the input.
    pickled = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL)
    reference = hashlib.sha256(pickled).digest()
    assert actual == reference

    # Hashing the same input again gives the same value.
    assert actual == sha256(input)

    # Hashing a different input gives a different value.
    extended_input = input + (1,)
    assert actual != sha256(extended_input)
553
554


555
556
557
558
def test_convert_ids_list_to_tokens():
    """``convert_ids_list_to_tokens`` returns readable text for each token id."""
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
    token_ids = tokenizer.encode("Hello, world!")  # [9707, 11, 1879, 0]

    # The tokenizer's own conversion keeps the raw "Ġ" space marker ...
    raw_tokens = tokenizer.convert_ids_to_tokens(token_ids)
    assert raw_tokens == ["Hello", ",", "Ġworld", "!"]

    # ... whereas the helper under test yields a real leading space.
    readable_tokens = convert_ids_list_to_tokens(tokenizer, token_ids)
    assert readable_tokens == ["Hello", ",", " world", "!"]
562
563
564
565


def test_current_stream_multithread():
    """Check ``current_stream()`` in the main thread while a child uses another.

    A child thread enters a ``torch.cuda.stream(child_stream)`` context and
    stays there while the main thread calls ``current_stream()``. The main
    thread must still see its own default stream, not the child's.

    Skipped when CUDA is not available.
    """
    import threading

    if not torch.cuda.is_available():
        pytest.skip("CUDA not available")

    main_default_stream = torch.cuda.current_stream()
    child_stream = torch.cuda.Stream()

    thread_stream_ready = threading.Event()
    thread_can_exit = threading.Event()

    def child_thread_func():
        # Hold the child stream context open until the main thread has
        # finished its checks (or the safety timeout elapses).
        with torch.cuda.stream(child_stream):
            thread_stream_ready.set()
            thread_can_exit.wait(timeout=10)

    child_thread = threading.Thread(target=child_thread_func)
    child_thread.start()

    try:
        assert thread_stream_ready.wait(timeout=5), (
            "Child thread failed to enter stream context in time"
        )

        main_current_stream = current_stream()

        assert main_current_stream != child_stream, (
            "Main thread's current_stream was contaminated by child thread"
        )
        assert main_current_stream == main_default_stream, (
            "Main thread's current_stream is not the default stream"
        )
    finally:
        # Always release the child, including when an assertion above failed.
        # Otherwise the child would sit in its 10 s wait, the 5 s join below
        # would time out, and the resulting pytest.fail would hide the real
        # assertion failure.
        thread_can_exit.set()
        child_thread.join(timeout=5)
        if child_thread.is_alive():
            pytest.fail("Child thread failed to exit properly")
606
607
608
609
610
611
612
613


def test_load_config_file(tmp_path):
    """``load_config_file`` turns a YAML mapping into a flat argv-style list.

    As the expected list shows, a ``True`` value becomes a bare flag, a list
    becomes the flag followed by its items, and numbers become strings.
    """
    config_data = {
        "enable-logging": True,
        "list-arg": ["item1", "item2"],
        "port": 12323,
        "tensor-parallel-size": 4,
    }
    expected_args = [
        "--enable-logging",
        "--list-arg",
        "item1",
        "item2",
        "--port",
        "12323",
        "--tensor-parallel-size",
        "4",
    ]

    # Serialize the mapping to a temporary YAML file.
    config_file_path = tmp_path / "config.yaml"
    config_file_path.write_text(yaml.dump(config_data))

    # Load it back through a fresh parser.
    arg_parser = FlexibleArgumentParser()
    processed_args = arg_parser.load_config_file(str(config_file_path))

    assert processed_args == expected_args
    os.remove(str(config_file_path))
643
644
645
646
647
648
649
650
651
652
653
654


def test_unique_filepath():
    """``unique_filepath`` returns a fresh path on every call.

    Each returned path is written to before the next call, so ten calls must
    yield ten distinct paths and leave ten ``.txt`` files in the directory.
    The directory is created with ``tempfile.TemporaryDirectory`` so it is
    removed when the test finishes, whether it passes or fails.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        base_dir = Path(temp_dir)

        def path_fn(i):
            # Candidate path for index ``i``; presumably probed by
            # ``unique_filepath`` until an unused one is found.
            return base_dir / f"file_{i}.txt"

        paths = set()
        for _ in range(10):
            path = unique_filepath(path_fn)
            path.write_text("test")
            paths.add(path)

        assert len(paths) == 10
        assert len(list(base_dir.glob("*.txt"))) == 10
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676


def test_flat_product():
    """``flat_product`` acts like ``itertools.product`` but flattens tuples."""
    # With scalar items the output matches a regular cartesian product.
    scalar_product = list(flat_product([1, 2, 3], ["a", "b"]))
    expected_scalar = [
        (1, "a"),
        (1, "b"),
        (2, "a"),
        (2, "b"),
        (3, "a"),
        (3, "b"),
    ]
    assert scalar_product == expected_scalar

    # Tuple items are spliced into the resulting tuple rather than nested.
    flattened_product = list(flat_product([(1, 2), (3, 4)], ["a", "b"], [(5, 6)]))
    expected_flattened = [
        (1, 2, "a", 5, 6),
        (1, 2, "b", 5, 6),
        (3, 4, "a", 5, 6),
        (3, 4, "b", 5, 6),
    ]
    assert flattened_product == expected_flattened