"vscode:/vscode.git/clone" did not exist on "0b2c14e39c4eb7fe1540097c56312a702f7b0797"
test_utils.py 30 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
# ruff: noqa
4

5
import asyncio
6
import hashlib
7
import json
8
import os
9
import pickle
10
import socket
11
import tempfile
12
from collections.abc import AsyncIterator
13
from pathlib import Path
14
from unittest.mock import patch
15

16
import pytest
17
import torch
18
import yaml
19
import zmq
20
from transformers import AutoTokenizer
21
from vllm_test_utils.monitor import monitor
22

23
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
24
from vllm.transformers_utils.detokenizer_utils import convert_ids_list_to_tokens
25

26
from vllm.utils import (
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
    FlexibleArgumentParser,
    MemorySnapshot,
    PlaceholderModule,
    bind_kv_cache,
    common_broadcastable_dtype,
    current_stream,
    deprecate_kwargs,
    get_open_port,
    get_tcp_uri,
    is_lossless_cast,
    join_host_port,
    make_zmq_path,
    make_zmq_socket,
    memory_profiling,
    merge_async_iterators,
    sha256,
    split_host_port,
    split_zmq_path,
    supports_kw,
    swap_dict_values,
    unique_filepath,
)

50
from ..utils import create_new_process_for_each_test, error_on_warning
51

52
53
54

@pytest.mark.asyncio
async def test_merge_async_iterators():
    """Cancel the consumer of a merged stream and probe each source afterwards.

    Three endless generators are merged and consumed for half a second, then
    the consuming task is cancelled. Each source is then advanced once:
    ``StopAsyncIteration`` is treated as the normal outcome, any other
    exception fails the test.
    """

    async def endless_source(idx: int):
        # Yields forever; when cancelled it reports and returns, which
        # finishes the generator.
        try:
            while True:
                yield f"item from iterator {idx}"
                await asyncio.sleep(0.1)
        except asyncio.CancelledError:
            print(f"iterator {idx} cancelled")

    async def consume(generator: AsyncIterator[tuple[int, str]]):
        async for idx, output in generator:
            print(f"idx: {idx}, output: {output}")

    sources = [endless_source(i) for i in range(3)]
    merged = merge_async_iterators(*sources)

    consumer = asyncio.create_task(consume(merged))
    await asyncio.sleep(0.5)
    consumer.cancel()
    with pytest.raises(asyncio.CancelledError):
        await consumer

    for source in sources:
        try:
            # Can use anext() in python >= 3.10
            await asyncio.wait_for(source.__anext__(), 1)
        except StopAsyncIteration:
            # All iterators should be cancelled and print this message.
            print("Iterator was cancelled normally")
        except (Exception, asyncio.CancelledError) as e:
            raise AssertionError() from e

86
87
88
89
90
91
92
93
94

def test_deprecate_kwargs_always():
    """With ``is_deprecated=True`` passing ``old_arg`` warns; ``new_arg`` does not."""

    @deprecate_kwargs("old_arg", is_deprecated=True)
    def target(*, old_arg: object = None, new_arg: object = None):
        pass

    # The warning text is expected to mention the deprecated keyword by name.
    with pytest.warns(DeprecationWarning, match="'old_arg'"):
        target(old_arg=1)

    # No DeprecationWarning is expected for the replacement keyword.
    with error_on_warning(DeprecationWarning):
        target(new_arg=1)


def test_deprecate_kwargs_never():
    """With ``is_deprecated=False`` neither keyword is expected to warn."""

    @deprecate_kwargs("old_arg", is_deprecated=False)
    def target(*, old_arg: object = None, new_arg: object = None):
        pass

    # Exercise the nominally deprecated keyword first, then the new one.
    for call_kwargs in ({"old_arg": 1}, {"new_arg": 1}):
        with error_on_warning(DeprecationWarning):
            target(**call_kwargs)


def test_deprecate_kwargs_dynamic():
    """``is_deprecated`` may be a callable that is consulted on each call."""
    # Read by the lambda below through the closure; flipped midway.
    is_deprecated = True

    @deprecate_kwargs("old_arg", is_deprecated=lambda: is_deprecated)
    def target(*, old_arg: object = None, new_arg: object = None):
        pass

    # While the flag is on, only the old keyword warns.
    with pytest.warns(DeprecationWarning, match="'old_arg'"):
        target(old_arg=1)

    with error_on_warning(DeprecationWarning):
        target(new_arg=1)

    is_deprecated = False

    # With the flag off, no DeprecationWarning is expected for either keyword.
    for call_kwargs in ({"old_arg": 1}, {"new_arg": 1}):
        with error_on_warning(DeprecationWarning):
            target(**call_kwargs)


def test_deprecate_kwargs_additional_message():
    """The ``additional_message`` text shows up in the emitted warning."""

    @deprecate_kwargs("old_arg", is_deprecated=True, additional_message="abcd")
    def target(*, old_arg: object = None, new_arg: object = None):
        pass

    with pytest.warns(DeprecationWarning, match="abcd"):
        target(old_arg=1)
140
141


142
143
144
145
146
147
148
149
150
151
def test_get_open_port(monkeypatch: pytest.MonkeyPatch):
    """Three ports from ``get_open_port`` can be bound at once with VLLM_PORT set."""
    with monkeypatch.context() as patched_env:
        patched_env.setenv("VLLM_PORT", "5678")
        # make sure we can get multiple ports, even if the env var is set
        with (
            socket.socket(socket.AF_INET, socket.SOCK_STREAM) as first,
            socket.socket(socket.AF_INET, socket.SOCK_STREAM) as second,
            socket.socket(socket.AF_INET, socket.SOCK_STREAM) as third,
        ):
            # Bind one after another so every earlier port is still held
            # when the next one is requested.
            for sock in (first, second, third):
                sock.bind(("localhost", get_open_port()))
152
153
154
155
156
157


# Tests for FlexibleArgumentParser
@pytest.fixture
def parser():
    """Return a ``FlexibleArgumentParser`` with the options used by the CLI tests."""
    flex_parser = FlexibleArgumentParser()
    flex_parser.add_argument(
        "--image-input-type", choices=["pixel_values", "image_features"]
    )
    flex_parser.add_argument("--model-name")
    flex_parser.add_argument("--batch-size", type=int)
    flex_parser.add_argument("--enable-feature", action="store_true")
    # Both dict-valued options take their value as a JSON document.
    for flags in (("--hf-overrides",), ("-O", "--compilation-config")):
        flex_parser.add_argument(*flags, type=json.loads)
    return flex_parser


169
170
171
@pytest.fixture
def parser_with_config():
    """Return a ``FlexibleArgumentParser`` shaped like a ``serve`` command line."""
    cli = FlexibleArgumentParser()

    # Positionals: the sub-command word and an optional model tag.
    cli.add_argument("serve")
    cli.add_argument("model_tag", nargs="?")

    # Options, registered in a fixed order.
    option_specs = (
        ("--model", {"type": str}),
        ("--served-model-name", {"type": str}),
        ("--config", {"type": str}),
        ("--port", {"type": int}),
        ("--tensor-parallel-size", {"type": int}),
        ("--trust-remote-code", {"action": "store_true"}),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    return cli


183
def test_underscore_to_dash(parser):
    """An option spelled with underscores is accepted for a dashed definition."""
    argv = ["--image_input_type", "pixel_values"]
    parsed = parser.parse_args(argv)
    assert parsed.image_input_type == "pixel_values"
186
187
188


def test_mixed_usage(parser):
    """Underscore and dash spellings can be mixed on one command line."""
    argv = ["--image_input_type", "image_features"]
    argv += ["--model-name", "facebook/opt-125m"]
    parsed = parser.parse_args(argv)
    assert (parsed.image_input_type, parsed.model_name) == (
        "image_features",
        "facebook/opt-125m",
    )
194
195
196
197


def test_with_equals_sign(parser):
    """``--option=value`` syntax works for both spellings."""
    argv = ["--image_input_type=pixel_values", "--model-name=facebook/opt-125m"]
    parsed = parser.parse_args(argv)
    assert (parsed.image_input_type, parsed.model_name) == (
        "pixel_values",
        "facebook/opt-125m",
    )
202
203
204


def test_with_int_value(parser):
    """An ``int``-typed option is converted for either spelling of its name."""
    for flag in ("--batch_size", "--batch-size"):
        parsed = parser.parse_args([flag, "32"])
        assert parsed.batch_size == 32


def test_with_bool_flag(parser):
    """A ``store_true`` flag is set for either spelling of its name."""
    for flag in ("--enable_feature", "--enable-feature"):
        parsed = parser.parse_args([flag])
        assert parsed.enable_feature is True


def test_invalid_choice(parser):
    """A value outside ``choices`` makes the parser exit."""
    bad_argv = ["--image_input_type", "invalid_choice"]
    with pytest.raises(SystemExit):
        parser.parse_args(bad_argv)
221
222
223


def test_missing_required_argument(parser):
    """Omitting a ``required=True`` option makes the parser exit."""
    parser.add_argument("--required-arg", required=True)
    empty_argv = []
    with pytest.raises(SystemExit):
        parser.parse_args(empty_argv)
227
228


229
def test_cli_override_to_config(parser_with_config, cli_config_file):
    """Explicit CLI values are kept wherever ``--config`` appears on the line.

    The value ``12312`` for ``port`` is expected when no ``--port`` is given;
    presumably it comes from the ``cli_config_file`` fixture.
    """
    command = ["serve", "mymodel"]
    config_opt = ["--config", cli_config_file]
    tp_opt = ["--tensor-parallel-size", "3"]

    # --config before the explicit option
    parsed = parser_with_config.parse_args(command + config_opt + tp_opt)
    assert parsed.tensor_parallel_size == 3

    # --config after the explicit option
    parsed = parser_with_config.parse_args(command + tp_opt + config_opt)
    assert parsed.tensor_parallel_size == 3
    assert parsed.port == 12312

    # explicit options on both sides of --config
    parsed = parser_with_config.parse_args(
        command + tp_opt + config_opt + ["--port", "666"]
    )
    assert parsed.tensor_parallel_size == 3
    assert parsed.port == 666
253
254


255
def test_config_args(parser_with_config, cli_config_file):
    """Values are picked up when only ``--config`` is given on the command line."""
    argv = ["serve", "mymodel", "--config", cli_config_file]
    parsed = parser_with_config.parse_args(argv)
    assert parsed.tensor_parallel_size == 2
    assert parsed.trust_remote_code
261
262
263
264


def test_config_file(parser_with_config):
    """Unusable ``--config`` values are rejected with the expected exception."""
    command = ["serve", "mymodel"]

    # A YAML path that does not exist.
    with pytest.raises(FileNotFoundError):
        parser_with_config.parse_args(command + ["--config", "test_config.yml"])

    # A path with a ``.json`` suffix.
    json_path_argv = command + ["--config", "./data/test_config.json"]
    with pytest.raises(ValueError):
        parser_with_config.parse_args(json_path_argv)

    # ``--config`` immediately followed by another option instead of a path.
    no_path_argv = command + [
        "--tensor-parallel-size",
        "3",
        "--config",
        "--batch-size",
        "32",
    ]
    with pytest.raises(ValueError):
        parser_with_config.parse_args(no_path_argv)
286
287


288
def test_no_model_tag(parser_with_config, cli_config_file):
    """``serve`` without a model tag raises ``ValueError`` for this config file."""
    argv = ["serve", "--config", cli_config_file]
    with pytest.raises(ValueError):
        parser_with_config.parse_args(argv)
291
292


293
294
295
296
297
def test_dict_args(parser):
    """Dotted ``--opt.key value`` arguments are assembled into nested dicts."""
    # One logical argument per line; the overall order is significant.
    argv = [
        "--model-name=something.something",
        *("--hf-overrides.key1", "val1"),
        # Test nesting
        *("--hf-overrides.key2.key3", "val2"),
        *("--hf-overrides.key2.key4", "val3"),
        # Test compile config and compilation level
        "-O.use_inductor=true",
        *("-O.backend", "custom"),
        "-O1",
        # Test = sign
        "--hf-overrides.key5=val4",
        # Test underscore to dash conversion
        *("--hf_overrides.key_6", "val5"),
        *("--hf_overrides.key-7.key_8", "val6"),
        # Test data type detection
        *("--hf_overrides.key9", "100"),
        *("--hf_overrides.key10", "100.0"),
        *("--hf_overrides.key11", "true"),
        *("--hf_overrides.key12.key13", "null"),
        # Test '-' and '.' in value
        *("--hf_overrides.key14.key15", "-minus.and.dot"),
        # Test array values
        *("-O.custom_ops+", "-quant_fp8"),
        "-O.custom_ops+=+silu_mul,-rms_norm",
    ]

    expected_hf_overrides = {
        "key1": "val1",
        "key2": {"key3": "val2", "key4": "val3"},
        "key5": "val4",
        "key_6": "val5",
        "key-7": {"key_8": "val6"},
        "key9": 100,
        "key10": 100.0,
        "key11": True,
        "key12": {"key13": None},
        "key14": {"key15": "-minus.and.dot"},
    }
    expected_compilation_config = {
        "level": 1,
        "use_inductor": True,
        "backend": "custom",
        "custom_ops": ["-quant_fp8", "+silu_mul", "-rms_norm"],
    }

    parsed = parser.parse_args(argv)
    assert parsed.model_name == "something.something"
    assert parsed.hf_overrides == expected_hf_overrides
    assert parsed.compilation_config == expected_compilation_config


def test_duplicate_dict_args(caplog_vllm, parser):
    """Repeated dict keys keep the last value and produce one log record."""
    argv = ["--model-name=something.something"]
    argv += ["--hf-overrides.key1", "val1"]
    argv += ["--hf-overrides.key1", "val2"]
    argv += ["-O1", "-O.level", "2", "-O3"]

    parsed = parser.parse_args(argv)

    # Should be the last value
    assert parsed.hf_overrides == {"key1": "val2"}
    assert parsed.compilation_config == {"level": 3}

    # A single record is expected to mention both duplicated keys.
    assert len(caplog_vllm.records) == 1
    for fragment in ("duplicate", "--hf-overrides.key1", "-O.level"):
        assert fragment in caplog_vllm.text
385
386


387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
@pytest.mark.parametrize(
    "callable,kw_name,requires_kw_only,allow_var_kwargs,is_supported",
    [
        # Tests for positional argument support
        (lambda foo: None, "foo", True, True, False),
        (lambda foo: None, "foo", False, True, True),
        # Tests for positional or keyword / keyword only
        (lambda foo=100: None, "foo", True, True, False),
        (lambda *, foo: None, "foo", False, True, True),
        # Tests to make sure the names of variadic params are NOT supported
        (lambda *args: None, "args", False, True, False),
        (lambda **kwargs: None, "kwargs", False, True, False),
        # Tests for if we allow var kwargs to add support
        (lambda foo: None, "something_else", False, True, False),
        (lambda foo, **kwargs: None, "something_else", False, True, True),
        (lambda foo, **kwargs: None, "kwargs", True, True, False),
        (lambda foo, **kwargs: None, "foo", True, True, False),
    ],
)
def test_supports_kw(
    callable, kw_name, requires_kw_only, allow_var_kwargs, is_supported
):
    """``supports_kw`` returns the expected verdict for each signature shape."""
    verdict = supports_kw(
        callable=callable,
        kw_name=kw_name,
        requires_kw_only=requires_kw_only,
        allow_var_kwargs=allow_var_kwargs,
    )
    assert verdict == is_supported
418
419


420
@create_new_process_for_each_test()
def test_memory_profiling():
    """Check ``memory_profiling`` against hand-made torch and non-torch usage.

    Statement order matters: allocations made before ``MemorySnapshot()`` are
    part of the baseline, the ones made inside the ``with`` block are what the
    profiler is expected to report.
    """
    # Fake out some model loading + inference memory usage to test profiling
    # Memory used by other processes will show up as cuda usage outside of torch
    from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary

    mib = 1024 * 1024

    cudart = CudaRTLibrary()
    # 512 MiB allocation outside of this instance
    outside_handle = cudart.cudaMalloc(512 * mib)

    baseline_snapshot = MemorySnapshot()

    # load weights
    weights = torch.randn(128, 1024, 1024, device="cuda", dtype=torch.float32)
    weights_memory = 128 * 1024 * 1024 * 4  # 512 MiB

    def measure_current_non_torch():
        # Device memory in use minus what torch has reserved.
        free, total = torch.cuda.mem_get_info()
        return (total - free) - torch.cuda.memory_reserved()

    with (
        memory_profiling(
            baseline_snapshot=baseline_snapshot, weights_memory=weights_memory
        ) as result,
        monitor(measure_current_non_torch) as monitored_values,
    ):
        # make a memory spike, 1 GiB
        spike = torch.randn(256, 1024, 1024, device="cuda", dtype=torch.float32)
        del spike

        # Add some extra non-torch memory 256 MiB (simulate NCCL)
        extra_handle = cudart.cudaMalloc(256 * mib)

    # this is an analytic value, it is exact,
    # we only have 256 MiB non-torch memory increase
    samples = monitored_values.values
    assert samples[-1] - samples[0] == 256 * mib

    # Check that the memory usage is within 5% of the expected values
    # 5% tolerance is caused by cuda runtime.
    # we cannot control cuda runtime in the granularity of bytes,
    # which causes a small error (<10 MiB in practice)
    non_torch_ratio = result.non_torch_increase / (256 * mib)  # noqa
    assert abs(non_torch_ratio - 1) <= 0.05
    assert result.torch_peak_increase == 1024 * mib

    del weights
    cudart.cudaFree(outside_handle)
    cudart.cudaFree(extra_handle)
473
474


475
476
477
478
def test_bind_kv_cache():
    """Each of four layers is bound to the cache tensor at its own index."""
    from vllm.attention import Attention

    num_layers = 4
    ctx = {
        f"layers.{i}.self_attn": Attention(32, 128, 0.1) for i in range(num_layers)
    }
    kv_cache = [torch.zeros((1,)) for _ in range(num_layers)]

    bind_kv_cache(ctx, [kv_cache])

    for i in range(num_layers):
        assert ctx[f"layers.{i}.self_attn"].kv_cache[0] is kv_cache[i]

496

497
498
499
500
def test_bind_kv_cache_kv_sharing():
    """Layers listed as sharing a cache are bound to their target layer's tensor."""
    from vllm.attention import Attention

    num_layers = 4
    ctx = {
        f"layers.{i}.self_attn": Attention(32, 128, 0.1) for i in range(num_layers)
    }
    kv_cache = [torch.zeros((1,)) for _ in range(num_layers)]
    # sharing layer -> layer whose cache it reuses
    shared_kv_cache_layers = {
        "layers.2.self_attn": "layers.1.self_attn",
        "layers.3.self_attn": "layers.0.self_attn",
    }

    bind_kv_cache(ctx, [kv_cache], shared_kv_cache_layers)

    # Index into kv_cache expected for layers 0..3.
    expected_cache_index = (0, 1, 1, 0)
    for layer, cache_idx in enumerate(expected_cache_index):
        assert ctx[f"layers.{layer}.self_attn"].kv_cache[0] is kv_cache[cache_idx]

522

523
524
525
526
527
def test_bind_kv_cache_non_attention():
    """Non-contiguous layer numbers are bound to the cache tensors in order."""
    from vllm.attention import Attention

    # example from Jamba PP=2
    layer_names = ("model.layers.20.attn", "model.layers.28.attn")
    ctx = {name: Attention(32, 128, 0.1) for name in layer_names}
    kv_cache = [torch.zeros((1,)) for _ in layer_names]

    bind_kv_cache(ctx, [kv_cache])

    for name, tensor in zip(layer_names, kv_cache):
        assert ctx[name].kv_cache[0] is tensor
538
539
540


def test_bind_kv_cache_pp():
    """With ``pipeline_parallel_size=2`` a layer gets one cache entry per list."""
    with patch("vllm.utils.cuda_device_count_stateless", lambda: 2):
        # this test runs with 1 GPU, but we simulate 2 GPUs
        parallel_config = ParallelConfig(pipeline_parallel_size=2)
        cfg = VllmConfig(parallel_config=parallel_config)

    with set_current_vllm_config(cfg):
        from vllm.attention import Attention

        layer_name = "layers.0.self_attn"
        ctx = {layer_name: Attention(32, 128, 0.1)}
        kv_cache = [[torch.zeros((1,))], [torch.zeros((1,))]]

        bind_kv_cache(ctx, kv_cache)

        bound = ctx[layer_name].kv_cache
        assert bound[0] is kv_cache[0][0]
        assert bound[1] is kv_cache[1][0]
554
555


556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
@pytest.mark.parametrize(
    ("src_dtype", "tgt_dtype", "expected_result"),
    [
        # Different precision_levels
        (torch.bool, torch.int8, True),
        (torch.bool, torch.float16, True),
        (torch.bool, torch.complex32, True),
        (torch.int64, torch.bool, False),
        (torch.int64, torch.float16, True),
        (torch.int64, torch.complex32, True),
        (torch.float64, torch.bool, False),
        (torch.float64, torch.int8, False),
        (torch.float64, torch.complex32, True),
        (torch.complex128, torch.bool, False),
        (torch.complex128, torch.int8, False),
        (torch.complex128, torch.float16, False),
        # precision_level=0
        (torch.bool, torch.bool, True),
        # precision_level=1
        (torch.int8, torch.int16, True),
        (torch.int16, torch.int8, False),
        (torch.uint8, torch.int8, False),
        (torch.int8, torch.uint8, False),
        # precision_level=2
        (torch.float16, torch.float32, True),
        (torch.float32, torch.float16, False),
        (torch.bfloat16, torch.float32, True),
        (torch.float32, torch.bfloat16, False),
        # precision_level=3
        (torch.complex32, torch.complex64, True),
        (torch.complex64, torch.complex32, False),
    ],
)
def test_is_lossless_cast(src_dtype, tgt_dtype, expected_result):
    """``is_lossless_cast`` gives the tabulated answer for each dtype pair."""
    verdict = is_lossless_cast(src_dtype, tgt_dtype)
    assert verdict == expected_result


@pytest.mark.parametrize(
    ("dtypes", "expected_result"),
    [
        ([torch.bool], torch.bool),
        ([torch.bool, torch.int8], torch.int8),
        ([torch.bool, torch.int8, torch.float16], torch.float16),
        ([torch.bool, torch.int8, torch.float16, torch.complex32], torch.complex32),  # noqa: E501
    ],
)
def test_common_broadcastable_dtype(dtypes, expected_result):
    """``common_broadcastable_dtype`` returns the tabulated dtype for each list."""
    chosen = common_broadcastable_dtype(dtypes)
    assert chosen == expected_result


606
607
608
609
def test_placeholder_module_error_handling():
    """Using a placeholder (or an attribute of it) raises ``ModuleNotFoundError``."""

    def expect_missing_module():
        return pytest.raises(ModuleNotFoundError, match="No module named")

    def check_unusable(obj, clashing_attr):
        # Conversion, call and attribute access must each raise.
        with expect_missing_module():
            int(obj)

        with expect_missing_module():
            obj()

        with expect_missing_module():
            _ = obj.some_attr

        with expect_missing_module():
            # Name chosen to conflict with an internal attribute of the object
            _ = getattr(obj, clashing_attr)

    placeholder = PlaceholderModule("placeholder_1234")
    # Test conflict with internal __name attribute
    check_unusable(placeholder, "name")

    # OK to print the placeholder or use it in a f-string
    _ = repr(placeholder)
    _ = str(placeholder)

    # No error yet; only error when it is used downstream
    placeholder_attr = placeholder.placeholder_attr("attr")
    # Test conflict with internal __module attribute
    check_unusable(placeholder_attr, "module")
644
645
646
647
648
649
650
651
652
653
654


@pytest.mark.parametrize(
    "obj,key1,key2",
    [
        # Tests for both keys exist
        ({1: "a", 2: "b"}, 1, 2),
        # Tests for one key does not exist
        ({1: "a", 2: "b"}, 1, 3),
        # Tests for both keys do not exist
        ({1: "a", 2: "b"}, 3, 4),
    ],
)
def test_swap_dict_values(obj, key1, key2):
    """After the swap each key holds the other's old value, or is absent."""
    before = obj.copy()
    swap_dict_values(obj, key1, key2)

    # Check both directions: the value that lived at `source` must now be
    # at `target`; if `source` had no value, `target` must be missing.
    for source, target in ((key1, key2), (key2, key1)):
        if source in before:
            assert obj[target] == before[source]
        else:
            assert target not in obj
668

669

670
671
672
def test_model_specification(
    parser_with_config, cli_config_file, cli_config_file_with_model
):
    """Cover the ways a model can be named: positional, config file, ``--model``."""
    parse = parser_with_config.parse_args
    config_with_model = ["--config", cli_config_file_with_model]

    # Test model in CLI takes precedence over config
    parsed = parse(["serve", "cli-model"] + config_with_model)
    assert parsed.model_tag == "cli-model"
    assert parsed.served_model_name == "mymodel"

    # Test model from config file works
    parsed = parse(["serve"] + config_with_model)
    assert parsed.model == "config-model"
    assert parsed.served_model_name == "mymodel"

    # Test no model specified anywhere raises error
    with pytest.raises(ValueError, match="No model specified!"):
        parse(["serve", "--config", cli_config_file])

    # Test using --model option raises error
    # with pytest.raises(
    #         ValueError,
    #         match=
    #     ("With `vllm serve`, you should provide the model as a positional "
    #      "argument or in a config file instead of via the `--model` option."),
    # ):
    #     parser_with_config.parse_args(['serve', '--model', 'my-model'])

    # Test using --model option back-compatibility
    # (when back-compatibility ends, the above test should be uncommented
    # and the below test should be removed)
    space_separated = [
        "serve",
        *("--tensor-parallel-size", "2"),
        *("--model", "my-model"),
        "--trust-remote-code",
        *("--port", "8001"),
    ]
    equals_separated = [
        "serve",
        "--tensor-parallel-size=2",
        "--model=my-model",
        "--trust-remote-code",
        "--port=8001",
    ]
    for argv in (space_separated, equals_separated):
        parsed = parse(argv)
        assert parsed.model is None
        assert parsed.tensor_parallel_size == 2
        assert parsed.trust_remote_code is True
        assert parsed.port == 8001

    # Test other config values are preserved
    parsed = parse(["serve", "cli-model"] + config_with_model)
    assert parsed.tensor_parallel_size == 2
    assert parsed.trust_remote_code is True
    assert parsed.port == 12312


752
@pytest.mark.parametrize("input", [(), ("abc",), (None,), (None, bool, [1, 2, 3])])
def test_sha256(input: tuple):
    """``sha256`` matches ``hashlib.sha256`` over the pickled input."""
    digest = sha256(input)
    assert isinstance(digest, bytes)
    assert digest != b""
    assert digest is not None

    # Reference value: SHA-256 of the pickle at the highest protocol.
    pickled = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL)
    reference = hashlib.sha256(pickled).digest()
    assert digest == reference

    # hashing again, returns the same value
    assert digest == sha256(input)

    # hashing different input, returns different value
    extended = input + (1,)
    assert digest != sha256(extended)
767
768
769
770
771
772
773
774
775


@pytest.mark.parametrize(
    "path,expected",
    [
        ("ipc://some_path", ("ipc", "some_path", "")),
        ("tcp://127.0.0.1:5555", ("tcp", "127.0.0.1", "5555")),
        ("tcp://[::1]:5555", ("tcp", "::1", "5555")),  # IPv6 address
        ("inproc://some_identifier", ("inproc", "some_identifier", "")),
    ],
)
def test_split_zmq_path(path, expected):
    """``split_zmq_path`` returns the expected 3-tuple for each valid path."""
    parts = split_zmq_path(path)
    assert parts == expected


@pytest.mark.parametrize(
    "invalid_path",
    [
        "invalid_path",  # Missing scheme
        "tcp://127.0.0.1",  # Missing port
        "tcp://[::1]",  # Missing port for IPv6
        "tcp://:5555",  # Missing host
    ],
)
def test_split_zmq_path_invalid(invalid_path):
    """Each malformed path makes ``split_zmq_path`` raise ``ValueError``."""
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        split_zmq_path(invalid_path)


def test_make_zmq_socket_ipv6():
    """A socket made for an IPv6 TCP path has the ``zmq.IPV6`` option enabled.

    Skipped when the host cannot create an ``AF_INET6`` socket.
    """
    # Check if IPv6 is supported by trying to create an IPv6 socket
    try:
        with socket.socket(socket.AF_INET6, socket.SOCK_STREAM):
            pass
    except OSError:  # socket.error is a deprecated alias of OSError
        pytest.skip("IPv6 is not supported on this system")

    ctx = zmq.Context()
    ipv6_path = "tcp://[::]:5555"  # IPv6 unspecified (wildcard) address
    socket_type = zmq.REP  # Example socket type

    # Create the socket
    zsock: zmq.Socket = make_zmq_socket(ctx, ipv6_path, socket_type)

    try:
        # Verify that the IPV6 option is set
        assert zsock.getsockopt(zmq.IPV6) == 1, (
            "IPV6 option should be enabled for IPv6 addresses"
        )
    finally:
        # Clean up even when the assertion fails, so the socket and the
        # context are not left open for the rest of the test session.
        zsock.close()
        ctx.term()
819
820
821
822
823


def test_make_zmq_path():
    """IPv4 hosts are used as-is; IPv6 hosts are wrapped in brackets."""
    cases = (
        (("tcp", "127.0.0.1", "5555"), "tcp://127.0.0.1:5555"),
        (("tcp", "::1", "5555"), "tcp://[::1]:5555"),
    )
    for call_args, expected in cases:
        assert make_zmq_path(*call_args) == expected
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864


def test_get_tcp_uri():
    """``get_tcp_uri`` brackets an IPv6 host and leaves an IPv4 host alone."""
    cases = (
        ("127.0.0.1", "tcp://127.0.0.1:5555"),
        ("::1", "tcp://[::1]:5555"),
    )
    for host, expected in cases:
        assert get_tcp_uri(host, 5555) == expected


def test_split_host_port():
    """``split_host_port`` parses valid ``host:port`` strings and rejects bad ones.

    The calls inside ``pytest.raises`` are made bare: the call is expected to
    raise, so wrapping it in ``assert`` would never evaluate and only
    suggests a return-value check that does not exist.
    """
    # valid ipv4
    assert split_host_port("127.0.0.1:5555") == ("127.0.0.1", 5555)
    # invalid ipv4
    with pytest.raises(ValueError):
        # multi colon
        split_host_port("127.0.0.1::5555")
    with pytest.raises(ValueError):
        # trailing colon
        split_host_port("127.0.0.1:5555:")
    with pytest.raises(ValueError):
        # no colon
        split_host_port("127.0.0.15555")
    with pytest.raises(ValueError):
        # non-integer port
        split_host_port("127.0.0.1:5555a")

    # valid ipv6
    assert split_host_port("[::1]:5555") == ("::1", 5555)
    # invalid ipv6
    with pytest.raises(ValueError):
        # multi colon
        split_host_port("[::1]::5555")
    with pytest.raises(IndexError):
        # no colon
        split_host_port("[::1]5555")
    with pytest.raises(ValueError):
        # non-integer port
        split_host_port("[::1]:5555a")


def test_join_host_port():
    """``join_host_port`` brackets an IPv6 host and leaves an IPv4 host alone."""
    cases = (
        ("127.0.0.1", "127.0.0.1:5555"),
        ("::1", "[::1]:5555"),
    )
    for host, expected in cases:
        assert join_host_port(host, 5555) == expected
865
866


867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
def test_json_count_leaves():
    """Test json_count_leaves function from jsontree utility."""
    from vllm.utils.jsontree import json_count_leaves

    cases = [
        # Single leaf values
        (42, 1),
        ("hello", 1),
        (None, 1),
        # Empty containers
        ([], 0),
        ({}, 0),
        ((), 0),
        # Flat structures
        ([1, 2, 3], 3),
        ({"a": 1, "b": 2}, 2),
        ((1, 2, 3), 3),
        # Nested structures
        ({"a": 1, "b": {"c": 2, "d": 3}}, 3),
        ([1, [2, 3], 4], 4),
        ({"list": [1, 2], "dict": {"x": 3}, "value": 4}, 4),
    ]
    for tree, leaf_count in cases:
        assert json_count_leaves(tree) == leaf_count


897
898
899
900
def test_convert_ids_list_to_tokens():
    """``convert_ids_list_to_tokens`` returns readable tokens (space, not ``Ġ``)."""
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
    # token_ids = [9707, 11, 1879, 0]
    token_ids = tokenizer.encode("Hello, world!")

    # The tokenizer's own conversion keeps the raw vocabulary form.
    raw_tokens = tokenizer.convert_ids_to_tokens(token_ids)
    assert raw_tokens == ["Hello", ",", "Ġworld", "!"]

    readable_tokens = convert_ids_list_to_tokens(tokenizer, token_ids)
    assert readable_tokens == ["Hello", ",", " world", "!"]
904
905
906
907


def test_current_stream_multithread():
    """A stream entered in a child thread must not change the main thread's stream.

    A child thread enters ``torch.cuda.stream(child_stream)`` and parks there
    while the main thread checks that ``current_stream()`` still returns the
    stream that was current before the thread started. Skipped without CUDA.
    """
    import threading

    if not torch.cuda.is_available():
        pytest.skip("CUDA not available")

    main_default_stream = torch.cuda.current_stream()
    child_stream = torch.cuda.Stream()

    thread_stream_ready = threading.Event()
    thread_can_exit = threading.Event()

    def child_thread_func():
        with torch.cuda.stream(child_stream):
            thread_stream_ready.set()
            thread_can_exit.wait(timeout=10)

    child_thread = threading.Thread(target=child_thread_func)
    child_thread.start()

    try:
        assert thread_stream_ready.wait(timeout=5), (
            "Child thread failed to enter stream context in time"
        )

        main_current_stream = current_stream()

        assert main_current_stream != child_stream, (
            "Main thread's current_stream was contaminated by child thread"
        )
        assert main_current_stream == main_default_stream, (
            "Main thread's current_stream is not the default stream"
        )
    finally:
        # Release the child on every path. If this only happened after the
        # assertions passed, a failing assertion would leave the child waiting
        # for its 10 s timeout, the 5 s join below would time out, and the
        # pytest.fail here would replace the real assertion error.
        thread_can_exit.set()

        # Ensure child thread exits properly
        child_thread.join(timeout=5)
        if child_thread.is_alive():
            pytest.fail("Child thread failed to exit properly")
948
949
950
951
952
953
954
955


def test_load_config_file(tmp_path):
    """``load_config_file`` turns a YAML mapping into a flat CLI argument list."""
    # Configuration to serialise, and the argument list it should produce.
    config_data = {
        "enable-logging": True,
        "list-arg": ["item1", "item2"],
        "port": 12323,
        "tensor-parallel-size": 4,
    }
    expected_args = ["--enable-logging"]
    expected_args += ["--list-arg", "item1", "item2"]
    expected_args += ["--port", "12323"]
    expected_args += ["--tensor-parallel-size", "4"]

    # Write the configuration data to a temporary YAML file
    config_file_path = tmp_path / "config.yaml"
    config_file_path.write_text(yaml.dump(config_data))

    # Load it back through a fresh parser
    processed_args = FlexibleArgumentParser().load_config_file(
        str(config_file_path)
    )

    # Assert that the processed arguments match the expected output
    assert processed_args == expected_args
    os.remove(str(config_file_path))
985
986
987
988
989
990
991
992
993
994
995
996


def test_unique_filepath():
    """Ten calls to ``unique_filepath`` yield ten distinct paths.

    Each returned path is written before the next call, so the next call sees
    it on disk. The directory is a ``TemporaryDirectory``, so it and the ten
    files are removed when the test ends; ``mkdtemp`` would leave them behind.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        root = Path(temp_dir)

        def path_fn(i):
            return root / f"file_{i}.txt"

        paths = set()
        for _ in range(10):
            path = unique_filepath(path_fn)
            path.write_text("test")
            paths.add(path)

        assert len(paths) == 10
        assert len(list(root.glob("*.txt"))) == 10