"vscode:/vscode.git/clone" did not exist on "52eadcec9ea6f59433bfcba2f0d065b3ce5548b4"
test_utils.py 24 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
# ruff: noqa
4

5
import hashlib
6
import json
7
import os
8
import pickle
9
import socket
10
11
import tempfile
from pathlib import Path
12
from unittest.mock import patch
13

14
import pytest
15
import torch
16
import yaml
17
import zmq
18
from transformers import AutoTokenizer
19
from vllm_test_utils.monitor import monitor
20

21
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
22
from vllm.transformers_utils.detokenizer_utils import convert_ids_list_to_tokens
23

24
from vllm.utils import (
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
    FlexibleArgumentParser,
    MemorySnapshot,
    bind_kv_cache,
    common_broadcastable_dtype,
    current_stream,
    get_open_port,
    get_tcp_uri,
    is_lossless_cast,
    join_host_port,
    make_zmq_path,
    make_zmq_socket,
    memory_profiling,
    sha256,
    split_host_port,
    split_zmq_path,
    unique_filepath,
)

43
from ..utils import create_new_process_for_each_test, flat_product
44

45

46
47
48
49
50
51
52
53
54
55
def test_get_open_port(monkeypatch: pytest.MonkeyPatch):
    """Bind three sockets to ports from `get_open_port` while VLLM_PORT is set."""
    with monkeypatch.context() as patched_env:
        patched_env.setenv("VLLM_PORT", "5678")
        with (
            socket.socket(socket.AF_INET, socket.SOCK_STREAM) as first,
            socket.socket(socket.AF_INET, socket.SOCK_STREAM) as second,
            socket.socket(socket.AF_INET, socket.SOCK_STREAM) as third,
        ):
            # Earlier sockets stay bound while the next port is requested, so
            # we can only succeed if multiple ports are handed out even though
            # the env var is set.
            for sock in (first, second, third):
                sock.bind(("localhost", get_open_port()))
56
57
58
59
60
61


# Tests for FlexibleArgumentParser
@pytest.fixture
def parser():
    """Build a `FlexibleArgumentParser` carrying the options the tests below use."""
    arg_parser = FlexibleArgumentParser()
    # (flags, keyword arguments) for every option, registered in this order.
    option_specs = (
        (("--image-input-type",), {"choices": ["pixel_values", "image_features"]}),
        (("--model-name",), {}),
        (("--batch-size",), {"type": int}),
        (("--enable-feature",), {"action": "store_true"}),
        (("--hf-overrides",), {"type": json.loads}),
        (("-O", "--compilation-config"), {"type": json.loads}),
    )
    for flags, options in option_specs:
        arg_parser.add_argument(*flags, **options)
    return arg_parser


73
74
75
@pytest.fixture
def parser_with_config():
    """Build a `FlexibleArgumentParser` with `serve`-style positionals and options."""
    arg_parser = FlexibleArgumentParser()
    # (flags, keyword arguments) for every argument, registered in this order.
    argument_specs = (
        (("serve",), {}),
        (("model_tag",), {"nargs": "?"}),
        (("--model",), {"type": str}),
        (("--served-model-name",), {"type": str}),
        (("--config",), {"type": str}),
        (("--port",), {"type": int}),
        (("--tensor-parallel-size",), {"type": int}),
        (("--trust-remote-code",), {"action": "store_true"}),
    )
    for flags, options in argument_specs:
        arg_parser.add_argument(*flags, **options)
    return arg_parser


87
def test_underscore_to_dash(parser):
    """An option spelled with underscores is accepted for a dashed option."""
    argv = ["--image_input_type", "pixel_values"]
    parsed = parser.parse_args(argv)
    assert parsed.image_input_type == "pixel_values"
90
91
92


def test_mixed_usage(parser):
    """Underscore and dash spellings can be mixed in one command line."""
    argv = [
        "--image_input_type",
        "image_features",
        "--model-name",
        "facebook/opt-125m",
    ]
    parsed = parser.parse_args(argv)
    assert parsed.image_input_type == "image_features"
    assert parsed.model_name == "facebook/opt-125m"
98
99
100
101


def test_with_equals_sign(parser):
    """`--option=value` syntax is parsed for both underscore and dash spellings."""
    argv = ["--image_input_type=pixel_values", "--model-name=facebook/opt-125m"]
    parsed = parser.parse_args(argv)
    assert parsed.image_input_type == "pixel_values"
    assert parsed.model_name == "facebook/opt-125m"
106
107
108


def test_with_int_value(parser):
    """An `int`-typed option is converted for both spellings of the flag."""
    for flag in ("--batch_size", "--batch-size"):
        parsed = parser.parse_args([flag, "32"])
        assert parsed.batch_size == 32


def test_with_bool_flag(parser):
    """A `store_true` flag is set for both spellings of the flag."""
    for flag in ("--enable_feature", "--enable-feature"):
        parsed = parser.parse_args([flag])
        assert parsed.enable_feature is True


def test_invalid_choice(parser):
    """A value outside the declared `choices` makes the parser exit."""
    argv = ["--image_input_type", "invalid_choice"]
    with pytest.raises(SystemExit):
        parser.parse_args(argv)
125
126
127


def test_missing_required_argument(parser):
    """Parsing an empty command line exits when a required option is registered."""
    parser.add_argument("--required-arg", required=True)
    empty_argv = []
    with pytest.raises(SystemExit):
        parser.parse_args(empty_argv)
131
132


133
def test_cli_override_to_config(parser_with_config, cli_config_file):
    """Command-line values win over the config file wherever `--config` appears."""
    positionals = ["serve", "mymodel"]
    config_option = ["--config", cli_config_file]
    tp_override = ["--tensor-parallel-size", "3"]

    # Override given after --config.
    parsed = parser_with_config.parse_args(positionals + config_option + tp_override)
    assert parsed.tensor_parallel_size == 3

    # Override given before --config; the port is not overridden here.
    parsed = parser_with_config.parse_args(positionals + tp_override + config_option)
    assert parsed.tensor_parallel_size == 3
    assert parsed.port == 12312

    # Overrides on both sides of --config.
    parsed = parser_with_config.parse_args(
        positionals + tp_override + config_option + ["--port", "666"]
    )
    assert parsed.tensor_parallel_size == 3
    assert parsed.port == 666
157
158


159
def test_config_args(parser_with_config, cli_config_file):
    """Values supplied only through `--config` show up on the parsed namespace."""
    argv = ["serve", "mymodel", "--config", cli_config_file]
    parsed = parser_with_config.parse_args(argv)
    assert parsed.tensor_parallel_size == 2
    assert parsed.trust_remote_code
165
166
167
168


def test_config_file(parser_with_config):
    """Bad `--config` values raise the exception type asserted for each case."""
    positionals = ["serve", "mymodel"]

    # (expected exception, command line) pairs, checked in order.
    failing_cases = (
        (FileNotFoundError, positionals + ["--config", "test_config.yml"]),
        (ValueError, positionals + ["--config", "./data/test_config.json"]),
        # --config is directly followed by another option instead of a path.
        (
            ValueError,
            positionals
            + ["--tensor-parallel-size", "3", "--config", "--batch-size", "32"],
        ),
    )
    for expected_error, argv in failing_cases:
        with pytest.raises(expected_error):
            parser_with_config.parse_args(argv)
190
191


192
def test_no_model_tag(parser_with_config, cli_config_file):
    """`serve` with this config file and no model tag raises `ValueError`."""
    argv = ["serve", "--config", cli_config_file]
    with pytest.raises(ValueError):
        parser_with_config.parse_args(argv)
195
196


197
198
199
200
201
def test_dict_args(parser):
    """Dotted `--option.key value` arguments are folded into nested dicts."""
    # fmt: off
    argv = [
        "--model-name=something.something",
        "--hf-overrides.key1", "val1",
        # Nesting
        "--hf-overrides.key2.key3", "val2",
        "--hf-overrides.key2.key4", "val3",
        # Compile config and compilation mode
        "-O.use_inductor=true",
        "-O.backend", "custom",
        "-O1",
        # '=' sign
        "--hf-overrides.key5=val4",
        # Underscore to dash conversion
        "--hf_overrides.key_6", "val5",
        "--hf_overrides.key-7.key_8", "val6",
        # Data type detection
        "--hf_overrides.key9", "100",
        "--hf_overrides.key10", "100.0",
        "--hf_overrides.key11", "true",
        "--hf_overrides.key12.key13", "null",
        # '-' and '.' inside a value
        "--hf_overrides.key14.key15", "-minus.and.dot",
        # Array values
        "-O.custom_ops+", "-quant_fp8",
        "-O.custom_ops+=+silu_mul,-rms_norm",
    ]
    # fmt: on

    expected_hf_overrides = {
        "key1": "val1",
        "key2": {"key3": "val2", "key4": "val3"},
        "key5": "val4",
        "key_6": "val5",
        "key-7": {"key_8": "val6"},
        "key9": 100,
        "key10": 100.0,
        "key11": True,
        "key12": {"key13": None},
        "key14": {"key15": "-minus.and.dot"},
    }
    expected_compilation_config = {
        "mode": 1,
        "use_inductor": True,
        "backend": "custom",
        "custom_ops": ["-quant_fp8", "+silu_mul", "-rms_norm"],
    }

    parsed = parser.parse_args(argv)
    assert parsed.model_name == "something.something"
    assert parsed.hf_overrides == expected_hf_overrides
    assert parsed.compilation_config == expected_compilation_config


def test_duplicate_dict_args(caplog_vllm, parser):
    """Repeated dict keys keep the last value and produce one log record."""
    # fmt: off
    argv = [
        "--model-name=something.something",
        "--hf-overrides.key1", "val1",
        "--hf-overrides.key1", "val2",
        "-O1",
        "-O.mode", "2",
        "-O3",
    ]
    # fmt: on

    parsed = parser.parse_args(argv)

    # The value given last is the one that is kept.
    assert parsed.hf_overrides == {"key1": "val2"}
    assert parsed.compilation_config == {"mode": 3}

    # Exactly one record is logged, and it names both duplicated keys.
    assert len(caplog_vllm.records) == 1
    for fragment in ("duplicate", "--hf-overrides.key1", "-O.mode"):
        assert fragment in caplog_vllm.text
289
290


291
@create_new_process_for_each_test()
def test_memory_profiling():
    """Fake model loading and inference memory usage and check `memory_profiling`.

    Memory allocated through `CudaRTLibrary` stands in for CUDA usage that
    happens outside of torch.
    """
    from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary

    cudart = CudaRTLibrary()
    # 512 MiB allocated outside of this instance, before the baseline is taken.
    external_handle = cudart.cudaMalloc(512 * 1024 * 1024)

    baseline_snapshot = MemorySnapshot()

    # "Load weights": 128 * 1024 * 1024 float32 elements, i.e. 512 MiB.
    weights = torch.randn(128, 1024, 1024, device="cuda", dtype=torch.float32)
    weights_memory = 128 * 1024 * 1024 * 4

    def measure_current_non_torch():
        # Device memory in use minus what torch has reserved.
        free_bytes, total_bytes = torch.cuda.mem_get_info()
        used_bytes = total_bytes - free_bytes
        return used_bytes - torch.cuda.memory_reserved()

    with (
        memory_profiling(
            baseline_snapshot=baseline_snapshot, weights_memory=weights_memory
        ) as result,
        monitor(measure_current_non_torch) as monitored_values,
    ):
        # Create a 1 GiB memory spike and release it again.
        spike = torch.randn(256, 1024, 1024, device="cuda", dtype=torch.float32)
        del spike

        # Extra 256 MiB of non-torch memory (simulates NCCL).
        nccl_like_handle = cudart.cudaMalloc(256 * 1024 * 1024)

    # This value is analytic and therefore exact: the only non-torch increase
    # inside the profiled region is the 256 MiB allocation.
    first_sample = monitored_values.values[0]
    last_sample = monitored_values.values[-1]
    assert last_sample - first_sample == 256 * 1024 * 1024

    # The profiler's own figure is only checked to within 5%: the CUDA runtime
    # cannot be controlled at byte granularity, which causes a small error
    # (<10 MiB in practice).
    non_torch_ratio = result.non_torch_increase / (256 * 1024 * 1024)
    assert abs(non_torch_ratio - 1) <= 0.05
    assert result.torch_peak_increase == 1024 * 1024 * 1024

    del weights
    cudart.cudaFree(external_handle)
    cudart.cudaFree(nccl_like_handle)
344
345


346
347
348
349
def test_bind_kv_cache():
    """Each attention layer's `kv_cache[0]` is the tensor at the same position."""
    from vllm.attention import Attention

    layer_names = [f"layers.{idx}.self_attn" for idx in range(4)]
    ctx = {name: Attention(32, 128, 0.1) for name in layer_names}
    kv_cache = [torch.zeros((1,)) for _ in layer_names]

    bind_kv_cache(ctx, [kv_cache])

    for name, tensor in zip(layer_names, kv_cache):
        assert ctx[name].kv_cache[0] is tensor

367

368
369
370
371
def test_bind_kv_cache_kv_sharing():
    """Layers listed in the sharing map get the cache tensor of their target."""
    from vllm.attention import Attention

    layer_names = [f"layers.{idx}.self_attn" for idx in range(4)]
    ctx = {name: Attention(32, 128, 0.1) for name in layer_names}
    kv_cache = [torch.zeros((1,)) for _ in layer_names]
    shared_kv_cache_layers = {
        "layers.2.self_attn": "layers.1.self_attn",
        "layers.3.self_attn": "layers.0.self_attn",
    }

    bind_kv_cache(ctx, [kv_cache], shared_kv_cache_layers)

    # Index into `kv_cache` that each layer is expected to end up with.
    expected_cache_index = {
        "layers.0.self_attn": 0,
        "layers.1.self_attn": 1,
        "layers.2.self_attn": 1,
        "layers.3.self_attn": 0,
    }
    for name, cache_idx in expected_cache_index.items():
        assert ctx[name].kv_cache[0] is kv_cache[cache_idx]

393

394
395
396
397
398
def test_bind_kv_cache_non_attention():
    """Non-contiguous layer indices are bound to cache tensors in order."""
    from vllm.attention import Attention

    # Layer names taken from the Jamba PP=2 example.
    layer_names = ["model.layers.20.attn", "model.layers.28.attn"]
    ctx = {name: Attention(32, 128, 0.1) for name in layer_names}
    kv_cache = [torch.zeros((1,)) for _ in layer_names]

    bind_kv_cache(ctx, [kv_cache])

    for name, tensor in zip(layer_names, kv_cache):
        assert ctx[name].kv_cache[0] is tensor
409
410
411


def test_bind_kv_cache_pp():
    """With two cache lists, the layer's `kv_cache[i]` is taken from list `i`."""
    # The test runs on 1 GPU; the patch makes the config see 2 GPUs.
    with patch("vllm.utils.cuda_device_count_stateless", lambda: 2):
        parallel_config = ParallelConfig(pipeline_parallel_size=2)
        cfg = VllmConfig(parallel_config=parallel_config)

    with set_current_vllm_config(cfg):
        from vllm.attention import Attention

        layer_name = "layers.0.self_attn"
        ctx = {layer_name: Attention(32, 128, 0.1)}
        kv_cache = [[torch.zeros((1,))], [torch.zeros((1,))]]

        bind_kv_cache(ctx, kv_cache)

        for engine_idx, engine_caches in enumerate(kv_cache):
            assert ctx[layer_name].kv_cache[engine_idx] is engine_caches[0]
425
426


427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
@pytest.mark.parametrize(
    "src_dtype,tgt_dtype,expected_result",
    [
        # Casts across different precision levels
        (torch.bool, torch.int8, True),
        (torch.bool, torch.float16, True),
        (torch.bool, torch.complex32, True),
        (torch.int64, torch.bool, False),
        (torch.int64, torch.float16, True),
        (torch.int64, torch.complex32, True),
        (torch.float64, torch.bool, False),
        (torch.float64, torch.int8, False),
        (torch.float64, torch.complex32, True),
        (torch.complex128, torch.bool, False),
        (torch.complex128, torch.int8, False),
        (torch.complex128, torch.float16, False),
        # Within precision_level=0
        (torch.bool, torch.bool, True),
        # Within precision_level=1
        (torch.int8, torch.int16, True),
        (torch.int16, torch.int8, False),
        (torch.uint8, torch.int8, False),
        (torch.int8, torch.uint8, False),
        # Within precision_level=2
        (torch.float16, torch.float32, True),
        (torch.float32, torch.float16, False),
        (torch.bfloat16, torch.float32, True),
        (torch.float32, torch.bfloat16, False),
        # Within precision_level=3
        (torch.complex32, torch.complex64, True),
        (torch.complex64, torch.complex32, False),
    ],
)
def test_is_lossless_cast(src_dtype, tgt_dtype, expected_result):
    """`is_lossless_cast` returns the tabulated verdict for each dtype pair."""
    verdict = is_lossless_cast(src_dtype, tgt_dtype)
    assert verdict == expected_result


@pytest.mark.parametrize(
    "dtypes,expected_result",
    [
        ([torch.bool], torch.bool),
        ([torch.bool, torch.int8], torch.int8),
        ([torch.bool, torch.int8, torch.float16], torch.float16),
        (
            [torch.bool, torch.int8, torch.float16, torch.complex32],
            torch.complex32,
        ),
    ],
)
def test_common_broadcastable_dtype(dtypes, expected_result):
    """`common_broadcastable_dtype` returns the tabulated dtype for each list."""
    common = common_broadcastable_dtype(dtypes)
    assert common == expected_result


477
478
479
def test_model_specification(
    parser_with_config, cli_config_file, cli_config_file_with_model
):
    """Exercise the ways a model can be given: positional, config file, `--model`."""
    config_with_model = ["--config", cli_config_file_with_model]

    # A model given on the command line takes precedence over the config file.
    parsed = parser_with_config.parse_args(["serve", "cli-model"] + config_with_model)
    assert parsed.model_tag == "cli-model"
    assert parsed.served_model_name == "mymodel"

    # A model given only in the config file is picked up.
    parsed = parser_with_config.parse_args(["serve"] + config_with_model)
    assert parsed.model == "config-model"
    assert parsed.served_model_name == "mymodel"

    # No model anywhere is an error.
    with pytest.raises(ValueError, match="No model specified!"):
        parser_with_config.parse_args(["serve", "--config", cli_config_file])

    # Test using --model option raises error
    # with pytest.raises(
    #         ValueError,
    #         match=
    #     ("With `vllm serve`, you should provide the model as a positional "
    #      "argument or in a config file instead of via the `--model` option."),
    # ):
    #     parser_with_config.parse_args(['serve', '--model', 'my-model'])

    # Back-compatibility for the --model option, in both the space-separated
    # and the '=' form. (When back-compatibility ends, the commented-out test
    # above should be enabled and this loop removed.)
    back_compat_command_lines = (
        [
            "serve",
            "--tensor-parallel-size",
            "2",
            "--model",
            "my-model",
            "--trust-remote-code",
            "--port",
            "8001",
        ],
        [
            "serve",
            "--tensor-parallel-size=2",
            "--model=my-model",
            "--trust-remote-code",
            "--port=8001",
        ],
    )
    for argv in back_compat_command_lines:
        parsed = parser_with_config.parse_args(argv)
        assert parsed.model is None
        assert parsed.tensor_parallel_size == 2
        assert parsed.trust_remote_code is True
        assert parsed.port == 8001

    # The other values from the config file are preserved.
    parsed = parser_with_config.parse_args(["serve", "cli-model"] + config_with_model)
    assert parsed.tensor_parallel_size == 2
    assert parsed.trust_remote_code is True
    assert parsed.port == 12312


559
@pytest.mark.parametrize("input", [(), ("abc",), (None,), (None, bool, [1, 2, 3])])
def test_sha256(input: tuple):
    """`sha256` matches hashlib's SHA-256 of the pickled input and is repeatable."""
    digest = sha256(input)

    # Basic shape of the result: a non-empty bytes object.
    assert digest is not None
    assert isinstance(digest, bytes)
    assert digest != b""

    # Reference value computed directly with pickle + hashlib.
    pickled = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL)
    reference = hashlib.sha256(pickled).digest()
    assert digest == reference

    # Hashing the same input again gives the same digest ...
    assert digest == sha256(input)

    # ... and a different input gives a different one.
    extended_input = input + (1,)
    assert digest != sha256(extended_input)
574
575
576
577
578
579
580
581
582


@pytest.mark.parametrize(
    ("path", "expected"),
    [
        ("ipc://some_path", ("ipc", "some_path", "")),
        ("tcp://127.0.0.1:5555", ("tcp", "127.0.0.1", "5555")),
        ("tcp://[::1]:5555", ("tcp", "::1", "5555")),  # IPv6 address
        ("inproc://some_identifier", ("inproc", "some_identifier", "")),
    ],
)
def test_split_zmq_path(path, expected):
    """`split_zmq_path` returns the tabulated 3-tuple for each valid path."""
    parts = split_zmq_path(path)
    assert parts == expected


@pytest.mark.parametrize(
    "invalid_path",
    [
        "invalid_path",  # no scheme
        "tcp://127.0.0.1",  # no port
        "tcp://[::1]",  # no port (IPv6)
        "tcp://:5555",  # no host
    ],
)
def test_split_zmq_path_invalid(invalid_path):
    """`split_zmq_path` raises `ValueError` for each malformed path."""
    expected_error = ValueError
    with pytest.raises(expected_error):
        split_zmq_path(invalid_path)


def test_make_zmq_socket_ipv6():
    """Check that `make_zmq_socket` enables the IPV6 option for an IPv6 path.

    The test is skipped when the host cannot create an `AF_INET6` socket.
    """
    # Probe for IPv6 support by creating (and immediately closing) a socket.
    try:
        socket.socket(socket.AF_INET6, socket.SOCK_STREAM).close()
    except OSError:
        pytest.skip("IPv6 is not supported on this system")

    ctx = zmq.Context()
    try:
        ipv6_path = "tcp://[::]:5555"  # IPv6 unspecified ("any") address
        socket_type = zmq.REP  # Example socket type

        # Create the socket
        zsock: zmq.Socket = make_zmq_socket(ctx, ipv6_path, socket_type)

        # Verify that the IPV6 option is set
        assert zsock.getsockopt(zmq.IPV6) == 1, (
            "IPV6 option should be enabled for IPv6 addresses"
        )
    finally:
        # Clean up even when socket creation or the assertion fails.
        # destroy() closes every socket still attached to the context before
        # terminating it, so a failure cannot leave the context blocked on an
        # open socket.
        ctx.destroy(linger=0)
626
627
628
629
630


def test_make_zmq_path():
    """`make_zmq_path` joins scheme, host and port; IPv6 hosts get brackets."""
    expected_by_host = {
        "127.0.0.1": "tcp://127.0.0.1:5555",
        "::1": "tcp://[::1]:5555",
    }
    for host, expected in expected_by_host.items():
        assert make_zmq_path("tcp", host, "5555") == expected
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671


def test_get_tcp_uri():
    """`get_tcp_uri` builds a `tcp://` URI; IPv6 hosts get brackets."""
    expected_by_host = {
        "127.0.0.1": "tcp://127.0.0.1:5555",
        "::1": "tcp://[::1]:5555",
    }
    for host, expected in expected_by_host.items():
        assert get_tcp_uri(host, 5555) == expected


def test_split_host_port():
    """Check `split_host_port` on valid and malformed IPv4 / IPv6 inputs.

    Valid inputs return a `(host, port)` tuple with an integer port; each
    malformed input is expected to raise the exception type listed below.
    """
    # valid ipv4
    assert split_host_port("127.0.0.1:5555") == ("127.0.0.1", 5555)
    # invalid ipv4
    with pytest.raises(ValueError):
        # multiple colons
        split_host_port("127.0.0.1::5555")
    with pytest.raises(ValueError):
        # trailing colon
        split_host_port("127.0.0.1:5555:")
    with pytest.raises(ValueError):
        # no colon
        split_host_port("127.0.0.15555")
    with pytest.raises(ValueError):
        # non-integer port
        split_host_port("127.0.0.1:5555a")

    # valid ipv6
    assert split_host_port("[::1]:5555") == ("::1", 5555)
    # invalid ipv6
    with pytest.raises(ValueError):
        # multiple colons
        split_host_port("[::1]::5555")
    with pytest.raises(IndexError):
        # no colon between the bracketed host and the port
        split_host_port("[::1]5555")
    with pytest.raises(ValueError):
        # non-integer port
        split_host_port("[::1]:5555a")


def test_join_host_port():
    """`join_host_port` joins host and port; IPv6 hosts get brackets."""
    expected_by_host = {
        "127.0.0.1": "127.0.0.1:5555",
        "::1": "[::1]:5555",
    }
    for host, expected in expected_by_host.items():
        assert join_host_port(host, 5555) == expected
672
673
674
675
676
677


def test_convert_ids_list_to_tokens():
    """Compare `convert_ids_list_to_tokens` with the tokenizer's raw tokens."""
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
    # token_ids = [9707, 11, 1879, 0]
    token_ids = tokenizer.encode("Hello, world!")

    # The tokenizer itself reports the raw token with the "Ġ" marker ...
    raw_tokens = tokenizer.convert_ids_to_tokens(token_ids)
    assert raw_tokens == ["Hello", ",", "Ġworld", "!"]

    # ... while the helper is expected to return a plain leading space.
    readable_tokens = convert_ids_list_to_tokens(tokenizer, token_ids)
    assert readable_tokens == ["Hello", ",", " world", "!"]
681
682
683
684


def test_current_stream_multithread():
    """`current_stream()` in the main thread ignores a stream set in another thread.

    A child thread enters a `torch.cuda.stream` context and stays there while
    the main thread checks its own current stream. Skipped without CUDA.
    """
    import threading

    if not torch.cuda.is_available():
        pytest.skip("CUDA not available")

    main_default_stream = torch.cuda.current_stream()
    child_stream = torch.cuda.Stream()

    # Handshake: the child signals once inside the stream context, then waits
    # for permission to leave it.
    child_in_context = threading.Event()
    child_may_leave = threading.Event()

    def hold_child_stream():
        with torch.cuda.stream(child_stream):
            child_in_context.set()
            child_may_leave.wait(timeout=10)

    worker = threading.Thread(target=hold_child_stream)
    worker.start()

    try:
        entered_in_time = child_in_context.wait(timeout=5)
        assert entered_in_time, "Child thread failed to enter stream context in time"

        main_current_stream = current_stream()

        assert main_current_stream != child_stream, (
            "Main thread's current_stream was contaminated by child thread"
        )
        assert main_current_stream == main_default_stream, (
            "Main thread's current_stream is not the default stream"
        )

        # Let the child thread leave its stream context.
        child_may_leave.set()

    finally:
        # Make sure the child thread is gone before the test returns.
        worker.join(timeout=5)
        if worker.is_alive():
            pytest.fail("Child thread failed to exit properly")
725
726
727
728
729
730
731
732


def test_load_config_file(tmp_path):
    """Check that `load_config_file` turns a YAML mapping into CLI-style tokens.

    Args:
        tmp_path: pytest's per-test temporary directory. The config file is
            written there and left for pytest to clean up, so nothing is
            skipped or leaked when the assertion fails.
    """
    # Define the configuration data
    config_data = {
        "enable-logging": True,
        "list-arg": ["item1", "item2"],
        "port": 12323,
        "tensor-parallel-size": 4,
    }

    # Write the configuration data to a temporary YAML file
    config_file_path = tmp_path / "config.yaml"
    with open(config_file_path, "w", encoding="utf-8") as config_file:
        yaml.dump(config_data, config_file)

    # Initialize the parser
    parser = FlexibleArgumentParser()

    # Call the function with the temporary file path
    processed_args = parser.load_config_file(str(config_file_path))

    # Expected output: the boolean becomes a bare flag, list items follow their
    # flag one token each, and scalars are rendered as strings.
    expected_args = [
        "--enable-logging",
        "--list-arg",
        "item1",
        "item2",
        "--port",
        "12323",
        "--tensor-parallel-size",
        "4",
    ]

    # Assert that the processed arguments match the expected output
    assert processed_args == expected_args
762
763
764
765
766
767
768
769
770
771
772
773


def test_unique_filepath():
    """Check that ten `unique_filepath` calls give ten distinct paths.

    Each returned path is written to before the next call is made. The files
    live in a `TemporaryDirectory`, which is removed when the test finishes,
    whether it passes or fails.
    """
    with tempfile.TemporaryDirectory() as temp_dir:

        def path_fn(i):
            return Path(temp_dir) / f"file_{i}.txt"

        paths = set()
        for _ in range(10):
            path = unique_filepath(path_fn)
            path.write_text("test")
            paths.add(path)

        assert len(paths) == 10
        assert len(list(Path(temp_dir).glob("*.txt"))) == 10
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795


def test_flat_product():
    """`flat_product` acts like a cartesian product and flattens tuple items."""
    # With scalar items the result matches regular itertools.product output.
    plain_rows = list(flat_product([1, 2, 3], ["a", "b"]))
    expected_plain = [
        (1, "a"),
        (1, "b"),
        (2, "a"),
        (2, "b"),
        (3, "a"),
        (3, "b"),
    ]
    assert plain_rows == expected_plain

    # Tuple items are spliced into the resulting row instead of being nested.
    flattened_rows = list(flat_product([(1, 2), (3, 4)], ["a", "b"], [(5, 6)]))
    expected_flattened = [
        (1, 2, "a", 5, 6),
        (1, 2, "b", 5, 6),
        (3, 4, "a", 5, 6),
        (3, 4, "b", 5, 6),
    ]
    assert flattened_rows == expected_flattened