"vscode:/vscode.git/clone" did not exist on "47de8821d3cdd32fce7df6312318223aee591fd2"
conftest.py 42 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4
5
6
7
8
9
10
11
12

# ruff: noqa

from tblib import pickling_support

# Install support for pickling exceptions so that we can nicely propagate
# failures from tests running in a subprocess.
# This should be run before any custom exception subclasses are defined.
pickling_support.install()

13
import http.server
14
import json
15
import math
16
import mimetypes
17
import os
18
import socket
19
import tempfile
20
21
import threading
from collections.abc import Generator
22
from enum import Enum
23
from typing import Any, Callable, Optional, TypedDict, TypeVar, Union, cast
Woosuk Kwon's avatar
Woosuk Kwon committed
24

25
import numpy as np
Woosuk Kwon's avatar
Woosuk Kwon committed
26
27
import pytest
import torch
28
import torch.nn as nn
29
import torch.nn.functional as F
30
from huggingface_hub import snapshot_download
31
from PIL import Image
32
33
from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
                          BatchEncoding, BatchFeature)
34
from transformers.models.auto.auto_factory import _BaseAutoModelClass
Woosuk Kwon's avatar
Woosuk Kwon committed
35

36
37
from tests.models.utils import (TokensTextLogprobs,
                                TokensTextLogprobsPromptLogprobs)
Woosuk Kwon's avatar
Woosuk Kwon committed
38
from vllm import LLM, SamplingParams
39
from vllm.assets.audio import AudioAsset
40
from vllm.assets.image import ImageAsset
41
from vllm.assets.video import VideoAsset
42
43
from vllm.config.model import (ConvertOption, RunnerOption,
                               _get_and_verify_dtype)
44
from vllm.connections import global_http_connection
45
from vllm.distributed import (cleanup_dist_env_and_memory,
46
47
                              init_distributed_environment,
                              initialize_model_parallel)
48
from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
49
                         to_enc_dec_tuple_list, zip_enc_dec_prompts)
50
from vllm.logger import init_logger
51
from vllm.multimodal.utils import fetch_image
52
from vllm.outputs import RequestOutput
53
from vllm.sampling_params import BeamSearchParams
54
from vllm.sequence import Logprob
55
from vllm.transformers_utils.utils import maybe_model_redirect
56

57
logger = init_logger(__name__)
Woosuk Kwon's avatar
Woosuk Kwon committed
58

59
60
61
_TEST_DIR = os.path.dirname(__file__)
_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
62
_SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")
63

Cyrus Leung's avatar
Cyrus Leung committed
64
_M = TypeVar("_M")
65

66
_PromptMultiModalInput = Union[list[_M], list[list[_M]]]
Cyrus Leung's avatar
Cyrus Leung committed
67
68

PromptImageInput = _PromptMultiModalInput[Image.Image]
69
PromptAudioInput = _PromptMultiModalInput[tuple[np.ndarray, int]]
Cyrus Leung's avatar
Cyrus Leung committed
70
PromptVideoInput = _PromptMultiModalInput[np.ndarray]
71

72

73
def _read_prompts(filename: str) -> list[str]:
    """Read a prompt file and return its lines.

    The file is decoded explicitly as UTF-8 so the result does not depend
    on the locale's default encoding of the machine running the tests.

    Args:
        filename: Path of the text file to read.

    Returns:
        The lines of the file in order, exactly as produced by
        ``readlines()`` (line terminators are kept).
    """
    with open(filename, encoding="utf-8") as f:
        return f.readlines()
Woosuk Kwon's avatar
Woosuk Kwon committed
77
78


79
class ImageAssetPrompts(TypedDict):
    """Prompt text for each test image, keyed by the image asset's name."""
    stop_sign: str
    cherry_blossom: str
82
83


84
class ImageTestAssets(list[ImageAsset]):
    """List holding the image assets used by the tests, in a fixed order."""

    def __init__(self) -> None:
        assets = [
            ImageAsset(name) for name in ("stop_sign", "cherry_blossom")
        ]
        super().__init__(assets)

    def prompts(self, prompts: ImageAssetPrompts) -> list[str]:
        """
        Return one prompt per test image.

        The prompts are looked up by asset name and returned in the same
        order in which the assets appear when iterating over this object.
        """
        ordered_names = ("stop_sign", "cherry_blossom")
        return [prompts[name] for name in ordered_names]
100
101


102
103
class VideoAssetPrompts(TypedDict):
    """Prompt text for each test video, keyed by the video asset's name."""
    baby_reading: str
104
105


106
class VideoTestAssets(list[VideoAsset]):
    """List holding the video assets used by the tests."""

    def __init__(self) -> None:
        assets = [VideoAsset("baby_reading")]
        super().__init__(assets)

    def prompts(self, prompts: VideoAssetPrompts) -> list[str]:
        """Return the prompt of each test video, in asset order."""
        baby_reading_prompt = prompts["baby_reading"]
        return [baby_reading_prompt]
115
116


117
class AudioAssetPrompts(TypedDict):
    """Prompt text for each test audio clip, keyed by the audio asset's name."""
    mary_had_lamb: str
    winning_call: str


122
class AudioTestAssets(list[AudioAsset]):
    """List holding the audio assets used by the tests, in a fixed order."""

    def __init__(self) -> None:
        assets = [
            AudioAsset(name) for name in ("mary_had_lamb", "winning_call")
        ]
        super().__init__(assets)

    def prompts(self, prompts: AudioAssetPrompts) -> list[str]:
        """Return the prompt of each test audio clip, in asset order."""
        ordered_names = ("mary_had_lamb", "winning_call")
        return [prompts[name] for name in ordered_names]

133

134
IMAGE_ASSETS = ImageTestAssets()
135
"""Singleton instance of {class}`ImageTestAssets`."""
136
VIDEO_ASSETS = VideoTestAssets()
137
"""Singleton instance of {class}`VideoTestAssets`."""
138
AUDIO_ASSETS = AudioTestAssets()
139
"""Singleton instance of {class}`AudioTestAssets`."""
140
141


142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
@pytest.fixture(scope="function", autouse=True)
def cleanup_VLLM_USE_V1(monkeypatch):
    """
    Make sure changes to "VLLM_USE_V1" do not leak from one test to the next.

    The V1 oracle sets "VLLM_USE_V1" during loading. This means
    that each invocation of a test changes the env variable.

    If we touch "VLLM_USE_V1" with monkeypatch, then any changes
    made during the test run by vLLM will be cleaned up.

    This fixture is used by every test (it is autouse).
    """

    # If VLLM_USE_V1 is not set, set then delete. This will
    # cause monkeypatch to clean up VLLM_USE_V1 upon exit
    # if VLLM modifies the value of envs.VLLM_USE_V1.
    # The order matters: the variable must first be registered with
    # monkeypatch (setenv) and then removed again (delenv), so the test
    # itself still starts with the variable unset.
    if "VLLM_USE_V1" not in os.environ:
        monkeypatch.setenv("VLLM_USE_V1", "")
        monkeypatch.delenv("VLLM_USE_V1")


Joe Runde's avatar
Joe Runde committed
162
@pytest.fixture(params=[True, False])
def run_with_both_engines(request, monkeypatch):
    """Run the requesting test twice: once with vLLM V1 and once without.

    The fixture is parametrized over ``True`` (V1) and ``False`` (V0) and
    sets ``VLLM_USE_V1`` to ``'1'`` or ``'0'`` accordingly. A test marked
    ``skip_v1`` is skipped in the V1 run; a test marked ``skip_v0`` is
    skipped in the V0 run.
    """
    use_v1 = request.param
    node = request.node
    skip_v0 = node.get_closest_marker("skip_v0")
    skip_v1 = node.get_closest_marker("skip_v1")

    # pytest.skip() raises, so nothing below runs for a skipped variant.
    if use_v1 and skip_v1:
        pytest.skip("Skipping test on vllm V1")
    if not use_v1 and skip_v0:
        pytest.skip("Skipping test on vllm V0")

    monkeypatch.setenv('VLLM_USE_V1', '1' if use_v1 else '0')

    yield
Joe Runde's avatar
Joe Runde committed
180
181


182
183
184
185
186
187
188
@pytest.fixture(autouse=True)
def init_test_http_connection():
    """Disable client reuse on the global HTTP connection for every test."""
    # pytest_asyncio may use a different event loop per test
    # so we need to make sure the async client is created anew
    global_http_connection.reuse_client = False


189
190
191
192
193
194
195
196
197
198
199
200
@pytest.fixture
def dist_init():
    """Set up a single-process distributed environment around one test.

    Initializes the distributed environment with ``world_size=1``,
    ``rank=0`` and the "nccl" backend, using a temporary file as the
    ``file://`` init method, then initializes model parallelism with
    sizes ``(1, 1)``. After the test, ``cleanup_dist_env_and_memory()``
    is called.
    """
    # mkstemp() returns an *open* OS-level file descriptor together with the
    # path. Only the path is needed here, so close the descriptor right away
    # instead of leaking one for every test that uses this fixture.
    fd, temp_file = tempfile.mkstemp()
    os.close(fd)

    init_distributed_environment(
        world_size=1,
        rank=0,
        distributed_init_method=f"file://{temp_file}",
        local_rank=0,
        backend="nccl",
    )
    initialize_model_parallel(1, 1)
    yield
    cleanup_dist_env_and_memory()
202
203


204
@pytest.fixture()
def should_do_global_cleanup_after_test(request) -> bool:
    """Tell whether the global cleanup should run after the current test.

    Returns ``False`` when the test carries the ``skip_global_cleanup``
    marker and ``True`` otherwise. Subdirectories may also override this
    fixture to skip the global cleanup, which can provide a ~10x speedup
    for non-GPU unit tests since they don't need to initialize torch.
    """
    skip_marker = request.node.get_closest_marker("skip_global_cleanup")
    return not skip_marker
212
213


214
@pytest.fixture(autouse=True)
def cleanup_fixture(should_do_global_cleanup_after_test: bool):
    """Run the global distributed/memory cleanup after each test, if enabled."""
    yield
    if not should_do_global_cleanup_after_test:
        return
    cleanup_dist_env_and_memory()
219
220


221
222
223
224
225
226
@pytest.fixture(autouse=True)
def dynamo_reset():
    """Call ``torch._dynamo.reset()`` after every test (autouse teardown)."""
    yield
    torch._dynamo.reset()


Woosuk Kwon's avatar
Woosuk Kwon committed
227
@pytest.fixture
def example_prompts() -> list[str]:
    """Return every line of every file listed in ``_TEST_PROMPTS``, in order."""
    return [
        line for filename in _TEST_PROMPTS
        for line in _read_prompts(filename)
    ]


235
236
237
238
239
240
@pytest.fixture
def example_system_message() -> str:
    """Return the full text of the system-message file at ``_SYS_MSG``.

    The file is decoded explicitly as UTF-8 so the result does not depend
    on the locale's default encoding of the machine running the tests.
    """
    with open(_SYS_MSG, encoding="utf-8") as f:
        return f.read()


241
242
243
244
245
246
247
class DecoderPromptType(Enum):
    """For encoder/decoder models only.

    NOTE(review): judging by the member names, this selects what kind of
    decoder prompt a test supplies (a custom one, ``None``, or an empty
    string) - confirm against the tests that use it.
    """
    CUSTOM = 1
    NONE = 2
    EMPTY_STR = 3


248
@pytest.fixture
def example_long_prompts() -> list[str]:
    """Return every line of every file listed in ``_LONG_PROMPTS``, in order."""
    return [
        line for filename in _LONG_PROMPTS
        for line in _read_prompts(filename)
    ]
Woosuk Kwon's avatar
Woosuk Kwon committed
254
255


256
@pytest.fixture(scope="session")
def image_assets() -> ImageTestAssets:
    """Session-scoped fixture returning the shared ``IMAGE_ASSETS`` object."""
    return IMAGE_ASSETS


261
@pytest.fixture(scope="session")
def video_assets() -> VideoTestAssets:
    """Session-scoped fixture returning the shared ``VIDEO_ASSETS`` object."""
    return VIDEO_ASSETS


266
@pytest.fixture(scope="session")
def audio_assets() -> AudioTestAssets:
    """Session-scoped fixture returning the shared ``AUDIO_ASSETS`` object."""
    return AUDIO_ASSETS


271
_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict)
272
_R = TypeVar("_R")
273

Woosuk Kwon's avatar
Woosuk Kwon committed
274
275
276

class HfRunner:

277
    def get_default_device(self):
        """Return the default device string for this runner.

        This is ``"cpu"`` when the current vLLM platform reports being a
        CPU platform, and the platform's ``device_type`` otherwise.
        """
        from vllm.platforms import current_platform

        if current_platform.is_cpu():
            return "cpu"
        return current_platform.device_type
282
283

    def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
        """Move ``x`` to ``device`` (default: ``self.device``).

        ``None`` and booleans are returned unchanged. A ``dict`` is rebuilt
        with each value moved recursively. Any other object that already
        reports the target device type is returned as is; otherwise the
        result of ``x.to(device)`` is returned.
        """
        if x is None or isinstance(x, (bool, )):
            return x

        target = self.device if device is None else device

        if isinstance(x, dict):
            moved = {}
            for key, value in x.items():
                moved[key] = self.wrap_device(value, target)
            return moved

        already_on_target = hasattr(x, "device") and x.device.type == target
        return x if already_on_target else x.to(target)
297

Woosuk Kwon's avatar
Woosuk Kwon committed
298
299
300
    def __init__(
        self,
        model_name: str,
        dtype: str = "auto",
        *,
        model_kwargs: Optional[dict[str, Any]] = None,
        trust_remote_code: bool = True,
        is_sentence_transformer: bool = False,
        is_cross_encoder: bool = False,
        skip_tokenizer_init: bool = False,
        auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
    ) -> None:
        """Load a HuggingFace model together with its tokenizer and processor.

        Args:
            model_name: Model identifier; it is first passed through
                ``maybe_model_redirect``.
            dtype: Requested dtype, resolved with ``_get_and_verify_dtype``.
            model_kwargs: Extra keyword arguments for the model constructor.
                The dict is copied, so the caller's object is never
                modified; ``"torch_dtype"`` is filled in with the resolved
                dtype unless already present.
            trust_remote_code: Forwarded to every ``from_pretrained`` call
                and to the sentence-transformers constructors.
            is_sentence_transformer: Load the model as a
                ``SentenceTransformer``.
            is_cross_encoder: Load the model as a ``CrossEncoder`` (only
                checked when ``is_sentence_transformer`` is false).
            skip_tokenizer_init: Do not load an ``AutoTokenizer``; use the
                processor's tokenizer instead.
            auto_cls: Class whose ``from_pretrained`` loads the model when
                neither sentence-transformers flag is set.
        """
        model_name = maybe_model_redirect(model_name)
        self.model_name = model_name

        self.config = AutoConfig.from_pretrained(
            model_name,
            trust_remote_code=trust_remote_code,
        )
        self.device = self.get_default_device()
        self.dtype = torch_dtype = _get_and_verify_dtype(
            self.model_name,
            self.config,
            dtype=dtype,
            is_pooling_model=is_sentence_transformer or is_cross_encoder,
        )

        # Work on a private copy: the setdefault() below must not leak a
        # "torch_dtype" entry into a dict that the caller owns and may reuse
        # for another runner.
        model_kwargs = dict(model_kwargs) if model_kwargs is not None else {}
        model_kwargs.setdefault("torch_dtype", torch_dtype)

        if is_sentence_transformer:
            # Lazy init required for AMD CI
            from sentence_transformers import SentenceTransformer

            self.model = SentenceTransformer(
                model_name,
                device=self.device,
                model_kwargs=model_kwargs,
                trust_remote_code=trust_remote_code,
            )
        elif is_cross_encoder:
            # Lazy init required for AMD CI
            from sentence_transformers import CrossEncoder

            self.model = CrossEncoder(
                model_name,
                device=self.device,
                automodel_args=model_kwargs,
                trust_remote_code=trust_remote_code,
            )
        else:
            model = auto_cls.from_pretrained(
                model_name,
                trust_remote_code=trust_remote_code,
                **model_kwargs,
            )

            # in case some unquantized custom models are not in same dtype
            if (getattr(model, "quantization_method", None) is None
                    and any(p.dtype != self.dtype
                            for p in model.parameters())):
                model = model.to(dtype=self.dtype)

            # Only move the model when it is not a bitsandbytes model and
            # its parameters do not already span two or more devices.
            if (getattr(model, "quantization_method", None) != "bitsandbytes"
                    and len({p.device
                             for p in model.parameters()}) < 2):
                model = model.to(device=self.device)

            self.model = model

        if not skip_tokenizer_init:
            self.tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                torch_dtype=torch_dtype,
                trust_remote_code=trust_remote_code,
            )

        # don't put this import at the top level
        # it will call torch.cuda.device_count()
        from transformers import AutoProcessor  # noqa: F401
        self.processor = AutoProcessor.from_pretrained(
            model_name,
            torch_dtype=torch_dtype,
            trust_remote_code=trust_remote_code,
        )
        if skip_tokenizer_init:
            self.tokenizer = self.processor.tokenizer
Woosuk Kwon's avatar
Woosuk Kwon committed
385

386
    def get_inputs(
        self,
        prompts: list[str],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
    ) -> list[Union[BatchFeature, BatchEncoding]]:
        """Run the processor on each prompt and its multimodal data.

        Every modality list that is given must have the same length as
        ``prompts``; entry ``i`` belongs to prompt ``i`` and may be ``None``
        to omit that modality for that prompt.

        Returns:
            One processor output per prompt. Outputs that are a
            ``BatchFeature`` are converted to ``self.dtype``.
        """
        for modality in (images, videos, audios):
            if modality is not None:
                assert len(prompts) == len(modality)

        encoded_prompts: list[Union[BatchFeature, BatchEncoding]] = []
        for idx, text in enumerate(prompts):
            call_kwargs: dict[str, Any] = {
                "text": text,
                "return_tensors": "pt",
            }

            if images is not None:
                image = images[idx]
                if image is not None:
                    call_kwargs["images"] = image

            if videos is not None:
                video = videos[idx]
                if video is not None:
                    call_kwargs["videos"] = video

            if audios is not None:
                audio_inputs = audios[idx]
                if audio_inputs is not None:
                    # HACK - not all processors take sampling_rate; we should
                    # clean this up in the future.
                    if len(audio_inputs) == 2:
                        audio, sr = audio_inputs
                        call_kwargs["audio"] = audio
                        call_kwargs["sampling_rate"] = sr
                    else:
                        call_kwargs["audio"] = audio_inputs

            encoded = self.processor(**call_kwargs)
            if isinstance(encoded, BatchFeature):
                encoded = encoded.to(dtype=self.dtype)

            encoded_prompts.append(encoded)

        return encoded_prompts

430
431
432
433
434
435
436
437
438
    def get_prompt_embeddings(self, prompts: list[str]) -> list[torch.Tensor]:
        """Return the input embeddings of each prompt.

        Each prompt is processed with ``get_inputs``, its ``input_ids`` are
        passed through the model's input-embedding layer, and the leading
        dimension is squeezed away.
        """
        prompt_embeds = []
        for encoded in self.get_inputs(prompts):
            token_ids = self.wrap_device(encoded)["input_ids"]
            embed_layer = self.model.get_input_embeddings()
            prompt_embeds.append(embed_layer(token_ids).squeeze(0))
        return prompt_embeds

439
    def classify(self, prompts: list[str]) -> list[list[float]]:
        """Run the model on each prompt and return its class scores.

        The first row of the output logits of each prompt is post-processed
        according to ``self.config.problem_type``:

        - ``"regression"``: the logits are returned unchanged;
        - ``"multi_label_classification"``: a sigmoid is applied;
        - anything else: a softmax over the last dimension is applied.

        Returns:
            One list of numbers per prompt (the values come from
            ``Tensor.tolist()``, not strings).
        """
        # output is final logits
        all_inputs = self.get_inputs(prompts)
        outputs = []
        problem_type = getattr(self.config, "problem_type", "")

        for inputs in all_inputs:
            output = self.model(**self.wrap_device(inputs))
            if problem_type == "regression":
                logits = output.logits[0].tolist()
            elif problem_type == "multi_label_classification":
                logits = output.logits.sigmoid()[0].tolist()
            else:
                logits = output.logits.softmax(dim=-1)[0].tolist()
            outputs.append(logits)

        return outputs

457
458
    def generate(
        self,
        prompts: list[str],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> list[tuple[list[list[int]], list[str]]]:
        """Generate with the HF model, one prompt at a time.

        Extra keyword arguments are forwarded to ``self.model.generate``
        (``use_cache=True`` is always passed).

        Returns:
            For each prompt, a tuple of the generated token-id sequences
            (as nested lists) and their decoded strings; the strings are
            decoded with special tokens skipped.
        """
        encoded_prompts = self.get_inputs(prompts,
                                          images=images,
                                          videos=videos,
                                          audios=audios)

        results: list[tuple[list[list[int]], list[str]]] = []
        for encoded in encoded_prompts:
            generated = self.model.generate(
                **self.wrap_device(encoded),
                use_cache=True,
                **kwargs,
            )
            decoded = self.processor.batch_decode(
                generated,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )
            results.append((generated.cpu().tolist(), decoded))
        return results

    def generate_greedy(
        self,
        prompts: list[str],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> list[tuple[list[int], str]]:
        """Greedy generation: sampling disabled, at most ``max_tokens`` new tokens.

        Returns:
            For each prompt, the first generated token-id sequence and its
            decoded string, as returned by ``generate``.
        """
        per_prompt = self.generate(prompts,
                                   images=images,
                                   videos=videos,
                                   audios=audios,
                                   do_sample=False,
                                   max_new_tokens=max_tokens,
                                   **kwargs)

        first_sequences = []
        for ids_batch, str_batch in per_prompt:
            first_sequences.append((ids_batch[0], str_batch[0]))
        return first_sequences
505
506
507

    def generate_beam_search(
        self,
        prompts: list[str],
        beam_width: int,
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
    ) -> list[tuple[list[list[int]], list[str]]]:
        """Beam-search generation returning ``beam_width`` sequences per prompt.

        Sampling is disabled and at most ``max_tokens`` new tokens are
        generated. Every occurrence of the tokenizer's pad token id is
        removed from the returned token-id sequences; the decoded strings
        are left as returned by ``generate``.
        """
        outputs = self.generate(prompts,
                                images=images,
                                videos=videos,
                                audios=audios,
                                do_sample=False,
                                max_new_tokens=max_tokens,
                                num_beams=beam_width,
                                num_return_sequences=beam_width)

        for idx, (beam_ids, beam_strs) in enumerate(outputs):
            unpadded = [[
                token for token in sequence
                if token != self.tokenizer.pad_token_id
            ] for sequence in beam_ids]
            outputs[idx] = (unpadded, beam_strs)
        return outputs
Woosuk Kwon's avatar
Woosuk Kwon committed
533

534
535
    def generate_greedy_logprobs(
        self,
        prompts: list[str],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> list[list[torch.Tensor]]:
        """Greedy generation returning the log-probabilities of every step.

        For each prompt the model generates with sampling disabled and
        hidden states enabled; the hidden states are then converted by
        ``_hidden_states_to_seq_logprobs``.

        Returns:
            For each prompt, the list of log-probability tensors produced
            by ``_hidden_states_to_seq_logprobs``.
        """
        encoded_prompts = self.get_inputs(prompts,
                                          images=images,
                                          videos=videos,
                                          audios=audios)

        logprobs_per_prompt: list[list[torch.Tensor]] = []
        for encoded in encoded_prompts:
            generation = self.model.generate(
                **self.wrap_device(encoded),
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
                output_hidden_states=True,
                return_dict_in_generate=True,
                **kwargs,
            )
            logprobs_per_prompt.append(
                self._hidden_states_to_seq_logprobs(generation.hidden_states))
        return logprobs_per_prompt

564
    def _hidden_states_to_seq_logprobs(
        self,
        hidden_states: tuple[tuple[torch.Tensor, ...], ...],
    ) -> list[torch.Tensor]:
        """Project hidden states through the output embeddings to log-probs.

        For every entry of ``hidden_states`` the first element of its last
        tensor is multiplied with the transposed output-embedding weight
        (plus the bias, if the layer has one) and normalized with a
        float32 log-softmax over the last dimension.
        """
        lm_head = self.model.get_output_embeddings()

        per_step_logprobs: list[torch.Tensor] = []
        for step_states in hidden_states:
            final_layer = step_states[-1][0]
            # Match the device and dtype of the projection weight first.
            projected_input = final_layer.to(
                device=lm_head.weight.device,
                dtype=lm_head.weight.dtype,
            )
            logits = torch.matmul(projected_input, lm_head.weight.t())
            if getattr(lm_head, "bias", None) is not None:
                logits += lm_head.bias.unsqueeze(0)
            per_step_logprobs.append(
                F.log_softmax(logits, dim=-1, dtype=torch.float32))

        return per_step_logprobs

    def _hidden_states_to_logprobs(
        self,
        hidden_states: tuple[tuple[torch.Tensor, ...], ...],
        num_logprobs: Optional[int],
    ) -> tuple[list[dict[int, float]], int]:
        """Convert hidden states to top-k log-prob dicts, one per step.

        Returns:
            A tuple of (a) one ``{token_id: logprob}`` dict per entry of
            ``hidden_states``, holding the ``num_logprobs`` largest
            log-probabilities, and (b) ``len(hidden_states)``.
        """
        seq_logprobs = self._hidden_states_to_seq_logprobs(hidden_states)
        output_len = len(hidden_states)

        # convert to dict
        topk_per_step: list[dict[int, float]] = []
        for step, step_logprobs in enumerate(seq_logprobs):
            # drop prompt logprobs: only the last row of the first entry
            # is kept
            if step == 0:
                step_logprobs = step_logprobs[-1, :].reshape(1, -1)
            best = step_logprobs.topk(num_logprobs)
            topk_per_step.append({
                token_id.item(): logprob.item()
                for token_id, logprob in zip(best.indices[0], best.values[0])
            })

        return topk_per_step, output_len

614
615
    def generate_greedy_logprobs_limit(
        self,
        prompts: list[str],
        max_tokens: int,
        num_logprobs: Optional[int],
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
        videos: Optional[PromptVideoInput] = None,
        **kwargs: Any,
    ) -> list[TokensTextLogprobs]:
        """Greedy generation returning tokens, text and top-k log-probs.

        Returns:
            For each prompt, a tuple of the generated token ids, the text
            decoded from them with ``self.tokenizer``, and one
            ``{token_id: logprob}`` dict per generated token holding the
            ``num_logprobs`` largest log-probabilities.
        """
        encoded_prompts = self.get_inputs(prompts,
                                          images=images,
                                          videos=videos,
                                          audios=audios)

        results = []
        for encoded in encoded_prompts:
            generation = self.model.generate(
                **self.wrap_device(encoded),
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
                output_hidden_states=True,
                return_dict_in_generate=True,
                **kwargs,
            )

            step_logprobs, _ = self._hidden_states_to_logprobs(
                generation.hidden_states, num_logprobs)

            # The generated tokens are the trailing entries of the first
            # sequence; there is one log-prob dict per generated token.
            num_generated = len(step_logprobs)
            new_token_ids = generation.sequences[0][-num_generated:]
            token_id_list = new_token_ids.tolist()
            text = self.tokenizer.decode(new_token_ids)
            results.append((token_id_list, text, step_logprobs))

        return results

661
662
663
    def encode(self, prompts: list[str], *args,
               **kwargs) -> list[list[torch.Tensor]]:
        """Forward ``prompts`` and all extra arguments to ``self.model.encode``."""
        encoded = self.model.encode(prompts, *args, **kwargs)
        return encoded
664

665
666
667
668
669
670
    def predict(self, prompts: list[list[str]], *args,
                **kwargs) -> torch.Tensor:
        """Forward to ``self.model.predict``, always with ``convert_to_tensor=True``."""
        scores = self.model.predict(prompts,
                                    *args,
                                    convert_to_tensor=True,
                                    **kwargs)
        return scores
671

672
673
674
675
    def __enter__(self):
        """Enter the context manager; returns this runner itself."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Drop the model reference and run the global cleanup on exit."""
        # Delete the attribute first so this runner no longer references
        # the model when the cleanup runs.
        del self.model
        cleanup_dist_env_and_memory()
678

Woosuk Kwon's avatar
Woosuk Kwon committed
679

Cyrus Leung's avatar
Cyrus Leung committed
680
@pytest.fixture(scope="session")
def hf_runner():
    """Session-scoped fixture returning the ``HfRunner`` class itself.

    The class (not an instance) is returned; tests construct runners
    from it with their own arguments.
    """
    return HfRunner


class VllmRunner:
    """
    Thin test wrapper around a {class}`~vllm.LLM` instance.

    The default values of some arguments have been modified from
    {class}`~vllm.LLM` as follows:

    - `trust_remote_code`: Set to `True` instead of `False` for convenience.
    - `seed`: Set to `0` instead of `None` for test reproducibility.
    - `max_model_len`: Set to `1024` instead of `None` to reduce memory usage.
    - `block_size`: To reduce memory usage, set default to `64` if on XPU
        devices, otherwise default to `16`.
    - `enable_chunked_prefill`: Set to `False` instead of `None` for
      test reproducibility.
    - `enforce_eager`: Set to `False` to test CUDA graph.
    """

    def __init__(
        self,
        model_name: str,
        runner: RunnerOption = "auto",
        convert: ConvertOption = "auto",
        tokenizer_name: Optional[str] = None,
        tokenizer_mode: str = "auto",
        trust_remote_code: bool = True,
        seed: Optional[int] = 0,
        max_model_len: Optional[int] = 1024,
        dtype: str = "auto",
        disable_log_stats: bool = True,
        tensor_parallel_size: int = 1,
        block_size: int = 16 if not torch.xpu.is_available() else 64,
        enable_chunked_prefill: Optional[bool] = False,
        swap_space: int = 4,
        enforce_eager: Optional[bool] = False,
        **kwargs,
    ) -> None:
        """Build the wrapped ``LLM``; extra ``kwargs`` are forwarded to it."""
        self.llm = LLM(
            model=model_name,
            runner=runner,
            convert=convert,
            tokenizer=tokenizer_name,
            tokenizer_mode=tokenizer_mode,
            trust_remote_code=trust_remote_code,
            dtype=dtype,
            seed=seed,
            swap_space=swap_space,
            enforce_eager=enforce_eager,
            disable_log_stats=disable_log_stats,
            tensor_parallel_size=tensor_parallel_size,
            max_model_len=max_model_len,
            block_size=block_size,
            enable_chunked_prefill=enable_chunked_prefill,
            **kwargs,
        )

    def get_inputs(
        self,
        prompts: Union[list[str], list[torch.Tensor], list[list[int]]],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
    ) -> list[TextPrompt]:
        """Pair each prompt with its multimodal data as a prompt dict.

        A ``str`` prompt is stored under ``"prompt"``, a ``list`` prompt
        under ``"prompt_token_ids"``, and anything else under
        ``"prompt_embeds"``. ``"multi_modal_data"`` is ``None`` when no
        image/video/audio item is present for that prompt.

        Raises:
            ValueError: If a non-``None`` multimodal input does not have the
                same length as ``prompts``.
        """
        if any(x is not None and len(x) != len(prompts)
               for x in [images, videos, audios]):
            raise ValueError(
                "All non-None multimodal inputs must have the same length as "
                "prompts")

        modalities = (("image", images), ("video", videos), ("audio",
                                                              audios))

        inputs = []
        for i, prompt in enumerate(prompts):
            multi_modal_data: dict[str, Any] = {}
            for key, items in modalities:
                if items is not None and (item := items[i]) is not None:
                    multi_modal_data[key] = item

            text_prompt_kwargs: dict[str, Any] = {
                "multi_modal_data": multi_modal_data or None
            }
            if isinstance(prompt, str):
                text_prompt_kwargs["prompt"] = prompt
            elif isinstance(prompt, list):
                text_prompt_kwargs["prompt_token_ids"] = prompt
            else:
                text_prompt_kwargs["prompt_embeds"] = prompt

            inputs.append(TextPrompt(**text_prompt_kwargs))

        return inputs

    def generate(
        self,
        prompts: Union[list[str], list[torch.Tensor]],
        sampling_params: SamplingParams,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> list[tuple[list[list[int]], list[str]]]:
        """Generate completions and return them prefixed with the prompt.

        For every request the result holds one ``(token_ids, texts)`` tuple
        with an entry per sample: the prompt token ids followed by the
        sampled ids, and the prompt text (or ``""`` when it is missing)
        followed by the sampled text. ``kwargs`` go to ``LLM.generate``.
        """
        inputs = self.get_inputs(prompts,
                                 images=images,
                                 videos=videos,
                                 audios=audios)

        req_outputs = self.llm.generate(inputs,
                                        sampling_params=sampling_params,
                                        **kwargs)

        outputs: list[tuple[list[list[int]], list[str]]] = []
        for req_output in req_outputs:
            prompt_str = req_output.prompt
            prompt_ids = req_output.prompt_token_ids
            req_sample_output_ids: list[list[int]] = []
            req_sample_output_strs: list[str] = []
            for sample in req_output.outputs:
                output_str = sample.text
                output_ids = list(sample.token_ids)
                req_sample_output_ids.append(prompt_ids + output_ids)
                req_sample_output_strs.append((prompt_str or "") + output_str)
            outputs.append((req_sample_output_ids, req_sample_output_strs))
        return outputs

    @staticmethod
    def _final_steps_generate_w_logprobs(
        req_outputs: list[RequestOutput],
    ) -> list[TokensTextLogprobsPromptLogprobs]:
        """Flatten request outputs into ``(ids, text, logprobs, prompt_logprobs)``.

        Only the last sample of each request is reported.
        """
        outputs: list[TokensTextLogprobsPromptLogprobs] = []
        for req_output in req_outputs:
            assert len(req_output.outputs) > 0
            # Only the final sample is kept for each request.
            sample = req_output.outputs[-1]
            outputs.append((list(sample.token_ids), sample.text,
                            sample.logprobs, req_output.prompt_logprobs))
        return outputs

    def generate_w_logprobs(
        self,
        prompts: list[str],
        sampling_params: SamplingParams,
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
        videos: Optional[PromptVideoInput] = None,
        **kwargs: Any,
    ) -> Union[list[TokensTextLogprobs],
               list[TokensTextLogprobsPromptLogprobs]]:
        """Generate and return token ids, text and logprobs per prompt.

        Prompt logprobs are included as a fourth tuple element only when
        ``sampling_params.prompt_logprobs`` is not ``None``.
        """
        inputs = self.get_inputs(prompts,
                                 images=images,
                                 videos=videos,
                                 audios=audios)

        req_outputs = self.llm.generate(inputs,
                                        sampling_params=sampling_params,
                                        **kwargs)

        toks_str_logsprobs_prompt_logprobs = (
            self._final_steps_generate_w_logprobs(req_outputs))
        # Omit prompt logprobs if not required by sampling params
        return ([x[0:-1] for x in toks_str_logsprobs_prompt_logprobs]
                if sampling_params.prompt_logprobs is None else
                toks_str_logsprobs_prompt_logprobs)

    def generate_greedy(
        self,
        prompts: Union[list[str], list[torch.Tensor]],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> list[tuple[list[int], str]]:
        """Greedy-decode (temperature 0) and return the first sample only."""
        greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
        outputs = self.generate(prompts,
                                greedy_params,
                                images=images,
                                videos=videos,
                                audios=audios,
                                **kwargs)
        return [(output_ids[0], output_str[0])
                for output_ids, output_str in outputs]

    def generate_greedy_logprobs(
        self,
        prompts: list[str],
        max_tokens: int,
        num_logprobs: Optional[int],
        num_prompt_logprobs: Optional[int] = None,
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
        videos: Optional[PromptVideoInput] = None,
        stop_token_ids: Optional[list[int]] = None,
        stop: Optional[list[str]] = None,
        **kwargs: Any,
    ) -> Union[list[TokensTextLogprobs],
               list[TokensTextLogprobsPromptLogprobs]]:
        """Greedy-decode with logprobs via :meth:`generate_w_logprobs`."""
        greedy_logprobs_params = SamplingParams(
            temperature=0.0,
            max_tokens=max_tokens,
            logprobs=num_logprobs,
            prompt_logprobs=num_prompt_logprobs,
            stop_token_ids=stop_token_ids,
            stop=stop)

        return self.generate_w_logprobs(prompts,
                                        greedy_logprobs_params,
                                        images=images,
                                        audios=audios,
                                        videos=videos,
                                        **kwargs)

    def generate_prompt_perplexity(self, prompts: list[str]) -> list[float]:
        """
        Return the perplexity score associated with generating the prompts

        :param prompts: list of prompts to score
        :return: perplexity score of each prompt
        """
        outputs = self.generate_greedy_logprobs(prompts,
                                                max_tokens=1,
                                                num_logprobs=None,
                                                num_prompt_logprobs=0)

        perplexities = []
        for output in outputs:
            output = cast(TokensTextLogprobsPromptLogprobs, output)
            token_datas = cast(list[Optional[dict[int, Logprob]]], output[3])
            # The first prompt position carries no logprob entry.
            assert token_datas[0] is None
            token_log_probs = []
            for token_data in token_datas[1:]:
                assert token_data is not None
                assert len(token_data) == 1
                token_log_prob = list(token_data.values())[0].logprob
                token_log_probs.append(token_log_prob)

            perplexity = math.exp(-sum(token_log_probs) / len(token_log_probs))
            perplexities.append(perplexity)

        return perplexities

    def generate_beam_search(
        self,
        prompts: list[str],
        beam_width: int,
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        concurrency_limit: Optional[int] = None,
    ) -> list[tuple[list[list[int]], list[str]]]:
        """Run beam search and return ``(token_ids, texts)`` per prompt.

        Each tuple lists the tokens and the text of every returned sequence.
        """
        inputs = self.get_inputs(prompts,
                                 images=images,
                                 videos=videos,
                                 audios=audios)

        outputs = self.llm.beam_search(inputs,
                                       BeamSearchParams(beam_width=beam_width,
                                                        max_tokens=max_tokens),
                                       concurrency_limit=concurrency_limit)
        returned_outputs = []
        for output in outputs:
            token_ids = [x.tokens for x in output.sequences]
            texts = [x.text for x in output.sequences]
            returned_outputs.append((token_ids, texts))
        return returned_outputs

    def classify(self, prompts: list[str]) -> list[list[float]]:
        """Return ``outputs.probs`` of ``LLM.classify`` for each prompt."""
        req_outputs = self.llm.classify(prompts)
        return [req_output.outputs.probs for req_output in req_outputs]

    def embed(self,
              prompts: list[str],
              images: Optional[PromptImageInput] = None,
              videos: Optional[PromptVideoInput] = None,
              audios: Optional[PromptAudioInput] = None,
              *args,
              **kwargs) -> list[list[float]]:
        """Return ``outputs.embedding`` of ``LLM.embed`` for each prompt.

        Extra positional and keyword arguments are forwarded to ``LLM.embed``.
        """
        inputs = self.get_inputs(prompts,
                                 images=images,
                                 videos=videos,
                                 audios=audios)

        req_outputs = self.llm.embed(inputs, *args, **kwargs)
        return [req_output.outputs.embedding for req_output in req_outputs]

    def encode(self, prompts: list[str]) -> list[list[float]]:
        """Return ``outputs.data`` of ``LLM.encode`` for each prompt."""
        req_outputs = self.llm.encode(prompts)
        return [req_output.outputs.data for req_output in req_outputs]

    def reward(self, prompts: list[str]) -> list[list[float]]:
        """Return ``outputs.data`` of ``LLM.reward`` for each prompt."""
        req_outputs = self.llm.reward(prompts)
        return [req_output.outputs.data for req_output in req_outputs]

    def score(
        self,
        text_1: Union[str, list[str]],
        text_2: Union[str, list[str]],
        *args,
        **kwargs,
    ) -> list[float]:
        """Return ``outputs.score`` of ``LLM.score`` for each result."""
        req_outputs = self.llm.score(text_1, text_2, *args, **kwargs)
        return [req_output.outputs.score for req_output in req_outputs]

    def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
        """Forward ``func`` to ``LLM.apply_model`` and return its result."""
        return self.llm.apply_model(func)

    def get_llm(self) -> LLM:
        """Return the wrapped ``LLM`` instance."""
        return self.llm

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Drop our reference to the engine before the global cleanup runs.
        del self.llm
        cleanup_dist_env_and_memory()
1001

Woosuk Kwon's avatar
Woosuk Kwon committed
1002

1003
@pytest.fixture(scope="session")
def vllm_runner():
    """Session-scoped fixture that hands tests the ``VllmRunner`` class.

    The class itself (not an instance) is returned, so each test decides
    how to construct its own runner.
    """
    runner_cls = VllmRunner
    return runner_cls
1006
1007


1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
@pytest.fixture()
def temporary_enable_log_propagate():
    """Enable propagation on the ``vllm`` logger for one test.

    ``propagate`` is set to ``True`` while the test runs and is then put
    back to whatever value it had before, even if the generator is closed
    early.
    """
    import logging
    logger = logging.getLogger("vllm")
    # Remember the prior setting instead of assuming it was ``False``.
    previous_propagate = logger.propagate
    logger.propagate = True
    try:
        yield
    finally:
        logger.propagate = previous_propagate


@pytest.fixture()
def caplog_vllm(temporary_enable_log_propagate, caplog):
    """Yield pytest's ``caplog`` with ``vllm`` log propagation enabled.

    ``caplog`` depends on logs propagated to the root logger, so this
    fixture requests ``temporary_enable_log_propagate`` to switch
    ``propagate=True`` on temporarily.
    """
    yield caplog
1022
1023
1024
1025
1026
1027
1028


@pytest.fixture(scope="session")
def num_gpus_available():
    """Get number of GPUs without initializing the CUDA context
    in current process.

    The count is whatever ``current_platform.device_count()`` reports.
    """
    # Imported lazily, inside the fixture, rather than at module import time.
    from vllm.platforms import current_platform

    device_count = current_platform.device_count()
    return device_count
1031
1032
1033


temp_dir = tempfile.gettempdir()
_dummy_opt_path = os.path.join(temp_dir, "dummy_opt")
_dummy_llava_path = os.path.join(temp_dir, "dummy_llava")
_dummy_gemma2_embedding_path = os.path.join(temp_dir, "dummy_gemma2_embedding")


def _prepare_dummy_model(repo_id: str, local_dir: str,
                         architecture: str) -> str:
    """Download ``repo_id`` into ``local_dir`` once and rename its architecture.

    When ``local_dir`` does not exist yet, the repository is downloaded
    (skipping the weight files matched by the ignore patterns) and the
    ``architectures`` entry of its ``config.json`` is replaced by
    ``[architecture]``. An existing directory is left untouched.

    Returns:
        ``local_dir``.
    """
    if not os.path.exists(local_dir):
        snapshot_download(repo_id=repo_id,
                          local_dir=local_dir,
                          ignore_patterns=[
                              "*.bin", "*.bin.index.json", "*.pt", "*.h5",
                              "*.msgpack"
                          ])
        json_path = os.path.join(local_dir, "config.json")
        assert os.path.exists(json_path)
        with open(json_path, encoding="utf-8") as f:
            config = json.load(f)
        config["architectures"] = [architecture]
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(config, f)
    return local_dir


@pytest.fixture
def dummy_opt_path():
    """Path to a local copy of facebook/opt-125m declared as ``MyOPTForCausalLM``."""
    return _prepare_dummy_model("facebook/opt-125m", _dummy_opt_path,
                                "MyOPTForCausalLM")


@pytest.fixture
def dummy_llava_path():
    """Path to a local copy of llava-hf/llava-1.5-7b-hf declared as ``MyLlava``."""
    return _prepare_dummy_model("llava-hf/llava-1.5-7b-hf", _dummy_llava_path,
                                "MyLlava")


@pytest.fixture
def dummy_gemma2_embedding_path():
    """Path to a local copy of BAAI/bge-multilingual-gemma2 declared as
    ``MyGemma2Embedding``."""
    return _prepare_dummy_model("BAAI/bge-multilingual-gemma2",
                                _dummy_gemma2_embedding_path,
                                "MyGemma2Embedding")
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111


# Add the `--optional` flag to allow running tests
# that are marked with @pytest.mark.optional
def pytest_addoption(parser):
    """Register the ``--optional`` command-line flag.

    The flag is a boolean switch that defaults to ``False``.
    """
    flag_settings = {
        "action": "store_true",
        "default": False,
        "help": "run optional test",
    }
    parser.addoption("--optional", **flag_settings)


def pytest_collection_modifyitems(config, items):
    """Skip tests carrying the ``optional`` keyword unless ``--optional`` is set.

    When the flag is given on the command line the collected items are left
    untouched; otherwise every item with ``"optional"`` in its keywords gets
    a skip marker.
    """
    run_optional = config.getoption("--optional")
    if not run_optional:
        skip_marker = pytest.mark.skip(reason="need --optional option to run")
        for collected in items:
            if "optional" in collected.keywords:
                collected.add_marker(skip_marker)
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124


@pytest.fixture(scope="session")
def cli_config_file():
    """Return the path to the CLI config file.

    The file is ``config/test_config.yaml`` under ``_TEST_DIR``.
    """
    config_path = os.path.join(_TEST_DIR, "config", "test_config.yaml")
    return config_path


@pytest.fixture(scope="session")
def cli_config_file_with_model():
    """Return the path to the CLI config file with model.

    The file is ``config/test_config_with_model.yaml`` under ``_TEST_DIR``.
    """
    config_path = os.path.join(_TEST_DIR, "config",
                               "test_config_with_model.yaml")
    return config_path
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240


class AssetHandler(http.server.BaseHTTPRequestHandler):
    """HTTP handler that serves image assets addressed by file name.

    A ``GET /<name>.<ext>`` request is answered with the bytes returned by
    ``ImageAsset(<name>).read_bytes(ext=<ext>)``. Only the extensions listed
    in ``_FALLBACK_CONTENT_TYPES`` are accepted.
    """

    # Supported extensions and the content type used for each one when
    # ``mimetypes`` cannot guess a type from the file name.
    _FALLBACK_CONTENT_TYPES = {"jpg": "image/jpeg", "png": "image/png"}

    def log_message(self, *args, **kwargs):
        """Discard the per-request log lines of the base handler."""
        pass

    def do_GET(self):
        """Serve one asset, or reply 404/500 when it cannot be served.

        - 404 when the path has no ``<name>.<ext>`` form or the extension is
          not supported.
        - 500 when loading the asset raises.
        """
        # Accepts paths like: /1280px-Venn_diagram_rgb.jpg
        # A query string or fragment is not part of the asset name.
        request_path = self.path.split("?", 1)[0].split("#", 1)[0]
        filename = request_path.lstrip("/")
        if not filename or "." not in filename:
            self.send_error(404, "Missing filename (expected /<name>.<ext>)")
            return

        base, ext = filename.rsplit(".", 1)
        ext = ext.lower()

        if ext not in self._FALLBACK_CONTENT_TYPES:
            self.send_error(404, f"Unsupported extension: .{ext}")
            return

        try:
            data = ImageAsset(base).read_bytes(ext=ext)
        except Exception as e:
            # Any loading failure is reported to the client instead of
            # killing the handler thread.
            self.send_error(500, f"Failed to load asset: {ext} {base} {e} ")
            return

        ctype, _ = mimetypes.guess_type(filename)
        if ctype is None:
            ctype = self._FALLBACK_CONTENT_TYPES[ext]
        self.send_response(200)
        self.send_header("Content-Type", ctype)
        self.send_header("Content-Length", str(len(data)))
        self.end_headers()
        self.wfile.write(data)


def _find_free_port() -> int:
    """Return a port number the OS assigned for a bind on 127.0.0.1.

    A temporary socket is bound to port 0, the assigned port is read back,
    and the socket is closed before returning.
    """
    probe = socket.socket()
    try:
        probe.bind(("127.0.0.1", 0))
        port = probe.getsockname()[1]
    finally:
        probe.close()
    return port


class LocalAssetServer:
    """Context manager running an ``AssetHandler`` HTTP server in a thread.

    Entering starts a ``ThreadingHTTPServer`` on ``address`` with a port
    chosen by the OS; exiting shuts the server down, closes its socket and
    joins the serving thread.
    """

    address: str
    # -1 until the server has been started.
    port: int
    server: Optional[http.server.ThreadingHTTPServer]
    thread: Optional[threading.Thread]

    def __init__(self, address: str = "127.0.0.1") -> None:
        self.address = address
        self.port = -1
        self.server = None
        self.thread = None

    def __enter__(self):
        """Start serving in a daemon thread and return ``self``."""
        # Bind to port 0 so the OS assigns the port in the same step as the
        # bind; probing for a free port first and binding later leaves a
        # window in which another process can take it.
        self.server = http.server.ThreadingHTTPServer((self.address, 0),
                                                      AssetHandler)
        self.port = self.server.server_address[1]
        self.thread = threading.Thread(target=self.server.serve_forever,
                                       daemon=True)
        self.thread.start()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Stop the server and release its resources.

        Always returns ``False`` so exceptions from the ``with`` body
        propagate.
        """
        if self.server is not None:
            self.server.shutdown()
            # ``shutdown`` only stops the serve loop; the listening socket
            # must be closed explicitly.
            self.server.server_close()
            self.server = None

        if self.thread is not None:
            self.thread.join()
            self.thread = None

        return False

    @property
    def base_url(self) -> str:
        """``http://<address>:<port>`` for the current address and port."""
        return f"http://{self.address}:{self.port}"

    def url_for(self, name: str) -> str:
        """e.g., name='RGBA_comp.png' -> 'http://127.0.0.1:PORT/RGBA_comp.png'"""
        return f"{self.base_url}/{name}"

    def get_image_asset(self, name: str) -> Image.Image:
        """Fetch ``name`` from this server via ``fetch_image``."""
        return fetch_image(self.url_for(name))


@pytest.fixture(scope="session")
def local_asset_server() -> Generator[LocalAssetServer, None, None]:
    """
    Starts a thread based HTTP server bound to 127.0.0.1 on a random free port.
    The server currently serves images at:
    http://127.0.0.1:<port>/<name>.<ext>
    """
    asset_server = LocalAssetServer()
    with asset_server as running_server:
        yield running_server


@pytest.fixture
def image_url(request, local_asset_server) -> str:
    """Indirect fixture: takes one asset file name, returns its full URL."""
    # request.param is one of the IMAGE_ASSETS filenames
    return local_asset_server.url_for(request.param)


@pytest.fixture
def image_urls(request, local_asset_server) -> list[str]:
    """Indirect fixture: takes a list of names, returns list of full URLs."""
    names: list[str] = request.param
    to_url = local_asset_server.url_for
    return list(map(to_url, names))