conftest.py 42.1 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
import json
4
import os
5
import tempfile
6
from enum import Enum
zhuwenwen's avatar
zhuwenwen committed
7

8
from typing import Any, Callable, Optional, TypedDict, TypeVar, Union
9
10
import pytest
import pytest_html
Woosuk Kwon's avatar
Woosuk Kwon committed
11

12
import numpy as np
Woosuk Kwon's avatar
Woosuk Kwon committed
13
14
import pytest
import torch
15
import torch.nn as nn
16
import torch.nn.functional as F
17
from huggingface_hub import snapshot_download
18
from PIL import Image
19
20
from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
                          BatchEncoding, BatchFeature)
21
from transformers.models.auto.auto_factory import _BaseAutoModelClass
Woosuk Kwon's avatar
Woosuk Kwon committed
22

23
24
from tests.models.utils import (TokensTextLogprobs,
                                TokensTextLogprobsPromptLogprobs)
Woosuk Kwon's avatar
Woosuk Kwon committed
25
from vllm import LLM, SamplingParams
26
from vllm.assets.audio import AudioAsset
27
from vllm.assets.image import ImageAsset
28
from vllm.assets.video import VideoAsset
29
from vllm.config import TaskOption, _get_and_verify_dtype
30
from vllm.connections import global_http_connection
31
from vllm.distributed import (cleanup_dist_env_and_memory,
32
33
                              init_distributed_environment,
                              initialize_model_parallel)
34
from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
35
                         to_enc_dec_tuple_list, zip_enc_dec_prompts)
36
from vllm.logger import init_logger
37
from vllm.outputs import RequestOutput
38
from vllm.sampling_params import BeamSearchParams
zhuwenwen's avatar
zhuwenwen committed
39

40

41
from vllm.transformers_utils.utils import maybe_model_redirect
zhuwenwen's avatar
zhuwenwen committed
42
from .utils import models_path_prefix
43

44
logger = init_logger(__name__)
Woosuk Kwon's avatar
Woosuk Kwon committed
45

46
47
48
_TEST_DIR = os.path.dirname(__file__)
_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
49
_SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")
50

Cyrus Leung's avatar
Cyrus Leung committed
51
_M = TypeVar("_M")
52

53
_PromptMultiModalInput = Union[list[_M], list[list[_M]]]
Cyrus Leung's avatar
Cyrus Leung committed
54
55

PromptImageInput = _PromptMultiModalInput[Image.Image]
56
PromptAudioInput = _PromptMultiModalInput[tuple[np.ndarray, int]]
Cyrus Leung's avatar
Cyrus Leung committed
57
PromptVideoInput = _PromptMultiModalInput[np.ndarray]
58

59

60
def _read_prompts(filename: str) -> list[str]:
    """Read a prompt file and return its lines.

    Args:
        filename: Path of the text file to read.

    Returns:
        One entry per line, in file order. Line terminators are kept,
        exactly as ``readlines()`` returns them.
    """
    # Decode explicitly as UTF-8 so the prompts do not depend on the
    # preferred encoding of whatever machine runs the tests.
    with open(filename, encoding="utf-8") as f:
        return f.readlines()
Woosuk Kwon's avatar
Woosuk Kwon committed
64
65


66
class ImageAssetPrompts(TypedDict):
    """Prompt text for each test image, keyed by image asset name."""
    # Prompt that goes with the "stop_sign" image asset.
    stop_sign: str
    # Prompt that goes with the "cherry_blossom" image asset.
    cherry_blossom: str
69
70


71
class ImageTestAssets(list[ImageAsset]):
    """List of the image assets used by the tests.

    The list always holds the "stop_sign" asset followed by the
    "cherry_blossom" asset.
    """

    def __init__(self) -> None:
        stop_sign = ImageAsset("stop_sign")
        cherry_blossom = ImageAsset("cherry_blossom")
        super().__init__([stop_sign, cherry_blossom])

    def prompts(self, prompts: ImageAssetPrompts) -> list[str]:
        """
        Turn a per-image prompt mapping into a list of prompts.

        The returned prompts are ordered like the assets are when
        iterating through this object.
        """
        stop_sign_prompt = prompts["stop_sign"]
        cherry_blossom_prompt = prompts["cherry_blossom"]
        return [stop_sign_prompt, cherry_blossom_prompt]
87
88


89
90
class VideoAssetPrompts(TypedDict):
    """Prompt text for each test video, keyed by video asset name."""
    # Prompt that goes with the "baby_reading" video asset.
    baby_reading: str
91
92


93
class VideoTestAssets(list[VideoAsset]):
    """List of the video assets used by the tests ("baby_reading" only)."""

    def __init__(self) -> None:
        baby_reading = VideoAsset("baby_reading")
        super().__init__([baby_reading])

    def prompts(self, prompts: VideoAssetPrompts) -> list[str]:
        """Return the prompts ordered like the assets in this list."""
        baby_reading_prompt = prompts["baby_reading"]
        return [baby_reading_prompt]
102
103


104
class AudioAssetPrompts(TypedDict):
    """Prompt text for each test audio clip, keyed by audio asset name."""
    # Prompt that goes with the "mary_had_lamb" audio asset.
    mary_had_lamb: str
    # Prompt that goes with the "winning_call" audio asset.
    winning_call: str
107
108


109
class AudioTestAssets(list[AudioAsset]):
    """List of the audio assets used by the tests.

    The list always holds the "mary_had_lamb" asset followed by the
    "winning_call" asset.
    """

    def __init__(self) -> None:
        mary_had_lamb = AudioAsset("mary_had_lamb")
        winning_call = AudioAsset("winning_call")
        super().__init__([mary_had_lamb, winning_call])

    def prompts(self, prompts: AudioAssetPrompts) -> list[str]:
        """Return the prompts ordered like the assets in this list."""
        mary_had_lamb_prompt = prompts["mary_had_lamb"]
        winning_call_prompt = prompts["winning_call"]
        return [mary_had_lamb_prompt, winning_call_prompt]
119
120


121
IMAGE_ASSETS = ImageTestAssets()
122
"""Singleton instance of {class}`ImageTestAssets`."""
123
VIDEO_ASSETS = VideoTestAssets()
124
"""Singleton instance of {class}`VideoTestAssets`."""
125
AUDIO_ASSETS = AudioTestAssets()
126
"""Singleton instance of {class}`AudioTestAssets`."""
127
128


129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
@pytest.fixture(scope="function", autouse=True)
def cleanup_VLLM_USE_V1(monkeypatch):
    """
    Keep "VLLM_USE_V1" from leaking between tests.

    The V1 oracle sets "VLLM_USE_V1" during loading, so each test
    invocation may change the env variable. Once monkeypatch has touched
    "VLLM_USE_V1", any change vLLM makes to it during the test is cleaned
    up when the test exits.

    This fixture is used by every test.
    """
    if "VLLM_USE_V1" in os.environ:
        return

    # The variable is unset: set it and delete it again. The net effect
    # on the environment is nil, but monkeypatch now tracks the variable
    # and will clean it up on exit if vLLM modifies envs.VLLM_USE_V1.
    monkeypatch.setenv("VLLM_USE_V1", "")
    monkeypatch.delenv("VLLM_USE_V1")


Joe Runde's avatar
Joe Runde committed
149
@pytest.fixture(params=[True, False])
def run_with_both_engines(request, monkeypatch):
    """Run the requesting test twice: once with V1 and once without.

    Tests marked `skip_v1` are skipped on the V1 run, and tests marked
    `skip_v0` are skipped on the non-V1 run. Otherwise "VLLM_USE_V1" is
    set to '1' or '0' for the duration of the test.
    """
    use_v1 = request.param
    skip_v0 = request.node.get_closest_marker("skip_v0")
    skip_v1 = request.node.get_closest_marker("skip_v1")

    # pytest.skip() raises, so nothing below runs for a skipped variant.
    if use_v1 and skip_v1:
        pytest.skip("Skipping test on vllm V1")
    if not use_v1 and skip_v0:
        pytest.skip("Skipping test on vllm V0")

    monkeypatch.setenv('VLLM_USE_V1', '1' if use_v1 else '0')

    yield
Joe Runde's avatar
Joe Runde committed
167
168


169
170
171
172
173
174
175
@pytest.fixture(autouse=True)
def init_test_http_connection():
    """Disable client reuse on the global HTTP connection for every test."""
    # pytest_asyncio may use a different event loop per test
    # so we need to make sure the async client is created anew
    global_http_connection.reuse_client = False


176
177
178
179
180
181
182
183
184
185
186
187
@pytest.fixture
def dist_init():
    """Run a test inside a single-process distributed environment.

    Before the test: initializes the distributed environment
    (world size 1, rank 0, "nccl" backend) using a temporary file as the
    `file://` init method, then initializes model parallelism with
    sizes (1, 1).

    After the test: calls `cleanup_dist_env_and_memory()` and removes the
    temporary file.
    """
    # mkstemp() hands back an *open* OS-level descriptor together with
    # the path. Only the path is needed, so close the descriptor right
    # away instead of leaking one per test.
    fd, temp_file = tempfile.mkstemp()
    os.close(fd)
    try:
        init_distributed_environment(
            world_size=1,
            rank=0,
            distributed_init_method=f"file://{temp_file}",
            local_rank=0,
            backend="nccl",
        )
        initialize_model_parallel(1, 1)
        yield
        cleanup_dist_env_and_memory()
    finally:
        # Best-effort removal: the file may already have been deleted
        # by whoever consumed the init method.
        try:
            os.remove(temp_file)
        except FileNotFoundError:
            pass
189
190


191
@pytest.fixture()
def should_do_global_cleanup_after_test(request) -> bool:
    """Decide whether global cleanup should run after the current test.

    Returns False when the test carries the `skip_global_cleanup` marker,
    True otherwise. Subdirectories can also override this fixture to skip
    global cleanup; this can provide a ~10x speedup for non-GPU unit tests
    since they don't need to initialize torch.
    """
    skip_marker = request.node.get_closest_marker("skip_global_cleanup")
    return not skip_marker
199
200


201
@pytest.fixture(autouse=True)
def cleanup_fixture(should_do_global_cleanup_after_test: bool):
    """After each test, run global cleanup unless it has been disabled."""
    yield
    if not should_do_global_cleanup_after_test:
        return
    cleanup_dist_env_and_memory()
206
207


208
209
210
211
212
213
@pytest.fixture(autouse=True)
def dynamo_reset():
    """Call `torch._dynamo.reset()` after every test."""
    yield
    torch._dynamo.reset()


Woosuk Kwon's avatar
Woosuk Kwon committed
214
@pytest.fixture
def example_prompts() -> list[str]:
    """Return the lines of every file in `_TEST_PROMPTS`, in order."""
    return [
        line for path in _TEST_PROMPTS for line in _read_prompts(path)
    ]


222
223
224
225
226
227
@pytest.fixture
def example_system_message() -> str:
    """Return the full text of the file at `_SYS_MSG`."""
    with open(_SYS_MSG) as f:
        return f.read()


228
229
230
231
232
233
234
class DecoderPromptType(Enum):
    """For encoder/decoder models only."""
    # The members key the dict returned by the
    # `example_encoder_decoder_prompts` fixture; each comment says which
    # decoder prompts that fixture pairs with the encoder prompts.
    CUSTOM = 1  # the encoder prompt list, reversed
    NONE = 2  # None for every prompt
    EMPTY_STR = 3  # "" for every prompt


235
@pytest.fixture
def example_encoder_decoder_prompts(
) -> dict[DecoderPromptType, list[ExplicitEncoderDecoderPrompt]]:
    '''
    Build example encoder/decoder prompts for each decoder prompt type.

    The encoder prompts are the lines of the files in `_TEST_PROMPTS`.
    For every `DecoderPromptType`, they are zipped index-by-index with a
    decoder prompt list of the same length:

    * NONE: every decoder prompt is None
    * EMPTY_STR: every decoder prompt is ""
    * CUSTOM: the encoder prompt list in reverse order

    Returns:

    * Dict mapping each decoder prompt type to the zipped prompts
    '''
    encoder_prompts = [
        line for path in _TEST_PROMPTS for line in _read_prompts(path)
    ]
    num_prompts = len(encoder_prompts)

    # Insertion order (NONE, EMPTY_STR, CUSTOM) carries over to the
    # returned dict.
    decoder_prompts_by_type = {
        DecoderPromptType.NONE: [None] * num_prompts,
        DecoderPromptType.EMPTY_STR: [""] * num_prompts,
        DecoderPromptType.CUSTOM: list(reversed(encoder_prompts)),
    }

    return {
        prompt_type: zip_enc_dec_prompts(encoder_prompts, decoder_prompts)
        for prompt_type, decoder_prompts in decoder_prompts_by_type.items()
    }


268
@pytest.fixture
def example_long_prompts() -> list[str]:
    """Return the lines of every file in `_LONG_PROMPTS`, in order."""
    return [
        line for path in _LONG_PROMPTS for line in _read_prompts(path)
    ]
Woosuk Kwon's avatar
Woosuk Kwon committed
274
275


276
@pytest.fixture(scope="session")
def image_assets() -> ImageTestAssets:
    """Session-scoped fixture exposing the shared `IMAGE_ASSETS` object."""
    return IMAGE_ASSETS


281
@pytest.fixture(scope="session")
def video_assets() -> VideoTestAssets:
    """Session-scoped fixture exposing the shared `VIDEO_ASSETS` object."""
    return VIDEO_ASSETS


286
@pytest.fixture(scope="session")
def audio_assets() -> AudioTestAssets:
    """Session-scoped fixture exposing the shared `AUDIO_ASSETS` object."""
    return AUDIO_ASSETS


291
_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict)
292
_R = TypeVar("_R")
293

Woosuk Kwon's avatar
Woosuk Kwon committed
294
295
296

class HfRunner:

297
    def get_default_device(self):
        """Return "cpu" on a CPU platform, else the platform's device type."""
        # Imported here, as in the original, rather than at module level.
        from vllm.platforms import current_platform

        if current_platform.is_cpu():
            return "cpu"
        return current_platform.device_type
302
303

    def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
        """Move `x` to `device` (default: `self.device`).

        - `None` and bools are returned untouched.
        - Dicts are rebuilt as a plain dict with every value wrapped
          recursively.
        - An object whose `.device.type` already equals the target is
          returned as-is; anything else is moved with `x.to(target)`.
        """
        if x is None or isinstance(x, bool):
            return x

        target = self.device if device is None else device

        if isinstance(x, dict):
            return {
                key: self.wrap_device(value, target)
                for key, value in x.items()
            }

        already_on_target = hasattr(x, "device") and x.device.type == target
        return x if already_on_target else x.to(target)
317

Woosuk Kwon's avatar
Woosuk Kwon committed
318
319
320
    def __init__(
        self,
        model_name: str,
        dtype: str = "auto",
        *,
        model_kwargs: Optional[dict[str, Any]] = None,
        trust_remote_code: bool = True,
        is_sentence_transformer: bool = False,
        is_cross_encoder: bool = False,
        skip_tokenizer_init: bool = False,
        auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
    ) -> None:
        """Load a HuggingFace model plus its tokenizer and processor.

        Args:
            model_name: Model identifier; passed through
                `maybe_model_redirect` before use.
            dtype: Requested dtype, resolved against the model config by
                `_get_and_verify_dtype`.
            model_kwargs: Extra keyword arguments for the model
                constructor. The dict is copied, never modified;
                "torch_dtype" is filled in on the copy when absent.
            trust_remote_code: Forwarded to every `from_pretrained` /
                constructor call.
            is_sentence_transformer: Load the model with
                `sentence_transformers.SentenceTransformer`.
            is_cross_encoder: Load the model with
                `sentence_transformers.CrossEncoder` (only checked when
                `is_sentence_transformer` is false).
            skip_tokenizer_init: Do not load a tokenizer with
                `AutoTokenizer`; use the processor's tokenizer instead.
            auto_cls: Auto-model class used when neither of the two
                sentence-transformers flags is set.
        """
        model_name = maybe_model_redirect(model_name)
        self.model_name = model_name

        self.config = AutoConfig.from_pretrained(
            model_name,
            trust_remote_code=trust_remote_code,
        )
        self.device = self.get_default_device()
        self.dtype = torch_dtype = _get_and_verify_dtype(
            self.model_name,
            self.config,
            dtype=dtype,
            is_pooling_model=is_sentence_transformer or is_cross_encoder,
        )

        # Work on a copy: writing "torch_dtype" into the caller's dict
        # would leak this run's dtype into every later runner that is
        # handed the same (e.g. module-level or parametrized) dict.
        model_kwargs = {} if model_kwargs is None else dict(model_kwargs)
        model_kwargs.setdefault("torch_dtype", torch_dtype)

        if is_sentence_transformer:
            # Lazy init required for AMD CI
            from sentence_transformers import SentenceTransformer

            self.model = SentenceTransformer(
                model_name,
                device=self.device,
                model_kwargs=model_kwargs,
                trust_remote_code=trust_remote_code,
            )
        elif is_cross_encoder:
            # Lazy init required for AMD CI
            from sentence_transformers import CrossEncoder

            self.model = CrossEncoder(
                model_name,
                device=self.device,
                automodel_args=model_kwargs,
                trust_remote_code=trust_remote_code,
            )
        else:
            model = auto_cls.from_pretrained(
                model_name,
                trust_remote_code=trust_remote_code,
                **model_kwargs,
            )

            # in case some unquantized custom models are not in same dtype
            if (getattr(model, "quantization_method", None) is None
                    and any(p.dtype != self.dtype
                            for p in model.parameters())):
                model = model.to(dtype=self.dtype)

            # Only move the model when it is not bitsandbytes-quantized
            # and its parameters sit on fewer than two distinct devices.
            if (getattr(model, "quantization_method", None) != "bitsandbytes"
                    and len({p.device
                             for p in model.parameters()}) < 2):
                model = model.to(device=self.device)

            self.model = model

        if not skip_tokenizer_init:
            self.tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                torch_dtype=torch_dtype,
                trust_remote_code=trust_remote_code,
            )

        # don't put this import at the top level
        # it will call torch.cuda.device_count()
        from transformers import AutoProcessor  # noqa: F401
        self.processor = AutoProcessor.from_pretrained(
            model_name,
            torch_dtype=torch_dtype,
            trust_remote_code=trust_remote_code,
        )
        if skip_tokenizer_init:
            # No standalone tokenizer was loaded above, so fall back to
            # the one attached to the processor.
            self.tokenizer = self.processor.tokenizer
Woosuk Kwon's avatar
Woosuk Kwon committed
405

406
    def get_inputs(
        self,
        prompts: list[str],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
    ) -> list[Union[BatchFeature, BatchEncoding]]:
        """Run the processor on each prompt, one call per prompt.

        Each multimodal list that is given must have one entry per prompt;
        a `None` entry means "no such input for this prompt". Results of
        type `BatchFeature` are cast to `self.dtype`.

        Returns:
            The processor outputs, in prompt order.
        """
        num_prompts = len(prompts)
        if images is not None:
            assert num_prompts == len(images)
        if videos is not None:
            assert num_prompts == len(videos)
        if audios is not None:
            assert num_prompts == len(audios)

        encoded_prompts: list[Union[BatchFeature, BatchEncoding]] = []
        for idx, text in enumerate(prompts):
            call_kwargs: dict[str, Any] = {
                "text": text,
                "return_tensors": "pt",
            }

            image = None if images is None else images[idx]
            if image is not None:
                call_kwargs["images"] = image

            video = None if videos is None else videos[idx]
            if video is not None:
                call_kwargs["videos"] = video

            audio_item = None if audios is None else audios[idx]
            if audio_item is not None:
                # HACK - not all processors take sampling_rate; we should
                # clean this up in the future.
                if len(audio_item) == 2:
                    waveform, sampling_rate = audio_item
                    call_kwargs["audio"] = waveform
                    call_kwargs["sampling_rate"] = sampling_rate
                else:
                    call_kwargs["audio"] = audio_item

            encoded = self.processor(**call_kwargs)
            if isinstance(encoded, BatchFeature):
                encoded = encoded.to(dtype=self.dtype)

            encoded_prompts.append(encoded)

        return encoded_prompts

450
451
452
453
454
455
456
457
458
    def get_prompt_embeddings(self, prompts: list[str]) -> list[torch.Tensor]:
        """Return the model's input embeddings for each prompt.

        Each prompt is processed separately; the leading dimension of the
        embedding is removed with `squeeze(0)`.
        """

        def embed(inputs) -> torch.Tensor:
            token_ids = self.wrap_device(inputs)["input_ids"]
            return self.model.get_input_embeddings()(token_ids).squeeze(0)

        return [embed(inputs) for inputs in self.get_inputs(prompts)]

459
    def classify(self, prompts: list[str]) -> list[list[float]]:
        """Return softmax probabilities for each prompt.

        Each prompt is run through the model on its own. The output
        logits are soft-maxed over the last dimension, and the first
        batch row is converted to a plain Python list.

        NOTE(review): the return annotation assumes logits of shape
        (batch, num_labels) - confirm against the models used here.

        Returns:
            One list of probabilities per prompt, in prompt order.
        """
        all_probs: list[list[float]] = []
        for inputs in self.get_inputs(prompts):
            output = self.model(**self.wrap_device(inputs))
            # These are probabilities, not raw logits: softmax is applied
            # before the values are handed back.
            probs = output.logits.softmax(dim=-1)[0].tolist()
            all_probs.append(probs)

        return all_probs

470
471
    def generate(
        self,
        prompts: list[str],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> list[tuple[list[list[int]], list[str]]]:
        """Generate with the HF model, one `model.generate` call per prompt.

        Extra keyword arguments are forwarded to `self.model.generate`,
        which is always called with `use_cache=True`.

        Returns:
            For each prompt, a tuple of (generated token id lists, decoded
            strings). Decoding skips special tokens and does not clean up
            tokenization spaces.
        """
        model_inputs = self.get_inputs(prompts,
                                       images=images,
                                       videos=videos,
                                       audios=audios)

        results: list[tuple[list[list[int]], list[str]]] = []
        for inputs in model_inputs:
            token_tensor = self.model.generate(
                **self.wrap_device(inputs),
                use_cache=True,
                **kwargs,
            )
            texts = self.processor.batch_decode(
                token_tensor,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )
            results.append((token_tensor.cpu().tolist(), texts))
        return results

    def generate_greedy(
        self,
        prompts: list[str],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> list[tuple[list[int], str]]:
        """Greedy generation (`do_sample=False`) of up to `max_tokens` tokens.

        Returns:
            For each prompt, the first token id list and first decoded
            string produced by `generate`.
        """
        generated = self.generate(prompts,
                                  do_sample=False,
                                  max_new_tokens=max_tokens,
                                  images=images,
                                  videos=videos,
                                  audios=audios,
                                  **kwargs)

        first_sequences: list[tuple[list[int], str]] = []
        for ids_per_seq, text_per_seq in generated:
            first_sequences.append((ids_per_seq[0], text_per_seq[0]))
        return first_sequences
518
519
520

    def generate_beam_search(
        self,
        prompts: list[str],
        beam_width: int,
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
    ) -> list[tuple[list[list[int]], list[str]]]:
        """Beam search with `beam_width` beams, returning every beam.

        Returns:
            For each prompt, a tuple of (token id lists, decoded strings)
            with one entry per returned sequence. Token ids equal to the
            tokenizer's pad token id are removed from the id lists.
        """
        beams = self.generate(prompts,
                              do_sample=False,
                              max_new_tokens=max_tokens,
                              num_beams=beam_width,
                              num_return_sequences=beam_width,
                              images=images,
                              videos=videos,
                              audios=audios)

        unpadded_beams: list[tuple[list[list[int]], list[str]]] = []
        for beam_ids, beam_strs in beams:
            unpadded_ids = [[
                tok for tok in seq if tok != self.tokenizer.pad_token_id
            ] for seq in beam_ids]
            unpadded_beams.append((unpadded_ids, beam_strs))
        return unpadded_beams
Woosuk Kwon's avatar
Woosuk Kwon committed
546

547
548
    def generate_greedy_logprobs(
        self,
        prompts: list[str],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> list[list[torch.Tensor]]:
        """Greedy generation that returns log-probabilities per step.

        Generation runs once per prompt with hidden states enabled; the
        hidden states are converted by `_hidden_states_to_seq_logprobs`.

        Returns:
            For each prompt, that prompt's list of log-probability
            tensors.
        """
        model_inputs = self.get_inputs(prompts,
                                       images=images,
                                       videos=videos,
                                       audios=audios)

        logprobs_per_prompt: list[list[torch.Tensor]] = []
        for inputs in model_inputs:
            generation = self.model.generate(
                **self.wrap_device(inputs),
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
                output_hidden_states=True,
                return_dict_in_generate=True,
                **kwargs,
            )
            logprobs_per_prompt.append(
                self._hidden_states_to_seq_logprobs(
                    generation.hidden_states))
        return logprobs_per_prompt

577
    def _hidden_states_to_seq_logprobs(
        self,
        hidden_states: tuple[tuple[torch.Tensor, ...], ...],
    ) -> list[torch.Tensor]:
        """Project hidden states through the output embeddings.

        For every entry of `hidden_states`, element `[-1][0]` is multiplied
        by the transposed output-embedding weight (after being moved to the
        weight's device and dtype), the bias is added when the embedding
        layer has one, and a float32 log-softmax is taken over the last
        dimension.

        Returns:
            One log-probability tensor per entry of `hidden_states`.
        """
        lm_head = self.model.get_output_embeddings()

        per_step_logprobs: list[torch.Tensor] = []
        for step_states in hidden_states:
            final_states = step_states[-1][0]
            head_weight = lm_head.weight
            logits = torch.matmul(
                final_states.to(
                    device=head_weight.device,
                    dtype=head_weight.dtype,
                ),
                head_weight.t(),
            )

            head_bias = getattr(lm_head, "bias", None)
            if head_bias is not None:
                logits += head_bias.unsqueeze(0)

            per_step_logprobs.append(
                F.log_softmax(logits, dim=-1, dtype=torch.float32))

        return per_step_logprobs

    def _hidden_states_to_logprobs(
        self,
        hidden_states: tuple[tuple[torch.Tensor, ...], ...],
        num_logprobs: int,
    ) -> tuple[list[dict[int, float]], int]:
        """Reduce hidden states to top-`num_logprobs` dicts per step.

        Returns:
            A tuple of (list with one `{token_id: logprob}` dict per
            entry of `hidden_states`, `len(hidden_states)`).
        """
        seq_logprobs = self._hidden_states_to_seq_logprobs(hidden_states)
        output_len = len(hidden_states)

        top_logprobs_per_step: list[dict[int, float]] = []
        for step, step_logprobs in enumerate(seq_logprobs):
            # drop prompt logprobs: only the last row of the first step
            # is kept
            if step == 0:
                step_logprobs = step_logprobs[-1, :].reshape(1, -1)
            top = step_logprobs.topk(num_logprobs)

            top_logprobs_per_step.append({
                token_id.item(): logprob.item()
                for token_id, logprob in zip(top.indices[0], top.values[0])
            })

        return top_logprobs_per_step, output_len

627
628
    def generate_greedy_logprobs_limit(
        self,
        prompts: list[str],
        max_tokens: int,
        num_logprobs: int,
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
        videos: Optional[PromptVideoInput] = None,
        **kwargs: Any,
    ) -> list[TokensTextLogprobs]:
        """Greedy generation returning tokens, text and top logprobs.

        Returns:
            For each prompt, a tuple of (output token ids, decoded output
            text, list of top-`num_logprobs` dicts per generated step).
        """
        model_inputs = self.get_inputs(prompts,
                                       images=images,
                                       videos=videos,
                                       audios=audios)

        token_ids_per_prompt: list[list[int]] = []
        text_per_prompt: list[str] = []
        logprobs_per_prompt: list[list[dict[int, float]]] = []

        for inputs in model_inputs:
            generation = self.model.generate(
                **self.wrap_device(inputs),
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
                output_hidden_states=True,
                return_dict_in_generate=True,
                **kwargs,
            )

            # The length returned by the helper is not used here; the
            # number of logprob entries determines the output length.
            step_logprobs, _ = self._hidden_states_to_logprobs(
                generation.hidden_states, num_logprobs)
            logprobs_per_prompt.append(step_logprobs)

            num_generated = len(step_logprobs)
            generated_ids = generation.sequences[0][-num_generated:]
            token_ids_per_prompt.append(generated_ids.tolist())
            text_per_prompt.append(self.tokenizer.decode(generated_ids))

        return list(
            zip(token_ids_per_prompt, text_per_prompt, logprobs_per_prompt))

    def generate_encoder_decoder_greedy_logprobs_limit(
        self,
        encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
        max_tokens: int,
        num_logprobs: int,
        images: Optional[PromptImageInput] = None,
        **kwargs: Any,
    ) -> list[TokensTextLogprobs]:
        '''
        Greedy logprobs generation for encoder/decoder prompts.

        The encoder prompt (plus the matching image, if any) goes through
        the processor; a decoder prompt that is not None is tokenized and
        passed as `decoder_input_ids`.

        Returns:
            For each prompt, a tuple of (output token ids, decoded output
            text, list of top-`num_logprobs` dicts per decoder step).
        '''
        token_ids_per_prompt: list[list[int]] = []
        text_per_prompt: list[str] = []
        logprobs_per_prompt: list[list[dict[int, float]]] = []

        prompt_pairs = to_enc_dec_tuple_list(encoder_decoder_prompts)
        for idx, (encoder_text, decoder_text) in enumerate(prompt_pairs):
            call_kwargs: dict[str, Any] = {
                "text": encoder_text,
                "return_tensors": "pt",
            }
            if images is not None and images[idx] is not None:
                call_kwargs["images"] = images[idx]

            encoder_inputs = self.wrap_device(self.processor(**call_kwargs))

            decoder_input_ids = None
            if decoder_text is not None:
                decoder_tokens = self.tokenizer(decoder_text,
                                                return_tensors="pt")
                decoder_input_ids = self.wrap_device(
                    decoder_tokens.input_ids)

            generation = self.model.generate(
                decoder_input_ids=decoder_input_ids,
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
                output_hidden_states=True,
                return_dict_in_generate=True,
                **encoder_inputs,
                **kwargs,
            )

            step_logprobs, num_generated = self._hidden_states_to_logprobs(
                generation.decoder_hidden_states, num_logprobs)
            logprobs_per_prompt.append(step_logprobs)

            generated_ids = generation.sequences[0][-num_generated:]
            token_ids_per_prompt.append(generated_ids.tolist())
            text_per_prompt.append(self.tokenizer.decode(generated_ids))

        return list(
            zip(token_ids_per_prompt, text_per_prompt, logprobs_per_prompt))

736
737
738
    def encode(self, prompts: list[str], *args,
               **kwargs) -> list[list[torch.Tensor]]:
        """Forward `prompts` and all extra arguments to
        `self.model.encode` and return its result."""
        return self.model.encode(prompts, *args, **kwargs)
739

740
741
742
743
744
745
    def predict(self, prompts: list[list[str]], *args,
                **kwargs) -> torch.Tensor:
        """Forward `prompts` and all extra arguments to
        `self.model.predict`, always passing `convert_to_tensor=True`."""
        return self.model.predict(prompts,
                                  *args,
                                  convert_to_tensor=True,
                                  **kwargs)
746

747
748
749
750
    def __enter__(self):
        """Enter the `with` block; the runner itself is the context value."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Delete `self.model`, then call `cleanup_dist_env_and_memory()`.

        Returns None, so an exception raised inside the `with` block is
        not suppressed.
        """
        del self.model
        cleanup_dist_env_and_memory()
753

Woosuk Kwon's avatar
Woosuk Kwon committed
754

Cyrus Leung's avatar
Cyrus Leung committed
755
@pytest.fixture(scope="session")
def hf_runner():
    """Session-scoped fixture returning the `HfRunner` class itself.

    Tests get the class, not an instance, and construct runners
    themselves.
    """
    return HfRunner


class VllmRunner:
761
762
    """
    The default values of some arguments have been modified from
    {class}`~vllm.LLM` as follows:

    - `trust_remote_code`: Set to `True` instead of `False` for convenience.
    - `seed`: Set to `0` instead of `None` for test reproducibility.
    - `max_model_len`: Set to `1024` instead of `None` to reduce memory usage.
    - `block_size`: To reduce memory usage, set default to `64` if on XPU
      devices, otherwise default to `16`.
    - `enable_chunked_prefill`: Set to `False` instead of `None` for
      test reproducibility.
    - `enforce_eager`: Set to `False` to test CUDA graph.
    """
Woosuk Kwon's avatar
Woosuk Kwon committed
774
775
776
777

    def __init__(
        self,
        model_name: str,
        task: TaskOption = "auto",
        tokenizer_name: Optional[str] = None,
        tokenizer_mode: str = "auto",
        trust_remote_code: bool = True,
        seed: Optional[int] = 0,
        max_model_len: int = 1024,
        dtype: str = "auto",
        disable_log_stats: bool = True,
        tensor_parallel_size: int = 1,
        block_size: int = 16 if not torch.xpu.is_available() else 64,
        enable_chunked_prefill: Optional[bool] = False,
        swap_space: int = 4,
        enforce_eager: Optional[bool] = False,
        **kwargs,
    ) -> None:
        """Construct the wrapped `LLM` and store it on `self.llm`.

        Every named parameter is passed to `LLM` under the same keyword,
        except `model_name` and `tokenizer_name`, which are passed as
        `model` and `tokenizer`. Any extra `kwargs` are forwarded as-is.
        """
        engine_args: dict[str, Any] = dict(
            model=model_name,
            task=task,
            tokenizer=tokenizer_name,
            tokenizer_mode=tokenizer_mode,
            trust_remote_code=trust_remote_code,
            dtype=dtype,
            seed=seed,
            swap_space=swap_space,
            enforce_eager=enforce_eager,
            disable_log_stats=disable_log_stats,
            tensor_parallel_size=tensor_parallel_size,
            max_model_len=max_model_len,
            block_size=block_size,
            enable_chunked_prefill=enable_chunked_prefill,
        )
        self.llm = LLM(**engine_args, **kwargs)

811
    def get_inputs(
        self,
        prompts: Union[list[str], list[torch.Tensor], list[list[int]]],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
    ) -> list[TextPrompt]:
        """Convert raw prompts plus optional multimodal data to prompt dicts.

        Args:
            prompts: One entry per request. A `str` is stored under
                `"prompt"`, a `list` (token ids) under `"prompt_token_ids"`,
                and anything else under `"prompt_embeds"`.
            images: Optional per-prompt image inputs; a `None` entry means
                no image for that prompt.
            videos: Optional per-prompt video inputs, same convention.
            audios: Optional per-prompt audio inputs, same convention.

        Returns:
            One `TextPrompt` per prompt. `"multi_modal_data"` is `None` when
            the prompt has no multimodal items.

        Raises:
            ValueError: If any non-None multimodal argument differs in
                length from `prompts`.
        """
        modalities = {"image": images, "video": videos, "audio": audios}

        if any(items is not None and len(items) != len(prompts)
               for items in modalities.values()):
            raise ValueError(
                "All non-None multimodal inputs must have the same length as "
                "prompts")

        inputs = []
        for i, prompt in enumerate(prompts):
            # Keep only the modalities actually supplied for this prompt.
            multi_modal_data = {
                key: items[i]
                for key, items in modalities.items()
                if items is not None and items[i] is not None
            }

            text_prompt_kwargs: dict[str, Any] = {
                "multi_modal_data": multi_modal_data or None
            }
            if isinstance(prompt, str):
                text_prompt_kwargs["prompt"] = prompt
            elif isinstance(prompt, list):
                # A list-typed prompt is treated as pre-tokenized ids.
                text_prompt_kwargs["prompt_token_ids"] = prompt
            else:
                text_prompt_kwargs["prompt_embeds"] = prompt

            inputs.append(TextPrompt(**text_prompt_kwargs))

        return inputs

    def generate(
        self,
        prompts: Union[list[str], list[torch.Tensor]],
        sampling_params: SamplingParams,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> list[tuple[list[list[int]], list[str]]]:
        """Run `self.llm.generate` and return prompt-prefixed samples.

        For each request the result holds two parallel lists, one entry per
        sample: the prompt token ids followed by the generated ids, and the
        prompt text (empty string if absent) followed by the generated text.
        Extra `kwargs` are forwarded to `self.llm.generate`.
        """
        llm_inputs = self.get_inputs(prompts,
                                     images=images,
                                     videos=videos,
                                     audios=audios)
        req_outputs = self.llm.generate(llm_inputs,
                                        sampling_params=sampling_params,
                                        **kwargs)

        results: list[tuple[list[list[int]], list[str]]] = []
        for request in req_outputs:
            prefix_text = request.prompt
            prefix_ids = request.prompt_token_ids
            sample_ids: list[list[int]] = []
            sample_texts: list[str] = []
            for completion in request.outputs:
                sample_ids.append(prefix_ids + list(completion.token_ids))
                sample_texts.append((prefix_text or "") + completion.text)
            results.append((sample_ids, sample_texts))
        return results

881
    @staticmethod
    def _final_steps_generate_w_logprobs(
        req_outputs: list[RequestOutput],
    ) -> list[TokensTextLogprobsPromptLogprobs]:
        """Flatten request outputs to (ids, text, logprobs, prompt_logprobs).

        Exactly one tuple is produced per request, built from the LAST
        sample in `req_output.outputs`; earlier samples are not reported.
        Each request must contain at least one sample.
        """
        collected: list[TokensTextLogprobsPromptLogprobs] = []
        for request in req_outputs:
            assert len(request.outputs) > 0
            final_sample = request.outputs[-1]
            collected.append((
                list(final_sample.token_ids),
                final_sample.text,
                final_sample.logprobs,
                request.prompt_logprobs,
            ))
        return collected

896
897
    def generate_w_logprobs(
        self,
        prompts: list[str],
        sampling_params: SamplingParams,
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
        videos: Optional[PromptVideoInput] = None,
        **kwargs: Any,
    ) -> Union[list[TokensTextLogprobs],
               list[TokensTextLogprobsPromptLogprobs]]:
        """Generate with `sampling_params` and return per-request logprobs.

        Each result is (token ids, text, logprobs). The prompt logprobs are
        appended as a fourth element only when
        `sampling_params.prompt_logprobs` is not None.
        """
        llm_inputs = self.get_inputs(prompts,
                                     images=images,
                                     videos=videos,
                                     audios=audios)
        req_outputs = self.llm.generate(llm_inputs,
                                        sampling_params=sampling_params,
                                        **kwargs)
        full_records = self._final_steps_generate_w_logprobs(req_outputs)

        if sampling_params.prompt_logprobs is not None:
            return full_records
        # Prompt logprobs were not requested: strip the trailing element.
        return [record[0:-1] for record in full_records]
921
922
923

    def generate_encoder_decoder_w_logprobs(
        self,
        encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
        sampling_params: SamplingParams,
    ) -> Union[list[TokensTextLogprobs],
               list[TokensTextLogprobsPromptLogprobs]]:
        """Logprobs generation for vLLM encoder/decoder models.

        `sampling_params.logprobs` must be set. The prompt logprobs are
        included as a trailing element only when
        `sampling_params.prompt_logprobs` is not None.
        """
        assert sampling_params.logprobs is not None

        req_outputs = self.llm.generate(encoder_decoder_prompts,
                                        sampling_params=sampling_params)
        full_records = self._final_steps_generate_w_logprobs(req_outputs)

        if sampling_params.prompt_logprobs is not None:
            return full_records
        # Prompt logprobs were not requested: strip the trailing element.
        return [record[0:-1] for record in full_records]
941

Woosuk Kwon's avatar
Woosuk Kwon committed
942
943
    def generate_greedy(
        self,
        prompts: Union[list[str], list[torch.Tensor]],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> list[tuple[list[int], str]]:
        """Greedy-decode (temperature 0) up to `max_tokens` per prompt.

        Returns, per prompt, the first sample's (token ids, text) as produced
        by `self.generate`.
        """
        params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
        per_request = self.generate(prompts,
                                    params,
                                    images=images,
                                    videos=videos,
                                    audios=audios,
                                    **kwargs)

        first_samples = []
        for sample_ids, sample_texts in per_request:
            first_samples.append((sample_ids[0], sample_texts[0]))
        return first_samples
960

961
962
    def generate_greedy_logprobs(
        self,
        prompts: list[str],
        max_tokens: int,
        num_logprobs: int,
        num_prompt_logprobs: Optional[int] = None,
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
        videos: Optional[PromptVideoInput] = None,
        stop_token_ids: Optional[list[int]] = None,
        stop: Optional[list[str]] = None,
        **kwargs: Any,
    ) -> Union[list[TokensTextLogprobs],
               list[TokensTextLogprobsPromptLogprobs]]:
        """Greedy-decode (temperature 0) while collecting logprobs.

        Builds the `SamplingParams` from the arguments and delegates to
        `generate_w_logprobs`; extra `kwargs` are forwarded there.
        """
        params = SamplingParams(
            temperature=0.0,
            max_tokens=max_tokens,
            logprobs=num_logprobs,
            prompt_logprobs=num_prompt_logprobs,
            stop_token_ids=stop_token_ids,
            stop=stop,
        )
        results = self.generate_w_logprobs(prompts,
                                           params,
                                           images=images,
                                           audios=audios,
                                           videos=videos,
                                           **kwargs)
        return results
989

990
991
    def generate_encoder_decoder_greedy_logprobs(
        self,
        encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
        max_tokens: int,
        num_logprobs: int,
        num_prompt_logprobs: Optional[int] = None,
        skip_special_tokens: bool = True,
    ) -> Union[list[TokensTextLogprobs],
               list[TokensTextLogprobsPromptLogprobs]]:
        """Greedy logprobs generation for vLLM encoder/decoder models.

        Args:
            encoder_decoder_prompts: Explicit encoder/decoder prompts.
            max_tokens: Passed to `SamplingParams(max_tokens=...)`.
            num_logprobs: Passed to `SamplingParams(logprobs=...)`.
            num_prompt_logprobs: Passed to
                `SamplingParams(prompt_logprobs=...)`.
            skip_special_tokens: Passed to
                `SamplingParams(skip_special_tokens=...)`.

        Returns:
            The result of `generate_encoder_decoder_w_logprobs` for these
            prompts with temperature-0 sampling parameters.
        """
        greedy_logprobs_params = SamplingParams(
            temperature=0.0,
            max_tokens=max_tokens,
            logprobs=num_logprobs,
            prompt_logprobs=num_prompt_logprobs,
            skip_special_tokens=skip_special_tokens,
        )

        return self.generate_encoder_decoder_w_logprobs(
            encoder_decoder_prompts, greedy_logprobs_params)

1013
    def generate_beam_search(
        self,
        prompts: list[str],
        beam_width: int,
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
    ) -> list[tuple[list[list[int]], list[str]]]:
        """Run `self.llm.beam_search` with the given width and token budget.

        For each prompt, returns the `tokens` and the `text` of every
        sequence in that prompt's beam-search output, as two parallel lists.
        """
        llm_inputs = self.get_inputs(prompts,
                                     images=images,
                                     videos=videos,
                                     audios=audios)
        search_params = BeamSearchParams(beam_width=beam_width,
                                         max_tokens=max_tokens)
        beam_results = self.llm.beam_search(llm_inputs, search_params)

        return [(
            [seq.tokens for seq in result.sequences],
            [seq.text for seq in result.sequences],
        ) for result in beam_results]

1037
    def classify(self, prompts: list[str]) -> list[list[float]]:
        """Return `outputs.probs` of each result of `self.llm.classify`."""
        probabilities = []
        for request in self.llm.classify(prompts):
            probabilities.append(request.outputs.probs)
        return probabilities

1041
1042
1043
1044
1045
1046
1047
    def embed(self,
              prompts: list[str],
              images: Optional[PromptImageInput] = None,
              videos: Optional[PromptVideoInput] = None,
              audios: Optional[PromptAudioInput] = None,
              *args,
              **kwargs) -> list[list[float]]:
        """Return `outputs.embedding` of each result of `self.llm.embed`.

        Prompts and multimodal data are first combined by `get_inputs`;
        extra positional and keyword arguments go to `self.llm.embed`.
        """
        llm_inputs = self.get_inputs(prompts,
                                     images=images,
                                     videos=videos,
                                     audios=audios)

        embeddings = []
        for request in self.llm.embed(llm_inputs, *args, **kwargs):
            embeddings.append(request.outputs.embedding)
        return embeddings
1055

1056
    def encode(self, prompts: list[str]) -> list[list[float]]:
        """Return `outputs.data` of each result of `self.llm.encode`."""
        pooled = []
        for request in self.llm.encode(prompts):
            pooled.append(request.outputs.data)
        return pooled

1060
1061
    def score(
        self,
        text_1: Union[str, list[str]],
        text_2: Union[str, list[str]],
        *args,
        **kwargs,
    ) -> list[float]:
        """Return `outputs.score` of each result of `self.llm.score`.

        `text_1`, `text_2` and all extra arguments are forwarded unchanged.
        """
        scores = []
        for request in self.llm.score(text_1, text_2, *args, **kwargs):
            scores.append(request.outputs.score)
        return scores
1069

1070
    def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
        """Hand `func` to the engine's model executor and return its result.

        Delegates to `self.llm.llm_engine.model_executor.apply_model`.
        """
        return self.llm.llm_engine.model_executor.apply_model(func)

1074
1075
1076
1077
    def __enter__(self):
        """Enter the `with` block; the runner itself is the context value."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Leave the `with` block: drop the engine, then run global cleanup.

        Nothing is returned, so an exception raised inside the block is not
        suppressed.
        """
        # Remove our reference first, then call the shared cleanup helper.
        del self.llm
        cleanup_dist_env_and_memory()
1080

Woosuk Kwon's avatar
Woosuk Kwon committed
1081

1082
@pytest.fixture(scope="session")
def vllm_runner():
    """Session-scoped fixture that hands out the `VllmRunner` class itself.

    The class, not an instance, is returned.
    """
    runner_cls = VllmRunner
    return runner_cls
1085
1086


1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
@pytest.fixture()
def temporary_enable_log_propagate():
    """Turn on propagation for the "vllm" logger while a test runs.

    The logger's previous `propagate` value is saved and restored on
    teardown rather than being forced to `False`, so a configuration that
    already had propagation enabled is left untouched.
    """
    import logging
    logger = logging.getLogger("vllm")
    previous_propagate = logger.propagate
    logger.propagate = True
    try:
        yield
    finally:
        # Restore even if the generator is closed abnormally.
        logger.propagate = previous_propagate


@pytest.fixture()
def caplog_vllm(temporary_enable_log_propagate, caplog):
    """Yield pytest's `caplog` while the "vllm" logger propagates records.

    Propagation is switched on by the `temporary_enable_log_propagate`
    fixture this one depends on.
    """
    # To capture vllm log, we should enable propagate=True temporarily
    # because caplog depends on logs propagated to the root logger.
    yield caplog
1101
1102
1103
1104
1105
1106
1107


@pytest.fixture(scope="session")
def num_gpus_available():
    """Return the device count reported by the current vLLM platform.

    Per the original author's note, querying it this way avoids
    initializing the CUDA context in the current process.
    """
    # Imported lazily, inside the fixture, rather than at module import.
    from vllm.platforms import current_platform

    device_total = current_platform.device_count()
    return device_total
1110
1111


1112
# temp_dir = tempfile.gettempdir()
zhuwenwen's avatar
zhuwenwen committed
1113
1114
1115
1116
# Local directories, under `models_path_prefix`, used by the `dummy_*_path`
# fixtures in this file.
_dummy_opt_path = os.path.join(models_path_prefix, "dummy_opt")
_dummy_llava_path = os.path.join(models_path_prefix, "dummy_llava")
_dummy_gemma2_embedding_path = os.path.join(models_path_prefix, "dummy_gemma2_embedding")

1117
1118
1119
1120


@pytest.fixture
def dummy_opt_path():
    """Return the local dummy OPT model directory, creating it if missing.

    When the directory does not exist, a snapshot of `facebook/opt-125m` is
    downloaded into it (files matching the listed patterns are skipped) and
    the `architectures` entry of its `config.json` is rewritten to
    `MyOPTForCausalLM`. An existing directory is returned untouched.
    """
    if os.path.exists(_dummy_opt_path):
        return _dummy_opt_path

    config_file = os.path.join(_dummy_opt_path, "config.json")
    skipped_patterns = [
        "*.bin", "*.bin.index.json", "*.pt", "*.h5", "*.msgpack"
    ]
    snapshot_download(repo_id="facebook/opt-125m",
                      local_dir=_dummy_opt_path,
                      ignore_patterns=skipped_patterns)
    assert os.path.exists(config_file)

    with open(config_file) as src:
        model_config = json.load(src)
    model_config["architectures"] = ["MyOPTForCausalLM"]
    with open(config_file, "w") as dst:
        json.dump(model_config, dst)
    return _dummy_opt_path

1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154

# Pytest hook that post-processes the report produced for each test phase.
@pytest.hookimpl(tryfirst=True, hookwrapper=True)
def pytest_runtest_makereport(item, call):
    """Attach a browser screenshot to the report of a failed test call.

    Only acts when the "call" phase failed and the test used a `browser`
    fixture; the screenshot is added to `report.extra` if that attribute
    exists.
    """
    # Let the other hook implementations build the report first.
    outcome = yield
    report = outcome.get_result()

    # Nothing to do unless the test body itself failed.
    if not (report.when == "call" and report.failed):
        return
    # A screenshot needs a browser instance among the test's fixtures.
    if not (hasattr(item, "funcargs") and "browser" in item.funcargs):
        return

    browser = item.funcargs["browser"]
    screenshot_path = "screenshot.png"  # where the screenshot is written
    browser.save_screenshot(screenshot_path)

    # Add the screenshot only when the report exposes an `extra` list.
    if hasattr(report, "extra"):
        report.extra.append(pytest_html.extras.image(screenshot_path))
zhuwenwen's avatar
zhuwenwen committed
1155
1156


1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
@pytest.fixture
def dummy_llava_path():
    """Return the local dummy LLaVA model directory, creating it if missing.

    When the directory does not exist, a snapshot of
    `llava-hf/llava-1.5-7b-hf` is downloaded into it (files matching the
    listed patterns are skipped) and the `architectures` entry of its
    `config.json` is rewritten to `MyLlava`. An existing directory is
    returned untouched.
    """
    if os.path.exists(_dummy_llava_path):
        return _dummy_llava_path

    config_file = os.path.join(_dummy_llava_path, "config.json")
    skipped_patterns = [
        "*.bin", "*.bin.index.json", "*.pt", "*.h5", "*.msgpack"
    ]
    snapshot_download(repo_id="llava-hf/llava-1.5-7b-hf",
                      local_dir=_dummy_llava_path,
                      ignore_patterns=skipped_patterns)
    assert os.path.exists(config_file)

    with open(config_file) as src:
        model_config = json.load(src)
    model_config["architectures"] = ["MyLlava"]
    with open(config_file, "w") as dst:
        json.dump(model_config, dst)
    return _dummy_llava_path
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186


@pytest.fixture
def dummy_gemma2_embedding_path():
    """Return the local dummy Gemma2 embedding model directory.

    When the directory does not exist, a snapshot of
    `BAAI/bge-multilingual-gemma2` is downloaded into it (files matching
    the listed patterns are skipped) and the `architectures` entry of its
    `config.json` is rewritten to `MyGemma2Embedding`. An existing
    directory is returned untouched.
    """
    if os.path.exists(_dummy_gemma2_embedding_path):
        return _dummy_gemma2_embedding_path

    config_file = os.path.join(_dummy_gemma2_embedding_path, "config.json")
    skipped_patterns = [
        "*.bin", "*.bin.index.json", "*.pt", "*.h5", "*.msgpack"
    ]
    snapshot_download(repo_id="BAAI/bge-multilingual-gemma2",
                      local_dir=_dummy_gemma2_embedding_path,
                      ignore_patterns=skipped_patterns)
    assert os.path.exists(config_file)

    with open(config_file) as src:
        model_config = json.load(src)
    model_config["architectures"] = ["MyGemma2Embedding"]
    with open(config_file, "w") as dst:
        json.dump(model_config, dst)
    return _dummy_gemma2_embedding_path
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211


# Add the `--optional` flag to allow running tests
# that are marked with @pytest.mark.optional
def pytest_addoption(parser):
    """Register the boolean `--optional` command-line flag (off by default)."""
    option_settings = {
        "action": "store_true",
        "default": False,
        "help": "run optional test",
    }
    parser.addoption("--optional", **option_settings)


def pytest_collection_modifyitems(config, items):
    """Skip tests carrying the `optional` keyword unless `--optional` is set."""
    run_optional = config.getoption("--optional")
    if not run_optional:
        # Flag absent from the CLI: mark every optional test as skipped.
        skip_optional = pytest.mark.skip(
            reason="need --optional option to run")
        for test_item in items:
            if "optional" in test_item.keywords:
                test_item.add_marker(skip_optional)
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223


@pytest.fixture(scope="session")
def cli_config_file():
    """Return the path of `config/test_config.yaml` under the test directory."""
    config_dir = os.path.join(_TEST_DIR, "config")
    return os.path.join(config_dir, "test_config.yaml")


@pytest.fixture(scope="session")
def cli_config_file_with_model():
    """Return the path of `config/test_config_with_model.yaml` under the
    test directory."""
    config_dir = os.path.join(_TEST_DIR, "config")
    return os.path.join(config_dir, "test_config_with_model.yaml")