conftest.py 40 KB
Newer Older
1
2
# SPDX-License-Identifier: Apache-2.0

3
import json
4
import os
5
import tempfile
6
from collections import UserList
7
from enum import Enum
zhuwenwen's avatar
zhuwenwen committed
8

9
from typing import Any, Callable, Optional, TypedDict, TypeVar, Union
10
11
import pytest
import pytest_html
Woosuk Kwon's avatar
Woosuk Kwon committed
12

13
import numpy as np
Woosuk Kwon's avatar
Woosuk Kwon committed
14
15
import pytest
import torch
16
import torch.nn as nn
17
import torch.nn.functional as F
18
from huggingface_hub import snapshot_download
19
from PIL import Image
20
21
from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
                          BatchEncoding, BatchFeature)
22
from transformers.models.auto.auto_factory import _BaseAutoModelClass
Woosuk Kwon's avatar
Woosuk Kwon committed
23

24
25
from tests.models.utils import (TokensTextLogprobs,
                                TokensTextLogprobsPromptLogprobs)
Woosuk Kwon's avatar
Woosuk Kwon committed
26
from vllm import LLM, SamplingParams
27
from vllm.assets.audio import AudioAsset
28
from vllm.assets.image import ImageAsset
29
from vllm.assets.video import VideoAsset
30
from vllm.config import TaskOption, _get_and_verify_dtype
31
from vllm.connections import global_http_connection
32
from vllm.distributed import (cleanup_dist_env_and_memory,
33
34
                              init_distributed_environment,
                              initialize_model_parallel)
35
from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
36
                         to_enc_dec_tuple_list, zip_enc_dec_prompts)
37
from vllm.logger import init_logger
38
from vllm.outputs import RequestOutput
39
from vllm.sampling_params import BeamSearchParams
zhuwenwen's avatar
zhuwenwen committed
40

41
from vllm.utils import cuda_device_count_stateless
zhuwenwen's avatar
zhuwenwen committed
42
from .utils import models_path_prefix
43
44


45
# Module-level logger, created through vLLM's logging helper.
logger = init_logger(__name__)
Woosuk Kwon's avatar
Woosuk Kwon committed
46

47
48
49
_TEST_DIR = os.path.dirname(__file__)
_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
50
_SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")
51

Cyrus Leung's avatar
Cyrus Leung committed
52
# Type of a single multimodal item (image, audio clip, video, ...).
_M = TypeVar("_M")

# Either a flat list of items or a list of item lists.
# NOTE(review): presumably one entry per prompt (the HF runner asserts
# len(prompts) == len(images)); confirm how the nested form is consumed.
_PromptMultiModalInput = Union[list[_M], list[list[_M]]]

PromptImageInput = _PromptMultiModalInput[Image.Image]
# NOTE(review): the int is presumably the sampling rate — it is forwarded as
# "sampling_rate" by HfRunner.get_inputs; confirm.
PromptAudioInput = _PromptMultiModalInput[tuple[np.ndarray, int]]
PromptVideoInput = _PromptMultiModalInput[np.ndarray]
59

60

61
def _read_prompts(filename: str) -> list[str]:
    """Return the lines of ``filename``, one prompt per line.

    Each line keeps its trailing newline, exactly as ``readlines`` yields it.
    The file is decoded as UTF-8 explicitly so the result does not depend on
    the locale encoding of the machine running the tests.
    """
    with open(filename, encoding="utf-8") as f:
        return f.readlines()
Woosuk Kwon's avatar
Woosuk Kwon committed
65
66


67
68
69
class _ImageAssetPrompts(TypedDict):
    """Prompt text for each test image, keyed by the image asset's name."""
    stop_sign: str
    cherry_blossom: str
70
71


72
73
class _ImageAssetsBase(UserList[ImageAsset]):
    """``UserList`` parameterised with :class:`ImageAsset`; adds no behaviour."""
    pass
74

75
76

class _ImageAssets(_ImageAssetsBase):
    """List-like collection of the image assets used by the tests."""

    def __init__(self) -> None:
        asset_names = ("stop_sign", "cherry_blossom")
        super().__init__([ImageAsset(name) for name in asset_names])

    def prompts(self, prompts: _ImageAssetPrompts) -> list[str]:
        """
        Turn a name-to-prompt mapping into a per-image prompt list.

        The prompts come back in the same order in which the assets are
        produced when iterating over this object.
        """
        ordered_keys = ("stop_sign", "cherry_blossom")
        return [prompts[key] for key in ordered_keys]
92
93


94
95
96
97
class _VideoAssetPrompts(TypedDict):
    """Prompt text for each test video, keyed by asset name."""
    sample_demo_1: str


98
99
class _VideoAssetsBase(UserList[VideoAsset]):
    """``UserList`` parameterised with :class:`VideoAsset`; adds no behaviour."""
    pass
100
101
102
103
104
105
106
107
108


class _VideoAssets(_VideoAssetsBase):
    """List-like collection of the video assets used by the tests."""

    def __init__(self) -> None:
        demo_clip = VideoAsset("sample_demo_1.mp4")
        super().__init__([demo_clip])

    def prompts(self, prompts: _VideoAssetPrompts) -> list[str]:
        """Return the prompt for each video, in asset order."""
        demo_prompt = prompts["sample_demo_1"]
        return [demo_prompt]


113
114
115
116
117
118
119
120
121
122
123
124
125
class _AudioAssetsBase(UserList[AudioAsset]):
    """``UserList`` parameterised with :class:`AudioAsset`; adds no behaviour."""
    pass


class _AudioAssets(_AudioAssetsBase):
    """List-like collection of the audio assets used by the tests."""

    def __init__(self) -> None:
        asset_names = ("mary_had_lamb", "winning_call")
        super().__init__([AudioAsset(name) for name in asset_names])


126
127
IMAGE_ASSETS = _ImageAssets()
"""Singleton instance of :class:`_ImageAssets`."""
128
129
VIDEO_ASSETS = _VideoAssets()
"""Singleton instance of :class:`_VideoAssets`."""
130
131
AUDIO_ASSETS = _AudioAssets()
"""Singleton instance of :class:`_AudioAssets`."""
132
133


134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
@pytest.fixture(scope="function", autouse=True)
def cleanup_VLLM_USE_V1(monkeypatch):
    """
    Keep changes to "VLLM_USE_V1" from leaking between tests.

    The V1 oracle sets "VLLM_USE_V1" during loading, so each test
    invocation may change the env variable.

    Once "VLLM_USE_V1" has been touched through monkeypatch, any change
    vLLM makes to it during the test is cleaned up afterwards.

    This fixture is used by every test.
    """
    if "VLLM_USE_V1" in os.environ:
        # Already set from outside: nothing to register.
        return

    # Set-then-delete leaves the variable unset for the test, but makes
    # monkeypatch clean up VLLM_USE_V1 on exit if vLLM modifies the value
    # of envs.VLLM_USE_V1 in the meantime.
    monkeypatch.setenv("VLLM_USE_V1", "")
    monkeypatch.delenv("VLLM_USE_V1")


Joe Runde's avatar
Joe Runde committed
154
@pytest.fixture(params=[True, False])
def run_with_both_engines(request, monkeypatch):
    """Run the requesting test twice: once with V1 enabled, once without.

    Tests carrying the ``skip_v1`` marker are skipped for the V1 pass and
    therefore only run without V1.
    """
    v1_enabled = request.param
    skip_marker = request.node.get_closest_marker("skip_v1")

    if v1_enabled and skip_marker:
        pytest.skip("Skipping test on vllm V1")

    monkeypatch.setenv('VLLM_USE_V1', '1' if v1_enabled else '0')

    yield
Joe Runde's avatar
Joe Runde committed
169
170


171
172
173
174
175
176
177
@pytest.fixture(autouse=True)
def init_test_http_connection():
    """Turn off client reuse on the global HTTP connection for every test."""
    # pytest_asyncio may use a different event loop per test
    # so we need to make sure the async client is created anew
    global_http_connection.reuse_client = False


178
179
180
181
182
183
184
185
186
187
188
189
@pytest.fixture
def dist_init():
    """Set up a single-process distributed environment around a test.

    Initialises the distributed environment with ``world_size=1`` and
    ``rank=0`` on the "nccl" backend, using a temporary file as the
    ``file://`` init method, then initialises model parallelism with sizes
    ``(1, 1)``. After the test, ``cleanup_dist_env_and_memory`` is called.
    """
    # mkstemp() returns an *open* OS-level descriptor together with the path.
    # Only the path is needed here, so close the descriptor immediately
    # instead of leaking one per test that uses this fixture.
    fd, temp_file = tempfile.mkstemp()
    os.close(fd)

    init_distributed_environment(
        world_size=1,
        rank=0,
        distributed_init_method=f"file://{temp_file}",
        local_rank=0,
        backend="nccl",
    )
    initialize_model_parallel(1, 1)
    yield
    cleanup_dist_env_and_memory()
191
192


193
@pytest.fixture()
def should_do_global_cleanup_after_test(request) -> bool:
    """Tell ``cleanup_fixture`` whether to run the global cleanup.

    Subdirectories may override this fixture to skip global cleanup, which
    can give a ~10x speedup for non-GPU unit tests because they then do not
    need to initialize torch. Returns ``False`` when the test carries the
    ``skip_global_cleanup`` marker.
    """
    opt_out_marker = request.node.get_closest_marker("skip_global_cleanup")
    return not opt_out_marker
201
202


203
@pytest.fixture(autouse=True)
def cleanup_fixture(should_do_global_cleanup_after_test: bool):
    """Run the global cleanup after each test unless it has been opted out."""
    yield
    if not should_do_global_cleanup_after_test:
        return
    cleanup_dist_env_and_memory()
208
209


210
211
212
213
214
215
@pytest.fixture(autouse=True)
def dynamo_reset():
    """Call ``torch._dynamo.reset()`` after every test."""
    yield
    torch._dynamo.reset()


Woosuk Kwon's avatar
Woosuk Kwon committed
216
@pytest.fixture
def example_prompts() -> list[str]:
    """Return every line of the files listed in ``_TEST_PROMPTS``."""
    collected: list[str] = []
    for path in _TEST_PROMPTS:
        collected.extend(_read_prompts(path))
    return collected


224
225
226
227
228
229
@pytest.fixture
def example_system_message() -> str:
    """Return the full text of the example system-message file.

    The file is decoded as UTF-8 explicitly so the fixture yields the same
    text regardless of the platform's default locale encoding.
    """
    with open(_SYS_MSG, encoding="utf-8") as f:
        return f.read()


230
231
232
233
234
235
236
class DecoderPromptType(Enum):
    """For encoder/decoder models only."""
    # A caller-chosen decoder prompt is supplied (the example fixture uses
    # the encoder prompts in reverse order).
    CUSTOM = 1
    # The decoder prompt is None.
    NONE = 2
    # The decoder prompt is the empty string.
    EMPTY_STR = 3


237
@pytest.fixture
def example_encoder_decoder_prompts(
) -> dict[DecoderPromptType, list[ExplicitEncoderDecoderPrompt]]:
    '''
    Build example encoder/decoder prompt pairs for every decoder prompt type.

    The encoder prompts are the lines of the example prompt files. They are
    zipped index by index with three different decoder prompt lists.

    Returns:

    A dict keyed by :class:`DecoderPromptType`:

    * NONE: every decoder prompt is None
    * EMPTY_STR: every decoder prompt is the empty string
    * CUSTOM: the decoder prompts are the encoder prompts in reverse order
    '''
    encoder_prompts: list[str] = []
    for path in _TEST_PROMPTS:
        encoder_prompts.extend(_read_prompts(path))

    num_prompts = len(encoder_prompts)
    decoder_prompts_by_type = {
        DecoderPromptType.NONE: [None] * num_prompts,
        DecoderPromptType.EMPTY_STR: [""] * num_prompts,
        DecoderPromptType.CUSTOM: encoder_prompts[::-1],
    }

    return {
        prompt_type: zip_enc_dec_prompts(encoder_prompts, decoder_prompts)
        for prompt_type, decoder_prompts in decoder_prompts_by_type.items()
    }


270
@pytest.fixture
def example_long_prompts() -> list[str]:
    """Return every line of the files listed in ``_LONG_PROMPTS``."""
    collected: list[str] = []
    for path in _LONG_PROMPTS:
        collected.extend(_read_prompts(path))
    return collected
Woosuk Kwon's avatar
Woosuk Kwon committed
276
277


278
279
280
281
282
@pytest.fixture(scope="session")
def image_assets() -> _ImageAssets:
    """Return the module-level ``IMAGE_ASSETS`` collection."""
    return IMAGE_ASSETS


283
284
285
286
287
@pytest.fixture(scope="session")
def video_assets() -> _VideoAssets:
    """Return the module-level ``VIDEO_ASSETS`` collection."""
    return VIDEO_ASSETS


288
289
290
291
292
@pytest.fixture(scope="session")
def audio_assets() -> _AudioAssets:
    """Return the module-level ``AUDIO_ASSETS`` collection."""
    return AUDIO_ASSETS


293
_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict)
294
_R = TypeVar("_R")
295

Woosuk Kwon's avatar
Woosuk Kwon committed
296
297
298

class HfRunner:

299
    def get_default_device(self):
        """Return ``"cpu"`` on a CPU platform and ``"cuda"`` otherwise."""
        # Kept as a function-local import, as in the rest of this class.
        from vllm.platforms import current_platform

        if current_platform.is_cpu():
            return "cpu"
        return "cuda"
303
304

    def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
        """Move ``x`` to ``device`` (``self.device`` when not given).

        ``None`` and ``bool`` values pass through untouched. A ``dict`` is
        rebuilt with every value wrapped recursively. An object that already
        reports the target device type is returned as-is; anything else is
        moved with ``x.to(device)``.
        """
        if x is None or isinstance(x, bool):
            return x

        target = self.device if device is None else device

        if isinstance(x, dict):
            return {
                key: self.wrap_device(value, target)
                for key, value in x.items()
            }

        already_on_target = hasattr(x, "device") and x.device.type == target
        return x if already_on_target else x.to(target)
318

Woosuk Kwon's avatar
Woosuk Kwon committed
319
320
321
    def __init__(
        self,
        model_name: str,
        dtype: str = "auto",
        *,
        model_kwargs: Optional[dict[str, Any]] = None,
        is_sentence_transformer: bool = False,
        is_cross_encoder: bool = False,
        skip_tokenizer_init: bool = False,
        auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
    ) -> None:
        """Load a HuggingFace model together with its tokenizer and processor.

        Args:
            model_name: Identifier passed to every ``from_pretrained`` call.
            dtype: Requested dtype, resolved against the model config with
                ``_get_and_verify_dtype``.
            model_kwargs: Extra keyword arguments for the model constructor.
                ``"torch_dtype"`` is filled in when absent. The dict passed
                in is not modified.
            is_sentence_transformer: Load the model with
                ``sentence_transformers.SentenceTransformer``.
            is_cross_encoder: Load the model with
                ``sentence_transformers.CrossEncoder``.
            skip_tokenizer_init: Do not load an ``AutoTokenizer``; use the
                processor's tokenizer instead.
            auto_cls: Class whose ``from_pretrained`` loads the model when
                neither sentence-transformers flag is set.
        """
        self.model_name = model_name

        self.config = AutoConfig.from_pretrained(
            model_name,
            trust_remote_code=True,
        )
        self.device = self.get_default_device()
        self.dtype = torch_dtype = _get_and_verify_dtype(self.config, dtype)

        # Work on a copy: setdefault() below must not write "torch_dtype"
        # into a dict owned by the caller, which may be reused elsewhere.
        model_kwargs = dict(model_kwargs) if model_kwargs is not None else {}
        model_kwargs.setdefault("torch_dtype", torch_dtype)

        if is_sentence_transformer:
            # Lazy init required for AMD CI
            from sentence_transformers import SentenceTransformer

            self.model = SentenceTransformer(
                model_name,
                device=self.device,
                model_kwargs=model_kwargs,
                trust_remote_code=True,
            )
        elif is_cross_encoder:
            # Lazy init required for AMD CI
            from sentence_transformers import CrossEncoder

            self.model = CrossEncoder(
                model_name,
                device=self.device,
                automodel_args=model_kwargs,
                trust_remote_code=True,
            )
        else:
            model = auto_cls.from_pretrained(
                model_name,
                trust_remote_code=True,
                **model_kwargs,
            )

            # Only move the model when it is not bitsandbytes-quantized and
            # its parameters live on fewer than two distinct devices.
            if (getattr(model, "quantization_method", None) != "bitsandbytes"
                    and len({p.device
                             for p in model.parameters()}) < 2):
                model = model.to(self.device)

            self.model = model

        if not skip_tokenizer_init:
            self.tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                torch_dtype=torch_dtype,
                trust_remote_code=True,
            )

        # don't put this import at the top level
        # it will call torch.cuda.device_count()
        from transformers import AutoProcessor  # noqa: F401
        self.processor = AutoProcessor.from_pretrained(
            model_name,
            torch_dtype=torch_dtype,
            trust_remote_code=True,
        )
        if skip_tokenizer_init:
            self.tokenizer = self.processor.tokenizer
Woosuk Kwon's avatar
Woosuk Kwon committed
393

394
    def get_inputs(
        self,
        prompts: list[str],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
    ) -> list[Union[BatchFeature, BatchEncoding]]:
        """Run the processor on each prompt and its multimodal data.

        Every modality that is given must have one entry per prompt. The
        processor is called once per prompt; a ``BatchFeature`` result is
        cast to ``self.dtype``. Returns one processed input per prompt.
        """
        for modality in (images, videos, audios):
            if modality is not None:
                assert len(prompts) == len(modality)

        processed: list[Union[BatchFeature, BatchEncoding]] = []
        for idx, prompt in enumerate(prompts):
            processor_kwargs: dict[str, Any] = {
                "text": prompt,
                "return_tensors": "pt",
            }

            image = None if images is None else images[idx]
            if image is not None:
                processor_kwargs["images"] = image

            video = None if videos is None else videos[idx]
            if video is not None:
                processor_kwargs["videos"] = video

            audio_inputs = None if audios is None else audios[idx]
            if audio_inputs is not None:
                # HACK - not all processors take sampling_rate; we should
                # clean this up in the future.
                if len(audio_inputs) == 2:
                    audio, sr = audio_inputs
                    processor_kwargs["audio"] = audio
                    processor_kwargs["sampling_rate"] = sr
                else:
                    processor_kwargs["audio"] = audio_inputs

            inputs = self.processor(**processor_kwargs)
            if isinstance(inputs, BatchFeature):
                inputs = inputs.to(dtype=self.dtype)

            processed.append(inputs)

        return processed

438
    def classify(self, prompts: list[str]) -> list[str]:
439
440
441
442
443
444
445
446
447
448
        # output is final logits
        all_inputs = self.get_inputs(prompts)
        outputs = []
        for inputs in all_inputs:
            output = self.model(**self.wrap_device(inputs))
            logits = output.logits.softmax(dim=-1)[0].tolist()
            outputs.append(logits)

        return outputs

449
450
    def generate(
        self,
        prompts: list[str],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> list[tuple[list[list[int]], list[str]]]:
        """Generate with the HF model, one prompt at a time.

        Extra keyword arguments are forwarded to ``self.model.generate``.
        Returns, per prompt, the generated token id sequences and their
        decoded strings (special tokens skipped).
        """
        results: list[tuple[list[list[int]], list[str]]] = []
        for model_inputs in self.get_inputs(prompts,
                                            images=images,
                                            videos=videos,
                                            audios=audios):
            generated = self.model.generate(
                **self.wrap_device(model_inputs),
                use_cache=True,
                **kwargs,
            )
            decoded = self.processor.batch_decode(
                generated,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )
            results.append((generated.cpu().tolist(), decoded))
        return results

    def generate_greedy(
        self,
        prompts: list[str],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> list[tuple[list[int], str]]:
        """Greedy generation (``do_sample=False``) of up to ``max_tokens``
        new tokens per prompt.

        Returns the first token id sequence and the first decoded string
        for each prompt.
        """
        greedy_outputs = self.generate(prompts,
                                       do_sample=False,
                                       max_new_tokens=max_tokens,
                                       images=images,
                                       videos=videos,
                                       audios=audios,
                                       **kwargs)

        first_sequences: list[tuple[list[int], str]] = []
        for ids_batch, text_batch in greedy_outputs:
            first_sequences.append((ids_batch[0], text_batch[0]))
        return first_sequences
497
498
499

    def generate_beam_search(
        self,
        prompts: list[str],
        beam_width: int,
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
    ) -> list[tuple[list[list[int]], list[str]]]:
        """Beam-search generation returning ``beam_width`` sequences per prompt.

        Token ids equal to the tokenizer's pad token id are removed from
        every returned sequence; the decoded strings are left unchanged.
        """
        beam_outputs = self.generate(prompts,
                                     do_sample=False,
                                     max_new_tokens=max_tokens,
                                     num_beams=beam_width,
                                     num_return_sequences=beam_width,
                                     images=images,
                                     videos=videos,
                                     audios=audios)

        for idx, (beam_ids, beam_texts) in enumerate(beam_outputs):
            unpadded = [[
                token for token in sequence
                if token != self.tokenizer.pad_token_id
            ] for sequence in beam_ids]
            beam_outputs[idx] = (unpadded, beam_texts)
        return beam_outputs
Woosuk Kwon's avatar
Woosuk Kwon committed
525

526
527
    def generate_greedy_logprobs(
        self,
        prompts: list[str],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> list[list[torch.Tensor]]:
        """Greedy generation that returns log-probabilities for each prompt.

        The model is asked to return its hidden states, which are turned
        into log-probability tensors by ``_hidden_states_to_seq_logprobs``.
        Returns one list of tensors per prompt.
        """
        per_prompt_logprobs: list[list[torch.Tensor]] = []
        for model_inputs in self.get_inputs(prompts,
                                            images=images,
                                            videos=videos,
                                            audios=audios):
            generation = self.model.generate(
                **self.wrap_device(model_inputs),
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
                output_hidden_states=True,
                return_dict_in_generate=True,
                **kwargs,
            )
            per_prompt_logprobs.append(
                self._hidden_states_to_seq_logprobs(generation.hidden_states))
        return per_prompt_logprobs

556
    def _hidden_states_to_seq_logprobs(
557
        self,
558
559
        hidden_states: tuple[tuple[torch.Tensor, ...], ...],
    ) -> list[torch.Tensor]:
560
561
        output_embeddings = self.model.get_output_embeddings()

562
        seq_logprobs: list[torch.Tensor] = []
563
564
565
        for _, hidden_state in enumerate(hidden_states):
            last_hidden_states = hidden_state[-1][0]
            logits = torch.matmul(
566
567
568
569
                last_hidden_states.to(
                    device=output_embeddings.weight.device,
                    dtype=output_embeddings.weight.dtype,
                ),
570
                output_embeddings.weight.t(),
571
            )
572
573
            if getattr(output_embeddings, "bias", None) is not None:
                logits += output_embeddings.bias.unsqueeze(0)
574
575
576
            logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
            seq_logprobs.append(logprobs)

577
578
579
580
        return seq_logprobs

    def _hidden_states_to_logprobs(
        self,
581
        hidden_states: tuple[tuple[torch.Tensor, ...], ...],
582
        num_logprobs: int,
583
    ) -> tuple[list[dict[int, float]], int]:
584
585
586
        seq_logprobs = self._hidden_states_to_seq_logprobs(hidden_states)
        output_len = len(hidden_states)

587
        # convert to dict
588
        seq_logprobs_lst: list[dict[int, float]] = []
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
        for tok_idx, tok_logprobs in enumerate(seq_logprobs):
            # drop prompt logprobs
            if tok_idx == 0:
                tok_logprobs = tok_logprobs[-1, :].reshape(1, -1)
            topk = tok_logprobs.topk(num_logprobs)

            tok_logprobs_dct = {}
            for token_id, logprob in zip(topk.indices[0], topk.values[0]):
                tok_logprobs_dct[token_id.item()] = logprob.item()

            seq_logprobs_lst.append(tok_logprobs_dct)

        return (
            seq_logprobs_lst,
            output_len,
        )

606
607
    def generate_greedy_logprobs_limit(
        self,
        prompts: list[str],
        max_tokens: int,
        num_logprobs: int,
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
        videos: Optional[PromptVideoInput] = None,
        **kwargs: Any,
    ) -> list[TokensTextLogprobs]:
        """Greedy generation returning the top ``num_logprobs`` logprobs.

        Returns, per prompt, a tuple of (generated token ids, their decoded
        text, one ``{token_id: logprob}`` dict per generated token).
        """
        collected_logprobs: list[list[dict[int, float]]] = []
        collected_ids: list[list[int]] = []
        collected_strs: list[str] = []

        for model_inputs in self.get_inputs(prompts,
                                            images=images,
                                            videos=videos,
                                            audios=audios):
            generation = self.model.generate(
                **self.wrap_device(model_inputs),
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
                output_hidden_states=True,
                return_dict_in_generate=True,
                **kwargs,
            )

            top_logprobs, _ = self._hidden_states_to_logprobs(
                generation.hidden_states, num_logprobs)
            collected_logprobs.append(top_logprobs)

            # The trailing tokens of the first sequence, one per logprob
            # dict, are taken as the generated tokens.
            num_generated = len(top_logprobs)
            new_token_ids = generation.sequences[0][-num_generated:]
            collected_ids.append(new_token_ids.tolist())
            collected_strs.append(self.tokenizer.decode(new_token_ids))

        return list(zip(collected_ids, collected_strs, collected_logprobs))

    def generate_encoder_decoder_greedy_logprobs_limit(
        self,
        encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
        max_tokens: int,
        num_logprobs: int,
        images: Optional[PromptImageInput] = None,
        **kwargs: Any,
    ) -> list[TokensTextLogprobs]:
        '''
        Greedy logprobs generation for encoder/decoder models.

        Each encoder prompt (plus its image, if any) goes through the
        processor; a decoder prompt that is not None is tokenized into
        ``decoder_input_ids``. Returns, per prompt pair, a tuple of
        (generated token ids, decoded text, per-token top logprobs).
        '''
        collected_logprobs: list[list[dict[int, float]]] = []
        collected_ids: list[list[int]] = []
        collected_strs: list[str] = []

        prompt_pairs = to_enc_dec_tuple_list(encoder_decoder_prompts)
        for idx, (encoder_prompt, decoder_prompt) in enumerate(prompt_pairs):
            processor_kwargs: dict[str, Any] = {
                "text": encoder_prompt,
                "return_tensors": "pt",
            }
            if images is not None and images[idx] is not None:
                processor_kwargs["images"] = images[idx]

            encoder_inputs = self.wrap_device(
                self.processor(**processor_kwargs))

            decoder_input_ids = None
            if decoder_prompt is not None:
                decoder_inputs = self.tokenizer(decoder_prompt,
                                                return_tensors="pt")
                decoder_input_ids = self.wrap_device(decoder_inputs.input_ids)

            generation = self.model.generate(
                decoder_input_ids=decoder_input_ids,
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
                output_hidden_states=True,
                return_dict_in_generate=True,
                **encoder_inputs,
                **kwargs,
            )

            top_logprobs, num_generated = self._hidden_states_to_logprobs(
                generation.decoder_hidden_states, num_logprobs)

            collected_logprobs.append(top_logprobs)
            new_token_ids = generation.sequences[0][-num_generated:]
            collected_ids.append(new_token_ids.tolist())
            collected_strs.append(self.tokenizer.decode(new_token_ids))

        return list(zip(collected_ids, collected_strs, collected_logprobs))

715
716
717
    def encode(self, prompts: list[str], *args,
               **kwargs) -> list[list[torch.Tensor]]:
        """Forward ``prompts`` and any extra arguments to ``self.model.encode``."""
        return self.model.encode(prompts, *args, **kwargs)
718

719
    def predict(self, prompts: list[list[str]]) -> torch.Tensor:
        """Call ``self.model.predict`` on ``prompts``, requesting a tensor."""
        return self.model.predict(prompts, convert_to_tensor=True)

722
723
724
725
    def __enter__(self):
        """Enter the context manager; the runner itself is the context value."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Delete the model attribute and run the global cleanup.

        Returns None, so an exception raised inside the ``with`` block is
        not suppressed.
        """
        del self.model
        cleanup_dist_env_and_memory()
728

Woosuk Kwon's avatar
Woosuk Kwon committed
729

Cyrus Leung's avatar
Cyrus Leung committed
730
@pytest.fixture(scope="session")
def hf_runner():
    """Return the :class:`HfRunner` class itself (not an instance)."""
    return HfRunner


class VllmRunner:
736
737
738
    """
    The default value of some arguments have been modified from
    :class:`~vllm.LLM` as follows:
739

740
741
742
743
744
745
746
747
    - `trust_remote_code`: Set to `True` instead of `False` for convenience.
    - `seed`: Set to `0` instead of `None` for test reproducibility.
    - `max_model_len`: Set to `1024` instead of `None` to reduce memory usage.
    - `block_size`: Set to `16` instead of `None` to reduce memory usage.
    - `enable_chunked_prefill`: Set to `False` instead of `None` for
      test reproducibility.
    - `enforce_eager`: Set to `False` instead of `None` to test CUDA graph.
    """
Woosuk Kwon's avatar
Woosuk Kwon committed
748
749
750
751

    def __init__(
        self,
        model_name: str,
        task: TaskOption = "auto",
        tokenizer_name: Optional[str] = None,
        tokenizer_mode: str = "auto",
        trust_remote_code: bool = True,
        seed: Optional[int] = 0,
        max_model_len: int = 1024,
        dtype: str = "auto",
        disable_log_stats: bool = True,
        tensor_parallel_size: int = 1,
        block_size: int = 16,
        enable_chunked_prefill: Optional[bool] = False,
        swap_space: int = 4,
        enforce_eager: Optional[bool] = False,
        **kwargs,
    ) -> None:
        """Build the underlying ``LLM`` and keep it on ``self.model``.

        Every named parameter is forwarded to ``LLM`` as a keyword argument
        (``model_name`` as ``model`` and ``tokenizer_name`` as
        ``tokenizer``); ``**kwargs`` is passed through unchanged after them.
        """
        # Options this runner names explicitly, in the order they are
        # handed to the engine constructor.
        engine_options = dict(
            model=model_name,
            task=task,
            tokenizer=tokenizer_name,
            tokenizer_mode=tokenizer_mode,
            trust_remote_code=trust_remote_code,
            dtype=dtype,
            seed=seed,
            swap_space=swap_space,
            enforce_eager=enforce_eager,
            disable_log_stats=disable_log_stats,
            tensor_parallel_size=tensor_parallel_size,
            max_model_len=max_model_len,
            block_size=block_size,
            enable_chunked_prefill=enable_chunked_prefill,
        )
        self.model = LLM(**engine_options, **kwargs)

785
    def get_inputs(
        self,
        prompts: list[str],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
    ) -> list[TextPrompt]:
        """Pair each prompt with the multimodal items sharing its index.

        Args:
            prompts: Text prompts, one per request.
            images: Optional per-prompt image inputs; ``None`` entries are
                skipped.
            videos: Optional per-prompt video inputs; ``None`` entries are
                skipped.
            audios: Optional per-prompt audio inputs; ``None`` entries are
                skipped.

        Returns:
            One ``TextPrompt`` per prompt, in order. ``multi_modal_data`` is
            ``None`` when no item was attached to that prompt.

        Raises:
            ValueError: If a provided modality list differs in length from
                ``prompts``.
        """
        for modality_items in (images, videos, audios):
            if modality_items is None:
                continue
            if len(modality_items) != len(prompts):
                raise ValueError(
                    "All non-None multimodal inputs must have the same length as "
                    "prompts")

        # Key used in ``multi_modal_data`` for each modality, in the order
        # the entries are inserted.
        modalities = (("image", images), ("video", videos), ("audio", audios))

        text_prompts = []
        for idx, prompt in enumerate(prompts):
            mm_data = {}
            for key, items in modalities:
                if items is None:
                    continue
                item = items[idx]
                if item is not None:
                    mm_data[key] = item

            # An empty mapping is reported as ``None`` rather than ``{}``.
            text_prompts.append(
                TextPrompt(prompt=prompt, multi_modal_data=mm_data or None))

        return text_prompts

    def generate(
        self,
        prompts: list[str],
        sampling_params: SamplingParams,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> list[tuple[list[list[int]], list[str]]]:
        """Generate completions and return them prefixed with their prompt.

        Args:
            prompts: Text prompts, one per request.
            sampling_params: Passed to ``self.model.generate``.
            images: Optional per-prompt image inputs.
            videos: Optional per-prompt video inputs.
            audios: Optional per-prompt audio inputs.
            **kwargs: Forwarded to ``self.model.generate``.

        Returns:
            One ``(token_ids, texts)`` pair per request. Each holds one entry
            per sample: the prompt token ids followed by the sample's token
            ids, and the prompt text followed by the sample's text.
        """
        model_inputs = self.get_inputs(prompts,
                                       images=images,
                                       videos=videos,
                                       audios=audios)
        request_outputs = self.model.generate(model_inputs,
                                              sampling_params=sampling_params,
                                              **kwargs)

        results: list[tuple[list[list[int]], list[str]]] = []
        for request in request_outputs:
            prompt_text = request.prompt
            prompt_token_ids = request.prompt_token_ids

            ids_per_sample: list[list[int]] = []
            text_per_sample: list[str] = []
            for completion in request.outputs:
                ids_per_sample.append(prompt_token_ids +
                                      list(completion.token_ids))
                text_per_sample.append(prompt_text + completion.text)

            results.append((ids_per_sample, text_per_sample))
        return results

848
    @staticmethod
    def _final_steps_generate_w_logprobs(
        req_outputs: list[RequestOutput],
    ) -> list[TokensTextLogprobsPromptLogprobs]:
        """Flatten request outputs into logprob tuples.

        For every request the tuple is
        ``(token_ids, text, logprobs, prompt_logprobs)``, where the first
        three values come from the request's last sample and the fourth
        from the request itself. Each request must have at least one sample.
        """
        collected: list[TokensTextLogprobsPromptLogprobs] = []
        for request in req_outputs:
            samples = request.outputs
            assert len(samples) > 0
            # Only the final sample of a request is reported.
            final_sample = samples[-1]
            collected.append((
                list(final_sample.token_ids),
                final_sample.text,
                final_sample.logprobs,
                request.prompt_logprobs,
            ))
        return collected

863
864
    def generate_w_logprobs(
        self,
        prompts: list[str],
        sampling_params: SamplingParams,
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
        videos: Optional[PromptVideoInput] = None,
        **kwargs: Any,
    ) -> Union[list[TokensTextLogprobs],
               list[TokensTextLogprobsPromptLogprobs]]:
        """Generate completions and return them together with logprobs.

        Args:
            prompts: Text prompts, one per request.
            sampling_params: Passed to ``self.model.generate``.
            images: Optional per-prompt image inputs.
            audios: Optional per-prompt audio inputs.
            videos: Optional per-prompt video inputs.
            **kwargs: Forwarded to ``self.model.generate``.

        Returns:
            One tuple per request: ``(token_ids, text, logprobs)``, with the
            prompt logprobs appended as a fourth element only when
            ``sampling_params.prompt_logprobs`` is not ``None``.
        """
        model_inputs = self.get_inputs(prompts,
                                       images=images,
                                       videos=videos,
                                       audios=audios)
        request_outputs = self.model.generate(model_inputs,
                                              sampling_params=sampling_params,
                                              **kwargs)
        with_prompt_logprobs = self._final_steps_generate_w_logprobs(
            request_outputs)

        if sampling_params.prompt_logprobs is not None:
            return with_prompt_logprobs
        # Prompt logprobs were not requested: drop the trailing element.
        return [entry[:-1] for entry in with_prompt_logprobs]
888
889
890

    def generate_encoder_decoder_w_logprobs(
        self,
        encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
        sampling_params: SamplingParams,
    ) -> Union[list[TokensTextLogprobs],
               list[TokensTextLogprobsPromptLogprobs]]:
        """Logprobs generation for vLLM encoder/decoder models.

        ``sampling_params.logprobs`` must be set. Each returned tuple is
        ``(token_ids, text, logprobs)``, with the prompt logprobs appended
        only when ``sampling_params.prompt_logprobs`` is not ``None``.
        """
        assert sampling_params.logprobs is not None

        request_outputs = self.model.generate(encoder_decoder_prompts,
                                              sampling_params=sampling_params)
        with_prompt_logprobs = self._final_steps_generate_w_logprobs(
            request_outputs)

        if sampling_params.prompt_logprobs is not None:
            return with_prompt_logprobs
        # Prompt logprobs were not requested: drop the trailing element.
        return [entry[:-1] for entry in with_prompt_logprobs]
908

Woosuk Kwon's avatar
Woosuk Kwon committed
909
910
    def generate_greedy(
        self,
        prompts: list[str],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> list[tuple[list[int], str]]:
        """Generate with temperature 0 and return the first sample per prompt.

        Args:
            prompts: Text prompts, one per request.
            max_tokens: Value for ``SamplingParams.max_tokens``.
            images: Optional per-prompt image inputs.
            videos: Optional per-prompt video inputs.
            audios: Optional per-prompt audio inputs.
            **kwargs: Forwarded to :meth:`generate`.

        Returns:
            One ``(token_ids, text)`` pair per request, taken from the first
            sample that :meth:`generate` returned for it.
        """
        per_request = self.generate(
            prompts,
            SamplingParams(temperature=0.0, max_tokens=max_tokens),
            images=images,
            videos=videos,
            audios=audios,
            **kwargs,
        )

        first_samples = []
        for sample_ids, sample_texts in per_request:
            first_samples.append((sample_ids[0], sample_texts[0]))
        return first_samples
927

928
929
    def generate_greedy_logprobs(
        self,
        prompts: list[str],
        max_tokens: int,
        num_logprobs: int,
        num_prompt_logprobs: Optional[int] = None,
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
        videos: Optional[PromptVideoInput] = None,
        stop_token_ids: Optional[list[int]] = None,
        stop: Optional[list[str]] = None,
        **kwargs: Any,
    ) -> Union[list[TokensTextLogprobs],
               list[TokensTextLogprobsPromptLogprobs]]:
        """Generate with temperature 0 and return logprobs per request.

        Args:
            prompts: Text prompts, one per request.
            max_tokens: Value for ``SamplingParams.max_tokens``.
            num_logprobs: Value for ``SamplingParams.logprobs``.
            num_prompt_logprobs: Value for ``SamplingParams.prompt_logprobs``.
            images: Optional per-prompt image inputs.
            audios: Optional per-prompt audio inputs.
            videos: Optional per-prompt video inputs.
            stop_token_ids: Value for ``SamplingParams.stop_token_ids``.
            stop: Value for ``SamplingParams.stop``.
            **kwargs: Forwarded to :meth:`generate_w_logprobs`.

        Returns:
            Whatever :meth:`generate_w_logprobs` returns for these sampling
            parameters.
        """
        params = SamplingParams(
            temperature=0.0,
            max_tokens=max_tokens,
            logprobs=num_logprobs,
            prompt_logprobs=num_prompt_logprobs,
            stop_token_ids=stop_token_ids,
            stop=stop,
        )
        result = self.generate_w_logprobs(
            prompts,
            params,
            images=images,
            audios=audios,
            videos=videos,
            **kwargs,
        )
        return result
956

957
958
    def generate_encoder_decoder_greedy_logprobs(
        self,
        encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
        max_tokens: int,
        num_logprobs: int,
        num_prompt_logprobs: Optional[int] = None,
        skip_special_tokens: bool = True,
    ) -> Union[list[TokensTextLogprobs],
               list[TokensTextLogprobsPromptLogprobs]]:
        """Greedy logprobs generation for vLLM encoder/decoder models.

        Args:
            encoder_decoder_prompts: Explicit encoder/decoder prompts.
            max_tokens: Value for ``SamplingParams.max_tokens``.
            num_logprobs: Value for ``SamplingParams.logprobs``.
            num_prompt_logprobs: Value for ``SamplingParams.prompt_logprobs``.
            skip_special_tokens: Value for
                ``SamplingParams.skip_special_tokens``.

        Returns:
            Whatever :meth:`generate_encoder_decoder_w_logprobs` returns for
            these sampling parameters.
        """
        # Temperature 0 makes the sampling greedy.
        greedy_logprobs_params = SamplingParams(
            temperature=0.0,
            max_tokens=max_tokens,
            logprobs=num_logprobs,
            prompt_logprobs=num_prompt_logprobs,
            skip_special_tokens=skip_special_tokens,
        )

        return self.generate_encoder_decoder_w_logprobs(
            encoder_decoder_prompts, greedy_logprobs_params)

980
    def generate_beam_search(
        self,
        prompts: list[str],
        beam_width: int,
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
    ) -> list[tuple[list[list[int]], list[str]]]:
        """Run beam search and return every beam's tokens and text.

        Args:
            prompts: Text prompts, one per request.
            beam_width: Value for ``BeamSearchParams.beam_width``.
            max_tokens: Value for ``BeamSearchParams.max_tokens``.
            images: Optional per-prompt image inputs.
            videos: Optional per-prompt video inputs.
            audios: Optional per-prompt audio inputs.

        Returns:
            One ``(token_ids, texts)`` pair per request, each with one entry
            per sequence in that request's beam-search output.
        """
        model_inputs = self.get_inputs(prompts,
                                       images=images,
                                       videos=videos,
                                       audios=audios)
        search_params = BeamSearchParams(beam_width=beam_width,
                                         max_tokens=max_tokens)
        beam_results = self.model.beam_search(model_inputs, search_params)

        return [(
            [seq.tokens for seq in result.sequences],
            [seq.text for seq in result.sequences],
        ) for result in beam_results]

1004
    def classify(self, prompts: list[str]) -> list[list[float]]:
        """Classify ``prompts`` and return ``outputs.probs`` per request.

        The prompts are passed to ``self.model.classify`` unchanged; results
        keep the order in which the model returned them.
        """
        probabilities = []
        for result in self.model.classify(prompts):
            probabilities.append(result.outputs.probs)
        return probabilities

1008
1009
1010
1011
1012
1013
1014
    def encode(self,
               prompts: list[str],
               images: Optional[PromptImageInput] = None,
               videos: Optional[PromptVideoInput] = None,
               audios: Optional[PromptAudioInput] = None,
               *args,
               **kwargs) -> list[list[float]]:
        """Embed ``prompts`` and return ``outputs.embedding`` per request.

        Args:
            prompts: Text prompts, one per request.
            images: Optional per-prompt image inputs.
            videos: Optional per-prompt video inputs.
            audios: Optional per-prompt audio inputs.
            *args: Forwarded to ``self.model.embed``.
            **kwargs: Forwarded to ``self.model.embed``.

        Returns:
            One embedding per request, in the order the model returned them.
        """
        model_inputs = self.get_inputs(prompts,
                                       images=images,
                                       videos=videos,
                                       audios=audios)

        embeddings = []
        for result in self.model.embed(model_inputs, *args, **kwargs):
            embeddings.append(result.outputs.embedding)
        return embeddings
1022

1023
1024
    def score(
        self,
        text_1: Union[str, list[str]],
        text_2: Union[str, list[str]],
    ) -> list[float]:
        """Score ``text_1`` against ``text_2`` via ``self.model.score``.

        Returns ``outputs.score`` of every request output, in the order the
        model returned them.
        """
        scores = []
        for result in self.model.score(text_1, text_2):
            scores.append(result.outputs.score)
        return scores
1030

1031
1032
1033
1034
    def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
        """Hand ``func`` to the engine's model executor and return its result.

        The executor is reached through
        ``self.model.llm_engine.model_executor``.
        """
        return self.model.llm_engine.model_executor.apply_model(func)

1035
1036
1037
1038
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Delete the wrapped model, then run the shared cleanup helper.

        Nothing is returned, so an exception raised inside the ``with``
        body is not suppressed.
        """
        # Drop this runner's reference to the model before cleanup runs.
        del self.model
        cleanup_dist_env_and_memory()
1041

Woosuk Kwon's avatar
Woosuk Kwon committed
1042

1043
@pytest.fixture(scope="session")
def vllm_runner():
    """Session-scoped fixture that returns the ``VllmRunner`` class.

    The class itself, not an instance, is handed to the test.
    """
    return VllmRunner
1046
1047


1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
@pytest.fixture()
def temporary_enable_log_propagate():
    """Enable propagation on the ``"vllm"`` logger while a test runs.

    On teardown the logger's ``propagate`` flag is put back to the value it
    had before the fixture ran, rather than being forced to ``False``.
    """
    import logging

    # Named ``vllm_logger`` so the module-level ``logger`` is not shadowed.
    vllm_logger = logging.getLogger("vllm")
    previous_propagate = vllm_logger.propagate
    vllm_logger.propagate = True
    try:
        yield
    finally:
        vllm_logger.propagate = previous_propagate


@pytest.fixture()
def caplog_vllm(temporary_enable_log_propagate, caplog):
    """Yield pytest's ``caplog`` with vLLM log propagation switched on.

    ``caplog`` depends on records propagating to the root logger, so this
    fixture requests ``temporary_enable_log_propagate`` to turn propagation
    on for the duration of the test.
    """
    yield caplog
1062
1063
1064
1065
1066
1067
1068


@pytest.fixture(scope="session")
def num_gpus_available():
    """Return the number of GPUs.

    The count is obtained without initializing the CUDA context in the
    current process.
    """
    gpu_count = cuda_device_count_stateless()
    return gpu_count
1070
1071


1072
# temp_dir = tempfile.gettempdir()
zhuwenwen's avatar
zhuwenwen committed
1073
1074
1075
1076
# Directories under ``models_path_prefix`` that hold the dummy-model
# checkpoints prepared by the ``dummy_*_path`` fixtures.
_dummy_opt_path = os.path.join(models_path_prefix, "dummy_opt")
_dummy_llava_path = os.path.join(models_path_prefix, "dummy_llava")
_dummy_gemma2_embedding_path = os.path.join(models_path_prefix,
                                            "dummy_gemma2_embedding")

1077
1078
1079
1080


@pytest.fixture
def dummy_opt_path():
    """Return the dummy OPT model directory, creating it on first use.

    When the directory does not exist yet, ``facebook/opt-125m`` is
    downloaded into it without weight files and the ``architectures`` entry
    of its ``config.json`` is replaced by ``["MyOPTForCausalLM"]``. An
    existing directory is returned untouched.
    """
    if os.path.exists(_dummy_opt_path):
        return _dummy_opt_path

    config_file = os.path.join(_dummy_opt_path, "config.json")
    # Weight-file patterns left out of the download.
    skipped_patterns = [
        "*.bin", "*.bin.index.json", "*.pt", "*.h5", "*.msgpack"
    ]
    snapshot_download(repo_id="facebook/opt-125m",
                      local_dir=_dummy_opt_path,
                      ignore_patterns=skipped_patterns)
    assert os.path.exists(config_file)

    with open(config_file) as src:
        model_config = json.load(src)
    model_config["architectures"] = ["MyOPTForCausalLM"]
    with open(config_file, "w") as dst:
        json.dump(model_config, dst)

    return _dummy_opt_path

1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114

# Pytest hook that runs when the report for a test phase is created.
@pytest.hookimpl(tryfirst=True, hookwrapper=True)
def pytest_runtest_makereport(item, call):
    """Attach a browser screenshot to the report of a failed test call.

    Only the "call" phase of a failed test that received a ``browser``
    fixture is handled; the screenshot is saved to ``screenshot.png``.
    """
    # Let the other hook implementations build the report first.
    outcome = yield
    report = outcome.get_result()

    # Nothing to do unless the test body itself failed.
    if report.when != "call" or not report.failed:
        return
    # A screenshot is only possible when the test used a browser instance.
    if not hasattr(item, "funcargs") or "browser" not in item.funcargs:
        return

    browser = item.funcargs["browser"]
    screenshot_path = "screenshot.png"  # where the screenshot is written
    browser.save_screenshot(screenshot_path)

    # Add the screenshot only if the report already has an ``extra`` list.
    if hasattr(report, "extra"):
        report.extra.append(pytest_html.extras.image(screenshot_path))
zhuwenwen's avatar
zhuwenwen committed
1115
1116


1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
@pytest.fixture
def dummy_llava_path():
    """Return the dummy LLaVA model directory, creating it on first use.

    When the directory does not exist yet, ``llava-hf/llava-1.5-7b-hf`` is
    downloaded into it without weight files and the ``architectures`` entry
    of its ``config.json`` is replaced by ``["MyLlava"]``. An existing
    directory is returned untouched.
    """
    if os.path.exists(_dummy_llava_path):
        return _dummy_llava_path

    config_file = os.path.join(_dummy_llava_path, "config.json")
    # Weight-file patterns left out of the download.
    skipped_patterns = [
        "*.bin", "*.bin.index.json", "*.pt", "*.h5", "*.msgpack"
    ]
    snapshot_download(repo_id="llava-hf/llava-1.5-7b-hf",
                      local_dir=_dummy_llava_path,
                      ignore_patterns=skipped_patterns)
    assert os.path.exists(config_file)

    with open(config_file) as src:
        model_config = json.load(src)
    model_config["architectures"] = ["MyLlava"]
    with open(config_file, "w") as dst:
        json.dump(model_config, dst)

    return _dummy_llava_path
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146


@pytest.fixture
def dummy_gemma2_embedding_path():
    """Return the dummy Gemma2 embedding directory, creating it on first use.

    When the directory does not exist yet, ``BAAI/bge-multilingual-gemma2``
    is downloaded into it without weight files and the ``architectures``
    entry of its ``config.json`` is replaced by ``["MyGemma2Embedding"]``.
    An existing directory is returned untouched.
    """
    if os.path.exists(_dummy_gemma2_embedding_path):
        return _dummy_gemma2_embedding_path

    config_file = os.path.join(_dummy_gemma2_embedding_path, "config.json")
    # Weight-file patterns left out of the download.
    skipped_patterns = [
        "*.bin", "*.bin.index.json", "*.pt", "*.h5", "*.msgpack"
    ]
    snapshot_download(repo_id="BAAI/bge-multilingual-gemma2",
                      local_dir=_dummy_gemma2_embedding_path,
                      ignore_patterns=skipped_patterns)
    assert os.path.exists(config_file)

    with open(config_file) as src:
        model_config = json.load(src)
    model_config["architectures"] = ["MyGemma2Embedding"]
    with open(config_file, "w") as dst:
        json.dump(model_config, dst)

    return _dummy_gemma2_embedding_path
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171


# The `--optional` flag allows running tests that are marked with
# @pytest.mark.optional.
def pytest_addoption(parser):
    """Register the ``--optional`` command-line flag (off by default)."""
    option_spec = {
        "action": "store_true",
        "default": False,
        "help": "run optional test",
    }
    parser.addoption("--optional", **option_spec)


def pytest_collection_modifyitems(config, items):
    """Skip tests carrying the ``optional`` keyword unless ``--optional``.

    When ``--optional`` was given on the command line nothing is changed;
    otherwise a skip marker is added to every item whose keywords contain
    ``"optional"``.
    """
    run_optional = config.getoption("--optional")
    if not run_optional:
        skip_marker = pytest.mark.skip(reason="need --optional option to run")
        optional_items = (entry for entry in items
                          if "optional" in entry.keywords)
        for entry in optional_items:
            entry.add_marker(skip_marker)
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183


@pytest.fixture(scope="session")
def cli_config_file():
    """Path of ``config/test_config.yaml`` under ``_TEST_DIR``."""
    config_path = os.path.join(_TEST_DIR, "config", "test_config.yaml")
    return config_path


@pytest.fixture(scope="session")
def cli_config_file_with_model():
    """Path of ``config/test_config_with_model.yaml`` under ``_TEST_DIR``."""
    config_path = os.path.join(_TEST_DIR, "config",
                               "test_config_with_model.yaml")
    return config_path