conftest.py 41.9 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
import json
4
import os
5
import tempfile
6
from enum import Enum
zhuwenwen's avatar
zhuwenwen committed
7

8
from typing import Any, Callable, Optional, TypedDict, TypeVar, Union
9
10
import pytest
import pytest_html
Woosuk Kwon's avatar
Woosuk Kwon committed
11

12
import numpy as np
Woosuk Kwon's avatar
Woosuk Kwon committed
13
14
import pytest
import torch
15
import torch.nn as nn
16
import torch.nn.functional as F
17
from huggingface_hub import snapshot_download
18
from PIL import Image
19
20
from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
                          BatchEncoding, BatchFeature)
21
from transformers.models.auto.auto_factory import _BaseAutoModelClass
Woosuk Kwon's avatar
Woosuk Kwon committed
22

23
24
from tests.models.utils import (TokensTextLogprobs,
                                TokensTextLogprobsPromptLogprobs)
Woosuk Kwon's avatar
Woosuk Kwon committed
25
from vllm import LLM, SamplingParams
26
from vllm.assets.audio import AudioAsset
27
from vllm.assets.image import ImageAsset
28
from vllm.assets.video import VideoAsset
29
from vllm.config import TaskOption, _get_and_verify_dtype
30
from vllm.connections import global_http_connection
31
from vllm.distributed import (cleanup_dist_env_and_memory,
32
33
                              init_distributed_environment,
                              initialize_model_parallel)
34
from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
35
                         to_enc_dec_tuple_list, zip_enc_dec_prompts)
36
from vllm.logger import init_logger
37
from vllm.outputs import RequestOutput
38
from vllm.sampling_params import BeamSearchParams
zhuwenwen's avatar
zhuwenwen committed
39

40

41
from vllm.transformers_utils.utils import maybe_model_redirect
zhuwenwen's avatar
zhuwenwen committed
42
from .utils import models_path_prefix
43
from vllm.platforms import current_platform
44

45
logger = init_logger(__name__)
Woosuk Kwon's avatar
Woosuk Kwon committed
46

47
48
49
# Directory holding this conftest; test resources are resolved relative to it.
_TEST_DIR = os.path.dirname(__file__)
# Prompt files read by the `example_prompts` / `example_long_prompts` fixtures.
_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
# File whose contents the `example_system_message` fixture returns.
_SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")
51

Cyrus Leung's avatar
Cyrus Leung committed
52
# Element type of one multi-modal item (image, audio clip, video, ...).
_M = TypeVar("_M")

# Either one item per prompt, or a list of items per prompt.
_PromptMultiModalInput = Union[list[_M], list[list[_M]]]

PromptImageInput = _PromptMultiModalInput[Image.Image]
# Each audio item is an (array, int) pair; `HfRunner.get_inputs` forwards the
# int to the processor as `sampling_rate`.
PromptAudioInput = _PromptMultiModalInput[tuple[np.ndarray, int]]
PromptVideoInput = _PromptMultiModalInput[np.ndarray]
59

60

61
def _read_prompts(filename: str) -> list[str]:
    """Read a prompt file and return its lines.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default locale encoding.

    Args:
        filename: Path of the text file to read.

    Returns:
        One string per line, in file order. Line terminators are kept
        (this is the behaviour of ``readlines``).
    """
    with open(filename, encoding="utf-8") as f:
        return f.readlines()
Woosuk Kwon's avatar
Woosuk Kwon committed
65
66


67
class ImageAssetPrompts(TypedDict):
    """Prompt text for each test image, keyed by the image asset name."""

    stop_sign: str
    cherry_blossom: str
70
71


72
class ImageTestAssets(list[ImageAsset]):
    """List of the image assets used by the tests, in a fixed order."""

    def __init__(self) -> None:
        asset_names = ("stop_sign", "cherry_blossom")
        super().__init__(ImageAsset(name) for name in asset_names)

    def prompts(self, prompts: ImageAssetPrompts) -> list[str]:
        """
        Build the per-image prompt list from a name-to-prompt mapping.

        The returned prompts are ordered the same way as the assets are
        when iterating over this object.
        """
        return [prompts[name] for name in ("stop_sign", "cherry_blossom")]
88
89


90
91
class VideoAssetPrompts(TypedDict):
    """Prompt text for each test video, keyed by the video asset name."""

    baby_reading: str
92
93


94
class VideoTestAssets(list[VideoAsset]):
    """List of the video assets used by the tests."""

    def __init__(self) -> None:
        asset_names = ("baby_reading", )
        super().__init__(VideoAsset(name) for name in asset_names)

    def prompts(self, prompts: VideoAssetPrompts) -> list[str]:
        """Return the prompts ordered like the assets in this list."""
        return [prompts[name] for name in ("baby_reading", )]
103
104


105
class AudioAssetPrompts(TypedDict):
    """Prompt text for each test audio clip, keyed by the audio asset name."""

    mary_had_lamb: str
    winning_call: str
108
109


110
class AudioTestAssets(list[AudioAsset]):
    """List of the audio assets used by the tests, in a fixed order."""

    def __init__(self) -> None:
        asset_names = ("mary_had_lamb", "winning_call")
        super().__init__(AudioAsset(name) for name in asset_names)

    def prompts(self, prompts: AudioAssetPrompts) -> list[str]:
        """Return the prompts ordered like the assets in this list."""
        return [prompts[name] for name in ("mary_had_lamb", "winning_call")]
120
121


122
# Module-level instances handed out by the session-scoped asset fixtures.
IMAGE_ASSETS = ImageTestAssets()
"""Singleton instance of {class}`ImageTestAssets`."""

VIDEO_ASSETS = VideoTestAssets()
"""Singleton instance of {class}`VideoTestAssets`."""

AUDIO_ASSETS = AudioTestAssets()
"""Singleton instance of {class}`AudioTestAssets`."""
128
129


130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
@pytest.fixture(scope="function", autouse=True)
def cleanup_VLLM_USE_V1(monkeypatch):
    """
    Make sure "VLLM_USE_V1" does not leak from one test into the next.

    The V1 oracle sets "VLLM_USE_V1" during loading, so each test
    invocation may change the env variable. Touching the variable through
    monkeypatch registers it for restoration, so whatever vLLM writes
    during the test is undone afterwards.

    This fixture is applied to every test (autouse).
    """
    if "VLLM_USE_V1" in os.environ:
        # Already set by the environment: leave it untouched.
        return

    # Set-then-delete records "variable absent" as the state that
    # monkeypatch restores on teardown.
    monkeypatch.setenv("VLLM_USE_V1", "")
    monkeypatch.delenv("VLLM_USE_V1")


Joe Runde's avatar
Joe Runde committed
150
@pytest.fixture(params=[True, False])
def run_with_both_engines(request, monkeypatch):
    """Run the requesting test twice: once with V1 enabled, once without.

    Tests marked `skip_v1` are skipped in the V1 run and tests marked
    `skip_v0` are skipped in the non-V1 run. Otherwise "VLLM_USE_V1" is
    set to '1' or '0' for the duration of the test.
    """
    use_v1 = request.param
    skip_v0 = request.node.get_closest_marker("skip_v0")
    skip_v1 = request.node.get_closest_marker("skip_v1")

    if use_v1:
        skip_marker, skip_reason, env_value = (skip_v1,
                                               "Skipping test on vllm V1",
                                               '1')
    else:
        skip_marker, skip_reason, env_value = (skip_v0,
                                               "Skipping test on vllm V0",
                                               '0')

    if skip_marker:
        pytest.skip(skip_reason)
    monkeypatch.setenv('VLLM_USE_V1', env_value)

    yield
Joe Runde's avatar
Joe Runde committed
168
169


170
171
172
173
174
175
176
@pytest.fixture(autouse=True)
def init_test_http_connection():
    """Disable client reuse on the global HTTP connection for every test."""
    # pytest_asyncio may run each test on a different event loop, so the
    # async client has to be created anew rather than reused.
    global_http_connection.reuse_client = False


177
178
179
180
181
182
183
184
185
186
187
188
@pytest.fixture
def dist_init():
    """Set up a single-process distributed environment around a test.

    Initializes the distributed environment (world size 1, rank 0, "nccl"
    backend, file-based rendezvous on a temporary file) and model
    parallelism of size 1x1, yields to the test, then tears everything
    down with `cleanup_dist_env_and_memory`.
    """
    # mkstemp returns an *open* descriptor plus the path. Only the path is
    # needed, so close the descriptor right away instead of leaking it.
    fd, temp_file = tempfile.mkstemp()
    os.close(fd)
    try:
        init_distributed_environment(
            world_size=1,
            rank=0,
            distributed_init_method=f"file://{temp_file}",
            local_rank=0,
            backend="nccl",
        )
        initialize_model_parallel(1, 1)
        yield
        cleanup_dist_env_and_memory()
    finally:
        # Best-effort removal of the rendezvous file; it may already have
        # been deleted by the time we get here.
        try:
            os.remove(temp_file)
        except OSError:
            pass
190
191


192
@pytest.fixture()
def should_do_global_cleanup_after_test(request) -> bool:
    """Tell `cleanup_fixture` whether to run global cleanup after the test.

    Subdirectories may override this fixture to skip the cleanup, which can
    give a ~10x speedup for non-GPU unit tests since they then don't need to
    initialize torch. Returns False when the test carries the
    `skip_global_cleanup` marker, True otherwise.
    """
    skip_marker = request.node.get_closest_marker("skip_global_cleanup")
    return not skip_marker
200
201


202
@pytest.fixture(autouse=True)
def cleanup_fixture(should_do_global_cleanup_after_test: bool):
    """After each test, run the global cleanup unless it was opted out of."""
    yield
    if not should_do_global_cleanup_after_test:
        return
    cleanup_dist_env_and_memory()
207
208


209
210
211
212
213
214
@pytest.fixture(autouse=True)
def dynamo_reset():
    """Call `torch._dynamo.reset()` once each test has finished."""
    yield
    torch._dynamo.reset()


Woosuk Kwon's avatar
Woosuk Kwon committed
215
@pytest.fixture
def example_prompts() -> list[str]:
    """Return every line of every file listed in `_TEST_PROMPTS`."""
    collected: list[str] = []
    for prompt_file in _TEST_PROMPTS:
        collected.extend(_read_prompts(prompt_file))
    return collected


223
224
225
226
227
228
@pytest.fixture
def example_system_message() -> str:
    """Return the full text of the system-message file at `_SYS_MSG`.

    The file is decoded as UTF-8 explicitly so the result does not depend
    on the platform's default locale encoding.
    """
    with open(_SYS_MSG, encoding="utf-8") as f:
        return f.read()


229
230
231
232
233
234
235
class DecoderPromptType(Enum):
    """Kind of decoder prompt paired with an encoder prompt in tests.

    For encoder/decoder models only.
    """
    # Decoder prompt is an explicit string.
    CUSTOM = 1
    # Decoder prompt is None.
    NONE = 2
    # Decoder prompt is the empty string.
    EMPTY_STR = 3


236
@pytest.fixture
def example_encoder_decoder_prompts(
) -> dict[DecoderPromptType, list[ExplicitEncoderDecoderPrompt]]:
    '''
    Build encoder/decoder prompt pairs for every decoder prompt type.

    The encoder prompts are the lines of the files in `_TEST_PROMPTS`. Each
    encoder prompt is paired by index with a decoder prompt chosen according
    to the `DecoderPromptType`:

    * NONE: every decoder prompt is None
    * EMPTY_STR: every decoder prompt is the empty string
    * CUSTOM: the decoder prompts are the encoder prompts in reverse order

    Returns:

    * Dict mapping each `DecoderPromptType` to the result of
      `zip_enc_dec_prompts` for that pairing
    '''
    encoder_prompts: list[str] = []
    for prompt_file in _TEST_PROMPTS:
        encoder_prompts.extend(_read_prompts(prompt_file))

    num_prompts = len(encoder_prompts)
    decoder_prompts_by_type = {
        DecoderPromptType.NONE: [None] * num_prompts,
        DecoderPromptType.EMPTY_STR: [""] * num_prompts,
        DecoderPromptType.CUSTOM: encoder_prompts[::-1],
    }

    return {
        prompt_type: zip_enc_dec_prompts(encoder_prompts, decoder_prompts)
        for prompt_type, decoder_prompts in decoder_prompts_by_type.items()
    }


269
@pytest.fixture
def example_long_prompts() -> list[str]:
    """Return every line of every file listed in `_LONG_PROMPTS`."""
    collected: list[str] = []
    for prompt_file in _LONG_PROMPTS:
        collected.extend(_read_prompts(prompt_file))
    return collected
Woosuk Kwon's avatar
Woosuk Kwon committed
275
276


277
@pytest.fixture(scope="session")
def image_assets() -> ImageTestAssets:
    """Session-scoped access to the shared `IMAGE_ASSETS` instance."""
    return IMAGE_ASSETS


282
@pytest.fixture(scope="session")
def video_assets() -> VideoTestAssets:
    """Session-scoped access to the shared `VIDEO_ASSETS` instance."""
    return VIDEO_ASSETS


287
@pytest.fixture(scope="session")
def audio_assets() -> AudioTestAssets:
    """Session-scoped access to the shared `AUDIO_ASSETS` instance."""
    return AUDIO_ASSETS


292
# Types that `HfRunner.wrap_device` accepts and returns unchanged in kind.
_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict)
# Unconstrained generic result type.
_R = TypeVar("_R")
294

Woosuk Kwon's avatar
Woosuk Kwon committed
295
296
297

class HfRunner:

298
    def get_default_device(self):
        """Return "cpu" on CPU platforms, else the platform's device type."""
        # Imported lazily, inside the method, as in the original code.
        from vllm.platforms import current_platform

        if current_platform.is_cpu():
            return "cpu"
        return current_platform.device_type
303
304

    def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
        """Move `x` to `device` (default: `self.device`).

        `None` and bools are returned as-is. Dicts are rebuilt with every
        value wrapped recursively. An object that exposes a `.device` whose
        type already equals the target is returned unchanged; anything else
        is moved with `x.to(device)`.
        """
        if x is None or isinstance(x, bool):
            return x

        target = self.device if device is None else device

        if isinstance(x, dict):
            return {
                key: self.wrap_device(value, target)
                for key, value in x.items()
            }

        already_there = hasattr(x, "device") and x.device.type == target
        return x if already_there else x.to(target)
318

Woosuk Kwon's avatar
Woosuk Kwon committed
319
320
321
    def __init__(
        self,
        model_name: str,
        dtype: str = "auto",
        *,
        model_kwargs: Optional[dict[str, Any]] = None,
        trust_remote_code: bool = True,
        is_sentence_transformer: bool = False,
        is_cross_encoder: bool = False,
        skip_tokenizer_init: bool = False,
        auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
    ) -> None:
        """Load a HuggingFace model together with its tokenizer/processor.

        Args:
            model_name: Model identifier; passed through
                `maybe_model_redirect` before use.
            dtype: Requested dtype, resolved via `_get_and_verify_dtype`.
            model_kwargs: Extra keyword arguments for the model constructor.
                The dict is copied, so the caller's object is never
                modified. "torch_dtype" defaults to the resolved dtype.
            trust_remote_code: Forwarded to every `from_pretrained` call and
                to the sentence-transformers constructors.
            is_sentence_transformer: Load the model with
                `SentenceTransformer`.
            is_cross_encoder: Load the model with `CrossEncoder` (only
                consulted when `is_sentence_transformer` is false).
            skip_tokenizer_init: Do not load an `AutoTokenizer`; use the
                processor's tokenizer instead.
            auto_cls: Auto-model class used when neither sentence-transformer
                flag is set.
        """
        model_name = maybe_model_redirect(model_name)
        self.model_name = model_name

        self.config = AutoConfig.from_pretrained(
            model_name,
            trust_remote_code=trust_remote_code,
        )
        self.device = self.get_default_device()
        self.dtype = torch_dtype = _get_and_verify_dtype(
            self.model_name,
            self.config,
            dtype=dtype,
            is_pooling_model=is_sentence_transformer or is_cross_encoder,
        )

        # Work on a copy: setdefault below must not leak "torch_dtype" into
        # a dict the caller may reuse for another runner.
        model_kwargs = dict(model_kwargs) if model_kwargs is not None else {}
        model_kwargs.setdefault("torch_dtype", torch_dtype)

        if is_sentence_transformer:
            # Lazy init required for AMD CI
            from sentence_transformers import SentenceTransformer

            self.model = SentenceTransformer(
                model_name,
                device=self.device,
                model_kwargs=model_kwargs,
                trust_remote_code=trust_remote_code,
            )
        elif is_cross_encoder:
            # Lazy init required for AMD CI
            from sentence_transformers import CrossEncoder

            self.model = CrossEncoder(
                model_name,
                device=self.device,
                automodel_args=model_kwargs,
                trust_remote_code=trust_remote_code,
            )
        else:
            model = auto_cls.from_pretrained(
                model_name,
                trust_remote_code=trust_remote_code,
                **model_kwargs,
            )

            # in case some unquantized custom models are not in same dtype
            if (getattr(model, "quantization_method", None) is None
                    and any(p.dtype != self.dtype
                            for p in model.parameters())):
                model = model.to(dtype=self.dtype)

            # Only move the model when it is not bitsandbytes-quantized and
            # its parameters live on fewer than two distinct devices.
            if (getattr(model, "quantization_method", None) != "bitsandbytes"
                    and len({p.device
                             for p in model.parameters()}) < 2):
                model = model.to(device=self.device)

            self.model = model

        if not skip_tokenizer_init:
            self.tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                torch_dtype=torch_dtype,
                trust_remote_code=trust_remote_code,
            )

        # don't put this import at the top level
        # it will call torch.cuda.device_count()
        from transformers import AutoProcessor  # noqa: F401
        self.processor = AutoProcessor.from_pretrained(
            model_name,
            torch_dtype=torch_dtype,
            trust_remote_code=trust_remote_code,
        )
        if skip_tokenizer_init:
            self.tokenizer = self.processor.tokenizer
Woosuk Kwon's avatar
Woosuk Kwon committed
406

407
    def get_inputs(
        self,
        prompts: list[str],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
    ) -> list[Union[BatchFeature, BatchEncoding]]:
        """Run the processor on each prompt, one prompt at a time.

        Each multi-modal argument, when given, must have one entry per
        prompt; an entry of `None` means "no such input for this prompt".
        `BatchFeature` results are cast to `self.dtype`.

        Returns:
            One processor output per prompt, in prompt order.
        """
        for modality in (images, videos, audios):
            if modality is not None:
                assert len(prompts) == len(modality)

        processed: list[Union[BatchFeature, BatchEncoding]] = []
        for idx, prompt in enumerate(prompts):
            processor_kwargs: dict[str, Any] = {
                "text": prompt,
                "return_tensors": "pt",
            }

            if images is not None and (image := images[idx]) is not None:
                processor_kwargs["images"] = image
            if videos is not None and (video := videos[idx]) is not None:
                processor_kwargs["videos"] = video
            if audios is not None and (audio_item :=
                                       audios[idx]) is not None:
                # HACK - not all processors take sampling_rate; we should
                # clean this up in the future.
                if len(audio_item) != 2:
                    processor_kwargs["audio"] = audio_item
                else:
                    waveform, sampling_rate = audio_item
                    processor_kwargs["audio"] = waveform
                    processor_kwargs["sampling_rate"] = sampling_rate

            features = self.processor(**processor_kwargs)
            if isinstance(features, BatchFeature):
                features = features.to(dtype=self.dtype)
            processed.append(features)

        return processed

451
452
453
454
455
456
457
458
459
    def get_prompt_embeddings(self, prompts: list[str]) -> list[torch.Tensor]:
        """Embed each prompt's input ids with the model's input embeddings.

        Returns one tensor per prompt, with dimension 0 squeezed out.
        """
        prompt_embeds: list[torch.Tensor] = []
        for encoded in self.get_inputs(prompts):
            token_ids = self.wrap_device(encoded)["input_ids"]
            embed_layer = self.model.get_input_embeddings()
            prompt_embeds.append(embed_layer(token_ids).squeeze(0))
        return prompt_embeds

460
    def classify(self, prompts: list[str]) -> list[list[float]]:
        """Return the class probabilities the model assigns to each prompt.

        Every prompt is processed and run through the model on its own.

        Returns:
            For each prompt, the softmax (over the last dimension) of the
            model's output logits, as a plain list of floats.
        """
        outputs: list[list[float]] = []
        for inputs in self.get_inputs(prompts):
            output = self.model(**self.wrap_device(inputs))
            # Each forward pass holds a single prompt, so [0] picks its row.
            probs = output.logits.softmax(dim=-1)[0].tolist()
            outputs.append(probs)

        return outputs

471
472
    def generate(
        self,
        prompts: list[str],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> list[tuple[list[list[int]], list[str]]]:
        """Generate from each prompt separately with the HF model.

        Extra keyword arguments are forwarded to `self.model.generate`
        (which is always called with `use_cache=True`).

        Returns:
            For each prompt, a `(token_ids, texts)` pair: the generated
            sequences as nested lists of ids, and their decoded strings
            (special tokens skipped).
        """
        results: list[tuple[list[list[int]], list[str]]] = []
        per_prompt_inputs = self.get_inputs(prompts,
                                            images=images,
                                            videos=videos,
                                            audios=audios)

        for model_inputs in per_prompt_inputs:
            generated = self.model.generate(
                **self.wrap_device(model_inputs),
                use_cache=True,
                **kwargs,
            )
            texts = self.processor.batch_decode(
                generated,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )
            results.append((generated.cpu().tolist(), texts))

        return results

    def generate_greedy(
        self,
        prompts: list[str],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> list[tuple[list[int], str]]:
        """Greedy-decode up to `max_tokens` new tokens for each prompt.

        Returns:
            For each prompt, the first generated sequence as
            `(token_ids, text)`.
        """
        batched = self.generate(prompts,
                                images=images,
                                videos=videos,
                                audios=audios,
                                do_sample=False,
                                max_new_tokens=max_tokens,
                                **kwargs)

        # `generate` returns lists per prompt; keep only the first sequence.
        return [(ids[0], text[0]) for ids, text in batched]
519
520
521

    def generate_beam_search(
        self,
        prompts: list[str],
        beam_width: int,
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
    ) -> list[tuple[list[list[int]], list[str]]]:
        """Beam-search generation returning `beam_width` sequences per prompt.

        Returns:
            For each prompt, `(token_ids, texts)` where every id list has
            had the tokenizer's pad token id removed.
        """
        raw_outputs = self.generate(prompts,
                                    images=images,
                                    videos=videos,
                                    audios=audios,
                                    do_sample=False,
                                    max_new_tokens=max_tokens,
                                    num_beams=beam_width,
                                    num_return_sequences=beam_width)

        # Drop every pad token id from each returned sequence.
        return [([[
            token for token in beam
            if token != self.tokenizer.pad_token_id
        ] for beam in beams], texts) for beams, texts in raw_outputs]
Woosuk Kwon's avatar
Woosuk Kwon committed
547

548
549
    def generate_greedy_logprobs(
        self,
        prompts: list[str],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> list[list[torch.Tensor]]:
        """Greedy-decode each prompt and return full log-prob tensors.

        Returns:
            For each prompt, the list produced by
            `_hidden_states_to_seq_logprobs` from the generation's hidden
            states.
        """
        per_prompt_inputs = self.get_inputs(prompts,
                                            images=images,
                                            videos=videos,
                                            audios=audios)

        logprobs_per_prompt: list[list[torch.Tensor]] = []
        for model_inputs in per_prompt_inputs:
            generation = self.model.generate(
                **self.wrap_device(model_inputs),
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
                output_hidden_states=True,
                return_dict_in_generate=True,
                **kwargs,
            )
            logprobs_per_prompt.append(
                self._hidden_states_to_seq_logprobs(
                    generation.hidden_states))

        return logprobs_per_prompt

578
    def _hidden_states_to_seq_logprobs(
        self,
        hidden_states: tuple[tuple[torch.Tensor, ...], ...],
    ) -> list[torch.Tensor]:
        """Project hidden states through the output embeddings to log-probs.

        For each entry of `hidden_states`, element `[-1][0]` is multiplied
        by the transposed output-embedding weight (plus the bias, when the
        layer has one) and passed through a float32 log-softmax.

        Returns:
            One log-prob tensor per entry of `hidden_states`.
        """
        lm_head = self.model.get_output_embeddings()

        seq_logprobs: list[torch.Tensor] = []
        for step_states in hidden_states:
            final_states = step_states[-1][0]
            # Match the head's device and dtype before the matmul.
            projected_input = final_states.to(
                device=lm_head.weight.device,
                dtype=lm_head.weight.dtype,
            )
            logits = torch.matmul(projected_input, lm_head.weight.t())
            if getattr(lm_head, "bias", None) is not None:
                logits += lm_head.bias.unsqueeze(0)

            seq_logprobs.append(
                F.log_softmax(logits, dim=-1, dtype=torch.float32))

        return seq_logprobs

    def _hidden_states_to_logprobs(
        self,
        hidden_states: tuple[tuple[torch.Tensor, ...], ...],
        num_logprobs: int,
    ) -> tuple[list[dict[int, float]], int]:
        """Reduce hidden states to top-`num_logprobs` dicts per entry.

        Returns:
            `(logprob_dicts, output_len)`: one `{token_id: logprob}` dict per
            entry of `hidden_states`, and `len(hidden_states)`.
        """
        seq_logprobs = self._hidden_states_to_seq_logprobs(hidden_states)
        output_len = len(hidden_states)

        topk_dicts: list[dict[int, float]] = []
        for position, step_logprobs in enumerate(seq_logprobs):
            if position == 0:
                # drop prompt logprobs: keep only the last row of entry 0
                step_logprobs = step_logprobs[-1, :].reshape(1, -1)
            top = step_logprobs.topk(num_logprobs)
            topk_dicts.append({
                token_id.item(): logprob.item()
                for token_id, logprob in zip(top.indices[0], top.values[0])
            })

        return topk_dicts, output_len

628
629
    def generate_greedy_logprobs_limit(
        self,
        prompts: list[str],
        max_tokens: int,
        num_logprobs: int,
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
        videos: Optional[PromptVideoInput] = None,
        **kwargs: Any,
    ) -> list[TokensTextLogprobs]:
        """Greedy-decode each prompt, keeping the top-`num_logprobs` logprobs.

        Returns:
            For each prompt, `(output_ids, output_text, logprob_dicts)`: the
            trailing generated token ids, their decoded text, and one
            `{token_id: logprob}` dict per generated token.
        """
        per_prompt_inputs = self.get_inputs(prompts,
                                            images=images,
                                            videos=videos,
                                            audios=audios)

        results: list[TokensTextLogprobs] = []
        for model_inputs in per_prompt_inputs:
            generation = self.model.generate(
                **self.wrap_device(model_inputs),
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
                output_hidden_states=True,
                return_dict_in_generate=True,
                **kwargs,
            )

            token_logprobs, _ = self._hidden_states_to_logprobs(
                generation.hidden_states, num_logprobs)

            # The generated ids are the last len(token_logprobs) entries of
            # the first returned sequence.
            num_generated = len(token_logprobs)
            generated_ids = generation.sequences[0][-num_generated:]
            results.append((
                generated_ids.tolist(),
                self.tokenizer.decode(generated_ids),
                token_logprobs,
            ))

        return results

    def generate_encoder_decoder_greedy_logprobs_limit(
        self,
        encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
        max_tokens: int,
        num_logprobs: int,
        images: Optional[PromptImageInput] = None,
        **kwargs: Any,
    ) -> list[TokensTextLogprobs]:
        '''
        Greedy logprobs generation for encoder/decoder models run through
        the HuggingFace model held by this runner.

        Each encoder prompt goes through the processor (with its image, if
        any); a decoder prompt that is not None is tokenized and passed as
        `decoder_input_ids`.

        Returns:

        * For each prompt pair, `(output_ids, output_text, logprob_dicts)`
        '''
        results: list[TokensTextLogprobs] = []
        prompt_pairs = to_enc_dec_tuple_list(encoder_decoder_prompts)

        for idx, (encoder_prompt, decoder_prompt) in enumerate(prompt_pairs):
            processor_kwargs: dict[str, Any] = {
                "text": encoder_prompt,
                "return_tensors": "pt",
            }
            if images is not None and images[idx] is not None:
                processor_kwargs["images"] = images[idx]

            encoder_inputs = self.wrap_device(
                self.processor(**processor_kwargs))

            decoder_input_ids = None
            if decoder_prompt is not None:
                decoder_inputs = self.tokenizer(decoder_prompt,
                                                return_tensors="pt")
                decoder_input_ids = self.wrap_device(decoder_inputs.input_ids)

            generation = self.model.generate(
                decoder_input_ids=decoder_input_ids,
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
                output_hidden_states=True,
                return_dict_in_generate=True,
                **encoder_inputs,
                **kwargs,
            )

            token_logprobs, num_generated = self._hidden_states_to_logprobs(
                generation.decoder_hidden_states, num_logprobs)

            generated_ids = generation.sequences[0][-num_generated:]
            results.append((
                generated_ids.tolist(),
                self.tokenizer.decode(generated_ids),
                token_logprobs,
            ))

        return results

737
738
739
    def encode(self, prompts: list[str], *args,
               **kwargs) -> list[list[torch.Tensor]]:
        """Forward `prompts` and any extra arguments to `self.model.encode`."""
        encoded = self.model.encode(prompts, *args, **kwargs)
        return encoded
740

741
742
743
744
745
746
    def predict(self, prompts: list[list[str]], *args,
                **kwargs) -> torch.Tensor:
        """Call `self.model.predict`, always with `convert_to_tensor=True`."""
        scores = self.model.predict(prompts,
                                    *args,
                                    convert_to_tensor=True,
                                    **kwargs)
        return scores
747

748
749
750
751
    def __enter__(self):
        """Enter the `with` block; the runner itself is the context value."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Drop the model reference, then run the global cleanup.

        Returns None, so an exception raised inside the `with` block is
        not suppressed.
        """
        # Remove the attribute before the cleanup call runs.
        del self.model
        cleanup_dist_env_and_memory()
754

Woosuk Kwon's avatar
Woosuk Kwon committed
755

Cyrus Leung's avatar
Cyrus Leung committed
756
@pytest.fixture(scope="session")
def hf_runner():
    """Provide the `HfRunner` class itself (not an instance) to tests."""
    return HfRunner


class VllmRunner:
    """
    Test helper that wraps {class}`~vllm.LLM` with test-friendly defaults.

    The default values of some arguments have been modified from
    {class}`~vllm.LLM` as follows:

    - `trust_remote_code`: Set to `True` instead of `False` for convenience.
    - `seed`: Set to `0` instead of `None` for test reproducibility.
    - `max_model_len`: Set to `1024` instead of `None` to reduce memory usage.
    - `block_size`: Set to `16` (`64` on ROCm) instead of `None` to reduce
      memory usage.
    - `enable_chunked_prefill`: Set to `False` instead of `None` for
      test reproducibility.
    - `enforce_eager`: Set to `False` to test CUDA graph.
    """

    def __init__(
        self,
        model_name: str,
        task: TaskOption = "auto",
        tokenizer_name: Optional[str] = None,
        tokenizer_mode: str = "auto",
        trust_remote_code: bool = True,
        seed: Optional[int] = 0,
        max_model_len: int = 1024,
        dtype: str = "auto",
        disable_log_stats: bool = True,
        tensor_parallel_size: int = 1,
        block_size: int = 16 if not current_platform.is_rocm() else 64,
        enable_chunked_prefill: Optional[bool] = False,
        swap_space: int = 4,
        enforce_eager: Optional[bool] = False,
        **kwargs,
    ) -> None:
        """Build the wrapped `LLM` and store it as `self.model`.

        Every named argument is passed to `LLM` under the matching keyword
        (`model_name` -> `model`, `tokenizer_name` -> `tokenizer`); any
        additional `kwargs` are forwarded unchanged.
        """
        self.model = LLM(
            model=model_name,
            task=task,
            tokenizer=tokenizer_name,
            tokenizer_mode=tokenizer_mode,
            trust_remote_code=trust_remote_code,
            dtype=dtype,
            seed=seed,
            swap_space=swap_space,
            enforce_eager=enforce_eager,
            disable_log_stats=disable_log_stats,
            tensor_parallel_size=tensor_parallel_size,
            max_model_len=max_model_len,
            block_size=block_size,
            enable_chunked_prefill=enable_chunked_prefill,
            **kwargs,
        )

    def get_inputs(
        self,
        prompts: Union[list[str], list[torch.Tensor]],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
    ) -> list[TextPrompt]:
        """Pair each prompt with its multimodal data as a `TextPrompt`.

        A `str` prompt is stored under the `"prompt"` key, anything else
        under `"prompt_embeds"`. `"multi_modal_data"` is `None` when no
        image/video/audio entry is present for that prompt.

        Raises:
            ValueError: If a non-None multimodal input does not have the
                same length as `prompts`.
        """
        if any(x is not None and len(x) != len(prompts)
               for x in [images, videos, audios]):
            raise ValueError(
                "All non-None multimodal inputs must have the same length as "
                "prompts")

        inputs = []
        for i, prompt in enumerate(prompts):
            multi_modal_data = {}
            if images is not None and (image := images[i]) is not None:
                multi_modal_data["image"] = image
            if videos is not None and (video := videos[i]) is not None:
                multi_modal_data["video"] = video
            if audios is not None and (audio := audios[i]) is not None:
                multi_modal_data["audio"] = audio

            text_prompt_kwargs = {
                ("prompt" if isinstance(prompt, str) else "prompt_embeds"):
                prompt,
                "multi_modal_data": multi_modal_data or None
            }
            inputs.append(TextPrompt(**text_prompt_kwargs))

        return inputs

    def generate(
        self,
        prompts: Union[list[str], list[torch.Tensor]],
        sampling_params: SamplingParams,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> list[tuple[list[list[int]], list[str]]]:
        """Generate completions for `prompts`.

        Returns one `(token_id_lists, texts)` tuple per request, each list
        holding one entry per sample. The prompt token ids and the prompt
        text (empty string when the request has none) are prepended to
        every sample.
        """
        inputs = self.get_inputs(prompts,
                                 images=images,
                                 videos=videos,
                                 audios=audios)

        req_outputs = self.model.generate(inputs,
                                          sampling_params=sampling_params,
                                          **kwargs)

        outputs: list[tuple[list[list[int]], list[str]]] = []
        for req_output in req_outputs:
            prompt_str = req_output.prompt
            prompt_ids = req_output.prompt_token_ids
            req_sample_output_ids: list[list[int]] = []
            req_sample_output_strs: list[str] = []
            for sample in req_output.outputs:
                output_str = sample.text
                output_ids = list(sample.token_ids)
                req_sample_output_ids.append(prompt_ids + output_ids)
                req_sample_output_strs.append((prompt_str or "") + output_str)
            outputs.append((req_sample_output_ids, req_sample_output_strs))
        return outputs

    @staticmethod
    def _final_steps_generate_w_logprobs(
        req_outputs: list[RequestOutput],
    ) -> list[TokensTextLogprobsPromptLogprobs]:
        """Convert request outputs into logprob result tuples.

        Returns one `(token_ids, text, logprobs, prompt_logprobs)` tuple
        per request. When a request has several samples, only the last
        one is reported.
        """
        outputs: list[TokensTextLogprobsPromptLogprobs] = []
        for req_output in req_outputs:
            assert len(req_output.outputs) > 0
            # Only the final sample of each request is kept.
            sample = req_output.outputs[-1]
            outputs.append((list(sample.token_ids), sample.text,
                            sample.logprobs, req_output.prompt_logprobs))
        return outputs

    @staticmethod
    def _omit_unrequested_prompt_logprobs(
        results: list[TokensTextLogprobsPromptLogprobs],
        sampling_params: SamplingParams,
    ) -> Union[list[TokensTextLogprobs],
               list[TokensTextLogprobsPromptLogprobs]]:
        """Drop the trailing prompt-logprobs entry of each result tuple
        when `sampling_params.prompt_logprobs` is `None`."""
        if sampling_params.prompt_logprobs is None:
            return [x[0:-1] for x in results]
        return results

    def generate_w_logprobs(
        self,
        prompts: list[str],
        sampling_params: SamplingParams,
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
        videos: Optional[PromptVideoInput] = None,
        **kwargs: Any,
    ) -> Union[list[TokensTextLogprobs],
               list[TokensTextLogprobsPromptLogprobs]]:
        """Generate completions and return them together with logprobs.

        Prompt logprobs are included in the result tuples only when
        `sampling_params.prompt_logprobs` is not `None`.
        """
        inputs = self.get_inputs(prompts,
                                 images=images,
                                 videos=videos,
                                 audios=audios)

        req_outputs = self.model.generate(inputs,
                                          sampling_params=sampling_params,
                                          **kwargs)

        toks_str_logsprobs_prompt_logprobs = (
            self._final_steps_generate_w_logprobs(req_outputs))
        # Omit prompt logprobs if not required by sampling params
        return self._omit_unrequested_prompt_logprobs(
            toks_str_logsprobs_prompt_logprobs, sampling_params)

    def generate_encoder_decoder_w_logprobs(
        self,
        encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
        sampling_params: SamplingParams,
    ) -> Union[list[TokensTextLogprobs],
               list[TokensTextLogprobsPromptLogprobs]]:
        """
        Logprobs generation for vLLM encoder/decoder models

        `sampling_params.logprobs` must not be `None`. Prompt logprobs are
        included in the result tuples only when
        `sampling_params.prompt_logprobs` is not `None`.
        """

        assert sampling_params.logprobs is not None
        req_outputs = self.model.generate(encoder_decoder_prompts,
                                          sampling_params=sampling_params)
        toks_str_logsprobs_prompt_logprobs = (
            self._final_steps_generate_w_logprobs(req_outputs))
        # Omit prompt logprobs if not required by sampling params
        return self._omit_unrequested_prompt_logprobs(
            toks_str_logsprobs_prompt_logprobs, sampling_params)

    def generate_greedy(
        self,
        prompts: Union[list[str], list[torch.Tensor]],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> list[tuple[list[int], str]]:
        """Generate with `temperature=0.0` and at most `max_tokens` tokens.

        Returns the first sample of each request as `(token_ids, text)`.
        """
        greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
        outputs = self.generate(prompts,
                                greedy_params,
                                images=images,
                                videos=videos,
                                audios=audios,
                                **kwargs)
        return [(output_ids[0], output_str[0])
                for output_ids, output_str in outputs]

    def generate_greedy_logprobs(
        self,
        prompts: list[str],
        max_tokens: int,
        num_logprobs: int,
        num_prompt_logprobs: Optional[int] = None,
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
        videos: Optional[PromptVideoInput] = None,
        stop_token_ids: Optional[list[int]] = None,
        stop: Optional[list[str]] = None,
        **kwargs: Any,
    ) -> Union[list[TokensTextLogprobs],
               list[TokensTextLogprobsPromptLogprobs]]:
        """Generate with `temperature=0.0` and return logprobs.

        Builds the `SamplingParams` from the given limits and stop
        conditions, then delegates to `generate_w_logprobs`.
        """
        greedy_logprobs_params = SamplingParams(
            temperature=0.0,
            max_tokens=max_tokens,
            logprobs=num_logprobs,
            prompt_logprobs=num_prompt_logprobs,
            stop_token_ids=stop_token_ids,
            stop=stop)

        return self.generate_w_logprobs(prompts,
                                        greedy_logprobs_params,
                                        images=images,
                                        audios=audios,
                                        videos=videos,
                                        **kwargs)

    def generate_encoder_decoder_greedy_logprobs(
        self,
        encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
        max_tokens: int,
        num_logprobs: int,
        num_prompt_logprobs: Optional[int] = None,
        skip_special_tokens: bool = True,
    ) -> Union[list[TokensTextLogprobs],
               list[TokensTextLogprobsPromptLogprobs]]:
        """
        Greedy logprobs generation for vLLM encoder/decoder models
        """
        greedy_logprobs_params = SamplingParams(
            temperature=0.0,
            max_tokens=max_tokens,
            logprobs=num_logprobs,
            prompt_logprobs=(num_prompt_logprobs),
            skip_special_tokens=skip_special_tokens,
        )

        return self.generate_encoder_decoder_w_logprobs(
            encoder_decoder_prompts, greedy_logprobs_params)

    def generate_beam_search(
        self,
        prompts: list[str],
        beam_width: int,
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
    ) -> list[tuple[list[list[int]], list[str]]]:
        """Run beam search over `prompts`.

        Returns one `(token_id_lists, texts)` tuple per prompt, with one
        entry per sequence of the beam search output.
        """
        inputs = self.get_inputs(prompts,
                                 images=images,
                                 videos=videos,
                                 audios=audios)

        outputs = self.model.beam_search(
            inputs,
            BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens))
        returned_outputs = []
        for output in outputs:
            token_ids = [x.tokens for x in output.sequences]
            texts = [x.text for x in output.sequences]
            returned_outputs.append((token_ids, texts))
        return returned_outputs

    def classify(self, prompts: list[str]) -> list[list[float]]:
        """Return `outputs.probs` of each request from the model's
        `classify`."""
        req_outputs = self.model.classify(prompts)
        return [req_output.outputs.probs for req_output in req_outputs]

    def embed(self,
              prompts: list[str],
              images: Optional[PromptImageInput] = None,
              videos: Optional[PromptVideoInput] = None,
              audios: Optional[PromptAudioInput] = None,
              *args,
              **kwargs) -> list[list[float]]:
        """Return `outputs.embedding` of each request from the model's
        `embed`; extra arguments are forwarded to it unchanged."""
        inputs = self.get_inputs(prompts,
                                 images=images,
                                 videos=videos,
                                 audios=audios)

        req_outputs = self.model.embed(inputs, *args, **kwargs)
        return [req_output.outputs.embedding for req_output in req_outputs]

    def encode(self, prompts: list[str]) -> list[list[float]]:
        """Return `outputs.data` of each request from the model's
        `encode`."""
        req_outputs = self.model.encode(prompts)
        return [req_output.outputs.data for req_output in req_outputs]

    def score(
        self,
        text_1: Union[str, list[str]],
        text_2: Union[str, list[str]],
        *args,
        **kwargs,
    ) -> list[float]:
        """Return `outputs.score` of each request from the model's
        `score`; extra arguments are forwarded to it unchanged."""
        req_outputs = self.model.score(text_1, text_2, *args, **kwargs)
        return [req_output.outputs.score for req_output in req_outputs]

    def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
        """Forward `func` to the engine's model executor `apply_model` and
        return its result."""
        executor = self.model.llm_engine.model_executor
        return executor.apply_model(func)

    def __enter__(self):
        """Enter the `with` block, returning this runner itself."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Delete the wrapped model, then call
        `cleanup_dist_env_and_memory()`."""
        del self.model
        cleanup_dist_env_and_memory()
1075

Woosuk Kwon's avatar
Woosuk Kwon committed
1076

1077
@pytest.fixture(scope="session")
def vllm_runner():
    """Session-scoped fixture returning the `VllmRunner` class itself.

    The class (not an instance) is returned, so each test constructs its
    own runner.
    """
    return VllmRunner
1080
1081


1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
@pytest.fixture()
def temporary_enable_log_propagate():
    """Turn on propagation for the "vllm" logger while a test runs.

    The logger's previous `propagate` value is restored afterwards rather
    than being forced to `False`, so a logger that was already propagating
    is left as it was found.
    """
    import logging
    logger = logging.getLogger("vllm")
    previous_propagate = logger.propagate
    logger.propagate = True
    try:
        yield
    finally:
        # Restore whatever was configured before the fixture ran.
        logger.propagate = previous_propagate


@pytest.fixture()
def caplog_vllm(temporary_enable_log_propagate, caplog):
    """Provide `caplog` for capturing vllm logs.

    `caplog` depends on logs being propagated to the root logger, so the
    `temporary_enable_log_propagate` fixture is requested to enable
    `propagate=True` temporarily.
    """
    yield caplog
1096
1097
1098
1099
1100
1101
1102


@pytest.fixture(scope="session")
def num_gpus_available():
    """Get number of GPUs without initializing the CUDA context
    in current process.

    The count is whatever `current_platform.device_count()` reports.
    """
    # Imported here, at call time, rather than at module import.
    from vllm.platforms import current_platform
    return current_platform.device_count()
1105
1106


1107
# temp_dir = tempfile.gettempdir()
zhuwenwen's avatar
zhuwenwen committed
1108
1109
1110
1111
# Directories under `models_path_prefix` that hold the dummy model
# checkouts used by the `dummy_*_path` fixtures.
_dummy_opt_path = os.path.join(models_path_prefix, "dummy_opt")
_dummy_llava_path = os.path.join(models_path_prefix, "dummy_llava")
_dummy_gemma2_embedding_path = os.path.join(models_path_prefix,
                                            "dummy_gemma2_embedding")

1112
1113
1114
1115


@pytest.fixture
def dummy_opt_path():
    """Return a local copy of `facebook/opt-125m` with a dummy architecture.

    If the target directory does not exist yet, the repository is
    downloaded into it (skipping the weight files matched by
    `ignore_patterns`) and the `architectures` entry of its `config.json`
    is rewritten to `["MyOPTForCausalLM"]`. An existing directory is
    returned as-is.
    """
    json_path = os.path.join(_dummy_opt_path, "config.json")
    if not os.path.exists(_dummy_opt_path):
        snapshot_download(repo_id="facebook/opt-125m",
                          local_dir=_dummy_opt_path,
                          ignore_patterns=[
                              "*.bin", "*.bin.index.json", "*.pt", "*.h5",
                              "*.msgpack"
                          ])
        assert os.path.exists(json_path)
        with open(json_path) as f:
            config = json.load(f)
        config["architectures"] = ["MyOPTForCausalLM"]
        with open(json_path, "w") as f:
            json.dump(config, f)
    return _dummy_opt_path

1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149

# Pytest hook that builds the report after a test has run.
@pytest.hookimpl(tryfirst=True, hookwrapper=True)
def pytest_runtest_makereport(item, call):
    """Attach a browser screenshot to the report of a failed test call."""
    # Fetch the test result.
    outcome = yield
    report = outcome.get_result()

    # Only a failure during the "call" phase is of interest.
    if report.when != "call" or not report.failed:
        return

    # A screenshot can only be taken when the test has a browser instance.
    if not hasattr(item, "funcargs") or "browser" not in item.funcargs:
        return

    screenshot_path = "screenshot.png"  # Where the screenshot is saved.
    item.funcargs["browser"].save_screenshot(screenshot_path)

    # Add the screenshot only if the report has an `extra` attribute.
    if hasattr(report, "extra"):
        report.extra.append(pytest_html.extras.image(screenshot_path))
zhuwenwen's avatar
zhuwenwen committed
1150
1151


1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
@pytest.fixture
def dummy_llava_path():
    """Return a local copy of `llava-hf/llava-1.5-7b-hf` with a dummy
    architecture.

    If the target directory does not exist yet, the repository is
    downloaded into it (skipping the weight files matched by
    `ignore_patterns`) and the `architectures` entry of its `config.json`
    is rewritten to `["MyLlava"]`. An existing directory is returned as-is.
    """
    json_path = os.path.join(_dummy_llava_path, "config.json")
    if not os.path.exists(_dummy_llava_path):
        snapshot_download(repo_id="llava-hf/llava-1.5-7b-hf",
                          local_dir=_dummy_llava_path,
                          ignore_patterns=[
                              "*.bin", "*.bin.index.json", "*.pt", "*.h5",
                              "*.msgpack"
                          ])
        assert os.path.exists(json_path)
        with open(json_path) as f:
            config = json.load(f)
        config["architectures"] = ["MyLlava"]
        with open(json_path, "w") as f:
            json.dump(config, f)
    return _dummy_llava_path
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181


@pytest.fixture
def dummy_gemma2_embedding_path():
    """Return a local copy of `BAAI/bge-multilingual-gemma2` with a dummy
    architecture.

    If the target directory does not exist yet, the repository is
    downloaded into it (skipping the weight files matched by
    `ignore_patterns`) and the `architectures` entry of its `config.json`
    is rewritten to `["MyGemma2Embedding"]`. An existing directory is
    returned as-is.
    """
    json_path = os.path.join(_dummy_gemma2_embedding_path, "config.json")
    if not os.path.exists(_dummy_gemma2_embedding_path):
        snapshot_download(repo_id="BAAI/bge-multilingual-gemma2",
                          local_dir=_dummy_gemma2_embedding_path,
                          ignore_patterns=[
                              "*.bin", "*.bin.index.json", "*.pt", "*.h5",
                              "*.msgpack"
                          ])
        assert os.path.exists(json_path)
        with open(json_path) as f:
            config = json.load(f)
        config["architectures"] = ["MyGemma2Embedding"]
        with open(json_path, "w") as f:
            json.dump(config, f)
    return _dummy_gemma2_embedding_path
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206


# Add the flag `--optional` to allow running tests
# that are marked with @pytest.mark.optional
def pytest_addoption(parser):
    """Register the `--optional` boolean flag (off by default)."""
    option_settings = {
        "action": "store_true",
        "default": False,
        "help": "run optional test",
    }
    parser.addoption("--optional", **option_settings)


def pytest_collection_modifyitems(config, items):
    """Mark tests carrying the `optional` keyword as skipped.

    Nothing is changed when `--optional` was given on the command line.
    """
    if not config.getoption("--optional"):
        skip_marker = pytest.mark.skip(
            reason="need --optional option to run")
        for test_item in items:
            if "optional" in test_item.keywords:
                test_item.add_marker(skip_marker)
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218


@pytest.fixture(scope="session")
def cli_config_file():
    """Return the path to the CLI config file."""
    config_dir = os.path.join(_TEST_DIR, "config")
    return os.path.join(config_dir, "test_config.yaml")


@pytest.fixture(scope="session")
def cli_config_file_with_model():
    """Return the path to the CLI config file with model."""
    config_dir = os.path.join(_TEST_DIR, "config")
    return os.path.join(config_dir, "test_config_with_model.yaml")