conftest.py 39.3 KB
Newer Older
1
2
# SPDX-License-Identifier: Apache-2.0

3
import json
4
import os
5
import tempfile
6
from collections import UserList
7
from enum import Enum
zhuwenwen's avatar
zhuwenwen committed
8

9
from typing import Any, Callable, Optional, TypedDict, TypeVar, Union
10
11
import pytest
import pytest_html
Woosuk Kwon's avatar
Woosuk Kwon committed
12

13
import numpy as np
Woosuk Kwon's avatar
Woosuk Kwon committed
14
15
import pytest
import torch
16
import torch.nn as nn
17
import torch.nn.functional as F
18
from huggingface_hub import snapshot_download
19
from PIL import Image
20
21
from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
                          BatchEncoding, BatchFeature)
22
from transformers.models.auto.auto_factory import _BaseAutoModelClass
Woosuk Kwon's avatar
Woosuk Kwon committed
23

24
25
from tests.models.utils import (TokensTextLogprobs,
                                TokensTextLogprobsPromptLogprobs)
Woosuk Kwon's avatar
Woosuk Kwon committed
26
from vllm import LLM, SamplingParams
27
from vllm.assets.image import ImageAsset
28
from vllm.assets.video import VideoAsset
29
from vllm.config import TaskOption, TokenizerPoolConfig, _get_and_verify_dtype
30
from vllm.connections import global_http_connection
31
from vllm.distributed import (cleanup_dist_env_and_memory,
32
33
                              init_distributed_environment,
                              initialize_model_parallel)
34
from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
youkaichao's avatar
youkaichao committed
35
36
                         TokensPrompt, to_enc_dec_tuple_list,
                         zip_enc_dec_prompts)
37
from vllm.logger import init_logger
38
from vllm.outputs import RequestOutput
39
from vllm.sampling_params import BeamSearchParams
zhuwenwen's avatar
zhuwenwen committed
40

41
from vllm.utils import cuda_device_count_stateless, is_list_of
zhuwenwen's avatar
zhuwenwen committed
42
from .utils import models_path_prefix
43

44
logger = init_logger(__name__)

# Directory that holds this conftest; every data path below is built from it.
_TEST_DIR = os.path.dirname(__file__)
# Files whose lines are served by the ``example_prompts`` fixtures.
_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
# Files whose lines are served by the ``example_long_prompts`` fixture.
_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
# File read in full by the ``example_system_message`` fixture.
_SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")
50

Cyrus Leung's avatar
Cyrus Leung committed
51
# Type of a single multi-modal item (an image, a video, an audio clip, ...).
_M = TypeVar("_M")

# Multi-modal data for a batch of prompts: the outer list is indexed by
# prompt, and each entry is either one item or a list of items.
_PromptMultiModalInput = Union[list[_M], list[list[_M]]]

PromptImageInput = _PromptMultiModalInput[Image.Image]
# Each audio item is an ``(audio array, sampling rate)`` pair.
PromptAudioInput = _PromptMultiModalInput[tuple[np.ndarray, int]]
PromptVideoInput = _PromptMultiModalInput[np.ndarray]
58

59

60
def _read_prompts(filename: str) -> list[str]:
    """Return every line of *filename* as a list.

    Lines are returned exactly as stored in the file, i.e. each one keeps
    its trailing newline character (if it has one).
    """
    with open(filename) as prompt_file:
        return list(prompt_file)
Woosuk Kwon's avatar
Woosuk Kwon committed
64
65


66
67
68
class _ImageAssetPrompts(TypedDict):
    """Prompt text for each test image, keyed by the image asset name."""
    # Prompt paired with the "stop_sign" image asset.
    stop_sign: str
    # Prompt paired with the "cherry_blossom" image asset.
    cherry_blossom: str
69
70


71
72
class _ImageAssetsBase(UserList[ImageAsset]):
    """List-like container of :class:`ImageAsset` objects (adds nothing)."""
73

74
75

class _ImageAssets(_ImageAssetsBase):
    """The fixed list of image assets used by the tests.

    Iteration order is ``stop_sign`` first, then ``cherry_blossom``.
    """

    def __init__(self) -> None:
        asset_names = ("stop_sign", "cherry_blossom")
        super().__init__([ImageAsset(name) for name in asset_names])

    def prompts(self, prompts: _ImageAssetPrompts) -> list[str]:
        """
        Convenience method to define the prompt for each test image.

        The returned list follows the same order as the assets yielded
        when iterating over this object.
        """
        stop_sign_prompt = prompts["stop_sign"]
        cherry_blossom_prompt = prompts["cherry_blossom"]
        return [stop_sign_prompt, cherry_blossom_prompt]
91
92


93
94
95
96
class _VideoAssetPrompts(TypedDict):
    """Prompt text for each test video, keyed by the video asset name."""
    # Prompt paired with the "sample_demo_1" video asset.
    sample_demo_1: str


97
98
class _VideoAssetsBase(UserList[VideoAsset]):
    """List-like container of :class:`VideoAsset` objects (adds nothing)."""
99
100
101
102
103
104
105
106
107


class _VideoAssets(_VideoAssetsBase):
    """The fixed list of video assets used by the tests (one entry)."""

    def __init__(self) -> None:
        asset_files = ("sample_demo_1.mp4", )
        super().__init__([VideoAsset(name) for name in asset_files])

    def prompts(self, prompts: _VideoAssetPrompts) -> list[str]:
        """Return the prompt for each test video, in asset order."""
        sample_demo_prompt = prompts["sample_demo_1"]
        return [sample_demo_prompt]


112
113
# Module-level asset lists; the ``image_assets`` / ``video_assets`` fixtures
# hand out these same objects.
IMAGE_ASSETS = _ImageAssets()
"""Singleton instance of :class:`_ImageAssets`."""

VIDEO_ASSETS = _VideoAssets()
"""Singleton instance of :class:`_VideoAssets`."""
116
117


118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
@pytest.fixture(scope="function", autouse=True)
def cleanup_VLLM_USE_V1(monkeypatch):
    """
    Make sure "VLLM_USE_V1" does not leak from one test into the next.

    The V1 oracle sets "VLLM_USE_V1" during loading, so running a test can
    change the environment variable. Once monkeypatch has touched the
    variable, it restores the pre-test state when the test finishes, which
    undoes whatever vLLM did to it in between.

    This fixture is autouse, so it applies to every test.
    """
    if "VLLM_USE_V1" in os.environ:
        return

    # The variable is unset: set it and delete it again straight away.
    # The environment is left unchanged, but monkeypatch now tracks the
    # variable and will clean it up on exit if vLLM modifies it.
    monkeypatch.setenv("VLLM_USE_V1", "")
    monkeypatch.delenv("VLLM_USE_V1")


Joe Runde's avatar
Joe Runde committed
138
@pytest.fixture(params=[True, False])
def run_with_both_engines(request, monkeypatch):
    """Run the requesting test twice: once with V1 enabled, once without.

    Tests carrying the ``skip_v1`` marker are skipped for the V1 run, so
    they only execute with ``VLLM_USE_V1=0``.
    """
    v1_enabled = request.param
    marked_skip_v1 = request.node.get_closest_marker("skip_v1")

    if v1_enabled and marked_skip_v1:
        pytest.skip("Skipping test on vllm V1")

    monkeypatch.setenv('VLLM_USE_V1', '1' if v1_enabled else '0')

    yield
Joe Runde's avatar
Joe Runde committed
153
154


155
156
157
158
159
160
161
@pytest.fixture(autouse=True)
def init_test_http_connection():
    """Turn off client reuse on the global HTTP connection for every test.

    pytest_asyncio may use a different event loop per test, so the async
    client has to be created anew rather than reused.
    """
    global_http_connection.reuse_client = False


162
163
164
165
166
167
168
169
170
171
172
173
@pytest.fixture
def dist_init():
    """Set up a single-process distributed environment around a test.

    Initializes the distributed environment with ``world_size=1``,
    ``rank=0`` and the ``nccl`` backend, using a temporary file as the
    ``file://`` init method, then initializes model parallelism as
    ``(1, 1)``. After the test, the distributed state is cleaned up and
    the temporary file is removed.
    """
    # mkstemp() returns an *open* OS-level descriptor together with the
    # path. Only the path is needed here, so close the descriptor right
    # away instead of leaking one per test.
    fd, temp_file = tempfile.mkstemp()
    os.close(fd)
    try:
        init_distributed_environment(
            world_size=1,
            rank=0,
            distributed_init_method=f"file://{temp_file}",
            local_rank=0,
            backend="nccl",
        )
        initialize_model_parallel(1, 1)
        yield
        cleanup_dist_env_and_memory()
    finally:
        # Best-effort removal of the init file; it may already be gone.
        try:
            os.remove(temp_file)
        except FileNotFoundError:
            pass
175
176


177
@pytest.fixture()
def should_do_global_cleanup_after_test(request) -> bool:
    """Tell ``cleanup_fixture`` whether to run the global cleanup.

    Returns ``False`` when the test carries the ``skip_global_cleanup``
    marker and ``True`` otherwise. Subdirectories can also skip the
    cleanup by overriding this fixture, which can provide a ~10x speedup
    for non-GPU unit tests since they don't need to initialize torch.
    """
    skip_marker = request.node.get_closest_marker("skip_global_cleanup")
    return not skip_marker
185
186


187
@pytest.fixture(autouse=True)
def cleanup_fixture(should_do_global_cleanup_after_test: bool):
    """After every test, run the global cleanup unless it was opted out."""
    yield
    if not should_do_global_cleanup_after_test:
        return
    cleanup_dist_env_and_memory()
192
193


194
195
196
197
198
199
@pytest.fixture(autouse=True)
def dynamo_reset():
    """Call ``torch._dynamo.reset()`` after every test."""
    yield
    # Teardown: runs once the test body has finished.
    torch._dynamo.reset()


Woosuk Kwon's avatar
Woosuk Kwon committed
200
@pytest.fixture
def example_prompts() -> list[str]:
    """Return all lines of the files in ``_TEST_PROMPTS``, in file order."""
    return [
        line for path in _TEST_PROMPTS for line in _read_prompts(path)
    ]


208
209
210
211
212
213
@pytest.fixture
def example_system_message() -> str:
    """Return the full text of the file at ``_SYS_MSG``."""
    with open(_SYS_MSG) as sys_msg_file:
        contents = sys_msg_file.read()
    return contents


214
215
216
217
218
219
220
class DecoderPromptType(Enum):
    """Kinds of decoder prompt; for encoder/decoder models only."""
    # These members key the dict returned by the
    # ``example_encoder_decoder_prompts`` fixture.
    CUSTOM = 1
    NONE = 2
    EMPTY_STR = 3


221
@pytest.fixture
def example_encoder_decoder_prompts(
) -> dict[DecoderPromptType, list[ExplicitEncoderDecoderPrompt]]:
    '''
    Build example encoder/decoder prompt pairs for each decoder prompt type.

    The encoder prompts are the lines of the files in ``_TEST_PROMPTS``.
    Each encoder prompt is paired by index with a decoder prompt:

    * ``DecoderPromptType.NONE``: every decoder prompt is ``None``
    * ``DecoderPromptType.EMPTY_STR``: every decoder prompt is ``""``
    * ``DecoderPromptType.CUSTOM``: the decoder prompts are the encoder
      prompt list in reverse order

    Returns:

    * Dict mapping each decoder prompt type to the result of
      ``zip_enc_dec_prompts`` for the corresponding pairing
    '''
    encoder_prompts = [
        line for path in _TEST_PROMPTS for line in _read_prompts(path)
    ]
    num_prompts = len(encoder_prompts)

    decoder_prompts_by_type = {
        DecoderPromptType.NONE: [None] * num_prompts,
        DecoderPromptType.EMPTY_STR: [""] * num_prompts,
        DecoderPromptType.CUSTOM: encoder_prompts[::-1],
    }

    return {
        prompt_type: zip_enc_dec_prompts(encoder_prompts, decoder_prompts)
        for prompt_type, decoder_prompts in decoder_prompts_by_type.items()
    }


254
@pytest.fixture
def example_long_prompts() -> list[str]:
    """Return all lines of the files in ``_LONG_PROMPTS``, in file order."""
    return [
        line for path in _LONG_PROMPTS for line in _read_prompts(path)
    ]
Woosuk Kwon's avatar
Woosuk Kwon committed
260
261


262
263
264
265
266
@pytest.fixture(scope="session")
def image_assets() -> _ImageAssets:
    """Session-scoped access to the module-level ``IMAGE_ASSETS`` object."""
    return IMAGE_ASSETS


267
268
269
270
271
@pytest.fixture(scope="session")
def video_assets() -> _VideoAssets:
    """Session-scoped access to the module-level ``VIDEO_ASSETS`` object."""
    return VIDEO_ASSETS


272
# Types accepted (and returned unchanged in kind) by ``HfRunner.wrap_device``.
_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict)
# Unconstrained generic type variable.
_R = TypeVar("_R")
274

Woosuk Kwon's avatar
Woosuk Kwon committed
275
276
277

class HfRunner:
    """Test helper that loads a HuggingFace model and runs it directly.

    Depending on the constructor flags, the model is loaded through
    ``sentence_transformers.SentenceTransformer``,
    ``sentence_transformers.CrossEncoder`` or ``auto_cls.from_pretrained``.
    An ``AutoProcessor`` is always loaded; a tokenizer is loaded unless
    ``skip_tokenizer_init`` is set, in which case the processor's tokenizer
    is used.

    Instances can be used as context managers: leaving the ``with`` block
    deletes the model and calls ``cleanup_dist_env_and_memory()``.
    """

    def get_default_device(self):
        """Return ``"cpu"`` on the CPU platform and ``"cuda"`` otherwise."""
        from vllm.platforms import current_platform

        return ("cpu" if current_platform.is_cpu() else "cuda")

    def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
        """Move *x* to *device* (default: ``self.device``).

        ``None`` and ``bool`` values are returned unchanged, dicts are
        processed value by value (recursively) into a new dict, and objects
        whose ``device.type`` already equals the target are returned as-is.
        Everything else is moved with ``x.to(device)``.
        """
        if x is None or isinstance(x, (bool, )):
            return x

        if device is None:
            device = self.device

        if isinstance(x, dict):
            return {k: self.wrap_device(v, device) for k, v in x.items()}

        if hasattr(x, "device") and x.device.type == device:
            return x

        return x.to(device)

    def __init__(
        self,
        model_name: str,
        dtype: str = "auto",
        *,
        model_kwargs: Optional[dict[str, Any]] = None,
        is_sentence_transformer: bool = False,
        is_cross_encoder: bool = False,
        skip_tokenizer_init: bool = False,
        auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
    ) -> None:
        """Load the config, model, tokenizer and processor for *model_name*.

        Args:
            model_name: Name or path passed to every ``from_pretrained``
                call / model constructor.
            dtype: Dtype specification resolved against the model config
                via ``_get_and_verify_dtype``.
            model_kwargs: Extra keyword arguments for the model loader.
                ``torch_dtype`` is filled in with the resolved dtype when
                absent. The caller's dict is not modified.
            is_sentence_transformer: Load with ``SentenceTransformer``.
            is_cross_encoder: Load with ``CrossEncoder`` (only checked when
                ``is_sentence_transformer`` is false).
            skip_tokenizer_init: Do not load a tokenizer with
                ``AutoTokenizer``; use ``self.processor.tokenizer`` instead.
            auto_cls: Auto-model class used when neither of the
                sentence-transformers flags is set.
        """
        self.model_name = model_name

        self.config = AutoConfig.from_pretrained(
            model_name,
            trust_remote_code=True,
        )
        self.device = self.get_default_device()
        self.dtype = torch_dtype = _get_and_verify_dtype(self.config, dtype)

        # Work on a copy so the ``setdefault`` below never mutates a dict
        # owned (and possibly reused) by the caller.
        model_kwargs = dict(model_kwargs) if model_kwargs is not None else {}
        model_kwargs.setdefault("torch_dtype", torch_dtype)

        if is_sentence_transformer:
            # Lazy init required for AMD CI
            from sentence_transformers import SentenceTransformer

            self.model = SentenceTransformer(
                model_name,
                device=self.device,
                model_kwargs=model_kwargs,
                trust_remote_code=True,
            )
        elif is_cross_encoder:
            # Lazy init required for AMD CI
            from sentence_transformers import CrossEncoder

            self.model = CrossEncoder(
                model_name,
                device=self.device,
                automodel_args=model_kwargs,
                trust_remote_code=True,
            )
        else:
            model = auto_cls.from_pretrained(
                model_name,
                trust_remote_code=True,
                **model_kwargs,
            )

            # Only move the model when it is not bitsandbytes-quantized and
            # its parameters live on fewer than two distinct devices.
            if (getattr(model, "quantization_method", None) != "bitsandbytes"
                    and len({p.device
                             for p in model.parameters()}) < 2):
                model = model.to(self.device)

            self.model = model

        if not skip_tokenizer_init:
            self.tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                torch_dtype=torch_dtype,
                trust_remote_code=True,
            )

        # don't put this import at the top level
        # it will call torch.cuda.device_count()
        from transformers import AutoProcessor  # noqa: F401
        self.processor = AutoProcessor.from_pretrained(
            model_name,
            torch_dtype=torch_dtype,
            trust_remote_code=True,
        )
        if skip_tokenizer_init:
            self.tokenizer = self.processor.tokenizer

    def get_inputs(
        self,
        prompts: list[str],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
    ) -> list[Union[BatchFeature, BatchEncoding]]:
        """Run ``self.processor`` on each prompt and its multi-modal data.

        Every multi-modal list that is given must have one entry per
        prompt; a ``None`` entry means "no data of this kind for that
        prompt". ``BatchFeature`` results are cast to ``self.dtype``.

        Returns:
            One processor output per prompt, in prompt order.
        """
        if images is not None:
            assert len(prompts) == len(images)

        if videos is not None:
            assert len(prompts) == len(videos)

        if audios is not None:
            assert len(prompts) == len(audios)

        all_inputs: list[Union[BatchFeature, BatchEncoding]] = []
        for i, prompt in enumerate(prompts):
            processor_kwargs: dict[str, Any] = {
                "text": prompt,
                "return_tensors": "pt",
            }
            if images is not None and (image := images[i]) is not None:
                processor_kwargs["images"] = image
            if videos is not None and (video := videos[i]) is not None:
                processor_kwargs["videos"] = video
            if audios is not None and (audio_tuple := audios[i]) is not None:
                # Audio items are (audio, sampling rate) pairs.
                audio, sr = audio_tuple
                processor_kwargs["audio"] = audio
                processor_kwargs["sampling_rate"] = sr

            inputs = self.processor(**processor_kwargs)
            if isinstance(inputs, BatchFeature):
                inputs = inputs.to(dtype=self.dtype)

            all_inputs.append(inputs)

        return all_inputs

    def classify(self, prompts: list[str]) -> list[list[float]]:
        """Return, per prompt, the softmax over the model's output logits.

        Only the first row of each softmaxed ``output.logits`` is kept,
        converted to a plain Python list.
        """
        all_inputs = self.get_inputs(prompts)
        outputs = []
        for inputs in all_inputs:
            output = self.model(**self.wrap_device(inputs))
            probs = output.logits.softmax(dim=-1)[0].tolist()
            outputs.append(probs)

        return outputs

    def generate(
        self,
        prompts: list[str],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> list[tuple[list[list[int]], list[str]]]:
        """Call ``self.model.generate`` once per prompt.

        Extra keyword arguments are forwarded to ``self.model.generate``.

        Returns:
            For each prompt, a ``(token id lists, decoded strings)`` pair
            holding every sequence returned for that prompt. Strings are
            decoded with ``skip_special_tokens=True``.
        """
        all_inputs = self.get_inputs(prompts,
                                     images=images,
                                     videos=videos,
                                     audios=audios)

        outputs: list[tuple[list[list[int]], list[str]]] = []
        for inputs in all_inputs:
            output_ids = self.model.generate(
                **self.wrap_device(inputs),
                use_cache=True,
                **kwargs,
            )
            output_str = self.processor.batch_decode(
                output_ids,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )
            output_ids = output_ids.cpu().tolist()
            outputs.append((output_ids, output_str))
        return outputs

    def generate_greedy(
        self,
        prompts: list[str],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> list[tuple[list[int], str]]:
        """Generate without sampling, at most *max_tokens* new tokens.

        Returns:
            For each prompt, the first ``(token ids, decoded string)``
            produced by :meth:`generate`.
        """
        outputs = self.generate(prompts,
                                do_sample=False,
                                max_new_tokens=max_tokens,
                                images=images,
                                videos=videos,
                                audios=audios,
                                **kwargs)

        return [(output_ids[0], output_str[0])
                for output_ids, output_str in outputs]

    def generate_beam_search(
        self,
        prompts: list[str],
        beam_width: int,
        max_tokens: int,
    ) -> list[tuple[list[list[int]], list[str]]]:
        """Generate with beam search, returning *beam_width* sequences.

        Returns:
            For each prompt, ``(token id lists, decoded strings)`` where
            every occurrence of the tokenizer's pad token id has been
            removed from the token id lists.
        """
        outputs = self.generate(prompts,
                                do_sample=False,
                                max_new_tokens=max_tokens,
                                num_beams=beam_width,
                                num_return_sequences=beam_width)
        pad_token_id = self.tokenizer.pad_token_id
        return [([[token_id for token_id in seq_ids
                   if token_id != pad_token_id] for seq_ids in output_ids],
                 output_str) for output_ids, output_str in outputs]

    def generate_greedy_logprobs(
        self,
        prompts: list[str],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> list[list[torch.Tensor]]:
        """Generate without sampling and return full log-prob tensors.

        Returns:
            For each prompt, the list produced by
            :meth:`_hidden_states_to_seq_logprobs` from the generation's
            hidden states (one tensor per entry of ``hidden_states``).
        """
        all_inputs = self.get_inputs(prompts,
                                     images=images,
                                     videos=videos,
                                     audios=audios)

        all_logprobs: list[list[torch.Tensor]] = []
        for inputs in all_inputs:
            output = self.model.generate(
                **self.wrap_device(inputs),
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
                output_hidden_states=True,
                return_dict_in_generate=True,
                **kwargs,
            )
            seq_logprobs = self._hidden_states_to_seq_logprobs(
                output.hidden_states)
            all_logprobs.append(seq_logprobs)
        return all_logprobs

    def _hidden_states_to_seq_logprobs(
        self,
        hidden_states: tuple[tuple[torch.Tensor, ...], ...],
    ) -> list[torch.Tensor]:
        """Project hidden states through the output embeddings.

        For each entry of *hidden_states*, index ``[-1][0]`` is taken
        (presumably last layer, first batch element; confirm against the
        ``generate`` output format), multiplied by the transposed output
        embedding weight, offset by the embedding bias when one exists, and
        turned into float32 log-probabilities.
        """
        output_embeddings = self.model.get_output_embeddings()

        seq_logprobs: list[torch.Tensor] = []
        for hidden_state in hidden_states:
            last_hidden_states = hidden_state[-1][0]
            logits = torch.matmul(
                last_hidden_states.to(output_embeddings.weight.device),
                output_embeddings.weight.t(),
            )
            if getattr(output_embeddings, "bias", None) is not None:
                logits += output_embeddings.bias.unsqueeze(0)
            logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
            seq_logprobs.append(logprobs)

        return seq_logprobs

    def _hidden_states_to_logprobs(
        self,
        hidden_states: tuple[tuple[torch.Tensor, ...], ...],
        num_logprobs: int,
    ) -> tuple[list[dict[int, float]], int]:
        """Reduce hidden states to top-*num_logprobs* dicts per step.

        Returns:
            ``(per-step dicts mapping token id -> logprob,
            len(hidden_states))``.
        """
        seq_logprobs = self._hidden_states_to_seq_logprobs(hidden_states)
        output_len = len(hidden_states)

        # convert to dict
        seq_logprobs_lst: list[dict[int, float]] = []
        for tok_idx, tok_logprobs in enumerate(seq_logprobs):
            # drop prompt logprobs: the first entry keeps only its last row
            if tok_idx == 0:
                tok_logprobs = tok_logprobs[-1, :].reshape(1, -1)
            topk = tok_logprobs.topk(num_logprobs)

            tok_logprobs_dct = {
                token_id.item(): logprob.item()
                for token_id, logprob in zip(topk.indices[0], topk.values[0])
            }

            seq_logprobs_lst.append(tok_logprobs_dct)

        return (
            seq_logprobs_lst,
            output_len,
        )

    def generate_greedy_logprobs_limit(
        self,
        prompts: list[str],
        max_tokens: int,
        num_logprobs: int,
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
        videos: Optional[PromptVideoInput] = None,
        **kwargs: Any,
    ) -> list[TokensTextLogprobs]:
        """Generate without sampling and keep the top-k logprobs per step.

        Returns:
            For each prompt, ``(output token ids, decoded text, per-step
            top-*num_logprobs* dicts)``. The output token ids are the last
            ``output_len`` ids of the first returned sequence.
        """
        all_inputs = self.get_inputs(prompts,
                                     images=images,
                                     videos=videos,
                                     audios=audios)

        all_logprobs: list[list[dict[int, float]]] = []
        all_output_ids: list[list[int]] = []
        all_output_strs: list[str] = []

        for inputs in all_inputs:
            output = self.model.generate(
                **self.wrap_device(inputs),
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
                output_hidden_states=True,
                return_dict_in_generate=True,
                **kwargs,
            )

            # output_len equals len(seq_logprobs_lst): one dict per step.
            (
                seq_logprobs_lst,
                output_len,
            ) = self._hidden_states_to_logprobs(output.hidden_states,
                                                num_logprobs)

            all_logprobs.append(seq_logprobs_lst)
            seq_ids = output.sequences[0]
            output_ids = seq_ids[-output_len:]
            all_output_ids.append(output_ids.tolist())
            all_output_strs.append(self.tokenizer.decode(output_ids))

        outputs = zip(all_output_ids, all_output_strs, all_logprobs)
        return [(output_ids, output_str, output_logprobs)
                for output_ids, output_str, output_logprobs in outputs]

    def generate_encoder_decoder_greedy_logprobs_limit(
        self,
        encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
        max_tokens: int,
        num_logprobs: int,
        images: Optional[PromptImageInput] = None,
        **kwargs: Any,
    ) -> list[TokensTextLogprobs]:
        '''
        Greedy logprobs generation for encoder/decoder models run through
        this HF runner.

        The encoder prompt (plus the matching image, if any) goes through
        ``self.processor``; a decoder prompt that is not ``None`` is
        tokenized and passed as ``decoder_input_ids``. Logprobs are
        computed from ``output.decoder_hidden_states``.
        '''

        all_logprobs: list[list[dict[int, float]]] = []
        all_output_ids: list[list[int]] = []
        all_output_strs: list[str] = []

        for i, (encoder_prompt, decoder_prompt) in enumerate(
                to_enc_dec_tuple_list(encoder_decoder_prompts)):
            processor_kwargs: dict[str, Any] = {
                "text": encoder_prompt,
                "return_tensors": "pt",
            }
            if images is not None and images[i] is not None:
                processor_kwargs["images"] = images[i]

            encoder_inputs = self.processor(**processor_kwargs)
            encoder_inputs = self.wrap_device(encoder_inputs)

            if decoder_prompt is None:
                decoder_input_ids = None
            else:
                decoder_inputs = self.tokenizer(decoder_prompt,
                                                return_tensors="pt")
                decoder_input_ids = self.wrap_device(decoder_inputs.input_ids)

            output = self.model.generate(
                decoder_input_ids=decoder_input_ids,
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
                output_hidden_states=True,
                return_dict_in_generate=True,
                **encoder_inputs,
                **kwargs,
            )

            (
                seq_logprobs_lst,
                output_len,
            ) = self._hidden_states_to_logprobs(output.decoder_hidden_states,
                                                num_logprobs)

            all_logprobs.append(seq_logprobs_lst)
            seq_ids = output.sequences[0]
            output_ids = seq_ids[-output_len:]
            all_output_ids.append(output_ids.tolist())
            all_output_strs.append(self.tokenizer.decode(output_ids))

        outputs = zip(all_output_ids, all_output_strs, all_logprobs)
        return [(output_ids, output_str, output_logprobs)
                for output_ids, output_str, output_logprobs in outputs]

    def encode(self, prompts: list[str]) -> list[list[torch.Tensor]]:
        """Return ``self.model.encode(prompts)``."""
        return self.model.encode(prompts)

    def predict(self, prompts: list[list[str]]) -> torch.Tensor:
        """Return ``self.model.predict(prompts, convert_to_tensor=True)``."""
        return self.model.predict(prompts, convert_to_tensor=True)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Drop the model reference before the global cleanup call.
        del self.model
        cleanup_dist_env_and_memory()
691

Woosuk Kwon's avatar
Woosuk Kwon committed
692

Cyrus Leung's avatar
Cyrus Leung committed
693
@pytest.fixture(scope="session")
def hf_runner():
    """Provide the :class:`HfRunner` class itself (not an instance)."""
    return HfRunner


class VllmRunner:
    """
    Test helper wrapping a :class:`~vllm.LLM` instance stored in
    ``self.model``. It can be used as a context manager; leaving the ``with``
    block deletes the model and runs ``cleanup_dist_env_and_memory()``.

    The default values of some arguments have been modified from
    :class:`~vllm.LLM` as follows:

    - `trust_remote_code`: Set to `True` instead of `False` for convenience.
    - `seed`: Set to `0` instead of `None` for test reproducibility.
    - `max_model_len`: Set to `1024` instead of `None` to reduce memory usage.
    - `block_size`: Set to `16` instead of `None` to reduce memory usage.
    - `enable_chunked_prefill`: Set to `False` instead of `None` for
      test reproducibility.
    - `enforce_eager`: Set to `False` instead of `None` to test CUDA graph.
    """

    def __init__(
        self,
        model_name: str,
        task: TaskOption = "auto",
        tokenizer_name: Optional[str] = None,
        tokenizer_mode: str = "auto",
        trust_remote_code: bool = True,
        seed: Optional[int] = 0,
        max_model_len: int = 1024,
        dtype: str = "auto",
        disable_log_stats: bool = True,
        tensor_parallel_size: int = 1,
        block_size: int = 16,
        enable_chunked_prefill: Optional[bool] = False,
        swap_space: int = 4,
        enforce_eager: Optional[bool] = False,
        **kwargs,
    ) -> None:
        """Create the wrapped ``LLM``.

        Every named argument is forwarded to ``LLM`` under the matching
        keyword (``model_name`` as ``model``, ``tokenizer_name`` as
        ``tokenizer``); any extra ``kwargs`` are forwarded unchanged.
        """
        self.model = LLM(
            model=model_name,
            task=task,
            tokenizer=tokenizer_name,
            tokenizer_mode=tokenizer_mode,
            trust_remote_code=trust_remote_code,
            dtype=dtype,
            seed=seed,
            swap_space=swap_space,
            enforce_eager=enforce_eager,
            disable_log_stats=disable_log_stats,
            tensor_parallel_size=tensor_parallel_size,
            max_model_len=max_model_len,
            block_size=block_size,
            enable_chunked_prefill=enable_chunked_prefill,
            **kwargs,
        )

    def get_inputs(
        self,
        prompts: list[str],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
    ) -> list[TextPrompt]:
        """Pair each prompt with its multimodal data.

        Args:
            prompts: Prompt texts.
            images: Optional per-prompt image inputs (entries may be None).
            videos: Optional per-prompt video inputs (entries may be None).
            audios: Optional per-prompt audio inputs (entries may be None).

        Returns:
            One ``TextPrompt`` per prompt. ``multi_modal_data`` holds the
            non-None entries under the keys ``"image"``, ``"video"`` and
            ``"audio"``, or is ``None`` when the prompt has none.

        Raises:
            ValueError: If a non-None multimodal argument does not have the
                same length as ``prompts``.
        """
        modalities = (("image", images), ("video", videos), ("audio", audios))
        if any(items is not None and len(items) != len(prompts)
               for _, items in modalities):
            raise ValueError(
                "All non-None multimodal inputs must have the same length as "
                "prompts")

        inputs = []
        for i, prompt in enumerate(prompts):
            multi_modal_data = {}
            for key, items in modalities:
                if items is not None and (item := items[i]) is not None:
                    multi_modal_data[key] = item

            inputs.append(
                TextPrompt(prompt=prompt,
                           multi_modal_data=multi_modal_data
                           if multi_modal_data else None))

        return inputs

    def generate(
        self,
        prompts: list[str],
        sampling_params: SamplingParams,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> list[tuple[list[list[int]], list[str]]]:
        """Generate completions and return them prefixed with their prompt.

        Extra ``kwargs`` are forwarded to ``self.model.generate``.

        Returns:
            One ``(token_ids, texts)`` tuple per request. Each list has one
            entry per sample, and every entry is the prompt (token ids or
            text) concatenated with that sample's output.
        """
        inputs = self.get_inputs(prompts,
                                 images=images,
                                 videos=videos,
                                 audios=audios)

        req_outputs = self.model.generate(inputs,
                                          sampling_params=sampling_params,
                                          **kwargs)

        outputs: list[tuple[list[list[int]], list[str]]] = []
        for req_output in req_outputs:
            prompt_str = req_output.prompt
            prompt_ids = req_output.prompt_token_ids
            req_sample_output_ids: list[list[int]] = []
            req_sample_output_strs: list[str] = []
            for sample in req_output.outputs:
                req_sample_output_ids.append(prompt_ids +
                                             list(sample.token_ids))
                req_sample_output_strs.append(prompt_str + sample.text)
            outputs.append((req_sample_output_ids, req_sample_output_strs))
        return outputs

    @staticmethod
    def _final_steps_generate_w_logprobs(
        req_outputs: list[RequestOutput],
    ) -> list[TokensTextLogprobsPromptLogprobs]:
        """Flatten request outputs into logprob result tuples.

        Returns:
            One ``(output_ids, output_text, output_logprobs,
            prompt_logprobs)`` tuple per request.
        """
        outputs: list[TokensTextLogprobsPromptLogprobs] = []
        for req_output in req_outputs:
            assert len(req_output.outputs) > 0
            # Only the last sample of a request is reported; earlier samples
            # (if any) are not part of the result.
            sample = req_output.outputs[-1]
            outputs.append((list(sample.token_ids), sample.text,
                            sample.logprobs, req_output.prompt_logprobs))
        return outputs

    @staticmethod
    def _omit_unrequested_prompt_logprobs(results, sampling_params):
        """Drop the trailing prompt-logprobs element when not requested.

        When ``sampling_params.prompt_logprobs`` is None, the last element of
        every result tuple is removed; otherwise ``results`` is returned
        unchanged.
        """
        if sampling_params.prompt_logprobs is None:
            return [x[0:-1] for x in results]
        return results

    def generate_w_logprobs(
        self,
        prompts: list[str],
        sampling_params: SamplingParams,
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
        videos: Optional[PromptVideoInput] = None,
        **kwargs: Any,
    ) -> Union[list[TokensTextLogprobs],
               list[TokensTextLogprobsPromptLogprobs]]:
        """Generate and return output ids, text and logprobs per request.

        Extra ``kwargs`` are forwarded to ``self.model.generate``. Prompt
        logprobs are included as a fourth tuple element only when
        ``sampling_params.prompt_logprobs`` is not None.
        """
        inputs = self.get_inputs(prompts,
                                 images=images,
                                 videos=videos,
                                 audios=audios)

        req_outputs = self.model.generate(inputs,
                                          sampling_params=sampling_params,
                                          **kwargs)

        toks_str_logsprobs_prompt_logprobs = (
            self._final_steps_generate_w_logprobs(req_outputs))
        # Omit prompt logprobs if not required by sampling params
        return self._omit_unrequested_prompt_logprobs(
            toks_str_logsprobs_prompt_logprobs, sampling_params)

    def generate_encoder_decoder_w_logprobs(
        self,
        encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
        sampling_params: SamplingParams,
    ) -> Union[list[TokensTextLogprobs],
               list[TokensTextLogprobsPromptLogprobs]]:
        '''
        Logprobs generation for vLLM encoder/decoder models
        '''

        assert sampling_params.logprobs is not None
        req_outputs = self.model.generate(encoder_decoder_prompts,
                                          sampling_params=sampling_params)
        toks_str_logsprobs_prompt_logprobs = (
            self._final_steps_generate_w_logprobs(req_outputs))
        # Omit prompt logprobs if not required by sampling params
        return self._omit_unrequested_prompt_logprobs(
            toks_str_logsprobs_prompt_logprobs, sampling_params)

    def generate_greedy(
        self,
        prompts: list[str],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> list[tuple[list[int], str]]:
        """Generate with ``temperature=0.0`` and keep the first sample.

        Returns:
            One ``(token_ids, text)`` tuple per prompt, each including the
            prompt itself (see :meth:`generate`).
        """
        greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
        outputs = self.generate(prompts,
                                greedy_params,
                                images=images,
                                videos=videos,
                                audios=audios,
                                **kwargs)
        return [(output_ids[0], output_str[0])
                for output_ids, output_str in outputs]

    def generate_greedy_logprobs(
        self,
        prompts: list[str],
        max_tokens: int,
        num_logprobs: int,
        num_prompt_logprobs: Optional[int] = None,
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
        videos: Optional[PromptVideoInput] = None,
        stop_token_ids: Optional[list[int]] = None,
        stop: Optional[list[str]] = None,
        **kwargs: Any,
    ) -> Union[list[TokensTextLogprobs],
               list[TokensTextLogprobsPromptLogprobs]]:
        """Generate with ``temperature=0.0`` and return logprobs.

        Builds ``SamplingParams`` from ``max_tokens``, ``num_logprobs``,
        ``num_prompt_logprobs``, ``stop_token_ids`` and ``stop``, then
        delegates to :meth:`generate_w_logprobs`.
        """
        greedy_logprobs_params = SamplingParams(
            temperature=0.0,
            max_tokens=max_tokens,
            logprobs=num_logprobs,
            prompt_logprobs=num_prompt_logprobs,
            stop_token_ids=stop_token_ids,
            stop=stop)

        return self.generate_w_logprobs(prompts,
                                        greedy_logprobs_params,
                                        images=images,
                                        audios=audios,
                                        videos=videos,
                                        **kwargs)

    def generate_encoder_decoder_greedy_logprobs(
        self,
        encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
        max_tokens: int,
        num_logprobs: int,
        num_prompt_logprobs: Optional[int] = None,
    ) -> Union[list[TokensTextLogprobs],
               list[TokensTextLogprobsPromptLogprobs]]:
        '''
        Greedy logprobs generation for vLLM encoder/decoder models
        '''
        greedy_logprobs_params = SamplingParams(
            temperature=0.0,
            max_tokens=max_tokens,
            logprobs=num_logprobs,
            prompt_logprobs=(num_prompt_logprobs),
        )

        return self.generate_encoder_decoder_w_logprobs(
            encoder_decoder_prompts, greedy_logprobs_params)

    def generate_beam_search(
        self,
        prompts: Union[list[str], list[list[int]]],
        beam_width: int,
        max_tokens: int,
    ) -> list[tuple[list[list[int]], list[str]]]:
        """Run beam search over text or pre-tokenized prompts.

        Args:
            prompts: Either all strings (wrapped as ``TextPrompt``) or token
                id lists (wrapped as ``TokensPrompt``).
            beam_width: Passed to ``BeamSearchParams``.
            max_tokens: Passed to ``BeamSearchParams``.

        Returns:
            One ``(token_ids, texts)`` tuple per prompt, with one entry per
            returned sequence.
        """
        if is_list_of(prompts, str, check="all"):
            beam_inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
        else:
            beam_inputs = [
                TokensPrompt(prompt_token_ids=tokens) for tokens in prompts
            ]
        outputs = self.model.beam_search(
            beam_inputs,
            BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens))
        returned_outputs = []
        for output in outputs:
            token_ids = [x.tokens for x in output.sequences]
            texts = [x.text for x in output.sequences]
            returned_outputs.append((token_ids, texts))
        return returned_outputs

    def classify(self, prompts: list[str]) -> list[list[float]]:
        """Return ``outputs.probs`` of ``self.model.classify`` per prompt."""
        req_outputs = self.model.classify(prompts)
        return [req_output.outputs.probs for req_output in req_outputs]

    def encode(
        self,
        prompts: list[str],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
    ) -> list[list[float]]:
        """Return ``outputs.embedding`` of ``self.model.embed`` per prompt.

        The prompts and optional multimodal data are first combined with
        :meth:`get_inputs`.
        """
        inputs = self.get_inputs(prompts,
                                 images=images,
                                 videos=videos,
                                 audios=audios)

        req_outputs = self.model.embed(inputs)
        return [req_output.outputs.embedding for req_output in req_outputs]

    def score(
        self,
        text_1: Union[str, list[str]],
        text_2: Union[str, list[str]],
    ) -> list[float]:
        """Return ``outputs.score`` of ``self.model.score(text_1, text_2)``."""
        req_outputs = self.model.score(text_1, text_2)
        return [req_output.outputs.score for req_output in req_outputs]

    def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
        """Forward ``func`` to the engine's model executor ``apply_model``."""
        executor = self.model.llm_engine.model_executor
        return executor.apply_model(func)

    def __enter__(self):
        """Enter the ``with`` block, yielding this runner itself."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Delete the wrapped model and run the shared cleanup helper."""
        del self.model
        cleanup_dist_env_and_memory()
1000

Woosuk Kwon's avatar
Woosuk Kwon committed
1001

1002
@pytest.fixture(scope="session")
def vllm_runner():
    """Session-scoped fixture that hands tests the ``VllmRunner`` class.

    The class itself (not an instance) is returned, so each test constructs
    its own runner.
    """
    runner_cls = VllmRunner
    return runner_cls
1005
1006
1007
1008
1009
1010
1011
1012
1013


def get_tokenizer_pool_config(tokenizer_group_type):
    """Build the ``TokenizerPoolConfig`` for a tokenizer group type.

    Args:
        tokenizer_group_type: ``None``, the string ``"ray"``, or a class.

    Returns:
        ``None`` when ``tokenizer_group_type`` is None; otherwise a
        ``TokenizerPoolConfig`` with ``pool_size=1``, an empty
        ``extra_config`` and ``pool_type`` set to ``"ray"`` or to the given
        class.

    Raises:
        ValueError: If the value is neither None, ``"ray"`` nor a class.
    """
    if tokenizer_group_type is None:
        return None

    uses_ray = tokenizer_group_type == "ray"
    if not (uses_ray or isinstance(tokenizer_group_type, type)):
        raise ValueError(
            f"Unknown tokenizer_group_type: {tokenizer_group_type}")

    pool_type = "ray" if uses_ray else tokenizer_group_type
    return TokenizerPoolConfig(pool_size=1,
                               pool_type=pool_type,
                               extra_config={})
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034


@pytest.fixture()
def temporary_enable_log_propagate():
    """Enable propagation on the ``"vllm"`` logger for the test's duration.

    ``propagate`` is set to True during the test and put back to the value
    it had before the fixture ran, rather than being forced to False.
    """
    import logging
    logger = logging.getLogger("vllm")
    previous_propagate = logger.propagate
    logger.propagate = True
    try:
        yield
    finally:
        # Restore the prior setting even if an exception is thrown into
        # the fixture generator.
        logger.propagate = previous_propagate


@pytest.fixture()
def caplog_vllm(temporary_enable_log_propagate, caplog):
    """Yield pytest's ``caplog`` with vllm log propagation enabled.

    Depends on ``temporary_enable_log_propagate`` so that records emitted
    by the ``"vllm"`` logger reach ``caplog`` during the test.
    """
    # To capture vllm log, we should enable propagate=True temporarily
    # because caplog depends on logs propagated to the root logger.
    yield caplog
1035
1036
1037
1038
1039
1040
1041


@pytest.fixture(scope="session")
def num_gpus_available():
    """Get number of GPUs without initializing the CUDA context
    in current process.

    Session-scoped; the value comes from ``cuda_device_count_stateless()``.
    """
    gpu_count = cuda_device_count_stateless()
    return gpu_count
1043
1044


1045
# Local directories (under ``models_path_prefix``) that hold the patched
# "dummy" model snapshots.
# NOTE(review): the commented-out line below suggests the system temp dir
# was used as the base at some point -- confirm whether it can be removed.
# temp_dir = tempfile.gettempdir()
_dummy_opt_path = os.path.join(models_path_prefix, "dummy_opt")
_dummy_llava_path = os.path.join(models_path_prefix, "dummy_llava")
_dummy_gemma2_embedding_path = os.path.join(models_path_prefix,
                                            "dummy_gemma2_embedding")

1050
1051
1052
1053


@pytest.fixture
def dummy_opt_path():
    """Return the directory of a local OPT snapshot with a patched config.

    If ``_dummy_opt_path`` does not exist yet, ``facebook/opt-125m`` is
    downloaded into it (skipping weight files matching the ignore patterns)
    and ``architectures`` in its ``config.json`` is overwritten with
    ``["MyOPTForCausalLM"]``. An existing directory is returned untouched.
    """
    model_dir = _dummy_opt_path
    config_file = os.path.join(model_dir, "config.json")
    if os.path.exists(model_dir):
        return model_dir

    skipped_files = [
        "*.bin", "*.bin.index.json", "*.pt", "*.h5", "*.msgpack"
    ]
    snapshot_download(repo_id="facebook/opt-125m",
                      local_dir=model_dir,
                      ignore_patterns=skipped_files)
    assert os.path.exists(config_file)

    # Rewrite the architecture name in the downloaded config in place.
    with open(config_file) as src:
        hf_config = json.load(src)
    hf_config["architectures"] = ["MyOPTForCausalLM"]
    with open(config_file, "w") as dst:
        json.dump(hf_config, dst)
    return model_dir

1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087

# Pytest hook that post-processes the report generated after a test runs.
@pytest.hookimpl(tryfirst=True, hookwrapper=True)
def pytest_runtest_makereport(item, call):
    """Attach a browser screenshot to the report of a failed test.

    When the ``call`` phase of a test fails and the test used a ``browser``
    fixture, a screenshot is saved to ``screenshot.png`` and, if the report
    has an ``extra`` attribute, appended to it as a pytest-html image.
    """
    # Obtain the test result produced by the wrapped hook implementations.
    outcome = yield
    report = outcome.get_result()

    # Only a failure of the test body itself is of interest.
    if not (report.when == "call" and report.failed):
        return

    # A screenshot can only be taken when a browser instance is available.
    if not hasattr(item, "funcargs") or "browser" not in item.funcargs:
        return

    browser = item.funcargs["browser"]
    screenshot_path = "screenshot.png"  # where the screenshot is written
    browser.save_screenshot(screenshot_path)

    # Add the screenshot only if the report carries an ``extra`` attribute.
    if hasattr(report, "extra"):
        report.extra.append(pytest_html.extras.image(screenshot_path))
zhuwenwen's avatar
zhuwenwen committed
1088
1089


1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
@pytest.fixture
def dummy_llava_path():
    """Return the directory of a local LLaVA snapshot with a patched config.

    If ``_dummy_llava_path`` does not exist yet, ``llava-hf/llava-1.5-7b-hf``
    is downloaded into it (skipping weight files matching the ignore
    patterns) and ``architectures`` in its ``config.json`` is overwritten
    with ``["MyLlava"]``. An existing directory is returned untouched.
    """
    model_dir = _dummy_llava_path
    config_file = os.path.join(model_dir, "config.json")
    if os.path.exists(model_dir):
        return model_dir

    skipped_files = [
        "*.bin", "*.bin.index.json", "*.pt", "*.h5", "*.msgpack"
    ]
    snapshot_download(repo_id="llava-hf/llava-1.5-7b-hf",
                      local_dir=model_dir,
                      ignore_patterns=skipped_files)
    assert os.path.exists(config_file)

    # Rewrite the architecture name in the downloaded config in place.
    with open(config_file) as src:
        hf_config = json.load(src)
    hf_config["architectures"] = ["MyLlava"]
    with open(config_file, "w") as dst:
        json.dump(hf_config, dst)
    return model_dir
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119


@pytest.fixture
def dummy_gemma2_embedding_path():
    """Return the directory of a local Gemma2 embedding snapshot.

    If ``_dummy_gemma2_embedding_path`` does not exist yet,
    ``BAAI/bge-multilingual-gemma2`` is downloaded into it (skipping weight
    files matching the ignore patterns) and ``architectures`` in its
    ``config.json`` is overwritten with ``["MyGemma2Embedding"]``. An
    existing directory is returned untouched.
    """
    model_dir = _dummy_gemma2_embedding_path
    config_file = os.path.join(model_dir, "config.json")
    if os.path.exists(model_dir):
        return model_dir

    skipped_files = [
        "*.bin", "*.bin.index.json", "*.pt", "*.h5", "*.msgpack"
    ]
    snapshot_download(repo_id="BAAI/bge-multilingual-gemma2",
                      local_dir=model_dir,
                      ignore_patterns=skipped_files)
    assert os.path.exists(config_file)

    # Rewrite the architecture name in the downloaded config in place.
    with open(config_file) as src:
        hf_config = json.load(src)
    hf_config["architectures"] = ["MyGemma2Embedding"]
    with open(config_file, "w") as dst:
        json.dump(hf_config, dst)
    return model_dir
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144


# Register the `--optional` flag, which allows running tests
# that are marked with @pytest.mark.optional
def pytest_addoption(parser):
    """Add the boolean ``--optional`` command-line flag (off by default)."""
    parser.addoption(
        "--optional",
        action="store_true",
        default=False,
        help="run optional test",
    )


def pytest_collection_modifyitems(config, items):
    """Skip tests marked ``optional`` unless ``--optional`` was given.

    Args:
        config: Pytest config, queried for the ``--optional`` option.
        items: Collected test items; items carrying the ``optional``
            keyword receive a skip marker in place.
    """
    run_optional = config.getoption("--optional")
    if not run_optional:
        # Without the flag, every optional test gets the same skip marker.
        skip_marker = pytest.mark.skip(reason="need --optional option to run")
        for test_item in items:
            if "optional" in test_item.keywords:
                test_item.add_marker(skip_marker)
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156


@pytest.fixture(scope="session")
def cli_config_file():
    """Return the path to the CLI config file."""
    config_path = os.path.join(_TEST_DIR, "config", "test_config.yaml")
    return config_path


@pytest.fixture(scope="session")
def cli_config_file_with_model():
    """Return the path to the CLI config file with model."""
    config_path = os.path.join(_TEST_DIR, "config",
                               "test_config_with_model.yaml")
    return config_path