conftest.py 40.2 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
import json
3
import os
4
import tempfile
5
from enum import Enum
6
from typing import Any, Callable, Optional, TypedDict, TypeVar, Union
Woosuk Kwon's avatar
Woosuk Kwon committed
7

8
import numpy as np
Woosuk Kwon's avatar
Woosuk Kwon committed
9
10
import pytest
import torch
11
import torch.nn as nn
12
import torch.nn.functional as F
13
from huggingface_hub import snapshot_download
14
from PIL import Image
15
16
from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
                          BatchEncoding, BatchFeature)
17
from transformers.models.auto.auto_factory import _BaseAutoModelClass
Woosuk Kwon's avatar
Woosuk Kwon committed
18

19
20
from tests.models.utils import (TokensTextLogprobs,
                                TokensTextLogprobsPromptLogprobs)
Woosuk Kwon's avatar
Woosuk Kwon committed
21
from vllm import LLM, SamplingParams
22
from vllm.assets.audio import AudioAsset
23
from vllm.assets.image import ImageAsset
24
from vllm.assets.video import VideoAsset
25
from vllm.config import TaskOption, _get_and_verify_dtype
26
from vllm.connections import global_http_connection
27
from vllm.distributed import (cleanup_dist_env_and_memory,
28
29
                              init_distributed_environment,
                              initialize_model_parallel)
30
from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
31
                         to_enc_dec_tuple_list, zip_enc_dec_prompts)
32
from vllm.logger import init_logger
33
from vllm.outputs import RequestOutput
34
from vllm.sampling_params import BeamSearchParams
35
from vllm.utils import cuda_device_count_stateless
36

37
logger = init_logger(__name__)
Woosuk Kwon's avatar
Woosuk Kwon committed
38

39
40
41
_TEST_DIR = os.path.dirname(__file__)
_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
42
_SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")
43

Cyrus Leung's avatar
Cyrus Leung committed
44
_M = TypeVar("_M")

# Either a flat list of items or a list of item lists.
_PromptMultiModalInput = Union[list[_M], list[list[_M]]]

PromptImageInput = _PromptMultiModalInput[Image.Image]
# Audio items are (np.ndarray, int) tuples; `HfRunner.get_inputs` forwards
# the int to the processor as `sampling_rate`.
PromptAudioInput = _PromptMultiModalInput[tuple[np.ndarray, int]]
PromptVideoInput = _PromptMultiModalInput[np.ndarray]
51

52

53
def _read_prompts(filename: str) -> list[str]:
54
    with open(filename) as f:
55
56
        prompts = f.readlines()
        return prompts
Woosuk Kwon's avatar
Woosuk Kwon committed
57
58


59
class ImageAssetPrompts(TypedDict):
    """Prompt text for each test image, keyed by the image asset name."""
    stop_sign: str
    cherry_blossom: str
62
63


64
class ImageTestAssets(list[ImageAsset]):
    """List of the image assets used by the tests, in a fixed order."""

    def __init__(self) -> None:
        asset_names = ("stop_sign", "cherry_blossom")
        super().__init__([ImageAsset(name) for name in asset_names])

    def prompts(self, prompts: ImageAssetPrompts) -> list[str]:
        """
        Convenience method to define the prompt for each test image.

        The returned prompts are ordered the same way as the assets
        yielded when iterating through this object.
        """
        stop_sign_prompt = prompts["stop_sign"]
        cherry_blossom_prompt = prompts["cherry_blossom"]
        return [stop_sign_prompt, cherry_blossom_prompt]
80
81


82
83
class VideoAssetPrompts(TypedDict):
    """Prompt text for each test video, keyed by the video asset name."""
    baby_reading: str
84
85


86
class VideoTestAssets(list[VideoAsset]):
    """List of the video assets used by the tests."""

    def __init__(self) -> None:
        asset_names = ("baby_reading", )
        super().__init__([VideoAsset(name) for name in asset_names])

    def prompts(self, prompts: VideoAssetPrompts) -> list[str]:
        """Return the prompt of each test video, in asset order."""
        baby_reading_prompt = prompts["baby_reading"]
        return [baby_reading_prompt]
95
96


97
class AudioAssetPrompts(TypedDict):
    """Prompt text for each test audio clip, keyed by the audio asset name."""
    mary_had_lamb: str
    winning_call: str


102
class AudioTestAssets(list[AudioAsset]):
    """List of the audio assets used by the tests, in a fixed order."""

    def __init__(self) -> None:
        asset_names = ("mary_had_lamb", "winning_call")
        super().__init__([AudioAsset(name) for name in asset_names])

    def prompts(self, prompts: AudioAssetPrompts) -> list[str]:
        """Return the prompt of each test audio clip, in asset order."""
        mary_had_lamb_prompt = prompts["mary_had_lamb"]
        winning_call_prompt = prompts["winning_call"]
        return [mary_had_lamb_prompt, winning_call_prompt]

113

114
IMAGE_ASSETS = ImageTestAssets()
115
"""Singleton instance of {class}`ImageTestAssets`."""
116
VIDEO_ASSETS = VideoTestAssets()
117
"""Singleton instance of {class}`VideoTestAssets`."""
118
AUDIO_ASSETS = AudioTestAssets()
119
"""Singleton instance of {class}`AudioTestAssets`."""
120
121


122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
@pytest.fixture(scope="function", autouse=True)
def cleanup_VLLM_USE_V1(monkeypatch):
    """
    Keep changes that vLLM makes to "VLLM_USE_V1" from leaking between tests.

    The V1 oracle sets "VLLM_USE_V1" during loading, which means that
    each invocation of a test may change the env variable. Once
    monkeypatch has touched "VLLM_USE_V1", any change made to it during
    the test run is cleaned up when the test exits.

    This fixture is used by every test.
    """
    if "VLLM_USE_V1" in os.environ:
        return

    # The variable is unset: set it and delete it again. The environment is
    # left unchanged, but monkeypatch now tracks VLLM_USE_V1 and will clean
    # it up upon exit if vLLM modifies the value of envs.VLLM_USE_V1.
    monkeypatch.setenv("VLLM_USE_V1", "")
    monkeypatch.delenv("VLLM_USE_V1")


Joe Runde's avatar
Joe Runde committed
142
@pytest.fixture(params=[True, False])
def run_with_both_engines(request, monkeypatch):
    """Run the requesting test twice: once with V1 and once without.

    Tests carrying the `skip_v1` marker are skipped for the V1 run, so
    they only execute without V1.
    """
    use_v1 = request.param

    if use_v1 and request.node.get_closest_marker("skip_v1"):
        pytest.skip("Skipping test on vllm V1")

    monkeypatch.setenv('VLLM_USE_V1', '1' if use_v1 else '0')

    yield
Joe Runde's avatar
Joe Runde committed
157
158


159
160
161
162
163
164
165
@pytest.fixture(autouse=True)
def init_test_http_connection():
    """Disable client reuse on the global HTTP connection for every test."""
    # pytest_asyncio may use a different event loop per test
    # so we need to make sure the async client is created anew
    global_http_connection.reuse_client = False


166
167
168
169
170
171
172
173
174
175
176
177
@pytest.fixture
def dist_init():
    """Set up a single-process distributed environment for one test.

    Initializes the distributed environment with world size 1 through a
    file-based init method on the "nccl" backend, initializes model
    parallelism with sizes (1, 1), and cleans everything up after the
    test. The temporary rendezvous file is removed afterwards.
    """
    # mkstemp() returns an *open* descriptor together with the path; only
    # the path is needed, so close the descriptor right away instead of
    # leaking it for the lifetime of the test process.
    fd, temp_file = tempfile.mkstemp()
    os.close(fd)
    try:
        init_distributed_environment(
            world_size=1,
            rank=0,
            distributed_init_method=f"file://{temp_file}",
            local_rank=0,
            backend="nccl",
        )
        initialize_model_parallel(1, 1)
        yield
        cleanup_dist_env_and_memory()
    finally:
        # Best-effort removal: the file may already be gone by now.
        try:
            os.remove(temp_file)
        except FileNotFoundError:
            pass
179
180


181
@pytest.fixture()
def should_do_global_cleanup_after_test(request) -> bool:
    """Tell whether global cleanup should run after the requesting test.

    Subdirectories may override this fixture to skip global cleanup. This
    can provide a ~10x speedup for non-GPU unit tests since they don't need
    to initialize torch.

    Returns False when the test carries the `skip_global_cleanup` marker,
    True otherwise.
    """
    skip_marker = request.node.get_closest_marker("skip_global_cleanup")
    return not skip_marker
189
190


191
@pytest.fixture(autouse=True)
def cleanup_fixture(should_do_global_cleanup_after_test: bool):
    """Run the global distributed/memory cleanup after each test.

    The cleanup is skipped when `should_do_global_cleanup_after_test`
    evaluates to False.
    """
    yield
    if not should_do_global_cleanup_after_test:
        return
    cleanup_dist_env_and_memory()
196
197


198
199
200
201
202
203
@pytest.fixture(autouse=True)
def dynamo_reset():
    """Call `torch._dynamo.reset()` after every test."""
    yield
    torch._dynamo.reset()


Woosuk Kwon's avatar
Woosuk Kwon committed
204
@pytest.fixture
def example_prompts() -> list[str]:
    """Return every line of every file listed in `_TEST_PROMPTS`."""
    return [
        line for filename in _TEST_PROMPTS
        for line in _read_prompts(filename)
    ]


212
213
214
215
216
217
@pytest.fixture
def example_system_message() -> str:
    """Return the full contents of the `_SYS_MSG` file as one string."""
    # Decode explicitly as UTF-8 so the result does not depend on the locale.
    with open(_SYS_MSG, encoding="utf-8") as f:
        return f.read()


218
219
220
221
222
223
224
class DecoderPromptType(Enum):
    """For encoder/decoder models only."""
    # In `example_encoder_decoder_prompts`, the decoder prompts paired with
    # each member are: CUSTOM -> the encoder prompts in reverse order,
    # NONE -> `None`, EMPTY_STR -> the empty string.
    CUSTOM = 1
    NONE = 2
    EMPTY_STR = 3


225
@pytest.fixture
def example_encoder_decoder_prompts(
) -> dict[DecoderPromptType, list[ExplicitEncoderDecoderPrompt]]:
    '''
    Build encoder/decoder prompt pairs for every decoder prompt type.

    The encoder prompts are the lines of the files in `_TEST_PROMPTS`.
    They are zipped, index by index, with one decoder prompt list per
    `DecoderPromptType`.

    Returns:

    A dict with one entry per decoder prompt type:

    * NONE: every decoder prompt is `None`
    * EMPTY_STR: every decoder prompt is the empty string
    * CUSTOM: decoder prompt list is the reverse of the encoder prompt list
    '''

    encoder_prompts = [
        line for filename in _TEST_PROMPTS
        for line in _read_prompts(filename)
    ]
    num_prompts = len(encoder_prompts)

    decoder_prompts_by_type = {
        DecoderPromptType.NONE: [None] * num_prompts,
        DecoderPromptType.EMPTY_STR: [""] * num_prompts,
        DecoderPromptType.CUSTOM: list(reversed(encoder_prompts)),
    }

    return {
        prompt_type: zip_enc_dec_prompts(encoder_prompts, decoder_prompts)
        for prompt_type, decoder_prompts in decoder_prompts_by_type.items()
    }


258
@pytest.fixture
def example_long_prompts() -> list[str]:
    """Return every line of every file listed in `_LONG_PROMPTS`."""
    return [
        line for filename in _LONG_PROMPTS
        for line in _read_prompts(filename)
    ]
Woosuk Kwon's avatar
Woosuk Kwon committed
264
265


266
@pytest.fixture(scope="session")
def image_assets() -> ImageTestAssets:
    """Return the module-level `IMAGE_ASSETS` collection."""
    return IMAGE_ASSETS


271
@pytest.fixture(scope="session")
def video_assets() -> VideoTestAssets:
    """Return the module-level `VIDEO_ASSETS` collection."""
    return VIDEO_ASSETS


276
@pytest.fixture(scope="session")
def audio_assets() -> AudioTestAssets:
    """Return the module-level `AUDIO_ASSETS` collection."""
    return AUDIO_ASSETS


281
_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict)
282
_R = TypeVar("_R")
283

Woosuk Kwon's avatar
Woosuk Kwon committed
284
285
286

class HfRunner:

287
    def get_default_device(self):
        """Return the device type on which HF models are placed by default.

        This is ``"cpu"`` when the current vLLM platform reports CPU, and
        the platform's ``device_type`` otherwise.
        """
        from vllm.platforms import current_platform

        if current_platform.is_cpu():
            return "cpu"
        return current_platform.device_type
292
293

    def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
        """Move ``x`` to ``device`` (``self.device`` when not given).

        ``None`` and ``bool`` values are returned untouched. A ``dict`` is
        rebuilt with each of its values moved recursively. An object whose
        ``device.type`` already equals the target is returned as-is; any
        other object is moved with ``x.to(device)``.
        """
        if x is None or isinstance(x, bool):
            return x

        target = self.device if device is None else device

        if isinstance(x, dict):
            return {
                key: self.wrap_device(value, target)
                for key, value in x.items()
            }

        already_on_target = hasattr(x, "device") and x.device.type == target
        return x if already_on_target else x.to(target)
307

Woosuk Kwon's avatar
Woosuk Kwon committed
308
309
310
    def __init__(
        self,
        model_name: str,
        dtype: str = "auto",
        *,
        model_kwargs: Optional[dict[str, Any]] = None,
        trust_remote_code: bool = True,
        is_sentence_transformer: bool = False,
        is_cross_encoder: bool = False,
        skip_tokenizer_init: bool = False,
        auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
    ) -> None:
        """Load the HF model, tokenizer and processor for ``model_name``.

        Args:
            model_name: Name or path passed to every ``from_pretrained``.
            dtype: Requested dtype; resolved against the model config via
                ``_get_and_verify_dtype``.
            model_kwargs: Extra keyword arguments for the model
                constructor. ``torch_dtype`` is added when absent. The
                dict passed in is not modified.
            trust_remote_code: Forwarded to every loader.
            is_sentence_transformer: Load the model as a
                ``SentenceTransformer``.
            is_cross_encoder: Load the model as a ``CrossEncoder`` (only
                checked when ``is_sentence_transformer`` is false).
            skip_tokenizer_init: Do not load an ``AutoTokenizer``; the
                processor's tokenizer is used instead.
            auto_cls: Auto class used to load the model when neither of
                the two flags above is set.
        """
        self.model_name = model_name

        self.config = AutoConfig.from_pretrained(
            model_name,
            trust_remote_code=trust_remote_code,
        )
        self.device = self.get_default_device()
        self.dtype = torch_dtype = _get_and_verify_dtype(self.config, dtype)

        # Work on a copy: `setdefault` below must not leak `torch_dtype`
        # into a dict that the caller may reuse for another runner.
        model_kwargs = dict(model_kwargs) if model_kwargs is not None else {}
        model_kwargs.setdefault("torch_dtype", torch_dtype)

        if is_sentence_transformer:
            # Lazy init required for AMD CI
            from sentence_transformers import SentenceTransformer

            self.model = SentenceTransformer(
                model_name,
                device=self.device,
                model_kwargs=model_kwargs,
                trust_remote_code=trust_remote_code,
            )
        elif is_cross_encoder:
            # Lazy init required for AMD CI
            from sentence_transformers import CrossEncoder

            self.model = CrossEncoder(
                model_name,
                device=self.device,
                automodel_args=model_kwargs,
                trust_remote_code=trust_remote_code,
            )
        else:
            model = auto_cls.from_pretrained(
                model_name,
                trust_remote_code=trust_remote_code,
                **model_kwargs,
            )

            # in case some unquantized custom models are not in same dtype
            if (getattr(model, "quantization_method", None) is None
                    and any(p.dtype != self.dtype
                            for p in model.parameters())):
                model = model.to(dtype=self.dtype)

            # Only move the model when it is not bitsandbytes-quantized and
            # its parameters sit on fewer than two distinct devices.
            if (getattr(model, "quantization_method", None) != "bitsandbytes"
                    and len({p.device
                             for p in model.parameters()}) < 2):
                model = model.to(device=self.device)

            self.model = model

        if not skip_tokenizer_init:
            self.tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                torch_dtype=torch_dtype,
                trust_remote_code=trust_remote_code,
            )

        # don't put this import at the top level
        # it will call torch.cuda.device_count()
        from transformers import AutoProcessor  # noqa: F401
        self.processor = AutoProcessor.from_pretrained(
            model_name,
            torch_dtype=torch_dtype,
            trust_remote_code=trust_remote_code,
        )
        if skip_tokenizer_init:
            self.tokenizer = self.processor.tokenizer
Woosuk Kwon's avatar
Woosuk Kwon committed
389

390
    def get_inputs(
        self,
        prompts: list[str],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
    ) -> list[Union[BatchFeature, BatchEncoding]]:
        """Run the processor once per prompt and collect the results.

        Each multimodal list that is given must have the same length as
        ``prompts``; entry ``i`` of each list is passed along with prompt
        ``i`` unless that entry is ``None``. ``BatchFeature`` results are
        cast to ``self.dtype``.
        """
        for modality_items in (images, videos, audios):
            if modality_items is not None:
                assert len(prompts) == len(modality_items)

        all_inputs: list[Union[BatchFeature, BatchEncoding]] = []
        for idx, prompt in enumerate(prompts):
            processor_kwargs: dict[str, Any] = {
                "text": prompt,
                "return_tensors": "pt",
            }

            if images is not None and images[idx] is not None:
                processor_kwargs["images"] = images[idx]

            if videos is not None and videos[idx] is not None:
                processor_kwargs["videos"] = videos[idx]

            if audios is not None and audios[idx] is not None:
                audio_item = audios[idx]
                # HACK - not all processors take sampling_rate; we should
                # clean this up in the future.
                if len(audio_item) == 2:
                    waveform, sample_rate = audio_item
                    processor_kwargs["audio"] = waveform
                    processor_kwargs["sampling_rate"] = sample_rate
                else:
                    processor_kwargs["audio"] = audio_item

            processed = self.processor(**processor_kwargs)
            if isinstance(processed, BatchFeature):
                processed = processed.to(dtype=self.dtype)

            all_inputs.append(processed)

        return all_inputs

434
435
436
437
438
439
440
441
442
    def get_prompt_embeddings(self, prompts: list[str]) -> list[torch.Tensor]:
        """Embed each prompt with the model's input embedding layer.

        For every prompt, the processed ``input_ids`` are moved to the
        runner's device, passed through ``get_input_embeddings()`` and
        squeezed along dimension 0.
        """
        return [
            self.model.get_input_embeddings()(
                self.wrap_device(inputs)["input_ids"]).squeeze(0)
            for inputs in self.get_inputs(prompts)
        ]

443
    def classify(self, prompts: list[str]) -> list[list[float]]:
        """Return the softmax over the model's output logits per prompt.

        Args:
            prompts: Texts to run through the model, one forward pass each.

        Returns:
            One list of floats per prompt: ``softmax(dim=-1)`` of the
            output logits, taken at index 0 of the first dimension.
        """
        all_inputs = self.get_inputs(prompts)
        outputs: list[list[float]] = []
        for inputs in all_inputs:
            output = self.model(**self.wrap_device(inputs))
            # The final logits are normalized, so these are probabilities.
            probs = output.logits.softmax(dim=-1)[0].tolist()
            outputs.append(probs)

        return outputs

454
455
    def generate(
        self,
        prompts: list[str],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> list[tuple[list[list[int]], list[str]]]:
        """Generate with the HF model, one prompt at a time.

        Extra keyword arguments are forwarded to ``self.model.generate``.

        Returns:
            For each prompt, a tuple of (generated id sequences as nested
            lists, the same sequences decoded by the processor with
            special tokens skipped).
        """
        results: list[tuple[list[list[int]], list[str]]] = []
        prompt_inputs = self.get_inputs(prompts,
                                        images=images,
                                        videos=videos,
                                        audios=audios)

        for inputs in prompt_inputs:
            generated = self.model.generate(
                **self.wrap_device(inputs),
                use_cache=True,
                **kwargs,
            )
            decoded = self.processor.batch_decode(
                generated,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )
            results.append((generated.cpu().tolist(), decoded))

        return results

    def generate_greedy(
        self,
        prompts: list[str],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> list[tuple[list[int], str]]:
        """Generate without sampling, at most ``max_tokens`` new tokens.

        Returns:
            For each prompt, the first id sequence and the first decoded
            string produced by `generate`.
        """
        generated = self.generate(prompts,
                                  do_sample=False,
                                  max_new_tokens=max_tokens,
                                  images=images,
                                  videos=videos,
                                  audios=audios,
                                  **kwargs)

        first_of_each: list[tuple[list[int], str]] = []
        for output_ids, output_str in generated:
            first_of_each.append((output_ids[0], output_str[0]))
        return first_of_each
502
503
504

    def generate_beam_search(
        self,
        prompts: list[str],
        beam_width: int,
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
    ) -> list[tuple[list[list[int]], list[str]]]:
        """Generate with beam search, returning ``beam_width`` sequences.

        Returns:
            The result of `generate` for each prompt, with every token
            equal to the tokenizer's pad token id removed from the id
            sequences. The decoded strings are passed through unchanged.
        """
        beam_outputs = self.generate(prompts,
                                     do_sample=False,
                                     max_new_tokens=max_tokens,
                                     num_beams=beam_width,
                                     num_return_sequences=beam_width,
                                     images=images,
                                     videos=videos,
                                     audios=audios)

        return [(
            [[
                token_id for token_id in sequence
                if token_id != self.tokenizer.pad_token_id
            ] for sequence in output_ids],
            output_str,
        ) for output_ids, output_str in beam_outputs]
Woosuk Kwon's avatar
Woosuk Kwon committed
530

531
532
    def generate_greedy_logprobs(
        self,
        prompts: list[str],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> list[list[torch.Tensor]]:
        """Generate without sampling and return log-probabilities.

        Returns:
            For each prompt, the list of log-probability tensors computed
            by `_hidden_states_to_seq_logprobs` from the hidden states
            returned by generation.
        """
        prompt_inputs = self.get_inputs(prompts,
                                        images=images,
                                        videos=videos,
                                        audios=audios)

        def _logprobs_for(inputs) -> list[torch.Tensor]:
            # One generation call per prompt, keeping the hidden states.
            output = self.model.generate(
                **self.wrap_device(inputs),
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
                output_hidden_states=True,
                return_dict_in_generate=True,
                **kwargs,
            )
            return self._hidden_states_to_seq_logprobs(output.hidden_states)

        return [_logprobs_for(inputs) for inputs in prompt_inputs]

561
    def _hidden_states_to_seq_logprobs(
        self,
        hidden_states: tuple[tuple[torch.Tensor, ...], ...],
    ) -> list[torch.Tensor]:
        """Project hidden states through the output embeddings.

        For each entry of ``hidden_states``, index 0 of its last element
        is multiplied by the transposed output embedding weight, the
        bias is added when the layer has one, and a float32
        ``log_softmax`` over the last dimension is taken.

        Returns:
            One log-probability tensor per entry of ``hidden_states``.
        """
        output_embeddings = self.model.get_output_embeddings()
        weight = output_embeddings.weight
        bias = getattr(output_embeddings, "bias", None)

        seq_logprobs: list[torch.Tensor] = []
        for step_hidden_states in hidden_states:
            final_hidden = step_hidden_states[-1][0]
            # Match the weight's device and dtype before multiplying.
            final_hidden = final_hidden.to(device=weight.device,
                                           dtype=weight.dtype)
            logits = torch.matmul(final_hidden, weight.t())
            if bias is not None:
                logits += bias.unsqueeze(0)

            seq_logprobs.append(
                F.log_softmax(logits, dim=-1, dtype=torch.float32))

        return seq_logprobs

    def _hidden_states_to_logprobs(
        self,
        hidden_states: tuple[tuple[torch.Tensor, ...], ...],
        num_logprobs: int,
    ) -> tuple[list[dict[int, float]], int]:
        """Reduce hidden states to the top ``num_logprobs`` per position.

        Returns:
            A tuple of (one ``{token_id: logprob}`` dict per entry of
            ``hidden_states``, ``len(hidden_states)``).
        """
        top_logprobs: list[dict[int, float]] = []

        all_logprobs = self._hidden_states_to_seq_logprobs(hidden_states)
        for position, position_logprobs in enumerate(all_logprobs):
            if position == 0:
                # drop prompt logprobs: keep only the last row
                position_logprobs = position_logprobs[-1, :].reshape(1, -1)

            top = position_logprobs.topk(num_logprobs)
            token_ids = top.indices[0].tolist()
            logprob_values = top.values[0].tolist()
            top_logprobs.append(dict(zip(token_ids, logprob_values)))

        return top_logprobs, len(hidden_states)

611
612
    def generate_greedy_logprobs_limit(
        self,
        prompts: list[str],
        max_tokens: int,
        num_logprobs: int,
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
        videos: Optional[PromptVideoInput] = None,
        **kwargs: Any,
    ) -> list[TokensTextLogprobs]:
        """Generate without sampling, keeping the top logprobs per token.

        Returns:
            For each prompt, a tuple of (output token ids, the text
            decoded from them by the tokenizer, one ``{token_id:
            logprob}`` dict per output token holding the top
            ``num_logprobs`` entries).
        """
        prompt_inputs = self.get_inputs(prompts,
                                        images=images,
                                        videos=videos,
                                        audios=audios)

        results = []
        for inputs in prompt_inputs:
            output = self.model.generate(
                **self.wrap_device(inputs),
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
                output_hidden_states=True,
                return_dict_in_generate=True,
                **kwargs,
            )

            token_logprobs, _ = self._hidden_states_to_logprobs(
                output.hidden_states, num_logprobs)

            # The tail of the first sequence, one id per logprob entry.
            new_token_ids = output.sequences[0][-len(token_logprobs):]
            results.append((
                new_token_ids.tolist(),
                self.tokenizer.decode(new_token_ids),
                token_logprobs,
            ))

        return results

    def generate_encoder_decoder_greedy_logprobs_limit(
        self,
        encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
        max_tokens: int,
        num_logprobs: int,
        images: Optional[PromptImageInput] = None,
        **kwargs: Any,
    ) -> list[TokensTextLogprobs]:
        '''
        Greedy logprobs generation for encoder/decoder models.

        The encoder prompt (plus its image, when given) goes through the
        processor; the decoder prompt, unless it is `None`, is tokenized
        and passed as `decoder_input_ids`.

        Returns one tuple per prompt pair: (output token ids, decoded
        text, top `num_logprobs` logprobs per output token).
        '''
        results = []
        prompt_pairs = to_enc_dec_tuple_list(encoder_decoder_prompts)

        for idx, (encoder_prompt, decoder_prompt) in enumerate(prompt_pairs):
            processor_kwargs: dict[str, Any] = {
                "text": encoder_prompt,
                "return_tensors": "pt",
            }
            if images is not None and images[idx] is not None:
                processor_kwargs["images"] = images[idx]

            encoder_inputs = self.wrap_device(
                self.processor(**processor_kwargs))

            decoder_input_ids = None
            if decoder_prompt is not None:
                tokenized = self.tokenizer(decoder_prompt,
                                           return_tensors="pt")
                decoder_input_ids = self.wrap_device(tokenized.input_ids)

            output = self.model.generate(
                decoder_input_ids=decoder_input_ids,
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
                output_hidden_states=True,
                return_dict_in_generate=True,
                **encoder_inputs,
                **kwargs,
            )

            token_logprobs, output_len = self._hidden_states_to_logprobs(
                output.decoder_hidden_states, num_logprobs)

            # The last `output_len` ids of the first sequence.
            new_token_ids = output.sequences[0][-output_len:]
            results.append((
                new_token_ids.tolist(),
                self.tokenizer.decode(new_token_ids),
                token_logprobs,
            ))

        return results

720
721
722
    def encode(self, prompts: list[str], *args,
               **kwargs) -> list[list[torch.Tensor]]:
        """Forward ``prompts`` and any extra arguments to ``model.encode``."""
        return self.model.encode(prompts, *args, **kwargs)
723

724
    def predict(self, prompts: list[list[str]]) -> torch.Tensor:
        """Call ``model.predict`` on ``prompts``, requesting a tensor."""
        return self.model.predict(prompts, convert_to_tensor=True)

727
728
729
730
    def __enter__(self):
        """Enter the context manager; returns this runner itself."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Drop the model reference, then run the global cleanup."""
        del self.model
        cleanup_dist_env_and_memory()
733

Woosuk Kwon's avatar
Woosuk Kwon committed
734

Cyrus Leung's avatar
Cyrus Leung committed
735
@pytest.fixture(scope="session")
def hf_runner():
    """Return the `HfRunner` class itself (not an instance)."""
    return HfRunner


class VllmRunner:
    """
    Test wrapper that owns a {class}`~vllm.LLM` instance in `self.model`.

    The default values of some arguments have been modified from
    {class}`~vllm.LLM` as follows:

    - `trust_remote_code`: Set to `True` instead of `False` for convenience.
    - `seed`: Set to `0` instead of `None` for test reproducibility.
    - `max_model_len`: Set to `1024` instead of `None` to reduce memory usage.
    - `block_size`: Set to `16` instead of `None` to reduce memory usage.
    - `enable_chunked_prefill`: Set to `False` instead of `None` for
      test reproducibility.
    - `enforce_eager`: Set to `False` to test CUDA graph.
    """

    def __init__(
        self,
        model_name: str,
        task: TaskOption = "auto",
        tokenizer_name: Optional[str] = None,
        tokenizer_mode: str = "auto",
        trust_remote_code: bool = True,
        seed: Optional[int] = 0,
        max_model_len: int = 1024,
        dtype: str = "auto",
        disable_log_stats: bool = True,
        tensor_parallel_size: int = 1,
        block_size: int = 16,
        enable_chunked_prefill: Optional[bool] = False,
        swap_space: int = 4,
        enforce_eager: Optional[bool] = False,
        **kwargs,
    ) -> None:
        """Construct the wrapped `LLM`.

        Every named argument is forwarded to `LLM` (`model_name` as `model`,
        `tokenizer_name` as `tokenizer`); `**kwargs` is passed through as-is.
        """
        self.model = LLM(
            model=model_name,
            task=task,
            tokenizer=tokenizer_name,
            tokenizer_mode=tokenizer_mode,
            trust_remote_code=trust_remote_code,
            dtype=dtype,
            seed=seed,
            swap_space=swap_space,
            enforce_eager=enforce_eager,
            disable_log_stats=disable_log_stats,
            tensor_parallel_size=tensor_parallel_size,
            max_model_len=max_model_len,
            block_size=block_size,
            enable_chunked_prefill=enable_chunked_prefill,
            **kwargs,
        )

    def get_inputs(
        self,
        prompts: Union[list[str], list[torch.Tensor]],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
    ) -> list[TextPrompt]:
        """Build one `TextPrompt` per entry of `prompts`.

        A `str` prompt is stored under the `"prompt"` key; any other prompt
        is stored under `"prompt_embeds"`. Per-prompt multimodal items that
        are not `None` are attached under `"image"`, `"video"` and `"audio"`.

        Raises:
            ValueError: If a non-None multimodal input does not have the
                same length as `prompts`.
        """
        if any(x is not None and len(x) != len(prompts)
               for x in [images, videos, audios]):
            raise ValueError(
                "All non-None multimodal inputs must have the same length as "
                "prompts")

        inputs = []
        for i, prompt in enumerate(prompts):
            multi_modal_data = {}
            if images is not None and (image := images[i]) is not None:
                multi_modal_data["image"] = image
            if videos is not None and (video := videos[i]) is not None:
                multi_modal_data["video"] = video
            if audios is not None and (audio := audios[i]) is not None:
                multi_modal_data["audio"] = audio

            text_prompt_kwargs = {
                ("prompt" if isinstance(prompt, str) else "prompt_embeds"):
                prompt,
                # An empty dict is passed on as None.
                "multi_modal_data": multi_modal_data or None
            }
            inputs.append(TextPrompt(**text_prompt_kwargs))

        return inputs

    def generate(
        self,
        prompts: Union[list[str], list[torch.Tensor]],
        sampling_params: SamplingParams,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> list[tuple[list[list[int]], list[str]]]:
        """Generate completions and return them prefixed with the prompt.

        Returns one `(token_ids_per_sample, text_per_sample)` tuple per
        request. Each sample's token ids are the prompt token ids followed
        by the generated ids, and each text is the prompt text (or `""`
        when the request has none) followed by the generated text.
        """
        inputs = self.get_inputs(prompts,
                                 images=images,
                                 videos=videos,
                                 audios=audios)

        req_outputs = self.model.generate(inputs,
                                          sampling_params=sampling_params,
                                          **kwargs)

        outputs: list[tuple[list[list[int]], list[str]]] = []
        for req_output in req_outputs:
            prompt_str = req_output.prompt
            prompt_ids = req_output.prompt_token_ids
            req_sample_output_ids: list[list[int]] = []
            req_sample_output_strs: list[str] = []
            for sample in req_output.outputs:
                output_str = sample.text
                output_ids = list(sample.token_ids)
                req_sample_output_ids.append(prompt_ids + output_ids)
                req_sample_output_strs.append((prompt_str or "") + output_str)
            outputs.append((req_sample_output_ids, req_sample_output_strs))
        return outputs

    @staticmethod
    def _final_steps_generate_w_logprobs(
        req_outputs: list[RequestOutput],
    ) -> list[TokensTextLogprobsPromptLogprobs]:
        """Convert request outputs into `(ids, text, logprobs, prompt_logprobs)`.

        Only the last sample of each request is reported. Every request
        must contain at least one sample.
        """
        outputs: list[TokensTextLogprobsPromptLogprobs] = []
        for req_output in req_outputs:
            assert len(req_output.outputs) > 0
            # Select the last sample explicitly instead of relying on a
            # loop variable that outlives its loop.
            sample = req_output.outputs[-1]
            outputs.append((list(sample.token_ids), sample.text,
                            sample.logprobs, req_output.prompt_logprobs))
        return outputs

    @staticmethod
    def _omit_unrequested_prompt_logprobs(
        results: list[TokensTextLogprobsPromptLogprobs],
        sampling_params: SamplingParams,
    ) -> Union[list[TokensTextLogprobs],
               list[TokensTextLogprobsPromptLogprobs]]:
        """Drop the trailing prompt-logprobs element when it was not requested.

        When `sampling_params.prompt_logprobs` is `None`, the last element
        of each tuple is removed; otherwise `results` is returned as-is.
        """
        if sampling_params.prompt_logprobs is None:
            return [x[0:-1] for x in results]
        return results

    def generate_w_logprobs(
        self,
        prompts: list[str],
        sampling_params: SamplingParams,
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
        videos: Optional[PromptVideoInput] = None,
        **kwargs: Any,
    ) -> Union[list[TokensTextLogprobs],
               list[TokensTextLogprobsPromptLogprobs]]:
        """Generate with `sampling_params` and return per-request logprobs.

        Each result is `(output_ids, output_text, output_logprobs)`, with
        the prompt logprobs appended as a fourth element only when
        `sampling_params.prompt_logprobs` is not `None`.
        """
        inputs = self.get_inputs(prompts,
                                 images=images,
                                 videos=videos,
                                 audios=audios)

        req_outputs = self.model.generate(inputs,
                                          sampling_params=sampling_params,
                                          **kwargs)

        toks_str_logsprobs_prompt_logprobs = (
            self._final_steps_generate_w_logprobs(req_outputs))
        # Omit prompt logprobs if not required by sampling params
        return self._omit_unrequested_prompt_logprobs(
            toks_str_logsprobs_prompt_logprobs, sampling_params)

    def generate_encoder_decoder_w_logprobs(
        self,
        encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
        sampling_params: SamplingParams,
    ) -> Union[list[TokensTextLogprobs],
               list[TokensTextLogprobsPromptLogprobs]]:
        """Logprobs generation for vLLM encoder/decoder models.

        `sampling_params.logprobs` must not be `None`. The result has the
        same shape as that of `generate_w_logprobs`.
        """
        assert sampling_params.logprobs is not None
        req_outputs = self.model.generate(encoder_decoder_prompts,
                                          sampling_params=sampling_params)
        toks_str_logsprobs_prompt_logprobs = (
            self._final_steps_generate_w_logprobs(req_outputs))
        # Omit prompt logprobs if not required by sampling params
        return self._omit_unrequested_prompt_logprobs(
            toks_str_logsprobs_prompt_logprobs, sampling_params)

    def generate_greedy(
        self,
        prompts: Union[list[str], list[torch.Tensor]],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> list[tuple[list[int], str]]:
        """Generate with `temperature=0.0` and return the first sample.

        Returns one `(token_ids, text)` pair per request, taken from the
        first sample of `generate`'s output (prompt included).
        """
        greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
        outputs = self.generate(prompts,
                                greedy_params,
                                images=images,
                                videos=videos,
                                audios=audios,
                                **kwargs)
        return [(output_ids[0], output_str[0])
                for output_ids, output_str in outputs]

    def generate_greedy_logprobs(
        self,
        prompts: list[str],
        max_tokens: int,
        num_logprobs: int,
        num_prompt_logprobs: Optional[int] = None,
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
        videos: Optional[PromptVideoInput] = None,
        stop_token_ids: Optional[list[int]] = None,
        stop: Optional[list[str]] = None,
        **kwargs: Any,
    ) -> Union[list[TokensTextLogprobs],
               list[TokensTextLogprobsPromptLogprobs]]:
        """Generate with `temperature=0.0` and return logprobs.

        Builds the sampling params from the arguments and delegates to
        `generate_w_logprobs`.
        """
        greedy_logprobs_params = SamplingParams(
            temperature=0.0,
            max_tokens=max_tokens,
            logprobs=num_logprobs,
            prompt_logprobs=num_prompt_logprobs,
            stop_token_ids=stop_token_ids,
            stop=stop)

        return self.generate_w_logprobs(prompts,
                                        greedy_logprobs_params,
                                        images=images,
                                        audios=audios,
                                        videos=videos,
                                        **kwargs)

    def generate_encoder_decoder_greedy_logprobs(
        self,
        encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
        max_tokens: int,
        num_logprobs: int,
        num_prompt_logprobs: Optional[int] = None,
        skip_special_tokens: bool = True,
    ) -> Union[list[TokensTextLogprobs],
               list[TokensTextLogprobsPromptLogprobs]]:
        """Greedy logprobs generation for vLLM encoder/decoder models.

        Builds `temperature=0.0` sampling params from the arguments and
        delegates to `generate_encoder_decoder_w_logprobs`.
        """
        greedy_logprobs_params = SamplingParams(
            temperature=0.0,
            max_tokens=max_tokens,
            logprobs=num_logprobs,
            prompt_logprobs=(num_prompt_logprobs),
            skip_special_tokens=skip_special_tokens,
        )

        return self.generate_encoder_decoder_w_logprobs(
            encoder_decoder_prompts, greedy_logprobs_params)

    def generate_beam_search(
        self,
        prompts: list[str],
        beam_width: int,
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
    ) -> list[tuple[list[list[int]], list[str]]]:
        """Run `self.model.beam_search` over `prompts`.

        Returns one `(token_ids_per_sequence, text_per_sequence)` tuple per
        output, collected from that output's `sequences`.
        """
        inputs = self.get_inputs(prompts,
                                 images=images,
                                 videos=videos,
                                 audios=audios)

        outputs = self.model.beam_search(
            inputs,
            BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens))
        returned_outputs = []
        for output in outputs:
            token_ids = [x.tokens for x in output.sequences]
            texts = [x.text for x in output.sequences]
            returned_outputs.append((token_ids, texts))
        return returned_outputs

    def classify(self, prompts: list[str]) -> list[list[float]]:
        """Return `outputs.probs` of each `self.model.classify` result."""
        req_outputs = self.model.classify(prompts)
        return [req_output.outputs.probs for req_output in req_outputs]

    def encode(self,
               prompts: list[str],
               images: Optional[PromptImageInput] = None,
               videos: Optional[PromptVideoInput] = None,
               audios: Optional[PromptAudioInput] = None,
               *args,
               **kwargs) -> list[list[float]]:
        """Return `outputs.embedding` of each `self.model.embed` result.

        Extra positional and keyword arguments are forwarded to
        `self.model.embed`.
        """
        inputs = self.get_inputs(prompts,
                                 images=images,
                                 videos=videos,
                                 audios=audios)

        req_outputs = self.model.embed(inputs, *args, **kwargs)
        return [req_output.outputs.embedding for req_output in req_outputs]

    def score(
        self,
        text_1: Union[str, list[str]],
        text_2: Union[str, list[str]],
    ) -> list[float]:
        """Return `outputs.score` of each `self.model.score` result."""
        req_outputs = self.model.score(text_1, text_2)
        return [req_output.outputs.score for req_output in req_outputs]

    def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
        """Forward `func` to the engine's model executor `apply_model`."""
        executor = self.model.llm_engine.model_executor
        return executor.apply_model(func)

    def __enter__(self):
        """Enter the context manager; the runner itself is the context value."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Drop the model reference, then call `cleanup_dist_env_and_memory`."""
        del self.model
        cleanup_dist_env_and_memory()
1048

Woosuk Kwon's avatar
Woosuk Kwon committed
1049

1050
@pytest.fixture(scope="session")
def vllm_runner():
    """Return the `VllmRunner` class itself (not an instance)."""
    return VllmRunner
1053
1054


1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
@pytest.fixture()
def temporary_enable_log_propagate():
    """Enable propagation on the "vllm" logger while the fixture is active.

    On teardown the logger's previous `propagate` value is restored, rather
    than being forced to `False`.
    """
    import logging

    # Named distinctly so it does not shadow the module-level `logger`.
    vllm_logger = logging.getLogger("vllm")
    previous_propagate = vllm_logger.propagate
    vllm_logger.propagate = True
    try:
        yield
    finally:
        vllm_logger.propagate = previous_propagate


@pytest.fixture()
def caplog_vllm(temporary_enable_log_propagate, caplog):
    """Yield `caplog` while `temporary_enable_log_propagate` is active."""
    # To capture vllm logs, propagate=True must be enabled temporarily
    # because caplog depends on logs propagated to the root logger.
    yield caplog
1069
1070
1071
1072
1073
1074
1075


@pytest.fixture(scope="session")
def num_gpus_available():
    """Get number of GPUs without initializing the CUDA context
    in current process."""

    return cuda_device_count_stateless()
1077
1078
1079


# Directories under the system temp dir that hold the patched dummy models
# used by the `dummy_*_path` fixtures.
temp_dir = tempfile.gettempdir()
(
    _dummy_opt_path,
    _dummy_llava_path,
    _dummy_gemma2_embedding_path,
) = (os.path.join(temp_dir, dir_name)
     for dir_name in ("dummy_opt", "dummy_llava", "dummy_gemma2_embedding"))
1083
1084
1085
1086


@pytest.fixture
def dummy_opt_path():
    """Return a local copy of `facebook/opt-125m` with a patched config.

    The snapshot is downloaded into `_dummy_opt_path` (skipping files that
    match the ignore patterns) when its `config.json` is missing, and the
    config's `architectures` entry is set to `["MyOPTForCausalLM"]`.
    """
    architectures = ["MyOPTForCausalLM"]
    json_path = os.path.join(_dummy_opt_path, "config.json")
    # Key the download on config.json rather than on the directory, so a
    # directory left behind by an interrupted run is repaired.
    if not os.path.exists(json_path):
        snapshot_download(repo_id="facebook/opt-125m",
                          local_dir=_dummy_opt_path,
                          ignore_patterns=[
                              "*.bin", "*.bin.index.json", "*.pt", "*.h5",
                              "*.msgpack"
                          ])
        assert os.path.exists(json_path)
    with open(json_path, encoding="utf-8") as f:
        config = json.load(f)
    # Rewrite only when needed, so repeated use does not touch the file.
    if config.get("architectures") != architectures:
        config["architectures"] = architectures
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(config, f)
    return _dummy_opt_path


@pytest.fixture
def dummy_llava_path():
    """Return a local copy of `llava-hf/llava-1.5-7b-hf` with a patched config.

    The snapshot is downloaded into `_dummy_llava_path` (skipping files that
    match the ignore patterns) when its `config.json` is missing, and the
    config's `architectures` entry is set to `["MyLlava"]`.
    """
    architectures = ["MyLlava"]
    json_path = os.path.join(_dummy_llava_path, "config.json")
    # Key the download on config.json rather than on the directory, so a
    # directory left behind by an interrupted run is repaired.
    if not os.path.exists(json_path):
        snapshot_download(repo_id="llava-hf/llava-1.5-7b-hf",
                          local_dir=_dummy_llava_path,
                          ignore_patterns=[
                              "*.bin", "*.bin.index.json", "*.pt", "*.h5",
                              "*.msgpack"
                          ])
        assert os.path.exists(json_path)
    with open(json_path, encoding="utf-8") as f:
        config = json.load(f)
    # Rewrite only when needed, so repeated use does not touch the file.
    if config.get("architectures") != architectures:
        config["architectures"] = architectures
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(config, f)
    return _dummy_llava_path
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133


@pytest.fixture
def dummy_gemma2_embedding_path():
    """Return a local copy of `BAAI/bge-multilingual-gemma2`, config patched.

    The snapshot is downloaded into `_dummy_gemma2_embedding_path` (skipping
    files that match the ignore patterns) when its `config.json` is missing,
    and the config's `architectures` entry is set to
    `["MyGemma2Embedding"]`.
    """
    architectures = ["MyGemma2Embedding"]
    json_path = os.path.join(_dummy_gemma2_embedding_path, "config.json")
    # Key the download on config.json rather than on the directory, so a
    # directory left behind by an interrupted run is repaired.
    if not os.path.exists(json_path):
        snapshot_download(repo_id="BAAI/bge-multilingual-gemma2",
                          local_dir=_dummy_gemma2_embedding_path,
                          ignore_patterns=[
                              "*.bin", "*.bin.index.json", "*.pt", "*.h5",
                              "*.msgpack"
                          ])
        assert os.path.exists(json_path)
    with open(json_path, encoding="utf-8") as f:
        config = json.load(f)
    # Rewrite only when needed, so repeated use does not touch the file.
    if config.get("architectures") != architectures:
        config["architectures"] = architectures
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(config, f)
    return _dummy_gemma2_embedding_path
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157


def pytest_addoption(parser):
    """Register the `--optional` command-line flag.

    The flag allows running tests that are marked with
    `@pytest.mark.optional`.
    """
    optional_flag_settings = {
        "action": "store_true",
        "default": False,
        "help": "run optional test",
    }
    parser.addoption("--optional", **optional_flag_settings)


def pytest_collection_modifyitems(config, items):
    """Mark tests carrying the `optional` keyword as skipped.

    Nothing is changed when `--optional` was given on the command line.
    """
    if config.getoption("--optional"):
        # --optional given in cli: do not skip optional tests
        return
    skip_optional = pytest.mark.skip(reason="need --optional option to run")
    for item in items:
        if "optional" in item.keywords:
            item.add_marker(skip_optional)
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170


@pytest.fixture(scope="session")
def cli_config_file():
    """Return the path to the CLI config file."""
    config_dir = os.path.join(_TEST_DIR, "config")
    return os.path.join(config_dir, "test_config.yaml")


@pytest.fixture(scope="session")
def cli_config_file_with_model():
    """Return the path to the CLI config file with model."""
    config_dir = os.path.join(_TEST_DIR, "config")
    return os.path.join(config_dir, "test_config_with_model.yaml")