"vllm/vscode:/vscode.git/clone" did not exist on "b95ee898fe1ccb77632ed96fba7c517f6e6be931"
conftest.py 39.1 KB
Newer Older
1
2
# SPDX-License-Identifier: Apache-2.0

3
import json
4
import os
5
import tempfile
6
from collections import UserList
7
from enum import Enum
8
from typing import Any, Callable, Optional, TypedDict, TypeVar, Union
Woosuk Kwon's avatar
Woosuk Kwon committed
9

10
import numpy as np
Woosuk Kwon's avatar
Woosuk Kwon committed
11
12
import pytest
import torch
13
import torch.nn as nn
14
import torch.nn.functional as F
15
from huggingface_hub import snapshot_download
16
from PIL import Image
17
18
from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
                          BatchEncoding, BatchFeature)
19
from transformers.models.auto.auto_factory import _BaseAutoModelClass
Woosuk Kwon's avatar
Woosuk Kwon committed
20

21
22
from tests.models.utils import (TokensTextLogprobs,
                                TokensTextLogprobsPromptLogprobs)
Woosuk Kwon's avatar
Woosuk Kwon committed
23
from vllm import LLM, SamplingParams
24
from vllm.assets.audio import AudioAsset
25
from vllm.assets.image import ImageAsset
26
from vllm.assets.video import VideoAsset
27
from vllm.config import TaskOption, _get_and_verify_dtype
28
from vllm.connections import global_http_connection
29
from vllm.distributed import (cleanup_dist_env_and_memory,
30
31
                              init_distributed_environment,
                              initialize_model_parallel)
32
from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
33
                         to_enc_dec_tuple_list, zip_enc_dec_prompts)
34
from vllm.logger import init_logger
35
from vllm.outputs import RequestOutput
36
from vllm.sampling_params import BeamSearchParams
37
from vllm.utils import cuda_device_count_stateless
38

39
logger = init_logger(__name__)
Woosuk Kwon's avatar
Woosuk Kwon committed
40

41
42
43
_TEST_DIR = os.path.dirname(__file__)
_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
44
_SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")
45

Cyrus Leung's avatar
Cyrus Leung committed
46
_M = TypeVar("_M")
47

48
_PromptMultiModalInput = Union[list[_M], list[list[_M]]]
Cyrus Leung's avatar
Cyrus Leung committed
49
50

PromptImageInput = _PromptMultiModalInput[Image.Image]
51
PromptAudioInput = _PromptMultiModalInput[tuple[np.ndarray, int]]
Cyrus Leung's avatar
Cyrus Leung committed
52
PromptVideoInput = _PromptMultiModalInput[np.ndarray]
53

54

55
def _read_prompts(filename: str) -> list[str]:
    """Read a prompt file and return its lines.

    The file is decoded as UTF-8 explicitly so that the result does not
    depend on the default locale encoding of the machine running the tests.

    Args:
        filename: Path of the text file to read.

    Returns:
        One string per line of the file, as produced by ``readlines()``
        (line endings are kept).
    """
    with open(filename, encoding="utf-8") as f:
        return f.readlines()
Woosuk Kwon's avatar
Woosuk Kwon committed
59
60


61
62
63
class _ImageAssetPrompts(TypedDict):
    """Prompt text for each test image, keyed by the image asset name."""
    stop_sign: str
    cherry_blossom: str
64
65


66
67
class _ImageAssetsBase(UserList[ImageAsset]):
    """List-like container whose items are :class:`ImageAsset` objects."""
    pass
68

69
70

class _ImageAssets(_ImageAssetsBase):
    """The image assets used by the tests, stored in a fixed order."""

    def __init__(self) -> None:
        asset_names = ("stop_sign", "cherry_blossom")
        super().__init__([ImageAsset(name) for name in asset_names])

    def prompts(self, prompts: _ImageAssetPrompts) -> list[str]:
        """
        Turn a per-image prompt mapping into a list of prompts.

        The returned list follows the same order as the assets yielded
        when iterating through this object.
        """
        prompt_keys = ("stop_sign", "cherry_blossom")
        return [prompts[key] for key in prompt_keys]
86
87


88
89
90
91
class _VideoAssetPrompts(TypedDict):
    """Prompt text for each test video, keyed by the video name."""
    sample_demo_1: str


92
93
class _VideoAssetsBase(UserList[VideoAsset]):
    """List-like container whose items are :class:`VideoAsset` objects."""
    pass
94
95
96
97
98
99
100
101
102


class _VideoAssets(_VideoAssetsBase):
    """The video assets used by the tests (currently a single clip)."""

    def __init__(self) -> None:
        clips = [VideoAsset("sample_demo_1.mp4")]
        super().__init__(clips)

    def prompts(self, prompts: _VideoAssetPrompts) -> list[str]:
        """Return the prompts as a list, one entry per video asset."""
        demo_prompt = prompts["sample_demo_1"]
        return [demo_prompt]


107
108
109
110
111
112
113
114
115
116
117
118
119
class _AudioAssetsBase(UserList[AudioAsset]):
    """List-like container whose items are :class:`AudioAsset` objects."""
    pass


class _AudioAssets(_AudioAssetsBase):
    """The audio assets used by the tests, stored in a fixed order."""

    def __init__(self) -> None:
        asset_names = ("mary_had_lamb", "winning_call")
        super().__init__([AudioAsset(name) for name in asset_names])


120
121
IMAGE_ASSETS = _ImageAssets()
"""Singleton instance of :class:`_ImageAssets`."""
122
123
VIDEO_ASSETS = _VideoAssets()
"""Singleton instance of :class:`_VideoAssets`."""
124
125
AUDIO_ASSETS = _AudioAssets()
"""Singleton instance of :class:`_AudioAssets`."""
126
127


128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
@pytest.fixture(scope="function", autouse=True)
def cleanup_VLLM_USE_V1(monkeypatch):
    """
    Keep changes to "VLLM_USE_V1" from leaking between tests.

    The V1 oracle sets "VLLM_USE_V1" during loading, so each invocation
    of a test may change the environment variable.

    Once "VLLM_USE_V1" has been touched through monkeypatch, any changes
    made to it by vLLM during the test run are cleaned up afterwards.

    This fixture is autouse, so it applies to every test.
    """
    if "VLLM_USE_V1" in os.environ:
        # The variable is already set; this fixture does nothing then.
        return

    # Set-then-delete registers the variable with monkeypatch while
    # leaving it unset for the test itself. monkeypatch will then clean
    # up VLLM_USE_V1 on exit if vLLM modifies envs.VLLM_USE_V1.
    monkeypatch.setenv("VLLM_USE_V1", "")
    monkeypatch.delenv("VLLM_USE_V1")


Joe Runde's avatar
Joe Runde committed
148
@pytest.fixture(params=[True, False])
def run_with_both_engines(request, monkeypatch):
    """
    Run the requesting test twice: once with V1 and once without.

    Tests marked with `skip_v1` are skipped for the V1 run, so they are
    only executed without V1.
    """
    use_v1 = request.param
    skip_v1 = request.node.get_closest_marker("skip_v1")

    if use_v1 and skip_v1:
        pytest.skip("Skipping test on vllm V1")

    # Select the engine for this parametrized run via the environment.
    monkeypatch.setenv('VLLM_USE_V1', '1' if use_v1 else '0')

    yield
Joe Runde's avatar
Joe Runde committed
163
164


165
166
167
168
169
170
171
@pytest.fixture(autouse=True)
def init_test_http_connection():
    """Autouse fixture that disables client reuse on the global HTTP
    connection before every test."""
    # pytest_asyncio may use a different event loop per test
    # so we need to make sure the async client is created anew
    global_http_connection.reuse_client = False


172
173
174
175
176
177
178
179
180
181
182
183
@pytest.fixture
def dist_init():
    """
    Set up a single-process distributed environment for one test.

    Initializes the distributed environment with ``world_size=1`` on the
    "nccl" backend, using a temporary file as the rendezvous point, then
    initializes model parallelism with sizes ``(1, 1)``. After the test,
    the distributed state is torn down with
    ``cleanup_dist_env_and_memory()`` and the temporary file is removed.
    """
    # mkstemp() returns an *open* descriptor together with the path. Only
    # the path is needed, so close the descriptor instead of leaking it.
    fd, temp_file = tempfile.mkstemp()
    os.close(fd)
    try:
        init_distributed_environment(
            world_size=1,
            rank=0,
            distributed_init_method=f"file://{temp_file}",
            local_rank=0,
            backend="nccl",
        )
        initialize_model_parallel(1, 1)
        yield
        cleanup_dist_env_and_memory()
    finally:
        # Best-effort removal: the file may already be gone by now.
        try:
            os.remove(temp_file)
        except FileNotFoundError:
            pass
185
186


187
@pytest.fixture()
def should_do_global_cleanup_after_test(request) -> bool:
    """Tell whether the global cleanup should run after the current test.

    Subdirectories can override this fixture to skip global cleanup. This
    can provide a ~10x speedup for non-GPU unit tests since they don't need
    to initialize torch.

    The default is ``True`` unless the test carries the
    ``skip_global_cleanup`` marker.
    """
    skip_marker = request.node.get_closest_marker("skip_global_cleanup")
    return not skip_marker
195
196


197
@pytest.fixture(autouse=True)
def cleanup_fixture(should_do_global_cleanup_after_test: bool):
    """Autouse fixture that runs the global cleanup after each test,
    unless ``should_do_global_cleanup_after_test`` is false."""
    yield
    if not should_do_global_cleanup_after_test:
        return
    cleanup_dist_env_and_memory()
202
203


204
205
206
207
208
209
@pytest.fixture(autouse=True)
def dynamo_reset():
    """Autouse fixture that calls ``torch._dynamo.reset()`` after each
    test."""
    yield
    torch._dynamo.reset()


Woosuk Kwon's avatar
Woosuk Kwon committed
210
@pytest.fixture
def example_prompts() -> list[str]:
    """Return every line of the files listed in ``_TEST_PROMPTS``, in
    file order."""
    return [
        line for filename in _TEST_PROMPTS
        for line in _read_prompts(filename)
    ]


218
219
220
221
222
223
@pytest.fixture
def example_system_message() -> str:
    """Return the full text of the example system message file.

    The file at ``_SYS_MSG`` is decoded as UTF-8 explicitly so the result
    does not depend on the default locale encoding of the test machine.
    """
    with open(_SYS_MSG, encoding="utf-8") as f:
        return f.read()


224
225
226
227
228
229
230
class DecoderPromptType(Enum):
    """For encoder/decoder models only."""
    # The notes below describe how `example_encoder_decoder_prompts`
    # fills in the decoder prompt for each member.
    # CUSTOM: an explicit decoder prompt (the reversed encoder prompts).
    CUSTOM = 1
    # NONE: the decoder prompt is None.
    NONE = 2
    # EMPTY_STR: the decoder prompt is the empty string.
    EMPTY_STR = 3


231
@pytest.fixture
def example_encoder_decoder_prompts(
) -> dict[DecoderPromptType, list[ExplicitEncoderDecoderPrompt]]:
    '''
    Build encoder/decoder prompt pairs for every decoder prompt type.

    The encoder prompts are the lines of the files in ``_TEST_PROMPTS``.
    For each decoder prompt type they are zipped, index by index, with a
    decoder prompt list of the same length.

    Returns:

    A dict keyed by :class:`DecoderPromptType`:

    * ``NONE``: every decoder prompt is ``None``
    * ``EMPTY_STR``: every decoder prompt is the empty string
    * ``CUSTOM``: decoder prompts are the encoder prompts in reverse order
    '''
    encoder_prompts = [
        line for filename in _TEST_PROMPTS
        for line in _read_prompts(filename)
    ]
    num_prompts = len(encoder_prompts)

    decoder_prompts_by_type = {
        DecoderPromptType.NONE: [None] * num_prompts,
        DecoderPromptType.EMPTY_STR: [""] * num_prompts,
        DecoderPromptType.CUSTOM: encoder_prompts[::-1],
    }

    return {
        prompt_type: zip_enc_dec_prompts(encoder_prompts, decoder_prompts)
        for prompt_type, decoder_prompts in decoder_prompts_by_type.items()
    }


264
@pytest.fixture
def example_long_prompts() -> list[str]:
    """Return every line of the files listed in ``_LONG_PROMPTS``, in
    file order."""
    return [
        line for filename in _LONG_PROMPTS
        for line in _read_prompts(filename)
    ]
Woosuk Kwon's avatar
Woosuk Kwon committed
270
271


272
273
274
275
276
@pytest.fixture(scope="session")
def image_assets() -> _ImageAssets:
    """Session-scoped fixture returning the shared ``IMAGE_ASSETS``."""
    return IMAGE_ASSETS


277
278
279
280
281
@pytest.fixture(scope="session")
def video_assets() -> _VideoAssets:
    """Session-scoped fixture returning the shared ``VIDEO_ASSETS``."""
    return VIDEO_ASSETS


282
283
284
285
286
@pytest.fixture(scope="session")
def audio_assets() -> _AudioAssets:
    """Session-scoped fixture returning the shared ``AUDIO_ASSETS``."""
    return AUDIO_ASSETS


287
_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict)
288
_R = TypeVar("_R")
289

Woosuk Kwon's avatar
Woosuk Kwon committed
290
291
292

class HfRunner:

293
    def get_default_device(self):
        """Return "cpu" when the current platform is CPU, else "cuda"."""
        # Imported here rather than at module level.
        from vllm.platforms import current_platform

        if current_platform.is_cpu():
            return "cpu"
        return "cuda"
297
298

    def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
        """Move ``x`` to a device, recursing into plain dicts.

        Args:
            x: The object to move. ``None`` and bools are returned as is;
                a ``dict`` is rebuilt with every value wrapped; anything
                else is moved with its ``.to()`` method.
            device: Target device; defaults to ``self.device``.

        Returns:
            ``x`` itself when nothing needs to move (its ``device.type``
            already equals the target), otherwise the moved object.
        """
        if x is None or isinstance(x, bool):
            return x

        target = self.device if device is None else device

        if isinstance(x, dict):
            return {
                key: self.wrap_device(value, target)
                for key, value in x.items()
            }

        already_placed = hasattr(x, "device") and x.device.type == target
        return x if already_placed else x.to(target)
312

Woosuk Kwon's avatar
Woosuk Kwon committed
313
314
315
    def __init__(
        self,
        model_name: str,
        dtype: str = "auto",
        *,
        model_kwargs: Optional[dict[str, Any]] = None,
        is_sentence_transformer: bool = False,
        is_cross_encoder: bool = False,
        skip_tokenizer_init: bool = False,
        auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
    ) -> None:
        """Load the model, tokenizer and processor for ``model_name``.

        Args:
            model_name: Name or path passed to every ``from_pretrained``
                call and to the sentence-transformers constructors.
            dtype: Requested dtype; resolved against the model config with
                ``_get_and_verify_dtype``.
            model_kwargs: Extra keyword arguments for model construction.
                ``torch_dtype`` is filled in with the resolved dtype when
                the caller did not provide it. The dict passed in is not
                modified.
            is_sentence_transformer: Load the model as a
                ``SentenceTransformer``.
            is_cross_encoder: Load the model as a ``CrossEncoder`` (only
                checked when ``is_sentence_transformer`` is false).
            skip_tokenizer_init: Do not load an ``AutoTokenizer``; use the
                tokenizer attached to the processor instead.
            auto_cls: Auto model class used when neither of the two flags
                above is set.
        """
        self.model_name = model_name

        self.config = AutoConfig.from_pretrained(
            model_name,
            trust_remote_code=True,
        )
        self.device = self.get_default_device()
        self.dtype = torch_dtype = _get_and_verify_dtype(self.config, dtype)

        # Work on a copy so that setdefault() below never writes into a
        # dict owned (and possibly reused) by the caller.
        model_kwargs = dict(model_kwargs) if model_kwargs is not None else {}
        model_kwargs.setdefault("torch_dtype", torch_dtype)

        if is_sentence_transformer:
            # Lazy init required for AMD CI
            from sentence_transformers import SentenceTransformer

            self.model = SentenceTransformer(
                model_name,
                device=self.device,
                model_kwargs=model_kwargs,
                trust_remote_code=True,
            )
        elif is_cross_encoder:
            # Lazy init required for AMD CI
            from sentence_transformers import CrossEncoder

            self.model = CrossEncoder(
                model_name,
                device=self.device,
                automodel_args=model_kwargs,
                trust_remote_code=True,
            )
        else:
            model = auto_cls.from_pretrained(
                model_name,
                trust_remote_code=True,
                **model_kwargs,
            )

            # Only move the model when it is not bitsandbytes-quantized
            # and all of its parameters sit on fewer than two devices.
            if (getattr(model, "quantization_method", None) != "bitsandbytes"
                    and len({p.device
                             for p in model.parameters()}) < 2):
                model = model.to(self.device)

            self.model = model

        if not skip_tokenizer_init:
            self.tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                torch_dtype=torch_dtype,
                trust_remote_code=True,
            )

        # don't put this import at the top level
        # it will call torch.cuda.device_count()
        from transformers import AutoProcessor  # noqa: F401
        self.processor = AutoProcessor.from_pretrained(
            model_name,
            torch_dtype=torch_dtype,
            trust_remote_code=True,
        )
        if skip_tokenizer_init:
            self.tokenizer = self.processor.tokenizer
Woosuk Kwon's avatar
Woosuk Kwon committed
387

388
    def get_inputs(
        self,
        prompts: list[str],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
    ) -> list[Union[BatchFeature, BatchEncoding]]:
        """Run the processor on each prompt, one call per prompt.

        Args:
            prompts: Text prompts.
            images: Optional per-prompt images; must match ``prompts`` in
                length. A ``None`` entry leaves that prompt without images.
            videos: Optional per-prompt videos, same rules as ``images``.
            audios: Optional per-prompt audio, same rules as ``images``.
                An entry of length 2 is unpacked as (audio, sampling rate).

        Returns:
            The processor output for each prompt, in prompt order.
            ``BatchFeature`` outputs are converted to ``self.dtype``.
        """
        for modality in (images, videos, audios):
            if modality is not None:
                assert len(prompts) == len(modality)

        all_inputs: list[Union[BatchFeature, BatchEncoding]] = []
        for idx, text in enumerate(prompts):
            call_kwargs: dict[str, Any] = {
                "text": text,
                "return_tensors": "pt",
            }

            image = None if images is None else images[idx]
            if image is not None:
                call_kwargs["images"] = image

            video = None if videos is None else videos[idx]
            if video is not None:
                call_kwargs["videos"] = video

            audio_item = None if audios is None else audios[idx]
            if audio_item is not None:
                # HACK - not all processors take sampling_rate; we should
                # clean this up in the future.
                if len(audio_item) == 2:
                    waveform, rate = audio_item
                    call_kwargs["audio"] = waveform
                    call_kwargs["sampling_rate"] = rate
                else:
                    call_kwargs["audio"] = audio_item

            encoded = self.processor(**call_kwargs)
            if isinstance(encoded, BatchFeature):
                encoded = encoded.to(dtype=self.dtype)

            all_inputs.append(encoded)

        return all_inputs

432
    def classify(self, prompts: list[str]) -> list[list[float]]:
        """Run the model on each prompt and return softmax probabilities.

        Args:
            prompts: Text prompts; each one is processed and run through
                the model separately.

        Returns:
            One entry per prompt: the softmax (over the last dimension) of
            the model's output logits, taken at index 0 and converted to a
            plain Python list.
            NOTE(review): assumes logits are (batch, num_classes), so each
            entry is a flat list of floats - confirm against callers.
        """
        all_inputs = self.get_inputs(prompts)
        outputs: list[list[float]] = []
        for inputs in all_inputs:
            output = self.model(**self.wrap_device(inputs))
            # The final logits are normalized here, so what is returned
            # are probabilities, not raw logits.
            probs = output.logits.softmax(dim=-1)[0].tolist()
            outputs.append(probs)

        return outputs

443
444
    def generate(
        self,
        prompts: list[str],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> list[tuple[list[list[int]], list[str]]]:
        """Generate with the model, one ``generate`` call per prompt.

        Args:
            prompts: Text prompts.
            images: Optional per-prompt images (see ``get_inputs``).
            videos: Optional per-prompt videos (see ``get_inputs``).
            audios: Optional per-prompt audio (see ``get_inputs``).
            **kwargs: Forwarded to ``self.model.generate``.

        Returns:
            For each prompt, a tuple of (generated token id lists,
            decoded strings). Decoding skips special tokens and does not
            clean up tokenization spaces.
        """
        processed = self.get_inputs(prompts,
                                    images=images,
                                    videos=videos,
                                    audios=audios)

        results: list[tuple[list[list[int]], list[str]]] = []
        for model_inputs in processed:
            generated = self.model.generate(
                **self.wrap_device(model_inputs),
                use_cache=True,
                **kwargs,
            )
            texts = self.processor.batch_decode(
                generated,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )
            results.append((generated.cpu().tolist(), texts))
        return results

    def generate_greedy(
        self,
        prompts: list[str],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> list[tuple[list[int], str]]:
        """Greedy generation (``do_sample=False``) for each prompt.

        Args:
            prompts: Text prompts.
            max_tokens: Passed to ``generate`` as ``max_new_tokens``.
            images: Optional per-prompt images.
            videos: Optional per-prompt videos.
            audios: Optional per-prompt audio.
            **kwargs: Forwarded to ``self.generate``.

        Returns:
            For each prompt, the first generated sequence as a tuple of
            (token ids, decoded text).
        """
        per_prompt = self.generate(prompts,
                                   do_sample=False,
                                   max_new_tokens=max_tokens,
                                   images=images,
                                   videos=videos,
                                   audios=audios,
                                   **kwargs)

        first_sequences = []
        for ids_per_seq, text_per_seq in per_prompt:
            first_sequences.append((ids_per_seq[0], text_per_seq[0]))
        return first_sequences
491
492
493

    def generate_beam_search(
        self,
        prompts: list[str],
        beam_width: int,
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
    ) -> list[tuple[list[list[int]], list[str]]]:
        """Beam-search generation for each prompt, with padding removed.

        Args:
            prompts: Text prompts.
            beam_width: Used both as ``num_beams`` and as
                ``num_return_sequences``.
            max_tokens: Passed to ``generate`` as ``max_new_tokens``.
            images: Optional per-prompt images.
            videos: Optional per-prompt videos.
            audios: Optional per-prompt audio.

        Returns:
            For each prompt, a tuple of (token id lists, decoded strings).
            Every token equal to the tokenizer's ``pad_token_id`` is
            dropped from the token id lists.
        """
        outputs = self.generate(prompts,
                                do_sample=False,
                                max_new_tokens=max_tokens,
                                num_beams=beam_width,
                                num_return_sequences=beam_width,
                                images=images,
                                videos=videos,
                                audios=audios)

        # Look the pad id up once instead of once per token.
        pad_token_id = self.tokenizer.pad_token_id
        return [(
            [[tok for tok in seq if tok != pad_token_id]
             for seq in output_ids],
            output_str,
        ) for output_ids, output_str in outputs]
Woosuk Kwon's avatar
Woosuk Kwon committed
519

520
521
    def generate_greedy_logprobs(
        self,
        prompts: list[str],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> list[list[torch.Tensor]]:
        """Greedy generation that returns full log-probability tensors.

        Args:
            prompts: Text prompts.
            max_tokens: Passed to ``generate`` as ``max_new_tokens``.
            images: Optional per-prompt images.
            videos: Optional per-prompt videos.
            audios: Optional per-prompt audio.
            **kwargs: Forwarded to ``self.model.generate``.

        Returns:
            For each prompt, the list of log-probability tensors computed
            by ``_hidden_states_to_seq_logprobs`` from the hidden states
            returned by ``generate``.
        """
        processed = self.get_inputs(prompts,
                                    images=images,
                                    videos=videos,
                                    audios=audios)

        per_prompt_logprobs: list[list[torch.Tensor]] = []
        for model_inputs in processed:
            generation = self.model.generate(
                **self.wrap_device(model_inputs),
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
                output_hidden_states=True,
                return_dict_in_generate=True,
                **kwargs,
            )
            per_prompt_logprobs.append(
                self._hidden_states_to_seq_logprobs(generation.hidden_states))
        return per_prompt_logprobs

550
    def _hidden_states_to_seq_logprobs(
        self,
        hidden_states: tuple[tuple[torch.Tensor, ...], ...],
    ) -> list[torch.Tensor]:
        """Project hidden states through the output embeddings.

        For every entry of ``hidden_states`` the tensor at ``[-1][0]`` is
        multiplied by the transposed output embedding weight (plus the
        bias, when the embedding layer has one) and turned into float32
        log-probabilities.
        NOTE(review): ``[-1][0]`` presumably selects the last layer and the
        first batch item of each generation step - confirm.

        Args:
            hidden_states: Hidden states as returned by ``generate``.

        Returns:
            One log-probability tensor per entry of ``hidden_states``.
        """
        lm_head = self.model.get_output_embeddings()

        seq_logprobs: list[torch.Tensor] = []
        for step_hidden_states in hidden_states:
            final_hidden = step_hidden_states[-1][0]
            # Match the embedding weight's device and dtype before matmul.
            final_hidden = final_hidden.to(
                device=lm_head.weight.device,
                dtype=lm_head.weight.dtype,
            )
            logits = torch.matmul(final_hidden, lm_head.weight.t())

            bias = getattr(lm_head, "bias", None)
            if bias is not None:
                logits += bias.unsqueeze(0)

            seq_logprobs.append(
                F.log_softmax(logits, dim=-1, dtype=torch.float32))

        return seq_logprobs

    def _hidden_states_to_logprobs(
        self,
        hidden_states: tuple[tuple[torch.Tensor, ...], ...],
        num_logprobs: int,
    ) -> tuple[list[dict[int, float]], int]:
        """Reduce hidden states to top-k log-probabilities per position.

        Args:
            hidden_states: Hidden states as returned by ``generate``.
            num_logprobs: How many top entries to keep per position.

        Returns:
            A tuple of (list of ``{token_id: logprob}`` dicts, one per
            entry of ``hidden_states``; ``len(hidden_states)``).
        """
        per_step_logprobs = self._hidden_states_to_seq_logprobs(hidden_states)
        output_len = len(hidden_states)

        # convert to dict
        top_logprobs_per_step: list[dict[int, float]] = []
        for step, step_logprobs in enumerate(per_step_logprobs):
            if step == 0:
                # drop prompt logprobs: keep only the last row
                step_logprobs = step_logprobs[-1, :].reshape(1, -1)
            best = step_logprobs.topk(num_logprobs)

            top_logprobs_per_step.append({
                token_id.item(): logprob.item()
                for token_id, logprob in zip(best.indices[0], best.values[0])
            })

        return top_logprobs_per_step, output_len

600
601
    def generate_greedy_logprobs_limit(
        self,
        prompts: list[str],
        max_tokens: int,
        num_logprobs: int,
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
        videos: Optional[PromptVideoInput] = None,
        **kwargs: Any,
    ) -> list[TokensTextLogprobs]:
        """Greedy generation returning tokens, text and top-k logprobs.

        Args:
            prompts: Text prompts.
            max_tokens: Passed to ``generate`` as ``max_new_tokens``.
            num_logprobs: How many top log-probabilities to keep per token.
            images: Optional per-prompt images.
            audios: Optional per-prompt audio.
            videos: Optional per-prompt videos.
            **kwargs: Forwarded to ``self.model.generate``.

        Returns:
            For each prompt, a tuple of (output token ids, decoded output
            text, list of ``{token_id: logprob}`` dicts).
        """
        processed = self.get_inputs(prompts,
                                    images=images,
                                    videos=videos,
                                    audios=audios)

        token_ids_per_prompt: list[list[int]] = []
        texts_per_prompt: list[str] = []
        logprobs_per_prompt: list[list[dict[int, float]]] = []

        for model_inputs in processed:
            generation = self.model.generate(
                **self.wrap_device(model_inputs),
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
                output_hidden_states=True,
                return_dict_in_generate=True,
                **kwargs,
            )

            top_logprobs, _ = self._hidden_states_to_logprobs(
                generation.hidden_states, num_logprobs)
            logprobs_per_prompt.append(top_logprobs)

            # Keep only the trailing tokens that have a logprob entry.
            num_output_tokens = len(top_logprobs)
            new_token_ids = generation.sequences[0][-num_output_tokens:]
            token_ids_per_prompt.append(new_token_ids.tolist())
            texts_per_prompt.append(self.tokenizer.decode(new_token_ids))

        return list(
            zip(token_ids_per_prompt, texts_per_prompt, logprobs_per_prompt))

    def generate_encoder_decoder_greedy_logprobs_limit(
        self,
        encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
        max_tokens: int,
        num_logprobs: int,
        images: Optional[PromptImageInput] = None,
        **kwargs: Any,
    ) -> list[TokensTextLogprobs]:
        '''
        Greedy logprobs generation for encoder/decoder models, run with
        this runner's model.

        Each prompt is split into an (encoder prompt, decoder prompt)
        pair. The encoder prompt goes through the processor; the decoder
        prompt, when not None, is tokenized and passed to `generate` as
        `decoder_input_ids`.

        Returns, for each prompt, a tuple of (output token ids, decoded
        output text, list of `{token_id: logprob}` dicts).
        '''
        token_ids_per_prompt: list[list[int]] = []
        texts_per_prompt: list[str] = []
        logprobs_per_prompt: list[list[dict[int, float]]] = []

        prompt_pairs = to_enc_dec_tuple_list(encoder_decoder_prompts)
        for idx, (encoder_prompt, decoder_prompt) in enumerate(prompt_pairs):
            processor_kwargs: dict[str, Any] = {
                "text": encoder_prompt,
                "return_tensors": "pt",
            }
            if images is not None and images[idx] is not None:
                processor_kwargs["images"] = images[idx]

            encoder_inputs = self.wrap_device(
                self.processor(**processor_kwargs))

            decoder_input_ids = None
            if decoder_prompt is not None:
                tokenized = self.tokenizer(decoder_prompt,
                                           return_tensors="pt")
                decoder_input_ids = self.wrap_device(tokenized.input_ids)

            generation = self.model.generate(
                decoder_input_ids=decoder_input_ids,
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
                output_hidden_states=True,
                return_dict_in_generate=True,
                **encoder_inputs,
                **kwargs,
            )

            top_logprobs, output_len = self._hidden_states_to_logprobs(
                generation.decoder_hidden_states, num_logprobs)
            logprobs_per_prompt.append(top_logprobs)

            # Keep only the last `output_len` tokens of the sequence.
            new_token_ids = generation.sequences[0][-output_len:]
            token_ids_per_prompt.append(new_token_ids.tolist())
            texts_per_prompt.append(self.tokenizer.decode(new_token_ids))

        return list(
            zip(token_ids_per_prompt, texts_per_prompt, logprobs_per_prompt))

709
710
711
    def encode(self, prompts: list[str], *args,
               **kwargs) -> list[list[torch.Tensor]]:
        """Forward ``prompts`` and any extra arguments to
        ``self.model.encode`` and return its result.

        NOTE(review): presumably only usable when the wrapped model
        provides ``encode`` (e.g. a sentence-transformers model) - confirm.
        """
        return self.model.encode(prompts, *args, **kwargs)
712

713
    def predict(self, prompts: list[list[str]]) -> torch.Tensor:
        """Call ``self.model.predict`` on ``prompts`` and return the
        result as a tensor (``convert_to_tensor=True``)."""
        scores = self.model.predict(prompts, convert_to_tensor=True)
        return scores

716
717
718
719
    def __enter__(self):
        """Enter the context manager; returns this runner itself."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Drop the model and run the global cleanup on context exit.

        The exception arguments are ignored and nothing is returned, so
        an exception raised inside the ``with`` block is not suppressed.
        """
        # Remove the runner's reference to the model before cleaning up.
        del self.model
        cleanup_dist_env_and_memory()
722

Woosuk Kwon's avatar
Woosuk Kwon committed
723

Cyrus Leung's avatar
Cyrus Leung committed
724
@pytest.fixture(scope="session")
def hf_runner():
    """Session-scoped fixture that provides the :class:`HfRunner` class
    itself (not an instance), so tests can construct their own runner."""
    runner_cls = HfRunner
    return runner_cls


class VllmRunner:
    """
    Test wrapper around :class:`~vllm.LLM`, usable as a context manager.

    The default values of some arguments have been modified from
    :class:`~vllm.LLM` as follows:

    - `trust_remote_code`: Set to `True` instead of `False` for convenience.
    - `seed`: Set to `0` instead of `None` for test reproducibility.
    - `max_model_len`: Set to `1024` instead of `None` to reduce memory usage.
    - `block_size`: Set to `16` instead of `None` to reduce memory usage.
    - `enable_chunked_prefill`: Set to `False` instead of `None` for
      test reproducibility.
    - `enforce_eager`: Set to `False` instead of `None` to test CUDA graph.
    """

    def __init__(
        self,
        model_name: str,
        task: TaskOption = "auto",
        tokenizer_name: Optional[str] = None,
        tokenizer_mode: str = "auto",
        trust_remote_code: bool = True,
        seed: Optional[int] = 0,
        max_model_len: int = 1024,
        dtype: str = "auto",
        disable_log_stats: bool = True,
        tensor_parallel_size: int = 1,
        block_size: int = 16,
        enable_chunked_prefill: Optional[bool] = False,
        swap_space: int = 4,
        enforce_eager: Optional[bool] = False,
        **kwargs,
    ) -> None:
        """Build the wrapped ``LLM`` and store it as ``self.model``.

        Every named argument is forwarded to ``LLM`` under the matching
        keyword (``model_name`` -> ``model``, ``tokenizer_name`` ->
        ``tokenizer``); any additional ``kwargs`` are forwarded unchanged.
        """
        self.model = LLM(
            model=model_name,
            task=task,
            tokenizer=tokenizer_name,
            tokenizer_mode=tokenizer_mode,
            trust_remote_code=trust_remote_code,
            dtype=dtype,
            seed=seed,
            swap_space=swap_space,
            enforce_eager=enforce_eager,
            disable_log_stats=disable_log_stats,
            tensor_parallel_size=tensor_parallel_size,
            max_model_len=max_model_len,
            block_size=block_size,
            enable_chunked_prefill=enable_chunked_prefill,
            **kwargs,
        )

    def get_inputs(
        self,
        prompts: list[str],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
    ) -> list[TextPrompt]:
        """Pair every prompt with its multimodal data as a ``TextPrompt``.

        Args:
            prompts: Text prompts, one per request.
            images: Optional per-prompt image inputs; entries may be None.
            videos: Optional per-prompt video inputs; entries may be None.
            audios: Optional per-prompt audio inputs; entries may be None.

        Returns:
            One ``TextPrompt`` per prompt. ``multi_modal_data`` is ``None``
            when no modality supplied data for that prompt.

        Raises:
            ValueError: If a non-None modality list does not have the same
                length as ``prompts``.
        """
        if any(x is not None and len(x) != len(prompts)
               for x in [images, videos, audios]):
            raise ValueError(
                "All non-None multimodal inputs must have the same length as "
                "prompts")

        inputs = []
        for i, prompt in enumerate(prompts):
            multi_modal_data = {}
            if images is not None and (image := images[i]) is not None:
                multi_modal_data["image"] = image
            if videos is not None and (video := videos[i]) is not None:
                multi_modal_data["video"] = video
            if audios is not None and (audio := audios[i]) is not None:
                multi_modal_data["audio"] = audio

            inputs.append(
                TextPrompt(prompt=prompt,
                           multi_modal_data=multi_modal_data
                           if multi_modal_data else None))

        return inputs

    def generate(
        self,
        prompts: list[str],
        sampling_params: SamplingParams,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> list[tuple[list[list[int]], list[str]]]:
        """Generate completions and return them prefixed with the prompt.

        Extra ``kwargs`` are forwarded to ``self.model.generate``.

        Returns:
            One ``(token_id_lists, texts)`` tuple per request. Each list has
            one entry per sample, and every entry is the prompt (token ids /
            text) concatenated with that sample's output.
        """
        inputs = self.get_inputs(prompts,
                                 images=images,
                                 videos=videos,
                                 audios=audios)

        req_outputs = self.model.generate(inputs,
                                          sampling_params=sampling_params,
                                          **kwargs)

        outputs: list[tuple[list[list[int]], list[str]]] = []
        for req_output in req_outputs:
            prompt_str = req_output.prompt
            prompt_ids = req_output.prompt_token_ids
            req_sample_output_ids: list[list[int]] = []
            req_sample_output_strs: list[str] = []
            for sample in req_output.outputs:
                output_str = sample.text
                output_ids = list(sample.token_ids)
                req_sample_output_ids.append(prompt_ids + output_ids)
                req_sample_output_strs.append(prompt_str + output_str)
            outputs.append((req_sample_output_ids, req_sample_output_strs))
        return outputs

    @staticmethod
    def _final_steps_generate_w_logprobs(
        req_outputs: list[RequestOutput],
    ) -> list[TokensTextLogprobsPromptLogprobs]:
        """Flatten request outputs into result tuples.

        Returns:
            One ``(token_ids, text, logprobs, prompt_logprobs)`` tuple per
            request.
        """
        outputs: list[TokensTextLogprobsPromptLogprobs] = []
        for req_output in req_outputs:
            assert len(req_output.outputs) > 0
            # Only one tuple is reported per request: when a request holds
            # several samples, the last one is the one that is kept.
            sample = req_output.outputs[-1]
            outputs.append((list(sample.token_ids), sample.text,
                            sample.logprobs, req_output.prompt_logprobs))
        return outputs

    @staticmethod
    def _drop_unrequested_prompt_logprobs(
        results: list[TokensTextLogprobsPromptLogprobs],
        sampling_params: SamplingParams,
    ) -> Union[list[TokensTextLogprobs],
               list[TokensTextLogprobsPromptLogprobs]]:
        """Strip the trailing prompt-logprobs entry when it was not requested.

        When ``sampling_params.prompt_logprobs`` is None, the last element of
        every result tuple is removed; otherwise ``results`` is returned
        unchanged.
        """
        if sampling_params.prompt_logprobs is None:
            return [result[0:-1] for result in results]
        return results

    def generate_w_logprobs(
        self,
        prompts: list[str],
        sampling_params: SamplingParams,
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
        videos: Optional[PromptVideoInput] = None,
        **kwargs: Any,
    ) -> Union[list[TokensTextLogprobs],
               list[TokensTextLogprobsPromptLogprobs]]:
        """Generate and return ``(token_ids, text, logprobs[, prompt_logprobs])``.

        Extra ``kwargs`` are forwarded to ``self.model.generate``. The prompt
        logprobs element is present only when
        ``sampling_params.prompt_logprobs`` is not None.
        """
        inputs = self.get_inputs(prompts,
                                 images=images,
                                 videos=videos,
                                 audios=audios)

        req_outputs = self.model.generate(inputs,
                                          sampling_params=sampling_params,
                                          **kwargs)

        toks_str_logsprobs_prompt_logprobs = (
            self._final_steps_generate_w_logprobs(req_outputs))
        # Omit prompt logprobs if not required by sampling params
        return self._drop_unrequested_prompt_logprobs(
            toks_str_logsprobs_prompt_logprobs, sampling_params)

    def generate_encoder_decoder_w_logprobs(
        self,
        encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
        sampling_params: SamplingParams,
    ) -> Union[list[TokensTextLogprobs],
               list[TokensTextLogprobsPromptLogprobs]]:
        """
        Logprobs generation for vLLM encoder/decoder models.

        ``sampling_params.logprobs`` must not be None. The prompt logprobs
        element is present in the result tuples only when
        ``sampling_params.prompt_logprobs`` is not None.
        """
        assert sampling_params.logprobs is not None
        req_outputs = self.model.generate(encoder_decoder_prompts,
                                          sampling_params=sampling_params)
        toks_str_logsprobs_prompt_logprobs = (
            self._final_steps_generate_w_logprobs(req_outputs))
        # Omit prompt logprobs if not required by sampling params
        return self._drop_unrequested_prompt_logprobs(
            toks_str_logsprobs_prompt_logprobs, sampling_params)

    def generate_greedy(
        self,
        prompts: list[str],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> list[tuple[list[int], str]]:
        """Greedy generation (``temperature=0.0``) limited to ``max_tokens``.

        Returns:
            One ``(token_ids, text)`` pair per prompt, taken from the first
            sample returned by :meth:`generate` (prompt included).
        """
        greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
        outputs = self.generate(prompts,
                                greedy_params,
                                images=images,
                                videos=videos,
                                audios=audios,
                                **kwargs)
        return [(output_ids[0], output_str[0])
                for output_ids, output_str in outputs]

    def generate_greedy_logprobs(
        self,
        prompts: list[str],
        max_tokens: int,
        num_logprobs: int,
        num_prompt_logprobs: Optional[int] = None,
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
        videos: Optional[PromptVideoInput] = None,
        stop_token_ids: Optional[list[int]] = None,
        stop: Optional[list[str]] = None,
        **kwargs: Any,
    ) -> Union[list[TokensTextLogprobs],
               list[TokensTextLogprobsPromptLogprobs]]:
        """Greedy generation with logprobs.

        Builds ``SamplingParams`` with ``temperature=0.0`` from the given
        limits and stop criteria, then delegates to
        :meth:`generate_w_logprobs`, forwarding extra ``kwargs``.
        """
        greedy_logprobs_params = SamplingParams(
            temperature=0.0,
            max_tokens=max_tokens,
            logprobs=num_logprobs,
            prompt_logprobs=num_prompt_logprobs,
            stop_token_ids=stop_token_ids,
            stop=stop)

        return self.generate_w_logprobs(prompts,
                                        greedy_logprobs_params,
                                        images=images,
                                        audios=audios,
                                        videos=videos,
                                        **kwargs)

    def generate_encoder_decoder_greedy_logprobs(
        self,
        encoder_decoder_prompts: list[ExplicitEncoderDecoderPrompt[str, str]],
        max_tokens: int,
        num_logprobs: int,
        num_prompt_logprobs: Optional[int] = None,
        skip_special_tokens: bool = True,
    ) -> Union[list[TokensTextLogprobs],
               list[TokensTextLogprobsPromptLogprobs]]:
        """
        Greedy logprobs generation for vLLM encoder/decoder models.

        Builds ``SamplingParams`` with ``temperature=0.0`` and delegates to
        :meth:`generate_encoder_decoder_w_logprobs`.
        """
        greedy_logprobs_params = SamplingParams(
            temperature=0.0,
            max_tokens=max_tokens,
            logprobs=num_logprobs,
            prompt_logprobs=(num_prompt_logprobs),
            skip_special_tokens=skip_special_tokens,
        )

        return self.generate_encoder_decoder_w_logprobs(
            encoder_decoder_prompts, greedy_logprobs_params)

    def generate_beam_search(
        self,
        prompts: list[str],
        beam_width: int,
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
    ) -> list[tuple[list[list[int]], list[str]]]:
        """Run ``self.model.beam_search`` over the prompts.

        Returns:
            One ``(token_id_lists, texts)`` tuple per prompt, with one entry
            per sequence in that prompt's beam search output.
        """
        inputs = self.get_inputs(prompts,
                                 images=images,
                                 videos=videos,
                                 audios=audios)

        outputs = self.model.beam_search(
            inputs,
            BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens))
        returned_outputs = []
        for output in outputs:
            token_ids = [x.tokens for x in output.sequences]
            texts = [x.text for x in output.sequences]
            returned_outputs.append((token_ids, texts))
        return returned_outputs

    def classify(self, prompts: list[str]) -> list[list[float]]:
        """Return ``outputs.probs`` of each ``self.model.classify`` result."""
        req_outputs = self.model.classify(prompts)
        return [req_output.outputs.probs for req_output in req_outputs]

    def encode(self,
               prompts: list[str],
               images: Optional[PromptImageInput] = None,
               videos: Optional[PromptVideoInput] = None,
               audios: Optional[PromptAudioInput] = None,
               *args,
               **kwargs) -> list[list[float]]:
        """Embed the prompts via ``self.model.embed``.

        Extra ``args`` / ``kwargs`` are forwarded to ``embed``. Returns the
        ``outputs.embedding`` of each result.
        """
        inputs = self.get_inputs(prompts,
                                 images=images,
                                 videos=videos,
                                 audios=audios)

        req_outputs = self.model.embed(inputs, *args, **kwargs)
        return [req_output.outputs.embedding for req_output in req_outputs]

    def score(
        self,
        text_1: Union[str, list[str]],
        text_2: Union[str, list[str]],
    ) -> list[float]:
        """Return ``outputs.score`` of each ``self.model.score`` result."""
        req_outputs = self.model.score(text_1, text_2)
        return [req_output.outputs.score for req_output in req_outputs]

    def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
        """Forward ``func`` to the engine's model executor ``apply_model``."""
        executor = self.model.llm_engine.model_executor
        return executor.apply_model(func)

    def __enter__(self):
        """Context-manager entry; returns this runner."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Drop the wrapped model, then call ``cleanup_dist_env_and_memory``."""
        del self.model
        cleanup_dist_env_and_memory()
1035

Woosuk Kwon's avatar
Woosuk Kwon committed
1036

1037
@pytest.fixture(scope="session")
def vllm_runner():
    """Session-scoped fixture that provides the ``VllmRunner`` class.

    The class itself (not an instance) is returned, so each test builds
    its own runner.
    """
    runner_cls = VllmRunner
    return runner_cls
1040
1041


1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
@pytest.fixture()
def temporary_enable_log_propagate():
    """Make the "vllm" logger propagate for the duration of a test.

    ``propagate`` is set to True before the test runs. On teardown the
    value that was in effect beforehand is restored, rather than being
    forced to False, so a logger that was already propagating is left
    untouched.
    """
    import logging

    # A distinct local name avoids shadowing the module-level `logger`.
    vllm_logger = logging.getLogger("vllm")
    previous_propagate = vllm_logger.propagate
    vllm_logger.propagate = True
    try:
        yield
    finally:
        vllm_logger.propagate = previous_propagate


@pytest.fixture()
def caplog_vllm(temporary_enable_log_propagate, caplog):
    """Yield pytest's ``caplog`` with vllm log propagation switched on.

    caplog depends on logs propagated to the root logger, so capturing
    vllm logs requires propagate=True temporarily; requesting the
    ``temporary_enable_log_propagate`` fixture takes care of that.
    """
    yield caplog
1056
1057
1058
1059
1060
1061
1062


@pytest.fixture(scope="session")
def num_gpus_available():
    """Return the GPU count reported by ``cuda_device_count_stateless``.

    The count is obtained without initializing the CUDA context in the
    current process.
    """
    gpu_count = cuda_device_count_stateless()
    return gpu_count
1064
1065
1066


# Directories under the system temp dir that the dummy model fixtures
# download into.
temp_dir = tempfile.gettempdir()
_dummy_opt_path, _dummy_llava_path, _dummy_gemma2_embedding_path = (
    os.path.join(temp_dir, dirname)
    for dirname in ("dummy_opt", "dummy_llava", "dummy_gemma2_embedding"))
1070
1071
1072
1073


@pytest.fixture
def dummy_opt_path():
    """Return a local copy of ``facebook/opt-125m`` with a patched config.

    If the target directory does not exist yet, the repo snapshot is
    downloaded into it (skipping files that match the ignore patterns) and
    ``architectures`` in its ``config.json`` is rewritten to
    ``["MyOPTForCausalLM"]``. An existing directory is returned untouched.
    """
    if os.path.exists(_dummy_opt_path):
        return _dummy_opt_path

    config_file = os.path.join(_dummy_opt_path, "config.json")
    snapshot_download(
        repo_id="facebook/opt-125m",
        local_dir=_dummy_opt_path,
        ignore_patterns=[
            "*.bin", "*.bin.index.json", "*.pt", "*.h5", "*.msgpack"
        ],
    )
    assert os.path.exists(config_file)

    with open(config_file) as src:
        model_config = json.load(src)
    model_config["architectures"] = ["MyOPTForCausalLM"]
    with open(config_file, "w") as dst:
        json.dump(model_config, dst)

    return _dummy_opt_path


@pytest.fixture
def dummy_llava_path():
    """Return a local copy of ``llava-hf/llava-1.5-7b-hf`` with a patched config.

    If the target directory does not exist yet, the repo snapshot is
    downloaded into it (skipping files that match the ignore patterns) and
    ``architectures`` in its ``config.json`` is rewritten to ``["MyLlava"]``.
    An existing directory is returned untouched.
    """
    if os.path.exists(_dummy_llava_path):
        return _dummy_llava_path

    config_file = os.path.join(_dummy_llava_path, "config.json")
    snapshot_download(
        repo_id="llava-hf/llava-1.5-7b-hf",
        local_dir=_dummy_llava_path,
        ignore_patterns=[
            "*.bin", "*.bin.index.json", "*.pt", "*.h5", "*.msgpack"
        ],
    )
    assert os.path.exists(config_file)

    with open(config_file) as src:
        model_config = json.load(src)
    model_config["architectures"] = ["MyLlava"]
    with open(config_file, "w") as dst:
        json.dump(model_config, dst)

    return _dummy_llava_path
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120


@pytest.fixture
def dummy_gemma2_embedding_path():
    """Return a local copy of ``BAAI/bge-multilingual-gemma2`` with a patched config.

    If the target directory does not exist yet, the repo snapshot is
    downloaded into it (skipping files that match the ignore patterns) and
    ``architectures`` in its ``config.json`` is rewritten to
    ``["MyGemma2Embedding"]``. An existing directory is returned untouched.
    """
    if os.path.exists(_dummy_gemma2_embedding_path):
        return _dummy_gemma2_embedding_path

    config_file = os.path.join(_dummy_gemma2_embedding_path, "config.json")
    snapshot_download(
        repo_id="BAAI/bge-multilingual-gemma2",
        local_dir=_dummy_gemma2_embedding_path,
        ignore_patterns=[
            "*.bin", "*.bin.index.json", "*.pt", "*.h5", "*.msgpack"
        ],
    )
    assert os.path.exists(config_file)

    with open(config_file) as src:
        model_config = json.load(src)
    model_config["architectures"] = ["MyGemma2Embedding"]
    with open(config_file, "w") as dst:
        json.dump(model_config, dst)

    return _dummy_gemma2_embedding_path
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144


# The `--optional` flag allows running tests that are marked with
# @pytest.mark.optional
def pytest_addoption(parser):
    """Register the boolean ``--optional`` command-line flag (off by default)."""
    option_settings = {
        "action": "store_true",
        "default": False,
        "help": "run optional test",
    }
    parser.addoption("--optional", **option_settings)


def pytest_collection_modifyitems(config, items):
    """Skip tests carrying the ``optional`` keyword unless ``--optional`` is set.

    When ``--optional`` is given on the command line nothing is changed;
    otherwise a skip marker is added to every item whose keywords contain
    "optional".
    """
    run_optional = config.getoption("--optional")
    if not run_optional:
        skip_marker = pytest.mark.skip(reason="need --optional option to run")
        for test_item in items:
            if "optional" in test_item.keywords:
                test_item.add_marker(skip_marker)
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157


@pytest.fixture(scope="session")
def cli_config_file():
    """Path of ``config/test_config.yaml`` under the test directory."""
    config_dir = os.path.join(_TEST_DIR, "config")
    return os.path.join(config_dir, "test_config.yaml")


@pytest.fixture(scope="session")
def cli_config_file_with_model():
    """Path of ``config/test_config_with_model.yaml`` under the test directory."""
    config_dir = os.path.join(_TEST_DIR, "config")
    return os.path.join(config_dir, "test_config_with_model.yaml")