conftest.py 37.5 KB
Newer Older
1
2
# SPDX-License-Identifier: Apache-2.0

3
import json
4
import os
5
import tempfile
6
from collections import UserList
7
from enum import Enum
8
9
from typing import (Any, Callable, Dict, List, Optional, Tuple, Type,
                    TypedDict, TypeVar, Union)
10
11
import pytest
import pytest_html
Woosuk Kwon's avatar
Woosuk Kwon committed
12

13
import numpy as np
Woosuk Kwon's avatar
Woosuk Kwon committed
14
15
import pytest
import torch
16
import torch.nn as nn
17
import torch.nn.functional as F
18
from huggingface_hub import snapshot_download
19
from PIL import Image
20
from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding,
21
                          BatchFeature)
22
from transformers.models.auto.auto_factory import _BaseAutoModelClass
Woosuk Kwon's avatar
Woosuk Kwon committed
23

24
25
from tests.models.utils import (TokensTextLogprobs,
                                TokensTextLogprobsPromptLogprobs)
Woosuk Kwon's avatar
Woosuk Kwon committed
26
from vllm import LLM, SamplingParams
27
from vllm.assets.image import ImageAsset
28
from vllm.assets.video import VideoAsset
29
from vllm.config import TaskOption, TokenizerPoolConfig
30
from vllm.connections import global_http_connection
31
from vllm.distributed import (cleanup_dist_env_and_memory,
32
33
                              init_distributed_environment,
                              initialize_model_parallel)
34
from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
youkaichao's avatar
youkaichao committed
35
36
                         TokensPrompt, to_enc_dec_tuple_list,
                         zip_enc_dec_prompts)
37
from vllm.logger import init_logger
38
from vllm.outputs import RequestOutput
39
from vllm.sampling_params import BeamSearchParams
40
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
youkaichao's avatar
youkaichao committed
41
                        identity, is_list_of)
zhuwenwen's avatar
zhuwenwen committed
42
from .utils import models_path_prefix
43

44
logger = init_logger(__name__)

# Directory containing this conftest; test data files are resolved against it.
_TEST_DIR = os.path.dirname(__file__)
# Prompt files whose lines are returned by the `example_prompts` fixture.
_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
# Prompt files whose lines are returned by the `example_long_prompts` fixture.
_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
# File whose full contents are returned by `example_system_message`.
_SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")
50

Cyrus Leung's avatar
Cyrus Leung committed
51
52
53
54
55
56
# Element type parameter for the multi-modal prompt input aliases below.
_M = TypeVar("_M")
# Either a flat list of items or a list of lists of items.
_PromptMultiModalInput = Union[List[_M], List[List[_M]]]

# Image inputs are PIL images.
PromptImageInput = _PromptMultiModalInput[Image.Image]
# Audio inputs are (ndarray, int) tuples; HfRunner.get_inputs unpacks each
# tuple as (audio, sampling rate).
PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]]
# Video inputs are NumPy arrays.
PromptVideoInput = _PromptMultiModalInput[np.ndarray]
57

58

59
def _read_prompts(filename: str) -> List[str]:
    """Read a prompt file and return its lines.

    Each element of the result is one line of the file exactly as produced
    by ``readlines()``, i.e. trailing newlines are preserved.
    """
    with open(filename) as f:
        return f.readlines()
Woosuk Kwon's avatar
Woosuk Kwon committed
63
64


65
66
67
class _ImageAssetPrompts(TypedDict):
    """Mapping from each test image's name to the prompt used with it."""
    stop_sign: str
    cherry_blossom: str
68
69


70
71
class _ImageAssetsBase(UserList[ImageAsset]):
    """List-like container base class whose items are ``ImageAsset``."""
    pass
72

73
74

class _ImageAssets(_ImageAssetsBase):
    """The fixed list of image assets used by the tests."""

    def __init__(self) -> None:
        super().__init__([
            ImageAsset("stop_sign"),
            ImageAsset("cherry_blossom"),
        ])

    def prompts(self, prompts: _ImageAssetPrompts) -> List[str]:
        """
        Convenience method to define the prompt for each test image.

        The order of the returned prompts matches the order of the
        assets when iterating through this object.
        """
        return [prompts["stop_sign"], prompts["cherry_blossom"]]
90
91


92
93
94
95
class _VideoAssetPrompts(TypedDict):
    """Mapping from each test video's name to the prompt used with it."""
    sample_demo_1: str


96
97
class _VideoAssetsBase(UserList[VideoAsset]):
    """List-like container base class whose items are ``VideoAsset``."""
    pass
98
99
100
101
102
103
104
105
106
107
108
109
110


class _VideoAssets(_VideoAssetsBase):
    """The fixed list of video assets used by the tests."""

    def __init__(self) -> None:
        assets = [VideoAsset("sample_demo_1.mp4")]
        super().__init__(assets)

    def prompts(self, prompts: _VideoAssetPrompts) -> List[str]:
        """Return the prompt for each test video, in asset order."""
        demo_prompt = prompts["sample_demo_1"]
        return [demo_prompt]


111
112
IMAGE_ASSETS = _ImageAssets()
"""Singleton instance of :class:`_ImageAssets`."""

VIDEO_ASSETS = _VideoAssets()
"""Singleton instance of :class:`_VideoAssets`."""
115
116


Joe Runde's avatar
Joe Runde committed
117
@pytest.fixture(params=[True, False])
def run_with_both_engines(request, monkeypatch):
    """Run the requesting test twice: once with V1 and once without.

    The fixture is parametrized over whether V1 is used and sets the
    ``VLLM_USE_V1`` environment variable to ``'1'`` or ``'0'`` accordingly.
    Tests carrying the ``skip_v1`` marker are skipped for the V1 run, so
    they are only executed with ``VLLM_USE_V1=0``.
    """
    use_v1 = request.param
    # Tests decorated with `@skip_v1` are only run without v1
    skip_v1 = request.node.get_closest_marker("skip_v1")

    if use_v1 and skip_v1:
        pytest.skip("Skipping test on vllm V1")

    monkeypatch.setenv('VLLM_USE_V1', '1' if use_v1 else '0')

    yield
Joe Runde's avatar
Joe Runde committed
132
133


134
135
136
137
138
139
140
@pytest.fixture(autouse=True)
def init_test_http_connection():
    """Disable client reuse on the global HTTP connection for every test."""
    # pytest_asyncio may use a different event loop per test
    # so we need to make sure the async client is created anew
    global_http_connection.reuse_client = False


141
142
143
144
145
146
147
148
149
150
151
152
@pytest.fixture
def dist_init():
    """Set up a single-rank distributed environment around a test.

    Initializes the distributed environment with ``world_size=1`` using a
    file-based init method backed by a fresh temporary file, initializes
    model parallelism with sizes ``(1, 1)``, and calls
    ``cleanup_dist_env_and_memory()`` after the test.
    """
    # mkstemp() returns an already-open OS-level file descriptor together
    # with the path. Only the path is needed here, so close the descriptor
    # right away instead of leaking it for the life of the test process.
    fd, temp_file = tempfile.mkstemp()
    os.close(fd)
    init_distributed_environment(
        world_size=1,
        rank=0,
        distributed_init_method=f"file://{temp_file}",
        local_rank=0,
        backend="nccl",
    )
    initialize_model_parallel(1, 1)
    yield
    cleanup_dist_env_and_memory()
154
155


156
@pytest.fixture()
def should_do_global_cleanup_after_test(request) -> bool:
    """Tell ``cleanup_fixture`` whether to run global cleanup after a test.

    Returns ``False`` when the requesting test carries the
    ``skip_global_cleanup`` marker and ``True`` otherwise.

    Allow subdirectories to skip global cleanup by overriding this fixture.
    This can provide a ~10x speedup for non-GPU unit tests since they don't need
    to initialize torch.
    """
    return not request.node.get_closest_marker("skip_global_cleanup")
164
165


166
@pytest.fixture(autouse=True)
def cleanup_fixture(should_do_global_cleanup_after_test: bool):
    """Run ``cleanup_dist_env_and_memory()`` after every test unless disabled.

    The decision comes from the ``should_do_global_cleanup_after_test``
    fixture; nothing is done before the test runs.
    """
    yield
    if should_do_global_cleanup_after_test:
        cleanup_dist_env_and_memory()
171
172


173
174
175
176
177
178
@pytest.fixture(autouse=True)
def dynamo_reset():
    """Call ``torch._dynamo.reset()`` after every test."""
    yield
    torch._dynamo.reset()


Woosuk Kwon's avatar
Woosuk Kwon committed
179
180
@pytest.fixture
def example_prompts() -> List[str]:
    """Return the lines of every file listed in ``_TEST_PROMPTS``."""
    prompts: List[str] = []
    for filename in _TEST_PROMPTS:
        prompts.extend(_read_prompts(filename))
    return prompts


187
188
189
190
191
192
@pytest.fixture
def example_system_message() -> str:
    """Return the full contents of the ``_SYS_MSG`` file as one string."""
    with open(_SYS_MSG) as f:
        return f.read()


193
194
195
196
197
198
199
class DecoderPromptType(Enum):
    """For encoder/decoder models only."""
    # Decoder prompt is an explicit string (the example fixture uses the
    # encoder prompts in reverse order).
    CUSTOM = 1
    # Decoder prompt is None.
    NONE = 2
    # Decoder prompt is the empty string.
    EMPTY_STR = 3


200
@pytest.fixture
def example_encoder_decoder_prompts(
) -> Dict[DecoderPromptType, List[ExplicitEncoderDecoderPrompt]]:
    '''
    Build encoder/decoder prompt pairs for every decoder prompt type.

    The encoder prompts are the lines of the files in ``_TEST_PROMPTS``.
    Each encoder prompt is paired, by index, with a decoder prompt whose
    form depends on the :class:`DecoderPromptType` key:

    * ``NONE``: every decoder prompt is ``None``
    * ``EMPTY_STR``: every decoder prompt is the empty string
    * ``CUSTOM``: the decoder prompts are the encoder prompt list reversed

    Returns:

    * A dict mapping each decoder prompt type to the result of
      ``zip_enc_dec_prompts`` for the matching pair of prompt lists.
    '''

    encoder_prompts: List[str] = []
    for filename in _TEST_PROMPTS:
        encoder_prompts.extend(_read_prompts(filename))

    custom_decoder_prompts = encoder_prompts[::-1]
    empty_str_decoder_prompts = [""] * len(encoder_prompts)
    none_decoder_prompts = [None] * len(encoder_prompts)

    return {
        DecoderPromptType.NONE:
        zip_enc_dec_prompts(encoder_prompts, none_decoder_prompts),
        DecoderPromptType.EMPTY_STR:
        zip_enc_dec_prompts(encoder_prompts, empty_str_decoder_prompts),
        DecoderPromptType.CUSTOM:
        zip_enc_dec_prompts(encoder_prompts, custom_decoder_prompts),
    }


233
234
235
236
@pytest.fixture
def example_long_prompts() -> List[str]:
    """Return the lines of every file listed in ``_LONG_PROMPTS``."""
    prompts: List[str] = []
    for filename in _LONG_PROMPTS:
        prompts.extend(_read_prompts(filename))
    return prompts
Woosuk Kwon's avatar
Woosuk Kwon committed
239
240


241
242
243
244
245
@pytest.fixture(scope="session")
def image_assets() -> _ImageAssets:
    """Session-scoped access to the module-level ``IMAGE_ASSETS`` object."""
    return IMAGE_ASSETS


246
247
248
249
250
@pytest.fixture(scope="session")
def video_assets() -> _VideoAssets:
    """Session-scoped access to the module-level ``VIDEO_ASSETS`` object."""
    return VIDEO_ASSETS


251
# Constrained type variable for values HfRunner.wrap_device accepts and
# returns.
_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict)
# Unconstrained generic type variable.
_R = TypeVar("_R")
253

Woosuk Kwon's avatar
Woosuk Kwon committed
254
255
256

class HfRunner:
    """Test helper that loads and runs a HuggingFace-style reference model.

    Depending on the constructor flags the wrapped model is a
    ``SentenceTransformer``, a ``CrossEncoder`` or a model loaded through
    ``auto_cls.from_pretrained``. The runner is usable as a context manager:
    on exit the model reference is dropped and
    ``cleanup_dist_env_and_memory()`` is called.
    """

    def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
        """Move ``x`` to ``device`` and return it.

        ``None`` and ``bool`` values are returned unchanged, dicts are
        processed value by value, and objects whose ``device.type`` already
        equals the target are returned as-is. When ``device`` is omitted it
        is ``"cpu"`` on CPU platforms and ``"cuda"`` otherwise.
        """
        from vllm.platforms import current_platform

        if x is None or isinstance(x, bool):
            return x

        if device is None:
            device = "cpu" if current_platform.is_cpu() else "cuda"

        if isinstance(x, dict):
            return {k: self.wrap_device(v, device) for k, v in x.items()}

        if hasattr(x, "device") and x.device.type == device:
            return x

        return x.to(device)

    def __init__(
        self,
        model_name: str,
        dtype: str = "half",
        *,
        model_kwargs: Optional[Dict[str, Any]] = None,
        is_sentence_transformer: bool = False,
        is_cross_encoder: bool = False,
        skip_tokenizer_init: bool = False,
        auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM,
        postprocess_inputs: Callable[..., BatchEncoding] = identity,
    ) -> None:
        """Load the model, tokenizer and processor for ``model_name``.

        Args:
            model_name: Model identifier passed to the loaders.
            dtype: Key into ``STR_DTYPE_TO_TORCH_DTYPE`` selecting the torch
                dtype used when loading.
            model_kwargs: Extra keyword arguments for
                ``auto_cls.from_pretrained`` (only used on that path).
            is_sentence_transformer: Load a ``SentenceTransformer`` instead.
            is_cross_encoder: Load a ``CrossEncoder`` instead (checked only
                if ``is_sentence_transformer`` is false).
            skip_tokenizer_init: Do not load an ``AutoTokenizer``; use the
                processor's tokenizer instead.
            auto_cls: Auto-model class used on the default path.
            postprocess_inputs: Callable applied to every processor output
                in ``get_inputs`` (called with ``dtype=self.dtype``).
        """
        torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]

        self.model_name = model_name

        if is_sentence_transformer:
            # Lazy init required for AMD CI
            from sentence_transformers import SentenceTransformer
            self.model = self.wrap_device(
                SentenceTransformer(
                    model_name,
                    device="cpu",
                    trust_remote_code=True,
                ).to(dtype=torch_dtype))
        elif is_cross_encoder:
            # Lazy init required for AMD CI
            from sentence_transformers import CrossEncoder
            self.model = CrossEncoder(model_name,
                                      device="cpu",
                                      trust_remote_code=True)
            self.model.model = self.wrap_device(self.model.model)\
                .to(dtype=torch_dtype)
        else:
            model_kwargs = model_kwargs if model_kwargs is not None else {}
            self.model = self.wrap_device(
                auto_cls.from_pretrained(
                    model_name,
                    torch_dtype=torch_dtype,
                    trust_remote_code=True,
                    **model_kwargs,
                ))

        if not skip_tokenizer_init:
            self.tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                torch_dtype=torch_dtype,
                trust_remote_code=True,
            )

        # don't put this import at the top level
        # it will call torch.cuda.device_count()
        from transformers import AutoProcessor  # noqa: F401
        self.processor = AutoProcessor.from_pretrained(
            model_name,
            torch_dtype=torch_dtype,
            trust_remote_code=True,
        )
        if skip_tokenizer_init:
            self.tokenizer = self.processor.tokenizer

        self.dtype = dtype
        self.postprocess_inputs = postprocess_inputs

    def get_inputs(
        self,
        prompts: List[str],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
    ) -> List[BatchEncoding]:
        """Run the processor on each prompt and return one input per prompt.

        Each multi-modal argument, when given, must have the same length as
        ``prompts``; its i-th entry is attached to the i-th prompt unless
        that entry is ``None``. Audio entries are ``(audio, sampling_rate)``
        tuples. Every processor output is passed through
        ``self.postprocess_inputs`` before being collected.
        """
        if images is not None:
            assert len(prompts) == len(images)

        if videos is not None:
            assert len(prompts) == len(videos)

        if audios is not None:
            assert len(prompts) == len(audios)

        all_inputs: List[BatchEncoding] = []
        for i, prompt in enumerate(prompts):
            processor_kwargs: Dict[str, Any] = {
                "text": prompt,
                "return_tensors": "pt",
            }
            if images is not None and (image := images[i]) is not None:
                processor_kwargs["images"] = image
            if videos is not None and (video := videos[i]) is not None:
                processor_kwargs["videos"] = video
            if audios is not None and (audio_tuple := audios[i]) is not None:
                audio, sr = audio_tuple
                processor_kwargs["audio"] = audio
                processor_kwargs["sampling_rate"] = sr

            inputs = self.processor(**processor_kwargs)
            inputs = self.postprocess_inputs(inputs, dtype=self.dtype)

            all_inputs.append(inputs)

        return all_inputs

    def classify(self, prompts: List[str]) -> List[List[float]]:
        """Return, per prompt, the softmax over the model's output logits.

        Each result is the first row of ``output.logits.softmax(dim=-1)``
        converted to a plain Python list.
        """
        all_inputs = self.get_inputs(prompts)
        outputs: List[List[float]] = []
        for inputs in all_inputs:
            output = self.model(**self.wrap_device(inputs))
            # Softmax turns the final logits into probabilities.
            probs = output.logits.softmax(dim=-1)[0].tolist()
            outputs.append(probs)

        return outputs

    def generate(
        self,
        prompts: List[str],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> List[Tuple[List[List[int]], List[str]]]:
        """Call ``self.model.generate`` once per prompt.

        Extra ``kwargs`` are forwarded to ``self.model.generate``.

        Returns:
            One ``(output_ids, output_strs)`` tuple per prompt, where
            ``output_ids`` is the generated id tensor converted to nested
            lists and ``output_strs`` is its ``batch_decode`` result with
            special tokens skipped.
        """
        all_inputs = self.get_inputs(prompts,
                                     images=images,
                                     videos=videos,
                                     audios=audios)

        outputs: List[Tuple[List[List[int]], List[str]]] = []
        for inputs in all_inputs:
            output_ids = self.model.generate(
                **self.wrap_device(inputs, device=self.model.device.type),
                use_cache=True,
                **kwargs,
            )
            output_str = self.processor.batch_decode(
                output_ids,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )
            output_ids = output_ids.cpu().tolist()
            outputs.append((output_ids, output_str))
        return outputs

    def generate_greedy(
        self,
        prompts: List[str],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> List[Tuple[List[int], str]]:
        """Generate without sampling and keep the first sequence per prompt.

        Returns:
            One ``(output_ids, output_str)`` tuple per prompt.
        """
        outputs = self.generate(prompts,
                                do_sample=False,
                                max_new_tokens=max_tokens,
                                images=images,
                                videos=videos,
                                audios=audios,
                                **kwargs)

        return [(output_ids[0], output_str[0])
                for output_ids, output_str in outputs]

    def generate_beam_search(
        self,
        prompts: List[str],
        beam_width: int,
        max_tokens: int,
    ) -> List[Tuple[List[List[int]], List[str]]]:
        """Generate ``beam_width`` sequences per prompt using beam search.

        Token ids equal to the tokenizer's ``pad_token_id`` are removed
        from every returned id sequence; the decoded strings are returned
        as produced by ``generate``.
        """
        outputs = self.generate(prompts,
                                do_sample=False,
                                max_new_tokens=max_tokens,
                                num_beams=beam_width,
                                num_return_sequences=beam_width)

        pad_token_id = self.tokenizer.pad_token_id
        unpadded: List[Tuple[List[List[int]], List[str]]] = []
        for output_ids, output_str in outputs:
            stripped_ids = [[x for x in ids if x != pad_token_id]
                            for ids in output_ids]
            unpadded.append((stripped_ids, output_str))
        return unpadded

    def generate_greedy_logprobs(
        self,
        prompts: List[str],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> List[List[torch.Tensor]]:
        """Generate without sampling and return log-prob tensors per prompt.

        Returns:
            For each prompt, the list produced by
            ``_hidden_states_to_seq_logprobs`` from the generation's hidden
            states (one tensor per entry of ``output.hidden_states``).
        """
        all_inputs = self.get_inputs(prompts,
                                     images=images,
                                     videos=videos,
                                     audios=audios)

        all_logprobs: List[List[torch.Tensor]] = []
        for inputs in all_inputs:
            output = self.model.generate(
                **self.wrap_device(inputs, device=self.model.device.type),
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
                output_hidden_states=True,
                return_dict_in_generate=True,
                **kwargs,
            )
            seq_logprobs = self._hidden_states_to_seq_logprobs(
                output.hidden_states)
            all_logprobs.append(seq_logprobs)
        return all_logprobs

    def _hidden_states_to_seq_logprobs(
        self,
        hidden_states: Tuple[Tuple[torch.Tensor, ...], ...],
    ) -> List[torch.Tensor]:
        """Project hidden states through the output embeddings to log-probs.

        For every entry of ``hidden_states`` the last element's first item
        (``hidden_state[-1][0]``) is multiplied by the transposed output
        embedding weight, the embedding bias is added when present, and a
        float32 ``log_softmax`` over the last dimension is taken.
        """
        output_embeddings = self.model.get_output_embeddings()

        seq_logprobs: List[torch.Tensor] = []
        for hidden_state in hidden_states:
            last_hidden_states = hidden_state[-1][0]
            logits = torch.matmul(
                last_hidden_states.to(output_embeddings.weight.device),
                output_embeddings.weight.t(),
            )
            if getattr(output_embeddings, "bias", None) is not None:
                logits += output_embeddings.bias.unsqueeze(0)
            logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
            seq_logprobs.append(logprobs)

        return seq_logprobs

    def _hidden_states_to_logprobs(
        self,
        hidden_states: Tuple[Tuple[torch.Tensor, ...], ...],
        num_logprobs: int,
    ) -> Tuple[List[Dict[int, float]], int]:
        """Reduce hidden states to top-``num_logprobs`` dicts per step.

        Returns:
            ``(seq_logprobs_lst, output_len)`` where ``seq_logprobs_lst``
            holds one ``{token_id: logprob}`` dict per entry of
            ``hidden_states`` and ``output_len`` is ``len(hidden_states)``.
        """
        seq_logprobs = self._hidden_states_to_seq_logprobs(hidden_states)
        output_len = len(hidden_states)

        # convert to dict
        seq_logprobs_lst: List[Dict[int, float]] = []
        for tok_idx, tok_logprobs in enumerate(seq_logprobs):
            # drop prompt logprobs
            if tok_idx == 0:
                tok_logprobs = tok_logprobs[-1, :].reshape(1, -1)
            topk = tok_logprobs.topk(num_logprobs)

            tok_logprobs_dct = dict(
                zip(topk.indices[0].tolist(), topk.values[0].tolist()))

            seq_logprobs_lst.append(tok_logprobs_dct)

        return (
            seq_logprobs_lst,
            output_len,
        )

    def generate_greedy_logprobs_limit(
        self,
        prompts: List[str],
        max_tokens: int,
        num_logprobs: int,
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
        videos: Optional[PromptVideoInput] = None,
        **kwargs: Any,
    ) -> List[TokensTextLogprobs]:
        """Generate without sampling, keeping the top logprobs per step.

        Returns:
            One ``(output_ids, output_str, output_logprobs)`` tuple per
            prompt. ``output_ids`` are the trailing ids of the first
            generated sequence (one per logprob entry), ``output_str`` is
            their tokenizer decoding and ``output_logprobs`` is a list of
            ``{token_id: logprob}`` dicts with ``num_logprobs`` entries
            each.
        """
        all_inputs = self.get_inputs(prompts,
                                     images=images,
                                     videos=videos,
                                     audios=audios)

        all_logprobs: List[List[Dict[int, float]]] = []
        all_output_ids: List[List[int]] = []
        all_output_strs: List[str] = []

        for inputs in all_inputs:
            output = self.model.generate(
                **self.wrap_device(inputs, device=self.model.device.type),
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
                output_hidden_states=True,
                return_dict_in_generate=True,
                **kwargs,
            )

            # output_len equals len(seq_logprobs_lst): the helper emits one
            # dict per hidden-state entry and reports that same count.
            (
                seq_logprobs_lst,
                output_len,
            ) = self._hidden_states_to_logprobs(output.hidden_states,
                                                num_logprobs)

            all_logprobs.append(seq_logprobs_lst)
            seq_ids = output.sequences[0]
            output_ids = seq_ids[-output_len:]
            all_output_ids.append(output_ids.tolist())
            all_output_strs.append(self.tokenizer.decode(output_ids))

        return list(zip(all_output_ids, all_output_strs, all_logprobs))

    def generate_encoder_decoder_greedy_logprobs_limit(
        self,
        encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
        max_tokens: int,
        num_logprobs: int,
        images: Optional[PromptImageInput] = None,
        **kwargs: Any,
    ) -> List[TokensTextLogprobs]:
        '''
        Greedy logprobs generation for encoder/decoder models, run through
        this runner's HuggingFace-style model.

        Each prompt is split into an (encoder prompt, decoder prompt) pair.
        The encoder prompt (plus the matching image, if any) goes through
        the processor; the decoder prompt, unless it is ``None``, goes
        through the tokenizer and is passed as ``decoder_input_ids``.

        Returns one ``(output_ids, output_str, output_logprobs)`` tuple per
        prompt, built from ``output.decoder_hidden_states``.
        '''

        all_logprobs: List[List[Dict[int, float]]] = []
        all_output_ids: List[List[int]] = []
        all_output_strs: List[str] = []

        for i, (encoder_prompt, decoder_prompt) in enumerate(
                to_enc_dec_tuple_list(encoder_decoder_prompts)):
            processor_kwargs: Dict[str, Any] = {
                "text": encoder_prompt,
                "return_tensors": "pt",
            }
            if images is not None and images[i] is not None:
                processor_kwargs["images"] = images[i]

            encoder_input_ids = self.wrap_device(
                self.processor(**processor_kwargs).input_ids,
                device=self.model.device.type,
            )

            if decoder_prompt is None:
                decoder_input_ids = None
            else:
                decoder_input_ids = self.wrap_device(
                    self.tokenizer(decoder_prompt,
                                   return_tensors="pt").input_ids,
                    device=self.model.device.type,
                )

            output = self.model.generate(
                encoder_input_ids,
                decoder_input_ids=decoder_input_ids,
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
                output_hidden_states=True,
                return_dict_in_generate=True,
                **kwargs,
            )

            (
                seq_logprobs_lst,
                output_len,
            ) = self._hidden_states_to_logprobs(output.decoder_hidden_states,
                                                num_logprobs)

            all_logprobs.append(seq_logprobs_lst)
            seq_ids = output.sequences[0]
            output_ids = seq_ids[-output_len:]
            all_output_ids.append(output_ids.tolist())
            all_output_strs.append(self.tokenizer.decode(output_ids))

        return list(zip(all_output_ids, all_output_strs, all_logprobs))

    def encode(self, prompts: List[str]) -> List[List[torch.Tensor]]:
        """Delegate to ``self.model.encode(prompts)``."""
        return self.model.encode(prompts)

    def predict(self, prompts: List[List[str]]) -> torch.Tensor:
        """Delegate to ``self.model.predict`` with ``convert_to_tensor=True``."""
        return self.model.predict(prompts, convert_to_tensor=True)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Drop the model reference before the global cleanup call.
        del self.model
        cleanup_dist_env_and_memory()

Woosuk Kwon's avatar
Woosuk Kwon committed
659

Cyrus Leung's avatar
Cyrus Leung committed
660
@pytest.fixture(scope="session")
def hf_runner():
    """Provide the ``HfRunner`` class itself (not an instance) to tests."""
    return HfRunner


class VllmRunner:
    """Test helper that wraps a ``vllm.LLM`` instance.

    The constructor builds the ``LLM`` engine with test-oriented defaults.
    The ``generate*``, ``classify``, ``encode`` and ``score`` helpers call the
    engine and reshape its outputs into plain Python containers.

    Instances are context managers: leaving the ``with`` block deletes the
    engine and calls ``cleanup_dist_env_and_memory()``.
    """

    def __init__(
        self,
        model_name: str,
        task: TaskOption = "auto",
        tokenizer_name: Optional[str] = None,
        tokenizer_mode: str = "auto",
        # Use smaller max model length, otherwise bigger model cannot run due
        # to kv cache size limit.
        max_model_len: int = 1024,
        dtype: str = "half",
        disable_log_stats: bool = True,
        tensor_parallel_size: int = 1,
        block_size: int = 16,
        enable_chunked_prefill: bool = False,
        swap_space: int = 4,
        enforce_eager: Optional[bool] = False,
        **kwargs,
    ) -> None:
        """Create the underlying ``LLM`` engine.

        Every named argument is forwarded to ``LLM`` under the matching
        keyword (``model_name`` as ``model``, ``tokenizer_name`` as
        ``tokenizer``). ``trust_remote_code`` is always ``True`` and any extra
        ``kwargs`` are passed through unchanged.
        """
        self.model = LLM(
            model=model_name,
            task=task,
            tokenizer=tokenizer_name,
            tokenizer_mode=tokenizer_mode,
            trust_remote_code=True,
            dtype=dtype,
            swap_space=swap_space,
            enforce_eager=enforce_eager,
            disable_log_stats=disable_log_stats,
            tensor_parallel_size=tensor_parallel_size,
            max_model_len=max_model_len,
            block_size=block_size,
            enable_chunked_prefill=enable_chunked_prefill,
            **kwargs,
        )

    def get_inputs(
        self,
        prompts: List[str],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
    ) -> List[TextPrompt]:
        """Build one ``TextPrompt`` per prompt with its multi-modal data.

        Each modality argument, when given, must contain exactly one entry
        per prompt; a ``None`` entry means the prompt has no data of that
        modality. All modalities supplied for the same prompt are merged into
        a single ``multi_modal_data`` dict (keys ``"image"``, ``"video"``,
        ``"audio"``), so passing several modalities at once no longer makes
        the later one replace the earlier one.

        Returns:
            The prompts in input order. ``multi_modal_data`` is only set on
            prompts that have at least one non-``None`` modality entry.
        """
        modalities = (("image", images), ("video", videos), ("audio", audios))

        # Validate every supplied modality before building anything.
        for _, items in modalities:
            if items is not None:
                assert len(prompts) == len(items)

        # Collect the data of all modalities per prompt, then attach it once.
        per_prompt_mm: List[Dict[str, Any]] = [{} for _ in prompts]
        for key, items in modalities:
            if items is None:
                continue
            for mm_data, item in zip(per_prompt_mm, items):
                if item is not None:
                    mm_data[key] = item

        inputs: List[TextPrompt] = []
        for prompt, mm_data in zip(prompts, per_prompt_mm):
            text_prompt = TextPrompt(prompt=prompt)
            if mm_data:
                text_prompt["multi_modal_data"] = mm_data
            inputs.append(text_prompt)

        return inputs

    def generate(
        self,
        prompts: List[str],
        sampling_params: SamplingParams,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> List[Tuple[List[List[int]], List[str]]]:
        """Generate completions and return them prefixed with their prompt.

        Extra ``kwargs`` are forwarded to ``self.model.generate``.

        Returns:
            One ``(token_id_lists, texts)`` tuple per request, holding one
            entry per sample. Each entry is the prompt token ids / prompt
            text concatenated with the sample's token ids / text.
        """
        inputs = self.get_inputs(prompts,
                                 images=images,
                                 videos=videos,
                                 audios=audios)

        req_outputs = self.model.generate(inputs,
                                          sampling_params=sampling_params,
                                          **kwargs)

        outputs: List[Tuple[List[List[int]], List[str]]] = []
        for req_output in req_outputs:
            prompt_str = req_output.prompt
            prompt_ids = req_output.prompt_token_ids
            req_sample_output_ids: List[List[int]] = []
            req_sample_output_strs: List[str] = []
            for sample in req_output.outputs:
                req_sample_output_ids.append(prompt_ids +
                                             list(sample.token_ids))
                req_sample_output_strs.append(prompt_str + sample.text)
            outputs.append((req_sample_output_ids, req_sample_output_strs))
        return outputs

    @staticmethod
    def _final_steps_generate_w_logprobs(
        req_outputs: List[RequestOutput],
    ) -> List[TokensTextLogprobsPromptLogprobs]:
        """Convert request outputs into logprob tuples.

        Only the last sample of each request is reported; every request must
        have at least one sample.

        Returns:
            One ``(token_ids, text, logprobs, prompt_logprobs)`` tuple per
            request.
        """
        outputs: List[TokensTextLogprobsPromptLogprobs] = []
        for req_output in req_outputs:
            assert len(req_output.outputs) > 0
            # The last sample is the one reported for the request.
            sample = req_output.outputs[-1]
            outputs.append((list(sample.token_ids), sample.text,
                            sample.logprobs, req_output.prompt_logprobs))
        return outputs

    @staticmethod
    def _omit_unrequested_prompt_logprobs(
        toks_str_logsprobs_prompt_logprobs: List[
            TokensTextLogprobsPromptLogprobs],
        sampling_params: SamplingParams,
    ) -> Union[List[TokensTextLogprobs],
               List[TokensTextLogprobsPromptLogprobs]]:
        """Strip the trailing prompt-logprobs element when it was not asked for.

        The tuples are returned unchanged when
        ``sampling_params.prompt_logprobs`` is set; otherwise each tuple loses
        its last element.
        """
        if sampling_params.prompt_logprobs is None:
            return [x[0:-1] for x in toks_str_logsprobs_prompt_logprobs]
        return toks_str_logsprobs_prompt_logprobs

    def generate_w_logprobs(
        self,
        prompts: List[str],
        sampling_params: SamplingParams,
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
        videos: Optional[PromptVideoInput] = None,
        **kwargs: Any,
    ) -> Union[List[TokensTextLogprobs],
               List[TokensTextLogprobsPromptLogprobs]]:
        """Generate and return token ids, text and logprobs per request.

        Extra ``kwargs`` are forwarded to ``self.model.generate``. Prompt
        logprobs are included in each tuple only when
        ``sampling_params.prompt_logprobs`` is not ``None``.
        """
        inputs = self.get_inputs(prompts,
                                 images=images,
                                 videos=videos,
                                 audios=audios)

        req_outputs = self.model.generate(inputs,
                                          sampling_params=sampling_params,
                                          **kwargs)

        return self._omit_unrequested_prompt_logprobs(
            self._final_steps_generate_w_logprobs(req_outputs),
            sampling_params)

    def generate_encoder_decoder_w_logprobs(
        self,
        encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
        sampling_params: SamplingParams,
    ) -> Union[List[TokensTextLogprobs],
               List[TokensTextLogprobsPromptLogprobs]]:
        """Logprobs generation for vLLM encoder/decoder models.

        ``sampling_params.logprobs`` must be set. Prompt logprobs are
        included in each tuple only when ``sampling_params.prompt_logprobs``
        is not ``None``.
        """
        assert sampling_params.logprobs is not None
        req_outputs = self.model.generate(encoder_decoder_prompts,
                                          sampling_params=sampling_params)
        return self._omit_unrequested_prompt_logprobs(
            self._final_steps_generate_w_logprobs(req_outputs),
            sampling_params)

    def generate_greedy(
        self,
        prompts: List[str],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> List[Tuple[List[int], str]]:
        """Generate with ``temperature=0.0`` and keep the first sample.

        Returns:
            One ``(token_ids, text)`` pair per prompt, both including the
            prompt (see :meth:`generate`).
        """
        greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
        outputs = self.generate(prompts,
                                greedy_params,
                                images=images,
                                videos=videos,
                                audios=audios,
                                **kwargs)
        return [(output_ids[0], output_str[0])
                for output_ids, output_str in outputs]

    def generate_greedy_logprobs(
        self,
        prompts: List[str],
        max_tokens: int,
        num_logprobs: int,
        num_prompt_logprobs: Optional[int] = None,
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
        videos: Optional[PromptVideoInput] = None,
        stop_token_ids: Optional[List[int]] = None,
        stop: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> Union[List[TokensTextLogprobs],
               List[TokensTextLogprobsPromptLogprobs]]:
        """Greedy (``temperature=0.0``) generation that also returns logprobs.

        Builds the ``SamplingParams`` from the arguments and delegates to
        :meth:`generate_w_logprobs`.
        """
        greedy_logprobs_params = SamplingParams(
            temperature=0.0,
            max_tokens=max_tokens,
            logprobs=num_logprobs,
            prompt_logprobs=num_prompt_logprobs,
            stop_token_ids=stop_token_ids,
            stop=stop)

        return self.generate_w_logprobs(prompts,
                                        greedy_logprobs_params,
                                        images=images,
                                        audios=audios,
                                        videos=videos,
                                        **kwargs)

    def generate_encoder_decoder_greedy_logprobs(
        self,
        encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
        max_tokens: int,
        num_logprobs: int,
        num_prompt_logprobs: Optional[int] = None,
    ) -> Union[List[TokensTextLogprobs],
               List[TokensTextLogprobsPromptLogprobs]]:
        """Greedy logprobs generation for vLLM encoder/decoder models.

        Builds greedy ``SamplingParams`` and delegates to
        :meth:`generate_encoder_decoder_w_logprobs`.
        """
        greedy_logprobs_params = SamplingParams(
            temperature=0.0,
            max_tokens=max_tokens,
            logprobs=num_logprobs,
            prompt_logprobs=(num_prompt_logprobs),
        )
        return self.generate_encoder_decoder_w_logprobs(
            encoder_decoder_prompts, greedy_logprobs_params)

    def generate_beam_search(
        self,
        prompts: Union[List[str], List[List[int]]],
        beam_width: int,
        max_tokens: int,
    ) -> List[Tuple[List[List[int]], List[str]]]:
        """Run beam search over text prompts or pre-tokenized prompts.

        If every element of ``prompts`` is a ``str`` they are wrapped as
        ``TextPrompt``; otherwise each element is treated as a list of token
        ids and wrapped as ``TokensPrompt``.

        Returns:
            One ``(token_id_lists, texts)`` tuple per prompt with one entry
            per returned beam sequence.
        """
        if is_list_of(prompts, str, check="all"):
            beam_inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
        else:
            beam_inputs = [
                TokensPrompt(prompt_token_ids=tokens) for tokens in prompts
            ]
        outputs = self.model.beam_search(
            beam_inputs,
            BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens))
        return [([x.tokens for x in output.sequences],
                 [x.text for x in output.sequences]) for output in outputs]

    def classify(self, prompts: List[str]) -> List[List[float]]:
        """Return ``outputs.probs`` of ``self.model.classify`` per prompt."""
        req_outputs = self.model.classify(prompts)
        return [req_output.outputs.probs for req_output in req_outputs]

    def encode(
        self,
        prompts: List[str],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
    ) -> List[List[float]]:
        """Return ``outputs.embedding`` of ``self.model.embed`` per prompt.

        Prompts and optional multi-modal data are first combined by
        :meth:`get_inputs`.
        """
        inputs = self.get_inputs(prompts,
                                 images=images,
                                 videos=videos,
                                 audios=audios)

        req_outputs = self.model.embed(inputs)
        return [req_output.outputs.embedding for req_output in req_outputs]

    def score(
        self,
        text_1: Union[str, List[str]],
        text_2: Union[str, List[str]],
    ) -> List[float]:
        """Return ``outputs.score`` of ``self.model.score(text_1, text_2)``."""
        req_outputs = self.model.score(text_1, text_2)
        return [req_output.outputs.score for req_output in req_outputs]

    def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
        """Forward ``func`` to the engine's model executor ``apply_model``."""
        executor = self.model.llm_engine.model_executor
        return executor.apply_model(func)

    def __enter__(self):
        """Enter the ``with`` block; the runner itself is the bound value."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Delete the engine and run distributed/memory cleanup."""
        del self.model
        cleanup_dist_env_and_memory()
957

Woosuk Kwon's avatar
Woosuk Kwon committed
958

959
@pytest.fixture(scope="session")
def vllm_runner():
    """Session-scoped fixture exposing the ``VllmRunner`` class itself.

    The class (not an instance) is returned, so each test decides how to
    construct its own runner.
    """
    runner_cls = VllmRunner
    return runner_cls
962
963
964
965
966
967
968
969
970


def get_tokenizer_pool_config(tokenizer_group_type):
    """Translate a tokenizer group type into a ``TokenizerPoolConfig``.

    Args:
        tokenizer_group_type: ``None``, the string ``"ray"``, or a class.

    Returns:
        ``None`` when ``tokenizer_group_type`` is ``None``; otherwise a
        ``TokenizerPoolConfig`` with ``pool_size=1``, an empty
        ``extra_config`` and ``pool_type`` set to ``"ray"`` or to the given
        class.

    Raises:
        ValueError: If the value is none of the accepted forms.
    """
    if tokenizer_group_type is None:
        return None

    if tokenizer_group_type == "ray":
        pool_type = "ray"
    elif isinstance(tokenizer_group_type, type):
        pool_type = tokenizer_group_type
    else:
        raise ValueError(
            f"Unknown tokenizer_group_type: {tokenizer_group_type}")

    return TokenizerPoolConfig(pool_size=1,
                               pool_type=pool_type,
                               extra_config={})
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991


@pytest.fixture()
def temporary_enable_log_propagate():
    """Turn on propagation for the ``"vllm"`` logger while a test runs.

    The logger's ``propagate`` flag is set to ``True`` for the duration of
    the test and then put back to the value it had before the fixture ran,
    so a logger that was already propagating is not left switched off.
    """
    import logging
    vllm_logger = logging.getLogger("vllm")
    # Remember the current setting so teardown restores it instead of
    # forcing a hard-coded value.
    previous_propagate = vllm_logger.propagate
    vllm_logger.propagate = True
    yield
    vllm_logger.propagate = previous_propagate


@pytest.fixture()
def caplog_vllm(temporary_enable_log_propagate, caplog):
    """Yield pytest's ``caplog`` with vLLM log propagation enabled.

    ``caplog`` depends on records being propagated to the root logger, so
    this fixture requests ``temporary_enable_log_propagate`` to switch
    propagation on for the duration of the test.
    """
    yield caplog
992
993
994
995
996
997
998


@pytest.fixture(scope="session")
def num_gpus_available():
    """Return the GPU count from ``cuda_device_count_stateless()``.

    That helper is used so the number of GPUs is obtained without
    initializing the CUDA context in the current process.
    """
    gpu_count = cuda_device_count_stateless()
    return gpu_count
1000
1001


1002
# Directories that hold the locally patched "dummy" model snapshots. They are
# rooted at ``models_path_prefix`` (not at ``tempfile.gettempdir()``).
_dummy_opt_path = os.path.join(models_path_prefix, "dummy_opt")
_dummy_llava_path = os.path.join(models_path_prefix, "dummy_llava")
_dummy_gemma2_embedding_path = os.path.join(models_path_prefix,
                                            "dummy_gemma2_embedding")

1007
1008
1009
1010


@pytest.fixture
def dummy_opt_path():
    """Return the directory of the patched ``facebook/opt-125m`` snapshot.

    When the directory does not exist yet, the repository is downloaded
    (skipping weight files matched by the ignore patterns) and the
    ``architectures`` entry of its ``config.json`` is replaced with
    ``["MyOPTForCausalLM"]``. An existing directory is returned untouched.
    """
    if os.path.exists(_dummy_opt_path):
        return _dummy_opt_path

    config_file = os.path.join(_dummy_opt_path, "config.json")
    skipped_files = [
        "*.bin", "*.bin.index.json", "*.pt", "*.h5", "*.msgpack"
    ]
    snapshot_download(repo_id="facebook/opt-125m",
                      local_dir=_dummy_opt_path,
                      ignore_patterns=skipped_files)
    assert os.path.exists(config_file)

    # Rewrite the architecture name in the downloaded config.
    with open(config_file) as src:
        model_config = json.load(src)
    model_config["architectures"] = ["MyOPTForCausalLM"]
    with open(config_file, "w") as dst:
        json.dump(model_config, dst)

    return _dummy_opt_path

1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044

# Pytest hook that post-processes the report produced for each test phase.
@pytest.hookimpl(tryfirst=True, hookwrapper=True)
def pytest_runtest_makereport(item, call):
    """Save a browser screenshot when a test's call phase fails.

    The screenshot is taken only if the test item has a ``browser`` entry in
    its ``funcargs``. It is written to ``screenshot.png`` and, when the
    report object already has an ``extra`` attribute, appended to it as a
    ``pytest_html`` image extra.
    """
    # Obtain the report built by the other hook implementations.
    outcome = yield
    report = outcome.get_result()

    # Nothing to do unless the test body itself failed.
    if report.when != "call" or not report.failed:
        return

    # A screenshot needs a browser instance among the test's fixtures.
    if not hasattr(item, "funcargs") or "browser" not in item.funcargs:
        return

    browser = item.funcargs["browser"]
    screenshot_path = "screenshot.png"  # where the screenshot is written
    browser.save_screenshot(screenshot_path)

    # Attach the screenshot only when the report exposes an ``extra`` list.
    if hasattr(report, "extra"):
        report.extra.append(pytest_html.extras.image(screenshot_path))
zhuwenwen's avatar
zhuwenwen committed
1045
1046


1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
@pytest.fixture
def dummy_llava_path():
    """Return the directory of the patched ``llava-hf/llava-1.5-7b-hf`` snapshot.

    When the directory does not exist yet, the repository is downloaded
    (skipping weight files matched by the ignore patterns) and the
    ``architectures`` entry of its ``config.json`` is replaced with
    ``["MyLlava"]``. An existing directory is returned untouched.
    """
    if os.path.exists(_dummy_llava_path):
        return _dummy_llava_path

    config_file = os.path.join(_dummy_llava_path, "config.json")
    skipped_files = [
        "*.bin", "*.bin.index.json", "*.pt", "*.h5", "*.msgpack"
    ]
    snapshot_download(repo_id="llava-hf/llava-1.5-7b-hf",
                      local_dir=_dummy_llava_path,
                      ignore_patterns=skipped_files)
    assert os.path.exists(config_file)

    # Rewrite the architecture name in the downloaded config.
    with open(config_file) as src:
        model_config = json.load(src)
    model_config["architectures"] = ["MyLlava"]
    with open(config_file, "w") as dst:
        json.dump(model_config, dst)

    return _dummy_llava_path
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076


@pytest.fixture
def dummy_gemma2_embedding_path():
    """Return the directory of the patched ``BAAI/bge-multilingual-gemma2`` snapshot.

    When the directory does not exist yet, the repository is downloaded
    (skipping weight files matched by the ignore patterns) and the
    ``architectures`` entry of its ``config.json`` is replaced with
    ``["MyGemma2Embedding"]``. An existing directory is returned untouched.
    """
    if os.path.exists(_dummy_gemma2_embedding_path):
        return _dummy_gemma2_embedding_path

    config_file = os.path.join(_dummy_gemma2_embedding_path, "config.json")
    skipped_files = [
        "*.bin", "*.bin.index.json", "*.pt", "*.h5", "*.msgpack"
    ]
    snapshot_download(repo_id="BAAI/bge-multilingual-gemma2",
                      local_dir=_dummy_gemma2_embedding_path,
                      ignore_patterns=skipped_files)
    assert os.path.exists(config_file)

    # Rewrite the architecture name in the downloaded config.
    with open(config_file) as src:
        model_config = json.load(src)
    model_config["architectures"] = ["MyGemma2Embedding"]
    with open(config_file, "w") as dst:
        json.dump(model_config, dst)

    return _dummy_gemma2_embedding_path
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101


# Register the `--optional` command-line flag; tests marked with
# @pytest.mark.optional only run when it is given.
def pytest_addoption(parser):
    """Add the boolean ``--optional`` flag (off by default) to ``parser``."""
    flag_settings = {
        "action": "store_true",
        "default": False,
        "help": "run optional test",
    }
    parser.addoption("--optional", **flag_settings)


def pytest_collection_modifyitems(config, items):
    """Skip tests carrying the ``optional`` keyword unless ``--optional`` is set.

    Args:
        config: Object whose ``getoption("--optional")`` tells whether the
            flag was given on the command line.
        items: Collected test items; matching ones receive a skip marker.
    """
    run_optional = config.getoption("--optional")
    if not run_optional:
        # Flag absent: mark every optional test as skipped.
        skip_marker = pytest.mark.skip(reason="need --optional option to run")
        for test_item in items:
            if "optional" in test_item.keywords:
                test_item.add_marker(skip_marker)