"""Compare the outputs of HF and distributed vLLM when using greedy sampling.

Run:
```sh
pytest -s -v test_multimodal_broadcast.py
```
"""

import pytest

from vllm.utils import cuda_device_count_stateless

from ..utils import fork_new_process_for_each_test


@pytest.mark.skipif(cuda_device_count_stateless() < 2,
                    reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("model, distributed_executor_backend", [
    ("llava-hf/llava-1.5-7b-hf", "ray"),
    ("llava-hf/llava-v1.6-mistral-7b-hf", "ray"),
    ("llava-hf/llava-1.5-7b-hf", "mp"),
    ("llava-hf/llava-v1.6-mistral-7b-hf", "mp"),
])
@fork_new_process_for_each_test
def test_models(hf_runner, vllm_runner, image_assets, model: str,
                distributed_executor_backend: str) -> None:
    """Run the model-specific ``run_test`` helper with tensor parallelism.

    The parametrized ``model`` name selects which model test module
    (``test_llava`` or ``test_llava_next``) supplies ``models`` and
    ``run_test``. The test is skipped when fewer than 2 CUDA devices are
    reported by ``cuda_device_count_stateless()``.

    Args:
        hf_runner: Forwarded unchanged to ``run_test`` (presumably a pytest
            fixture; it is not part of the parametrization).
        vllm_runner: Forwarded unchanged to ``run_test`` (presumably a
            pytest fixture).
        image_assets: Forwarded unchanged to ``run_test`` (presumably a
            pytest fixture).
        model: Model identifier; only its prefix is inspected here, to pick
            the test module to import from.
        distributed_executor_backend: Backend name forwarded to
            ``run_test``; parametrized over ``"ray"`` and ``"mp"``.

    Raises:
        NotImplementedError: If ``model`` starts with neither
            ``"llava-hf/llava-1.5"`` nor ``"llava-hf/llava-v1.6"``.
    """
    dtype = "half"
    max_tokens = 5
    num_logprobs = 5
    tensor_parallel_size = 2

    # Imported lazily so that only the helpers for the selected model family
    # are loaded for a given parametrization.
    if model.startswith("llava-hf/llava-1.5"):
        from ..models.test_llava import models, run_test
    elif model.startswith("llava-hf/llava-v1.6"):
        from ..models.test_llava_next import models, run_test
    else:
        raise NotImplementedError(f"Unsupported model: {model}")

    run_test(
        hf_runner,
        vllm_runner,
        image_assets,
        # NOTE(review): the checkpoint passed on is the imported module's
        # first entry, not the parametrized `model` string -- confirm that
        # the two are meant to agree.
        model=models[0],
        # So that LLaVA-NeXT processor may return nested list
        size_factors=[0.25, 0.5, 1.0],
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
    )