# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

"""
This example shows how to use vLLM for running offline inference with
the explicit/implicit prompt format on encoder-decoder multimodal models
for text generation.
"""

8
import os
9
import time
10
11
12
from collections.abc import Sequence
from dataclasses import asdict
from typing import NamedTuple
13

14
from vllm import LLM, EngineArgs, PromptType, SamplingParams
15
from vllm.assets.audio import AudioAsset
16
from vllm.utils.argparse_utils import FlexibleArgumentParser
17
18


19
20
21
22
23
class ModelRequestData(NamedTuple):
    """Engine configuration paired with the prompts to run against it."""

    # Arguments used to construct the engine for this model.
    engine_args: EngineArgs
    # Prompts submitted to the engine for generation.
    prompts: Sequence[PromptType]


24
def run_whisper() -> ModelRequestData:
    """Build the engine arguments and example prompts for Whisper.

    Side effect: sets ``VLLM_WORKER_MULTIPROC_METHOD`` to ``"spawn"`` in the
    current process environment before the engine arguments are created.

    Returns:
        A ``ModelRequestData`` holding the Whisper engine arguments and two
        prompts: one in the implicit format and one in the explicit
        encoder/decoder format.
    """
    os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

    engine_args = EngineArgs(
        model="openai/whisper-large-v3-turbo",
        max_model_len=448,
        max_num_seqs=16,
        limit_mm_per_prompt={"audio": 1},
        dtype="half",
    )

    prompts = [
        {  # Test implicit prompt
            "prompt": "<|startoftranscript|>",
            "multi_modal_data": {
                "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
            },
        },
        {  # Test explicit encoder/decoder prompt
            "encoder_prompt": {
                "prompt": "",
                "multi_modal_data": {
                    "audio": AudioAsset("winning_call").audio_and_sample_rate,
                },
            },
            "decoder_prompt": "<|startoftranscript|>",
        },
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
57
58
59
60
61
62
63


# Registry of supported model types: each key is an accepted `--model-type`
# value and each value is the zero-argument builder that returns the
# request data for that model.
model_example_map = {"whisper": run_whisper}


64
65
def parse_args():
    """Parse the command-line options of this example.

    Returns:
        The parsed namespace with two attributes: ``model_type`` (one of the
        keys of ``model_example_map``, default ``"whisper"``) and ``seed``
        (int, default ``0``).
    """
    parser = FlexibleArgumentParser(
        description="Demo on using vLLM for offline inference with "
        "encoder/decoder multimodal models for text generation"
    )
    parser.add_argument(
        "--model-type",
        "-m",
        type=str,
        default="whisper",
        # Restrict the option to the model types this script can build.
        choices=list(model_example_map),
        help='Huggingface "model_type".',
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=0,
        help="Set the seed when initializing `vllm.LLM`.",
    )
    return parser.parse_args()


86
87
88
89
90
def main(args):
    """Run offline generation for the selected model and print the results.

    Args:
        args: Parsed command-line namespace; ``model_type`` and ``seed`` are
            read from it.

    Raises:
        ValueError: If ``args.model_type`` has no entry in
            ``model_example_map``.
    """
    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    req_data = model_example_map[model]()

    # Disable other modalities to save memory
    default_limits = {"image": 0, "video": 0, "audio": 0}
    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
        req_data.engine_args.limit_mm_per_prompt or {}
    )

    # The seed from the command line overrides whatever the model-specific
    # engine arguments carry.
    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
    llm = LLM(**engine_args)

    prompts = req_data.prompts

    # Create a sampling params object.
    sampling_params = SamplingParams(
        temperature=0,
        top_p=1.0,
        max_tokens=64,
        skip_special_tokens=False,
    )

    # Time only the generation call, using a monotonic high-resolution clock,
    # so that console output does not distort the reported throughput.
    start = time.perf_counter()

    # Generate output tokens from the prompts. The output is a list of
    # RequestOutput objects that contain the prompt, generated
    # text, and other information.
    outputs = llm.generate(prompts, sampling_params)

    duration = time.perf_counter() - start

    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Decoder prompt: {prompt!r}, Generated text: {generated_text!r}")

    print("Duration:", duration)
    print("RPS:", len(prompts) / duration)


if __name__ == "__main__":
    # Script entry point: parse the command-line options, then run the example.
    args = parse_args()
    main(args)