# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use vLLM for running offline inference
with the correct prompt format on audio language models.

For most models, the prompt format should follow corresponding examples
on HuggingFace model repository.
"""

import json
from dataclasses import asdict
from typing import NamedTuple, Optional

from transformers import AutoTokenizer

from vllm import LLM, EngineArgs, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.lora.request import LoRARequest
from vllm.utils import FlexibleArgumentParser

audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
question_per_audio_count = {
    0: "What is 1+1?",
    1: "What is recited in the audio?",
    2: "What sport and what nursery rhyme are referenced?",
}


class ModelRequestData(NamedTuple):
    engine_args: EngineArgs
    prompt: str
    stop_token_ids: Optional[list[int]] = None
    lora_requests: Optional[list[LoRARequest]] = None


# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.


# Granite Speech
def run_granite_speech(
    model_name: str, question: str, audio_count: int
) -> ModelRequestData:
    # NOTE - the settings in this example are somewhat different from what is
    # optimal for Granite Speech, and it is generally recommended to use beam
    # search. Check the model README for suggested settings.
    # https://huggingface.co/ibm-granite/granite-speech-3.3-8b

    engine_args = EngineArgs(
        dtype="float16",
        model=model_name,
        trust_remote_code=True,
        max_model_len=2048,
        max_num_seqs=2,
        enable_lora=True,
        max_lora_rank=64,
        limit_mm_per_prompt={"audio": audio_count},
    )

    # The model has an audio-specific lora directly in its model dir;
    # it should be enabled whenever you pass audio inputs to the model.
    speech_lora_path = model_name
    audio_placeholder = "<|audio|>" * audio_count
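    # The prompt follows Granite's chat format: a system turn, then a user turn
    # that puts one <|audio|> placeholder per clip ahead of the text question.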
    prompts = f"<|start_of_role|>system<|end_of_role|>Knowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. You are a helpful AI assistant<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>{audio_placeholder}{question}<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>"  # noqa: E501

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompts,
        lora_requests=[LoRARequest("speech", 1, speech_lora_path)],
    )


# Ultravox 0.5-1B
def run_ultravox(model_name: str, question: str, audio_count: int) -> ModelRequestData:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
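    # Ultravox reuses the base tokenizer's chat template; one "<|audio|>"
    # placeholder per clip is prepended to the question in a single user turn.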
    messages = [{"role": "user", "content": "<|audio|>\n" * audio_count + question}]
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        trust_remote_code=True,
        limit_mm_per_prompt={"audio": audio_count},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
    )


model_example_map = {
    "granite_speech": run_granite_speech,
    "ultravox": run_ultravox,
}


def parse_args():
    parser = FlexibleArgumentParser(
        description="Demo on using vLLM for offline inference with "
        "audio language models"
    )
    parser.add_argument(
        "--model-type",
        "-m",
        type=str,
        default="ultravox",
        choices=model_example_map.keys(),
        help='Huggingface "model_type".',
    )
    parser.add_argument(
        "--model-name", type=str, default=None, help="Path to the model directory."
    )
    parser.add_argument(
        "--num-prompts", type=int, default=1, help="Number of prompts to run."
    )
    parser.add_argument(
        "--num-audios",
        type=int,
        default=1,
        choices=[0, 1, 2],
        help="Number of audio items per prompt.",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=2,
        help="Set the seed when initializing `vllm.LLM`.",
    )

    return parser.parse_args()


def main(args):
    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    audio_count = args.num_audios
    req_data = model_example_map[model](
        args.model_name, question_per_audio_count[audio_count], audio_count
    )

    # Disable other modalities to save memory
    default_limits = {"image": 0, "video": 0, "audio": 0}
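    # dict-union (`|`) keeps right-hand values on key collisions, so the
    # model's own "audio" limit below overrides the zeroed default.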
    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
        req_data.engine_args.limit_mm_per_prompt or {}
    )

    engine_args = asdict(req_data.engine_args) | {
        "seed": args.seed,
        "gpu_memory_utilization": 0.9,
    }
    llm = LLM(**engine_args)

    # Greedy decoding; `logprobs=10` asks vLLM to return the top-10 candidate
    # logprobs (in addition to the sampled token's) for every generated token.
    sampling_params = SamplingParams(
        temperature=0.0,
        max_tokens=64,
        stop_token_ids=req_data.stop_token_ids,
        logprobs=10,
    )

    mm_data = {}
    if audio_count > 0:
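        # AudioAsset.audio_and_sample_rate yields a (waveform, sample_rate)
        # tuple, the form vLLM expects for "audio" multi-modal inputs.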
        mm_data = {
            "audio": [
                asset.audio_and_sample_rate for asset in audio_assets[:audio_count]
            ]
        }

    assert args.num_prompts > 0
    inputs = {"prompt": req_data.prompt, "multi_modal_data": mm_data}
    if args.num_prompts > 1:
        inputs = [inputs] * args.num_prompts

    lora_request = (
        req_data.lora_requests * args.num_prompts if req_data.lora_requests else None
    )
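    # LLM.generate accepts either a single LoRARequest or a list with one entry
    # per prompt; the audio LoRA (used by Granite Speech) is replicated here so
    # every prompt in the batch gets it.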

    outputs = llm.generate(
        inputs,
        sampling_params=sampling_params,
        lora_request=lora_request,
    )

    for i, o in enumerate(outputs):
        print(f"--- Prompt {i + 1} ---")
        generated_text = o.outputs[0].text
        print(f"Generated Text: {generated_text}")

        logprobs_per_step = o.outputs[0].logprobs

        if logprobs_per_step is None:
            print("Logprobs not returned. Check your SamplingParams.")
            continue

        print("\nLogprobs per generated token:")
        for step_idx, step_logprobs_dict in enumerate(logprobs_per_step):
            
            generated_token_info = None
            for token_id, logprob_obj in step_logprobs_dict.items():
                if logprob_obj.rank == 1:
                    generated_token_info = (token_id, logprob_obj.decoded_token)
                    break 
            
            if generated_token_info:
                token_id, token_text = generated_token_info
                print(f"  Step {step_idx}:")
                print(f"    - Generated Token: {token_id} ('{token_text}')")
            else:
                print(f"  Step {step_idx}: (Could not find rank-1 token)")
                continue

            # Print the full top-k distribution for this step, ordered by rank.
            sorted_items = sorted(
                step_logprobs_dict.items(), key=lambda item: item[1].rank
            )

            print("    - Top Logprobs:")
            for token_id, logprob_obj in sorted_items:
                token_text = logprob_obj.decoded_token
                logprob_value = logprob_obj.logprob
                rank = logprob_obj.rank

                print(
                    f"        - Rank {rank}: Token {token_id} ('{token_text}') "
                    f"-> Logprob: {logprob_value:.4f}"
                )


    # Second pass: keep only the logprob of the sampled (rank-1) token at each
    # step, one list per prompt, so the result is JSON-serializable.
    serializable_data_all_prompts = []

    for o in outputs:
        logprobs_per_step = o.outputs[0].logprobs
        
        generated_token_logprobs = []
        
        if logprobs_per_step:
            for step_logprobs_dict in logprobs_per_step:

                found_token = False
                for token_id, logprob_obj in step_logprobs_dict.items():
                    if logprob_obj.rank == 1:
                        generated_token_logprobs.append(logprob_obj.logprob)
                        found_token = True
                        break  
                
                if not found_token:
                    generated_token_logprobs.append(None) 

        serializable_data_all_prompts.append(generated_token_logprobs)

    output_filename = "./generated_token_logprobs_A800_fp16.json"
    with open(output_filename, "w") as f:
        json.dump(serializable_data_all_prompts, f, indent=2)

    print(f"Wrote the logprob of each generated token to: {output_filename}")


if __name__ == "__main__":
    args = parse_args()
    main(args)