# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use vLLM for running offline inference with
the correct prompt format on audio language models.

For most models, the prompt format should follow corresponding examples
on the HuggingFace model repository.
"""

import json
from dataclasses import asdict
from typing import NamedTuple, Optional

from transformers import AutoTokenizer

from vllm import LLM, EngineArgs, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.lora.request import LoRARequest
from vllm.utils import FlexibleArgumentParser

audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
question_per_audio_count = {
    0: "What is 1+1?",
    1: "What is recited in the audio?",
    2: "What sport and what nursery rhyme are referenced?",
}


class ModelRequestData(NamedTuple):
    engine_args: EngineArgs
    prompt: str
    stop_token_ids: Optional[list[int]] = None
    lora_requests: Optional[list[LoRARequest]] = None


# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.


# Granite Speech
def run_granite_speech(
    model_name: str, question: str, audio_count: int
) -> ModelRequestData:
    # NOTE - the settings in this example are somewhat different from what is
    # optimal for Granite Speech, and it is generally recommended to use beam
    # search. Check the model README for suggested settings.
    # https://huggingface.co/ibm-granite/granite-speech-3.3-8b
    engine_args = EngineArgs(
        dtype="float16",
        model=model_name,
        trust_remote_code=True,
        max_model_len=2048,
        max_num_seqs=2,
        enable_lora=True,
        max_lora_rank=64,
        limit_mm_per_prompt={"audio": audio_count},
    )

    # The model has an audio-specific LoRA directly in its model dir;
    # it should be enabled whenever you pass audio inputs to the model.
    speech_lora_path = model_name
    audio_placeholder = "<|audio|>" * audio_count
    prompt = f"<|start_of_role|>system<|end_of_role|>Knowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. You are a helpful AI assistant<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>{audio_placeholder}{question}<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>"  # noqa: E501

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        lora_requests=[LoRARequest("speech", 1, speech_lora_path)],
    )


# Ultravox 0.5-1B
def run_ultravox(model_name: str, question: str, audio_count: int) -> ModelRequestData:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    messages = [{"role": "user", "content": "<|audio|>\n" * audio_count + question}]
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        trust_remote_code=True,
        limit_mm_per_prompt={"audio": audio_count},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
    )


model_example_map = {
    "granite_speech": run_granite_speech,
    "ultravox": run_ultravox,
}


def parse_args():
    parser = FlexibleArgumentParser(
        description="Demo on using vLLM for offline inference with "
        "audio language models"
    )
    parser.add_argument(
        "--model-type",
        "-m",
        type=str,
        default="ultravox",
        choices=model_example_map.keys(),
        help='Huggingface "model_type".',
    )
    parser.add_argument(
        "--model-name",
        type=str,
        default=None,
        help="Path to the model directory.",
    )
    parser.add_argument(
        "--num-prompts",
        type=int,
        default=1,
        help="Number of prompts to run.",
    )
    parser.add_argument(
        "--num-audios",
        type=int,
        default=1,
        choices=[0, 1, 2],
        help="Number of audio items per prompt.",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=2,
        help="Set the seed when initializing `vllm.LLM`.",
    )

    return parser.parse_args()


def main(args):
    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    audio_count = args.num_audios
    req_data = model_example_map[model](
        args.model_name, question_per_audio_count[audio_count], audio_count
    )

    # Disable other modalities to save memory
    default_limits = {"image": 0, "video": 0, "audio": 0}
    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
        req_data.engine_args.limit_mm_per_prompt or {}
    )

    engine_args = asdict(req_data.engine_args) | {
        "seed": args.seed,
        "gpu_memory_utilization": 0.9,
    }
    llm = LLM(**engine_args)

    # Greedy decoding; additionally request the top-10 logprobs per step.
    sampling_params = SamplingParams(
        temperature=0.0,
        max_tokens=64,
        stop_token_ids=req_data.stop_token_ids,
        logprobs=10,
    )

    mm_data = {}
    if audio_count > 0:
        mm_data = {
            "audio": [
                asset.audio_and_sample_rate for asset in audio_assets[:audio_count]
            ]
        }

    assert args.num_prompts > 0
    inputs = {"prompt": req_data.prompt, "multi_modal_data": mm_data}
    if args.num_prompts > 1:
        inputs = [inputs] * args.num_prompts

    lora_request = (
        req_data.lora_requests * args.num_prompts if req_data.lora_requests else None
    )

    outputs = llm.generate(
        inputs,
        sampling_params=sampling_params,
        lora_request=lora_request,
    )

    for i, o in enumerate(outputs):
        print(f"--- Prompt {i + 1} ---")
        generated_text = o.outputs[0].text
        print(f"Generated Text: {generated_text}")

        logprobs_per_step = o.outputs[0].logprobs
        if logprobs_per_step is None:
            print("Logprobs not returned. Check your SamplingParams.")
            continue

        print("\nLogprobs per generated token:")
        for step_idx, step_logprobs_dict in enumerate(logprobs_per_step):
            # With greedy decoding, the generated token is the rank-1 entry.
            generated_token_info = None
            for token_id, logprob_obj in step_logprobs_dict.items():
                if logprob_obj.rank == 1:
                    generated_token_info = (token_id, logprob_obj.decoded_token)
                    break

            if generated_token_info:
                token_id, token_text = generated_token_info
                print(f"  Step {step_idx}:")
                print(f"    - Generated Token: {token_id} ('{token_text}')")
            else:
                print(f"  Step {step_idx}: (Could not find rank-1 token)")
                continue

            sorted_logprobs = sorted(
                step_logprobs_dict.values(), key=lambda x: x.rank
            )
            print("    - Top Logprobs:")
            for logprob_obj in sorted_logprobs:
                # Reverse-lookup the token ID belonging to this Logprob object
                token_id = next(
                    tid
                    for tid, lp in step_logprobs_dict.items()
                    if lp is logprob_obj
                )
                token_text = logprob_obj.decoded_token
                logprob_value = logprob_obj.logprob
                rank = logprob_obj.rank
                print(
                    f"      - Rank {rank}: Token {token_id} ('{token_text}') "
                    f"-> Logprob: {logprob_value:.4f}"
                )

    # Collect the logprob of each generated (rank-1) token per prompt and dump
    # them to a JSON file.
    serializable_data_all_prompts = []
    for o in outputs:
        logprobs_per_step = o.outputs[0].logprobs
        generated_token_logprobs = []
        if logprobs_per_step:
            for step_logprobs_dict in logprobs_per_step:
                found_token = False
                for token_id, logprob_obj in step_logprobs_dict.items():
                    if logprob_obj.rank == 1:
                        generated_token_logprobs.append(logprob_obj.logprob)
                        found_token = True
                        break
                if not found_token:
                    generated_token_logprobs.append(None)
        serializable_data_all_prompts.append(generated_token_logprobs)

    output_filename = "./generated_token_logprobs_A800_fp16.json"
    with open(output_filename, "w") as f:
        json.dump(serializable_data_all_prompts, f, indent=2)
    print(
        f"Successfully wrote the logprob of each generated token to: {output_filename}"
    )


if __name__ == "__main__":
    args = parse_args()
    main(args)
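

# -----------------------------------------------------------------------------
# Example invocation (a sketch, not part of the example itself). The script
# file name below is only an assumed placeholder, and `--model-name` should
# point at a local directory or Hugging Face repo matching the chosen
# `--model-type`:
#
#   python audio_language_logprobs.py \
#       --model-type ultravox \
#       --model-name <path-or-repo-id> \
#       --num-prompts 2 \
#       --num-audios 1 \
#       --seed 2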
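#
# A minimal sketch of reading back the JSON dump written by `main()`; the file
# name matches `output_filename` above, and the analysis shown is purely
# illustrative:
#
#   import json
#
#   with open("./generated_token_logprobs_A800_fp16.json") as f:
#       per_prompt_logprobs = json.load(f)  # list of per-prompt logprob lists
#
#   for idx, logprobs in enumerate(per_prompt_logprobs):
#       print(f"Prompt {idx}: {len(logprobs)} generated tokens")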