import soundfile as sf
import torch
from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor
from qwen_omni_utils import process_mm_info

'''
FORCE_QWENVL_VIDEO_READER=decord  # force the decord video-reader backend
'''

# Load the model on the available device(s) in bfloat16, with FlashAttention 2
# enabled for better acceleration and memory saving.
model = Qwen2_5OmniModel.from_pretrained(
    "Qwen/Qwen2.5-Omni-7B",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="flash_attention_2",
)

# Alternative: same load, letting transformers pick the dtype automatically.
# model = Qwen2_5OmniModel.from_pretrained(
#     "Qwen/Qwen2.5-Omni-7B",
#     torch_dtype="auto",
#     device_map="auto",
#     attn_implementation="flash_attention_2",
# )

processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")

# Conversation with a system prompt and a single local video as the user input
conversation = [
    {
        "role": "system",
        "content": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.",
    },
    {
        "role": "user",
        "content": [
            {"type": "video", "video": "./draw.mp4"},
        ],
    },
]

# Preparation for inference
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
audios, images, videos = process_mm_info(conversation, use_audio_in_video=True)
inputs = processor(text=text, audios=audios, images=images, videos=videos, return_tensors="pt", padding=True)
inputs = inputs.to(model.device).to(model.dtype)

# Inference: generation of the output text and audio
text_ids, audio = model.generate(**inputs, use_audio_in_video=True)

text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
print(text)

# Save the generated speech as a 24 kHz WAV file
sf.write(
    "output.wav",
    audio.reshape(-1).detach().cpu().numpy(),
    samplerate=24000,
)
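
# Optional note -- a sketch based on the FORCE_QWENVL_VIDEO_READER line at the
# top of this file: to force the decord video-reader backend, export the
# variable before launching the script, e.g.
#
#   FORCE_QWENVL_VIDEO_READER=decord python your_script.py
#
# or set it from Python before process_mm_info is called (this assumes the
# qwen_omni_utils video loader reads the variable at call time):
#
#   import os
#   os.environ["FORCE_QWENVL_VIDEO_READER"] = "decord"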