# phi4_speech_inference.py

import os
import requests
import torch
from PIL import Image
import soundfile
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig

# Local checkpoint directory; the processor, the model weights and the
# generation config below are all loaded from this one location.
model_path = '/home/wanglch/Phi4/Phi-4-multimodal-instruct/'

# NOTE(review): `kwargs` is populated here but is not passed to any call in
# this section -- the model below is loaded with torch_dtype='auto' instead.
kwargs = {'torch_dtype': torch.bfloat16}

processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

# Load the checkpoint first, then move it to the default CUDA device.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    torch_dtype='auto',
    _attn_implementation='flash_attention_2',
)
model = model.cuda()

# Generation settings are read from generation_config.json under model_path.
generation_config = GenerationConfig.from_pretrained(model_path, 'generation_config.json')

# Marker tokens used when a prompt is assembled by hand (see the single-turn
# example below): role openers plus the end-of-message suffix.
user_prompt, assistant_prompt, prompt_suffix = '<|user|>', '<|assistant|>', '<|end|>'

# Example recordings located under the checkpoint's examples/ directory.
AUDIO_FILE_1 = '/home/wanglch/Phi4/Phi-4-multimodal-instruct/examples/what_is_the_traffic_sign_in_the_image.wav'
AUDIO_FILE_2 = '/home/wanglch/Phi4/Phi-4-multimodal-instruct/examples/what_is_shown_in_this_image.wav'

# Stop before any inference is attempted if the first recording is absent.
if not os.path.exists(AUDIO_FILE_1):
    missing_audio_message = f'Please prepare the audio file {AUDIO_FILE_1} before running the following code.'
    raise FileNotFoundError(missing_audio_message)

########################## speech only ################################
# Single-turn transcription: one user message that references one audio clip
# through the <|audio_1|> placeholder, followed by the assistant opener.
speech_prompt = "Based on the attached audio, generate a comprehensive text transcription of the spoken content."
prompt = ''.join((user_prompt, '<|audio_1|>', speech_prompt, prompt_suffix, assistant_prompt))

print(f'>>> Prompt\n{prompt}')

# The value returned by soundfile.read is handed to the processor unchanged.
audio = soundfile.read(AUDIO_FILE_1)
inputs = processor(text=prompt, audios=[audio], return_tensors='pt')
inputs = inputs.to('cuda:0')

generate_ids = model.generate(
    **inputs,
    max_new_tokens=1000,
    generation_config=generation_config,
)

# Discard the leading positions that correspond to the prompt's input_ids so
# that only what follows them is decoded.
prompt_length = inputs['input_ids'].shape[1]
generate_ids = generate_ids[:, prompt_length:]

decoded = processor.batch_decode(
    generate_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
response = decoded[0]
print(f'>>> Response\n{response}')

# The multi-turn example needs the second recording as well.
if not os.path.exists(AUDIO_FILE_2):
    raise FileNotFoundError(f'Please prepare the audio file {AUDIO_FILE_2} before running the following code.')
########################### speech only (multi-turn) ################################
# Two user turns, each referencing its own clip (<|audio_1|>, <|audio_2|>),
# with an assistant reply in between. Note the order: the first turn uses
# AUDIO_FILE_2 and the second turn uses AUDIO_FILE_1.
audio_1 = soundfile.read(AUDIO_FILE_2)
audio_2 = soundfile.read(AUDIO_FILE_1)
chat = [
    {'role': 'user', 'content': '<|audio_1|>Based on the attached audio, generate a comprehensive text transcription of the spoken content.'},
    {
        'role': 'assistant',
        'content': "What is shown in this image.",
    },
    {'role': 'user', 'content': '<|audio_2|>Based on the attached audio, generate a comprehensive text transcription of the spoken content.'},
]
prompt = processor.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
# A trailing <|endoftext|> is meant for training, not inference, so drop it if
# the template appended one. (For training, make sure to add <|endoftext|> at
# the end.)
# Slice off exactly that suffix: str.rstrip() treats its argument as a *set of
# characters*, so rstrip('<|endoftext|>') would keep stripping trailing
# '>', '|', 't', 'n', ... from whatever precedes the token, e.g. turning a
# preceding '<|assistant|>' into '<|assista'.
end_of_text = '<|endoftext|>'
if prompt.endswith(end_of_text):
    prompt = prompt[: -len(end_of_text)]

print(f'>>> Prompt\n{prompt}')

# Run the multi-turn prompt; the clips are passed in the same order as their
# <|audio_1|> / <|audio_2|> placeholders.
inputs = processor(text=prompt, audios=[audio_1, audio_2], return_tensors='pt')
inputs = inputs.to('cuda:0')

generate_ids = model.generate(
    **inputs,
    max_new_tokens=1000,
    generation_config=generation_config,
)

# Discard the leading positions that correspond to the prompt's input_ids so
# that only what follows them is decoded.
prompt_length = inputs['input_ids'].shape[1]
generate_ids = generate_ids[:, prompt_length:]

decoded = processor.batch_decode(
    generate_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
response = decoded[0]
print(f'>>> Response\n{response}')