Commit 5e8fb565 authored by chenzk's avatar chenzk
Browse files

v1.0

parents
Pipeline #1799 canceled with stages
import os
import pkg_resources
from setuptools import setup, find_packages


def _read_requirements():
    """Parse requirements.txt (next to this setup.py) into pip requirement strings."""
    req_path = os.path.join(os.path.dirname(__file__), "requirements.txt")
    # Use a context manager so the file handle is closed deterministically;
    # the original passed a bare open() and never closed it.
    with open(req_path) as fh:
        return [str(r) for r in pkg_resources.parse_requirements(fh)]


setup(
    name="clip",
    py_modules=["clip"],
    version="1.0",
    description="",
    author="OpenAI",
    packages=find_packages(exclude=["tests*"]),
    install_requires=_read_requirements(),
    include_package_data=True,
    extras_require={'dev': ['pytest']},
)
import numpy as np
import pytest
import torch
from PIL import Image
import clip
@pytest.mark.parametrize('model_name', clip.available_models())
def test_consistency(model_name):
    """The TorchScript and eager builds of each CLIP model must agree on CLIP.png."""
    device = "cpu"
    scripted, preprocess = clip.load(model_name, device=device, jit=True)
    eager, _ = clip.load(model_name, device=device, jit=False)

    image = preprocess(Image.open("CLIP.png")).unsqueeze(0).to(device)
    text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device)

    with torch.no_grad():
        probs = []
        for net in (scripted, eager):
            logits, _ = net(image, text)
            probs.append(logits.softmax(dim=-1).cpu().numpy())

    # Loose tolerances: JIT and eager paths differ in op fusion / numerics.
    assert np.allclose(probs[0], probs[1], atol=0.01, rtol=0.1)
MIT License
Copyright (c) 2024 gpt-omni
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# Mini-Omni2
Mini-Omni2是功能上最接近GPT4o的多模态模型之一,Mini-Omni2是个视觉-音频助理,实时语音对话,能同时处理视觉、听觉和文本三种模态,针对用户视频和语音查询,实时提供端到端的语音响应。
## 论文
`Mini-Omni2: Towards Open-source GPT-4o Model with Vision, Speech and Duplex`
- https://arxiv.org/pdf/2410.11190
## 模型结构
Omni2的语言模型采用Qwen2-0.5B,adapter采用llama中常用的MLP,Encoder采用各种预训练效果较好模态的编码器,以便用很简单的方法就能训练出多模态模型。
<div align=center>
<img src="./doc/omni2.png"/>
</div>
## 算法原理
Omni2的原理是以文本到文本的大语言模型能力为基础,吸收各种已经预训练好的不同模态的编码器的编码能力,添加一层MLP来适应多模态任务,通过一定的微调策略,在缺乏多模态配对训练数据的情况下也能取得一定的多模态效果,下图为其三阶段微调策略。
<div align=center>
<img src="./doc/train.png"/>
</div>
## 环境配置
```
mv mini-omni2_pytorch mini-omni2 # 去框架名后缀
```
### Docker(方法一)
```
docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.3.0-ubuntu22.04-dtk24.04.2-py3.10
# <your IMAGE ID>为以上拉取的docker的镜像ID替换,本镜像为:83714c19d308
docker run -it --shm-size=64G -v $PWD/mini-omni2:/home/mini-omni2 -v /opt/hyhal:/opt/hyhal:ro --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name omni2 <your IMAGE ID> bash
cd /home/mini-omni2
pip install -r requirements.txt # requirements.txt
# 安装ffmpeg
apt update
apt-get install ffmpeg
# 安装CLIP
cd CLIP
pip install . #clip==1.0
```
### Dockerfile(方法二)
```
cd mini-omni2/docker
docker build --no-cache -t omni2:latest .
docker run --shm-size=64G --name omni2 -v /opt/hyhal:/opt/hyhal:ro --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video -v $PWD/../../mini-omni2:/home/mini-omni2 -it omni2 bash
# 若遇到Dockerfile启动的方式安装环境需要长时间等待,可注释掉里面的pip安装,启动容器后再安装python库:pip install -r requirements.txt。
cd /home/mini-omni2
# 安装ffmpeg
apt update
apt-get install ffmpeg
# 安装CLIP (clip==1.0)
cd CLIP
pip install . #clip==1.0
```
### Anaconda(方法三)
1、关于本项目DCU显卡所需的特殊深度学习库可从光合开发者社区下载安装:
- https://developer.hpccube.com/tool/
```
DTK驱动:dtk24.04.2
python:python3.10
torch:2.3.0
torchvision:0.18.1
torchaudio:2.1.2
triton:2.1.0
flash-attn:2.0.4
deepspeed:0.14.2
apex:1.3.0
xformers:0.0.25
```
`Tips:以上dtk驱动、python、torch等DCU相关工具版本需要严格一一对应。`
2、其它非特殊库参照requirements.txt安装
```
cd mini-omni2
pip install -r requirements.txt # requirements.txt
# 安装ffmpeg
apt update
apt-get install ffmpeg
# 安装CLIP
cd CLIP
pip install . #clip==1.0
```
## 数据集
## 训练
## 推理
1、下载预训练权重`gpt-omni/mini-omni2`,将mini-omni2下所有文件放入checkpoint文件夹下;
2、下载预训练权重`hubertsiuzdak/snac_24khz`,将文件夹hubertsiuzdak放在根目录mini-omni2下;
```
cd mini-omni2
python inference_vision.py
```
更多资料可参考源项目的[`README_origin`](./README_origin.md)
## result
`输入: `
```
# 音色
input_audio_path = './data/samples/vision_qa_audio.wav'
# 图片
input_image_path = './data/samples/vision_qa_image.jpg'
```
`输出:`
```
# 文本
text output: The person in the image appears to be a middle-aged man with a fair complexion. He has short, neatly combed gray hair and a receding hairline. His facial features include a prominent nose, thin lips, and a gentle smile. He is wearing a dark suit with a notched lapel, a white shirt, and a dark tie with diagonal stripes. The background is a neutral, dark color that provides a contrast to his light-colored suit. The overall impression is one of professionalism and formality.
# 音频
'vision_qa_output.wav'
```
### 精度
DCU与GPU精度一致,推理框架:pytorch。
## 应用场景
### 算法类别
`对话问答`
### 热点应用行业
`制造,广媒,金融,能源,医疗,家居,教育`
## 预训练权重
预训练权重快速下载中心:[SCNet AIModels](http://113.200.138.88:18080/aimodels) ,项目中的预训练权重可从快速下载通道下载:[gpt-omni/mini-omni2](http://113.200.138.88:18080/project-dependency/mini-omni2.git)[hubertsiuzdak/snac_24khz](http://113.200.138.88:18080/aimodels/hubertsiuzdak/snac_24khz.git)
Hugging Face下载地址为:[gpt-omni/mini-omni2](https://huggingface.co/gpt-omni/mini-omni2)、[hubertsiuzdak/snac_24khz](https://huggingface.co/hubertsiuzdak/snac_24khz)
## 源码仓库及问题反馈
- http://developer.sourcefind.cn/codes/modelzoo/mini-omni2_pytorch.git
## 参考资料
- https://github.com/gpt-omni/mini-omni2.git
- https://github.com/QwenLM/Qwen2.5.git
# Mini-Omni2
<p align="center">
<img src="./data/figures/title_new.png" width="90%"/>
</p>
<p align="center">
🤗 <a href="https://huggingface.co/gpt-omni/mini-omni2">Hugging Face</a> | 📖 <a href="https://github.com/gpt-omni/mini-omni2">Github</a>
| 📑 <a href="https://arxiv.org/abs/2410.11190">Technical report</a>
</p>
<h5 align="center"> If you like little Omni2, please give us a star⭐ and cite our <a href="https://arxiv.org/abs/2410.11190">paper</a>!</h5>
## Introduction
Mini-Omni2 is an **omni-interactive** model. It can **understand image, audio and text inputs and has end-to-end voice conversations with users**. Featuring **real-time voice output**, **omni-capable multimodal understanding** and flexible interaction **ability with interruption mechanism while speaking**.
<p align="center">
<img src="./data/figures/framework.jpeg" width="100%"/>
</p>
## Updates
- **2024.10:** Release the model, technical report, inference and chat demo code.
## Features
**Multimodal interaction**: with the ability to understand images, speech and text, just like GPT-4o.
**Real-time speech-to-speech** conversational capabilities. No extra ASR or TTS models required, just like [Mini-Omni](https://github.com/gpt-omni/mini-omni).
<!-- ✅ **Streaming audio output**: with first-chunk latency of audio stream less than 0.3s. -->
<!-- ✅ **Duplex interaction**: hearing while speaking, it can be interrupted by key words like "stop omni". -->
## Demo
NOTE: need to unmute first.
https://github.com/user-attachments/assets/ad97ca7f-f8b4-40c3-a7e8-fa54b4edf155
## ToDo
- [ ] update interruption mechanism
- [ ] visual-assistant model and data
## Install
Create a new conda environment and install the required packages:
```sh
conda create -n omni python=3.10
conda activate omni
git clone https://github.com/gpt-omni/mini-omni2.git
cd mini-omni2
pip install -r requirements.txt
```
## Quick start
**Interactive demo**
- start server
NOTE: you need to start the server before running the streamlit or gradio demo with API_URL set to the server address.
```sh
sudo apt-get install ffmpeg
conda activate omni
cd mini-omni2
python3 server.py --ip '0.0.0.0' --port 60808
```
- run streamlit demo
NOTE: you need to run streamlit **locally** with PyAudio installed.
```sh
pip install PyAudio==0.2.14
API_URL=http://0.0.0.0:60808/chat streamlit run webui/omni_streamlit.py
```
**Local test**
```sh
conda activate omni
cd mini-omni2
# test run the preset audio samples and questions
python inference_vision.py
```
## Mini-Omni2 Overview
**1. Multimodal Modeling**:
We use multiple sequences as the input and output of the model. In the input part, we will concatenate image, audio and text features to perform a series of comprehensive tasks, as shown in the following figures. In the output part, we use text-guided delayed parallel output to generate real-time speech responses.
<p align="center">
<img src="./data/figures/inputids.png" width="100%"/>
</p>
**2. Multi-stage Training**:
We propose an efficient alignment training method and conduct encoder adaptation, modal alignment, and multimodal fine-tuning respectively in the three-stage training.
<p align="center">
<img src="./data/figures/training.jpeg" width="100%"/>
</p>
<!-- **3. Cases**:
Here are more cases of Mini-Omni2:
<p align="center">
<img src="./data/figures/samples.png" width="100%"/>
</p> -->
## FAQ
**1. Does the model support other languages?**
No, the model is only trained on English. However, as we use whisper as the audio encoder, the model can understand other languages which is supported by whisper (like chinese), but the output is only in English.
**2. Error: can not run streamlit in local browser, with remote streamlit server**
You need start streamlit **locally** with PyAudio installed.
## Acknowledgements
- [Qwen2](https://github.com/QwenLM/Qwen2/) as the LLM backbone.
- [litGPT](https://github.com/Lightning-AI/litgpt/) for training and inference.
- [whisper](https://github.com/openai/whisper/) for audio encoding.
- [clip](https://github.com/openai/CLIP) for image encoding.
- [snac](https://github.com/hubertsiuzdak/snac/) for audio decoding.
- [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) for generating synthetic speech.
- [OpenOrca](https://huggingface.co/datasets/Open-Orca/OpenOrca) and [MOSS](https://github.com/OpenMOSS/MOSS/tree/main) for alignment.
## Citation
```bibtex
@article{xie2024miniomni2opensourcegpt4ovision,
title={Mini-Omni2: Towards Open-source GPT-4o with Vision, Speech and Duplex Capabilities},
author={Zhifei Xie and Changqiao Wu},
year={2024},
eprint={2410.11190},
archivePrefix={arXiv},
primaryClass={eess.AS},
journal={ArXiv},
volume={abs/2410.11190},
}
```
## Star History
[![Star History Chart](https://api.star-history.com/svg?repos=gpt-omni/mini-omni2&type=Date)](https://star-history.com/#gpt-omni/mini-omni2&Date)
import sys
import os
# Make modules that live next to this file importable even when the script
# is launched from a different working directory.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# DCU (Hygon) PyTorch base image with DTK 24.04.2 and Python 3.10.
FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.3.0-ubuntu22.04-dtk24.04.2-py3.10

# Keep apt from blocking on interactive prompts during image builds.
ENV DEBIAN_FRONTEND=noninteractive

# RUN yum update && yum install -y git cmake wget build-essential
# RUN source /opt/dtk-24.04.2/env.sh

# Install pip dependencies (Aliyun mirror for faster downloads inside CN networks).
COPY requirements.txt requirements.txt
RUN pip3 install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
#torch==2.3.1
#torchvision==0.18.1
#torchaudio==2.3.1
litgpt==0.4.3
snac==1.2.0
soundfile==0.12.1
openai-whisper
streamlit==1.37.1
# PyAudio==0.2.14
pydub==0.25.1
onnxruntime==1.19.0
# numpy==1.26.3
gradio==4.42.0
librosa==0.10.2.post1
#flask==3.0.3
fire
transformers==4.45.2
tokenizers==0.20.1
#git+https://github.com/mini-omni/CLIP.git
docker run -it --shm-size=64G -v $PWD/mini-omni2:/home/mini-omni2 -v /public/DL_DATA/AI:/home/AI -v /opt/hyhal:/opt/hyhal:ro --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name omni2 83714c19d308 bash
# python -m torch.utils.collect_env
---
license: mit
tags:
- audio
---
# SNAC 🍿
Multi-**S**cale **N**eural **A**udio **C**odec (SNAC) compresses audio into discrete codes at a low bitrate.
👉 This model was primarily trained on speech data, and its recommended use case is speech synthesis. See below for other pretrained models.
🔗 GitHub repository: https://github.com/hubertsiuzdak/snac/
## Overview
SNAC encodes audio into hierarchical tokens similarly to SoundStream, EnCodec, and DAC. However, SNAC introduces a simple change where coarse tokens are sampled less frequently,
covering a broader time span.
This model compresses 24 kHz audio into discrete codes at a 0.98 kbps bitrate. It uses 3 RVQ levels with token rates of 12, 23, and
47 Hz.
## Pretrained models
Currently, all models support only single audio channel (mono).
| Model | Bitrate | Sample Rate | Params | Recommended use case |
|-----------------------------------------------------------------------------|-----------|-------------|--------|--------------------------|
| hubertsiuzdak/snac_24khz (this model) | 0.98 kbps | 24 kHz | 19.8 M | 🗣️ Speech |
| [hubertsiuzdak/snac_32khz](https://huggingface.co/hubertsiuzdak/snac_32khz) | 1.9 kbps | 32 kHz | 54.5 M | 🎸 Music / Sound Effects |
| [hubertsiuzdak/snac_44khz](https://huggingface.co/hubertsiuzdak/snac_44khz) | 2.6 kbps | 44 kHz | 54.5 M | 🎸 Music / Sound Effects |
## Usage
Install it using:
```bash
pip install snac
```
To encode (and decode) audio with SNAC in Python, use the following code:
```python
import torch
from snac import SNAC
model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval().cuda()
audio = torch.randn(1, 1, 24000).cuda() # B, 1, T
with torch.inference_mode():
codes = model.encode(audio)
audio_hat = model.decode(codes)
```
You can also encode and reconstruct in a single call:
```python
with torch.inference_mode():
audio_hat, codes = model(audio)
```
⚠️ Note that `codes` is a list of token sequences of variable lengths, each corresponding to a different temporal
resolution.
```
>>> [code.shape[1] for code in codes]
[12, 24, 48]
```
## Acknowledgements
Module definitions are adapted from the [Descript Audio Codec](https://github.com/descriptinc/descript-audio-codec).
\ No newline at end of file
icon.png

53.8 KB

import os
import lightning as L
import torch
import glob
import time
from snac import SNAC
from litgpt import Tokenizer
from litgpt.utils import (
num_parameters,
)
from litgpt.generate.base import (
generate_AA,
generate_ASR,
generate_TA,
generate_TT,
generate_AT,
generate_TA_BATCH,
next_token_image_batch
)
import soundfile as sf
from litgpt.model import GPT, Config
from lightning.fabric.utilities.load import _lazy_load as lazy_load
from utils.snac_utils import layershift, reconscruct_snac, reconstruct_tensors, get_time_str
from utils.snac_utils import get_snac, generate_audio_data
import whisper
from tqdm import tqdm
from huggingface_hub import snapshot_download
torch.set_printoptions(sci_mode=False)
# Vocabulary layout shared by all task helpers below.
# Text tokens occupy [0, text_vocabsize); the special text tokens follow.
text_vocabsize = 151936
text_specialtokens = 64
audio_vocabsize = 4096
audio_specialtokens = 64

# Padded sizes used to offset audio codebook layers in the combined id space.
padded_text_vocabsize = text_vocabsize + text_specialtokens
padded_audio_vocabsize = audio_vocabsize + audio_specialtokens

# Special text-token ids (appended right after the text vocabulary).
_eot = text_vocabsize
_pad_t = text_vocabsize + 1
_input_t = text_vocabsize + 2
_answer_t = text_vocabsize + 3
_asr = text_vocabsize + 4

# Special audio-token ids (appended right after each audio codebook).
_eoa = audio_vocabsize
_pad_a = audio_vocabsize + 1
_input_a = audio_vocabsize + 2
_answer_a = audio_vocabsize + 3
_split = audio_vocabsize + 4
_image = audio_vocabsize + 5
_eoimage = audio_vocabsize + 6
def get_input_ids_TA(text, text_tokenizer):
    """Build the 8-layer input-id stack for the text-to-audio (TA) task.

    Layers 0-6 are audio-codebook rows filled with layer-shifted pads and
    terminated by the layer-shifted answer-audio token; layer 7 carries the
    tokenized text wrapped in input/eot/answer markers. Each row is a
    (1, seq_len) tensor.
    """
    tokens = text_tokenizer.encode(text)
    pad_len = len(tokens) + 2
    layers = []
    for layer in range(7):
        row = [layershift(_pad_a, layer)] * pad_len + [layershift(_answer_a, layer)]
        layers.append(torch.tensor(row).unsqueeze(0))
    text_row = [_input_t] + tokens.tolist() + [_eot] + [_answer_t]
    layers.append(torch.tensor(text_row).unsqueeze(0))
    return layers
def get_input_ids_TT(text, text_tokenizer):
    """Build the 8-layer input-id stack for the text-to-text (TT) task.

    Audio layers 0-6 contain only layer-shifted pads (text length + 3);
    layer 7 carries the tokenized text wrapped in input/eot/answer markers.
    """
    text_tokens = text_tokenizer.encode(text).tolist()
    pad_len = len(text_tokens) + 3
    stack = [
        torch.tensor([layershift(_pad_a, layer)] * pad_len).unsqueeze(0)
        for layer in range(7)
    ]
    stack.append(
        torch.tensor([_input_t] + text_tokens + [_eot] + [_answer_t]).unsqueeze(0)
    )
    return stack
def get_input_ids_whisper(
    mel, leng, whispermodel, device,
    special_token_a=_answer_a, special_token_t=_answer_t,
):
    """Embed a mel spectrogram with whisper and build the 8-layer prompt ids.

    Returns (audio_feature with a leading batch dim, list of 8 (1, T+3) id rows):
    layers 0-6 are input_a / pads / eoa / special audio token (all layer-shifted),
    layer 7 is input_t / text pads / eot / special text token.
    """
    with torch.no_grad():
        audio_feature = whispermodel.embed_audio(mel.unsqueeze(0).to(device))[0][:leng]

    T = audio_feature.size(0)
    input_ids = []
    for layer in range(7):
        row = (
            [layershift(_input_a, layer)]
            + [layershift(_pad_a, layer)] * T
            + [layershift(_eoa, layer), layershift(special_token_a, layer)]
        )
        input_ids.append(torch.tensor(row).unsqueeze(0))
    text_row = [_input_t] + [_pad_t] * T + [_eot, special_token_t]
    input_ids.append(torch.tensor(text_row).unsqueeze(0))
    return audio_feature.unsqueeze(0), input_ids
def get_input_ids_whisper_ATBatch(mel, leng, whispermodel, device):
    """Build a batch-of-2 prompt for the combined A1A2 + A1T2 run.

    Row 0 (AA) asks for an audio answer, row 1 (AT) pads the audio answer slot
    so only text is produced. Returns (stacked audio features duplicated for
    both rows, list of 8 (2, T+3) id tensors).
    """
    with torch.no_grad():
        mel = mel.unsqueeze(0).to(device)
        # audio_feature = whisper.decode(whispermodel,mel, options).audio_features
        audio_feature = whispermodel.embed_audio(mel)[0][:leng]
    T = audio_feature.size(0)

    # First batch row: audio-answer variant (ends in layer-shifted _answer_a).
    input_ids_AA = []
    for i in range(7):
        input_ids_item = []
        input_ids_item.append(layershift(_input_a, i))
        input_ids_item += [layershift(_pad_a, i)] * T
        input_ids_item += [(layershift(_eoa, i)), layershift(_answer_a, i)]
        input_ids_AA.append(torch.tensor(input_ids_item))
    input_id_T = torch.tensor([_input_t] + [_pad_t] * T + [_eot, _answer_t])
    input_ids_AA.append(input_id_T)

    # Second batch row: text-only variant (audio answer slot padded with _pad_a).
    input_ids_AT = []
    for i in range(7):
        input_ids_item = []
        input_ids_item.append(layershift(_input_a, i))
        input_ids_item += [layershift(_pad_a, i)] * T
        input_ids_item += [(layershift(_eoa, i)), layershift(_pad_a, i)]
        input_ids_AT.append(torch.tensor(input_ids_item))
    input_id_T = torch.tensor([_input_t] + [_pad_t] * T + [_eot, _answer_t])
    input_ids_AT.append(input_id_T)

    # Stack per layer: stacked_inputids[j] becomes a (2, T+3) tensor whose
    # rows are (AA, AT) for layer j.
    input_ids = [input_ids_AA, input_ids_AT]
    stacked_inputids = [[] for _ in range(8)]
    for i in range(2):
        for j in range(8):
            stacked_inputids[j].append(input_ids[i][j])
    stacked_inputids = [torch.stack(tensors) for tensors in stacked_inputids]
    # Audio features are identical for both batch rows.
    return torch.stack([audio_feature, audio_feature]), stacked_inputids
def load_audio(path):
    """Load an audio file and return (log-mel spectrogram, frame count).

    The frame count is int(duration_ms / 20) + 1 — presumably one whisper
    encoder frame per 20 ms of audio plus one; confirm against the encoder's
    downsampling factor.
    """
    waveform = whisper.load_audio(path)
    duration_ms = (len(waveform) / 16000) * 1000  # whisper loads at 16 kHz
    mel = whisper.log_mel_spectrogram(whisper.pad_or_trim(waveform))
    return mel, int(duration_ms / 20) + 1
def A1_A2_batch(fabric, audio_feature, input_ids, leng, model, text_tokenizer, step,
                snacmodel, out_dir=None):
    """Audio in -> audio + text out, batched A1A2/A1T2 generation.

    Decodes the SNAC audio tokens to a wav file under out_dir (or a default
    directory) named after *step*, and returns the decoded text answer.
    """
    with fabric.init_tensor():
        model.set_kv_cache(batch_size=2)
    tokenlist = generate_TA_BATCH(
        model,
        audio_feature,
        input_ids,
        [leng, leng],
        ["A1A2", "A1T2"],
        max_returned_tokens=2048,
        temperature=0.9,
        top_k=1,
        eos_id_a=_eoa,
        eos_id_t=_eot,
        pad_id_t=_pad_t,
        shift=padded_text_vocabsize,
        include_prompt=True,
        generate_text=True,
    )
    # Last row is text; truncate at the end-of-text id if present.
    text_tokenlist = tokenlist[-1]
    if text_vocabsize in text_tokenlist:
        text_tokenlist = text_tokenlist[: text_tokenlist.index(text_vocabsize)]
    text = text_tokenizer.decode(torch.tensor(text_tokenlist)).strip()

    # Remaining rows are the 7 audio codebook layers.
    audio_tokenlist = tokenlist[:-1]
    audiolist = reconscruct_snac(audio_tokenlist)
    audio = reconstruct_tensors(audiolist)
    if out_dir is None:
        out_dir = "./output/default/A1-A2-batch"
    else:
        out_dir = out_dir + "/A1-A2-batch"
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    with torch.inference_mode():
        audio_hat = snacmodel.decode(audio)
    sf.write(
        f"{out_dir}/{step:02d}.wav",
        audio_hat.squeeze().cpu().numpy(),
        24000,
    )
    # Release the KV cache so the next task can size its own.
    model.clear_kv_cache()
    return text
def A1_T2(fabric, audio_feature, input_ids, leng, model, text_tokenizer, step):
    """Audio in -> text out (A1T2 task). Returns the decoded, stripped text."""
    with fabric.init_tensor():
        model.set_kv_cache(batch_size=1)
    tokenlist = generate_AT(
        model,
        audio_feature,
        input_ids,
        [leng],
        ["AT"],
        max_returned_tokens=2048,
        temperature=0.9,
        top_k=1,
        eos_id_a=_eoa,
        eos_id_t=_eot,
        pad_id_t=_pad_t,
        shift=padded_text_vocabsize,
        include_prompt=True,
        generate_text=True,
    )
    # Release the KV cache before returning, mirroring A1_T1/T1_T2/A1_A2;
    # the original omitted this and leaked the cache allocated above.
    model.clear_kv_cache()
    return text_tokenizer.decode(torch.tensor(tokenlist)).strip()
def A1_A2(fabric, audio_feature, input_ids, leng, model, text_tokenizer, step,
          snacmodel, out_dir=None):
    """Audio in -> audio + text out (single-sample A1A2 task).

    Writes the decoded wav to out_dir (or a default directory) named after
    *step*, and returns the decoded text answer.
    """
    with fabric.init_tensor():
        model.set_kv_cache(batch_size=1)
    tokenlist = generate_AA(
        model,
        audio_feature,
        input_ids,
        [leng],
        ["A1T2"],
        max_returned_tokens=2048,
        temperature=0.9,
        top_k=1,
        eos_id_a=_eoa,
        eos_id_t=_eot,
        pad_id_t=_pad_t,
        shift=padded_text_vocabsize,
        include_prompt=True,
        generate_text=True,
    )
    # Reconstruct SNAC layers from all rows, then keep the text row (last)
    # and truncate it at the end-of-text id if present.
    audiolist = reconscruct_snac(tokenlist)
    tokenlist = tokenlist[-1]
    if text_vocabsize in tokenlist:
        tokenlist = tokenlist[: tokenlist.index(text_vocabsize)]
    if out_dir is None:
        out_dir = "./output/default/A1-A2"
    else:
        out_dir = out_dir + "/A1-A2"
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    audio = reconstruct_tensors(audiolist)
    with torch.inference_mode():
        audio_hat = snacmodel.decode(audio)
    sf.write(
        f"{out_dir}/{step:02d}.wav",
        audio_hat.squeeze().cpu().numpy(),
        24000,
    )
    # Release the KV cache so the next task can size its own.
    model.clear_kv_cache()
    return text_tokenizer.decode(torch.tensor(tokenlist)).strip()
def A1_T1(fabric, audio_feature, input_ids, leng, model, text_tokenizer, step):
    """Audio in -> transcript out (ASR task). Returns the decoded, stripped text."""
    with fabric.init_tensor():
        model.set_kv_cache(batch_size=1)
    tokenlist = generate_ASR(
        model,
        audio_feature,
        input_ids,
        [leng],
        ["A1T1"],
        max_returned_tokens=2048,
        temperature=0.9,
        top_k=1,
        eos_id_a=_eoa,
        eos_id_t=_eot,
        pad_id_t=_pad_t,
        shift=padded_text_vocabsize,
        include_prompt=True,
        generate_text=True,
    )
    model.clear_kv_cache()
    return text_tokenizer.decode(torch.tensor(tokenlist)).strip()
def T1_A2(fabric, input_ids, model, text_tokenizer, step,
          snacmodel, out_dir=None):
    """Text in -> audio + text out (T1A2 task).

    Writes the decoded wav to out_dir (or a default directory) named after
    *step*, and returns the decoded text answer.
    """
    with fabric.init_tensor():
        model.set_kv_cache(batch_size=1)
    tokenlist = generate_TA(
        model,
        None,
        input_ids,
        None,
        ["T1A2"],
        max_returned_tokens=2048,
        temperature=0.9,
        top_k=1,
        eos_id_a=_eoa,
        eos_id_t=_eot,
        pad_id_t=_pad_t,
        shift=padded_text_vocabsize,
        include_prompt=True,
        generate_text=True,
    )
    # Reconstruct SNAC layers, keep text row (last), truncate at end-of-text.
    audiolist = reconscruct_snac(tokenlist)
    tokenlist = tokenlist[-1]
    if text_vocabsize in tokenlist:
        tokenlist = tokenlist[: tokenlist.index(text_vocabsize)]
    audio = reconstruct_tensors(audiolist)
    if out_dir is None:
        out_dir = "./output/default/T1-A2"
    else:
        out_dir = out_dir + "/T1-A2"
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    with torch.inference_mode():
        audio_hat = snacmodel.decode(audio)
    sf.write(
        f"{out_dir}/{step:02d}.wav",
        audio_hat.squeeze().cpu().numpy(),
        24000,
    )
    model.clear_kv_cache()
    return text_tokenizer.decode(torch.tensor(tokenlist)).strip()
def T1_T2(fabric, input_ids, model, text_tokenizer, step):
    """Text in -> text out (T1T2 task). Returns the decoded, stripped text."""
    with fabric.init_tensor():
        model.set_kv_cache(batch_size=1)
    tokenlist = generate_TT(
        model,
        None,
        input_ids,
        None,
        ["T1T2"],
        max_returned_tokens=2048,
        temperature=0.9,
        top_k=1,
        eos_id_a=_eoa,
        eos_id_t=_eot,
        pad_id_t=_pad_t,
        shift=padded_text_vocabsize,
        include_prompt=True,
        generate_text=True,
    )
    model.clear_kv_cache()
    return text_tokenizer.decode(torch.tensor(tokenlist)).strip()
def load_model(ckpt_dir, device):
    """Load every component needed for inference.

    Returns (fabric, model, text_tokenizer, snacmodel, whispermodel), with
    the GPT set up on *device* in eval mode.
    """
    # SNAC audio codec, loaded strictly from a local snapshot (no network).
    # snacmodel = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval().to(device)
    snacmodel = SNAC.from_pretrained(
        repo_id="hubertsiuzdak/snac_24khz",
        local_dir="hubertsiuzdak/snac_24khz",  # plain string; the original used a placeholder-free f-string
        local_files_only=True,
    ).eval().to(device)

    # Whisper audio encoder: prefer the checkpoint bundled in ckpt_dir,
    # otherwise fall back to whisper's stock "small" model.
    whisper_model_path = os.path.join(ckpt_dir, "small.pt")
    if not os.path.exists(whisper_model_path):
        whisper_model_path = "small"
    whispermodel = whisper.load_model(whisper_model_path).to(device)

    text_tokenizer = Tokenizer(ckpt_dir)
    fabric = L.Fabric(devices=1, strategy="auto")
    config = Config.from_file(os.path.join(ckpt_dir, "model_config.yaml"))
    config.post_adapter = False

    with fabric.init_module(empty_init=False):
        model = GPT(config)
    model = fabric.setup(model)
    state_dict = lazy_load(os.path.join(ckpt_dir, "lit_model.pth"))
    model.load_state_dict(state_dict, strict=True)
    model.to(device).eval()

    return fabric, model, text_tokenizer, snacmodel, whispermodel
def download_model(ckpt_dir):
    """Download the gpt-omni/mini-omni2 checkpoint snapshot into *ckpt_dir*."""
    snapshot_download("gpt-omni/mini-omni2", local_dir=ckpt_dir, revision="main")
def get_text_stream(list_output, index, text_tokenizer):
    """Decode the text tokens accumulated since *index*.

    Returns (decoded_text, new_index, is_text_end); is_text_end is True once
    the end-of-text id has been seen in the pending tokens.
    """
    pending = list_output[-1][index:]
    new_index = index + len(pending)
    ended = text_vocabsize in pending
    if ended:
        pending = pending[:pending.index(text_vocabsize)]
    if not pending:
        return "", new_index, ended
    return text_tokenizer.decode(torch.tensor(pending)), new_index, ended
class OmniInference:
    """Streaming inference wrapper: audio file in, (audio chunk, text chunk) stream out."""

    def __init__(self, ckpt_dir='./checkpoint', device='cuda:0'):
        self.device = device
        if not os.path.exists(ckpt_dir):
            print(f"checkpoint directory {ckpt_dir} not found, downloading from huggingface")
            download_model(ckpt_dir)
        self.fabric, self.model, self.text_tokenizer, self.snacmodel, self.whispermodel = load_model(ckpt_dir, device)

    def warm_up(self, sample='./data/samples/output1.wav'):
        """Run one full generation on a sample file to pay one-time startup costs."""
        for _ in self.run_AT_batch_stream(sample):
            pass

    @torch.inference_mode()
    def run_AT_batch_stream(self,
                            audio_path,
                            stream_stride=4,
                            max_returned_tokens=2048,
                            temperature=0.9,
                            top_k=1,
                            top_p=1.0,
                            eos_id_a=_eoa,
                            eos_id_t=_eot,
                            save_path=None
                            ):
        """Generate a speech+text answer for *audio_path*, yielding chunks.

        Yields (audio_bytes, text_str) every *stream_stride* decoded steps.
        If save_path is given, also writes the full decoded wav there.
        Returns the raw 8-layer token output list.
        """
        assert os.path.exists(audio_path), f"audio file {audio_path} not found"
        model = self.model

        # Batch of 2: row 0 produces audio (A1A2), row 1 produces text (A1T2).
        with self.fabric.init_tensor():
            model.set_kv_cache(batch_size=2,device=self.device)

        mel, leng = load_audio(audio_path)
        audio_feature, input_ids = get_input_ids_whisper_ATBatch(mel, leng, self.whispermodel, self.device)
        T = input_ids[0].size(1)
        device = input_ids[0].device

        assert max_returned_tokens > T, f"max_returned_tokens {max_returned_tokens} should be greater than audio length {T}"

        if model.max_seq_length < max_returned_tokens - 1:
            raise NotImplementedError(
                f"max_seq_length {model.max_seq_length} needs to be >= {max_returned_tokens - 1}"
            )

        # Prefill: run the whole prompt once and take the first sampled tokens.
        input_pos = torch.tensor([T], device=device)
        list_output = [[] for i in range(8)]
        tokens_A, token_T = next_token_image_batch(
            model,
            audio_feature.to(torch.float32).to(model.device),
            None,
            input_ids,
            [T - 3, T - 3],
            ["A1T2", "A1T2"],
            input_pos=torch.arange(0, T, device=device),
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
        )
        for i in range(7):
            list_output[i].append(tokens_A[i].tolist()[0])
        list_output[7].append(token_T.tolist()[0])

        # Re-encode the sampled tokens as next-step inputs: audio tokens are
        # shifted into their per-layer id range; 4097 is presumably the
        # pad-audio id (_pad_a) fed to the text row — confirm against layershift.
        model_input_ids = [[] for i in range(8)]
        for i in range(7):
            tokens_A[i] = tokens_A[i].clone() + padded_text_vocabsize + i * padded_audio_vocabsize
            model_input_ids[i].append(tokens_A[i].clone().to(device).to(torch.int32))
            model_input_ids[i].append(torch.tensor([layershift(4097, i)], device=device))
            model_input_ids[i] = torch.stack(model_input_ids[i])

        model_input_ids[-1].append(token_T.clone().to(torch.int32))
        model_input_ids[-1].append(token_T.clone().to(torch.int32))
        model_input_ids[-1] = torch.stack(model_input_ids[-1])

        text_end = False
        index = 1
        nums_generate = stream_stride
        begin_generate = False
        current_index = 0

        text_index = 0
        is_text_end = False

        # Autoregressive decode loop, one token per layer per step.
        for _ in tqdm(range(2, max_returned_tokens - T + 1)):
            tokens_A, token_T = next_token_image_batch(
                model,
                None,
                None,
                model_input_ids,
                None,
                None,
                input_pos=input_pos,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p,
            )

            # Once text finished, keep feeding text pads while audio continues.
            if text_end:
                token_T = torch.tensor([_pad_t], device=device)

            if tokens_A[-1] == eos_id_a:
                break

            if token_T == eos_id_t:
                text_end = True

            for i in range(7):
                list_output[i].append(tokens_A[i].tolist()[0])
            list_output[7].append(token_T.tolist()[0])

            # Prepare next-step inputs (same re-encoding as after prefill).
            model_input_ids = [[] for i in range(8)]
            for i in range(7):
                tokens_A[i] = tokens_A[i].clone() +padded_text_vocabsize + i * padded_audio_vocabsize
                model_input_ids[i].append(tokens_A[i].clone().to(device).to(torch.int32))
                model_input_ids[i].append(
                    torch.tensor([layershift(4097, i)], device=device)
                )
                model_input_ids[i] = torch.stack(model_input_ids[i])

            model_input_ids[-1].append(token_T.clone().to(torch.int32))
            model_input_ids[-1].append(token_T.clone().to(torch.int32))
            model_input_ids[-1] = torch.stack(model_input_ids[-1])

            # SNAC's delayed-parallel layout needs 7 steps before the first
            # complete audio frame exists; then emit every nums_generate steps.
            if index == 7:
                begin_generate = True

            if begin_generate:
                current_index += 1
                if current_index == nums_generate:
                    current_index = 0
                    snac = get_snac(list_output, index, nums_generate)
                    audio_stream = generate_audio_data(snac, self.snacmodel, self.device)
                    if is_text_end:
                        text_stream = ""
                    else:
                        text_stream, text_index, is_text_end = get_text_stream(list_output, text_index, self.text_tokenizer)

                    yield (audio_stream, text_stream)

            input_pos = input_pos.add_(1)
            index += 1
        text = self.text_tokenizer.decode(torch.tensor(list_output[-1]))
        print(f"text output: {text}")

        if save_path is not None:
            audiolist = reconscruct_snac(list_output)
            audio = reconstruct_tensors(audiolist)
            with torch.inference_mode():
                audio_hat = self.snacmodel.decode(audio)
            sf.write(save_path, audio_hat.squeeze().cpu().numpy(), 24000)

        model.clear_kv_cache()
        return list_output
def test_infer():
    """Offline smoke test: run every supported task mode over the bundled samples.

    Loads the checkpoint (downloading it if absent), then exercises A1A2, ASR,
    T1A2, T1T2, A1T2 and the batched A1A2 path, printing inputs and outputs.
    """
    device = "cuda:0"
    out_dir = f"./output/{get_time_str()}"
    ckpt_dir = "./checkpoint"  # plain string; the original used a placeholder-free f-string
    if not os.path.exists(ckpt_dir):
        print(f"checkpoint directory {ckpt_dir} not found, downloading from huggingface")
        download_model(ckpt_dir)
    fabric, model, text_tokenizer, snacmodel, whispermodel = load_model(ckpt_dir, device)

    task = ['A1A2', 'asr', "T1A2", "AA-BATCH", 'T1T2', 'AT']

    # prepare test data
    # TODO
    test_audio_list = sorted(glob.glob('./data/samples/output*.wav'))
    test_audio_transcripts = [
        "What is your name?",
        "what are your hobbies?",
        "Do you like beijing",
        "How are you feeling today?",
        "what is the weather like today?",
    ]
    test_text_list = [
        "What is your name?",
        "How are you feeling today?",
        "Can you describe your surroundings?",
        "What did you do yesterday?",
        "What is your favorite book and why?",
        "How do you make a cup of tea?",
        "What is the weather like today?",
        "Can you explain the concept of time?",
        "Can you tell me a joke?",
    ]

    # LOAD MODEL
    with torch.no_grad():
        if "A1A2" in task:
            print("===============================================================")
            print("                       testing A1A2")
            print("===============================================================")
            step = 0
            for path in test_audio_list:
                try:
                    mel, leng = load_audio(path)
                    audio_feature, input_ids = get_input_ids_whisper(mel, leng, whispermodel, device)
                    text = A1_A2(
                        fabric,
                        audio_feature,
                        input_ids,
                        leng,
                        model,
                        text_tokenizer,
                        step,
                        snacmodel,
                        out_dir=out_dir,
                    )
                    print(f"input: {test_audio_transcripts[step]}")
                    print(f"output: {text}")
                    step += 1
                    print(
                        "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
                    )
                # Keep the sweep going on per-sample failures, but don't use a
                # bare except (which would also swallow KeyboardInterrupt) and
                # don't hide the cause.
                except Exception as exc:
                    print(f"[error] failed to process {path}")
                    print(exc)
            print("===============================================================")

        if 'asr' in task:
            print("===============================================================")
            print("                       testing asr")
            print("===============================================================")
            index = 0
            step = 0
            for path in test_audio_list:
                mel, leng = load_audio(path)
                audio_feature, input_ids = get_input_ids_whisper(mel, leng, whispermodel, device, special_token_a=_pad_a, special_token_t=_asr)
                output = A1_T1(fabric, audio_feature, input_ids ,leng, model, text_tokenizer, index).lower().replace(',','').replace('.','').replace('?','')
                print(f"audio_path: {path}")
                print(f"audio transcript: {test_audio_transcripts[index]}")
                print(f"asr output: {output}")
                print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
                index += 1

        if "T1A2" in task:
            step = 0
            print("\n")
            print("===============================================================")
            print("                       testing T1A2")
            print("===============================================================")
            for text in test_text_list:
                input_ids = get_input_ids_TA(text, text_tokenizer)
                text_output = T1_A2(fabric, input_ids, model, text_tokenizer, step,
                                    snacmodel, out_dir=out_dir)
                print(f"input: {text}")
                print(f"output: {text_output}")
                print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
                step += 1
            print("===============================================================")

        if "T1T2" in task:
            step = 0
            print("\n")
            print("===============================================================")
            print("                       testing T1T2")
            print("===============================================================")
            for text in test_text_list:
                input_ids = get_input_ids_TT(text, text_tokenizer)
                text_output = T1_T2(fabric, input_ids, model, text_tokenizer, step)
                print(f" Input: {text}")
                print(f"Output: {text_output}")
                print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
            print("===============================================================")

        if "AT" in task:
            print("===============================================================")
            print("                       testing A1T2")
            print("===============================================================")
            step = 0
            for path in test_audio_list:
                mel, leng = load_audio(path)
                audio_feature, input_ids = get_input_ids_whisper(
                    mel, leng, whispermodel, device,
                    special_token_a=_pad_a, special_token_t=_answer_t
                )
                text = A1_T2(
                    fabric, audio_feature, input_ids, leng, model, text_tokenizer, step
                )
                print(f"input: {test_audio_transcripts[step]}")
                print(f"output: {text}")
                step += 1
                print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
            print("===============================================================")

        if "AA-BATCH" in task:
            print("===============================================================")
            print("                       testing A1A2-BATCH")
            print("===============================================================")
            step = 0
            for path in test_audio_list:
                mel, leng = load_audio(path)
                audio_feature, input_ids = get_input_ids_whisper_ATBatch(mel, leng, whispermodel, device)
                text = A1_A2_batch(
                    fabric, audio_feature, input_ids, leng, model, text_tokenizer, step,
                    snacmodel, out_dir=out_dir
                )
                print(f"input: {test_audio_transcripts[step]}")
                print(f"output: {text}")
                step += 1
                print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
            print("===============================================================")

        print("*********************** test end *****************************")
if __name__ == "__main__":
    # Script entry point: run the full offline smoke test over all task modes.
    test_infer()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment