Commit 5e8fb565 authored by chenzk's avatar chenzk
Browse files

v1.0

parents
Pipeline #1799 canceled with stages
import os
import pkg_resources
from setuptools import setup, find_packages


def _read_requirements():
    """Parse requirements.txt (next to this setup.py) into pip requirement strings."""
    req_path = os.path.join(os.path.dirname(__file__), "requirements.txt")
    # Use a context manager so the file handle is closed deterministically;
    # the original passed a bare open() and never closed it.
    with open(req_path) as fh:
        return [str(r) for r in pkg_resources.parse_requirements(fh)]


setup(
    name="clip",
    py_modules=["clip"],
    version="1.0",
    description="",
    author="OpenAI",
    packages=find_packages(exclude=["tests*"]),
    install_requires=_read_requirements(),
    include_package_data=True,
    extras_require={'dev': ['pytest']},
)
import numpy as np
import pytest
import torch
from PIL import Image
import clip
@pytest.mark.parametrize('model_name', clip.available_models())
def test_consistency(model_name):
    """The TorchScript and eager builds of each CLIP model must agree on CLIP.png."""
    device = "cpu"
    scripted, preprocess = clip.load(model_name, device=device, jit=True)
    eager, _ = clip.load(model_name, device=device, jit=False)

    image = preprocess(Image.open("CLIP.png")).unsqueeze(0).to(device)
    text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device)

    with torch.no_grad():
        probs = []
        for net in (scripted, eager):
            logits, _ = net(image, text)
            probs.append(logits.softmax(dim=-1).cpu().numpy())

    # Loose tolerances: JIT and eager paths differ in op fusion / numerics.
    assert np.allclose(probs[0], probs[1], atol=0.01, rtol=0.1)
MIT License
Copyright (c) 2024 gpt-omni
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# Mini-Omni2
Mini-Omni2是功能上最接近GPT4o的多模态模型之一,Mini-Omni2是个视觉-音频助理,实时语音对话,能同时处理视觉、听觉和文本三种模态,针对用户视频和语音查询,实时提供端到端的语音响应。
## 论文
`Mini-Omni2: Towards Open-source GPT-4o Model with Vision, Speech and Duplex`
- https://arxiv.org/pdf/2410.11190
## 模型结构
Omni2的语言模型采用Qwen2-0.5B,adapter采用llama中常用的MLP,Encoder采用各种预训练效果较好模态的编码器,以便用很简单的方法就能训练出多模态模型。
<div align=center>
<img src="./doc/omni2.png"/>
</div>
## 算法原理
Omni2的原理是以文本到文本的大语言模型能力为基础,吸收各种已经预训练好的不同模态的编码器的编码能力,添加一层MLP来适应多模态任务,通过一定的微调策略,在缺乏多模态配对训练数据的情况下也能取得一定的多模态效果,下图为其三阶段微调策略。
<div align=center>
<img src="./doc/train.png"/>
</div>
## 环境配置
```
mv mini-omni2_pytorch mini-omni2 # 去框架名后缀
```
### Docker(方法一)
```
docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.3.0-ubuntu22.04-dtk24.04.2-py3.10
# <your IMAGE ID>为以上拉取的docker的镜像ID替换,本镜像为:83714c19d308
docker run -it --shm-size=64G -v $PWD/mini-omni2:/home/mini-omni2 -v /opt/hyhal:/opt/hyhal:ro --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name omni2 <your IMAGE ID> bash
cd /home/mini-omni2
pip install -r requirements.txt # requirements.txt
# 安装ffmpeg
apt update
apt-get install ffmpeg
# 安装CLIP
cd CLIP
pip install . #clip==1.0
```
### Dockerfile(方法二)
```
cd mini-omni2/docker
docker build --no-cache -t omni2:latest .
docker run --shm-size=64G --name omni2 -v /opt/hyhal:/opt/hyhal:ro --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video -v $PWD/../../mini-omni2:/home/mini-omni2 -it omni2 bash
# 若遇到Dockerfile启动的方式安装环境需要长时间等待,可注释掉里面的pip安装,启动容器后再安装python库:pip install -r requirements.txt。
cd /home/mini-omni2
# 安装ffmpeg
apt update
apt-get install ffmpeg
# 安装CLIP (clip==1.0)
cd CLIP
pip install . #clip==1.0
```
### Anaconda(方法三)
1、关于本项目DCU显卡所需的特殊深度学习库可从光合开发者社区下载安装:
- https://developer.hpccube.com/tool/
```
DTK驱动:dtk24.04.2
python:python3.10
torch:2.3.0
torchvision:0.18.1
torchaudio:2.1.2
triton:2.1.0
flash-attn:2.0.4
deepspeed:0.14.2
apex:1.3.0
xformers:0.0.25
```
`Tips:以上dtk驱动、python、torch等DCU相关工具版本需要严格一一对应。`
2、其它非特殊库参照requirements.txt安装
```
cd mini-omni2
pip install -r requirements.txt # requirements.txt
# 安装ffmpeg
apt update
apt-get install ffmpeg
# 安装CLIP
cd CLIP
pip install . #clip==1.0
```
## 数据集
## 训练
## 推理
1、下载预训练权重`gpt-omni/mini-omni2`,将mini-omni2下所有文件放入checkpoint文件夹下;
2、下载预训练权重`hubertsiuzdak/snac_24khz`,将文件夹hubertsiuzdak放在根目录mini-omni2下;
```
cd mini-omni2
python inference_vision.py
```
更多资料可参考源项目的[`README_origin`](./README_origin.md)
## result
`输入: `
```
# 音色
input_audio_path = './data/samples/vision_qa_audio.wav'
# 图片
input_image_path = './data/samples/vision_qa_image.jpg'
```
`输出:`
```
# 文本
text output: The person in the image appears to be a middle-aged man with a fair complexion. He has short, neatly combed gray hair and a receding hairline. His facial features include a prominent nose, thin lips, and a gentle smile. He is wearing a dark suit with a notched lapel, a white shirt, and a dark tie with diagonal stripes. The background is a neutral, dark color that provides a contrast to his light-colored suit. The overall impression is one of professionalism and formality.
# 音频
'vision_qa_output.wav'
```
### 精度
DCU与GPU精度一致,推理框架:pytorch。
## 应用场景
### 算法类别
`对话问答`
### 热点应用行业
`制造,广媒,金融,能源,医疗,家居,教育`
## 预训练权重
预训练权重快速下载中心:[SCNet AIModels](http://113.200.138.88:18080/aimodels) ,项目中的预训练权重可从快速下载通道下载:[gpt-omni/mini-omni2](http://113.200.138.88:18080/project-dependency/mini-omni2.git)[hubertsiuzdak/snac_24khz](http://113.200.138.88:18080/aimodels/hubertsiuzdak/snac_24khz.git)
Hugging Face下载地址为:[gpt-omni/mini-omni2](https://huggingface.co/gpt-omni/mini-omni2)、[hubertsiuzdak/snac_24khz](https://huggingface.co/hubertsiuzdak/snac_24khz)
## 源码仓库及问题反馈
- http://developer.sourcefind.cn/codes/modelzoo/mini-omni2_pytorch.git
## 参考资料
- https://github.com/gpt-omni/mini-omni2.git
- https://github.com/QwenLM/Qwen2.5.git
# Mini-Omni2
<p align="center">
<img src="./data/figures/title_new.png" width="90%"/>
</p>
<p align="center">
🤗 <a href="https://huggingface.co/gpt-omni/mini-omni2">Hugging Face</a> | 📖 <a href="https://github.com/gpt-omni/mini-omni2">Github</a>
| 📑 <a href="https://arxiv.org/abs/2410.11190">Technical report</a>
</p>
<h5 align="center"> If you like little Omni2, please give us a star⭐ and cite our <a href="https://arxiv.org/abs/2410.11190">paper</a>!</h5>
## Introduction
Mini-Omni2 is an **omni-interactive** model. It can **understand image, audio and text inputs and has end-to-end voice conversations with users**. Featuring **real-time voice output**, **omni-capable multimodal understanding** and flexible interaction **ability with interruption mechanism while speaking**.
<p align="center">
<img src="./data/figures/framework.jpeg" width="100%"/>
</p>
## Updates
- **2024.10:** Release the model, technical report, inference and chat demo code.
## Features
**Multimodal interaction**: with the ability to understand images, speech and text, just like GPT-4o.
**Real-time speech-to-speech** conversational capabilities. No extra ASR or TTS models required, just like [Mini-Omni](https://github.com/gpt-omni/mini-omni).
<!-- ✅ **Streaming audio output**: with first-chunk latency of audio stream less than 0.3s. -->
<!-- ✅ **Duplex interaction**: hearing while speaking, it can be interrupted by key words like "stop omni". -->
## Demo
NOTE: need to unmute first.
https://github.com/user-attachments/assets/ad97ca7f-f8b4-40c3-a7e8-fa54b4edf155
## ToDo
- [ ] update interruption mechanism
- [ ] visual-assistant model and data
## Install
Create a new conda environment and install the required packages:
```sh
conda create -n omni python=3.10
conda activate omni
git clone https://github.com/gpt-omni/mini-omni2.git
cd mini-omni2
pip install -r requirements.txt
```
## Quick start
**Interactive demo**
- start server
NOTE: you need to start the server before running the streamlit or gradio demo with API_URL set to the server address.
```sh
sudo apt-get install ffmpeg
conda activate omni
cd mini-omni2
python3 server.py --ip '0.0.0.0' --port 60808
```
- run streamlit demo
NOTE: you need to run streamlit **locally** with PyAudio installed.
```sh
pip install PyAudio==0.2.14
API_URL=http://0.0.0.0:60808/chat streamlit run webui/omni_streamlit.py
```
**Local test**
```sh
conda activate omni
cd mini-omni2
# test run the preset audio samples and questions
python inference_vision.py
```
## Mini-Omni2 Overview
**1. Multimodal Modeling**:
We use multiple sequences as the input and output of the model. In the input part, we will concatenate image, audio and text features to perform a series of comprehensive tasks, as shown in the following figures. In the output part, we use text-guided delayed parallel output to generate real-time speech responses.
<p align="center">
<img src="./data/figures/inputids.png" width="100%"/>
</p>
**2. Multi-stage Training**:
We propose an efficient alignment training method and conduct encoder adaptation, modal alignment, and multimodal fine-tuning respectively in the three-stage training.
<p align="center">
<img src="./data/figures/training.jpeg" width="100%"/>
</p>
<!-- **3. Cases**:
Here are more cases of Mini-Omni2:
<p align="center">
<img src="./data/figures/samples.png" width="100%"/>
</p> -->
## FAQ
**1. Does the model support other languages?**
No, the model is only trained on English. However, as we use whisper as the audio encoder, the model can understand other languages which is supported by whisper (like chinese), but the output is only in English.
**2. Error: can not run streamlit in local browser, with remote streamlit server**
You need start streamlit **locally** with PyAudio installed.
## Acknowledgements
- [Qwen2](https://github.com/QwenLM/Qwen2/) as the LLM backbone.
- [litGPT](https://github.com/Lightning-AI/litgpt/) for training and inference.
- [whisper](https://github.com/openai/whisper/) for audio encoding.
- [clip](https://github.com/openai/CLIP) for image encoding.
- [snac](https://github.com/hubertsiuzdak/snac/) for audio decoding.
- [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) for generating synthetic speech.
- [OpenOrca](https://huggingface.co/datasets/Open-Orca/OpenOrca) and [MOSS](https://github.com/OpenMOSS/MOSS/tree/main) for alignment.
## Citation
```bibtex
@article{xie2024miniomni2opensourcegpt4ovision,
title={Mini-Omni2: Towards Open-source GPT-4o with Vision, Speech and Duplex Capabilities},
author={Zhifei Xie and Changqiao Wu},
year={2024},
eprint={2410.11190},
archivePrefix={arXiv},
primaryClass={eess.AS},
journal={ArXiv},
volume={abs/2410.11190},
}
```
## Star History
[![Star History Chart](https://api.star-history.com/svg?repos=gpt-omni/mini-omni2&type=Date)](https://star-history.com/#gpt-omni/mini-omni2&Date)
import sys
import os
# Make modules that live next to this file importable even when the script
# is launched from a different working directory.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# DCU (Hygon) PyTorch base image with DTK 24.04.2 and Python 3.10.
FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.3.0-ubuntu22.04-dtk24.04.2-py3.10

# Keep apt from blocking on interactive prompts during image builds.
ENV DEBIAN_FRONTEND=noninteractive

# RUN yum update && yum install -y git cmake wget build-essential
# RUN source /opt/dtk-24.04.2/env.sh

# Install pip dependencies (Aliyun mirror for faster downloads inside CN networks).
COPY requirements.txt requirements.txt
RUN pip3 install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
#torch==2.3.1
#torchvision==0.18.1
#torchaudio==2.3.1
litgpt==0.4.3
snac==1.2.0
soundfile==0.12.1
openai-whisper
streamlit==1.37.1
# PyAudio==0.2.14
pydub==0.25.1
onnxruntime==1.19.0
# numpy==1.26.3
gradio==4.42.0
librosa==0.10.2.post1
#flask==3.0.3
fire
transformers==4.45.2
tokenizers==0.20.1
#git+https://github.com/mini-omni/CLIP.git
docker run -it --shm-size=64G -v $PWD/mini-omni2:/home/mini-omni2 -v /public/DL_DATA/AI:/home/AI -v /opt/hyhal:/opt/hyhal:ro --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name omni2 83714c19d308 bash
# python -m torch.utils.collect_env
---
license: mit
tags:
- audio
---
# SNAC 🍿
Multi-**S**cale **N**eural **A**udio **C**odec (SNAC) compresses audio into discrete codes at a low bitrate.
👉 This model was primarily trained on speech data, and its recommended use case is speech synthesis. See below for other pretrained models.
🔗 GitHub repository: https://github.com/hubertsiuzdak/snac/
## Overview
SNAC encodes audio into hierarchical tokens similarly to SoundStream, EnCodec, and DAC. However, SNAC introduces a simple change where coarse tokens are sampled less frequently,
covering a broader time span.
This model compresses 24 kHz audio into discrete codes at a 0.98 kbps bitrate. It uses 3 RVQ levels with token rates of 12, 23, and
47 Hz.
## Pretrained models
Currently, all models support only single audio channel (mono).
| Model | Bitrate | Sample Rate | Params | Recommended use case |
|-----------------------------------------------------------------------------|-----------|-------------|--------|--------------------------|
| hubertsiuzdak/snac_24khz (this model) | 0.98 kbps | 24 kHz | 19.8 M | 🗣️ Speech |
| [hubertsiuzdak/snac_32khz](https://huggingface.co/hubertsiuzdak/snac_32khz) | 1.9 kbps | 32 kHz | 54.5 M | 🎸 Music / Sound Effects |
| [hubertsiuzdak/snac_44khz](https://huggingface.co/hubertsiuzdak/snac_44khz) | 2.6 kbps | 44 kHz | 54.5 M | 🎸 Music / Sound Effects |
## Usage
Install it using:
```bash
pip install snac
```
To encode (and decode) audio with SNAC in Python, use the following code:
```python
import torch
from snac import SNAC
model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval().cuda()
audio = torch.randn(1, 1, 24000).cuda() # B, 1, T
with torch.inference_mode():
codes = model.encode(audio)
audio_hat = model.decode(codes)
```
You can also encode and reconstruct in a single call:
```python
with torch.inference_mode():
audio_hat, codes = model(audio)
```
⚠️ Note that `codes` is a list of token sequences of variable lengths, each corresponding to a different temporal
resolution.
```
>>> [code.shape[1] for code in codes]
[12, 24, 48]
```
## Acknowledgements
Module definitions are adapted from the [Descript Audio Codec](https://github.com/descriptinc/descript-audio-codec).
\ No newline at end of file
icon.png

53.8 KB

import os
import lightning as L
import torch
import glob
import time
from snac import SNAC
from litgpt import Tokenizer
from litgpt.utils import (
num_parameters,
)
from litgpt.generate.base import (
generate_AA,
generate_ASR,
generate_TA,
generate_TT,
generate_AT,
generate_TA_BATCH,
next_token_image_batch
)
import soundfile as sf
from litgpt.model import GPT, Config
from lightning.fabric.utilities.load import _lazy_load as lazy_load
from utils.snac_utils import layershift, reconscruct_snac, reconstruct_tensors, get_time_str
from utils.snac_utils import get_snac, generate_audio_data
import whisper
from tqdm import tqdm
from huggingface_hub import snapshot_download
torch.set_printoptions(sci_mode=False)
# Vocabulary layout shared by all task helpers below.
# Text tokens occupy [0, text_vocabsize); the special text tokens follow.
text_vocabsize = 151936
text_specialtokens = 64
audio_vocabsize = 4096
audio_specialtokens = 64

# Padded sizes used to offset audio codebook layers in the combined id space.
padded_text_vocabsize = text_vocabsize + text_specialtokens
padded_audio_vocabsize = audio_vocabsize + audio_specialtokens

# Special text-token ids (appended right after the text vocabulary).
_eot = text_vocabsize
_pad_t = text_vocabsize + 1
_input_t = text_vocabsize + 2
_answer_t = text_vocabsize + 3
_asr = text_vocabsize + 4

# Special audio-token ids (appended right after each audio codebook).
_eoa = audio_vocabsize
_pad_a = audio_vocabsize + 1
_input_a = audio_vocabsize + 2
_answer_a = audio_vocabsize + 3
_split = audio_vocabsize + 4
_image = audio_vocabsize + 5
_eoimage = audio_vocabsize + 6
def get_input_ids_TA(text, text_tokenizer):
    """Build the 8-layer input-id stack for the text-to-audio (TA) task.

    Layers 0-6 are audio-codebook rows filled with layer-shifted pads and
    terminated by the layer-shifted answer-audio token; layer 7 carries the
    tokenized text wrapped in input/eot/answer markers. Each row is a
    (1, seq_len) tensor.
    """
    tokens = text_tokenizer.encode(text)
    pad_len = len(tokens) + 2
    layers = []
    for layer in range(7):
        row = [layershift(_pad_a, layer)] * pad_len + [layershift(_answer_a, layer)]
        layers.append(torch.tensor(row).unsqueeze(0))
    text_row = [_input_t] + tokens.tolist() + [_eot] + [_answer_t]
    layers.append(torch.tensor(text_row).unsqueeze(0))
    return layers
def get_input_ids_TT(text, text_tokenizer):
    """Build the 8-layer input-id stack for the text-to-text (TT) task.

    Audio layers 0-6 contain only layer-shifted pads (text length + 3);
    layer 7 carries the tokenized text wrapped in input/eot/answer markers.
    """
    text_tokens = text_tokenizer.encode(text).tolist()
    pad_len = len(text_tokens) + 3
    stack = [
        torch.tensor([layershift(_pad_a, layer)] * pad_len).unsqueeze(0)
        for layer in range(7)
    ]
    stack.append(
        torch.tensor([_input_t] + text_tokens + [_eot] + [_answer_t]).unsqueeze(0)
    )
    return stack
def get_input_ids_whisper(
    mel, leng, whispermodel, device,
    special_token_a=_answer_a, special_token_t=_answer_t,
):
    """Embed a mel spectrogram with whisper and build the 8-layer prompt ids.

    Returns (audio_feature with a leading batch dim, list of 8 (1, T+3) id rows):
    layers 0-6 are input_a / pads / eoa / special audio token (all layer-shifted),
    layer 7 is input_t / text pads / eot / special text token.
    """
    with torch.no_grad():
        audio_feature = whispermodel.embed_audio(mel.unsqueeze(0).to(device))[0][:leng]

    T = audio_feature.size(0)
    input_ids = []
    for layer in range(7):
        row = (
            [layershift(_input_a, layer)]
            + [layershift(_pad_a, layer)] * T
            + [layershift(_eoa, layer), layershift(special_token_a, layer)]
        )
        input_ids.append(torch.tensor(row).unsqueeze(0))
    text_row = [_input_t] + [_pad_t] * T + [_eot, special_token_t]
    input_ids.append(torch.tensor(text_row).unsqueeze(0))
    return audio_feature.unsqueeze(0), input_ids
def get_input_ids_whisper_ATBatch(mel, leng, whispermodel, device):
    """Build a batch-of-2 prompt for the combined A1A2 + A1T2 run.

    Row 0 (AA) asks for an audio answer, row 1 (AT) pads the audio answer slot
    so only text is produced. Returns (stacked audio features duplicated for
    both rows, list of 8 (2, T+3) id tensors).
    """
    with torch.no_grad():
        mel = mel.unsqueeze(0).to(device)
        # audio_feature = whisper.decode(whispermodel,mel, options).audio_features
        audio_feature = whispermodel.embed_audio(mel)[0][:leng]
    T = audio_feature.size(0)

    # First batch row: audio-answer variant (ends in layer-shifted _answer_a).
    input_ids_AA = []
    for i in range(7):
        input_ids_item = []
        input_ids_item.append(layershift(_input_a, i))
        input_ids_item += [layershift(_pad_a, i)] * T
        input_ids_item += [(layershift(_eoa, i)), layershift(_answer_a, i)]
        input_ids_AA.append(torch.tensor(input_ids_item))
    input_id_T = torch.tensor([_input_t] + [_pad_t] * T + [_eot, _answer_t])
    input_ids_AA.append(input_id_T)

    # Second batch row: text-only variant (audio answer slot padded with _pad_a).
    input_ids_AT = []
    for i in range(7):
        input_ids_item = []
        input_ids_item.append(layershift(_input_a, i))
        input_ids_item += [layershift(_pad_a, i)] * T
        input_ids_item += [(layershift(_eoa, i)), layershift(_pad_a, i)]
        input_ids_AT.append(torch.tensor(input_ids_item))
    input_id_T = torch.tensor([_input_t] + [_pad_t] * T + [_eot, _answer_t])
    input_ids_AT.append(input_id_T)

    # Stack per layer: stacked_inputids[j] becomes a (2, T+3) tensor whose
    # rows are (AA, AT) for layer j.
    input_ids = [input_ids_AA, input_ids_AT]
    stacked_inputids = [[] for _ in range(8)]
    for i in range(2):
        for j in range(8):
            stacked_inputids[j].append(input_ids[i][j])
    stacked_inputids = [torch.stack(tensors) for tensors in stacked_inputids]
    # Audio features are identical for both batch rows.
    return torch.stack([audio_feature, audio_feature]), stacked_inputids
def load_audio(path):
    """Load an audio file and return (log-mel spectrogram, frame count).

    The frame count is int(duration_ms / 20) + 1 — presumably one whisper
    encoder frame per 20 ms of audio plus one; confirm against the encoder's
    downsampling factor.
    """
    waveform = whisper.load_audio(path)
    duration_ms = (len(waveform) / 16000) * 1000  # whisper loads at 16 kHz
    mel = whisper.log_mel_spectrogram(whisper.pad_or_trim(waveform))
    return mel, int(duration_ms / 20) + 1
def A1_A2_batch(fabric, audio_feature, input_ids, leng, model, text_tokenizer, step,
                snacmodel, out_dir=None):
    """Audio in -> audio + text out, batched A1A2/A1T2 generation.

    Decodes the SNAC audio tokens to a wav file under out_dir (or a default
    directory) named after *step*, and returns the decoded text answer.
    """
    with fabric.init_tensor():
        model.set_kv_cache(batch_size=2)
    tokenlist = generate_TA_BATCH(
        model,
        audio_feature,
        input_ids,
        [leng, leng],
        ["A1A2", "A1T2"],
        max_returned_tokens=2048,
        temperature=0.9,
        top_k=1,
        eos_id_a=_eoa,
        eos_id_t=_eot,
        pad_id_t=_pad_t,
        shift=padded_text_vocabsize,
        include_prompt=True,
        generate_text=True,
    )
    # Last row is text; truncate at the end-of-text id if present.
    text_tokenlist = tokenlist[-1]
    if text_vocabsize in text_tokenlist:
        text_tokenlist = text_tokenlist[: text_tokenlist.index(text_vocabsize)]
    text = text_tokenizer.decode(torch.tensor(text_tokenlist)).strip()

    # Remaining rows are the 7 audio codebook layers.
    audio_tokenlist = tokenlist[:-1]
    audiolist = reconscruct_snac(audio_tokenlist)
    audio = reconstruct_tensors(audiolist)
    if out_dir is None:
        out_dir = "./output/default/A1-A2-batch"
    else:
        out_dir = out_dir + "/A1-A2-batch"
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    with torch.inference_mode():
        audio_hat = snacmodel.decode(audio)
    sf.write(
        f"{out_dir}/{step:02d}.wav",
        audio_hat.squeeze().cpu().numpy(),
        24000,
    )
    # Release the KV cache so the next task can size its own.
    model.clear_kv_cache()
    return text
def A1_T2(fabric, audio_feature, input_ids, leng, model, text_tokenizer, step):
    """Audio in -> text out (A1T2 task). Returns the decoded, stripped text."""
    with fabric.init_tensor():
        model.set_kv_cache(batch_size=1)
    tokenlist = generate_AT(
        model,
        audio_feature,
        input_ids,
        [leng],
        ["AT"],
        max_returned_tokens=2048,
        temperature=0.9,
        top_k=1,
        eos_id_a=_eoa,
        eos_id_t=_eot,
        pad_id_t=_pad_t,
        shift=padded_text_vocabsize,
        include_prompt=True,
        generate_text=True,
    )
    # Release the KV cache before returning, mirroring A1_T1/T1_T2/A1_A2;
    # the original omitted this and leaked the cache allocated above.
    model.clear_kv_cache()
    return text_tokenizer.decode(torch.tensor(tokenlist)).strip()
def A1_A2(fabric, audio_feature, input_ids, leng, model, text_tokenizer, step,
          snacmodel, out_dir=None):
    """Audio in -> audio + text out (single-sample A1A2 task).

    Writes the decoded wav to out_dir (or a default directory) named after
    *step*, and returns the decoded text answer.
    """
    with fabric.init_tensor():
        model.set_kv_cache(batch_size=1)
    tokenlist = generate_AA(
        model,
        audio_feature,
        input_ids,
        [leng],
        ["A1T2"],
        max_returned_tokens=2048,
        temperature=0.9,
        top_k=1,
        eos_id_a=_eoa,
        eos_id_t=_eot,
        pad_id_t=_pad_t,
        shift=padded_text_vocabsize,
        include_prompt=True,
        generate_text=True,
    )
    # Reconstruct SNAC layers from all rows, then keep the text row (last)
    # and truncate it at the end-of-text id if present.
    audiolist = reconscruct_snac(tokenlist)
    tokenlist = tokenlist[-1]
    if text_vocabsize in tokenlist:
        tokenlist = tokenlist[: tokenlist.index(text_vocabsize)]
    if out_dir is None:
        out_dir = "./output/default/A1-A2"
    else:
        out_dir = out_dir + "/A1-A2"
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    audio = reconstruct_tensors(audiolist)
    with torch.inference_mode():
        audio_hat = snacmodel.decode(audio)
    sf.write(
        f"{out_dir}/{step:02d}.wav",
        audio_hat.squeeze().cpu().numpy(),
        24000,
    )
    # Release the KV cache so the next task can size its own.
    model.clear_kv_cache()
    return text_tokenizer.decode(torch.tensor(tokenlist)).strip()
def A1_T1(fabric, audio_feature, input_ids, leng, model, text_tokenizer, step):
    """Audio in -> transcript out (ASR task). Returns the decoded, stripped text."""
    with fabric.init_tensor():
        model.set_kv_cache(batch_size=1)
    tokenlist = generate_ASR(
        model,
        audio_feature,
        input_ids,
        [leng],
        ["A1T1"],
        max_returned_tokens=2048,
        temperature=0.9,
        top_k=1,
        eos_id_a=_eoa,
        eos_id_t=_eot,
        pad_id_t=_pad_t,
        shift=padded_text_vocabsize,
        include_prompt=True,
        generate_text=True,
    )
    model.clear_kv_cache()
    return text_tokenizer.decode(torch.tensor(tokenlist)).strip()
def T1_A2(fabric, input_ids, model, text_tokenizer, step,
          snacmodel, out_dir=None):
    """Text in -> audio + text out (T1A2 task).

    Writes the decoded wav to out_dir (or a default directory) named after
    *step*, and returns the decoded text answer.
    """
    with fabric.init_tensor():
        model.set_kv_cache(batch_size=1)
    tokenlist = generate_TA(
        model,
        None,
        input_ids,
        None,
        ["T1A2"],
        max_returned_tokens=2048,
        temperature=0.9,
        top_k=1,
        eos_id_a=_eoa,
        eos_id_t=_eot,
        pad_id_t=_pad_t,
        shift=padded_text_vocabsize,
        include_prompt=True,
        generate_text=True,
    )
    # Reconstruct SNAC layers, keep text row (last), truncate at end-of-text.
    audiolist = reconscruct_snac(tokenlist)
    tokenlist = tokenlist[-1]
    if text_vocabsize in tokenlist:
        tokenlist = tokenlist[: tokenlist.index(text_vocabsize)]
    audio = reconstruct_tensors(audiolist)
    if out_dir is None:
        out_dir = "./output/default/T1-A2"
    else:
        out_dir = out_dir + "/T1-A2"
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    with torch.inference_mode():
        audio_hat = snacmodel.decode(audio)
    sf.write(
        f"{out_dir}/{step:02d}.wav",
        audio_hat.squeeze().cpu().numpy(),
        24000,
    )
    model.clear_kv_cache()
    return text_tokenizer.decode(torch.tensor(tokenlist)).strip()
def T1_T2(fabric, input_ids, model, text_tokenizer, step):
    """Text in -> text out (T1T2 task). Returns the decoded, stripped text."""
    with fabric.init_tensor():
        model.set_kv_cache(batch_size=1)
    tokenlist = generate_TT(
        model,
        None,
        input_ids,
        None,
        ["T1T2"],
        max_returned_tokens=2048,
        temperature=0.9,
        top_k=1,
        eos_id_a=_eoa,
        eos_id_t=_eot,
        pad_id_t=_pad_t,
        shift=padded_text_vocabsize,
        include_prompt=True,
        generate_text=True,
    )
    model.clear_kv_cache()
    return text_tokenizer.decode(torch.tensor(tokenlist)).strip()
def load_model(ckpt_dir, device):
    """Load every component needed for inference.

    Returns (fabric, model, text_tokenizer, snacmodel, whispermodel), with
    the GPT set up on *device* in eval mode.
    """
    # SNAC audio codec, loaded strictly from a local snapshot (no network).
    # snacmodel = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval().to(device)
    snacmodel = SNAC.from_pretrained(
        repo_id="hubertsiuzdak/snac_24khz",
        local_dir="hubertsiuzdak/snac_24khz",  # plain string; the original used a placeholder-free f-string
        local_files_only=True,
    ).eval().to(device)

    # Whisper audio encoder: prefer the checkpoint bundled in ckpt_dir,
    # otherwise fall back to whisper's stock "small" model.
    whisper_model_path = os.path.join(ckpt_dir, "small.pt")
    if not os.path.exists(whisper_model_path):
        whisper_model_path = "small"
    whispermodel = whisper.load_model(whisper_model_path).to(device)

    text_tokenizer = Tokenizer(ckpt_dir)
    fabric = L.Fabric(devices=1, strategy="auto")
    config = Config.from_file(os.path.join(ckpt_dir, "model_config.yaml"))
    config.post_adapter = False

    with fabric.init_module(empty_init=False):
        model = GPT(config)
    model = fabric.setup(model)
    state_dict = lazy_load(os.path.join(ckpt_dir, "lit_model.pth"))
    model.load_state_dict(state_dict, strict=True)
    model.to(device).eval()

    return fabric, model, text_tokenizer, snacmodel, whispermodel
def download_model(ckpt_dir):
    """Download the gpt-omni/mini-omni2 checkpoint snapshot into *ckpt_dir*."""
    snapshot_download("gpt-omni/mini-omni2", local_dir=ckpt_dir, revision="main")
def get_text_stream(list_output, index, text_tokenizer):
    """Decode the text tokens accumulated since *index*.

    Returns (decoded_text, new_index, is_text_end); is_text_end is True once
    the end-of-text id has been seen in the pending tokens.
    """
    pending = list_output[-1][index:]
    new_index = index + len(pending)
    ended = text_vocabsize in pending
    if ended:
        pending = pending[:pending.index(text_vocabsize)]
    if not pending:
        return "", new_index, ended
    return text_tokenizer.decode(torch.tensor(pending)), new_index, ended
class OmniInference:
    """Streaming inference wrapper: audio file in, (audio chunk, text chunk) stream out."""

    def __init__(self, ckpt_dir='./checkpoint', device='cuda:0'):
        self.device = device
        if not os.path.exists(ckpt_dir):
            print(f"checkpoint directory {ckpt_dir} not found, downloading from huggingface")
            download_model(ckpt_dir)
        self.fabric, self.model, self.text_tokenizer, self.snacmodel, self.whispermodel = load_model(ckpt_dir, device)

    def warm_up(self, sample='./data/samples/output1.wav'):
        """Run one full generation on a sample file to pay one-time startup costs."""
        for _ in self.run_AT_batch_stream(sample):
            pass

    @torch.inference_mode()
    def run_AT_batch_stream(self,
                            audio_path,
                            stream_stride=4,
                            max_returned_tokens=2048,
                            temperature=0.9,
                            top_k=1,
                            top_p=1.0,
                            eos_id_a=_eoa,
                            eos_id_t=_eot,
                            save_path=None
                            ):
        """Generate a speech+text answer for *audio_path*, yielding chunks.

        Yields (audio_bytes, text_str) every *stream_stride* decoded steps.
        If save_path is given, also writes the full decoded wav there.
        Returns the raw 8-layer token output list.
        """
        assert os.path.exists(audio_path), f"audio file {audio_path} not found"
        model = self.model

        # Batch of 2: row 0 produces audio (A1A2), row 1 produces text (A1T2).
        with self.fabric.init_tensor():
            model.set_kv_cache(batch_size=2,device=self.device)

        mel, leng = load_audio(audio_path)
        audio_feature, input_ids = get_input_ids_whisper_ATBatch(mel, leng, self.whispermodel, self.device)
        T = input_ids[0].size(1)
        device = input_ids[0].device

        assert max_returned_tokens > T, f"max_returned_tokens {max_returned_tokens} should be greater than audio length {T}"

        if model.max_seq_length < max_returned_tokens - 1:
            raise NotImplementedError(
                f"max_seq_length {model.max_seq_length} needs to be >= {max_returned_tokens - 1}"
            )

        # Prefill: run the whole prompt once and take the first sampled tokens.
        input_pos = torch.tensor([T], device=device)
        list_output = [[] for i in range(8)]
        tokens_A, token_T = next_token_image_batch(
            model,
            audio_feature.to(torch.float32).to(model.device),
            None,
            input_ids,
            [T - 3, T - 3],
            ["A1T2", "A1T2"],
            input_pos=torch.arange(0, T, device=device),
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
        )
        for i in range(7):
            list_output[i].append(tokens_A[i].tolist()[0])
        list_output[7].append(token_T.tolist()[0])

        # Re-encode the sampled tokens as next-step inputs: audio tokens are
        # shifted into their per-layer id range; 4097 is presumably the
        # pad-audio id (_pad_a) fed to the text row — confirm against layershift.
        model_input_ids = [[] for i in range(8)]
        for i in range(7):
            tokens_A[i] = tokens_A[i].clone() + padded_text_vocabsize + i * padded_audio_vocabsize
            model_input_ids[i].append(tokens_A[i].clone().to(device).to(torch.int32))
            model_input_ids[i].append(torch.tensor([layershift(4097, i)], device=device))
            model_input_ids[i] = torch.stack(model_input_ids[i])

        model_input_ids[-1].append(token_T.clone().to(torch.int32))
        model_input_ids[-1].append(token_T.clone().to(torch.int32))
        model_input_ids[-1] = torch.stack(model_input_ids[-1])

        text_end = False
        index = 1
        nums_generate = stream_stride
        begin_generate = False
        current_index = 0

        text_index = 0
        is_text_end = False

        # Autoregressive decode loop, one token per layer per step.
        for _ in tqdm(range(2, max_returned_tokens - T + 1)):
            tokens_A, token_T = next_token_image_batch(
                model,
                None,
                None,
                model_input_ids,
                None,
                None,
                input_pos=input_pos,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p,
            )

            # Once text finished, keep feeding text pads while audio continues.
            if text_end:
                token_T = torch.tensor([_pad_t], device=device)

            if tokens_A[-1] == eos_id_a:
                break

            if token_T == eos_id_t:
                text_end = True

            for i in range(7):
                list_output[i].append(tokens_A[i].tolist()[0])
            list_output[7].append(token_T.tolist()[0])

            # Prepare next-step inputs (same re-encoding as after prefill).
            model_input_ids = [[] for i in range(8)]
            for i in range(7):
                tokens_A[i] = tokens_A[i].clone() +padded_text_vocabsize + i * padded_audio_vocabsize
                model_input_ids[i].append(tokens_A[i].clone().to(device).to(torch.int32))
                model_input_ids[i].append(
                    torch.tensor([layershift(4097, i)], device=device)
                )
                model_input_ids[i] = torch.stack(model_input_ids[i])

            model_input_ids[-1].append(token_T.clone().to(torch.int32))
            model_input_ids[-1].append(token_T.clone().to(torch.int32))
            model_input_ids[-1] = torch.stack(model_input_ids[-1])

            # SNAC's delayed-parallel layout needs 7 steps before the first
            # complete audio frame exists; then emit every nums_generate steps.
            if index == 7:
                begin_generate = True

            if begin_generate:
                current_index += 1
                if current_index == nums_generate:
                    current_index = 0
                    snac = get_snac(list_output, index, nums_generate)
                    audio_stream = generate_audio_data(snac, self.snacmodel, self.device)
                    if is_text_end:
                        text_stream = ""
                    else:
                        text_stream, text_index, is_text_end = get_text_stream(list_output, text_index, self.text_tokenizer)

                    yield (audio_stream, text_stream)

            input_pos = input_pos.add_(1)
            index += 1
        text = self.text_tokenizer.decode(torch.tensor(list_output[-1]))
        print(f"text output: {text}")

        if save_path is not None:
            audiolist = reconscruct_snac(list_output)
            audio = reconstruct_tensors(audiolist)
            with torch.inference_mode():
                audio_hat = self.snacmodel.decode(audio)
            sf.write(save_path, audio_hat.squeeze().cpu().numpy(), 24000)

        model.clear_kv_cache()
        return list_output
def test_infer():
    """Offline smoke test: run every supported task mode over the bundled samples.

    Loads the checkpoint (downloading it if absent), then exercises A1A2, ASR,
    T1A2, T1T2, A1T2 and the batched A1A2 path, printing inputs and outputs.
    """
    device = "cuda:0"
    out_dir = f"./output/{get_time_str()}"
    ckpt_dir = "./checkpoint"  # plain string; the original used a placeholder-free f-string
    if not os.path.exists(ckpt_dir):
        print(f"checkpoint directory {ckpt_dir} not found, downloading from huggingface")
        download_model(ckpt_dir)
    fabric, model, text_tokenizer, snacmodel, whispermodel = load_model(ckpt_dir, device)

    task = ['A1A2', 'asr', "T1A2", "AA-BATCH", 'T1T2', 'AT']

    # prepare test data
    # TODO
    test_audio_list = sorted(glob.glob('./data/samples/output*.wav'))
    test_audio_transcripts = [
        "What is your name?",
        "what are your hobbies?",
        "Do you like beijing",
        "How are you feeling today?",
        "what is the weather like today?",
    ]
    test_text_list = [
        "What is your name?",
        "How are you feeling today?",
        "Can you describe your surroundings?",
        "What did you do yesterday?",
        "What is your favorite book and why?",
        "How do you make a cup of tea?",
        "What is the weather like today?",
        "Can you explain the concept of time?",
        "Can you tell me a joke?",
    ]

    # LOAD MODEL
    with torch.no_grad():
        if "A1A2" in task:
            print("===============================================================")
            print("                       testing A1A2")
            print("===============================================================")
            step = 0
            for path in test_audio_list:
                try:
                    mel, leng = load_audio(path)
                    audio_feature, input_ids = get_input_ids_whisper(mel, leng, whispermodel, device)
                    text = A1_A2(
                        fabric,
                        audio_feature,
                        input_ids,
                        leng,
                        model,
                        text_tokenizer,
                        step,
                        snacmodel,
                        out_dir=out_dir,
                    )
                    print(f"input: {test_audio_transcripts[step]}")
                    print(f"output: {text}")
                    step += 1
                    print(
                        "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
                    )
                # Keep the sweep going on per-sample failures, but don't use a
                # bare except (which would also swallow KeyboardInterrupt) and
                # don't hide the cause.
                except Exception as exc:
                    print(f"[error] failed to process {path}")
                    print(exc)
            print("===============================================================")

        if 'asr' in task:
            print("===============================================================")
            print("                       testing asr")
            print("===============================================================")
            index = 0
            step = 0
            for path in test_audio_list:
                mel, leng = load_audio(path)
                audio_feature, input_ids = get_input_ids_whisper(mel, leng, whispermodel, device, special_token_a=_pad_a, special_token_t=_asr)
                output = A1_T1(fabric, audio_feature, input_ids ,leng, model, text_tokenizer, index).lower().replace(',','').replace('.','').replace('?','')
                print(f"audio_path: {path}")
                print(f"audio transcript: {test_audio_transcripts[index]}")
                print(f"asr output: {output}")
                print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
                index += 1

        if "T1A2" in task:
            step = 0
            print("\n")
            print("===============================================================")
            print("                       testing T1A2")
            print("===============================================================")
            for text in test_text_list:
                input_ids = get_input_ids_TA(text, text_tokenizer)
                text_output = T1_A2(fabric, input_ids, model, text_tokenizer, step,
                                    snacmodel, out_dir=out_dir)
                print(f"input: {text}")
                print(f"output: {text_output}")
                print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
                step += 1
            print("===============================================================")

        if "T1T2" in task:
            step = 0
            print("\n")
            print("===============================================================")
            print("                       testing T1T2")
            print("===============================================================")
            for text in test_text_list:
                input_ids = get_input_ids_TT(text, text_tokenizer)
                text_output = T1_T2(fabric, input_ids, model, text_tokenizer, step)
                print(f" Input: {text}")
                print(f"Output: {text_output}")
                print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
            print("===============================================================")

        if "AT" in task:
            print("===============================================================")
            print("                       testing A1T2")
            print("===============================================================")
            step = 0
            for path in test_audio_list:
                mel, leng = load_audio(path)
                audio_feature, input_ids = get_input_ids_whisper(
                    mel, leng, whispermodel, device,
                    special_token_a=_pad_a, special_token_t=_answer_t
                )
                text = A1_T2(
                    fabric, audio_feature, input_ids, leng, model, text_tokenizer, step
                )
                print(f"input: {test_audio_transcripts[step]}")
                print(f"output: {text}")
                step += 1
                print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
            print("===============================================================")

        if "AA-BATCH" in task:
            print("===============================================================")
            print("                       testing A1A2-BATCH")
            print("===============================================================")
            step = 0
            for path in test_audio_list:
                mel, leng = load_audio(path)
                audio_feature, input_ids = get_input_ids_whisper_ATBatch(mel, leng, whispermodel, device)
                text = A1_A2_batch(
                    fabric, audio_feature, input_ids, leng, model, text_tokenizer, step,
                    snacmodel, out_dir=out_dir
                )
                print(f"input: {test_audio_transcripts[step]}")
                print(f"output: {text}")
                step += 1
                print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
            print("===============================================================")

        print("*********************** test end *****************************")
if __name__ == "__main__":
    # Script entry point: run the full offline smoke test over all task modes.
    test_infer()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment