Commit f8d86cb0 authored by chenzk

v1.0

-----BEGIN CERTIFICATE-----
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
-----END CERTIFICATE-----
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2025 Alibaba Cloud
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
# Qwen2.5-Omni
Qwen2.5-Omni is an end-to-end multimodal large model with 7B parameters that can see, hear, speak, and write, and supports text, image, audio, and video inputs.
## Paper
`None`
## Model Architecture
Qwen2.5-Omni consists of multimodal encoders (which convert images, video, audio, and text into tokens in a unified format), a Thinker decoder (which generates semantic tokens), a Talker decoder (which generates multimodal semantic tokens), and a streaming codec (which converts tokens into audio waveforms).
<div align=center>
<img src="./doc/Qwen25Omni.png"/>
</div>
## Algorithm
Qwen2.5-Omni adopts the encoder-decoder structure common to multimodal models, with unified tokens and a unified model. A Flow-Matching DiT diffusion model generates a mel spectrogram (an intermediate representation of speech), and a modified BigVGAN (a high-quality vocoder) then converts the spectrogram into a waveform (the audio signal).
<div align=center>
<img src="./doc/algorithm.png"/>
</div>
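As a rough, self-contained illustration of the "mel spectrogram as an intermediate representation" idea above, the sketch below uses librosa and soundfile (both listed in requirements.txt). It is not the model's actual pipeline: in Qwen2.5-Omni the Flow-Matching DiT predicts the mel spectrogram and the modified BigVGAN turns it into a waveform, whereas here a synthetic sine wave and Griffin-Lim inversion merely stand in to show the two-stage mel-to-waveform flow.
```
import numpy as np
import librosa
import soundfile as sf

sr = 16000
t = np.linspace(0, 1.0, sr, endpoint=False)
wave = 0.5 * np.sin(2 * np.pi * 440.0 * t).astype(np.float32)  # stand-in "speech" signal

# Stage 1 (played by the Flow-Matching DiT in the real model): produce a mel spectrogram
mel = librosa.feature.melspectrogram(y=wave, sr=sr, n_mels=80)

# Stage 2 (played by the modified BigVGAN in the real model): mel spectrogram -> waveform
recon = librosa.feature.inverse.mel_to_audio(mel, sr=sr)

sf.write("recon.wav", recon, sr)
```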
## Environment Setup
```
mv Qwen2.5-Omni_pytorch Qwen2.5-Omni # drop the framework-name suffix
```
### Docker (Method 1)
```
docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.4.1-ubuntu22.04-dtk25.04-py3.10-fixpy
# Replace <your IMAGE ID> below with the ID of the Docker image pulled above; for this image it is e77c15729879
docker run -it --shm-size=64G -v $PWD/Qwen2.5-Omni:/home/Qwen2.5-Omni -v /opt/hyhal:/opt/hyhal:ro --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name qomni <your IMAGE ID> bash
cd /home/Qwen2.5-Omni
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple
unzip f742a644ca32e65758c3adb36225aef1731bd2a8.zip
cd transformers-f742a644ca32e65758c3adb36225aef1731bd2a8
pip install -e . # the upstream authors require transformers==4.50.0.dev0
```
### Dockerfile (Method 2)
```
cd Qwen2.5-Omni/docker
docker build --no-cache -t qomni:latest .
docker run --shm-size=64G --name qomni -v /opt/hyhal:/opt/hyhal:ro --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video -v $PWD/../../Qwen2.5-Omni:/home/Qwen2.5-Omni -it qomni bash
# If installing the environment through the Dockerfile takes a long time, comment out the pip install inside it and install the Python libraries after the container starts: pip install -r requirements.txt
cd /home/Qwen2.5-Omni
unzip f742a644ca32e65758c3adb36225aef1731bd2a8.zip
cd transformers-f742a644ca32e65758c3adb36225aef1731bd2a8
pip install -e . # the upstream authors require transformers==4.50.0.dev0
```
### Anaconda (Method 3)
1. The special deep learning libraries this project needs for DCU GPUs can be downloaded and installed from the 光合 Developer Community:
- https://developer.hpccube.com/tool/
```
DTK driver: dtk2504
python:python3.10
torch:2.4.1
torchvision:0.19.1
triton:3.0.0
vllm:0.6.2
flash-attn:2.6.1
deepspeed:0.14.2
apex:1.4.0
transformers:4.50.0.dev0
```
`Tip: the versions of the DCU-related tools above (DTK driver, python, torch, etc.) must correspond to each other exactly.`
2. Install the remaining, non-special libraries according to requirements.txt
```
cd /home/Qwen2.5-Omni
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple
cd /home/Qwen2.5-Omni
unzip f742a644ca32e65758c3adb36225aef1731bd2a8.zip
cd transformers-f742a644ca32e65758c3adb36225aef1731bd2a8
pip install -e . # the upstream authors require transformers==4.50.0.dev0
```
## Dataset
`None`
## Training
## Inference
Directory structure of the pretrained weights:
```
/home/Qwen2.5-Omni
└── Qwen/Qwen2.5-Omni-7B
```
### Single-node multi-GPU
```
python infer_transformers.py
# The vLLM version requires adapting many low-level components; it will be released later.
```
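For a rough picture of what `infer_transformers.py` does, the cookbooks bundled with this project follow the pattern sketched below (a minimal sketch that assumes the pinned transformers commit and the weight directory shown above; the actual script may differ in prompts and options):
```
import torch
from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor
from qwen_omni_utils import process_mm_info

model_path = "Qwen/Qwen2.5-Omni-7B"  # local weight directory laid out as above
model = Qwen2_5OmniModel.from_pretrained(model_path, torch_dtype=torch.bfloat16, device_map="auto")
processor = Qwen2_5OmniProcessor.from_pretrained(model_path)

messages = [
    {"role": "system", "content": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, "
                                  "capable of perceiving auditory and visual inputs, as well as generating text and speech."},
    {"role": "user", "content": [{"type": "video", "video": "./draw.mp4"}]},
]
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# use_audio_in_video controls whether the audio track of the video is fed to the model
audios, images, videos = process_mm_info(messages, use_audio_in_video=False)
inputs = processor(text=text, audios=audios, images=images, videos=videos,
                   return_tensors="pt", padding=True, use_audio_in_video=False)
inputs = inputs.to(model.device).to(model.dtype)

# return_audio=False returns only the text response; the Talker/vocoder path is skipped
output = model.generate(**inputs, use_audio_in_video=False, return_audio=False)
print(processor.batch_decode(output, skip_special_tokens=True)[0])
```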
For more information, see [`README_origin`](./README_origin.md) from the upstream project.
## Result
`Input:`
```
./draw.mp4
```
`Output:`
```
"system\nYou are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.\nuser\n\nassistant\nOh, that sounds like a really cool video! It's great to see someone using a tablet to draw a guitar. What do you think about the style of the drawing? Is it more realistic or more of an abstract piece? And what do you think about the use of the tablet for drawing? It seems like a fun and creative way to work."
```
### Accuracy
Inference accuracy on DCU matches that on GPU; inference framework: PyTorch.
## Application Scenarios
### Algorithm Category
`Conversational QA`
### Key Application Industries
`Manufacturing, media, finance, energy, healthcare, smart home, education`
## Pretrained Weights
Fast download center for pretrained weights: [SCNet AIModels](http://113.200.138.88:18080/aimodels). The pretrained weights used in this project can be downloaded through the fast download channel: [Qwen2.5-Omni-7B](http://113.200.138.88:18080/aimodels/qwen/Qwen2.5-Omni-7B.git)
Hugging Face download: [Qwen2.5-Omni-7B](https://huggingface.co/Qwen/Qwen2.5-Omni-7B)
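If the machine has direct access to Hugging Face, the weights can also be pulled into the directory layout expected by the Inference section with `huggingface_hub` (a sketch; it assumes `huggingface_hub` is installed and outbound network access is available):
```
from huggingface_hub import snapshot_download

# Download Qwen/Qwen2.5-Omni-7B into ./Qwen/Qwen2.5-Omni-7B (run from /home/Qwen2.5-Omni),
# matching the pretrained-weight directory structure shown above.
snapshot_download(repo_id="Qwen/Qwen2.5-Omni-7B", local_dir="Qwen/Qwen2.5-Omni-7B")
```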
## Source Repository and Issue Reporting
- http://developer.sourcefind.cn/codes/modelzoo/qwen2.5-omni_pytorch.git
## References
- https://github.com/QwenLM/Qwen2.5-Omni
{
"cells": [
{
"cell_type": "markdown",
"id": "f2c16396",
"metadata": {},
"source": [
"### Screen Recording Interaction with Qwen2.5-Omni\n",
"\n",
"This notebook demonstrates how to use Qwen2.5-Omni to get the information and content you want to know by asking questions in real time on the recording screen."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "638e9082-c1ef-4efd-9a10-e35507e25363",
"metadata": {
"execution": {
"iopub.execute_input": "2025-01-29T12:40:04.049566Z",
"iopub.status.busy": "2025-01-29T12:40:04.049365Z"
},
"tags": []
},
"outputs": [],
"source": [
"!pip install git+https://github.com/huggingface/transformers@f742a644ca32e65758c3adb36225aef1731bd2a8\n",
"!pip install qwen-omni-utils\n",
"!pip install openai\n",
"!pip install flash-attn --no-build-isolation"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9596c50d-80a8-433f-b846-1fbf61145ccc",
"metadata": {
"ExecutionIndicator": {
"show": true
},
"execution": {
"iopub.execute_input": "2025-01-29T12:40:16.511701Z",
"iopub.status.busy": "2025-01-29T12:40:16.510916Z",
"iopub.status.idle": "2025-01-29T12:40:16.878038Z",
"shell.execute_reply": "2025-01-29T12:40:16.877543Z",
"shell.execute_reply.started": "2025-01-29T12:40:16.511678Z"
},
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/envs/omni/lib/python3.10/site-packages/_distutils_hack/__init__.py:53: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml\n",
" warnings.warn(\n"
]
}
],
"source": [
"from qwen_omni_utils import process_mm_info\n",
"\n",
"# @title inference function\n",
"def inference(video_path, prompt, sys_prompt):\n",
" messages = [\n",
" {\"role\": \"system\", \"content\": sys_prompt},\n",
" {\"role\": \"user\", \"content\": [\n",
" {\"type\": \"text\", \"text\": prompt},\n",
" {\"type\": \"video\", \"video\": video_path},\n",
" ]\n",
" },\n",
" ]\n",
" text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
" # image_inputs, video_inputs = process_vision_info([messages])\n",
" audios, images, videos = process_mm_info(messages, use_audio_in_video=False)\n",
" inputs = processor(text=text, audios=audios, images=images, videos=videos, return_tensors=\"pt\", padding=True, use_audio_in_video=False)\n",
" inputs = inputs.to(model.device).to(model.dtype)\n",
"\n",
" output = model.generate(**inputs, use_audio_in_video=False, return_audio=False)\n",
"\n",
" text = processor.batch_decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=False)\n",
" return text"
]
},
{
"cell_type": "markdown",
"id": "386e4cd8",
"metadata": {},
"source": [
"Load model and processors."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "e829b782-0be7-4bc6-a576-6b815323376e",
"metadata": {
"ExecutionIndicator": {
"show": false
},
"execution": {
"iopub.execute_input": "2025-01-29T12:40:18.337731Z",
"iopub.status.busy": "2025-01-29T12:40:18.337470Z",
"iopub.status.idle": "2025-01-29T12:40:47.760976Z",
"shell.execute_reply": "2025-01-29T12:40:47.760220Z",
"shell.execute_reply.started": "2025-01-29T12:40:18.337713Z"
},
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/envs/omni/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"2025-03-22 17:14:12.353632: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
"2025-03-22 17:14:12.386228: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
"2025-03-22 17:14:12.386249: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
"2025-03-22 17:14:12.387082: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"2025-03-22 17:14:12.392644: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
"To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"2025-03-22 17:14:13.131254: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
"Loading checkpoint shards: 100%|██████████| 5/5 [00:05<00:00, 1.14s/it]\n",
"/opt/conda/envs/omni/lib/python3.10/site-packages/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py:6129: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
" for key, value in torch.load(path).items():\n"
]
}
],
"source": [
"import torch\n",
"from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor\n",
"\n",
"model_path = \"Qwen/Qwen2.5-Omni-7B\"\n",
"model = Qwen2_5OmniModel.from_pretrained(\n",
" model_path,\n",
" torch_dtype=torch.bfloat16,\n",
" device_map=\"auto\",\n",
" attn_implementation=\"flash_attention_2\",\n",
")\n",
"processor = Qwen2_5OmniProcessor.from_pretrained(model_path)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "ed93fb82",
"metadata": {},
"outputs": [],
"source": [
"from IPython.display import Video"
]
},
{
"cell_type": "markdown",
"id": "6a47ad45",
"metadata": {},
"source": [
"#### Understanding"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1935af5e",
"metadata": {
"ExecutionIndicator": {
"show": true
},
"execution": {
"iopub.execute_input": "2025-01-29T12:41:18.150397Z",
"iopub.status.busy": "2025-01-29T12:41:18.149631Z",
"iopub.status.idle": "2025-01-29T12:41:19.978329Z",
"shell.execute_reply": "2025-01-29T12:41:19.977054Z",
"shell.execute_reply.started": "2025-01-29T12:41:18.150371Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<video src=\"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/screen.mp4\" controls width=\"640\" height=\"360\">\n",
" Your browser does not support the <code>video</code> element.\n",
" </video>"
],
"text/plain": [
"<IPython.core.display.Video object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:root:System prompt modified, audio output may not work as expected. Audio output mode only works when using default system prompt 'You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.'\n",
"qwen-vl-utils using torchvision to read video.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"system\n",
"You are a helpful assistant.\n",
"user\n",
"What the browser is used in this video?\n",
"assistant\n",
"The browser used in the video is Google Chrome.\n"
]
}
],
"source": [
"video_path = \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/screen.mp4\"\n",
"prompt = \"What the browser is used in this video?\"\n",
"\n",
"display(Video(video_path, width=640, height=360))\n",
"\n",
"## Use a local HuggingFace model to inference.\n",
"response = inference(video_path, prompt=prompt, sys_prompt=\"You are a helpful assistant.\")\n",
"print(response[0])"
]
},
{
"cell_type": "markdown",
"id": "f9961aae",
"metadata": {},
"source": [
"#### OCR"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0894f5f1",
"metadata": {
"ExecutionIndicator": {
"show": true
},
"execution": {
"iopub.execute_input": "2025-01-29T12:44:01.387553Z",
"iopub.status.busy": "2025-01-29T12:44:01.386725Z",
"iopub.status.idle": "2025-01-29T12:44:09.671782Z",
"shell.execute_reply": "2025-01-29T12:44:09.671200Z",
"shell.execute_reply.started": "2025-01-29T12:44:01.387530Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<video src=\"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/screen.mp4\" controls width=\"640\" height=\"360\">\n",
" Your browser does not support the <code>video</code> element.\n",
" </video>"
],
"text/plain": [
"<IPython.core.display.Video object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:root:System prompt modified, audio output may not work as expected. Audio output mode only works when using default system prompt 'You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.'\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"system\n",
"You are a helpful assistant.\n",
"user\n",
"Who is the authors of this paper?\n",
"assistant\n",
"The authors of the paper \"Attention Is All You Need\" are:\n",
"\n",
"1. Ashish Vaswani\n",
"2. Noam Shazeer\n",
"3. Niki Parmar\n",
"4. Jakob Uszkoreit\n",
"5. Llion Jones\n",
"6. Aidan N. Gomez\n",
"7. Lukasz Kaiser\n",
"8. Illia Polosukhin\n"
]
}
],
"source": [
"video_path = \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/screen.mp4\"\n",
"prompt = \"Who is the authors of this paper?\"\n",
"\n",
"display(Video(video_path, width=640, height=360))\n",
"\n",
"## Use a local HuggingFace model to inference.\n",
"response = inference(video_path, prompt=prompt, sys_prompt=\"You are a helpful assistant.\")\n",
"print(response[0])"
]
},
{
"cell_type": "markdown",
"id": "4e9b7651",
"metadata": {},
"source": [
"#### Summarize"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "16aa3dc5",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<video src=\"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/screen.mp4\" controls width=\"640\" height=\"360\">\n",
" Your browser does not support the <code>video</code> element.\n",
" </video>"
],
"text/plain": [
"<IPython.core.display.Video object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:root:System prompt modified, audio output may not work as expected. Audio output mode only works when using default system prompt 'You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.'\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"system\n",
"You are a helpful assistant.\n",
"user\n",
"Summarize this paper in short.\n",
"assistant\n",
"The paper \"Attention Is All You Need\" introduces the Transformer model, a novel architecture for sequence-to-sequence tasks that relies entirely on self-attention mechanisms. The Transformer outperforms existing models in machine translation tasks, achieving state-of-the-art BLEU scores on WMT 2014 English-German and English-French translation tasks. The model is highly parallelizable, allowing for efficient training on large datasets and GPUs. The paper also discusses the application of the Transformer to other tasks, such as English constituency parsing, and highlights its potential for handling large inputs and outputs, such as images and audio.\n"
]
}
],
"source": [
"video_path = \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/screen.mp4\"\n",
"prompt = \"Summarize this paper in short.\"\n",
"\n",
"display(Video(video_path, width=640, height=360))\n",
"\n",
"## Use a local HuggingFace model to inference.\n",
"response = inference(video_path, prompt=prompt, sys_prompt=\"You are a helpful assistant.\")\n",
"print(response[0])"
]
},
{
"cell_type": "markdown",
"id": "8dd58bbd",
"metadata": {},
"source": [
"#### Assistant"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7cea7d11",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<video src=\"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/screen.mp4\" controls width=\"640\" height=\"360\">\n",
" Your browser does not support the <code>video</code> element.\n",
" </video>"
],
"text/plain": [
"<IPython.core.display.Video object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:root:System prompt modified, audio output may not work as expected. Audio output mode only works when using default system prompt 'You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.'\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"system\n",
"You are a helpful assistant.\n",
"user\n",
"Please trranslate the abstract of paper into Chinese.\n",
"assistant\n",
"The abstract of the paper \"Attention Is All You Need\" by Vaswani et al. (2017) is as follows:\n",
"\n",
"---\n",
"\n",
"The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based entirely on self-attention. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transformer generalizes well to other tasks by applying it successfully to English constituency parsing both with large and limited training data.\n",
"\n",
"---\n",
"\n",
"Translation:\n",
"\n",
"---\n",
"\n",
"主流的序列转换模型基于复杂的递归或卷积神经网络,包括编码器和解码器。表现最好的模型还通过注意力机制连接编码器和解码器。我们提出了一种新的简单网络架构,Transformer,完全基于自我注意力。在两个机器翻译任务上的实验表明,这些模型在质量上更优,同时更易于并行化,并且训练时间显著减少。我们的模型在WMT 2014英语到德语翻译任务上实现了28.4 BLEU,超过现有最佳结果,包括组合模型,提高了2 BLEU。在WMT 2014英语到法语翻译任务上,我们的模型在仅训练3.5天、使用八个GPU的情况下,建立了新的单模型状态最佳BLEU得分41.8,比文献中最好的模型(包括组合模型)低2 BLEU。我们展示了Transformer在其他任务上的泛化能力,成功地将其应用于英语句法解析,即使在大量和有限的训练数据下也是如此。\n"
]
}
],
"source": [
"video_path = \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/screen.mp4\"\n",
"prompt = \"Please trranslate the abstract of paper into Chinese.\"\n",
"\n",
"display(Video(video_path, width=640, height=360))\n",
"\n",
"## Use a local HuggingFace model to inference.\n",
"response = inference(video_path, prompt=prompt, sys_prompt=\"You are a helpful assistant.\")\n",
"print(response[0])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "omni",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
{
"cells": [
{
"cell_type": "markdown",
"id": "f2c16396",
"metadata": {},
"source": [
"### Video Information Extracting with Qwen2.5-Omni\n",
"\n",
"This notebook demonstrates how to use Qwen2.5-Omni to obtain information from the video stream."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "638e9082-c1ef-4efd-9a10-e35507e25363",
"metadata": {
"execution": {
"iopub.execute_input": "2025-01-29T12:40:04.049566Z",
"iopub.status.busy": "2025-01-29T12:40:04.049365Z"
},
"tags": []
},
"outputs": [],
"source": [
"!pip install git+https://github.com/huggingface/transformers@f742a644ca32e65758c3adb36225aef1731bd2a8\n",
"!pip install qwen-omni-utils\n",
"!pip install openai\n",
"!pip install flash-attn --no-build-isolation"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9596c50d-80a8-433f-b846-1fbf61145ccc",
"metadata": {
"ExecutionIndicator": {
"show": true
},
"execution": {
"iopub.execute_input": "2025-01-29T12:40:16.511701Z",
"iopub.status.busy": "2025-01-29T12:40:16.510916Z",
"iopub.status.idle": "2025-01-29T12:40:16.878038Z",
"shell.execute_reply": "2025-01-29T12:40:16.877543Z",
"shell.execute_reply.started": "2025-01-29T12:40:16.511678Z"
},
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/envs/omni/lib/python3.10/site-packages/_distutils_hack/__init__.py:53: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml\n",
" warnings.warn(\n"
]
}
],
"source": [
"from qwen_omni_utils import process_mm_info\n",
"\n",
"# @title inference function\n",
"def inference(video_path, prompt, sys_prompt):\n",
" messages = [\n",
" {\"role\": \"system\", \"content\": sys_prompt},\n",
" {\"role\": \"user\", \"content\": [\n",
" {\"type\": \"text\", \"text\": prompt},\n",
" {\"type\": \"video\", \"video\": video_path},\n",
" ]\n",
" },\n",
" ]\n",
" text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
" # image_inputs, video_inputs = process_vision_info([messages])\n",
" audios, images, videos = process_mm_info(messages, use_audio_in_video=False)\n",
" inputs = processor(text=text, audios=audios, images=images, videos=videos, return_tensors=\"pt\", padding=True, use_audio_in_video=False)\n",
" inputs = inputs.to(model.device).to(model.dtype)\n",
"\n",
" output = model.generate(**inputs, use_audio_in_video=False, return_audio=False)\n",
"\n",
" text = processor.batch_decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=False)\n",
" return text"
]
},
{
"cell_type": "markdown",
"id": "386e4cd8",
"metadata": {},
"source": [
"Load model and processors."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "e829b782-0be7-4bc6-a576-6b815323376e",
"metadata": {
"ExecutionIndicator": {
"show": false
},
"execution": {
"iopub.execute_input": "2025-01-29T12:40:18.337731Z",
"iopub.status.busy": "2025-01-29T12:40:18.337470Z",
"iopub.status.idle": "2025-01-29T12:40:47.760976Z",
"shell.execute_reply": "2025-01-29T12:40:47.760220Z",
"shell.execute_reply.started": "2025-01-29T12:40:18.337713Z"
},
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/envs/omni/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"2025-03-22 17:20:02.523530: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
"2025-03-22 17:20:02.556178: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
"2025-03-22 17:20:02.556202: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
"2025-03-22 17:20:02.557034: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"2025-03-22 17:20:02.562397: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
"To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"2025-03-22 17:20:03.318258: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
"Loading checkpoint shards: 100%|██████████| 5/5 [00:05<00:00, 1.17s/it]\n",
"/opt/conda/envs/omni/lib/python3.10/site-packages/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py:6129: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
" for key, value in torch.load(path).items():\n"
]
}
],
"source": [
"import torch\n",
"from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor\n",
"\n",
"model_path = \"Qwen/Qwen2.5-Omni-7B\"\n",
"model = Qwen2_5OmniModel.from_pretrained(\n",
" model_path,\n",
" torch_dtype=torch.bfloat16,\n",
" device_map=\"auto\",\n",
" attn_implementation=\"flash_attention_2\",\n",
")\n",
"processor = Qwen2_5OmniProcessor.from_pretrained(model_path)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "ed93fb82",
"metadata": {},
"outputs": [],
"source": [
"from IPython.display import Video"
]
},
{
"cell_type": "markdown",
"id": "6a47ad45",
"metadata": {},
"source": [
"#### Question 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1935af5e",
"metadata": {
"ExecutionIndicator": {
"show": true
},
"execution": {
"iopub.execute_input": "2025-01-29T12:41:18.150397Z",
"iopub.status.busy": "2025-01-29T12:41:18.149631Z",
"iopub.status.idle": "2025-01-29T12:41:19.978329Z",
"shell.execute_reply": "2025-01-29T12:41:19.977054Z",
"shell.execute_reply.started": "2025-01-29T12:41:18.150371Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<video src=\"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/shopping.mp4\" controls width=\"640\" height=\"360\">\n",
" Your browser does not support the <code>video</code> element.\n",
" </video>"
],
"text/plain": [
"<IPython.core.display.Video object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:root:System prompt modified, audio output may not work as expected. Audio output mode only works when using default system prompt 'You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.'\n",
"qwen-vl-utils using torchvision to read video.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"system\n",
"You are a helpful assistant.\n",
"user\n",
"How many kind of drinks can you see in the video?\n",
"assistant\n",
"There are five different kinds of drinks visible in the video.\n"
]
}
],
"source": [
"video_path = \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/shopping.mp4\"\n",
"prompt = \"How many kind of drinks can you see in the video?\"\n",
"\n",
"display(Video(video_path, width=640, height=360))\n",
"\n",
"## Use a local HuggingFace model to inference.\n",
"response = inference(video_path, prompt=prompt, sys_prompt=\"You are a helpful assistant.\")\n",
"print(response[0])"
]
},
{
"cell_type": "markdown",
"id": "f9961aae",
"metadata": {},
"source": [
"#### Question 2"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0894f5f1",
"metadata": {
"ExecutionIndicator": {
"show": true
},
"execution": {
"iopub.execute_input": "2025-01-29T12:44:01.387553Z",
"iopub.status.busy": "2025-01-29T12:44:01.386725Z",
"iopub.status.idle": "2025-01-29T12:44:09.671782Z",
"shell.execute_reply": "2025-01-29T12:44:09.671200Z",
"shell.execute_reply.started": "2025-01-29T12:44:01.387530Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<video src=\"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/shopping.mp4\" controls width=\"640\" height=\"360\">\n",
" Your browser does not support the <code>video</code> element.\n",
" </video>"
],
"text/plain": [
"<IPython.core.display.Video object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:root:System prompt modified, audio output may not work as expected. Audio output mode only works when using default system prompt 'You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.'\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"system\n",
"You are a helpful assistant.\n",
"user\n",
"How many bottles of drinks have I picked up?\n",
"assistant\n",
"You have picked up two bottles of drinks.\n"
]
}
],
"source": [
"video_path = \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/shopping.mp4\"\n",
"prompt = \"How many bottles of drinks have I picked up?\"\n",
"\n",
"display(Video(video_path, width=640, height=360))\n",
"\n",
"## Use a local HuggingFace model to inference.\n",
"response = inference(video_path, prompt=prompt, sys_prompt=\"You are a helpful assistant.\")\n",
"print(response[0])"
]
},
{
"cell_type": "markdown",
"id": "4e9b7651",
"metadata": {},
"source": [
"#### Question 3"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "16aa3dc5",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<video src=\"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/shopping.mp4\" controls width=\"640\" height=\"360\">\n",
" Your browser does not support the <code>video</code> element.\n",
" </video>"
],
"text/plain": [
"<IPython.core.display.Video object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:root:System prompt modified, audio output may not work as expected. Audio output mode only works when using default system prompt 'You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.'\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"system\n",
"You are a helpful assistant.\n",
"user\n",
"How many milliliters are there in the bottle I picked up second time?\n",
"assistant\n",
"The bottle you picked up second time contains 500 milliliters of liquid.\n"
]
}
],
"source": [
"video_path = \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/shopping.mp4\"\n",
"prompt = \"How many milliliters are there in the bottle I picked up second time?\"\n",
"\n",
"display(Video(video_path, width=640, height=360))\n",
"\n",
"## Use a local HuggingFace model to inference.\n",
"response = inference(video_path, prompt=prompt, sys_prompt=\"You are a helpful assistant.\")\n",
"print(response[0])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "omni",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.4.1-ubuntu22.04-dtk25.04-py3.10-fixpy
ENV DEBIAN_FRONTEND=noninteractive
# RUN yum update && yum install -y git cmake wget build-essential
# RUN source /opt/dtk-dtk25.04/env.sh
# Install pip-related dependencies
COPY requirements.txt requirements.txt
RUN pip3 install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
# Core dependencies
gradio==5.23.1
gradio_client==1.8.0
qwen-omni-utils==0.0.3
librosa==0.11.0
ffmpeg==1.4
ffmpeg-python==0.2.0
soundfile==0.13.1
modelscope_studio==1.2.2
# git+https://github.com/huggingface/transformers@f742a644ca32e65758c3adb36225aef1731bd2a8
accelerate
av
qwen-vl-utils[decord]
# Optional dependency
# Uncomment the following line if you need flash-attn
# flash-attn==2.7.4.post1
# Dockerfile of qwenllm/qwen-omni:2.5-cu121
ARG CUDA_VERSION=12.1.0
ARG from=nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu22.04
FROM ${from} as base
ARG DEBIAN_FRONTEND=noninteractive
RUN <<EOF
apt update -y && apt upgrade -y && apt install -y --no-install-recommends \
git \
git-lfs \
python3 \
python3-pip \
python3-dev \
wget \
vim \
libsndfile1 \
ccache \
software-properties-common \
ffmpeg \
&& rm -rf /var/lib/apt/lists/*
EOF
RUN wget https://github.com/Kitware/CMake/releases/download/v3.26.1/cmake-3.26.1-Linux-x86_64.sh \
-q -O /tmp/cmake-install.sh \
&& chmod u+x /tmp/cmake-install.sh \
&& mkdir /opt/cmake-3.26.1 \
&& /tmp/cmake-install.sh --skip-license --prefix=/opt/cmake-3.26.1 \
&& rm /tmp/cmake-install.sh \
&& ln -s /opt/cmake-3.26.1/bin/* /usr/local/bin
RUN ln -s /usr/bin/python3 /usr/bin/python
RUN git lfs install
FROM base as dev
WORKDIR /
RUN mkdir -p /data/shared/Qwen
WORKDIR /data/shared/Qwen/
FROM dev as bundle_req
RUN --mount=type=cache,target=/root/.cache/pip pip3 install networkx==3.1
RUN --mount=type=cache,target=/root/.cache/pip pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 xformers==0.0.29.post2
RUN --mount=type=cache,target=/root/.cache/pip pip3 install git+https://github.com/huggingface/transformers@f742a644ca32e65758c3adb36225aef1731bd2a8 \
&& pip3 install accelerate qwen-omni-utils modelscope_studio
FROM bundle_req as bundle_vllm
ARG BUNDLE_FLASH_ATTENTION=true
ENV MAX_JOBS=8
ENV NVCC_THREADS=1
ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
ENV VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real"
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \
if [ "$BUNDLE_FLASH_ATTENTION" = "true" ]; then \
mkdir -p /data/shared/code \
&& pip install ninja \
&& cd /data/shared/code \
&& git clone https://github.com/Dao-AILab/flash-attention.git \
&& cd flash-attention \
&& python setup.py install \
&& cd /data/shared/Qwen \
&& rm -rf /data/shared/code/flash-attention; \
fi
ARG BUNDLE_VLLM=true
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \
if [ "$BUNDLE_VLLM" = "true" ]; then \
mkdir -p /data/shared/code \
&& cd /data/shared/code \
&& git clone -b qwen2_omni_public_v1 https://github.com/fyabc/vllm.git \
&& cd vllm \
&& git checkout d40f54fc2f1524458669048cb40a8d0286f5d1d2 \
&& python3 use_existing_torch.py \
&& pip3 install setuptools_scm \
&& pip3 install -r requirements/cuda.txt \
&& pip3 install . --no-build-isolation \
&& cd /data/shared/Qwen \
&& rm -rf /data/shared/code/vllm; \
fi
RUN --mount=type=cache,target=/root/.cache/pip \
pip3 install \
gradio==5.23.1 \
gradio_client==1.8.0 \
librosa==0.11.0 \
ffmpeg==1.4 \
ffmpeg-python==0.2.0 \
soundfile==0.13.1 \
av
RUN rm -rvf /root/.cache/pip
COPY ../web_demo.py ./
EXPOSE 80
#!/usr/bin/env bash
#
# This script will automatically pull the Docker image from DockerHub and start a daemon container to run the Qwen2.5-Omni web demo.
IMAGE_NAME=qwenllm/qwen-omni:2.5-cu121
QWEN_CHECKPOINT_PATH=/path/to/Qwen2.5-Omni-7B
PORT=8901
CONTAINER_NAME=qwen2.5-omni
FLASH_ATTN=0
function usage() {
echo '
Usage: bash docker/docker_web_demo.sh [-i IMAGE_NAME] -c [/path/to/Qwen-Instruct] [-n CONTAINER_NAME] [--port PORT] [--flash-attn2]
'
}
while [[ "$1" != "" ]]; do
case $1 in
-i | --image-name )
shift
IMAGE_NAME=$1
;;
-c | --checkpoint )
shift
QWEN_CHECKPOINT_PATH=$1
;;
-n | --container-name )
shift
CONTAINER_NAME=$1
;;
--port )
shift
PORT=$1
;;
--flash-attn2 )
FLASH_ATTN=1
;;
-h | --help )
usage
exit 0
;;
* )
echo "Unknown argument ${1}"
exit 1
;;
esac
shift
done
if [ ! -e ${QWEN_CHECKPOINT_PATH}/config.json ]; then
echo "Checkpoint config.json file not found in ${QWEN_CHECKPOINT_PATH}, exit."
exit 1
fi
sudo docker pull ${IMAGE_NAME} || {
echo "Pulling image ${IMAGE_NAME} failed, exit."
exit 1
}
WEB_DEMO_ARGS="--server-port 8901 --server-name 0.0.0.0 -c /data/shared/Qwen/Qwen2.5-Omni-7B"
if [ ${FLASH_ATTN} -eq 1 ]; then
WEB_DEMO_ARGS+=" --flash-attn2"
fi
sudo docker run --gpus all -d --restart always --name ${CONTAINER_NAME} \
-v /var/run/docker.sock:/var/run/docker.sock -p ${PORT}:8901 \
--mount type=bind,source=${QWEN_CHECKPOINT_PATH},target=/data/shared/Qwen/Qwen2.5-Omni-7B \
-it ${IMAGE_NAME} \
python web_demo.py ${WEB_DEMO_ARGS} && {
echo "Successfully started web demo. Open 'http://localhost:${PORT}' to try!
Run \`docker logs ${CONTAINER_NAME}\` to check demo status.
Run \`docker rm -f ${CONTAINER_NAME}\` to stop and remove the demo."
}