Commit f8d86cb0 authored by chenzk

v1.0

-----BEGIN CERTIFICATE-----
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
-----END CERTIFICATE-----
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2025 Alibaba Cloud
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
# Qwen2.5-Omni
Qwen2.5-Omni is an end-to-end multimodal large model with 7B parameters that can see, hear, speak, and write, and supports text, image, audio, and video inputs.
## Paper
`None`
## Model Architecture
Qwen2.5-Omni consists of multimodal encoders (which convert images, video, audio, and text into tokens in a unified format), a Thinker decoder (which generates semantic tokens), a Talker decoder (which generates multimodal semantic tokens), and a streaming codec (which converts tokens into audio waveforms).
<div align=center>
<img src="./doc/Qwen25Omni.png"/>
</div>
## Algorithm
Qwen2.5-Omni adopts the encoder-decoder structure common to multimodal models, with unified tokens and a unified model. A Flow-Matching DiT diffusion model generates a mel spectrogram (an intermediate representation of speech), and a modified BigVGAN (a high-quality vocoder) then converts the spectrogram into a waveform (the audio signal).
<div align=center>
<img src="./doc/algorithm.png"/>
</div>
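As a rough, self-contained illustration of the "mel spectrogram as an intermediate representation" idea above, the sketch below uses librosa and soundfile (both listed in requirements.txt). It is not the model's actual pipeline: in Qwen2.5-Omni the Flow-Matching DiT predicts the mel spectrogram and the modified BigVGAN turns it into a waveform, whereas here a synthetic sine wave and Griffin-Lim inversion merely stand in to show the two-stage mel-to-waveform flow.
```
import numpy as np
import librosa
import soundfile as sf

sr = 16000
t = np.linspace(0, 1.0, sr, endpoint=False)
wave = 0.5 * np.sin(2 * np.pi * 440.0 * t).astype(np.float32)  # stand-in "speech" signal

# Stage 1 (played by the Flow-Matching DiT in the real model): produce a mel spectrogram
mel = librosa.feature.melspectrogram(y=wave, sr=sr, n_mels=80)

# Stage 2 (played by the modified BigVGAN in the real model): mel spectrogram -> waveform
recon = librosa.feature.inverse.mel_to_audio(mel, sr=sr)

sf.write("recon.wav", recon, sr)
```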
## Environment Setup
```
mv Qwen2.5-Omni_pytorch Qwen2.5-Omni # drop the framework-name suffix
```
### Docker (Method 1)
```
docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.4.1-ubuntu22.04-dtk25.04-py3.10-fixpy
# Replace <your IMAGE ID> below with the ID of the Docker image pulled above; for this image it is e77c15729879
docker run -it --shm-size=64G -v $PWD/Qwen2.5-Omni:/home/Qwen2.5-Omni -v /opt/hyhal:/opt/hyhal:ro --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name qomni <your IMAGE ID> bash
cd /home/Qwen2.5-Omni
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple
unzip f742a644ca32e65758c3adb36225aef1731bd2a8.zip
cd transformers-f742a644ca32e65758c3adb36225aef1731bd2a8
pip install -e . # the upstream authors require transformers==4.50.0.dev0
```
### Dockerfile (Method 2)
```
cd Qwen2.5-Omni/docker
docker build --no-cache -t qomni:latest .
docker run --shm-size=64G --name qomni -v /opt/hyhal:/opt/hyhal:ro --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video -v $PWD/../../Qwen2.5-Omni:/home/Qwen2.5-Omni -it qomni bash
# If installing the environment through the Dockerfile takes a long time, comment out the pip install inside it and install the Python libraries after the container starts: pip install -r requirements.txt
cd /home/Qwen2.5-Omni
unzip f742a644ca32e65758c3adb36225aef1731bd2a8.zip
cd transformers-f742a644ca32e65758c3adb36225aef1731bd2a8
pip install -e . # the upstream authors require transformers==4.50.0.dev0
```
### Anaconda (Method 3)
1. The special deep learning libraries this project needs for DCU GPUs can be downloaded and installed from the 光合 Developer Community:
- https://developer.hpccube.com/tool/
```
DTK driver: dtk2504
python:python3.10
torch:2.4.1
torchvision:0.19.1
triton:3.0.0
vllm:0.6.2
flash-attn:2.6.1
deepspeed:0.14.2
apex:1.4.0
transformers:4.50.0.dev0
```
`Tip: the versions of the DCU-related tools above (DTK driver, python, torch, etc.) must correspond to each other exactly.`
2. Install the remaining, non-special libraries according to requirements.txt
```
cd /home/Qwen2.5-Omni
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple
cd /home/Qwen2.5-Omni
unzip f742a644ca32e65758c3adb36225aef1731bd2a8.zip
cd transformers-f742a644ca32e65758c3adb36225aef1731bd2a8
pip install -e . # the upstream authors require transformers==4.50.0.dev0
```
## Dataset
`None`
## Training
## Inference
Directory structure of the pretrained weights:
```
/home/Qwen2.5-Omni
└── Qwen/Qwen2.5-Omni-7B
```
### Single-node multi-GPU
```
python infer_transformers.py
# The vLLM version requires adapting many low-level components; it will be released later.
```
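For a rough picture of what `infer_transformers.py` does, the cookbooks bundled with this project follow the pattern sketched below (a minimal sketch that assumes the pinned transformers commit and the weight directory shown above; the actual script may differ in prompts and options):
```
import torch
from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor
from qwen_omni_utils import process_mm_info

model_path = "Qwen/Qwen2.5-Omni-7B"  # local weight directory laid out as above
model = Qwen2_5OmniModel.from_pretrained(model_path, torch_dtype=torch.bfloat16, device_map="auto")
processor = Qwen2_5OmniProcessor.from_pretrained(model_path)

messages = [
    {"role": "system", "content": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, "
                                  "capable of perceiving auditory and visual inputs, as well as generating text and speech."},
    {"role": "user", "content": [{"type": "video", "video": "./draw.mp4"}]},
]
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# use_audio_in_video controls whether the audio track of the video is fed to the model
audios, images, videos = process_mm_info(messages, use_audio_in_video=False)
inputs = processor(text=text, audios=audios, images=images, videos=videos,
                   return_tensors="pt", padding=True, use_audio_in_video=False)
inputs = inputs.to(model.device).to(model.dtype)

# return_audio=False returns only the text response; the Talker/vocoder path is skipped
output = model.generate(**inputs, use_audio_in_video=False, return_audio=False)
print(processor.batch_decode(output, skip_special_tokens=True)[0])
```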
For more information, see [`README_origin`](./README_origin.md) from the upstream project.
## Result
`Input:`
```
./draw.mp4
```
`Output:`
```
"system\nYou are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.\nuser\n\nassistant\nOh, that sounds like a really cool video! It's great to see someone using a tablet to draw a guitar. What do you think about the style of the drawing? Is it more realistic or more of an abstract piece? And what do you think about the use of the tablet for drawing? It seems like a fun and creative way to work."
```
### Accuracy
Inference accuracy on DCU matches that on GPU; inference framework: PyTorch.
## Application Scenarios
### Algorithm Category
`Conversational QA`
### Key Application Industries
`Manufacturing, media, finance, energy, healthcare, smart home, education`
## Pretrained Weights
Fast download center for pretrained weights: [SCNet AIModels](http://113.200.138.88:18080/aimodels). The pretrained weights used in this project can be downloaded through the fast download channel: [Qwen2.5-Omni-7B](http://113.200.138.88:18080/aimodels/qwen/Qwen2.5-Omni-7B.git)
Hugging Face download: [Qwen2.5-Omni-7B](https://huggingface.co/Qwen/Qwen2.5-Omni-7B)
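If the machine has direct access to Hugging Face, the weights can also be pulled into the directory layout expected by the Inference section with `huggingface_hub` (a sketch; it assumes `huggingface_hub` is installed and outbound network access is available):
```
from huggingface_hub import snapshot_download

# Download Qwen/Qwen2.5-Omni-7B into ./Qwen/Qwen2.5-Omni-7B (run from /home/Qwen2.5-Omni),
# matching the pretrained-weight directory structure shown above.
snapshot_download(repo_id="Qwen/Qwen2.5-Omni-7B", local_dir="Qwen/Qwen2.5-Omni-7B")
```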
## Source Repository and Issue Reporting
- http://developer.sourcefind.cn/codes/modelzoo/qwen2.5-omni_pytorch.git
## References
- https://github.com/QwenLM/Qwen2.5-Omni
{
"cells": [
{
"cell_type": "markdown",
"id": "f2c16396",
"metadata": {},
"source": [
"### Screen Recording Interaction with Qwen2.5-Omni\n",
"\n",
"This notebook demonstrates how to use Qwen2.5-Omni to get the information and content you want to know by asking questions in real time on the recording screen."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "638e9082-c1ef-4efd-9a10-e35507e25363",
"metadata": {
"execution": {
"iopub.execute_input": "2025-01-29T12:40:04.049566Z",
"iopub.status.busy": "2025-01-29T12:40:04.049365Z"
},
"tags": []
},
"outputs": [],
"source": [
"!pip install git+https://github.com/huggingface/transformers@f742a644ca32e65758c3adb36225aef1731bd2a8\n",
"!pip install qwen-omni-utils\n",
"!pip install openai\n",
"!pip install flash-attn --no-build-isolation"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9596c50d-80a8-433f-b846-1fbf61145ccc",
"metadata": {
"ExecutionIndicator": {
"show": true
},
"execution": {
"iopub.execute_input": "2025-01-29T12:40:16.511701Z",
"iopub.status.busy": "2025-01-29T12:40:16.510916Z",
"iopub.status.idle": "2025-01-29T12:40:16.878038Z",
"shell.execute_reply": "2025-01-29T12:40:16.877543Z",
"shell.execute_reply.started": "2025-01-29T12:40:16.511678Z"
},
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/envs/omni/lib/python3.10/site-packages/_distutils_hack/__init__.py:53: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml\n",
" warnings.warn(\n"
]
}
],
"source": [
"from qwen_omni_utils import process_mm_info\n",
"\n",
"# @title inference function\n",
"def inference(video_path, prompt, sys_prompt):\n",
" messages = [\n",
" {\"role\": \"system\", \"content\": sys_prompt},\n",
" {\"role\": \"user\", \"content\": [\n",
" {\"type\": \"text\", \"text\": prompt},\n",
" {\"type\": \"video\", \"video\": video_path},\n",
" ]\n",
" },\n",
" ]\n",
" text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
" # image_inputs, video_inputs = process_vision_info([messages])\n",
" audios, images, videos = process_mm_info(messages, use_audio_in_video=False)\n",
" inputs = processor(text=text, audios=audios, images=images, videos=videos, return_tensors=\"pt\", padding=True, use_audio_in_video=False)\n",
" inputs = inputs.to(model.device).to(model.dtype)\n",
"\n",
" output = model.generate(**inputs, use_audio_in_video=False, return_audio=False)\n",
"\n",
" text = processor.batch_decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=False)\n",
" return text"
]
},
{
"cell_type": "markdown",
"id": "386e4cd8",
"metadata": {},
"source": [
"Load model and processors."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "e829b782-0be7-4bc6-a576-6b815323376e",
"metadata": {
"ExecutionIndicator": {
"show": false
},
"execution": {
"iopub.execute_input": "2025-01-29T12:40:18.337731Z",
"iopub.status.busy": "2025-01-29T12:40:18.337470Z",
"iopub.status.idle": "2025-01-29T12:40:47.760976Z",
"shell.execute_reply": "2025-01-29T12:40:47.760220Z",
"shell.execute_reply.started": "2025-01-29T12:40:18.337713Z"
},
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/envs/omni/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"2025-03-22 17:14:12.353632: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
"2025-03-22 17:14:12.386228: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
"2025-03-22 17:14:12.386249: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
"2025-03-22 17:14:12.387082: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"2025-03-22 17:14:12.392644: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
"To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"2025-03-22 17:14:13.131254: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
"Loading checkpoint shards: 100%|██████████| 5/5 [00:05<00:00, 1.14s/it]\n",
"/opt/conda/envs/omni/lib/python3.10/site-packages/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py:6129: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
" for key, value in torch.load(path).items():\n"
]
}
],
"source": [
"import torch\n",
"from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor\n",
"\n",
"model_path = \"Qwen/Qwen2.5-Omni-7B\"\n",
"model = Qwen2_5OmniModel.from_pretrained(\n",
" model_path,\n",
" torch_dtype=torch.bfloat16,\n",
" device_map=\"auto\",\n",
" attn_implementation=\"flash_attention_2\",\n",
")\n",
"processor = Qwen2_5OmniProcessor.from_pretrained(model_path)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "ed93fb82",
"metadata": {},
"outputs": [],
"source": [
"from IPython.display import Video"
]
},
{
"cell_type": "markdown",
"id": "6a47ad45",
"metadata": {},
"source": [
"#### Understanding"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1935af5e",
"metadata": {
"ExecutionIndicator": {
"show": true
},
"execution": {
"iopub.execute_input": "2025-01-29T12:41:18.150397Z",
"iopub.status.busy": "2025-01-29T12:41:18.149631Z",
"iopub.status.idle": "2025-01-29T12:41:19.978329Z",
"shell.execute_reply": "2025-01-29T12:41:19.977054Z",
"shell.execute_reply.started": "2025-01-29T12:41:18.150371Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<video src=\"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/screen.mp4\" controls width=\"640\" height=\"360\">\n",
" Your browser does not support the <code>video</code> element.\n",
" </video>"
],
"text/plain": [
"<IPython.core.display.Video object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:root:System prompt modified, audio output may not work as expected. Audio output mode only works when using default system prompt 'You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.'\n",
"qwen-vl-utils using torchvision to read video.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"system\n",
"You are a helpful assistant.\n",
"user\n",
"What the browser is used in this video?\n",
"assistant\n",
"The browser used in the video is Google Chrome.\n"
]
}
],
"source": [
"video_path = \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/screen.mp4\"\n",
"prompt = \"What the browser is used in this video?\"\n",
"\n",
"display(Video(video_path, width=640, height=360))\n",
"\n",
"## Use a local HuggingFace model to inference.\n",
"response = inference(video_path, prompt=prompt, sys_prompt=\"You are a helpful assistant.\")\n",
"print(response[0])"
]
},
{
"cell_type": "markdown",
"id": "f9961aae",
"metadata": {},
"source": [
"#### OCR"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0894f5f1",
"metadata": {
"ExecutionIndicator": {
"show": true
},
"execution": {
"iopub.execute_input": "2025-01-29T12:44:01.387553Z",
"iopub.status.busy": "2025-01-29T12:44:01.386725Z",
"iopub.status.idle": "2025-01-29T12:44:09.671782Z",
"shell.execute_reply": "2025-01-29T12:44:09.671200Z",
"shell.execute_reply.started": "2025-01-29T12:44:01.387530Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<video src=\"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/screen.mp4\" controls width=\"640\" height=\"360\">\n",
" Your browser does not support the <code>video</code> element.\n",
" </video>"
],
"text/plain": [
"<IPython.core.display.Video object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:root:System prompt modified, audio output may not work as expected. Audio output mode only works when using default system prompt 'You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.'\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"system\n",
"You are a helpful assistant.\n",
"user\n",
"Who is the authors of this paper?\n",
"assistant\n",
"The authors of the paper \"Attention Is All You Need\" are:\n",
"\n",
"1. Ashish Vaswani\n",
"2. Noam Shazeer\n",
"3. Niki Parmar\n",
"4. Jakob Uszkoreit\n",
"5. Llion Jones\n",
"6. Aidan N. Gomez\n",
"7. Lukasz Kaiser\n",
"8. Illia Polosukhin\n"
]
}
],
"source": [
"video_path = \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/screen.mp4\"\n",
"prompt = \"Who is the authors of this paper?\"\n",
"\n",
"display(Video(video_path, width=640, height=360))\n",
"\n",
"## Use a local HuggingFace model to inference.\n",
"response = inference(video_path, prompt=prompt, sys_prompt=\"You are a helpful assistant.\")\n",
"print(response[0])"
]
},
{
"cell_type": "markdown",
"id": "4e9b7651",
"metadata": {},
"source": [
"#### Summarize"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "16aa3dc5",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<video src=\"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/screen.mp4\" controls width=\"640\" height=\"360\">\n",
" Your browser does not support the <code>video</code> element.\n",
" </video>"
],
"text/plain": [
"<IPython.core.display.Video object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:root:System prompt modified, audio output may not work as expected. Audio output mode only works when using default system prompt 'You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.'\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"system\n",
"You are a helpful assistant.\n",
"user\n",
"Summarize this paper in short.\n",
"assistant\n",
"The paper \"Attention Is All You Need\" introduces the Transformer model, a novel architecture for sequence-to-sequence tasks that relies entirely on self-attention mechanisms. The Transformer outperforms existing models in machine translation tasks, achieving state-of-the-art BLEU scores on WMT 2014 English-German and English-French translation tasks. The model is highly parallelizable, allowing for efficient training on large datasets and GPUs. The paper also discusses the application of the Transformer to other tasks, such as English constituency parsing, and highlights its potential for handling large inputs and outputs, such as images and audio.\n"
]
}
],
"source": [
"video_path = \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/screen.mp4\"\n",
"prompt = \"Summarize this paper in short.\"\n",
"\n",
"display(Video(video_path, width=640, height=360))\n",
"\n",
"## Use a local HuggingFace model to inference.\n",
"response = inference(video_path, prompt=prompt, sys_prompt=\"You are a helpful assistant.\")\n",
"print(response[0])"
]
},
{
"cell_type": "markdown",
"id": "8dd58bbd",
"metadata": {},
"source": [
"#### Assistant"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7cea7d11",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<video src=\"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/screen.mp4\" controls width=\"640\" height=\"360\">\n",
" Your browser does not support the <code>video</code> element.\n",
" </video>"
],
"text/plain": [
"<IPython.core.display.Video object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:root:System prompt modified, audio output may not work as expected. Audio output mode only works when using default system prompt 'You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.'\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"system\n",
"You are a helpful assistant.\n",
"user\n",
"Please trranslate the abstract of paper into Chinese.\n",
"assistant\n",
"The abstract of the paper \"Attention Is All You Need\" by Vaswani et al. (2017) is as follows:\n",
"\n",
"---\n",
"\n",
"The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based entirely on self-attention. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transformer generalizes well to other tasks by applying it successfully to English constituency parsing both with large and limited training data.\n",
"\n",
"---\n",
"\n",
"Translation:\n",
"\n",
"---\n",
"\n",
"主流的序列转换模型基于复杂的递归或卷积神经网络,包括编码器和解码器。表现最好的模型还通过注意力机制连接编码器和解码器。我们提出了一种新的简单网络架构,Transformer,完全基于自我注意力。在两个机器翻译任务上的实验表明,这些模型在质量上更优,同时更易于并行化,并且训练时间显著减少。我们的模型在WMT 2014英语到德语翻译任务上实现了28.4 BLEU,超过现有最佳结果,包括组合模型,提高了2 BLEU。在WMT 2014英语到法语翻译任务上,我们的模型在仅训练3.5天、使用八个GPU的情况下,建立了新的单模型状态最佳BLEU得分41.8,比文献中最好的模型(包括组合模型)低2 BLEU。我们展示了Transformer在其他任务上的泛化能力,成功地将其应用于英语句法解析,即使在大量和有限的训练数据下也是如此。\n"
]
}
],
"source": [
"video_path = \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/screen.mp4\"\n",
"prompt = \"Please trranslate the abstract of paper into Chinese.\"\n",
"\n",
"display(Video(video_path, width=640, height=360))\n",
"\n",
"## Use a local HuggingFace model to inference.\n",
"response = inference(video_path, prompt=prompt, sys_prompt=\"You are a helpful assistant.\")\n",
"print(response[0])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "omni",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
{
"cells": [
{
"cell_type": "markdown",
"id": "f2c16396",
"metadata": {},
"source": [
"### Video Information Extracting with Qwen2.5-Omni\n",
"\n",
"This notebook demonstrates how to use Qwen2.5-Omni to obtain information from the video stream."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "638e9082-c1ef-4efd-9a10-e35507e25363",
"metadata": {
"execution": {
"iopub.execute_input": "2025-01-29T12:40:04.049566Z",
"iopub.status.busy": "2025-01-29T12:40:04.049365Z"
},
"tags": []
},
"outputs": [],
"source": [
"!pip install git+https://github.com/huggingface/transformers@f742a644ca32e65758c3adb36225aef1731bd2a8\n",
"!pip install qwen-omni-utils\n",
"!pip install openai\n",
"!pip install flash-attn --no-build-isolation"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9596c50d-80a8-433f-b846-1fbf61145ccc",
"metadata": {
"ExecutionIndicator": {
"show": true
},
"execution": {
"iopub.execute_input": "2025-01-29T12:40:16.511701Z",
"iopub.status.busy": "2025-01-29T12:40:16.510916Z",
"iopub.status.idle": "2025-01-29T12:40:16.878038Z",
"shell.execute_reply": "2025-01-29T12:40:16.877543Z",
"shell.execute_reply.started": "2025-01-29T12:40:16.511678Z"
},
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/envs/omni/lib/python3.10/site-packages/_distutils_hack/__init__.py:53: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml\n",
" warnings.warn(\n"
]
}
],
"source": [
"from qwen_omni_utils import process_mm_info\n",
"\n",
"# @title inference function\n",
"def inference(video_path, prompt, sys_prompt):\n",
" messages = [\n",
" {\"role\": \"system\", \"content\": sys_prompt},\n",
" {\"role\": \"user\", \"content\": [\n",
" {\"type\": \"text\", \"text\": prompt},\n",
" {\"type\": \"video\", \"video\": video_path},\n",
" ]\n",
" },\n",
" ]\n",
" text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
" # image_inputs, video_inputs = process_vision_info([messages])\n",
" audios, images, videos = process_mm_info(messages, use_audio_in_video=False)\n",
" inputs = processor(text=text, audios=audios, images=images, videos=videos, return_tensors=\"pt\", padding=True, use_audio_in_video=False)\n",
" inputs = inputs.to(model.device).to(model.dtype)\n",
"\n",
" output = model.generate(**inputs, use_audio_in_video=False, return_audio=False)\n",
"\n",
" text = processor.batch_decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=False)\n",
" return text"
]
},
{
"cell_type": "markdown",
"id": "386e4cd8",
"metadata": {},
"source": [
"Load model and processors."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "e829b782-0be7-4bc6-a576-6b815323376e",
"metadata": {
"ExecutionIndicator": {
"show": false
},
"execution": {
"iopub.execute_input": "2025-01-29T12:40:18.337731Z",
"iopub.status.busy": "2025-01-29T12:40:18.337470Z",
"iopub.status.idle": "2025-01-29T12:40:47.760976Z",
"shell.execute_reply": "2025-01-29T12:40:47.760220Z",
"shell.execute_reply.started": "2025-01-29T12:40:18.337713Z"
},
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/envs/omni/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"2025-03-22 17:20:02.523530: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
"2025-03-22 17:20:02.556178: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
"2025-03-22 17:20:02.556202: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
"2025-03-22 17:20:02.557034: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"2025-03-22 17:20:02.562397: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
"To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"2025-03-22 17:20:03.318258: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
"Loading checkpoint shards: 100%|██████████| 5/5 [00:05<00:00, 1.17s/it]\n",
"/opt/conda/envs/omni/lib/python3.10/site-packages/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py:6129: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
" for key, value in torch.load(path).items():\n"
]
}
],
"source": [
"import torch\n",
"from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor\n",
"\n",
"model_path = \"Qwen/Qwen2.5-Omni-7B\"\n",
"model = Qwen2_5OmniModel.from_pretrained(\n",
" model_path,\n",
" torch_dtype=torch.bfloat16,\n",
" device_map=\"auto\",\n",
" attn_implementation=\"flash_attention_2\",\n",
")\n",
"processor = Qwen2_5OmniProcessor.from_pretrained(model_path)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "ed93fb82",
"metadata": {},
"outputs": [],
"source": [
"from IPython.display import Video"
]
},
{
"cell_type": "markdown",
"id": "6a47ad45",
"metadata": {},
"source": [
"#### Question 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1935af5e",
"metadata": {
"ExecutionIndicator": {
"show": true
},
"execution": {
"iopub.execute_input": "2025-01-29T12:41:18.150397Z",
"iopub.status.busy": "2025-01-29T12:41:18.149631Z",
"iopub.status.idle": "2025-01-29T12:41:19.978329Z",
"shell.execute_reply": "2025-01-29T12:41:19.977054Z",
"shell.execute_reply.started": "2025-01-29T12:41:18.150371Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<video src=\"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/shopping.mp4\" controls width=\"640\" height=\"360\">\n",
" Your browser does not support the <code>video</code> element.\n",
" </video>"
],
"text/plain": [
"<IPython.core.display.Video object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:root:System prompt modified, audio output may not work as expected. Audio output mode only works when using default system prompt 'You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.'\n",
"qwen-vl-utils using torchvision to read video.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"system\n",
"You are a helpful assistant.\n",
"user\n",
"How many kind of drinks can you see in the video?\n",
"assistant\n",
"There are five different kinds of drinks visible in the video.\n"
]
}
],
"source": [
"video_path = \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/shopping.mp4\"\n",
"prompt = \"How many kind of drinks can you see in the video?\"\n",
"\n",
"display(Video(video_path, width=640, height=360))\n",
"\n",
"## Use a local HuggingFace model to inference.\n",
"response = inference(video_path, prompt=prompt, sys_prompt=\"You are a helpful assistant.\")\n",
"print(response[0])"
]
},
{
"cell_type": "markdown",
"id": "f9961aae",
"metadata": {},
"source": [
"#### Question 2"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0894f5f1",
"metadata": {
"ExecutionIndicator": {
"show": true
},
"execution": {
"iopub.execute_input": "2025-01-29T12:44:01.387553Z",
"iopub.status.busy": "2025-01-29T12:44:01.386725Z",
"iopub.status.idle": "2025-01-29T12:44:09.671782Z",
"shell.execute_reply": "2025-01-29T12:44:09.671200Z",
"shell.execute_reply.started": "2025-01-29T12:44:01.387530Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<video src=\"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/shopping.mp4\" controls width=\"640\" height=\"360\">\n",
" Your browser does not support the <code>video</code> element.\n",
" </video>"
],
"text/plain": [
"<IPython.core.display.Video object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:root:System prompt modified, audio output may not work as expected. Audio output mode only works when using default system prompt 'You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.'\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"system\n",
"You are a helpful assistant.\n",
"user\n",
"How many bottles of drinks have I picked up?\n",
"assistant\n",
"You have picked up two bottles of drinks.\n"
]
}
],
"source": [
"video_path = \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/shopping.mp4\"\n",
"prompt = \"How many bottles of drinks have I picked up?\"\n",
"\n",
"display(Video(video_path, width=640, height=360))\n",
"\n",
"## Use a local HuggingFace model to inference.\n",
"response = inference(video_path, prompt=prompt, sys_prompt=\"You are a helpful assistant.\")\n",
"print(response[0])"
]
},
{
"cell_type": "markdown",
"id": "4e9b7651",
"metadata": {},
"source": [
"#### Question 3"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "16aa3dc5",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<video src=\"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/shopping.mp4\" controls width=\"640\" height=\"360\">\n",
" Your browser does not support the <code>video</code> element.\n",
" </video>"
],
"text/plain": [
"<IPython.core.display.Video object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:root:System prompt modified, audio output may not work as expected. Audio output mode only works when using default system prompt 'You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.'\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"system\n",
"You are a helpful assistant.\n",
"user\n",
"How many milliliters are there in the bottle I picked up second time?\n",
"assistant\n",
"The bottle you picked up second time contains 500 milliliters of liquid.\n"
]
}
],
"source": [
"video_path = \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/shopping.mp4\"\n",
"prompt = \"How many milliliters are there in the bottle I picked up second time?\"\n",
"\n",
"display(Video(video_path, width=640, height=360))\n",
"\n",
"## Use a local HuggingFace model to inference.\n",
"response = inference(video_path, prompt=prompt, sys_prompt=\"You are a helpful assistant.\")\n",
"print(response[0])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "omni",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.4.1-ubuntu22.04-dtk25.04-py3.10-fixpy
ENV DEBIAN_FRONTEND=noninteractive
# RUN yum update && yum install -y git cmake wget build-essential
# RUN source /opt/dtk-dtk25.04/env.sh
# Install pip-related dependencies
COPY requirements.txt requirements.txt
RUN pip3 install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
# Core dependencies
gradio==5.23.1
gradio_client==1.8.0
qwen-omni-utils==0.0.3
librosa==0.11.0
ffmpeg==1.4
ffmpeg-python==0.2.0
soundfile==0.13.1
modelscope_studio==1.2.2
# git+https://github.com/huggingface/transformers@f742a644ca32e65758c3adb36225aef1731bd2a8
accelerate
av
qwen-vl-utils[decord]
# Optional dependency
# Uncomment the following line if you need flash-attn
# flash-attn==2.7.4.post1
# Dockerfile of qwenllm/qwen-omni:2.5-cu121
ARG CUDA_VERSION=12.1.0
ARG from=nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu22.04
FROM ${from} as base
ARG DEBIAN_FRONTEND=noninteractive
RUN <<EOF
apt update -y && apt upgrade -y && apt install -y --no-install-recommends \
git \
git-lfs \
python3 \
python3-pip \
python3-dev \
wget \
vim \
libsndfile1 \
ccache \
software-properties-common \
ffmpeg \
&& rm -rf /var/lib/apt/lists/*
EOF
RUN wget https://github.com/Kitware/CMake/releases/download/v3.26.1/cmake-3.26.1-Linux-x86_64.sh \
-q -O /tmp/cmake-install.sh \
&& chmod u+x /tmp/cmake-install.sh \
&& mkdir /opt/cmake-3.26.1 \
&& /tmp/cmake-install.sh --skip-license --prefix=/opt/cmake-3.26.1 \
&& rm /tmp/cmake-install.sh \
&& ln -s /opt/cmake-3.26.1/bin/* /usr/local/bin
RUN ln -s /usr/bin/python3 /usr/bin/python
RUN git lfs install
FROM base as dev
WORKDIR /
RUN mkdir -p /data/shared/Qwen
WORKDIR /data/shared/Qwen/
FROM dev as bundle_req
RUN --mount=type=cache,target=/root/.cache/pip pip3 install networkx==3.1
RUN --mount=type=cache,target=/root/.cache/pip pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 xformers==0.0.29.post2
RUN --mount=type=cache,target=/root/.cache/pip pip3 install git+https://github.com/huggingface/transformers@f742a644ca32e65758c3adb36225aef1731bd2a8 \
&& pip3 install accelerate qwen-omni-utils modelscope_studio
FROM bundle_req as bundle_vllm
ARG BUNDLE_FLASH_ATTENTION=true
ENV MAX_JOBS=8
ENV NVCC_THREADS=1
ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
ENV VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real"
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \
if [ "$BUNDLE_FLASH_ATTENTION" = "true" ]; then \
mkdir -p /data/shared/code \
&& pip install ninja \
&& cd /data/shared/code \
&& git clone https://github.com/Dao-AILab/flash-attention.git \
&& cd flash-attention \
&& python setup.py install \
&& cd /data/shared/Qwen \
&& rm -rf /data/shared/code/flash-attention; \
fi
ARG BUNDLE_VLLM=true
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \
if [ "$BUNDLE_VLLM" = "true" ]; then \
mkdir -p /data/shared/code \
&& cd /data/shared/code \
&& git clone -b qwen2_omni_public_v1 https://github.com/fyabc/vllm.git \
&& cd vllm \
&& git checkout d40f54fc2f1524458669048cb40a8d0286f5d1d2 \
&& python3 use_existing_torch.py \
&& pip3 install setuptools_scm \
&& pip3 install -r requirements/cuda.txt \
&& pip3 install . --no-build-isolation \
&& cd /data/shared/Qwen \
&& rm -rf /data/shared/code/vllm; \
fi
RUN --mount=type=cache,target=/root/.cache/pip \
pip3 install \
gradio==5.23.1 \
gradio_client==1.8.0 \
librosa==0.11.0 \
ffmpeg==1.4 \
ffmpeg-python==0.2.0 \
soundfile==0.13.1 \
av
RUN rm -rvf /root/.cache/pip
COPY ../web_demo.py ./
EXPOSE 80
#!/usr/bin/env bash
#
# This script will automatically pull the Docker image from DockerHub and start a daemon container to run the Qwen2.5-Omni web demo.
IMAGE_NAME=qwenllm/qwen-omni:2.5-cu121
QWEN_CHECKPOINT_PATH=/path/to/Qwen2.5-Omni-7B
PORT=8901
CONTAINER_NAME=qwen2.5-omni
FLASH_ATTN=0
function usage() {
echo '
Usage: bash docker/docker_web_demo.sh [-i IMAGE_NAME] -c [/path/to/Qwen-Instruct] [-n CONTAINER_NAME] [--port PORT] [--flash-attn2]
'
}
while [[ "$1" != "" ]]; do
case $1 in
-i | --image-name )
shift
IMAGE_NAME=$1
;;
-c | --checkpoint )
shift
QWEN_CHECKPOINT_PATH=$1
;;
-n | --container-name )
shift
CONTAINER_NAME=$1
;;
--port )
shift
PORT=$1
;;
--flash-attn2 )
FLASH_ATTN=1
;;
-h | --help )
usage
exit 0
;;
* )
echo "Unknown argument ${1}"
exit 1
;;
esac
shift
done
if [ ! -e ${QWEN_CHECKPOINT_PATH}/config.json ]; then
echo "Checkpoint config.json file not found in ${QWEN_CHECKPOINT_PATH}, exit."
exit 1
fi
sudo docker pull ${IMAGE_NAME} || {
echo "Pulling image ${IMAGE_NAME} failed, exit."
exit 1
}
WEB_DEMO_ARGS="--server-port 8901 --server-name 0.0.0.0 -c /data/shared/Qwen/Qwen2.5-Omni-7B"
if [ ${FLASH_ATTN} -eq 1 ]; then
WEB_DEMO_ARGS+=" --flash-attn2"
fi
sudo docker run --gpus all -d --restart always --name ${CONTAINER_NAME} \
-v /var/run/docker.sock:/var/run/docker.sock -p ${PORT}:8901 \
--mount type=bind,source=${QWEN_CHECKPOINT_PATH},target=/data/shared/Qwen/Qwen2.5-Omni-7B \
-it ${IMAGE_NAME} \
python web_demo.py ${WEB_DEMO_ARGS} && {
echo "Successfully started web demo. Open 'http://localhost:${PORT}' to try!
Run \`docker logs ${CONTAINER_NAME}\` to check demo status.
Run \`docker rm -f ${CONTAINER_NAME}\` to stop and remove the demo."
}