v1.0

f8d86cb0 · chenzk · f8d86cb0 · f8d86cb0 · f8d86cb0 · f8d86cb0
Commit f8d86cb0 authored Apr 02, 2025 by chenzk
15 changed files
--- a/draw.mp4
+++ b/draw.mp4
--- a/f742a644ca32e65758c3adb36225aef1731bd2a8.zip
+++ b/f742a644ca32e65758c3adb36225aef1731bd2a8.zip
--- a/icon.png
+++ b/icon.png
--- a/infer_transformers.py
+++ b/infer_transformers.py
+import soundfile as sf
+import torch
+from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor
+from qwen_omni_utils import process_mm_info
+'''
+FORCE_QWENVL_VIDEO_READER=decord # 强制使用decord 后端
+'''
+# default: Load the model on the available device(s)
+model = Qwen2_5OmniModel.from_pretrained("Qwen/Qwen2.5-Omni-7B", torch_dtype=torch.bfloat16, device_map="auto", attn_implementation="flash_attention_2")
+# We recommend enabling flash_attention_2 for better acceleration and memory saving.
+# model = Qwen2_5OmniModel.from_pretrained(
+#     "Qwen/Qwen2.5-Omni-7B",
+#     torch_dtype="auto",
+#     device_map="auto",
+#     attn_implementation="flash_attention_2",
+# )
+processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")
+conversation = [
+    {
+        "role": "system",
+        "content": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.",
+    },
+    {
+        "role": "user",
+        "content": [
+            {"type": "video", "video": "./draw.mp4"},
+        ],
+    },
+]
+# Preparation for inference
+text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
+audios, images, videos = process_mm_info(conversation, use_audio_in_video=True)
+inputs = processor(text=text, audios=audios, images=images, videos=videos, return_tensors="pt", padding=True)
+inputs = inputs.to(model.device).to(model.dtype)
+# Inference: Generation of the output text and audio
+text_ids, audio = model.generate(**inputs, use_audio_in_video=True)
+text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+print(text)
+sf.write(
+    "output.wav",
+    audio.reshape(-1).detach().cpu().numpy(),
+    samplerate=24000,
+)
--- a/model.properties
+++ b/model.properties
+# 模型编码
+modelCode=1478
+# 模型名称
+modelName=Qwen2.5-Omni_pytorch
+# 模型描述
+modelDescription=7B参数完成看、听、说、写，端到端多模态大模型支持文本、图像、音频和视频输入。
+# 应用场景
+appScenario=推理,对话问答,制造,广媒,金融,能源,医疗,家居,教育
+# 框架类型
+frameType=pytorch
--- a/output.wav
+++ b/output.wav
--- a/qwen-omni-utils/README.md
+++ b/qwen-omni-utils/README.md
+# qwen-omni-utils
+Qwen-Omni Utils contains a set of helper functions for processing and integrating visual and audio language information with Qwen-Omni Model.
+## Install
+```bash
+pip install qwen-omni-utils
+```
+## Usage
+### Qwen2.5Omni
+```python
+from transformers import Qwen2_5OmniModel, AutoProcessor
+from qwen_omni_utils import process_mm_info
+# You can directly insert a local file path, a URL, or a base64-encoded image into the position where you want in the text.
+messages = [
+    # Image
+    ## Local file path
+    [{"role": "user", "content": [{"type": "image", "image": "file:///path/to/your/image.jpg"}, {"type": "text", "text": "Describe this image."}]}],
+    ## Image URL
+    [{"role": "user", "content": [{"type": "image", "image": "http://path/to/your/image.jpg"}, {"type": "text", "text": "Describe this image."}]}],
+    ## Base64 encoded image
+    [{"role": "user", "content": [{"type": "image", "image": "data:image;base64,/9j/..."}, {"type": "text", "text": "Describe this image."}]}],
+    ## PIL.Image.Image
+    [{"role": "user", "content": [{"type": "image", "image": pil_image}, {"type": "text", "text": "Describe this image."}]}],
+    ## Model dynamically adjusts image size, specify dimensions if required.
+    [{"role": "user", "content": [{"type": "image", "image": "file:///path/to/your/image.jpg", "resized_height": 280, "resized_width": 420}, {"type": "text", "text": "Describe this image."}]}],
+    # Video
+    ## Local video path
+    [{"role": "user", "content": [{"type": "video", "video": "file:///path/to/video1.mp4"}, {"type": "text", "text": "Describe this video."}]}],
+    ## Local video frames
+    [{"role": "user", "content": [{"type": "video", "video": ["file:///path/to/extracted_frame1.jpg", "file:///path/to/extracted_frame2.jpg", "file:///path/to/extracted_frame3.jpg"],}, {"type": "text", "text": "Describe this video."},],}],
+    ## Model dynamically adjusts video nframes, video height and width. specify args if required.
+    [{"role": "user", "content": [{"type": "video", "video": "file:///path/to/video1.mp4", "fps": 2.0, "resized_height": 280, "resized_width": 280}, {"type": "text", "text": "Describe this video."}]}],
+    # Audio
+    ## Local audio path
+    [{"role": "user", "content": [{"type": "audio", "audio": "file:///path/to/audio1.wav"}, {"type": "text", "text": "Describe this audio."}]}],
+    ## Numpy format audio
+    [{"role": "user", "content": [{"type": "audio", "audio": numpy_audio}, {"type": "text", "text": "Describe this audio."}]}],
+    ## Remote audio
+    [{"role": "user", "content": [{"type": "audio", "audio": "https://path/to/audio.wav"}, {"type": "text", "text": "Describe this audio."}]}],
+]
+processor = AutoProcessor.from_pretrained(model_path)
+model = Qwen2_5OmniModel.from_pretrained(model_path, torch_dtype="auto", device_map="auto")
+text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+audios, images, videos = process_mm_info(messages, use_audio_in_video=False)
+inputs = processor(text=text, images=images, videos=videos, audios=audios, padding=True, return_tensors="pt")
+print(inputs)
+generated_ids, generate_wav = model.generate(**inputs)
+print(generated_ids)
+```
+### Qwen2VL
+```python
+from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
+from qwen_omni_utils import process_vision_info
+# You can directly insert a local file path, a URL, or a base64-encoded image into the position where you want in the text.
+messages = [
+    # Image
+    ## Local file path
+    [{"role": "user", "content": [{"type": "image", "image": "file:///path/to/your/image.jpg"}, {"type": "text", "text": "Describe this image."}]}],
+    ## Image URL
+    [{"role": "user", "content": [{"type": "image", "image": "http://path/to/your/image.jpg"}, {"type": "text", "text": "Describe this image."}]}],
+    ## Base64 encoded image
+    [{"role": "user", "content": [{"type": "image", "image": "data:image;base64,/9j/..."}, {"type": "text", "text": "Describe this image."}]}],
+    ## PIL.Image.Image
+    [{"role": "user", "content": [{"type": "image", "image": pil_image}, {"type": "text", "text": "Describe this image."}]}],
+    ## Model dynamically adjusts image size, specify dimensions if required.
+    [{"role": "user", "content": [{"type": "image", "image": "file:///path/to/your/image.jpg", "resized_height": 280, "resized_width": 420}, {"type": "text", "text": "Describe this image."}]}],
+    # Video
+    ## Local video path
+    [{"role": "user", "content": [{"type": "video", "video": "file:///path/to/video1.mp4"}, {"type": "text", "text": "Describe this video."}]}],
+    ## Local video frames
+    [{"role": "user", "content": [{"type": "video", "video": ["file:///path/to/extracted_frame1.jpg", "file:///path/to/extracted_frame2.jpg", "file:///path/to/extracted_frame3.jpg"],}, {"type": "text", "text": "Describe this video."},],}],
+    ## Model dynamically adjusts video nframes, video height and width. specify args if required.
+    [{"role": "user", "content": [{"type": "video", "video": "file:///path/to/video1.mp4", "fps": 2.0, "resized_height": 280, "resized_width": 280}, {"type": "text", "text": "Describe this video."}]}],
+]
+processor = AutoProcessor.from_pretrained(model_path)
+model = Qwen2VLForConditionalGeneration.from_pretrained(model_path, torch_dtype="auto", device_map="auto")
+text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+images, videos = process_vision_info(messages)
+inputs = processor(text=text, images=images, videos=videos, padding=True, return_tensors="pt")
+print(inputs)
+generated_ids = model.generate(**inputs)
+print(generated_ids)
+```
+### Qwen2.5VL
+```python
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+from qwen_omni_utils import process_vision_info
+# You can set the maximum tokens for a video through the environment variable VIDEO_MAX_PIXELS
+# based on the maximum tokens that the model can accept.
+# export VIDEO_MAX_PIXELS=22579200  # i.e. 32000 * 28 * 28 * 0.9 (a shell `export` takes no spaces or arithmetic)
+# You can directly insert a local file path, a URL, or a base64-encoded image into the position where you want in the text.
+messages = [
+    # Image
+    ## Local file path
+    [{"role": "user", "content": [{"type": "image", "image": "file:///path/to/your/image.jpg"}, {"type": "text", "text": "Describe this image."}]}],
+    ## Image URL
+    [{"role": "user", "content": [{"type": "image", "image": "http://path/to/your/image.jpg"}, {"type": "text", "text": "Describe this image."}]}],
+    ## Base64 encoded image
+    [{"role": "user", "content": [{"type": "image", "image": "data:image;base64,/9j/..."}, {"type": "text", "text": "Describe this image."}]}],
+    ## PIL.Image.Image
+    [{"role": "user", "content": [{"type": "image", "image": pil_image}, {"type": "text", "text": "Describe this image."}]}],
+    ## Model dynamically adjusts image size, specify dimensions if required.
+    [{"role": "user", "content": [{"type": "image", "image": "file:///path/to/your/image.jpg", "resized_height": 280, "resized_width": 420}, {"type": "text", "text": "Describe this image."}]}],
+    # Video
+    ## Local video path
+    [{"role": "user", "content": [{"type": "video", "video": "file:///path/to/video1.mp4"}, {"type": "text", "text": "Describe this video."}]}],
+    ## Local video frames
+    [{"role": "user", "content": [{"type": "video", "video": ["file:///path/to/extracted_frame1.jpg", "file:///path/to/extracted_frame2.jpg", "file:///path/to/extracted_frame3.jpg"],}, {"type": "text", "text": "Describe this video."},],}],
+    ## Model dynamically adjusts video nframes, video height and width. specify args if required.
+    [{"role": "user", "content": [{"type": "video", "video": "file:///path/to/video1.mp4", "fps": 2.0, "resized_height": 280, "resized_width": 280}, {"type": "text", "text": "Describe this video."}]}],
+]
+processor = AutoProcessor.from_pretrained(model_path)
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_path, torch_dtype="auto", device_map="auto")
+text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+images, videos, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
+inputs = processor(text=text, images=images, videos=videos, padding=True, return_tensors="pt", **video_kwargs)
+print(inputs)
+generated_ids = model.generate(**inputs)
+print(generated_ids)
+```
\ No newline at end of file
--- a/qwen-omni-utils/pyproject.toml
+++ b/qwen-omni-utils/pyproject.toml
+[project]
+name = "qwen-omni-utils"
+version = "0.0.3"
+description = "Qwen Omni Language Model Utils - PyTorch"
+authors = [
+    { name = "Qwen Team", email = "lvyuanjun.lyj@alibaba-inc.com" },
+]
+dependencies = [
+    "requests",
+    "pillow",
+    "av",
+    "packaging",
+    "librosa",
+]
+readme = "README.md"
+requires-python = ">= 3.8"
+license = {text = "Apache-2.0"}
+keywords = [
+    'large language model',
+    'vision language model',
+    'qwen-omni',
+    'pytorch',
+]
+classifiers = [
+    'Development Status :: 4 - Beta',
+    'Topic :: Scientific/Engineering :: Artificial Intelligence',
+    'Programming Language :: Python :: 3',
+    'License :: OSI Approved :: Apache Software License',
+]
+[project.urls]
+Homepage = "https://github.com/QwenLM/Qwen2-VL/tree/main/qwen-vl-utils"
+Repository = "https://github.com/QwenLM/Qwen2-VL.git"
+Issues = "https://github.com/QwenLM/Qwen2-VL/issues"
+[project.optional-dependencies]
+decord = [
+    "decord",
+]
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[tool.rye]
+managed = true
+dev-dependencies = [
+    "torch",
+    "torchvision",
+    "torchaudio",
+]
+[tool.hatch.metadata]
+allow-direct-references = true
+[tool.hatch.build.targets.wheel]
+packages = ["src/qwen_omni_utils"]
+[tool.ruff]
+line-length = 119
+[tool.ruff.lint]
+ignore = ["C408", "C901", "E501", "E731", "E741", "W605"]
+select = ["C", "E", "F", "I", "W"]
+[tool.ruff.lint.per-file-ignores]
+"__init__.py" = ["E402", "F401", "F403", "F811"]
+[tool.ruff.lint.isort]
+lines-after-imports = 2
+known-first-party = ["qwen_omni_utils"]
+[tool.ruff.format]
+quote-style = "double"
+indent-style = "space"
+skip-magic-trailing-comma = false
+line-ending = "auto"
\ No newline at end of file
--- a/qwen-omni-utils/src/qwen_omni_utils/__init__.py
+++ b/qwen-omni-utils/src/qwen_omni_utils/__init__.py
+from .v2_5 import *
--- a/qwen-omni-utils/src/qwen_omni_utils/v2_5/__init__.py
+++ b/qwen-omni-utils/src/qwen_omni_utils/v2_5/__init__.py
+from .audio_process import process_audio_info
+from .vision_process import (
+    extract_vision_info,
+    fetch_image,
+    fetch_video,
+    process_vision_info,
+    smart_resize,
+)
+def process_mm_info(conversations, use_audio_in_video, return_video_kwargs=False):
+    audios = process_audio_info(conversations, use_audio_in_video)
+    vision = process_vision_info(conversations, return_video_kwargs=return_video_kwargs)
+    return (audios,) + vision
--- a/qwen-omni-utils/src/qwen_omni_utils/v2_5/audio_process.py
+++ b/qwen-omni-utils/src/qwen_omni_utils/v2_5/audio_process.py
--- a/qwen-omni-utils/src/qwen_omni_utils/v2_5/vision_process.py
+++ b/qwen-omni-utils/src/qwen_omni_utils/v2_5/vision_process.py
--- a/requirements.txt
+++ b/requirements.txt
+# Core dependencies
+gradio==5.23.1
+gradio_client==1.8.0
+qwen-omni-utils==0.0.3
+librosa==0.11.0
+ffmpeg==1.4
+ffmpeg-python==0.2.0
+soundfile==0.13.1
+modelscope_studio==1.2.2
+# git+https://github.com/huggingface/transformers@f742a644ca32e65758c3adb36225aef1731bd2a8
+accelerate
+av
+qwen-vl-utils[decord]
+# Optional dependency
+# Uncomment the following line if you need flash-attn
+# flash-attn==2.7.4.post1
--- a/requirements_web_demo.txt
+++ b/requirements_web_demo.txt
--- a/web_demo.py
+++ b/web_demo.py