Add Qwen3-TTS

e3cdb305 · weishb · e3cdb305 · e3cdb305 · e3cdb305 · e3cdb305
Commit e3cdb305 authored Feb 09, 2026 by weishb
15 changed files
--- a/.gitattributes
+++ b/.gitattributes
+# 统一所有文本文件使用 LF 换行符（Linux 风格）
+* text=auto eol=lf
+
+# 显式标记二进制文件（避免误判）
+*.png binary
+*.jpg binary
+*.wav binary
+*.whl binary
+*.zip binary
\ No newline at end of file
--- a/LICENSE.txt
+++ b/LICENSE.txt
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2026 Alibaba Cloud
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/README.md
+++ b/README.md
+# Qwen3-TTS
+## 论文
+[Qwen3-TTS Technical Report](https://arxiv.org/abs/2601.15621)
+
+## 模型简介
+Qwen3-TTS 是由通义千问（Qwen）团队开发的一系列语音生成模型，具备强大的语音生成能力，全面支持声音克隆、音色设计、超高质量拟人化语音合成以及基于自然语言的语音控制，为开发者和用户提供了目前最丰富的语音生成功能集。
+<div align=center>
+    <img src="./doc/qwen3-tts.png"/>
+</div>
+
+Qwen3-TTS 覆盖10种主要语言（中文、英文、日文、韩文、德文、法文、俄文、葡萄牙文、西班牙文和意大利文），并提供多种方言音色配置，以满足全球化的应用需求。此外，该模型具备强大的上下文理解能力，可根据指令和文本语义自适应调节语调、语速和情感表达，并对含噪声的输入文本展现出显著增强的鲁棒性。
+
+主要特性如下：
+
+- **强大的语音表征能力**：基于自研的 Qwen3-TTS-Tokenizer-12Hz，实现对语音信号的高效声学压缩与高维语义建模，完整保留副语言信息（如语气、情绪）及声学环境特征，并通过轻量级非 DiT 架构实现高速、高保真的语音重建。
+- **通用端到端架构**：采用离散多码本语言模型（LM）架构，实现全信息端到端语音建模，彻底规避了传统“语言模型 + DiT”方案中存在的信息瓶颈与级联误差问题，显著提升模型的通用性、生成效率和性能上限。
+- **极致低延迟流式生成**：基于创新的双轨混合流式生成架构，单个模型同时支持流式与非流式生成模式。在用户仅输入单个字符后即可立即输出首个音频包，端到端合成延迟低至 97 毫秒，充分满足实时交互场景的严苛要求。
+- **智能文本理解与语音控制**：支持由自然语言指令驱动的语音生成，可灵活调控音色、情感、韵律等多维度声学属性。通过深度融合文本语义理解能力，模型能自适应调整语调、节奏与情感表达，实现“所想即所听”的拟人化语音输出。
+
+
+## 环境依赖
+- 基础环境需求如下：
+
+| 软件 | 版本 |
+| :------: | :------: |
+| DTK | 25.04.2 |
+| python | 3.10.12 |
+| transformers | 4.57.3 |
+| vllm | 0.9.2+das.opt2.dtk25042 |
+| torchaudio | 2.5.1+das.opt1.dtk25042.20251127.g10a9ffcd |
+| transformer_engine | 2.5.0+das.opt1.dtk25042 |
+
+推荐使用镜像：harbor.sourcefind.cn:5443/dcu/admin/base/vllm:0.9.2-ubuntu22.04-dtk25.04.2-1226-das1.7-py3.10-20251226
+- 挂载地址`-v`、容器名`--name`和镜像名请根据实际情况修改
+
+```bash
+docker run -it \
+    --shm-size 60g \
+    --network=host \
+    --name qwen3-tts \
+    --privileged \
+    --device=/dev/kfd \
+    --device=/dev/dri \
+    --device=/dev/mkfd \
+    --group-add video \
+    --cap-add=SYS_PTRACE \
+    --security-opt seccomp=unconfined \
+    -u root \
+    -v /opt/hyhal/:/opt/hyhal/:ro \
+    -v /path/your_code_data/:/path/your_code_data/ \
+    harbor.sourcefind.cn:5443/dcu/admin/base/vllm:0.9.2-ubuntu22.04-dtk25.04.2-1226-das1.7-py3.10-20251226 bash
+```
+更多镜像可前往[光源](https://sourcefind.cn/#/service-list)下载使用。
+
+关于本项目DCU显卡所需的特殊深度学习库可从[光合](https://developer.sourcefind.cn/tool/)开发者社区下载安装。
+
+其它包参照requirements.txt安装：
+```
+pip install -r requirements.txt
+```
+
+镜像内其他环境配置
+```
+1.重新安装torchaudio
+    pip uninstall torchaudio
+    pip install torchaudio-2.5.1+das.opt1.dtk25042.20251127.g10a9ffcd-cp310-cp310-manylinux_2_28_x86_64.whl
+
+2.解压vllm.zip到/usr/local/lib/python3.10/dist-packages直接覆盖需要修改的文件
+    unzip -o vllm.zip -d /usr/local/lib/python3.10/dist-packages
+```
+
+
+## 数据集
+暂无
+
+## 训练
+暂无
+
+## 推理
+
+### transformers
+#### 单机推理
+```
+# VoiceDesign 推理
+python test_model_12hz_voice_design.py
+
+# CustomVoice 推理
+python test_model_12hz_custom_voice.py
+
+# Voice Clone 推理
+python test_model_12hz_base.py
+```
+### vllm
+#### 单机推理（以VoiceDesign为例子，CustomVoice和Voice Clone需要切换模型）
+启动服务
+```bash
+VLLM_USE_V1=0  python -m vllm.entrypoints.openai.api_server --model Qwen3-TTS/Qwen3-TTS-12Hz-1.7B-VoiceDesign --served-model-name qwen3-tts --host 0.0.0.0 --port 8000 --trust-remote-code --dtype bfloat16 --disable-async-output-proc
+```
+调用服务：
+```
+# VoiceDesign
+curl -sS http://127.0.0.1:8000/v1/audio/speech \
+  -H "Content-Type: application/json" \
+  -o output.wav \
+  -d '{
+    "model":"qwen3-tts",
+    "text":"哥哥，你回来啦，人家等了你好久好久了，要抱抱！",
+    "task_type":"VoiceDesign",
+    "language":"Auto",
+    "instruct":"体现撒娇稚嫩的萝莉女声，音调偏高且起伏明显，营造出黏人、做作又刻意卖萌的听觉效果。",
+    "generation_params":{
+      "max_new_tokens":4096,
+      "do_sample":true,
+      "top_k":50,
+      "top_p":1.0,
+      "temperature":0.9
+    },
+    "response_format":"wav"
+  }'
+
+# CustomVoice
+curl -sS http://127.0.0.1:8000/v1/audio/speech \
+  -H "Content-Type: application/json" \
+  -o output_customvoice.wav \
+  -d '{
+    "model":"qwen3-tts",
+    "text":"哥哥，你回来啦，人家等了你好久好久了，要抱抱！",
+    "task_type":"CustomVoice",
+    "speaker":"YourSpeakerName",
+    "language":"Auto",
+    "instruct":"",
+    "generation_params":{
+      "max_new_tokens":4096,
+      "do_sample":true,
+      "top_k":50,
+      "top_p":1.0,
+      "temperature":0.9
+    },
+    "response_format":"wav"
+  }'
+
+# Voice Clone
+curl -sS http://127.0.0.1:8000/v1/audio/speech \
+  -H "Content-Type: application/json" \
+  -o output_clone_icl.wav \
+  -d '{
+    "model":"qwen3-tts",
+    "text":"今天的风很温柔，我们一起出去走走吧。",
+    "task_type":"Base",
+    "language":"Auto",
+    "ref_audio":"/path/to/ref.wav",
+    "ref_text":"参考音频对应的文本内容",
+    "x_vector_only_mode":false,
+    "generation_params":{
+      "max_new_tokens":4096,
+      "do_sample":true,
+      "top_k":50,
+      "top_p":1.0,
+      "temperature":0.9
+    },
+    "response_format":"wav"
+  }'
+
+
+```
+
+
+## 效果展示
+
+示例输出音频：`output_audio/output.wav`
+
+### 精度
+`DCU与GPU精度一致，推理框架：vllm`
+
+## 预训练权重
+| 模型名称  | 权重大小  | DCU型号  | 最低卡数需求 |下载地址|
+|:-----:|:----------:|:----------:|:---------------------:|:----------:|
+| Qwen3-TTS-12Hz-1.7B-VoiceDesign | 1.7B | K100AI | 1 | [Modelscope](https://www.modelscope.cn/models/Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign) |
+| Qwen3-TTS-12Hz-1.7B-CustomVoice | 1.7B | K100AI | 1 | [Modelscope](https://www.modelscope.cn/models/Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice) |
+| Qwen3-TTS-12Hz-1.7B-Base | 1.7B | K100AI | 1 | [Modelscope](https://www.modelscope.cn/models/Qwen/Qwen3-TTS-12Hz-1.7B-Base) |
+
+## 源码仓库及问题反馈
+- https://developer.sourcefind.cn/codes/weishb/qwen3-tts_pytorch
+
+## 参考资料
+- https://github.com/QwenLM/Qwen3-TTS
\ No newline at end of file
--- a/README_origin.md
+++ b/README_origin.md
--- a/doc/iocn.png
+++ b/doc/iocn.png
--- a/doc/qwen3-tts.png
+++ b/doc/qwen3-tts.png
--- a/model.properties
+++ b/model.properties
+# 模型唯一标识
+modelCode=2047
+# 模型名称
+modelName=Qwen3-TTS_pytorch
+# 模型描述
+modelDescription=由通义开发的一系列强大的语音生成模型，支持声音克隆、声音设计、高质量拟人声生成和基于自然语言的语音控制。
+# 运行过程
+processType=推理
+# 算法类别
+appCategory=语音合成
+# 框架类型
+frameType=vllm
+# 加速卡类型
+accelerateType=K100AI
--- a/output_audio/output.wav
+++ b/output_audio/output.wav
--- a/requirements.txt
+++ b/requirements.txt
+soundfile
+librosa
+sox
+transformers==4.57.3
+qwen-tts
\ No newline at end of file
--- a/run_server.sh
+++ b/run_server.sh
+#!/usr/bin/env bash
+set -euo pipefail
+
+SERVER="${SERVER:-http://127.0.0.1:8000}"
+MODEL="${MODEL:-qwen3-tts}"
+LANGUAGE="${LANGUAGE:-Auto}"
+
+curl -sS "${SERVER}/v1/audio/speech" \
+  -H "Content-Type: application/json" \
+  -o output.wav \
+  -d @- <<EOF
+{
+  "model": "${MODEL}",
+  "text": "哥哥，你回来啦，人家等了你好久好久了，要抱抱！",
+  "task_type": "VoiceDesign",
+  "language": "${LANGUAGE}",
+  "instruct": "体现撒娇稚嫩的萝莉女声，音调偏高且起伏明显，营造出黏人、做作又刻意卖萌的听觉效果。",
+  "generation_params": {
+    "max_new_tokens": 4096,
+    "do_sample": true,
+    "top_k": 50,
+    "top_p": 1.0,
+    "temperature": 0.9
+  },
+  "response_format": "wav"
+}
+EOF
+
--- a/test_model_12hz_base.py
+++ b/test_model_12hz_base.py
+# coding=utf-8
+# Copyright 2026 The Alibaba Qwen team.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import time
+import torch
+import soundfile as sf
+
+from qwen_tts import Qwen3TTSModel
+
+
+def ensure_dir(d: str):
+    os.makedirs(d, exist_ok=True)
+
+
+def run_case(tts: Qwen3TTSModel, out_dir: str, case_name: str, call_fn):
+    torch.cuda.synchronize()
+    t0 = time.time()
+
+    wavs, sr = call_fn()
+
+    torch.cuda.synchronize()
+    t1 = time.time()
+    print(f"[{case_name}] time: {t1 - t0:.3f}s, n_wavs={len(wavs)}, sr={sr}")
+
+    for i, w in enumerate(wavs):
+        sf.write(os.path.join(out_dir, f"{case_name}_{i}.wav"), w, sr)
+
+
+def main():
+    device = "cuda:0"
+    MODEL_PATH = "Qwen/Qwen3-TTS-12Hz-1.7B-Base/"
+    OUT_DIR = "qwen3_tts_test_voice_clone_output_wav"
+    ensure_dir(OUT_DIR)
+
+    tts = Qwen3TTSModel.from_pretrained(
+        MODEL_PATH,
+        device_map=device,
+        dtype=torch.bfloat16,
+        attn_implementation="flash_attention_2",
+    )
+
+    # Reference audio(s)
+    ref_audio_path_1 = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-TTS-Repo/clone_2.wav"
+    ref_audio_path_2 = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-TTS-Repo/clone_1.wav"
+
+    ref_audio_single = ref_audio_path_1
+    ref_audio_batch = [ref_audio_path_1, ref_audio_path_2]
+
+    ref_text_single = "Okay. Yeah. I resent you. I love you. I respect you. But you know what? You blew it! And thanks to you."
+    ref_text_batch = [
+        "Okay. Yeah. I resent you. I love you. I respect you. But you know what? You blew it! And thanks to you.",
+        "甚至出现交易几乎停滞的情况。",
+    ]
+
+    # Synthesis targets
+    syn_text_single = "Good one. Okay, fine, I'm just gonna leave this sock monkey here. Goodbye."
+    syn_lang_single = "Auto"
+
+    syn_text_batch = [
+        "Good one. Okay, fine, I'm just gonna leave this sock monkey here. Goodbye.",
+        "其实我真的有发现，我是一个特别善于观察别人情绪的人。",
+    ]
+    syn_lang_batch = ["Chinese", "English"]
+
+    common_gen_kwargs = dict(
+        max_new_tokens=2048,
+        do_sample=True,
+        top_k=50,
+        top_p=1.0,
+        temperature=0.9,
+        repetition_penalty=1.05,
+        subtalker_dosample=True,
+        subtalker_top_k=50,
+        subtalker_top_p=1.0,
+        subtalker_temperature=0.9,
+    )
+
+    for xvec_only in [False, True]:
+        mode_tag = "xvec_only" if xvec_only else "icl"
+
+        # Case 1: prompt single + synth single, direct
+        run_case(
+            tts, OUT_DIR, f"case1_promptSingle_synSingle_direct_{mode_tag}",
+            lambda: tts.generate_voice_clone(
+                text=syn_text_single,
+                language=syn_lang_single,
+                ref_audio=ref_audio_single,
+                ref_text=ref_text_single,
+                x_vector_only_mode=xvec_only,
+                **common_gen_kwargs,
+            ),
+        )
+
+        # Case 1b: prompt single + synth single, via create_voice_clone_prompt
+        def _case1b():
+            prompt_items = tts.create_voice_clone_prompt(
+                ref_audio=ref_audio_single,
+                ref_text=ref_text_single,
+                x_vector_only_mode=xvec_only,
+            )
+            return tts.generate_voice_clone(
+                text=syn_text_single,
+                language=syn_lang_single,
+                voice_clone_prompt=prompt_items,
+                **common_gen_kwargs,
+            )
+
+        run_case(
+            tts, OUT_DIR, f"case1_promptSingle_synSingle_promptThenGen_{mode_tag}",
+            _case1b,
+        )
+
+        # Case 2: prompt single + synth batch, direct
+        run_case(
+            tts, OUT_DIR, f"case2_promptSingle_synBatch_direct_{mode_tag}",
+            lambda: tts.generate_voice_clone(
+                text=syn_text_batch,
+                language=syn_lang_batch,
+                ref_audio=ref_audio_single,
+                ref_text=ref_text_single,
+                x_vector_only_mode=xvec_only,
+                **common_gen_kwargs,
+            ),
+        )
+
+        # Case 2b: prompt single + synth batch, via create_voice_clone_prompt
+        def _case2b():
+            prompt_items = tts.create_voice_clone_prompt(
+                ref_audio=ref_audio_single,
+                ref_text=ref_text_single,
+                x_vector_only_mode=xvec_only,
+            )
+            return tts.generate_voice_clone(
+                text=syn_text_batch,
+                language=syn_lang_batch,
+                voice_clone_prompt=prompt_items,
+                **common_gen_kwargs,
+            )
+
+        run_case(
+            tts, OUT_DIR, f"case2_promptSingle_synBatch_promptThenGen_{mode_tag}",
+            _case2b,
+        )
+
+        # Case 3: prompt batch + synth batch, direct
+        run_case(
+            tts, OUT_DIR, f"case3_promptBatch_synBatch_direct_{mode_tag}",
+            lambda: tts.generate_voice_clone(
+                text=syn_text_batch,
+                language=syn_lang_batch,
+                ref_audio=ref_audio_batch,
+                ref_text=ref_text_batch,
+                x_vector_only_mode=[xvec_only, xvec_only],
+                **common_gen_kwargs,
+            ),
+        )
+
+        # Case 3b: prompt batch + synth batch, via create_voice_clone_prompt
+        def _case3b():
+            prompt_items = tts.create_voice_clone_prompt(
+                ref_audio=ref_audio_batch,
+                ref_text=ref_text_batch,
+                x_vector_only_mode=[xvec_only, xvec_only],
+            )
+            return tts.generate_voice_clone(
+                text=syn_text_batch,
+                language=syn_lang_batch,
+                voice_clone_prompt=prompt_items,
+                **common_gen_kwargs,
+            )
+
+        run_case(
+            tts, OUT_DIR, f"case3_promptBatch_synBatch_promptThenGen_{mode_tag}",
+            _case3b,
+        )
+
+
+# Run the voice-clone test only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
--- a/test_model_12hz_custom_voice.py
+++ b/test_model_12hz_custom_voice.py
+# coding=utf-8
+# Copyright 2026 The Alibaba Qwen team.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import time
+import torch
+import soundfile as sf
+
+from qwen_tts import Qwen3TTSModel
+
+
+def main():
+    device = "cuda:0"
+    MODEL_PATH = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice/"
+
+    tts = Qwen3TTSModel.from_pretrained(
+        MODEL_PATH,
+        device_map=device,
+        dtype=torch.bfloat16,
+        attn_implementation="flash_attention_2",
+    )
+
+    # -------- Single (with instruct) --------
+    torch.cuda.synchronize()
+    t0 = time.time()
+
+    wavs, sr = tts.generate_custom_voice(
+        text="其实我真的有发现，我是一个特别善于观察别人情绪的人。",
+        language="Chinese",
+        speaker="Vivian",
+        instruct="用特别愤怒的语气说",
+    )
+
+    torch.cuda.synchronize()
+    t1 = time.time()
+    print(f"[CustomVoice Single] time: {t1 - t0:.3f}s")
+
+    sf.write("qwen3_tts_test_custom_single.wav", wavs[0], sr)
+
+    # -------- Batch (some empty instruct) --------
+    texts = ["其实我真的有发现，我是一个特别善于观察别人情绪的人。", "She said she would be here by noon."]
+    languages = ["Chinese", "English"]
+    speakers = ["Vivian", "Ryan"]
+    instructs = ["", "Very happy."]
+
+    torch.cuda.synchronize()
+    t0 = time.time()
+
+    wavs, sr = tts.generate_custom_voice(
+        text=texts,
+        language=languages,
+        speaker=speakers,
+        instruct=instructs,
+        max_new_tokens=2048,
+    )
+
+    torch.cuda.synchronize()
+    t1 = time.time()
+    print(f"[CustomVoice Batch] time: {t1 - t0:.3f}s")
+
+    for i, w in enumerate(wavs):
+        sf.write(f"qwen3_tts_test_custom_batch_{i}.wav", w, sr)
+
+
+# Run the CustomVoice test only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
--- a/test_model_12hz_voice_design.py
+++ b/test_model_12hz_voice_design.py
+# coding=utf-8
+# Copyright 2026 The Alibaba Qwen team.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import time
+import torch
+import soundfile as sf
+
+from qwen_tts import Qwen3TTSModel
+
+
+def main():
+    device = "cuda:0"
+    MODEL_PATH = "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign/"
+
+    tts = Qwen3TTSModel.from_pretrained(
+        MODEL_PATH,
+        device_map=device,
+        dtype=torch.bfloat16,
+        attn_implementation="flash_attention_2",
+    )
+
+    # -------- Single --------
+    torch.cuda.synchronize()
+    t0 = time.time()
+
+    wavs, sr = tts.generate_voice_design(
+        text="哥哥，你回来啦，人家等了你好久好久了，要抱抱！",
+        language="Chinese",
+        instruct="体现撒娇稚嫩的萝莉女声，音调偏高且起伏明显，营造出黏人、做作又刻意卖萌的听觉效果。",
+    )
+
+    torch.cuda.synchronize()
+    t1 = time.time()
+    print(f"[VoiceDesign Single] time: {t1 - t0:.3f}s")
+
+    sf.write("qwen3_tts_test_voice_design_single.wav", wavs[0], sr)
+
+    # -------- Batch --------
+    texts = [
+        "哥哥，你回来啦，人家等了你好久好久了，要抱抱！",
+        "It's in the top drawer... wait, it's empty? No way, that's impossible! I'm sure I put it there!"
+    ]
+    languages = ["Chinese", "English"]
+    instructs = [
+        "体现撒娇稚嫩的萝莉女声，音调偏高且起伏明显，营造出黏人、做作又刻意卖萌的听觉效果。",
+        "Speak in an incredulous tone, but with a hint of panic beginning to creep into your voice."
+    ]
+
+    torch.cuda.synchronize()
+    t0 = time.time()
+
+    wavs, sr = tts.generate_voice_design(
+        text=texts,
+        language=languages,
+        instruct=instructs,
+        max_new_tokens=2048,
+    )
+
+    torch.cuda.synchronize()
+    t1 = time.time()
+    print(f"[VoiceDesign Batch] time: {t1 - t0:.3f}s")
+
+    for i, w in enumerate(wavs):
+        sf.write(f"qwen3_tts_test_voice_design_batch_{i}.wav", w, sr)
+
+
+# Run the VoiceDesign test only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
\ No newline at end of file
--- a/torchaudio-2.5.1+das.opt1.dtk25042.20251127.g10a9ffcd-cp310-cp310-manylinux_2_28_x86_64.whl
+++ b/torchaudio-2.5.1+das.opt1.dtk25042.20251127.g10a9ffcd-cp310-cp310-manylinux_2_28_x86_64.whl
--- a/vllm.zip
+++ b/vllm.zip