v1.0

39ac40a9 · chenzk · 39ac40a9 · 39ac40a9 · 39ac40a9 · 39ac40a9
Commit 39ac40a9 authored Jun 06, 2025 by chenzk
20 changed files
--- a/scripts/deepspeed/sts_qwen25/finetune_sensevoice_glm4voice_mtp1_stage1.sh
+++ b/scripts/deepspeed/sts_qwen25/finetune_sensevoice_glm4voice_mtp1_stage1.sh
+#!/bin/bash
+# Launch stage 1 fine-tuning of the MTP-1 model with the
+# sensevoice_glm4voice audio tokenizer through torchrun and DeepSpeed ZeRO-2.
+# Optional positional arguments: first the maximum sequence length,
+# second the timestamp used to name the output directory.
+# NOTE: the output directory embeds the script path exactly as it was typed
+# on the command line, and the script copies itself after changing into the
+# local code directory, so it presumably has to be started with a path
+# relative to the code root - confirm against the job launcher.
+set -e
+set -x
+# Fall back to 32768 tokens when no sequence length is given.
+SEQ_LENGTH="${1:-32768}"
+# Default timestamp is the current date and hour followed by a literal 0000.
+timestamp="${2:-$(date +'%Y%m%d_%H')0000}"
+######################################################################
+export ROOT_PATH=/data/
+export CODE_PATH=${ROOT_PATH}/VITA-Audio/
+export LOCAL_ROOT_PATH=/data_local/
+export LOCAL_CODE_PATH=${LOCAL_ROOT_PATH}/VITA-Audio/
+mkdir -p "${LOCAL_ROOT_PATH}"
+mkdir -p "${LOCAL_CODE_PATH}"
+apt update
+apt install -y rsync
+# Copy the code tree from CODE_PATH to LOCAL_CODE_PATH and run from the copy.
+rsync -a --exclude ".git" --exclude ".gitee" "${CODE_PATH}/" "${LOCAL_CODE_PATH}/"
+cd "${LOCAL_CODE_PATH}"
+# Recreate the datasets symlink so it points at the data directory under ROOT_PATH.
+rm -fr datasets
+ln -s "${ROOT_PATH}/data" datasets
+######################################################################
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+# Sourcing installs the Python dependencies and exports the torchrun topology variables.
+source "${LOCAL_CODE_PATH}/scripts/set_env_ds_gpu.sh"
+pip3 install transformers==4.48.3
+#pip3 install --no-index --find-links=/data/software/ transformers==4.48.3
+######################################################################
+# Output directory layout is ROOT_PATH/output/LM/script path/timestamp.
+OUTPUT_DIR=${ROOT_PATH}/output/LM/"$0"/${timestamp}/
+mkdir -p "${OUTPUT_DIR}"
+# Keep a copy of this script next to the run outputs.
+rsync -avh "$0" "${OUTPUT_DIR}"
+export HF_HOME="${ROOT_PATH}/data/HF_HOME_node${INDEX}/"
+mkdir -p "${HF_HOME}"
+export TRITON_CACHE_DIR=${LOCAL_CODE_PATH}
+export PYTHONPATH=$PYTHONPATH:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice/third_party/Matcha-TTS/
+######################################################################
+LOG=${OUTPUT_DIR}/log_node${INDEX}.txt
+# From here on stdout and stderr are appended to the log file and still echoed by tee.
+exec &> >(tee -a "$LOG")
+echo Logging output to "$LOG"
+echo "$@"
+######################################################################
+DATA_PATH=${LOCAL_CODE_PATH}/configs/sts_finetune_stage1.yaml
+# Initial weights and tokenizer are loaded from the output directory of a
+# finetune_sensevoice_glm4voice_stage1.sh run with timestamp 20250409_161438.
+MODEL_NAME_OR_PATH=${ROOT_PATH}/output/LM/scripts/deepspeed/sts_qwen25/finetune_sensevoice_glm4voice_stage1.sh/20250409_161438/
+AUDIO_TOKENIZER_PATH=${ROOT_PATH}/models/THUDM/glm-4-voice-tokenizer
+# Archive the dataset config with the run outputs.
+rsync -avh "${DATA_PATH}" "${OUTPUT_DIR}"
+######################################################################
+# Expanded unquoted at the call site on purpose so that the string splits
+# into separate torchrun arguments.
+DISTRIBUTED_ARGS="
+    --nproc_per_node $NPROC_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+# NOTE: config_name points at a VITA-Audio directory nested inside
+# LOCAL_CODE_PATH, which itself already ends in VITA-Audio - confirm that
+# this nested path exists in the synced tree.
+torchrun $DISTRIBUTED_ARGS tools/finetune_sts_v4_48_3.py \
+    --log_level "info" \
+    --do_train \
+    --overwrite_output_dir \
+    --config_name "${LOCAL_CODE_PATH}/VITA-Audio/models/qwen2_mtp_sensevoice_v4_48_3/config_7B_mtp1.json" \
+    --tokenizer_name "$MODEL_NAME_OR_PATH" \
+    --model_name_or_path "$MODEL_NAME_OR_PATH" \
+    --audio_tokenizer_path "$AUDIO_TOKENIZER_PATH" \
+    --audio_tokenizer_type "sensevoice_glm4voice" \
+    --dataset_name "$DATA_PATH" \
+    --bf16 True \
+    --tf32 True \
+    --torch_dtype bfloat16 \
+    --output_dir "$OUTPUT_DIR" \
+    --num_train_epochs 1 \
+    --max_steps 4000 \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --gradient_accumulation_steps 16 \
+    --save_strategy "steps" \
+    --save_steps 0.1 \
+    --save_total_limit 2 \
+    --learning_rate 1.00e-3 \
+    --max_grad_norm 1.0 \
+    --weight_decay 0.0 \
+    --adam_beta1 0.9 \
+    --adam_beta2 0.95 \
+    --adam_epsilon 1e-8 \
+    --warmup_ratio 0.03 \
+    --lr_scheduler_type "cosine" \
+    --logging_steps 1 \
+    --report_to "tensorboard" \
+    --model_max_length "${SEQ_LENGTH}" \
+    --gradient_checkpointing True \
+    --deepspeed "${LOCAL_CODE_PATH}/scripts/deepspeed/ds_config_zero2.json" \
+    --trust_remote_code False \
+    --ddp_timeout 7200 \
+    --ddp_backend ${DISTRIBUTED_BACKEND} \
+    --attn_implementation flash_attention_2 \
+    --seed 42 \
+    --data_seed 42 \
+    --reset_attention_mask \
+    --reset_position_ids \
+    --create_attention_mask false \
+    --create_attention_mask_2d false \
+    --dataloader_num_workers 8 \
+    --audio-model-freeze \
+    --language-model-freeze \
+    --text-audio-interval-ratio 1 10 4 10
+# Disabled options kept for reference. They now sit outside the command so
+# that its last line no longer ends in a continuation that runs into a comment.
+#    --language-model-freeze
+#    --dataset_joint false
+#    --variable_length true
+#    --tokenizer_name_or_path Qwen2Tokenizer
+#    --bf16 True
+#    --fp16 True
+#    --tf32 True
+set +x
--- a/scripts/deepspeed/sts_qwen25/finetune_sensevoice_glm4voice_stage1.sh
+++ b/scripts/deepspeed/sts_qwen25/finetune_sensevoice_glm4voice_stage1.sh
+#!/bin/bash
+# Launch stage 1 fine-tuning of the MTP-0 model with the
+# sensevoice_glm4voice audio tokenizer through torchrun and DeepSpeed ZeRO-2.
+# Optional positional arguments: first the maximum sequence length,
+# second the timestamp used to name the output directory.
+# NOTE: the output directory embeds the script path exactly as it was typed
+# on the command line, and the script copies itself after changing into the
+# local code directory, so it presumably has to be started with a path
+# relative to the code root - confirm against the job launcher.
+set -e
+set -x
+# Fall back to 32768 tokens when no sequence length is given.
+SEQ_LENGTH="${1:-32768}"
+# Default timestamp is the current date and hour followed by a literal 0000.
+timestamp="${2:-$(date +'%Y%m%d_%H')0000}"
+######################################################################
+export ROOT_PATH=/data/
+export CODE_PATH=${ROOT_PATH}/VITA-Audio/
+export LOCAL_ROOT_PATH=/data_local/
+export LOCAL_CODE_PATH=${LOCAL_ROOT_PATH}/VITA-Audio/
+mkdir -p "${LOCAL_ROOT_PATH}"
+mkdir -p "${LOCAL_CODE_PATH}"
+apt update
+apt install -y rsync
+# Copy the code tree from CODE_PATH to LOCAL_CODE_PATH and run from the copy.
+rsync -a --exclude ".git" --exclude ".gitee" "${CODE_PATH}/" "${LOCAL_CODE_PATH}/"
+cd "${LOCAL_CODE_PATH}"
+# Recreate the datasets symlink so it points at the data directory under ROOT_PATH.
+rm -fr datasets
+ln -s "${ROOT_PATH}/data" datasets
+######################################################################
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+# Sourcing installs the Python dependencies and exports the torchrun topology variables.
+source "${LOCAL_CODE_PATH}/scripts/set_env_ds_gpu.sh"
+pip3 install transformers==4.48.3
+#pip3 install --no-index --find-links=/data/software/ transformers==4.48.3
+######################################################################
+# Output directory layout is ROOT_PATH/output/LM/script path/timestamp.
+OUTPUT_DIR=${ROOT_PATH}/output/LM/"$0"/${timestamp}/
+mkdir -p "${OUTPUT_DIR}"
+# Keep a copy of this script next to the run outputs.
+rsync -avh "$0" "${OUTPUT_DIR}"
+export HF_HOME="${ROOT_PATH}/data/HF_HOME_node${INDEX}/"
+mkdir -p "${HF_HOME}"
+export TRITON_CACHE_DIR=${LOCAL_CODE_PATH}
+export PYTHONPATH=$PYTHONPATH:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice/third_party/Matcha-TTS/
+######################################################################
+LOG=${OUTPUT_DIR}/log_node${INDEX}.txt
+# From here on stdout and stderr are appended to the log file and still echoed by tee.
+exec &> >(tee -a "$LOG")
+echo Logging output to "$LOG"
+echo "$@"
+######################################################################
+DATA_PATH=${LOCAL_CODE_PATH}/configs/sts_finetune_stage1.yaml
+# Initial weights and tokenizer are the Qwen2.5-7B-Instruct checkpoint under ROOT_PATH/models.
+MODEL_NAME_OR_PATH=${ROOT_PATH}/models/Qwen/Qwen2.5-7B-Instruct/
+AUDIO_TOKENIZER_PATH=${ROOT_PATH}/models/THUDM/glm-4-voice-tokenizer
+# Archive the dataset config with the run outputs.
+rsync -avh "${DATA_PATH}" "${OUTPUT_DIR}"
+######################################################################
+# Expanded unquoted at the call site on purpose so that the string splits
+# into separate torchrun arguments.
+DISTRIBUTED_ARGS="
+    --nproc_per_node $NPROC_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+# NOTE: config_name points at a VITA-Audio directory nested inside
+# LOCAL_CODE_PATH, which itself already ends in VITA-Audio - confirm that
+# this nested path exists in the synced tree.
+torchrun $DISTRIBUTED_ARGS tools/finetune_sts_v4_48_3.py \
+    --log_level "info" \
+    --do_train \
+    --overwrite_output_dir \
+    --config_name "${LOCAL_CODE_PATH}/VITA-Audio/models/qwen2_mtp_sensevoice_v4_48_3/config_7B_mtp0.json" \
+    --tokenizer_name "$MODEL_NAME_OR_PATH" \
+    --model_name_or_path "$MODEL_NAME_OR_PATH" \
+    --audio_tokenizer_path "$AUDIO_TOKENIZER_PATH" \
+    --audio_tokenizer_type "sensevoice_glm4voice" \
+    --dataset_name "$DATA_PATH" \
+    --bf16 True \
+    --tf32 True \
+    --torch_dtype bfloat16 \
+    --output_dir "$OUTPUT_DIR" \
+    --num_train_epochs 1 \
+    --max_steps 8000 \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --gradient_accumulation_steps 16 \
+    --save_strategy "steps" \
+    --save_steps 0.1 \
+    --save_total_limit 2 \
+    --learning_rate 6.00e-5 \
+    --max_grad_norm 1.0 \
+    --weight_decay 0.0 \
+    --adam_beta1 0.9 \
+    --adam_beta2 0.95 \
+    --adam_epsilon 1e-8 \
+    --warmup_ratio 0.03 \
+    --lr_scheduler_type "cosine" \
+    --logging_steps 1 \
+    --report_to "tensorboard" \
+    --model_max_length "${SEQ_LENGTH}" \
+    --gradient_checkpointing True \
+    --deepspeed "${LOCAL_CODE_PATH}/scripts/deepspeed/ds_config_zero2.json" \
+    --trust_remote_code False \
+    --ddp_timeout 7200 \
+    --ddp_backend ${DISTRIBUTED_BACKEND} \
+    --attn_implementation flash_attention_2 \
+    --seed 42 \
+    --data_seed 42 \
+    --reset_attention_mask \
+    --reset_position_ids \
+    --create_attention_mask false \
+    --create_attention_mask_2d false \
+    --dataloader_num_workers 8 \
+    --audio-model-freeze \
+    --text-audio-interval-ratio 1 10 4 10
+# Disabled options kept for reference. They now sit outside the command so
+# that its last line no longer ends in a continuation that runs into a comment.
+#    --language-model-freeze
+#    --dataset_joint false
+#    --variable_length true
+#    --tokenizer_name_or_path Qwen2Tokenizer
+#    --bf16 True
+#    --fp16 True
+#    --tf32 True
+set +x
--- a/scripts/set_env_ds_gpu.sh
+++ b/scripts/set_env_ds_gpu.sh
+# Environment setup that the DeepSpeed GPU training launchers source.
+# It installs the Python dependencies and then exports the variables the
+# launchers pass to torchrun.
+# The requirements file and the editable install are both resolved against
+# the current directory, so source this file from the code root.
+#set -e
+#set -x
+######################################################################
+#export NCCL_NET=IB
+#export NCCL_SOCKET_IFNAME="bond1"
+#export GLOO_SOCKET_IFNAME="bond1"
+#export NCCL_DEBUG=INFO
+#export NCCL_IB_QPS_PER_CONNECTION=2
+#export GLOO_SOCKET_IFNAME=eth0
+#export NCCL_DEBUG=INFO
+#export NCCL_IB_QPS_PER_CONNECTION=2
+#export NCCL_IB_DISABLE=1
+export DISTRIBUTED_BACKEND="nccl"
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+######################################################################
+pip3 install -r requirements_ds_gpu.txt
+#pip3 install --no-index --find-links=/data/software/ -r requirements_ds_gpu.txt
+pip3 install deepspeed==0.15.4
+#pip3 install --no-index --find-links=/data/software/ deepspeed==0.15.4
+#pip3 install deepspeed==0.16.1
+#pip3 install deepspeed==0.14.2
+# Editable install of the package in the current directory. The path is
+# quoted so that a working directory containing whitespace still works.
+pip3 install -e "$(pwd)"
+######################################################################
+#export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512
+#apt update
+#apt install -y openssh-server rsync tmux htop
+######################################################################
+# Node count and node rank are taken from WORLD_SIZE and RANK.
+export NNODES=${WORLD_SIZE}
+export NODE_RANK=${RANK}
+export MASTER_PORT=34567
+# When NPROC_PER_NODE is not provided, fall back to a single local node
+# with 8 processes and override the values derived above.
+# NOTE: MASTER_ADDR is only assigned in this fallback, so in the other case
+# it presumably has to come from the caller environment - confirm.
+if [ -z "$NPROC_PER_NODE" ]
+then
+    export NPROC_PER_NODE=8
+    export NNODES=1
+    export NODE_RANK=0
+    export MASTER_ADDR=127.0.0.1
+fi
+######################################################################
--- a/setup.py
+++ b/setup.py
+from setuptools import find_packages, setup
+setup(
+    name='vita_audio',
+    version='0.0.1',
+    packages=[
+        "vita_audio",
+    ],
+    install_requires=[
+    ],
+)
--- a/third_party/GLM-4-Voice/.gitmodules
+++ b/third_party/GLM-4-Voice/.gitmodules
+[submodule "third_party/Matcha-TTS"]
+	path = third_party/Matcha-TTS
+	url = https://github.com/shivammehta25/Matcha-TTS
--- a/third_party/GLM-4-Voice/LICENSE
+++ b/third_party/GLM-4-Voice/LICENSE
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright 2024 GLM-4-Voice Model Team @ Zhipu AI
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/third_party/GLM-4-Voice/README.md
+++ b/third_party/GLM-4-Voice/README.md
+# GLM-4-Voice
+<p align="center">
+📄<a href="https://arxiv.org/abs/2412.02612" target="_blank"> Report </a> • 🤗 <a href="https://huggingface.co/THUDM/glm-4-voice-9b" target="_blank">HF Repo</a> • 🤖 <a href="https://modelscope.cn/studios/ZhipuAI/GLM-4-Voice-Demo" target="_blank">Demo</a> • 🐦 <a href="https://twitter.com/thukeg" target="_blank">Twitter</a>
+</p>
+Read this in [English](./README_en.md)
+GLM-4-Voice 是智谱 AI 推出的端到端语音模型。GLM-4-Voice 能够直接理解和生成中英文语音，进行实时语音对话，并且能够遵循用户的指令要求改变语音的情感、语调、语速、方言等属性。
+## Model Architecture
+![Model Architecture](./resources/architecture.jpeg)
+GLM-4-Voice 由三个部分组成：
+* GLM-4-Voice-Tokenizer: 通过在 [Whisper](https://github.com/openai/whisper) 的 Encoder 部分增加 Vector Quantization 并在 ASR 数据上有监督训练，将连续的语音输入转化为离散的 token。每秒音频平均只需要用 12.5 个离散 token 表示。
+* GLM-4-Voice-Decoder: 基于 [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) 的 Flow Matching 模型结构训练的支持流式推理的语音解码器，将离散化的语音 token 转化为连续的语音输出。最少只需要 10 个语音 token 即可开始生成，降低端到端对话延迟。
+* GLM-4-Voice-9B: 在 [GLM-4-9B](https://github.com/THUDM/GLM-4) 的基础上进行语音模态的预训练和对齐，从而能够理解和生成离散化的语音 token。
+预训练方面，为了攻克模型在语音模态下的智商和合成表现力两个难关，我们将 Speech2Speech 任务解耦合为“根据用户音频做出文本回复”和“根据文本回复和用户语音合成回复语音”两个任务，并设计两种预训练目标，分别基于文本预训练数据和无监督音频数据合成语音-文本交错数据以适配这两种任务形式。GLM-4-Voice-9B 在 GLM-4-9B 的基座模型基础之上，经过了数百万小时音频和数千亿 token 的音频文本交错数据预训练，拥有很强的音频理解和建模能力。
+对齐方面，为了支持高质量的语音对话，我们设计了一套流式思考架构：根据用户语音，GLM-4-Voice 可以流式交替输出文本和语音两个模态的内容，其中语音模态以文本作为参照保证回复内容的高质量，并根据用户的语音指令要求做出相应的声音变化，在最大程度保留语言模型智商的情况下仍然具有端到端建模的能力，同时具备低延迟性，最低只需要输出 20 个 token 便可以合成语音。
+## Model List
+|         Model         |       Type       |                                                                     Download                                                                     |
+|:---------------------:|:----------------:|:------------------------------------------------------------------------------------------------------------------------------------------------:|
+| GLM-4-Voice-Tokenizer | Speech Tokenizer | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-voice-tokenizer) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-voice-tokenizer) |
+|    GLM-4-Voice-9B     |    Chat Model    |        [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-voice-9b) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-voice-9b)        |
+|  GLM-4-Voice-Decoder  |  Speech Decoder  |   [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-voice-decoder) [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/glm-4-voice-decoder)   |
+## Usage
+我们提供了可以直接启动的 Web Demo。用户可以输入语音或文本，模型会同时给出语音和文字回复。
+![](resources/web_demo.png)
+### Preparation
+首先下载仓库
+```shell
+git clone --recurse-submodules https://github.com/THUDM/GLM-4-Voice
+cd GLM-4-Voice
+```
+然后安装依赖。也可以使用我们提供的镜像 `zhipuai/glm-4-voice:0.1` 以跳过这一步。
+```shell
+pip install -r requirements.txt
+```
+由于 Decoder 模型不支持通过 `transformers` 初始化，因此 checkpoint 需要单独下载。
+```shell
+# git 模型下载，请确保已安装 git-lfs
+git lfs install
+git clone https://huggingface.co/THUDM/glm-4-voice-decoder
+```
+### Launch Web Demo
+1. 启动模型服务
+```shell
+python model_server.py --host localhost --model-path THUDM/glm-4-voice-9b --port 10000 --dtype bfloat16 --device cuda:0
+```
+如果你需要使用 Int4 精度启动，请运行
+```shell
+python model_server.py --host localhost --model-path THUDM/glm-4-voice-9b --port 10000 --dtype int4 --device cuda:0
+```
+此命令会自动下载 `glm-4-voice-9b`。如果网络条件不好，也可以手动下载之后通过 `--model-path` 指定本地的路径。
+2. 启动 web 服务
+```shell
+python web_demo.py --tokenizer-path  THUDM/glm-4-voice-tokenizer --model-path THUDM/glm-4-voice-9b --flow-path ./glm-4-voice-decoder
+```
+即可在 http://127.0.0.1:8888 访问 web demo。
+此命令会自动下载 `glm-4-voice-tokenizer` 和 `glm-4-voice-9b`。 请注意，`glm-4-voice-decoder` 需要手动下载。
+如果网络条件不好，可以手动下载这三个模型之后通过 `--tokenizer-path`, `--flow-path` 和 `--model-path` 指定本地的路径。
+### Known Issues
+* Gradio 的流式音频播放效果不稳定。在生成完成后点击对话框中的音频进行播放，音质会更高。
+## Cases
+我们提供了 GLM-4-Voice 的部分对话案例，包括控制情绪、改变语速、生成方言等。
+* 用轻柔的声音引导我放松
+https://github.com/user-attachments/assets/4e3d9200-076d-4c28-a641-99df3af38eb0
+* 用激动的声音解说足球比赛
+https://github.com/user-attachments/assets/0163de2d-e876-4999-b1bc-bbfa364b799b
+* 用哀怨的声音讲一个鬼故事
+https://github.com/user-attachments/assets/a75b2087-d7bc-49fa-a0c5-e8c99935b39a
+* 用东北话介绍一下冬天有多冷
+https://github.com/user-attachments/assets/91ba54a1-8f5c-4cfe-8e87-16ed1ecf4037
+* 用重庆话念“吃葡萄不吐葡萄皮”
+https://github.com/user-attachments/assets/7eb72461-9e84-4d8e-9c58-1809cf6a8a9b
+* 用北京话念一句绕口令
+https://github.com/user-attachments/assets/a9bb223e-9c0a-440d-8537-0a7f16e31651
+  * 加快语速
+https://github.com/user-attachments/assets/c98a4604-366b-4304-917f-3c850a82fe9f
+  * 再快一点
+https://github.com/user-attachments/assets/d5ff0815-74f8-4738-b0f1-477cfc8dcc2d
+## Acknowledgements
+本项目的部分代码来自：
+* [CosyVoice](https://github.com/FunAudioLLM/CosyVoice)
+* [transformers](https://github.com/huggingface/transformers)
+* [GLM-4](https://github.com/THUDM/GLM-4)
+## 协议
+ GLM-4 模型的权重的使用则需要遵循 [模型协议](https://huggingface.co/THUDM/glm-4-voice-9b/blob/main/LICENSE)。
+ 本开源仓库的代码则遵循 [Apache 2.0](LICENSE) 协议。
+## 引用
+```
+@misc{zeng2024glm4,
+      title={GLM-4-Voice: Towards Intelligent and Human-Like End-to-End Spoken Chatbot}, 
+      author={Aohan Zeng and Zhengxiao Du and Mingdao Liu and Kedong Wang and Shengmin Jiang and Lei Zhao and Yuxiao Dong and Jie Tang},
+      year={2024},
+      eprint={2412.02612},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2412.02612}, 
+}
+```
+```
+@misc{zeng2024scaling,
+      title={Scaling Speech-Text Pre-training with Synthetic Interleaved Data}, 
+      author={Aohan Zeng and Zhengxiao Du and Mingdao Liu and Lei Zhang and Shengmin Jiang and Yuxiao Dong and Jie Tang},
+      year={2024},
+      eprint={2411.17607},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2411.17607}, 
+}
+```
--- a/third_party/GLM-4-Voice/README_en.md
+++ b/third_party/GLM-4-Voice/README_en.md
+# GLM-4-Voice
+<p align="center">
+📄<a href="https://arxiv.org/abs/2412.02612" target="_blank"> Report </a> • 🤗 <a href="https://huggingface.co/THUDM/glm-4-voice-9b" target="_blank">HF Repo</a> • 🤖 <a href="https://modelscope.cn/studios/ZhipuAI/GLM-4-Voice-Demo" target="_blank">Demo</a> • 🐦 <a href="https://twitter.com/thukeg" target="_blank">Twitter</a>
+</p>
+GLM-4-Voice is an end-to-end voice model launched by Zhipu AI. GLM-4-Voice can directly understand and generate Chinese and English speech, engage in real-time voice conversations, and change attributes such as emotion, intonation, speech rate, and dialect based on user instructions.
+## Model Architecture
+![Model Architecture](./resources/architecture.jpeg)
+We provide the three components of GLM-4-Voice:
+* GLM-4-Voice-Tokenizer: Trained by adding vector quantization to the encoder part of [Whisper](https://github.com/openai/whisper), converting continuous speech input into discrete tokens. Each second of audio is converted into 12.5 discrete tokens.
+* GLM-4-Voice-9B: Pre-trained and aligned on speech modality based on [GLM-4-9B](https://github.com/THUDM/GLM-4), enabling understanding and generation of discretized speech.
+* GLM-4-Voice-Decoder: A speech decoder supporting streaming inference, retrained based on [CosyVoice](https://github.com/FunAudioLLM/CosyVoice), converting discrete speech tokens into continuous speech output. Generation can start with as few as 10 audio tokens, reducing conversation latency.
+## Model List
+|         Model         |       Type       |                               Download                               |
+|:---------------------:|:----------------:|:--------------------------------------------------------------------:|
+| GLM-4-Voice-Tokenizer | Speech Tokenizer | [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-voice-tokenizer) |
+|    GLM-4-Voice-9B     |    Chat Model    |    [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-voice-9b)     |
+|  GLM-4-Voice-Decoder  |  Speech Decoder  |  [🤗 Huggingface](https://huggingface.co/THUDM/glm-4-voice-decoder)  |
+## Usage
+We provide a Web Demo that can be launched directly. Users can input speech or text, and the model will respond with both speech and text.
+![](resources/web_demo.png)
+### Preparation
+First, download the repository
+```shell
+git clone --recurse-submodules https://github.com/THUDM/GLM-4-Voice
+cd GLM-4-Voice
+```
+Then, install the dependencies. You can also use our pre-built docker image `zhipuai/glm-4-voice:0.1` to skip the step.
+```shell
+pip install -r requirements.txt
+```
+Since the Decoder model does not support initialization via `transformers`, the checkpoint needs to be downloaded separately.
+```shell
+# Download the model with git; make sure git-lfs is installed first
+git lfs install
+git clone https://huggingface.co/THUDM/glm-4-voice-decoder
+```
+### Launch Web Demo
+1. Start the model server
+```shell
+python model_server.py --host localhost --model-path THUDM/glm-4-voice-9b --port 10000 --dtype bfloat16 --device cuda:0
+```
+If you need to launch with Int4 precision, run
+```shell
+python model_server.py --host localhost --model-path THUDM/glm-4-voice-9b --port 10000 --dtype int4 --device cuda:0
+```
+This command will automatically download `glm-4-voice-9b`. If network conditions are poor, you can manually download it and specify the local path using `--model-path`.
+2. Start the web service
+```shell
+python web_demo.py --tokenizer-path  THUDM/glm-4-voice-tokenizer --model-path THUDM/glm-4-voice-9b --flow-path ./glm-4-voice-decoder
+```
+You can access the web demo at [http://127.0.0.1:8888](http://127.0.0.1:8888).
+This command will automatically download `glm-4-voice-tokenizer` and `glm-4-voice-9b`. Please note that `glm-4-voice-decoder` needs to be downloaded manually.
+If the network connection is poor, you can manually download these three models and specify the local paths using `--tokenizer-path`, `--flow-path`, and `--model-path`.
+### Known Issues
+* Gradio’s streaming audio playback can be unstable. For better audio quality, wait until generation is complete and then click the audio clip in the dialogue box to play it.
+## Examples
+We provide some dialogue cases for GLM-4-Voice, including emotion control, speech rate alteration, dialect generation, etc. (The examples are in Chinese.)
+* Use a gentle voice to guide me to relax
+https://github.com/user-attachments/assets/4e3d9200-076d-4c28-a641-99df3af38eb0
+* Use an excited voice to commentate a football match
+https://github.com/user-attachments/assets/0163de2d-e876-4999-b1bc-bbfa364b799b
+* Tell a ghost story with a mournful voice
+https://github.com/user-attachments/assets/a75b2087-d7bc-49fa-a0c5-e8c99935b39a
+* Introduce how cold winter is with a Northeastern dialect
+https://github.com/user-attachments/assets/91ba54a1-8f5c-4cfe-8e87-16ed1ecf4037
+* Say "Eat grapes without spitting out the skins" in Chongqing dialect
+https://github.com/user-attachments/assets/7eb72461-9e84-4d8e-9c58-1809cf6a8a9b
+* Recite a tongue twister with a Beijing accent
+https://github.com/user-attachments/assets/a9bb223e-9c0a-440d-8537-0a7f16e31651
+  * Increase the speech rate
+https://github.com/user-attachments/assets/c98a4604-366b-4304-917f-3c850a82fe9f
+  * Even faster
+https://github.com/user-attachments/assets/d5ff0815-74f8-4738-b0f1-477cfc8dcc2d
+## Acknowledgements
+Some code in this project is from:
+* [CosyVoice](https://github.com/FunAudioLLM/CosyVoice)
+* [transformers](https://github.com/huggingface/transformers)
+* [GLM-4](https://github.com/THUDM/GLM-4)
+## License Agreement
+ The use of GLM-4 model weights must follow the [Model License Agreement](https://huggingface.co/THUDM/glm-4-voice-9b/blob/main/LICENSE).
+ The code in this open-source repository is licensed under the [Apache 2.0](LICENSE) License.
+## Citation
+```
+@misc{zeng2024glm4,
+      title={GLM-4-Voice: Towards Intelligent and Human-Like End-to-End Spoken Chatbot}, 
+      author={Aohan Zeng and Zhengxiao Du and Mingdao Liu and Kedong Wang and Shengmin Jiang and Lei Zhao and Yuxiao Dong and Jie Tang},
+      year={2024},
+      eprint={2412.02612},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2412.02612}, 
+}
+```
+```
+@misc{zeng2024scaling,
+      title={Scaling Speech-Text Pre-training with Synthetic Interleaved Data}, 
+      author={Aohan Zeng and Zhengxiao Du and Mingdao Liu and Lei Zhang and Shengmin Jiang and Yuxiao Dong and Jie Tang},
+      year={2024},
+      eprint={2411.17607},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2411.17607}, 
+}
+```
--- a/third_party/GLM-4-Voice/__pycache__/flow_inference.cpython-310.pyc
+++ b/third_party/GLM-4-Voice/__pycache__/flow_inference.cpython-310.pyc
--- a/third_party/GLM-4-Voice/audio_process.py
+++ b/third_party/GLM-4-Voice/audio_process.py
+import os
+import librosa
+import soundfile as sf
+import numpy as np
+from pathlib import Path
+import io
+# Split audio stream at silence points to prevent playback stuttering issues
+# caused by AAC encoder frame padding when streaming audio through Gradio audio components.
+class AudioStreamProcessor:
+    """Accumulate streamed audio and release it as WAV chunks cut at silence.
+    Samples are buffered until a run of at least ``min_silence_duration``
+    seconds below ``threshold_db`` is found; everything up to the end of that
+    silence is then encoded as WAV bytes and removed from the buffer.
+    """
+    def __init__(self, sr=22050, min_silence_duration=0.1, threshold_db=-40):
+        """
+        params:
+            sr: sample rate used for the silence-length computation and WAV encoding
+            min_silence_duration: minimum length (seconds) of a silent run that counts as a split point
+            threshold_db: level in dB (relative to the peak of the analysed signal) below which a sample is silent
+        """
+        self.sr = sr
+        self.min_silence_duration = min_silence_duration
+        self.threshold_db = threshold_db
+        self.buffer = np.array([])
+    def process(self, audio_data, last=False):
+        """
+        Add audio data and process it
+        params:
+            audio_data: audio data in numpy array
+            last: whether this is the last chunk of data
+        returns:
+            WAV-encoded bytes of the released audio, or None if no split point is found
+        """
+        # Add new data to buffer (an empty buffer is simply replaced by the new chunk)
+        self.buffer = np.concatenate([self.buffer, audio_data]) if len(self.buffer) > 0 else audio_data
+        if last:
+            # Final chunk: flush whatever is buffered, silence or not
+            result = self.buffer
+            self.buffer = np.array([])
+            return self._to_wav_bytes(result)
+        # Find silence boundary
+        split_point = self._find_silence_boundary(self.buffer)
+        if split_point is not None:
+            # Extend split point to the end of silence so the next chunk starts on sound
+            silence_end = self._find_silence_end(split_point)
+            result = self.buffer[:silence_end]
+            self.buffer = self.buffer[silence_end:]
+            return self._to_wav_bytes(result)
+        return None
+    def _find_silence_boundary(self, audio):
+        """
+        Find the starting point of the last silent run in audio
+        params:
+            audio: 1-D numpy array of samples
+        returns:
+            Index of the first sample of the last run of at least
+            min_silence_duration seconds of consecutive silent samples, or None
+        """
+        # Nothing buffered yet: there is no peak to take as dB reference
+        if len(audio) == 0:
+            return None
+        # Convert audio to decibels relative to the peak of this buffer
+        db = librosa.amplitude_to_db(np.abs(audio), ref=np.max)
+        # Find points below threshold
+        silence_points = np.where(db < self.threshold_db)[0]
+        # Minimum silence samples; clamp to 1 so a zero-length requirement
+        # cannot index one past the end of silence_points
+        min_silence_samples = max(1, int(self.min_silence_duration * self.sr))
+        if len(silence_points) < min_silence_samples:
+            return None
+        # silence_points is strictly increasing, so a window of
+        # min_silence_samples indices is consecutive exactly when its last
+        # element minus its first equals min_silence_samples - 1. This checks
+        # every window in one vectorised pass instead of one np.diff per window.
+        offset = min_silence_samples - 1
+        spans = silence_points[offset:] - silence_points[:len(silence_points) - offset]
+        run_starts = np.flatnonzero(spans == offset)
+        if len(run_starts) == 0:
+            return None
+        # The latest qualifying window matches the original backwards search
+        return silence_points[run_starts[-1]]
+    def _find_silence_end(self, start_point):
+        """
+        Find the end point of silence segment
+        params:
+            start_point: buffer index at which the silent run begins
+        returns:
+            Buffer index of the first non-silent sample at or after start_point,
+            or len(buffer) when no such sample exists
+        """
+        # NOTE(review): the dB reference here is the peak of buffer[start_point:],
+        # not of the whole buffer as in _find_silence_boundary, so the two
+        # methods can disagree on what "silent" means - confirm this is intended.
+        db = librosa.amplitude_to_db(np.abs(self.buffer[start_point:]), ref=np.max)
+        sound_points = np.where(db >= self.threshold_db)[0]
+        if len(sound_points) == 0:
+            return len(self.buffer)
+        return start_point + sound_points[0]
+    def _to_wav_bytes(self, audio_data):
+        """
+        Encode audio_data as a WAV file at self.sr and return its bytes
+        """
+        wav_buffer = io.BytesIO()
+        sf.write(wav_buffer, audio_data, self.sr, format='WAV')
+        return wav_buffer.getvalue()
--- a/third_party/GLM-4-Voice/cosyvoice/__init__.py
+++ b/third_party/GLM-4-Voice/cosyvoice/__init__.py
--- a/third_party/GLM-4-Voice/cosyvoice/__pycache__/__init__.cpython-310.pyc
+++ b/third_party/GLM-4-Voice/cosyvoice/__pycache__/__init__.cpython-310.pyc
--- a/third_party/GLM-4-Voice/cosyvoice/bin/inference.py
+++ b/third_party/GLM-4-Voice/cosyvoice/bin/inference.py
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import argparse
+import logging
+logging.getLogger('matplotlib').setLevel(logging.WARNING)
+import os
+import torch
+from torch.utils.data import DataLoader
+import torchaudio
+from hyperpyyaml import load_hyperpyyaml
+from tqdm import tqdm
+from cosyvoice.cli.model import CosyVoiceModel
+from cosyvoice.dataset.dataset import Dataset
+def get_args():
+    """Parse, print and return the command-line options of the inference script."""
+    arg_parser = argparse.ArgumentParser(description='inference with your model')
+    # Mandatory options that only differ by flag name and help text,
+    # registered in their original order so --help output is unchanged.
+    leading_required = (
+        ('--config', 'config file'),
+        ('--prompt_data', 'prompt data file'),
+        ('--prompt_utt2data', 'prompt data file'),
+        ('--tts_text', 'tts input file'),
+        ('--llm_model', 'llm model file'),
+        ('--flow_model', 'flow model file'),
+        ('--hifigan_model', 'hifigan model file'),
+    )
+    for flag, help_text in leading_required:
+        arg_parser.add_argument(flag, required=True, help=help_text)
+    arg_parser.add_argument('--gpu', type=int, default=-1,
+                            help='gpu id for this rank, -1 for cpu')
+    arg_parser.add_argument('--mode', default='sft', choices=['sft', 'zero_shot'],
+                            help='inference mode')
+    arg_parser.add_argument('--result_dir', required=True, help='asr result file')
+    parsed = arg_parser.parse_args()
+    print(parsed)
+    return parsed
+def main():
+    """Synthesize every item of the test set and write the waveforms plus a wav.scp index.
+    Builds the CosyVoice model from the hyperpyyaml config, loads the three
+    checkpoints, iterates the inference dataset one utterance at a time and
+    saves each result as ``<result_dir>/<utt>_<tts_index>.wav``. One line
+    ``<key> <path>`` per waveform is appended to ``<result_dir>/wav.scp``.
+    """
+    args = get_args()
+    logging.basicConfig(level=logging.DEBUG,
+                        format='%(asctime)s %(levelname)s %(message)s')
+    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
+    # Init cosyvoice models from configs
+    use_cuda = args.gpu >= 0 and torch.cuda.is_available()
+    device = torch.device('cuda' if use_cuda else 'cpu')
+    with open(args.config, 'r') as f:
+        configs = load_hyperpyyaml(f)
+    model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'])
+    model.load(args.llm_model, args.flow_model, args.hifigan_model)
+    test_dataset = Dataset(args.prompt_data, data_pipeline=configs['data_pipeline'], mode='inference', shuffle=False, partition=False, tts_file=args.tts_text, prompt_utt2data=args.prompt_utt2data)
+    test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0)
+    del configs
+    os.makedirs(args.result_dir, exist_ok=True)
+    fn = os.path.join(args.result_dir, 'wav.scp')
+    # The context manager closes wav.scp even if inference raises midway
+    # (the original bare open()/close() pair leaked the handle in that case).
+    with open(fn, 'w') as scp, torch.no_grad():
+        for _, batch in tqdm(enumerate(test_data_loader)):
+            utts = batch["utts"]
+            assert len(utts) == 1, "inference mode only support batchsize 1"
+            text_token = batch["text_token"].to(device)
+            text_token_len = batch["text_token_len"].to(device)
+            tts_index = batch["tts_index"]
+            tts_text_token = batch["tts_text_token"].to(device)
+            tts_text_token_len = batch["tts_text_token_len"].to(device)
+            speech_token = batch["speech_token"].to(device)
+            speech_token_len = batch["speech_token_len"].to(device)
+            speech_feat = batch["speech_feat"].to(device)
+            speech_feat_len = batch["speech_feat_len"].to(device)
+            utt_embedding = batch["utt_embedding"].to(device)
+            spk_embedding = batch["spk_embedding"].to(device)
+            if args.mode == 'sft':
+                # sft: condition both llm and flow on the speaker embedding only
+                model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
+                               'llm_embedding': spk_embedding, 'flow_embedding': spk_embedding}
+            else:
+                # zero_shot: additionally pass the prompt text/speech and the utterance embedding
+                model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
+                               'prompt_text': text_token, 'prompt_text_len': text_token_len,
+                               'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
+                               'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
+                               'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
+                               'llm_embedding': utt_embedding, 'flow_embedding': utt_embedding}
+            model_output = model.inference(**model_input)
+            tts_key = '{}_{}'.format(utts[0], tts_index[0])
+            tts_fn = os.path.join(args.result_dir, '{}.wav'.format(tts_key))
+            torchaudio.save(tts_fn, model_output['tts_speech'], sample_rate=22050)
+            scp.write('{} {}\n'.format(tts_key, tts_fn))
+            # Flush per utterance so a partial index survives an interrupted run
+            scp.flush()
+    logging.info('Result wav.scp saved in {}'.format(fn))
+if __name__ == '__main__':
+    main()
--- a/third_party/GLM-4-Voice/cosyvoice/bin/train.py
+++ b/third_party/GLM-4-Voice/cosyvoice/bin/train.py
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import argparse
+import datetime
+import logging
+logging.getLogger('matplotlib').setLevel(logging.WARNING)
+from copy import deepcopy
+import torch
+import torch.distributed as dist
+# import deepspeed
+import pdb
+from hyperpyyaml import load_hyperpyyaml
+from torch.distributed.elastic.multiprocessing.errors import record
+from cosyvoice.utils.executor import Executor
+from cosyvoice.utils.train_utils import (
+    init_distributed,
+    init_dataset_and_dataloader,
+    init_optimizer_and_scheduler,
+    init_summarywriter, save_model,
+    wrap_cuda_model, check_modify_and_save_config)
+def get_args():
+    """Build the training argument parser and return the parsed namespace."""
+    arg_parser = argparse.ArgumentParser(description='training your network')
+    arg_parser.add_argument('--train_engine', default='torch_ddp',
+                            choices=['torch_ddp', 'deepspeed'],
+                            help='Engine for paralleled training')
+    # Mandatory options that only differ by flag name and help text.
+    for flag, help_text in (
+            ('--model', 'model which will be trained'),
+            ('--config', 'config file'),
+            ('--train_data', 'train data file'),
+            ('--cv_data', 'cv data file'),
+    ):
+        arg_parser.add_argument(flag, required=True, help=help_text)
+    arg_parser.add_argument('--checkpoint', help='checkpoint model')
+    arg_parser.add_argument('--model_dir', required=True, help='save model dir')
+    arg_parser.add_argument('--tensorboard_dir', default='tensorboard',
+                            help='tensorboard log dir')
+    # Dotted flags are stored under an explicit dest so they are plain attributes.
+    arg_parser.add_argument('--ddp.dist_backend', dest='dist_backend', default='nccl',
+                            choices=['nccl', 'gloo'], help='distributed backend')
+    arg_parser.add_argument('--num_workers', default=0, type=int,
+                            help='num of subprocess workers for reading')
+    arg_parser.add_argument('--prefetch', default=100, type=int,
+                            help='prefetch number')
+    arg_parser.add_argument('--pin_memory', action='store_true', default=False,
+                            help='Use pinned memory buffers used for reading')
+    arg_parser.add_argument('--deepspeed.save_states', dest='save_states',
+                            default='model_only',
+                            choices=['model_only', 'model+optimizer'],
+                            help='save model/optimizer states')
+    arg_parser.add_argument('--timeout', default=30, type=int,
+                            help='timeout (in seconds) of cosyvoice_join.')
+    # deepspeed.add_config_arguments(parser) is intentionally not applied here.
+    return arg_parser.parse_args()
+@record
+def main():
+    """Train the model selected by ``--model`` for ``max_epoch`` epochs.
+    Loads the hyperpyyaml config with the other two of 'llm'/'flow'/'hift'
+    overridden to None, initialises the distributed environment, data
+    loaders, summary writer, optimizer and scheduler, saves an 'init'
+    checkpoint and then runs one Executor training pass per epoch.
+    """
+    args = get_args()
+    logging.basicConfig(level=logging.DEBUG,
+                        format='%(asctime)s %(levelname)s %(message)s')
+    # Null out the config entries of the models that are not being trained
+    override_dict = {k: None for k in ['llm', 'flow', 'hift'] if k != args.model}
+    with open(args.config, 'r') as f:
+        configs = load_hyperpyyaml(f, overrides=override_dict)
+    # Command-line values are merged into (and take precedence over) train_conf
+    configs['train_conf'].update(vars(args))
+    # Init env for ddp
+    init_distributed(args)
+    # Get dataset & dataloader
+    train_dataset, cv_dataset, train_data_loader, cv_data_loader = \
+        init_dataset_and_dataloader(args, configs)
+    # Do some sanity checks and save config to args.model_dir
+    configs = check_modify_and_save_config(args, configs)
+    # Tensorboard summary
+    writer = init_summarywriter(args)
+    # load checkpoint
+    model = configs[args.model]
+    if args.checkpoint is not None:
+        # Load on CPU first; the model is moved to GPU by wrap_cuda_model below
+        model.load_state_dict(torch.load(args.checkpoint, map_location='cpu'))
+    # Dispatch model from cpu to gpu
+    model = wrap_cuda_model(args, model)
+    # Get optimizer & scheduler
+    model, optimizer, scheduler = init_optimizer_and_scheduler(args, configs, model)
+    # pdb.set_trace()
+    # Save init checkpoints
+    info_dict = deepcopy(configs['train_conf'])
+    save_model(model, 'init', info_dict)
+    # Get executor
+    executor = Executor()
+    # Start training loop
+    for epoch in range(info_dict['max_epoch']):
+        executor.epoch = epoch
+        train_dataset.set_epoch(epoch)
+        # Synchronise all ranks before starting the epoch
+        dist.barrier()
+        # try:
+        #     dist.barrier()
+        # except RuntimeError as e:
+        #     logging.info('except RuntimeError as e: {}'.format(e))
+        # A fresh gloo group with the --timeout deadline is created for each
+        # epoch, handed to the executor and destroyed once the epoch is done
+        group_join = dist.new_group(backend="gloo", timeout=datetime.timedelta(seconds=args.timeout))
+        executor.train_one_epoc(model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, group_join)
+        dist.destroy_process_group(group_join)
+if __name__ == '__main__':
+    main()
--- a/third_party/GLM-4-Voice/cosyvoice/cli/__init__.py
+++ b/third_party/GLM-4-Voice/cosyvoice/cli/__init__.py
--- a/third_party/GLM-4-Voice/cosyvoice/cli/cosyvoice.py
+++ b/third_party/GLM-4-Voice/cosyvoice/cli/cosyvoice.py
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import torch
+from hyperpyyaml import load_hyperpyyaml
+from modelscope import snapshot_download
+from cosyvoice.cli.frontend import CosyVoiceFrontEnd
+from cosyvoice.cli.model import CosyVoiceModel
+class CosyVoice:
+    """High-level text-to-speech entry point bundling a frontend and a CosyVoiceModel.
+    ``model_dir`` is either a local directory or, when that path does not
+    exist, an identifier passed to ``snapshot_download``. Instruct inference
+    is enabled when the given ``model_dir`` string contains '-Instruct'.
+    """
+    def __init__(self, model_dir):
+        """Load ``cosyvoice.yaml`` plus the frontend resources and the three model checkpoints from model_dir."""
+        instruct = '-Instruct' in model_dir
+        # Keep the name as given by the caller for error messages
+        self.model_dir = model_dir
+        if not os.path.exists(model_dir):
+            model_dir = snapshot_download(model_dir)
+        with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f:
+            configs = load_hyperpyyaml(f)
+        self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
+                                          configs['feat_extractor'],
+                                          '{}/campplus.onnx'.format(model_dir),
+                                          '{}/speech_tokenizer_v1.onnx'.format(model_dir),
+                                          '{}/spk2info.pt'.format(model_dir),
+                                          instruct,
+                                          configs['allowed_special'])
+        self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'])
+        self.model.load('{}/llm.pt'.format(model_dir),
+                        '{}/flow.pt'.format(model_dir),
+                        '{}/hift.pt'.format(model_dir))
+        del configs
+    def list_avaliable_spks(self):
+        """Return the speaker ids known to the frontend as a list."""
+        return list(self.frontend.spk2info.keys())
+    # Correctly spelled alias; the misspelled name is kept for existing callers.
+    list_available_spks = list_avaliable_spks
+    def _synthesize(self, tts_text, build_input):
+        """Normalize and split tts_text, run the model per segment and concatenate the audio.
+        ``build_input`` maps one normalized text segment to the keyword
+        arguments of ``self.model.inference``. Returns ``{'tts_speech': tensor}``
+        with the per-segment outputs concatenated along dim 1.
+        """
+        tts_speeches = []
+        for segment in self.frontend.text_normalize(tts_text, split=True):
+            model_output = self.model.inference(**build_input(segment))
+            tts_speeches.append(model_output['tts_speech'])
+        return {'tts_speech': torch.concat(tts_speeches, dim=1)}
+    def inference_sft(self, tts_text, spk_id):
+        """Synthesize tts_text with the stored speaker ``spk_id``."""
+        return self._synthesize(
+            tts_text, lambda segment: self.frontend.frontend_sft(segment, spk_id))
+    def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k):
+        """Synthesize tts_text conditioned on a prompt transcript and its prompt speech."""
+        prompt_text = self.frontend.text_normalize(prompt_text, split=False)
+        return self._synthesize(
+            tts_text,
+            lambda segment: self.frontend.frontend_zero_shot(segment, prompt_text, prompt_speech_16k))
+    def inference_cross_lingual(self, tts_text, prompt_speech_16k):
+        """Synthesize tts_text conditioned on prompt speech only.
+        Raises ValueError when the frontend was created in instruct mode.
+        """
+        if self.frontend.instruct is True:
+            raise ValueError('{} do not support cross_lingual inference'.format(self.model_dir))
+        return self._synthesize(
+            tts_text,
+            lambda segment: self.frontend.frontend_cross_lingual(segment, prompt_speech_16k))
+    def inference_instruct(self, tts_text, spk_id, instruct_text):
+        """Synthesize tts_text with speaker ``spk_id`` guided by ``instruct_text``.
+        Raises ValueError when the frontend was not created in instruct mode.
+        """
+        if self.frontend.instruct is False:
+            raise ValueError('{} do not support instruct inference'.format(self.model_dir))
+        instruct_text = self.frontend.text_normalize(instruct_text, split=False)
+        return self._synthesize(
+            tts_text,
+            lambda segment: self.frontend.frontend_instruct(segment, spk_id, instruct_text))
--- a/third_party/GLM-4-Voice/cosyvoice/cli/frontend.py
+++ b/third_party/GLM-4-Voice/cosyvoice/cli/frontend.py
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from functools import partial
+import onnxruntime
+import torch
+import numpy as np
+import whisper
+from typing import Callable
+import torchaudio.compliance.kaldi as kaldi
+import torchaudio
+import os
+import re
+import inflect
+try:
+    import ttsfrd
+    use_ttsfrd = True
+except ImportError:
+    print("failed to import ttsfrd, use WeTextProcessing instead")
+    from tn.chinese.normalizer import Normalizer as ZhNormalizer
+    from tn.english.normalizer import Normalizer as EnNormalizer
+    use_ttsfrd = False
+from cosyvoice.utils.frontend_utils import contains_chinese, replace_blank, replace_corner_mark, remove_bracket, spell_out_number, split_paragraph
+class CosyVoiceFrontEnd:
+    """Turn raw text and prompt audio into the tensors consumed by the CosyVoice model.
+    Holds the text tokenizer, the acoustic feature extractor, two ONNX
+    sessions (speaker embedding and speech tokenizer), an optional table of
+    stored speakers and a text normalizer (ttsfrd when importable, otherwise
+    the WeTextProcessing normalizers).
+    """
+    def __init__(self,
+                 get_tokenizer: Callable,
+                 feat_extractor: Callable,
+                 campplus_model: str,
+                 speech_tokenizer_model: str,
+                 spk2info: str = '',
+                 instruct: bool = False,
+                 allowed_special: str = 'all'):
+        """
+        params:
+            get_tokenizer: zero-argument factory returning the text tokenizer
+            feat_extractor: callable applied to prompt speech in _extract_speech_feat
+            campplus_model: path of the ONNX model used for speaker embeddings
+            speech_tokenizer_model: path of the ONNX speech tokenizer model
+            spk2info: path of a torch-saved speaker table; ignored when it does not exist
+            instruct: whether instruct inference is enabled
+            allowed_special: forwarded to tokenizer.encode
+        """
+        self.tokenizer = get_tokenizer()
+        self.feat_extractor = feat_extractor
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        option = onnxruntime.SessionOptions()
+        option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+        option.intra_op_num_threads = 1
+        self.campplus_session = onnxruntime.InferenceSession(campplus_model, sess_options=option, providers=["CPUExecutionProvider"])
+        tokenizer_provider = "CUDAExecutionProvider" if torch.cuda.is_available() else "CPUExecutionProvider"
+        self.speech_tokenizer_session = onnxruntime.InferenceSession(speech_tokenizer_model, sess_options=option, providers=[tokenizer_provider])
+        # Always define spk2info so that a missing speaker file yields an empty
+        # table instead of an AttributeError on first use.
+        self.spk2info = {}
+        if os.path.exists(spk2info):
+            # NOTE(review): torch.load unpickles the file - only load speaker tables from trusted sources.
+            self.spk2info = torch.load(spk2info, map_location=self.device)
+        self.instruct = instruct
+        self.allowed_special = allowed_special
+        self.inflect_parser = inflect.engine()
+        self.use_ttsfrd = use_ttsfrd
+        if self.use_ttsfrd:
+            self.frd = ttsfrd.TtsFrontendEngine()
+            ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+            resource_dir = '{}/../../pretrained_models/CosyVoice-ttsfrd/resource'.format(ROOT_DIR)
+            # initialize() must run even under `python -O`, where assert
+            # statements (and the calls inside them) are stripped; the
+            # exception type is kept for backward compatibility.
+            if self.frd.initialize(resource_dir) is not True:
+                raise AssertionError('failed to initialize ttsfrd resource')
+            self.frd.set_lang_type('pinyin')
+            self.frd.enable_pinyin_mix(True)
+            self.frd.set_breakmodel_index(1)
+        else:
+            self.zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False)
+            self.en_tn_model = EnNormalizer()
+    def _extract_text_token(self, text):
+        """Encode text and return (tokens of shape (1, n), length tensor of shape (1,)), both int32 on self.device."""
+        text_token = self.tokenizer.encode(text, allowed_special=self.allowed_special)
+        text_token = torch.tensor([text_token], dtype=torch.int32).to(self.device)
+        text_token_len = torch.tensor([text_token.shape[1]], dtype=torch.int32).to(self.device)
+        return text_token, text_token_len
+    def _extract_speech_token(self, speech):
+        """Run the ONNX speech tokenizer on a 128-bin whisper log-mel of speech; return (tokens (1, n), length (1,))."""
+        feat = whisper.log_mel_spectrogram(speech, n_mels=128)
+        session = self.speech_tokenizer_session
+        feat_input, len_input = session.get_inputs()[0], session.get_inputs()[1]
+        speech_token = session.run(None, {feat_input.name: feat.detach().cpu().numpy(),
+                                          len_input.name: np.array([feat.shape[2]], dtype=np.int32)})[0].flatten().tolist()
+        speech_token = torch.tensor([speech_token], dtype=torch.int32).to(self.device)
+        speech_token_len = torch.tensor([speech_token.shape[1]], dtype=torch.int32).to(self.device)
+        return speech_token, speech_token_len
+    def _extract_spk_embedding(self, speech):
+        """Return the speaker embedding of speech as a (1, dim) tensor on self.device."""
+        feat = kaldi.fbank(speech,
+                           num_mel_bins=80,
+                           dither=0,
+                           sample_frequency=16000)
+        # Mean-normalize each mel bin over time before feeding the embedding model
+        feat = feat - feat.mean(dim=0, keepdim=True)
+        embedding = self.campplus_session.run(None, {self.campplus_session.get_inputs()[0].name: feat.unsqueeze(dim=0).cpu().numpy()})[0].flatten().tolist()
+        embedding = torch.tensor([embedding]).to(self.device)
+        return embedding
+    def _extract_speech_feat(self, speech):
+        """Apply feat_extractor to speech; return (features with a leading batch dim of 1, length tensor (1,))."""
+        speech_feat = self.feat_extractor(speech).squeeze(dim=0).transpose(0, 1).to(self.device)
+        speech_feat = speech_feat.unsqueeze(dim=0)
+        speech_feat_len = torch.tensor([speech_feat.shape[1]], dtype=torch.int32).to(self.device)
+        return speech_feat, speech_feat_len
+    def text_normalize(self, text, split=True):
+        """Normalize text and optionally split it into synthesis-sized segments.
+        params:
+            text: raw input text
+            split: when exactly False, return the normalized string; otherwise
+                return the list of segments produced by split_paragraph
+        """
+        text = text.strip()
+        is_chinese = contains_chinese(text)
+        if self.use_ttsfrd:
+            text = self.frd.get_frd_extra_info(text, 'input')
+        elif is_chinese:
+            text = self.zh_tn_model.normalize(text)
+        else:
+            text = self.en_tn_model.normalize(text)
+        if is_chinese:
+            text = text.replace("\n", "")
+            text = replace_blank(text)
+            text = replace_corner_mark(text)
+            text = text.replace(".", "、")
+            text = text.replace(" - ", "，")
+            text = remove_bracket(text)
+            text = re.sub(r'[，,]+$', '。', text)
+        else:
+            text = spell_out_number(text, self.inflect_parser)
+        # Skip the (tokenizer-driven) paragraph splitting when the caller only
+        # wants the normalized string.
+        if split is False:
+            return text
+        tokenize = partial(self.tokenizer.encode, allowed_special=self.allowed_special)
+        return list(split_paragraph(text, tokenize, "zh" if is_chinese else "en", token_max_n=80,
+                                    token_min_n=60, merge_len=20,
+                                    comma_split=False))
+    def frontend_sft(self, tts_text, spk_id):
+        """Build model input for a stored speaker: text tokens plus the speaker's embedding for llm and flow."""
+        tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
+        embedding = self.spk2info[spk_id]['embedding']
+        model_input = {'text': tts_text_token, 'text_len': tts_text_token_len, 'llm_embedding': embedding, 'flow_embedding': embedding}
+        return model_input
+    def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k):
+        """Build model input from text, a prompt transcript and 16 kHz prompt speech."""
+        tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
+        prompt_text_token, prompt_text_token_len = self._extract_text_token(prompt_text)
+        # Acoustic features are computed on a 22.05 kHz copy; tokens and the
+        # speaker embedding are computed on the 16 kHz prompt.
+        prompt_speech_22050 = torchaudio.transforms.Resample(orig_freq=16000, new_freq=22050)(prompt_speech_16k)
+        speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_22050)
+        speech_token, speech_token_len = self._extract_speech_token(prompt_speech_16k)
+        embedding = self._extract_spk_embedding(prompt_speech_16k)
+        model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
+                       'prompt_text': prompt_text_token, 'prompt_text_len': prompt_text_token_len,
+                       'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
+                       'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
+                       'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
+                       'llm_embedding': embedding, 'flow_embedding': embedding}
+        return model_input
+    def frontend_cross_lingual(self, tts_text, prompt_speech_16k):
+        """Build zero-shot model input with an empty prompt text and the llm prompt entries removed."""
+        model_input = self.frontend_zero_shot(tts_text, '', prompt_speech_16k)
+        # in cross lingual mode, we remove prompt in llm
+        for key in ('prompt_text', 'prompt_text_len',
+                    'llm_prompt_speech_token', 'llm_prompt_speech_token_len'):
+            del model_input[key]
+        return model_input
+    def frontend_instruct(self, tts_text, spk_id, instruct_text):
+        """Build sft model input with the instruction (plus '<endofprompt>') as prompt text."""
+        model_input = self.frontend_sft(tts_text, spk_id)
+        # in instruct mode, we remove spk_embedding in llm due to information leakage
+        del model_input['llm_embedding']
+        instruct_text_token, instruct_text_token_len = self._extract_text_token(instruct_text + '<endofprompt>')
+        model_input['prompt_text'] = instruct_text_token
+        model_input['prompt_text_len'] = instruct_text_token_len
+        return model_input
--- a/third_party/GLM-4-Voice/cosyvoice/cli/model.py
+++ b/third_party/GLM-4-Voice/cosyvoice/cli/model.py
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+class CosyVoiceModel:
+    """Chain an llm, a flow model and a hift model into one speech pipeline.
+    The llm turns text into speech tokens, the flow model turns speech tokens
+    into a mel, and the hift model turns the mel into ``tts_speech``.
+    ``inference`` runs the whole chain; ``text_to_token`` and
+    ``token_to_speech`` run the first and the last two stages separately.
+    The tensor defaults in the public signatures are created once when the
+    class body is evaluated; this class only moves them with ``.to``.
+    """
+    def __init__(self,
+                 llm: torch.nn.Module,
+                 flow: torch.nn.Module,
+                 hift: torch.nn.Module):
+        """Store the three sub-models and pick cuda when available, else cpu."""
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.llm = llm
+        self.flow = flow
+        self.hift = hift
+    def load(self, llm_model, flow_model, hift_model):
+        """Load the three state dicts, move each model to ``self.device`` and set eval mode.
+        Args:
+            llm_model: checkpoint location for the llm, passed to ``torch.load``
+            flow_model: checkpoint location for the flow model
+            hift_model: checkpoint location for the hift model
+        """
+        # NOTE(review): torch.load unpickles its input; only load trusted
+        # checkpoints (or pass weights_only=True where the torch version allows).
+        for module, checkpoint in ((self.llm, llm_model),
+                                   (self.flow, flow_model),
+                                   (self.hift, hift_model)):
+            module.load_state_dict(torch.load(checkpoint, map_location=self.device))
+            module.to(self.device).eval()
+    def _generate_speech_token(self, text, text_len, llm_embedding,
+                               prompt_text, prompt_text_len,
+                               llm_prompt_speech_token, llm_prompt_speech_token_len):
+        """Run the llm stage on ``self.device`` and return its speech tokens."""
+        return self.llm.inference(text=text.to(self.device),
+                                  text_len=text_len.to(self.device),
+                                  prompt_text=prompt_text.to(self.device),
+                                  prompt_text_len=prompt_text_len.to(self.device),
+                                  prompt_speech_token=llm_prompt_speech_token.to(self.device),
+                                  prompt_speech_token_len=llm_prompt_speech_token_len.to(self.device),
+                                  embedding=llm_embedding.to(self.device),
+                                  beam_size=1,
+                                  sampling=25,
+                                  max_token_text_ratio=30,
+                                  min_token_text_ratio=3)
+    def _synthesize_speech(self, tts_speech_token, flow_embedding,
+                           flow_prompt_speech_token, flow_prompt_speech_token_len,
+                           prompt_speech_feat, prompt_speech_feat_len):
+        """Run the flow and hift stages and return ``{'tts_speech': <cpu tensor>}``."""
+        # token_len is taken from dimension 1 of the speech token tensor
+        token_len = torch.tensor([tts_speech_token.size(1)], dtype=torch.int32).to(self.device)
+        tts_mel = self.flow.inference(token=tts_speech_token,
+                                      token_len=token_len,
+                                      prompt_token=flow_prompt_speech_token.to(self.device),
+                                      prompt_token_len=flow_prompt_speech_token_len.to(self.device),
+                                      prompt_feat=prompt_speech_feat.to(self.device),
+                                      prompt_feat_len=prompt_speech_feat_len.to(self.device),
+                                      embedding=flow_embedding.to(self.device))
+        tts_speech = self.hift.inference(mel=tts_mel).cpu()
+        torch.cuda.empty_cache()
+        return {'tts_speech': tts_speech}
+    def inference(self, text, text_len, flow_embedding, llm_embedding=torch.zeros(0, 192),
+                  prompt_text=torch.zeros(1, 0, dtype=torch.int32), prompt_text_len=torch.zeros(1, dtype=torch.int32),
+                  llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), llm_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
+                  flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), flow_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
+                  prompt_speech_feat=torch.zeros(1, 0, 80), prompt_speech_feat_len=torch.zeros(1, dtype=torch.int32)):
+        """Run text -> speech tokens -> mel -> speech.
+        Returns:
+            dict: ``{'tts_speech': tensor}`` with the tensor moved to cpu
+        """
+        tts_speech_token = self._generate_speech_token(
+            text, text_len, llm_embedding, prompt_text, prompt_text_len,
+            llm_prompt_speech_token, llm_prompt_speech_token_len)
+        return self._synthesize_speech(
+            tts_speech_token, flow_embedding,
+            flow_prompt_speech_token, flow_prompt_speech_token_len,
+            prompt_speech_feat, prompt_speech_feat_len)
+    def text_to_token(self, text, text_len, flow_embedding, llm_embedding=torch.zeros(0, 192),
+                  prompt_text=torch.zeros(1, 0, dtype=torch.int32), prompt_text_len=torch.zeros(1, dtype=torch.int32),
+                  llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), llm_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
+                  flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), flow_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
+                  prompt_speech_feat=torch.zeros(1, 0, 80), prompt_speech_feat_len=torch.zeros(1, dtype=torch.int32)):
+        """Run only the llm stage and return the speech tokens.
+        The flow-related arguments are accepted so the signature matches
+        ``inference``; they are not used here.
+        """
+        return self._generate_speech_token(
+            text, text_len, llm_embedding, prompt_text, prompt_text_len,
+            llm_prompt_speech_token, llm_prompt_speech_token_len)
+    def token_to_speech(self, tts_speech_token, flow_embedding, llm_embedding=torch.zeros(0, 192),
+                  prompt_text=torch.zeros(1, 0, dtype=torch.int32), prompt_text_len=torch.zeros(1, dtype=torch.int32),
+                  llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), llm_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
+                  flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), flow_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
+                  prompt_speech_feat=torch.zeros(1, 0, 80), prompt_speech_feat_len=torch.zeros(1, dtype=torch.int32)):
+        """Run only the flow and hift stages on already generated speech tokens.
+        ``tts_speech_token`` is passed to the flow model as given (it is not
+        moved to ``self.device`` here). The llm-related arguments are accepted
+        so the signature matches ``inference``; they are not used here.
+        Returns:
+            dict: ``{'tts_speech': tensor}`` with the tensor moved to cpu
+        """
+        return self._synthesize_speech(
+            tts_speech_token, flow_embedding,
+            flow_prompt_speech_token, flow_prompt_speech_token_len,
+            prompt_speech_feat, prompt_speech_feat_len)
\ No newline at end of file
--- a/third_party/GLM-4-Voice/cosyvoice/dataset/__init__.py
+++ b/third_party/GLM-4-Voice/cosyvoice/dataset/__init__.py
--- a/third_party/GLM-4-Voice/cosyvoice/dataset/dataset.py
+++ b/third_party/GLM-4-Voice/cosyvoice/dataset/dataset.py
+# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
+#               2024 Alibaba Inc (authors: Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import random
+import json
+import math
+from functools import partial
+import torch
+import torch.distributed as dist
+from torch.utils.data import IterableDataset
+from cosyvoice.utils.file_utils import read_lists, read_json_lists
+class Processor(IterableDataset):
+    """Dataset stage that feeds an iterator over ``source`` through ``f``.
+    Iterating calls ``f(iter(source), *args, **kw)`` and uses whatever it
+    returns as the iterator of this dataset.
+    """
+    def __init__(self, source, f, *args, **kw):
+        """Remember the upstream source, the callable and its extra arguments."""
+        assert callable(f)
+        self.source, self.f = source, f
+        self.args, self.kw = args, kw
+    def set_epoch(self, epoch):
+        """Forward the epoch number to the upstream source."""
+        self.source.set_epoch(epoch)
+    def __iter__(self):
+        """ Return the result of applying the stored processor to an
+            iterator over the source dataset.
+        """
+        assert self.source is not None
+        assert callable(self.f)
+        upstream = iter(self.source)
+        return self.f(upstream, *self.args, **self.kw)
+    def apply(self, f):
+        """Return a new Processor that runs ``f`` on top of this one, reusing
+        this stage's extra positional and keyword arguments.
+        """
+        assert callable(f)
+        return Processor(self, f, *self.args, **self.kw)
+class DistributedSampler:
+    """Pick the list indices that belong to the current rank and worker.
+    Rank / world size come from ``torch.distributed`` (1 process when it is
+    not initialized); worker id / worker count come from
+    ``torch.utils.data.get_worker_info()`` (1 worker outside a worker process).
+    """
+    def __init__(self, shuffle=True, partition=True):
+        """
+            Args:
+                shuffle(bool): shuffle indices (seeded by the epoch) before
+                    splitting by rank; only applied when ``partition`` is True
+                partition(bool): whether to split the indices across ranks
+        """
+        self.epoch = -1
+        self.update()
+        self.shuffle = shuffle
+        self.partition = partition
+    def update(self):
+        """ Refresh rank/world_size/worker_id/num_workers from the current
+            process and return them as a dict.
+        """
+        assert dist.is_available()
+        if dist.is_initialized():
+            self.rank = dist.get_rank()
+            self.world_size = dist.get_world_size()
+        else:
+            self.rank = 0
+            self.world_size = 1
+        worker_info = torch.utils.data.get_worker_info()
+        if worker_info is None:
+            self.worker_id = 0
+            self.num_workers = 1
+        else:
+            self.worker_id = worker_info.id
+            self.num_workers = worker_info.num_workers
+        return dict(rank=self.rank,
+                    world_size=self.world_size,
+                    worker_id=self.worker_id,
+                    num_workers=self.num_workers)
+    def set_epoch(self, epoch):
+        """Set the epoch used as the shuffle seed in ``sample``."""
+        self.epoch = epoch
+    def sample(self, data):
+        """ Sample indices according to rank/world_size/num_workers
+            Args:
+                data(List): input data list; only its length is used
+            Returns:
+                List[int]: indices into ``data`` assigned to this rank and
+                    worker; an empty list when ``data`` is empty
+        """
+        indexes = list(range(len(data)))
+        # Nothing to distribute; the padding below would divide by zero.
+        if not indexes:
+            return indexes
+        if self.partition:
+            if self.shuffle:
+                # same seed on every rank, so all ranks see the same order
+                random.Random(self.epoch).shuffle(indexes)
+            # repeat indices so that every rank receives at least one
+            if len(indexes) < self.world_size:
+                indexes = indexes * math.ceil(self.world_size / len(indexes))
+                indexes = indexes[:self.world_size]
+            indexes = indexes[self.rank::self.world_size]
+        # repeat indices so that every worker receives at least one
+        if len(indexes) < self.num_workers:
+            indexes = indexes * math.ceil(self.num_workers / len(indexes))
+            indexes = indexes[:self.num_workers]
+        indexes = indexes[self.worker_id::self.num_workers]
+        return indexes
+class DataList(IterableDataset):
+    """Iterable over the entries of ``lists`` selected by a DistributedSampler.
+    Each yielded item is a dict holding the entry under ``'src'`` together
+    with the sampler's rank/world_size/worker_id/num_workers values.
+    """
+    def __init__(self, lists, shuffle=True, partition=True):
+        """Keep the entry list and create the sampler that selects from it."""
+        self.lists = lists
+        self.sampler = DistributedSampler(shuffle, partition)
+    def set_epoch(self, epoch):
+        """Forward the epoch number to the sampler."""
+        self.sampler.set_epoch(epoch)
+    def __iter__(self):
+        """Yield one dict per entry assigned to this rank and worker."""
+        # refresh rank/worker info first: it is used both for sampling and
+        # as extra fields on every yielded item
+        sampler_info = self.sampler.update()
+        for position in self.sampler.sample(self.lists):
+            yield {'src': self.lists[position], **sampler_info}
+def Dataset(data_list_file,
+            data_pipeline,
+            mode='train',
+            shuffle=True,
+            partition=True,
+            tts_file='',
+            prompt_utt2data=''):
+    """ Construct dataset from arguments
+        The entries read from ``data_list_file`` are wrapped in a DataList
+        (which shuffles / partitions them per rank and worker) and then
+        passed through every function of ``data_pipeline`` in order, each
+        wrapped in a Processor and called with ``mode=mode``.
+        In inference mode only the entries referenced by the utterances of
+        ``tts_file`` are kept (in sorted order, so that every process sees
+        the same list), and the first pipeline function additionally
+        receives ``tts_data``.
+        Args:
+            data_list_file: list file, read with ``read_lists``
+            data_pipeline(List[Callable]): processing functions applied in
+                order; the list passed in is not modified
+            mode(str): 'train' or 'inference'
+            shuffle(bool): forwarded to DataList
+            partition(bool): whether to do data partition in terms of rank
+            tts_file(str): JSON file keyed by utterance, inference mode only
+            prompt_utt2data(str): file read with ``read_json_lists``; used
+                as a mapping from utterance to list entry, inference mode only
+        Returns:
+            the DataList wrapped in one Processor per pipeline function
+    """
+    assert mode in ['train', 'inference']
+    lists = read_lists(data_list_file)
+    # work on a copy so the caller's pipeline list is never modified
+    pipeline = list(data_pipeline)
+    if mode == 'inference':
+        with open(tts_file, encoding='utf-8') as f:
+            tts_data = json.load(f)
+        utt2lists = read_json_lists(prompt_utt2data)
+        # filter unnecessary file in inference mode; sorted() keeps the order
+        # identical in every process (plain set order of strings is not)
+        known_lists = set(lists)
+        lists = sorted({utt2lists[utt] for utt in tts_data if utt2lists[utt] in known_lists})
+        # map partial arg tts_data in inference mode
+        pipeline[0] = partial(pipeline[0], tts_data=tts_data)
+    dataset = DataList(lists, shuffle=shuffle, partition=partition)
+    for func in pipeline:
+        dataset = Processor(dataset, func, mode=mode)
+    return dataset