"scripts/ci/ci_install_rust.sh" did not exist on "21ec66e59e466ba8bef05478296fabfcb1f94421"
Commit 39ac40a9 authored by chenzk's avatar chenzk
Browse files

v1.0

parents
Pipeline #2747 failed with stages
in 0 seconds
import copy
import math
import os
import sys
import gradio as gr
import numpy as np
import torch
from numba import jit
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from transformers.generation import GenerationConfig
from vita_audio.data.processor.audio_processor import add_audio_input_contiguous
from vita_audio.tokenizer import get_audio_tokenizer
PUNCTUATION = "!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
@jit
def float_to_int16(audio: np.ndarray) -> np.ndarray:
am = int(math.ceil(float(np.abs(audio).max())) * 32768)
am = 32767 * 32768 // am
return np.multiply(audio, am).astype(np.int16)
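# Worked example for the scaling above (hypothetical values): for a waveform
# with peak |amplitude| 0.5, ceil(0.5) = 1 gives am = 1 * 32768, then
# am = 32767 * 32768 // 32768 = 32767, so floats in [-0.5, 0.5] map into the
# int16 range without clipping.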
def is_wav(file_path):
wav_extensions = {".wav"}
_, ext = os.path.splitext(file_path)
return ext.lower() in wav_extensions
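# _parse_text renders chat text as HTML for the Gradio chatbot: ``` fences
# become <pre><code> blocks, and markdown-sensitive characters inside them are
# escaped as HTML entities.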
def _parse_text(text):
lines = text.split("\n")
lines = [line for line in lines if line != ""]
count = 0
for i, line in enumerate(lines):
if "```" in line:
count += 1
items = line.split("`")
if count % 2 == 1:
lines[i] = f'<pre><code class="language-{items[-1]}">'
else:
lines[i] = "<br></code></pre>"
else:
if i > 0 and count % 2 == 1:
line = line.replace("`", r"\`")
line = line.replace("<", "&lt;")
line = line.replace(">", "&gt;")
line = line.replace(" ", "&nbsp;")
line = line.replace("*", "&ast;")
line = line.replace("_", "&lowbar;")
line = line.replace("-", "&#45;")
line = line.replace(".", "&#46;")
line = line.replace("!", "&#33;")
line = line.replace("(", "&#40;")
line = line.replace(")", "&#41;")
line = line.replace("$", "&#36;")
lines[i] = "<br>" + line
return "".join(lines)
def _launch_demo(model, tokenizer, audio_tokenizer):
def predict(_chatbot, task_history, task):
chat_query = task_history[-1][0]
print(task_history)
messages = []
audio_path_list = []
if task == "Spoken QA":
messages = [
{
"role": "system",
# "content": "Your Name: Luke\nYour Gender: male\n\nRespond in a text-audio interleaved manner.",
# "content": "Your Name: Lucy\nYour Gender: female\nRespond in a text-audio interleaved manner.",
"content": "Your Name: Omni\nYour Gender: female\nRespond in a text-audio interleaved manner.",
},
]
for i, (q, a) in enumerate(task_history):
if isinstance(q, (tuple, list)) and is_wav(q[0]):
audio_path_list.append(q[0])
messages = messages + [
{
"role": "user",
"content": f"\n<|audio|>",
},
]
else:
messages = messages + [
{
"role": "user",
"content": q,
},
]
if a is not None:
messages = messages + [
{
"role": "assistant",
"content": a,
},
]
model.generation_config.do_sample = False
elif task == "TTS":
for i, (q, a) in enumerate(task_history):
if isinstance(q, (tuple, list)) and is_wav(q[0]):
audio_path_list.append(q[0])
messages = messages + [
{
"role": "user",
"content": f"\n<|audio|>",
},
]
else:
messages = messages + [
{
"role": "user",
"content": f"Convert the text to speech.\n{q}",
},
]
if a is not None:
messages = messages + [
{
"role": "assistant",
"content": a,
},
]
model.generation_config.do_sample = True
elif task == "ASR":
for i, (q, a) in enumerate(task_history):
if isinstance(q, (tuple, list)) and is_wav(q[0]):
audio_path_list.append(q[0])
messages = messages + [
{
"role": "user",
"content": f"Convert the speech to text.\n<|audio|>",
},
]
else:
messages = messages + [
{
"role": "user",
"content": f"{q}",
},
]
if a is not None:
messages = messages + [
{
"role": "assistant",
"content": a,
},
]
model.generation_config.do_sample = False
add_generation_prompt = True
input_ids = tokenizer.apply_chat_template(
messages,
tokenize=True,
add_generation_prompt=add_generation_prompt,
# return_tensors="pt",
)
input_ids, audios, audio_indices = add_audio_input_contiguous(
input_ids, audio_path_list, tokenizer, audio_tokenizer
)
input_ids = torch.tensor([input_ids], dtype=torch.long).to("cuda")
# print("input", tokenizer.decode(input_ids[0], skip_special_tokens=False), flush=True)
if audio_path_list == []:
audios = None
audio_indices = None
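# Hand the audio inputs and their positions in the prompt to generate(); for
# text-only turns both are None and the model decodes as a plain LM.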
outputs = model.generate(
input_ids,
audios=audios,
audio_indices=audio_indices,
)
output = tokenizer.decode(outputs[0], skip_special_tokens=False)
# print(f"{output=}", flush=True)
audio_offset = tokenizer.convert_tokens_to_ids("<|audio_0|>")
begin_of_audio = tokenizer.convert_tokens_to_ids("<|begin_of_audio|>")
end_of_audio = tokenizer.convert_tokens_to_ids("<|end_of_audio|>")
im_end = tokenizer.convert_tokens_to_ids("<|im_end|>")
response = outputs[0][len(input_ids[0]) :]
audio_tokens = []
text_tokens = []
for token_id in response:
if token_id >= audio_offset:
audio_tokens.append(token_id - audio_offset)
elif (
(token_id.item() != begin_of_audio)
and (token_id.item() != end_of_audio)
and (token_id.item() != im_end)
):
text_tokens.append(token_id)
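# The response is split into audio tokens (ids shifted by audio_offset) and
# text tokens; any audio tokens are decoded to a waveform below, and gr.Audio
# consumes a (sample_rate, int16 array) tuple at 22050 Hz.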
if len(audio_tokens) > 0:
tts_speech = audio_tokenizer.decode(audio_tokens)
audio_np = float_to_int16(tts_speech.cpu().numpy())
tts_speech = (22050, audio_np)
else:
tts_speech = None
# import pdb;pdb.set_trace()
history_response = tokenizer.decode(text_tokens)
task_history[-1] = (chat_query, history_response)
_chatbot[-1] = (chat_query, history_response)
# print("query",chat_query)
# print("task_history",task_history)
# print(_chatbot)
# print("answer: ",outputs)
return _chatbot, tts_speech
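# add_text keeps the raw text in the chat history but, presumably to keep
# prompts clean, drops a single trailing punctuation mark from the copy stored
# in task_history.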
def add_text(history, task_history, text):
task_text = text
# import pdb;pdb.set_trace()
if len(text) >= 2 and text[-1] in PUNCTUATION and text[-2] not in PUNCTUATION:
task_text = text[:-1]
history = history + [(_parse_text(text), None)]
task_history = task_history + [(task_text, None)]
return history, task_history, ""
def add_audio(history, task_history, file):
print(file)
if file is None:
return history, task_history
history = history + [((file,), None)]
task_history = task_history + [((file,), None)]
return history, task_history
def reset_user_input():
# import pdb;pdb.set_trace()
return gr.update(value="")
def reset_state(task_history):
task_history.clear()
return []
with gr.Blocks(title="VITA-Audio-Plus-Vanilla") as demo:
gr.Markdown("""<center><font size=8>VITA-Audio-Plus-Vanilla</center>""")
gr.Markdown(
"""<center><font size=4>The deployment of the VITA-Audio-Plus-Vanilla model employs a non-streaming deployment approach. The currently deployed model is VITA-Audio-Plus-Vanilla. For the ASR and TTS tasks, only single-turn dialogues are supported. In the Spoken QA task, generated text is used as dialogue history to reduce the context length.</center>"""
)
chatbot = gr.Chatbot(
label="VITA-Audio-Plus-Vanilla", elem_classes="control-height", height=500
)
query = gr.Textbox(lines=2, label="Text Input")
task_history = gr.State([])
with gr.Row():
add_text_button = gr.Button("Submit Text (提交文本)")
add_audio_button = gr.Button("Submit Audio (提交音频)")
empty_bin = gr.Button("🧹 Clear History (清除历史)")
task = gr.Radio(choices=["ASR", "TTS", "Spoken QA"], label="TASK", value="Spoken QA")
with gr.Row(scale=1):
record_btn = gr.Audio(
sources=["microphone", "upload"],
type="filepath",
label="🎤 Record or Upload Audio (录音或上传音频)",
show_download_button=True,
waveform_options=gr.WaveformOptions(sample_rate=16000),
)
audio_output = gr.Audio(
label="Play", streaming=True, autoplay=True, show_download_button=True
)
add_text_button.click(
add_text, [chatbot, task_history, query], [chatbot, task_history], show_progress=True
).then(reset_user_input, [], [query]).then(
predict, [chatbot, task_history, task], [chatbot, audio_output], show_progress=True
)
empty_bin.click(reset_state, [task_history], [chatbot], show_progress=True)
add_audio_button.click(
add_audio,
[chatbot, task_history, record_btn],
[chatbot, task_history],
show_progress=True,
).then(predict, [chatbot, task_history, task], [chatbot, audio_output], show_progress=True)
server_port = 18806
demo.launch(
share=False,
debug=True,
server_name="0.0.0.0",
server_port=server_port,
show_api=False,
show_error=False,
)
def main():
model_name_or_path = "VITA-MLLM/VITA-Audio-Plus-Vanilla"
device_map = "cuda:0"
sys.path.append("third_party/GLM-4-Voice/")
sys.path.append("third_party/GLM-4-Voice/cosyvoice/")
sys.path.append("third_party/GLM-4-Voice/third_party/Matcha-TTS/")
from huggingface_hub import snapshot_download
audio_tokenizer_path = snapshot_download(repo_id="THUDM/glm-4-voice-tokenizer")
flow_path = snapshot_download(repo_id="THUDM/glm-4-voice-decoder")
audio_tokenizer_rank = 0
audio_tokenizer_type = "sensevoice_glm4voice"
torch_dtype = torch.bfloat16
audio_tokenizer = get_audio_tokenizer(
audio_tokenizer_path, audio_tokenizer_type, flow_path=flow_path, rank=audio_tokenizer_rank
)
from evaluation.get_chat_template import qwen2_chat_template as chat_template
tokenizer = AutoTokenizer.from_pretrained(
model_name_or_path,
trust_remote_code=True,
chat_template=chat_template,
)
# print(f"{tokenizer=}")
# print(f"{tokenizer.get_chat_template()=}")
model = AutoModelForCausalLM.from_pretrained(
model_name_or_path,
trust_remote_code=True,
device_map=device_map,
torch_dtype=torch_dtype,
attn_implementation="flash_attention_2",
).eval()
# print(f"{model.config.model_type=}")
model.generation_config = GenerationConfig.from_pretrained(
model_name_or_path, trust_remote_code=True
)
model.generation_config.max_new_tokens = 4096
model.generation_config.chat_format = "chatml"
model.generation_config.max_window_size = 8192
model.generation_config.use_cache = True
model.generation_config.do_sample = True
model.generation_config.temperature = 1.0
model.generation_config.top_k = 50
model.generation_config.top_p = 1.0
model.generation_config.num_beams = 1
model.generation_config.pad_token_id = tokenizer.pad_token_id
model.generation_config.mtp_inference_mode = [8192, 10]
_launch_demo(model, tokenizer, audio_tokenizer)
if __name__ == "__main__":
main()
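# Launch sketch (assumptions: this file is saved as web_demo.py, a CUDA GPU is
# available, and third_party/GLM-4-Voice has been checked out):
#   python web_demo.py
# then open http://localhost:18806 (the server_port set in _launch_demo).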
# Model code
modelCode=1582
# Model name
modelName=VITA-Audio_pytorch
# Model description
modelDescription=Greatly speeds up generation of the first audio chunk, removing a key bottleneck for real-time speech; overall inference is 3-5x faster than models of the same size.
# Application scenarios
appScenario=Inference, speech synthesis, broadcast media, film & TV, animation, healthcare, smart home, education
# Framework type
frameType=pytorch
# GLM-4-Voice-Decoder
GLM-4-Voice 是智谱 AI 推出的端到端语音模型。GLM-4-Voice 能够直接理解和生成中英文语音,进行实时语音对话,并且能够根据用户的指令改变语音的情感、语调、语速、方言等属性。
GLM-4-Voice is an end-to-end voice model launched by Zhipu AI. GLM-4-Voice can directly understand and generate Chinese and English speech, engage in real-time voice conversations, and change attributes such as emotion, intonation, speech rate, and dialect based on user instructions.
本仓库是 GLM-4-Voice 的 speech decoder 部分。GLM-4-Voice-Decoder 是基于 [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) 重新训练的支持流式推理的语音解码器,将离散化的语音 token 转化为连续的语音输出。最少只需要 10 个音频 token 即可开始生成,降低对话延迟。
The repo provides the speech decoder of GLM-4-Voice. GLM-4-Voice-Decoder is a speech decoder supporting streaming inference, retrained based on [CosyVoice](https://github.com/FunAudioLLM/CosyVoice), converting discrete speech tokens into continuous speech output. Generation can start with as few as 10 audio tokens, reducing conversation latency.
更多信息请参考我们的仓库 [GLM-4-Voice](https://github.com/THUDM/GLM-4-Voice).
For more information please refer to our repo [GLM-4-Voice](https://github.com/THUDM/GLM-4-Voice).
# GLM-4-Voice-Tokenizer
GLM-4-Voice 是智谱 AI 推出的端到端语音模型。GLM-4-Voice 能够直接理解和生成中英文语音,进行实时语音对话,并且能够根据用户的指令改变语音的情感、语调、语速、方言等属性。
GLM-4-Voice is an end-to-end voice model launched by Zhipu AI. GLM-4-Voice can directly understand and generate Chinese and English speech, engage in real-time voice conversations, and change attributes such as emotion, intonation, speech rate, and dialect based on user instructions.
本仓库是 GLM-4-Voice 的 speech tokenizer 部分。通过在 [Whisper](https://github.com/openai/whisper) 的 encoder 部分增加 vector quantization 进行训练,将连续的语音输入转化为离散的 token。每秒音频转化为 12.5 个离散 token。
The repo provides the speech tokenzier of GLM-4-Voice, which is trained by adding vector quantization to the encoder part of [Whisper](https://github.com/openai/whisper) and converts continuous speech input into discrete tokens. Each second of audio is converted into 12.5 discrete tokens.
更多信息请参考我们的仓库 [GLM-4-Voice](https://github.com/THUDM/GLM-4-Voice).
For more information please refer to our repo [GLM-4-Voice](https://github.com/THUDM/GLM-4-Voice).
expecttest
peft
xlsxwriter
termcolor
tabulate
tiktoken
matplotlib
datasets
einops
pybind11
tensorboardX
pyarrow
transformers==4.48.3
deepspeed
accelerate>=1.1.1
timm
flask
flask_restful
decord
natsort
# setuptools==69.5.1
setuptools
# cosyvoice2
pyworld
evaluate
hyperpyyaml
diffusers
conformer
hydra-core
lightning
gdown
wget
funasr
zhconv
jiwer
zhon
WeTextProcessing
inflect
openai-whisper
onnxruntime
modelscope
word2number
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupCosineLR",
"params": {
"total_num_steps": "auto",
"warmup_min_ratio": 0,
"warmup_num_steps": "auto",
"cos_min_ratio": 0.1
}
},
"zero_optimization": {
"stage": 1,
"offload_optimizer": {
"device": "none",
"pin_memory": true
},
"offload_param": {
"device": "none",
"pin_memory": true
},
"allgather_partitions": true,
"allgather_bucket_size": 5e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 5e8,
"contiguous_gradients": true,
"round_robin_gradients": true,
"sub_group_size": 1e12
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 100,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false,
"dump_state": false
}
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupCosineLR",
"params": {
"total_num_steps": "auto",
"warmup_min_ratio": 0,
"warmup_num_steps": "auto",
"cos_min_ratio": 0.1
}
},
"zero_optimization": {
"stage": 2,
"offload_optimizer": {
"device": "none",
"pin_memory": true
},
"offload_param": {
"device": "none",
"pin_memory": true
},
"allgather_partitions": true,
"allgather_bucket_size": 5e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 5e8,
"contiguous_gradients": true,
"round_robin_gradients": true,
"sub_group_size": 1e12
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 100,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false,
"dump_state": false
}
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"scheduler": {
"type": "WarmupCosineLR",
"params": {
"total_num_steps": "auto",
"warmup_min_ratio": 0,
"warmup_num_steps": "auto",
"cos_min_ratio": 0.1
}
},
"zero_optimization": {
"stage": 2,
"offload_optimizer": {
"device": "none",
"pin_memory": true
},
"offload_param": {
"device": "none",
"pin_memory": true
},
"allgather_partitions": true,
"allgather_bucket_size": 5e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 5e8,
"contiguous_gradients": true,
"round_robin_gradients": true,
"sub_group_size": 1e12
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 100,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false,
"dump_state": false
}
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupCosineLR",
"params": {
"total_num_steps": "auto",
"warmup_min_ratio": 0,
"warmup_num_steps": "auto",
"cos_min_ratio": 0.1
}
},
"zero_optimization": {
"stage": 2,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"offload_param": {
"device": "cpu",
"pin_memory": true
},
"allgather_partitions": true,
"allgather_bucket_size": 5e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 5e8,
"contiguous_gradients": true,
"round_robin_gradients": true,
"sub_group_size": 1e12
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 100,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false,
"dump_state": false
}
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupCosineLR",
"params": {
"total_num_steps": "auto",
"warmup_min_ratio": 0,
"warmup_num_steps": "auto",
"cos_min_ratio": 0.1
}
},
"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "none",
"pin_memory": true
},
"offload_param": {
"device": "none",
"pin_memory": true
},
"allgather_partitions": true,
"allgather_bucket_size": 2e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 1e9,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"stage3_prefetch_bucket_size": 1e9,
"stage3_param_persistence_threshold": 1e9,
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 100,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupCosineLR",
"params": {
"total_num_steps": "auto",
"warmup_min_ratio": 0,
"warmup_num_steps": "auto",
"cos_min_ratio": 0.1
}
},
"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"offload_param": {
"device": "cpu",
"pin_memory": true
},
"allgather_partitions": true,
"allgather_bucket_size": 5e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 5e8,
"contiguous_gradients": true,
"round_robin_gradients": true,
"sub_group_size": 1e12,
"stage3_prefetch_bucket_size": 5e8,
"stage3_param_persistence_threshold": 1e5,
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
},
"flops_profiler": {
"enabled": false,
"profile_step": 1,
"module_depth": -1,
"top_modules": 1,
"detailed": true,
"output_file": null
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 100,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false,
"dump_state": false
}
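The six JSON files above are DeepSpeed configurations that differ mainly in ZeRO stage (1, 2, or 3) and in whether optimizer/parameter state is offloaded to CPU; the training scripts below select one via --deepspeed. A minimal inspection sketch (assuming the ds_config_zero*.json file names referenced by those scripts):

import glob
import json

# Print the ZeRO stage and offload targets of each DeepSpeed config so a config
# can be matched to a training script's --deepspeed flag.
for path in sorted(glob.glob("scripts/deepspeed/ds_config_zero*.json")):
    with open(path) as f:
        cfg = json.load(f)
    zero = cfg.get("zero_optimization", {})
    print(
        path,
        "stage:", zero.get("stage"),
        "optimizer offload:", zero.get("offload_optimizer", {}).get("device"),
        "param offload:", zero.get("offload_param", {}).get("device"),
    )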
#!/bin/bash
set -e
set -x
timestamp="$1"
if [ -z "$timestamp" ]
then
timestamp=`date +'%Y%m%d_%H%M%S'`
fi
######################################################################
export ROOT_PATH=/data/
export CODE_PATH=${ROOT_PATH}/VITA-Audio/
export LOCAL_ROOT_PATH=/data_local/
export LOCAL_CODE_PATH=${LOCAL_ROOT_PATH}/VITA-Audio/
mkdir -p ${LOCAL_ROOT_PATH}
mkdir -p ${LOCAL_CODE_PATH}
apt install -y rsync
mkdir -p ${LOCAL_CODE_PATH}
rsync -a --exclude ".git" --exclude ".gitee" ${CODE_PATH}/ ${LOCAL_CODE_PATH}/
cd ${LOCAL_CODE_PATH}
rm -fr datasets
ln -s ${ROOT_PATH}/data datasets
######################################################################
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
source ${CODE_PATH}/scripts/set_env_ds_gpu.sh
pip3 install transformers==4.48.3
#pip3 install --no-index --find-links=/data/software/ transformers==4.48.3
######################################################################
OUTPUT_DIR=${ROOT_PATH}/output/LM/"$0"/${timestamp}/
mkdir -p ${OUTPUT_DIR}
rsync -avh $0 ${OUTPUT_DIR}
export HF_HOME="${ROOT_PATH}/data/HF_HOME/"
mkdir -p ${HF_HOME}
export HF_ENDPOINT=https://hf-mirror.com
export MODELSCOPE_CACHE="${ROOT_PATH}/data/MODELSCOPE_CACHE/"
mkdir -p ${MODELSCOPE_CACHE}
export LC_ALL="en_US.utf8"
######################################################################
LOG=${OUTPUT_DIR}/log_node${INDEX}.txt
exec &> >(tee -a "$LOG")
echo Logging output to "$LOG"
######################################################################
if true
#if false
then
MODEL_NAME_OR_PATH="/data/output/LM/scripts/deepspeed/sts_qwen25/finetune_glm4voice_mtp10_stage2.sh/VITA-Audio-Boost/"
MODEL_NAME_OR_PATH="/data/output/LM/scripts/deepspeed/sts_qwen25/finetune_glm4voice_mtp10_stage2.sh/VITA-Audio-Balance/"
AUDIO_TOKENIZER_PATH=${ROOT_PATH}/models/THUDM/glm-4-voice-tokenizer
FLOW_PATH=${ROOT_PATH}/models/THUDM/glm-4-voice-decoder
AUDIO_TOKENIZER_TYPE="glm4voice"
export PYTHONPATH=${PYTHONPATH}:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice/:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice/cosyvoice/:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice/third_party/Matcha-TTS/
fi
######################################################################
DISTRIBUTED_ARGS="
--nproc_per_node $NPROC_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
######################################################################
if true
#if false
then
apt-get update && apt install -y ffmpeg
JSON_PATH=${ROOT_PATH}/data/jsonl/fixie-ai/llama-questions/test.jsonl
torchrun ${DISTRIBUTED_ARGS} evaluation/evaluate_sqa.py \
--json_path ${JSON_PATH} \
--model_name_or_path ${MODEL_NAME_OR_PATH} \
--audio_tokenizer_path ${AUDIO_TOKENIZER_PATH} \
--audio_tokenizer_type ${AUDIO_TOKENIZER_TYPE} \
--flow_path ${FLOW_PATH} \
--output_dir ${OUTPUT_DIR}/llama-questions/
python evaluation/compute-acc-of-contain.py ${OUTPUT_DIR}/llama-questions/test_hyp_ref_text.json
echo "copypaste ACC: ${JSON_PATH}"
python evaluation/compute-acc-of-contain.py ${OUTPUT_DIR}/llama-questions/test_hyp_ref_speech.json
echo "copypaste ACC: ${JSON_PATH}"
JSON_PATH=${ROOT_PATH}/data/jsonl/fixie-ai/trivia_qa-audio/validation.jsonl
torchrun ${DISTRIBUTED_ARGS} evaluation/evaluate_sqa.py \
--json_path ${JSON_PATH} \
--model_name_or_path ${MODEL_NAME_OR_PATH} \
--audio_tokenizer_path ${AUDIO_TOKENIZER_PATH} \
--audio_tokenizer_type ${AUDIO_TOKENIZER_TYPE} \
--flow_path ${FLOW_PATH} \
--output_dir ${OUTPUT_DIR}/trivia_qa-audio/
python evaluation/compute-acc-of-contain.py ${OUTPUT_DIR}/trivia_qa-audio/validation_hyp_ref_text.json
echo "copypaste ACC: ${JSON_PATH}"
python evaluation/compute-acc-of-contain.py ${OUTPUT_DIR}/trivia_qa-audio/validation_hyp_ref_speech.json
echo "copypaste ACC: ${JSON_PATH}"
JSON_PATH=${ROOT_PATH}/data/jsonl/fixie-ai/spoken-web-questions/test.jsonl
torchrun ${DISTRIBUTED_ARGS} evaluation/evaluate_sqa.py \
--json_path ${JSON_PATH} \
--model_name_or_path ${MODEL_NAME_OR_PATH} \
--audio_tokenizer_path ${AUDIO_TOKENIZER_PATH} \
--audio_tokenizer_type ${AUDIO_TOKENIZER_TYPE} \
--flow_path ${FLOW_PATH} \
--output_dir ${OUTPUT_DIR}/spoken-web-questions/
python evaluation/compute-acc-of-contain.py ${OUTPUT_DIR}/spoken-web-questions/test_hyp_ref_text.json
echo "copypaste ACC: ${JSON_PATH}"
python evaluation/compute-acc-of-contain.py ${OUTPUT_DIR}/spoken-web-questions/test_hyp_ref_speech.json
echo "copypaste ACC: ${JSON_PATH}"
fi
######################################################################
if true
#if false
then
JSON_PATH=${ROOT_PATH}/data/jsonl/fixie-ai/librispeech_asr/validation.clean.jsonl
torchrun ${DISTRIBUTED_ARGS} evaluation/evaluate_asr.py \
--json_path ${JSON_PATH} \
--model_name_or_path ${MODEL_NAME_OR_PATH} \
--audio_tokenizer_path ${AUDIO_TOKENIZER_PATH} \
--audio_tokenizer_type ${AUDIO_TOKENIZER_TYPE} \
--flow_path ${FLOW_PATH} \
--output_dir ${OUTPUT_DIR}/librispeech_asr/
#python evaluation/compute-cer.py --char=1 --v=1 ${OUTPUT_DIR}/librispeech_asr/validation.clean_ref.txt ${OUTPUT_DIR}/librispeech_asr/validation.clean_hyp.txt
#echo "copypaste CER: ${JSON_PATH}"
python evaluation/compute-wer.py --char=1 --v=1 ${OUTPUT_DIR}/librispeech_asr/validation.clean_ref.txt ${OUTPUT_DIR}/librispeech_asr/validation.clean_hyp.txt
echo "copypaste WER: ${JSON_PATH}"
JSON_PATH=${ROOT_PATH}/data/jsonl/fixie-ai/librispeech_asr/validation.other.jsonl
torchrun ${DISTRIBUTED_ARGS} evaluation/evaluate_asr.py \
--json_path ${JSON_PATH} \
--model_name_or_path ${MODEL_NAME_OR_PATH} \
--audio_tokenizer_path ${AUDIO_TOKENIZER_PATH} \
--audio_tokenizer_type ${AUDIO_TOKENIZER_TYPE} \
--flow_path ${FLOW_PATH} \
--output_dir ${OUTPUT_DIR}/librispeech_asr/
#python evaluation/compute-cer.py --char=1 --v=1 ${OUTPUT_DIR}/librispeech_asr/validation.other_ref.txt ${OUTPUT_DIR}/librispeech_asr/validation.other_hyp.txt
#echo "copypaste CER: ${JSON_PATH}"
python evaluation/compute-wer.py --char=1 --v=1 ${OUTPUT_DIR}/librispeech_asr/validation.other_ref.txt ${OUTPUT_DIR}/librispeech_asr/validation.other_hyp.txt
echo "copypaste WER: ${JSON_PATH}"
JSON_PATH=${ROOT_PATH}/data/jsonl/fixie-ai/librispeech_asr/test.clean.jsonl
torchrun ${DISTRIBUTED_ARGS} evaluation/evaluate_asr.py \
--json_path ${JSON_PATH} \
--model_name_or_path ${MODEL_NAME_OR_PATH} \
--audio_tokenizer_path ${AUDIO_TOKENIZER_PATH} \
--audio_tokenizer_type ${AUDIO_TOKENIZER_TYPE} \
--flow_path ${FLOW_PATH} \
--output_dir ${OUTPUT_DIR}/librispeech_asr/
#python evaluation/compute-cer.py --char=1 --v=1 ${OUTPUT_DIR}/librispeech_asr/test.clean_ref.txt ${OUTPUT_DIR}/librispeech_asr/test.clean_hyp.txt
#echo "copypaste CER: ${JSON_PATH}"
python evaluation/compute-wer.py --char=1 --v=1 ${OUTPUT_DIR}/librispeech_asr/test.clean_ref.txt ${OUTPUT_DIR}/librispeech_asr/test.clean_hyp.txt
echo "copypaste WER: ${JSON_PATH}"
JSON_PATH=${ROOT_PATH}/data/jsonl/fixie-ai/librispeech_asr/test.other.jsonl
torchrun ${DISTRIBUTED_ARGS} evaluation/evaluate_asr.py \
--json_path ${JSON_PATH} \
--model_name_or_path ${MODEL_NAME_OR_PATH} \
--audio_tokenizer_path ${AUDIO_TOKENIZER_PATH} \
--audio_tokenizer_type ${AUDIO_TOKENIZER_TYPE} \
--flow_path ${FLOW_PATH} \
--output_dir ${OUTPUT_DIR}/librispeech_asr/
#python evaluation/compute-cer.py --char=1 --v=1 ${OUTPUT_DIR}/librispeech_asr/test.other_ref.txt ${OUTPUT_DIR}/librispeech_asr/test.other_hyp.txt
#echo "copypaste CER: ${JSON_PATH}"
python evaluation/compute-wer.py --char=1 --v=1 ${OUTPUT_DIR}/librispeech_asr/test.other_ref.txt ${OUTPUT_DIR}/librispeech_asr/test.other_hyp.txt
echo "copypaste WER: ${JSON_PATH}"
fi
######################################################################
if true
#if false
then
JSON_PATH=${ROOT_PATH}/data/jsonl/wenet-e2e/wenetspeech/TEST_MEETING.jsonl
torchrun ${DISTRIBUTED_ARGS} evaluation/evaluate_asr.py \
--json_path ${JSON_PATH} \
--model_name_or_path ${MODEL_NAME_OR_PATH} \
--audio_tokenizer_path ${AUDIO_TOKENIZER_PATH} \
--audio_tokenizer_type ${AUDIO_TOKENIZER_TYPE} \
--flow_path ${FLOW_PATH} \
--output_dir ${OUTPUT_DIR}/wenetspeech/
python evaluation/compute-cer.py --char=1 --v=1 ${OUTPUT_DIR}/wenetspeech/TEST_MEETING_ref.txt ${OUTPUT_DIR}/wenetspeech/TEST_MEETING_hyp.txt
echo "copypaste CER: ${JSON_PATH}"
python evaluation/compute-wer.py --char=1 --v=1 ${OUTPUT_DIR}/wenetspeech/TEST_MEETING_ref.txt ${OUTPUT_DIR}/wenetspeech/TEST_MEETING_hyp.txt
echo "copypaste WER: ${JSON_PATH}"
JSON_PATH=${ROOT_PATH}/data/jsonl/wenet-e2e/wenetspeech/TEST_NET.jsonl
torchrun ${DISTRIBUTED_ARGS} evaluation/evaluate_asr.py \
--json_path ${JSON_PATH} \
--model_name_or_path ${MODEL_NAME_OR_PATH} \
--audio_tokenizer_path ${AUDIO_TOKENIZER_PATH} \
--audio_tokenizer_type ${AUDIO_TOKENIZER_TYPE} \
--flow_path ${FLOW_PATH} \
--output_dir ${OUTPUT_DIR}/wenetspeech/
python evaluation/compute-cer.py --char=1 --v=1 ${OUTPUT_DIR}/wenetspeech/TEST_NET_ref.txt ${OUTPUT_DIR}/wenetspeech/TEST_NET_hyp.txt
echo "copypaste CER: ${JSON_PATH}"
python evaluation/compute-wer.py --char=1 --v=1 ${OUTPUT_DIR}/wenetspeech/TEST_NET_ref.txt ${OUTPUT_DIR}/wenetspeech/TEST_NET_hyp.txt
echo "copypaste WER: ${JSON_PATH}"
fi
######################################################################
if true
#if false
then
JSON_PATH=${ROOT_PATH}/data/jsonl/shenyunhang/AISHELL-1/test.jsonl
torchrun ${DISTRIBUTED_ARGS} evaluation/evaluate_asr.py \
--json_path ${JSON_PATH} \
--model_name_or_path ${MODEL_NAME_OR_PATH} \
--audio_tokenizer_path ${AUDIO_TOKENIZER_PATH} \
--audio_tokenizer_type ${AUDIO_TOKENIZER_TYPE} \
--flow_path ${FLOW_PATH} \
--output_dir ${OUTPUT_DIR}/AISHELL-1/
#python evaluation/compute-cer.py --char=1 --v=1 ${OUTPUT_DIR}/AISHELL-1/_test.clean_ref.txt ${OUTPUT_DIR}/AISHELL-1/test.clean_hyp.txt
#echo "copypaste CER: ${JSON_PATH}"
python evaluation/compute-wer.py --char=1 --v=1 ${OUTPUT_DIR}/AISHELL-1/test_ref.txt ${OUTPUT_DIR}/AISHELL-1/test_hyp.txt
echo "copypaste WER: ${JSON_PATH}"
fi
######################################################################
if true
#if false
then
JSON_PATH=${ROOT_PATH}/data/jsonl/mythicinfinity/libritts/test.clean.jsonl
torchrun ${DISTRIBUTED_ARGS} evaluation/evaluate_libritts.py \
--json_path ${JSON_PATH} \
--model_name_or_path ${MODEL_NAME_OR_PATH} \
--audio_tokenizer_path ${AUDIO_TOKENIZER_PATH} \
--audio_tokenizer_type ${AUDIO_TOKENIZER_TYPE} \
--flow_path ${FLOW_PATH} \
--output_dir ${OUTPUT_DIR}/libritts/ \
#python evaluation/compute-cer.py --char=1 --v=1 ${OUTPUT_DIR}/libritts/test.clean_ref.txt ${OUTPUT_DIR}/libritts/test.clean_hyp.txt
#echo "copypaste CER: ${JSON_PATH}"
python evaluation/compute-wer.py --char=1 --v=1 ${OUTPUT_DIR}/libritts/test.clean_ref.txt ${OUTPUT_DIR}/libritts/test.clean_hyp.txt
echo "copypaste WER: ${JSON_PATH}"
fi
######################################################################
if true
#if false
then
DATA_PATH=${ROOT_PATH}/data/BytedanceSpeech/seed-tts-eval/
torchrun ${DISTRIBUTED_ARGS} evaluation/evaluate_seedtts.py \
--data_path ${DATA_PATH} \
--model_name_or_path ${MODEL_NAME_OR_PATH} \
--audio_tokenizer_path ${AUDIO_TOKENIZER_PATH} \
--audio_tokenizer_type ${AUDIO_TOKENIZER_TYPE} \
--flow_path ${FLOW_PATH} \
--output_dir ${OUTPUT_DIR}/seed-tts/ \
--speaker_prompt
export ARNOLD_WORKER_GPU=${NPROC_PER_NODE}
cd ${LOCAL_CODE_PATH}/third_party/seed-tts-eval
bash cal_wer.sh ${DATA_PATH}/seedtts_testset/zh/meta.lst ${OUTPUT_DIR}/seed-tts/zh/ zh
echo "copypaste WER: ${DATA_PATH} zh"
bash cal_wer.sh ${DATA_PATH}/seedtts_testset/zh/hardcase.lst ${OUTPUT_DIR}/seed-tts/hardcase/ zh
echo "copypaste WER: ${DATA_PATH} hardcase"
bash cal_wer.sh ${DATA_PATH}/seedtts_testset/en/meta.lst ${OUTPUT_DIR}/seed-tts/en/ en
echo "copypaste WER: ${DATA_PATH} en"
bash cal_sim.sh ${DATA_PATH}/seedtts_testset/zh/meta.lst ${OUTPUT_DIR}/seed-tts/zh/ ${DATA_PATH}/wavlm_large_finetune.pth
echo "copypaste SIM: ${DATA_PATH} zh"
bash cal_sim.sh ${DATA_PATH}/seedtts_testset/zh/hardcase.lst ${OUTPUT_DIR}/seed-tts/hardcase/ ${DATA_PATH}/wavlm_large_finetune.pth
echo "copypaste SIM: ${DATA_PATH} hardcase"
bash cal_sim.sh ${DATA_PATH}/seedtts_testset/en/meta.lst ${OUTPUT_DIR}/seed-tts/en/ ${DATA_PATH}/wavlm_large_finetune.pth
echo "copypaste SIM: ${DATA_PATH} en"
cd ${LOCAL_CODE_PATH}
fi
######################################################################
if false
then
DATA_PATH=${ROOT_PATH}/data/BytedanceSpeech/seed-tts-eval/
torchrun ${DISTRIBUTED_ARGS} evaluation/evaluate_seedtts.py \
--data_path ${DATA_PATH} \
--model_name_or_path ${MODEL_NAME_OR_PATH} \
--audio_tokenizer_path ${AUDIO_TOKENIZER_PATH} \
--audio_tokenizer_type ${AUDIO_TOKENIZER_TYPE} \
--flow_path ${FLOW_PATH} \
--output_dir ${OUTPUT_DIR}/seed-tts/
export ARNOLD_WORKER_GPU=${NPROC_PER_NODE}
cd ${LOCAL_CODE_PATH}/third_party/seed-tts-eval
bash cal_wer.sh ${DATA_PATH}/seedtts_testset/zh/meta.lst ${OUTPUT_DIR}/seed-tts/zh/ zh
echo "copypaste WER: ${DATA_PATH} zh"
bash cal_wer.sh ${DATA_PATH}/seedtts_testset/zh/hardcase.lst ${OUTPUT_DIR}/seed-tts/hardcase/ zh
echo "copypaste WER: ${DATA_PATH} hardcase"
bash cal_wer.sh ${DATA_PATH}/seedtts_testset/en/meta.lst ${OUTPUT_DIR}/seed-tts/en/ en
echo "copypaste WER: ${DATA_PATH} en"
cd ${LOCAL_CODE_PATH}
fi
set +x
#!/bin/bash
set -e
set -x
SEQ_LENGTH="$1"
if [ -z "$SEQ_LENGTH" ]
then
SEQ_LENGTH=32768
fi
timestamp="$2"
if [ -z "$timestamp" ]
then
timestamp=`date +'%Y%m%d_%H'`0000
fi
######################################################################
export ROOT_PATH=/data/
export CODE_PATH=${ROOT_PATH}/VITA-Audio/
export LOCAL_ROOT_PATH=/data_local/
export LOCAL_CODE_PATH=${LOCAL_ROOT_PATH}/VITA-Audio/
mkdir -p ${LOCAL_ROOT_PATH}
mkdir -p ${LOCAL_CODE_PATH}
apt update
apt install -y rsync
rsync -a --exclude ".git" --exclude ".gitee" ${CODE_PATH}/ ${LOCAL_CODE_PATH}/
cd ${LOCAL_CODE_PATH}
rm -fr datasets
ln -s ${ROOT_PATH}/data datasets
######################################################################
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
source ${LOCAL_CODE_PATH}/scripts/set_env_ds_gpu.sh
pip3 install transformers==4.48.3
#pip3 install --no-index --find-links=/data/software/ transformers==4.48.3
######################################################################
OUTPUT_DIR=${ROOT_PATH}/output/LM/"$0"/${timestamp}/
mkdir -p ${OUTPUT_DIR}
rsync -avh $0 ${OUTPUT_DIR}
export HF_HOME="${ROOT_PATH}/data/HF_HOME_node${INDEX}/"
mkdir -p ${HF_HOME}
export TRITON_CACHE_DIR=${LOCAL_CODE_PATH}
export PYTHONPATH=$PYTHONPATH:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice/third_party/Matcha-TTS/
######################################################################
LOG=${OUTPUT_DIR}/log_node${INDEX}.txt
exec &> >(tee -a "$LOG")
echo Logging output to "$LOG"
echo ${@}
######################################################################
DATA_PATH=${LOCAL_CODE_PATH}/configs/sts_finetune_stage1.yaml
MODEL_NAME_OR_PATH=${ROOT_PATH}/output/LM/scripts/deepspeed/sts_qwen25/finetune_glm4voice_mtp1_stage1.sh/20250313_040353/
AUDIO_TOKENIZER_PATH=${ROOT_PATH}/models/THUDM/glm-4-voice-tokenizer
rsync -avh ${DATA_PATH} ${OUTPUT_DIR}
######################################################################
DISTRIBUTED_ARGS="
--nproc_per_node $NPROC_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
torchrun $DISTRIBUTED_ARGS tools/finetune_sts_v4_48_3.py \
--log_level "info" \
--do_train \
--overwrite_output_dir \
--config_name vita_audio/models/qwen2_mtp_v4_48_3/config_7B_mtp10.json \
--tokenizer_name $MODEL_NAME_OR_PATH \
--model_name_or_path $MODEL_NAME_OR_PATH \
--audio_tokenizer_path $AUDIO_TOKENIZER_PATH \
--audio_tokenizer_type "glm4voice" \
--dataset_name $DATA_PATH \
--bf16 True \
--tf32 True \
--torch_dtype bfloat16 \
--output_dir $OUTPUT_DIR \
--num_train_epochs 1 \
--max_steps 8000 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--gradient_accumulation_steps 16 \
--save_strategy "steps" \
--save_steps 0.1 \
--save_total_limit 2 \
--learning_rate 1.00e-3 \
--max_grad_norm 1.0 \
--weight_decay 0.0 \
--adam_beta1 0.9 \
--adam_beta2 0.95 \
--adam_epsilon 1e-8 \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--report_to "tensorboard" \
--model_max_length ${SEQ_LENGTH} \
--gradient_checkpointing True \
--deepspeed ${LOCAL_CODE_PATH}/scripts/deepspeed/ds_config_zero2.json \
--trust_remote_code False \
--ddp_timeout 7200 \
--ddp_backend ${DISTRIBUTED_BACKEND} \
--attn_implementation flash_attention_2 \
--seed 42 \
--data_seed 42 \
--reset_attention_mask \
--reset_position_ids \
--create_attention_mask false \
--create_attention_mask_2d false \
--dataloader_num_workers 8 \
--language-model-freeze \
--text-audio-interval-ratio 1 10 4 10 \
#--language-model-freeze \
#--dataset_joint false \
#--variable_length true \
#--tokenizer_name_or_path Qwen2Tokenizer \
#--bf16 True \
#--fp16 True \
#--tf32 True \
set +x
#!/bin/bash
set -e
set -x
SEQ_LENGTH="$1"
if [ -z "$SEQ_LENGTH" ]
then
SEQ_LENGTH=32768
fi
timestamp="$2"
if [ -z "$timestamp" ]
then
timestamp=`date +'%Y%m%d_%H'`0000
fi
######################################################################
export ROOT_PATH=/data/
export CODE_PATH=${ROOT_PATH}/VITA-Audio/
export LOCAL_ROOT_PATH=/data_local/
export LOCAL_CODE_PATH=${LOCAL_ROOT_PATH}/VITA-Audio/
mkdir -p ${LOCAL_ROOT_PATH}
mkdir -p ${LOCAL_CODE_PATH}
apt update
apt install -y rsync
rsync -a --exclude ".git" --exclude ".gitee" ${CODE_PATH}/ ${LOCAL_CODE_PATH}/
cd ${LOCAL_CODE_PATH}
rm -fr datasets
ln -s ${ROOT_PATH}/data datasets
######################################################################
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
source ${LOCAL_CODE_PATH}/scripts/set_env_ds_gpu.sh
pip3 install transformers==4.48.3
#pip3 install --no-index --find-links=/data/software/ transformers==4.48.3
######################################################################
OUTPUT_DIR=${ROOT_PATH}/output/LM/"$0"/${timestamp}/
mkdir -p ${OUTPUT_DIR}
rsync -avh $0 ${OUTPUT_DIR}
export HF_HOME="${ROOT_PATH}/data/HF_HOME_node${INDEX}/"
mkdir -p ${HF_HOME}
export TRITON_CACHE_DIR=${LOCAL_CODE_PATH}
export PYTHONPATH=$PYTHONPATH:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice/third_party/Matcha-TTS/
######################################################################
LOG=${OUTPUT_DIR}/log_node${INDEX}.txt
exec &> >(tee -a "$LOG")
echo Logging output to "$LOG"
echo ${@}
######################################################################
DATA_PATH=${LOCAL_CODE_PATH}/configs/sts_finetune_stage2.yaml
MODEL_NAME_OR_PATH=${ROOT_PATH}/output/LM/scripts/deepspeed/s2s_qwen25/finetune_glm4voice_mtp10_stage1.sh/20250315_022047/
AUDIO_TOKENIZER_PATH=${ROOT_PATH}/models/THUDM/glm-4-voice-tokenizer
rsync -avh ${DATA_PATH} ${OUTPUT_DIR}
######################################################################
DISTRIBUTED_ARGS="
--nproc_per_node $NPROC_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
torchrun $DISTRIBUTED_ARGS tools/finetune_sts_v4_48_3.py \
--log_level "info" \
--do_train \
--overwrite_output_dir \
--config_name ${MODEL_NAME_OR_PATH} \
--tokenizer_name $MODEL_NAME_OR_PATH \
--model_name_or_path $MODEL_NAME_OR_PATH \
--audio_tokenizer_path $AUDIO_TOKENIZER_PATH \
--audio_tokenizer_type "glm4voice" \
--dataset_name $DATA_PATH \
--bf16 True \
--tf32 True \
--torch_dtype bfloat16 \
--output_dir $OUTPUT_DIR \
--num_train_epochs 1 \
--max_steps 4000 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--gradient_accumulation_steps 16 \
--save_strategy "steps" \
--save_steps 0.1 \
--save_total_limit 2 \
--learning_rate 5.00e-5 \
--max_grad_norm 1.0 \
--weight_decay 0.1 \
--adam_beta1 0.9 \
--adam_beta2 0.95 \
--adam_epsilon 1e-8 \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--report_to "tensorboard" \
--model_max_length ${SEQ_LENGTH} \
--gradient_checkpointing True \
--deepspeed ${LOCAL_CODE_PATH}/scripts/deepspeed/ds_config_zero2_no_optimizer.json \
--trust_remote_code False \
--ddp_timeout 7200 \
--ddp_backend ${DISTRIBUTED_BACKEND} \
--attn_implementation flash_attention_2 \
--seed 42 \
--data_seed 42 \
--reset_attention_mask \
--reset_position_ids \
--create_attention_mask false \
--create_attention_mask_2d false \
--dataloader_num_workers 8 \
--mtp_model_lr_mult 1.00e1 \
--text-audio-interval-ratio 1 10 4 10 \
#--language-model-freeze \
#--dataset_joint false \
#--variable_length true \
#--tokenizer_name_or_path Qwen2Tokenizer \
#--bf16 True \
#--fp16 True \
#--tf32 True \
set +x
#!/bin/bash
set -e
set -x
SEQ_LENGTH="$1"
if [ -z "$SEQ_LENGTH" ]
then
SEQ_LENGTH=32768
fi
timestamp="$2"
if [ -z "$timestamp" ]
then
timestamp=`date +'%Y%m%d_%H'`0000
fi
######################################################################
export ROOT_PATH=/data/
export CODE_PATH=${ROOT_PATH}/VITA-Audio/
export LOCAL_ROOT_PATH=/data_local/
export LOCAL_CODE_PATH=${LOCAL_ROOT_PATH}/VITA-Audio/
mkdir -p ${LOCAL_ROOT_PATH}
mkdir -p ${LOCAL_CODE_PATH}
apt update
apt install -y rsync
rsync -a --exclude ".git" --exclude ".gitee" ${CODE_PATH}/ ${LOCAL_CODE_PATH}/
cd ${LOCAL_CODE_PATH}
rm -fr datasets
ln -s ${ROOT_PATH}/data datasets
######################################################################
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
source ${LOCAL_CODE_PATH}/scripts/set_env_ds_gpu.sh
pip3 install transformers==4.48.3
#pip3 install --no-index --find-links=/data/software/ transformers==4.48.3
######################################################################
OUTPUT_DIR=${ROOT_PATH}/output/LM/"$0"/${timestamp}/
mkdir -p ${OUTPUT_DIR}
rsync -avh $0 ${OUTPUT_DIR}
export HF_HOME="${ROOT_PATH}/data/HF_HOME_node${INDEX}/"
mkdir -p ${HF_HOME}
export TRITON_CACHE_DIR=${LOCAL_CODE_PATH}
export PYTHONPATH=$PYTHONPATH:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice/third_party/Matcha-TTS/
######################################################################
LOG=${OUTPUT_DIR}/log_node${INDEX}.txt
exec &> >(tee -a "$LOG")
echo Logging output to "$LOG"
echo ${@}
######################################################################
DATA_PATH=${LOCAL_CODE_PATH}/configs/sts_finetune_stage1.yaml
MODEL_NAME_OR_PATH=${ROOT_PATH}/output/LM/scripts/deepspeed/s2s_qwen25/finetune_glm4voice_stage1.sh/20250222_043913/
AUDIO_TOKENIZER_PATH=${ROOT_PATH}/models/THUDM/glm-4-voice-tokenizer
rsync -avh ${DATA_PATH} ${OUTPUT_DIR}
######################################################################
DISTRIBUTED_ARGS="
--nproc_per_node $NPROC_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
torchrun $DISTRIBUTED_ARGS tools/finetune_sts_v4_48_3.py \
--log_level "info" \
--do_train \
--overwrite_output_dir \
--config_name vita_audio/models/qwen2_mtp_v4_48_3/config_7B_mtp1.json \
--tokenizer_name $MODEL_NAME_OR_PATH \
--model_name_or_path $MODEL_NAME_OR_PATH \
--audio_tokenizer_path $AUDIO_TOKENIZER_PATH \
--audio_tokenizer_type "glm4voice" \
--dataset_name $DATA_PATH \
--bf16 True \
--tf32 True \
--torch_dtype bfloat16 \
--output_dir $OUTPUT_DIR \
--num_train_epochs 1 \
--max_steps 4000 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--gradient_accumulation_steps 16 \
--save_strategy "steps" \
--save_steps 0.1 \
--save_total_limit 2 \
--learning_rate 1.00e-3 \
--max_grad_norm 1.0 \
--weight_decay 0.0 \
--adam_beta1 0.9 \
--adam_beta2 0.95 \
--adam_epsilon 1e-8 \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--report_to "tensorboard" \
--model_max_length ${SEQ_LENGTH} \
--gradient_checkpointing True \
--deepspeed ${LOCAL_CODE_PATH}/scripts/deepspeed/ds_config_zero2.json \
--trust_remote_code False \
--ddp_timeout 7200 \
--ddp_backend ${DISTRIBUTED_BACKEND} \
--attn_implementation flash_attention_2 \
--seed 42 \
--data_seed 42 \
--reset_attention_mask \
--reset_position_ids \
--create_attention_mask false \
--create_attention_mask_2d false \
--dataloader_num_workers 8 \
--language-model-freeze \
--text-audio-interval-ratio 1 10 4 10 \
#--language-model-freeze \
#--dataset_joint false \
#--variable_length true \
#--tokenizer_name_or_path Qwen2Tokenizer \
#--bf16 True \
#--fp16 True \
#--tf32 True \
set +x
#!/bin/bash
set -e
set -x
SEQ_LENGTH="$1"
if [ -z "$SEQ_LENGTH" ]
then
SEQ_LENGTH=32768
fi
timestamp="$2"
if [ -z "$timestamp" ]
then
timestamp=`date +'%Y%m%d_%H'`0000
fi
######################################################################
export ROOT_PATH=/data/
export CODE_PATH=${ROOT_PATH}/VITA-Audio/
export LOCAL_ROOT_PATH=/data_local/
export LOCAL_CODE_PATH=${LOCAL_ROOT_PATH}/VITA-Audio/
mkdir -p ${LOCAL_ROOT_PATH}
mkdir -p ${LOCAL_CODE_PATH}
apt update
apt install -y rsync
rsync -a --exclude ".git" --exclude ".gitee" ${CODE_PATH}/ ${LOCAL_CODE_PATH}/
cd ${LOCAL_CODE_PATH}
rm -fr datasets
ln -s ${ROOT_PATH}/data datasets
######################################################################
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
source ${LOCAL_CODE_PATH}/scripts/set_env_ds_gpu.sh
pip3 install transformers==4.48.3
#pip3 install --no-index --find-links=/data/software/ transformers==4.48.3
######################################################################
OUTPUT_DIR=${ROOT_PATH}/output/LM/"$0"/${timestamp}/
mkdir -p ${OUTPUT_DIR}
rsync -avh $0 ${OUTPUT_DIR}
export HF_HOME="${ROOT_PATH}/data/HF_HOME_node${INDEX}/"
mkdir -p ${HF_HOME}
export TRITON_CACHE_DIR=${LOCAL_CODE_PATH}
export PYTHONPATH=$PYTHONPATH:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice/third_party/Matcha-TTS/
######################################################################
LOG=${OUTPUT_DIR}/log_node${INDEX}.txt
exec &> >(tee -a "$LOG")
echo Logging output to "$LOG"
echo ${@}
######################################################################
DATA_PATH=${LOCAL_CODE_PATH}/configs/sts_finetune_stage1.yaml
MODEL_NAME_OR_PATH=${ROOT_PATH}/models/Qwen/Qwen2.5-7B-Instruct/
AUDIO_TOKENIZER_PATH=${ROOT_PATH}/models/THUDM/glm-4-voice-tokenizer
rsync -avh ${DATA_PATH} ${OUTPUT_DIR}
######################################################################
DISTRIBUTED_ARGS="
--nproc_per_node $NPROC_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
torchrun $DISTRIBUTED_ARGS tools/finetune_sts_v4_48_3.py \
--log_level "info" \
--do_train \
--overwrite_output_dir \
--config_name ${MODEL_NAME_OR_PATH} \
--tokenizer_name $MODEL_NAME_OR_PATH \
--model_name_or_path $MODEL_NAME_OR_PATH \
--audio_tokenizer_path $AUDIO_TOKENIZER_PATH \
--audio_tokenizer_type "glm4voice" \
--dataset_name $DATA_PATH \
--bf16 True \
--tf32 True \
--torch_dtype bfloat16 \
--output_dir $OUTPUT_DIR \
--num_train_epochs 1 \
--max_steps 8000 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--gradient_accumulation_steps 16 \
--save_strategy "steps" \
--save_steps 0.1 \
--save_total_limit 2 \
--learning_rate 6.00e-5 \
--max_grad_norm 1.0 \
--weight_decay 0.0 \
--adam_beta1 0.9 \
--adam_beta2 0.95 \
--adam_epsilon 1e-8 \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--report_to "tensorboard" \
--model_max_length ${SEQ_LENGTH} \
--gradient_checkpointing True \
--deepspeed ${LOCAL_CODE_PATH}/scripts/deepspeed/ds_config_zero2.json \
--trust_remote_code False \
--ddp_timeout 7200 \
--ddp_backend ${DISTRIBUTED_BACKEND} \
--attn_implementation flash_attention_2 \
--seed 42 \
--data_seed 42 \
--reset_attention_mask \
--reset_position_ids \
--create_attention_mask false \
--create_attention_mask_2d false \
--dataloader_num_workers 8 \
--text-audio-interval-ratio 1 10 4 10 \
#--language-model-freeze \
#--dataset_joint false \
#--variable_length true \
#--tokenizer_name_or_path Qwen2Tokenizer \
#--bf16 True \
#--fp16 True \
#--tf32 True \
set +x
#!/bin/bash
set -e
set -x
SEQ_LENGTH="$1"
if [ -z "$SEQ_LENGTH" ]
then
SEQ_LENGTH=32768
fi
timestamp="$2"
if [ -z "$timestamp" ]
then
timestamp=`date +'%Y%m%d_%H'`0000
fi
######################################################################
export ROOT_PATH=/data/
export CODE_PATH=${ROOT_PATH}/VITA-Audio/
export LOCAL_ROOT_PATH=/data_local/
export LOCAL_CODE_PATH=${LOCAL_ROOT_PATH}/VITA-Audio/
mkdir -p ${LOCAL_ROOT_PATH}
mkdir -p ${LOCAL_CODE_PATH}
apt update
apt install -y rsync
rsync -a --exclude ".git" --exclude ".gitee" ${CODE_PATH}/ ${LOCAL_CODE_PATH}/
cd ${LOCAL_CODE_PATH}
rm -fr datasets
ln -s ${ROOT_PATH}/data datasets
######################################################################
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
source ${LOCAL_CODE_PATH}/scripts/set_env_ds_gpu.sh
pip3 install transformers==4.48.3
#pip3 install --no-index --find-links=/data/software/ transformers==4.48.3
######################################################################
OUTPUT_DIR=${ROOT_PATH}/output/LM/"$0"/${timestamp}/
mkdir -p ${OUTPUT_DIR}
rsync -avh $0 ${OUTPUT_DIR}
export HF_HOME="${ROOT_PATH}/data/HF_HOME_node${INDEX}/"
mkdir -p ${HF_HOME}
export TRITON_CACHE_DIR=${LOCAL_CODE_PATH}
export PYTHONPATH=$PYTHONPATH:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice/third_party/Matcha-TTS/
######################################################################
LOG=${OUTPUT_DIR}/log_node${INDEX}.txt
exec &> >(tee -a "$LOG")
echo Logging output to "$LOG"
echo ${@}
######################################################################
DATA_PATH=${LOCAL_CODE_PATH}/configs/sts_finetune_stage1.yaml
MODEL_NAME_OR_PATH=${ROOT_PATH}/output/LM/scripts/deepspeed/sts_qwen25/finetune_sensevoice_glm4voice_mtp1_stage1.sh/20250418_075843/
AUDIO_TOKENIZER_PATH=${ROOT_PATH}/models/THUDM/glm-4-voice-tokenizer
rsync -avh ${DATA_PATH} ${OUTPUT_DIR}
######################################################################
DISTRIBUTED_ARGS="
--nproc_per_node $NPROC_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
torchrun $DISTRIBUTED_ARGS tools/finetune_sts_v4_48_3.py \
--log_level "info" \
--do_train \
--overwrite_output_dir \
--config_name ${LOCAL_CODE_PATH}/VITA-Audio/models/qwen2_mtp_sensevoice_v4_48_3/config_7B_mtp10.json \
--tokenizer_name $MODEL_NAME_OR_PATH \
--model_name_or_path $MODEL_NAME_OR_PATH \
--audio_tokenizer_path $AUDIO_TOKENIZER_PATH \
--audio_tokenizer_type "sensevoice_glm4voice" \
--dataset_name $DATA_PATH \
--bf16 True \
--tf32 True \
--torch_dtype bfloat16 \
--output_dir $OUTPUT_DIR \
--num_train_epochs 1 \
--max_steps 8000 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--gradient_accumulation_steps 16 \
--save_strategy "steps" \
--save_steps 0.1 \
--save_total_limit 2 \
--learning_rate 1.00e-3 \
--max_grad_norm 1.0 \
--weight_decay 0.0 \
--adam_beta1 0.9 \
--adam_beta2 0.95 \
--adam_epsilon 1e-8 \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--report_to "tensorboard" \
--model_max_length ${SEQ_LENGTH} \
--gradient_checkpointing True \
--deepspeed ${LOCAL_CODE_PATH}/scripts/deepspeed/ds_config_zero2.json \
--trust_remote_code False \
--ddp_timeout 7200 \
--ddp_backend ${DISTRIBUTED_BACKEND} \
--attn_implementation flash_attention_2 \
--seed 42 \
--data_seed 42 \
--reset_attention_mask \
--reset_position_ids \
--create_attention_mask false \
--create_attention_mask_2d false \
--dataloader_num_workers 8 \
--audio-model-freeze \
--language-model-freeze \
--text-audio-interval-ratio 1 10 4 10 \
#--language-model-freeze \
#--dataset_joint false \
#--variable_length true \
#--tokenizer_name_or_path Qwen2Tokenizer \
#--bf16 True \
#--fp16 True \
#--tf32 True \
set +x
#!/bin/bash
set -e
set -x
SEQ_LENGTH="$1"
if [ -z "$SEQ_LENGTH" ]
then
SEQ_LENGTH=32768
fi
timestamp="$2"
if [ -z "$timestamp" ]
then
timestamp=`date +'%Y%m%d_%H'`0000
fi
######################################################################
export ROOT_PATH=/data/
export CODE_PATH=${ROOT_PATH}/VITA-Audio/
export LOCAL_ROOT_PATH=/data_local/
export LOCAL_CODE_PATH=${LOCAL_ROOT_PATH}/VITA-Audio/
mkdir -p ${LOCAL_ROOT_PATH}
mkdir -p ${LOCAL_CODE_PATH}
apt update
apt install -y rsync
rsync -a --exclude ".git" --exclude ".gitee" ${CODE_PATH}/ ${LOCAL_CODE_PATH}/
cd ${LOCAL_CODE_PATH}
rm -fr datasets
ln -s ${ROOT_PATH}/data datasets
######################################################################
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
source ${LOCAL_CODE_PATH}/scripts/set_env_ds_gpu.sh
pip3 install transformers==4.48.3
#pip3 install --no-index --find-links=/data/software/ transformers==4.48.3
######################################################################
OUTPUT_DIR=${ROOT_PATH}/output/LM/"$0"/${timestamp}/
mkdir -p ${OUTPUT_DIR}
rsync -avh $0 ${OUTPUT_DIR}
export HF_HOME="${ROOT_PATH}/data/HF_HOME_node${INDEX}/"
mkdir -p ${HF_HOME}
export TRITON_CACHE_DIR=${LOCAL_CODE_PATH}
export PYTHONPATH=$PYTHONPATH:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice:${LOCAL_CODE_PATH}/third_party/GLM-4-Voice/third_party/Matcha-TTS/
######################################################################
LOG=${OUTPUT_DIR}/log_node${INDEX}.txt
exec &> >(tee -a "$LOG")
echo Logging output to "$LOG"
echo ${@}
######################################################################
DATA_PATH=${LOCAL_CODE_PATH}/configs/sts_finetune_stage2.yaml
MODEL_NAME_OR_PATH=${ROOT_PATH}/output/LM/scripts/deepspeed/sts_qwen25/finetune_sensevoice_glm4voice_mtp10_stage1.sh/20250421_180624/
AUDIO_TOKENIZER_PATH=${ROOT_PATH}/models/THUDM/glm-4-voice-tokenizer
rsync -avh ${DATA_PATH} ${OUTPUT_DIR}
######################################################################
DISTRIBUTED_ARGS="
--nproc_per_node $NPROC_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
torchrun $DISTRIBUTED_ARGS tools/finetune_sts_v4_48_3.py \
--log_level "info" \
--do_train \
--overwrite_output_dir \
--config_name ${LOCAL_CODE_PATH}/VITA-Audio/models/qwen2_mtp_sensevoice_v4_48_3/config_7B_mtp10.json \
--tokenizer_name $MODEL_NAME_OR_PATH \
--model_name_or_path $MODEL_NAME_OR_PATH \
--audio_tokenizer_path $AUDIO_TOKENIZER_PATH \
--audio_tokenizer_type "sensevoice_glm4voice" \
--dataset_name $DATA_PATH \
--bf16 True \
--tf32 True \
--torch_dtype bfloat16 \
--output_dir $OUTPUT_DIR \
--num_train_epochs 1 \
--max_steps 4000 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--gradient_accumulation_steps 16 \
--save_strategy "steps" \
--save_steps 0.1 \
--save_total_limit 2 \
--learning_rate 5.00e-5 \
--max_grad_norm 1.0 \
--weight_decay 0.1 \
--adam_beta1 0.9 \
--adam_beta2 0.95 \
--adam_epsilon 1e-8 \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--report_to "tensorboard" \
--model_max_length ${SEQ_LENGTH} \
--gradient_checkpointing True \
--deepspeed ${LOCAL_CODE_PATH}/scripts/deepspeed/ds_config_zero2_no_optimizer.json \
--trust_remote_code False \
--ddp_timeout 7200 \
--ddp_backend ${DISTRIBUTED_BACKEND} \
--attn_implementation flash_attention_2 \
--seed 42 \
--data_seed 42 \
--reset_attention_mask \
--reset_position_ids \
--create_attention_mask false \
--create_attention_mask_2d false \
--dataloader_num_workers 2 \
--mtp_model_lr_mult 1.00e1 \
--audio-model-freeze \
--text-audio-interval-ratio 1 10 4 10 \
#--language-model-freeze \
#--dataset_joint false \
#--variable_length true \
#--tokenizer_name_or_path Qwen2Tokenizer \
#--bf16 True \
#--fp16 True \
#--tf32 True \
set +x