v1.0

0112b0f0 · chenzk · 0112b0f0 · 0112b0f0 · 0112b0f0 · 0112b0f0
Commit 0112b0f0 authored Feb 14, 2025 by chenzk
20 changed files
--- a/.gitmodules
+++ b/.gitmodules
+[submodule "third_party/Matcha-TTS"]
+    path = third_party/Matcha-TTS
+    url = https://github.com/shivammehta25/Matcha-TTS.git
\ No newline at end of file
--- a/LICENSE.txt
+++ b/LICENSE.txt
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/README.md
+++ b/README.md
+# InspireMusic
+支持音乐、歌曲及音频的生成，为用户提供多样化选择。
+## 论文
+`无`
+## 模型结构
+InspireMusic基于Qwen模型初始化的自回归Transformer模型预测音频token。
+<div align=center>
+    <img src="./doc/structure.png"/>
+</div>
+## 算法原理
+通过具有高压缩比的WavTokenizer将输入的连续音频特征转换成离散音频token，然后利用基于Qwen模型初始化的自回归Transformer模型预测音频token，再由CFM扩散模型重建音频的潜层特征，最终通过Vocoder输出高质量的音频波形。
+<div align=center>
+    <img src="./doc/algorithm.png"/>
+</div>
+## 环境配置
+```
+mv InspireMusic_pytorch InspireMusic # 去框架名后缀
+```
+### Docker（方法一）
+```
+docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.3.0-py3.10-dtk24.04.3-ubuntu20.04
+# 请将<your IMAGE ID>替换为以上拉取的docker镜像ID，本镜像为：b272aae8ec72
+docker run -it --shm-size=64G -v $PWD/InspireMusic:/home/InspireMusic -v /opt/hyhal:/opt/hyhal:ro --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name music <your IMAGE ID> bash
+cd /home/InspireMusic
+pip install -r requirements.txt
+```
+### Dockerfile（方法二）
+```
+cd /home/InspireMusic/docker
+docker build --no-cache -t inspiremusic:latest .
+docker run --shm-size=64G --name music -v /opt/hyhal:/opt/hyhal:ro --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video -v $PWD/../../InspireMusic:/home/InspireMusic -it inspiremusic:latest bash
+# 若遇到Dockerfile启动的方式安装环境需要长时间等待，可注释掉里面的pip安装，启动容器后再安装python库：pip install -r requirements.txt。
+```
+### Anaconda（方法三）
+1、关于本项目DCU显卡所需的特殊深度学习库可从光合开发者社区下载安装：
+- https://developer.hpccube.com/tool/
+```
+DTK驱动:dtk24.04.3
+python:python3.10
+torch:2.3.0
+torchvision:0.18.1
+torchaudio:2.1.2
+triton:2.1.0
+vllm:0.6.2
+flash-attn:2.6.1
+deepspeed:0.14.2
+apex:1.3.0
+xformers:0.0.25
+transformers:4.48.0
+```
+`Tips：以上dtk驱动、python、torch等DCU相关工具版本需要严格一一对应。`
+2、其它非特殊库参照requirements.txt安装
+```
+cd /home/InspireMusic
+pip install -r requirements.txt
+```
+## 数据集
+`无`
+## 训练
+`无`
+本项目的训练需一定的乐理基础，一般人难以训练出较好的效果，感兴趣的用户请参考源项目的[`README_origin`](./README_origin.md)训练。
+## 推理
+### 单机单卡
+```
+# 预训练权重放入：/home/InspireMusic/pretrained_models/
+cd /home/InspireMusic/examples/music_generation
+python -m inspiremusic.cli.inference # 或 sh test.sh
+```
+项目当前处在初期研发时期，源项目仍存在一些bug和效果问题，逐渐完善中。
+更多资料可参考源项目的[`README_origin`](./README_origin.md)
+## result
+`输入: `
+```
+prompt(默认): "Experience soothing and sensual instrumental jazz with a touch of Bossa Nova, perfect for a relaxing restaurant or spa ambiance."
+```
+`输出:`
+```
+/home/InspireMusic/examples/music_generation/exp/inspiremusic/output_audio.wav
+```
+### 精度
+DCU与GPU精度一致，推理框架：pytorch。
+## 应用场景
+### 算法类别
+`音乐生成`
+### 热点应用行业
+`广媒,影视,动漫,医疗,家居,教育`
+## 预训练权重
+预训练权重快速下载中心：[SCNet AIModels](http://113.200.138.88:18080/aimodels) ，项目中的预训练权重可从快速下载通道下载：[InspireMusic-1.5B-Long](http://113.200.138.88:18080/aimodels/funaudiollm/InspireMusic-1.5B-Long.git)
+Hugging Face下载地址为：[InspireMusic-1.5B-Long](https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-Long)
+## 源码仓库及问题反馈
+- http://developer.sourcefind.cn/codes/modelzoo/InspireMusic_pytorch.git
+## 参考资料
+- https://github.com/FunAudioLLM/InspireMusic.git
--- a/README_origin.md
+++ b/README_origin.md
--- a/app.py
+++ b/app.py
+# Copyright (c) 2024 Alibaba Inc (authors: Chong Zhang)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+# Print the GPU status to the startup log.
+os.system('nvidia-smi')
+# Refresh the apt package index, then install apt-utils and unzip.
+os.system('apt update -y && apt-get install -y apt-utils && apt install -y unzip')
+# Point PYTHONPATH at the Matcha-TTS submodule path for this process's environment.
+os.environ['PYTHONPATH'] = 'third_party/Matcha-TTS'
+# Clone the five model repositories into pretrained_models/ and strip every
+# "../../" from each model's inspiremusic.yaml. All steps are chained with
+# `&&`, so if `mkdir` fails (for example because the directory already
+# exists) none of the following steps run.
+os.system('mkdir pretrained_models && cd pretrained_models && git clone https://huggingface.co/FunAudioLLM/InspireMusic-Base.git &&git clone https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-Long.git &&git clone https://huggingface.co/FunAudioLLM/InspireMusic-1.5B.git &&git clone https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-24kHz.git &&git clone https://huggingface.co/FunAudioLLM/InspireMusic-Base-24kHz.git && for i in InspireMusic-Base InspireMusic-Base-24kHz InspireMusic-1.5B InspireMusic-1.5B-24kHz InspireMusic-1.5B-Long; do sed -i -e "s/\.\.\/\.\.\///g" ${i}/inspiremusic.yaml; done && cd ..')
+import sys
+import torch
+print(torch.backends.cudnn.version())
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR))
+import spaces
+import gradio as gr
+from inspiremusic.cli.inference import InspireMusicUnified, set_env_variables
+import torchaudio
+import datetime
+import hashlib
+import importlib
+# Model names offered in the UI dropdown; each one is also used as the
+# sub-directory name under pretrained_models/ when building model_dir.
+MODELS = ["InspireMusic-1.5B-Long", "InspireMusic-1.5B", "InspireMusic-Base", "InspireMusic-1.5B-24kHz", "InspireMusic-Base-24kHz"]
+# Directory where trimmed audio prompts are written.
+AUDIO_PROMPT_DIR = "demo/audio_prompts"
+# Directory passed to the model as result_dir.
+OUTPUT_AUDIO_DIR = "demo/outputs"
+# Example text prompts shown below the input widgets.
+DEMO_TEXT_PROMPTS = ["Jazz music with drum beats.",
+					 "A captivating classical piano performance, this piece exudes a dynamic and intense atmosphere, showcasing intricate and expressive instrumental artistry.",
+					 "A soothing instrumental piece blending elements of light music and pop, featuring a gentle guitar rendition. The overall feel is serene and reflective, likely instrumental with no vocals.",
+					 "The instrumental rock piece features dynamic oscillations and wave-like progressions, creating an immersive and energetic atmosphere. The music is purely instrumental, with no vocals, and it blends elements of rock and post-rock for a powerful and evocative experience.",
+					 "The classical instrumental piece exudes a haunting and evocative atmosphere, characterized by its intricate guitar work and profound emotional depth.",
+					 "Experience a dynamic blend of instrumental electronic music with futuristic house vibes, featuring energetic beats and a captivating rhythm. The tracks are likely instrumental, focusing on the immersive soundscapes rather than vocal performances."]
+def generate_filename():
+	hash_object = hashlib.sha256(str(int(datetime.datetime.now().timestamp())).encode())
+	hash_string = hash_object.hexdigest()
+	return hash_string
+def get_args(
+		task, text="", audio=None, model_name="InspireMusic-Base",
+		chorus="intro",
+		output_sample_rate=48000, max_generate_audio_seconds=30.0, time_start = 0.0, time_end=30.0, trim=False):
+	if "24kHz" in model_name:
+		output_sample_rate = 24000
+	if output_sample_rate == 24000:
+		fast = True
+	else:
+		fast = False
+	# This function constructs the arguments required for InspireMusic
+	args = {
+		"task"                      : task,
+		"text"                      : text,
+		"audio_prompt"              : audio,
+		"model_name"                : model_name,
+		"chorus"                    : chorus,
+		"fast"                      : fast,
+		"fade_out"                  : True,
+		"trim"                      : trim,
+		"output_sample_rate"        : output_sample_rate,
+		"min_generate_audio_seconds": 10.0,
+		"max_generate_audio_seconds": max_generate_audio_seconds,
+		"max_audio_prompt_length": 5.0,
+		"model_dir"                 : os.path.join("pretrained_models",
+												   model_name),
+		"result_dir"                : OUTPUT_AUDIO_DIR,
+		"output_fn"                 : generate_filename(),
+		"format"                    : "wav",
+		"time_start" : time_start,
+		"time_end": time_end,
+		"fade_out_duration": 1.0,
+	}
+	if args["time_start"] is None:
+		args["time_start"] = 0.0
+	args["time_end"] = args["time_start"] + args["max_generate_audio_seconds"]
+	print(args)
+	return args
+def trim_audio(audio_file, cut_seconds=5):
+	audio, sr = torchaudio.load(audio_file)
+	num_samples = cut_seconds * sr
+	cutted_audio = audio[:, :num_samples]
+	output_path = os.path.join(AUDIO_PROMPT_DIR, "audio_prompt_" + generate_filename() + ".wav")
+	torchaudio.save(output_path, cutted_audio, sr)
+	return output_path
+@spaces.GPU()
+def music_generation(args):
+	set_env_variables()
+	model = InspireMusicUnified(
+			model_name=args["model_name"],
+			model_dir=args["model_dir"],
+			min_generate_audio_seconds=args["min_generate_audio_seconds"],
+			max_generate_audio_seconds=args["max_generate_audio_seconds"],
+			sample_rate=24000,
+			output_sample_rate=args["output_sample_rate"],
+			load_jit=True,
+			load_onnx=False,
+			fast=args["fast"],
+			result_dir=args["result_dir"])
+	output_path = model.inference(
+			task=args["task"],
+			text=args["text"],
+			audio_prompt=args["audio_prompt"],
+			chorus=args["chorus"],
+			time_start=args["time_start"],
+			time_end=args["time_end"],
+			output_fn=args["output_fn"],
+			max_audio_prompt_length=args["max_audio_prompt_length"],
+			fade_out_duration=args["fade_out_duration"],
+			output_format=args["format"],
+			fade_out_mode=args["fade_out"],
+			trim=args["trim"])
+	return output_path
+def demo_inspiremusic_t2m(text, model_name, chorus,
+					 output_sample_rate, max_generate_audio_seconds):
+	args = get_args(
+			task='text-to-music', text=text, audio=None,
+			model_name=model_name, chorus=chorus,
+			output_sample_rate=output_sample_rate,
+			max_generate_audio_seconds=max_generate_audio_seconds)
+	return music_generation(args)
+def demo_inspiremusic_con(text, audio, model_name, chorus,
+					 output_sample_rate, max_generate_audio_seconds):
+	args = get_args(
+			task='continuation', text=text, audio=trim_audio(audio, cut_seconds=5),
+			model_name=model_name, chorus=chorus,
+			output_sample_rate=output_sample_rate,
+			max_generate_audio_seconds=max_generate_audio_seconds)
+	return music_generation(args)
+def main():
+	"""Build the Gradio demo page and launch it.
+
+	The page has a row of generation settings, a row with the text and audio
+	prompts, an output player, and one button per task.
+	"""
+	with gr.Blocks(theme=gr.themes.Soft()) as demo:
+		# Introductory text rendered at the top of the page.
+		gr.Markdown("""
+		# InspireMusic
+		- Support music generation tasks with long-form and high audio quality, sampling rates up to 48kHz. 
+		- Github: https://github.com/FunAudioLLM/InspireMusic/
+		- Available music generation models: [InspireMusic-1.5B-Long](https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-Long), [InspireMusic-1.5B](https://huggingface.co/FunAudioLLM/InspireMusic-1.5B), [InspireMusic-Base](https://huggingface.co/FunAudioLLM/InspireMusic-Base), [InspireMusic-1.5B-24kHz](https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-24kHz), [InspireMusic-Base-24kHz](https://huggingface.co/FunAudioLLM/InspireMusic-Base-24kHz). Both on Huggingface and ModelScope.
+		- Currently only support English text prompts.
+		- This page is for demo purpose, if you want to generate long-form audio, e.g., 5mins, please try to deploy locally. Thank you for your support.
+		""")
+		# Generation settings: model, chorus mode, output sample rate, length.
+		with gr.Row(equal_height=True):
+			model_name = gr.Dropdown(
+					MODELS, label="Select Model Name",
+					value="InspireMusic-1.5B-Long")
+			chorus = gr.Dropdown(["intro", "verse", "chorus", "outro"],
+								 label="Chorus Mode", value="intro")
+			output_sample_rate = gr.Dropdown([48000, 24000],
+											 label="Output Audio Sample Rate (Hz)",
+											 value=48000)
+			max_generate_audio_seconds = gr.Slider(10, 300,
+												   label="Generate Audio Length (s)",
+												   value=30)
+		# Prompts: text for text-to-music, audio file for continuation.
+		with gr.Row(equal_height=True):
+			text_input = gr.Textbox(label="Input Text (For Text-to-Music Task)",
+									value="Experience soothing and sensual instrumental jazz with a touch of Bossa Nova, perfect for a relaxing restaurant or spa ambiance.")
+			audio_input = gr.Audio(
+				label="Input Audio Prompt (For Music Continuation Task)",
+				type="filepath")
+		# Player for the generated file; both tasks write their result here.
+		music_output = gr.Audio(label="Generated Music", type="filepath", autoplay=True, show_download_button = True)
+		with gr.Row():
+			# Text-to-music: does not take the audio prompt as an input.
+			button = gr.Button("Start Text-to-Music Task")
+			button.click(demo_inspiremusic_t2m,
+						 inputs=[text_input, model_name,
+								 chorus,
+								 output_sample_rate,
+								 max_generate_audio_seconds],
+						 outputs=music_output)
+			# Continuation: takes both the text and the audio prompt.
+			generate_button = gr.Button("Start Music Continuation Task")
+			generate_button.click(demo_inspiremusic_con,
+								  inputs=[text_input, audio_input, model_name,
+										  chorus,
+										  output_sample_rate,
+										  max_generate_audio_seconds],
+								  outputs=music_output)
+		# Example prompts that fill the text box; the returned object is not used afterwards.
+		t2m_examples = gr.Examples(examples=DEMO_TEXT_PROMPTS, inputs=[text_input])
+	demo.launch()
+if __name__ == '__main__':
+	os.makedirs(AUDIO_PROMPT_DIR, exist_ok=True)
+	os.makedirs(OUTPUT_AUDIO_DIR, exist_ok=True)
+	main()
--- a/asset/InspireMusic-24kHz.png
+++ b/asset/InspireMusic-24kHz.png
--- a/asset/InspireMusic.png
+++ b/asset/InspireMusic.png
--- a/asset/QR.jpg
+++ b/asset/QR.jpg
--- a/asset/dingding.png
+++ b/asset/dingding.png
--- a/asset/dingtalk.png
+++ b/asset/dingtalk.png
--- a/asset/logo.png
+++ b/asset/logo.png
--- a/doc/algorithm.png
+++ b/doc/algorithm.png
--- a/doc/structure.png
+++ b/doc/structure.png
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
+FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.3.0-py3.10-dtk24.04.3-ubuntu20.04
+ENV DEBIAN_FRONTEND=noninteractive
+# RUN yum update && yum install -y git cmake wget build-essential
+# RUN source /opt/dtk-24.04.3/env.sh
+# Install the pip dependencies. The mirror is reached over HTTPS so that
+# packages are fetched with certificate verification instead of over plain
+# HTTP with the host check disabled via --trusted-host.
+COPY requirements.txt requirements.txt
+RUN pip3 install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/
--- a/docker/requirements.txt
+++ b/docker/requirements.txt
+# --extra-index-url https://download.pytorch.org/whl/cu118
+conformer==0.3.2
+# deepspeed==0.14.2; sys_platform == 'linux'
+diffusers==0.27.2
+gdown==5.1.0
+gradio==4.32.2
+grpcio==1.57.0
+grpcio-tools==1.57.0
+hydra-core==1.3.2
+HyperPyYAML==1.2.2
+inflect==7.3.1
+librosa==0.10.2
+lightning==2.2.4
+matplotlib==3.7.5
+modelscope==1.15.0
+networkx==3.1
+omegaconf==2.3.0
+onnx==1.17.0
+# onnxruntime-gpu==1.16.0; sys_platform == 'linux'
+onnxruntime==1.16.0; sys_platform == 'darwin' or sys_platform == 'win32'
+openai-whisper==20231117
+protobuf==4.25
+pydantic==2.7.0
+rich==13.7.1
+soundfile==0.12.1
+tensorboard==2.14.0
+# torch==2.0.1
+# torchaudio==2.0.2
+uvicorn==0.30.0
+wget==3.2
+fastapi==0.111.0
+fastapi-cli==0.0.4
+WeTextProcessing==1.0.3
+transformers
+accelerate
+huggingface-hub==0.25.2
+julius
+# https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu118torch2.0cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
--- a/docker_start.sh
+++ b/docker_start.sh
+docker run -it --shm-size=64G -v $PWD/InspireMusic:/home/InspireMusic -v /public/DL_DATA/AI:/home/AI -v /opt/hyhal:/opt/hyhal:ro --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name music b272aae8ec72 bash
+# python -m torch.utils.collect_env
--- a/examples/music_generation/conf/ds_stage2.json
+++ b/examples/music_generation/conf/ds_stage2.json
+{
+  "train_micro_batch_size_per_gpu": 1,
+  "gradient_accumulation_steps": 1,
+  "steps_per_print": 100,
+  "gradient_clipping": 5,
+  "fp16": {
+    "enabled": false,
+    "auto_cast": false,
+    "loss_scale": 0,
+    "initial_scale_power": 16,
+    "loss_scale_window": 256,
+    "hysteresis": 2,
+    "consecutive_hysteresis": false,
+    "min_loss_scale": 1
+  },
+  "bf16": {
+    "enabled": false
+  },
+  "zero_force_ds_cpu_optimizer": false,
+  "zero_optimization": {
+    "stage": 2,
+    "offload_optimizer": {
+      "device": "none",
+      "pin_memory": true
+    },
+    "allgather_partitions": true,
+    "allgather_bucket_size": 5e8,
+    "overlap_comm": false,
+    "reduce_scatter": true,
+    "reduce_bucket_size": 5e8,
+    "contiguous_gradients" : true
+  },
+  "optimizer": {
+    "type": "AdamW",
+    "params": {
+        "lr": 0.001,
+        "weight_decay": 0.0001,
+        "torch_adam": true,
+        "adam_w_mode": true
+    }
+  }
+}
\ No newline at end of file
--- a/examples/music_generation/conf/inspiremusic.fromscratch.yaml
+++ b/examples/music_generation/conf/inspiremusic.fromscratch.yaml
+# set random seed, so that you may reproduce your result.
+__set_seed1: !apply:random.seed [1024]
+__set_seed2: !apply:numpy.random.seed [1024]
+__set_seed3: !apply:torch.manual_seed [1024]
+__set_seed4: !apply:torch.cuda.manual_seed_all [1024]
+# fixed params
+sample_rate: 24000
+text_encoder_input_size: 512
+llm_input_size: 896
+llm_output_size: 896
+basemodel_path: '../../pretrained_models/InspireMusic-Base/'
+generator_path: '../../pretrained_models/InspireMusic-Base/music_tokenizer'
+# model params
+# for all classes/functions included in this repo, we use !<name> or !<new> for initialization, so that users can find every corresponding class/function from this single yaml.
+# for system/third_party class/function, we do not require this.
+llm: !new:inspiremusic.llm.llm.LLM
+    text_encoder_input_size: !ref <text_encoder_input_size>
+    llm_input_size: !ref <llm_input_size>
+    llm_output_size: !ref <llm_output_size>
+    audio_token_size: 4096
+    length_normalized_loss: True
+    lsm_weight: 0
+    text_encoder_conf:
+        name: "none"
+    llm: !new:inspiremusic.transformer.qwen_encoder.QwenEmbeddingEncoder
+        input_size: !ref <text_encoder_input_size>
+        pretrain_path: !ref <basemodel_path>
+    sampling: !name:inspiremusic.utils.common.ras_sampling
+        top_p: 0.8
+        top_k: 50
+        win_size: 10
+        tau_r: 0.1
+    train_cfg_ratio: 0.2
+    infer_cfg_ratio: 7.0
+flow: !new:inspiremusic.flow.flow.MaskedDiff
+    input_size: 256
+    output_size: 80
+    output_type: 'mel'
+    vocab_size: 4096
+    input_frame_rate: 75
+    only_mask_loss: True
+    encoder: !new:inspiremusic.transformer.encoder.ConformerEncoder
+        output_size: 512
+        attention_heads: 4
+        linear_units: 1024
+        num_blocks: 3
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        attention_dropout_rate: 0.1
+        normalize_before: True
+        input_layer: 'linear'
+        pos_enc_layer_type: 'rel_pos_espnet'
+        selfattention_layer_type: 'rel_selfattn'
+        input_size: 256
+        use_cnn_module: False
+        macaron_style: False
+    length_regulator: !new:inspiremusic.flow.length_regulator.InterpolateRegulator
+        channels: 512
+        sampling_ratios: [1, 1, 1, 1]
+    decoder: !new:inspiremusic.flow.flow_matching.ConditionalCFM
+        in_channels: 240
+        cfm_params: !new:omegaconf.DictConfig
+            content:
+                sigma_min: 1e-06
+                solver: 'euler'
+                t_scheduler: 'cosine'
+                training_cfg_rate: 0.2
+                inference_cfg_rate: 0.7
+                reg_loss_type: 'l1'
+        estimator: !new:inspiremusic.flow.decoder.ConditionalDecoder
+            in_channels: 1024
+            out_channels: 512
+            channels: [256, 256]
+            dropout: 0.0
+            attention_head_dim: 64
+            n_blocks: 4
+            num_mid_blocks: 8
+            num_heads: 8
+            act_fn: 'gelu'
+    generator_model_dir: !ref <generator_path>
+hift: !new:inspiremusic.hifigan.generator.HiFTGenerator
+    in_channels: 80
+    base_channels: 512
+    nb_harmonics: 8
+    sampling_rate: !ref <sample_rate>
+    nsf_alpha: 0.1
+    nsf_sigma: 0.003
+    nsf_voiced_threshold: 10
+    upsample_rates: [8, 8]
+    upsample_kernel_sizes: [16, 16]
+    istft_params:
+        n_fft: 16
+        hop_len: 4
+    resblock_kernel_sizes: [3, 7, 11]
+    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+    source_resblock_kernel_sizes: [7, 11]
+    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
+    lrelu_slope: 0.1
+    audio_limit: 0.99
+    f0_predictor: !new:inspiremusic.hifigan.f0_predictor.ConvRNNF0Predictor
+        num_class: 1
+        in_channels: 80
+        cond_channels: 512
+# NOTE(review): this instantiates the same HiFTGenerator class as `hift` above, with no arguments — confirm that this is intended for the wavtokenizer entry.
+wavtokenizer: !new:inspiremusic.hifigan.generator.HiFTGenerator
+# processor functions
+# `!name:` entries produce callables with the listed keyword arguments bound;
+# they are not called when the file is loaded.
+parquet_opener: !name:inspiremusic.dataset.processor.parquet_opener
+# Tokenizer factory; `tokenizer_path` comes from the `basemodel_path` key.
+get_tokenizer: !name:inspiremusic.text.tokenizer.get_tokenizer
+    tokenizer_path: !ref <basemodel_path>
+    tokenizer_name: "qwen-2.0"
+allowed_special: 'all'
+# Receives the `get_tokenizer` callable and the `allowed_special` value above.
+tokenize: !name:inspiremusic.dataset.processor.tokenize
+    get_tokenizer: !ref <get_tokenizer>
+    allowed_special: !ref <allowed_special>
+# Length bounds handed to the filter processor (units are not stated here;
+# presumably frames for *_length and tokens for token_*_length — confirm).
+filter: !name:inspiremusic.dataset.processor.filter
+    max_length: 28000
+    min_length: 0
+    token_max_length: 200
+    token_min_length: 1
+# NOTE(review): `resample` is defined but not listed in `data_pipeline`.
+resample: !name:inspiremusic.dataset.processor.resample
+    resample_rate: !ref <sample_rate>
+# Mel-spectrogram settings bound onto matcha.utils.audio.mel_spectrogram.
+# NOTE(review): num_mels is 128 while `hift.in_channels` is 80 — confirm this
+# extractor is meant to feed that model.
+# NOTE(review): fmax should not exceed half of the sampling rate; check it
+# against the `sample_rate` value defined earlier in this file.
+feat_extractor: !name:matcha.utils.audio.mel_spectrogram
+    n_fft: 1024
+    num_mels: 128
+    sampling_rate: !ref <sample_rate>
+    hop_size: 256
+    win_size: 1024
+    fmin: 0
+    fmax: 24000
+    center: False
+# Wraps the extractor above. NOTE(review): neither `compute_fbank` nor
+# `parse_embedding` is listed in `data_pipeline`.
+compute_fbank: !name:inspiremusic.dataset.processor.compute_fbank
+    feat_extractor: !ref <feat_extractor>
+parse_embedding: !name:inspiremusic.dataset.processor.parse_embedding
+    normalize: True
+# Shuffle, sort, batch and padding processors, each bound with its arguments.
+shuffle: !name:inspiremusic.dataset.processor.shuffle
+    shuffle_size: 1000
+sort: !name:inspiremusic.dataset.processor.sort
+    sort_size: 500  # sort_size should be less than shuffle_size
+# 'dynamic' batching capped by max_frames_in_batch.
+batch: !name:inspiremusic.dataset.processor.batch
+    batch_type: 'dynamic'
+    max_frames_in_batch: 12000
+# Bound with no extra arguments.
+padding: !name:inspiremusic.dataset.processor.padding
+# dataset processor pipeline
+# Ordered list of the processor callables defined above, one per entry.
+data_pipeline:
+    - !ref <parquet_opener>
+    - !ref <tokenize>
+    - !ref <shuffle>
+    - !ref <sort>
+    - !ref <filter>
+    - !ref <batch>
+    - !ref <padding>
+# train conf
+# Plain nested mapping (no tags); presumably read by the training script.
+train_conf:
+    optim: adam
+    optim_conf:
+        lr: 0.001 # already set to 0.001, the value suggested for training flow from scratch
+    scheduler: warmuplr
+    scheduler_conf:
+        warmup_steps: 5000
+    max_epoch: 200
+    grad_clip: 5
+    accum_grad: 2
+    log_interval: 100
+    save_per_step: 1000
--- a/examples/music_generation/conf/inspiremusic.yaml
+++ b/examples/music_generation/conf/inspiremusic.yaml
+# set random seed, so that you may reproduce your result.
+# `!apply:` calls each function while the file is loaded, passing 1024 as the
+# single positional argument; presumably the `__set_seedN` keys exist only to
+# trigger these calls.
+__set_seed1: !apply:random.seed [1024]
+__set_seed2: !apply:numpy.random.seed [1024]
+__set_seed3: !apply:torch.manual_seed [1024]
+__set_seed4: !apply:torch.cuda.manual_seed_all [1024]
+# fixed params
+# Scalars below are referenced elsewhere in this file via `!ref <name>`.
+sample_rate: 24000
+# NOTE(review): not referenced by any other key in this config; presumably
+# read directly by code — confirm.
+target_sample_rate: 48000
+text_encoder_input_size: 512
+llm_input_size: 896
+llm_output_size: 896
+# Relative paths; used by `get_tokenizer`, `llm.llm` and `flow` below.
+basemodel_path: '../../pretrained_models/InspireMusic-Base/'
+generator_path: '../../pretrained_models/InspireMusic-Base/music_tokenizer'
+# model params
+# for every class/function included in this repo, we use `!name:` or `!new:` for initialization, so that users can find all corresponding classes/functions from this single yaml.
+# for system/third_party class/function, we do not require this.
+# Builds the `llm` object: `!new:` instantiates LLM with the keys below as
+# constructor keyword arguments; sizes come from the fixed params above.
+llm: !new:inspiremusic.llm.llm.LLM
+    text_encoder_input_size: !ref <text_encoder_input_size>
+    llm_input_size: !ref <llm_input_size>
+    llm_output_size: !ref <llm_output_size>
+    audio_token_size: 4096
+    length_normalized_loss: True
+    lsm_weight: 0
+    text_encoder_conf:
+        name: "none"
+    # Nested object: QwenEmbeddingEncoder loaded from `basemodel_path`.
+    llm: !new:inspiremusic.transformer.qwen_encoder.QwenEmbeddingEncoder
+        input_size: !ref <text_encoder_input_size>
+        pretrain_path: !ref <basemodel_path>
+    # Callable (not an instance): topk_sampling with top_k bound to 350.
+    sampling: !name:inspiremusic.utils.common.topk_sampling
+        top_k: 350
+    # Presumably classifier-free-guidance ratios for training/inference — confirm.
+    train_cfg_ratio: 0.2
+    infer_cfg_ratio: 3.0
+# Builds the `flow` object: a MaskedDiff composed of three nested objects
+# (encoder, length_regulator, decoder), each constructed via `!new:`.
+flow: !new:inspiremusic.flow.flow.MaskedDiff
+    input_size: 256
+    output_size: 80
+    output_type: 'mel'
+    # Same value as `llm.audio_token_size` (4096).
+    vocab_size: 4096
+    input_frame_rate: 75
+    only_mask_loss: True
+    # encoder.input_size (256) equals flow.input_size; encoder.output_size
+    # (512) equals length_regulator.channels.
+    encoder: !new:inspiremusic.transformer.encoder.ConformerEncoder
+        output_size: 512
+        attention_heads: 4
+        linear_units: 1024
+        num_blocks: 3
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        attention_dropout_rate: 0.1
+        normalize_before: True
+        input_layer: 'linear'
+        pos_enc_layer_type: 'rel_pos_espnet'
+        selfattention_layer_type: 'rel_selfattn'
+        input_size: 256
+        use_cnn_module: False
+        macaron_style: False
+    length_regulator: !new:inspiremusic.flow.length_regulator.InterpolateRegulator
+        channels: 512
+        sampling_ratios: [1, 1, 1, 1]
+    decoder: !new:inspiremusic.flow.flow_matching.ConditionalCFM
+        in_channels: 240
+        # cfm_params is an omegaconf.DictConfig built from the `content` mapping.
+        cfm_params: !new:omegaconf.DictConfig
+            content:
+                sigma_min: 1e-06
+                solver: 'euler'
+                t_scheduler: 'cosine'
+                training_cfg_rate: 0.2
+                inference_cfg_rate: 0.7
+                reg_loss_type: 'l1'
+        # NOTE(review): estimator channels (1024 in / 512 out) differ from
+        # decoder.in_channels (240) and flow.output_size (80) — confirm intended.
+        estimator: !new:inspiremusic.flow.decoder.ConditionalDecoder
+            in_channels: 1024
+            out_channels: 512
+            channels: [256, 256]
+            dropout: 0.0
+            attention_head_dim: 64
+            n_blocks: 4
+            num_mid_blocks: 8
+            num_heads: 8
+            act_fn: 'gelu'
+    # Resolves to the `generator_path` fixed param.
+    generator_model_dir: !ref <generator_path>
+# Builds the `hift` object: `!new:` instantiates HiFTGenerator with the keys
+# below passed as constructor keyword arguments.
+hift: !new:inspiremusic.hifigan.generator.HiFTGenerator
+    # Same value as `flow.output_size` (80).
+    in_channels: 80
+    base_channels: 512
+    nb_harmonics: 8
+    # Resolves to the `sample_rate` fixed param (24000).
+    sampling_rate: !ref <sample_rate>
+    nsf_alpha: 0.1
+    nsf_sigma: 0.003
+    nsf_voiced_threshold: 10
+    # Two entries each (presumably one kernel size per upsampling stage).
+    upsample_rates: [8, 8]
+    upsample_kernel_sizes: [16, 16]
+    istft_params:
+        n_fft: 16
+        hop_len: 4
+    # Three kernel sizes and three dilation lists (presumably paired one-to-one).
+    resblock_kernel_sizes: [3, 7, 11]
+    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+    # Two kernel sizes and two dilation lists (presumably paired one-to-one).
+    source_resblock_kernel_sizes: [7, 11]
+    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
+    lrelu_slope: 0.1
+    audio_limit: 0.99
+    # Nested object: a ConvRNNF0Predictor built from the three keys below.
+    f0_predictor: !new:inspiremusic.hifigan.f0_predictor.ConvRNNF0Predictor
+        num_class: 1
+        in_channels: 80
+        cond_channels: 512
+# NOTE(review): `wavtokenizer` instantiates the same HiFTGenerator class as
+# `hift`, but with no arguments, so only the constructor defaults apply.
+# The key name suggests a different class may have been intended — confirm.
+wavtokenizer: !new:inspiremusic.hifigan.generator.HiFTGenerator
+# processor functions
+# `!name:` entries produce callables with the listed keyword arguments bound;
+# they are not called when the file is loaded.
+parquet_opener: !name:inspiremusic.dataset.processor.parquet_opener
+# Tokenizer factory; `tokenizer_path` resolves to `basemodel_path`.
+get_tokenizer: !name:inspiremusic.text.tokenizer.get_tokenizer
+    tokenizer_path: !ref <basemodel_path>
+    tokenizer_name: "qwen-2.0"
+allowed_special: 'all'
+# Receives the `get_tokenizer` callable and the `allowed_special` value above.
+tokenize: !name:inspiremusic.dataset.processor.tokenize
+    get_tokenizer: !ref <get_tokenizer>
+    allowed_special: !ref <allowed_special>
+# Length bounds handed to the filter processor (units are not stated here).
+# NOTE(review): mode is 'train_flow' although this file also configures `llm`
+# — confirm this is the intended mode for the run.
+filter: !name:inspiremusic.dataset.processor.filter
+    max_length: 20000
+    min_length: 1
+    token_max_length: 200
+    token_min_length: 1
+    max_acoustic_length: 20000
+    min_acoustic_length: 1800
+    mode: 'train_flow'
+# NOTE(review): `resample` is defined but not listed in `data_pipeline`.
+resample: !name:inspiremusic.dataset.processor.resample
+    resample_rate: !ref <sample_rate>
+# Mel-spectrogram settings bound onto matcha.utils.audio.mel_spectrogram.
+# NOTE(review): fmax (24000) equals `sample_rate` (24000), i.e. twice
+# sample_rate / 2 = 12000 — confirm whether 12000 was intended.
+# NOTE(review): num_mels is 128 while `flow.output_size` and
+# `hift.in_channels` are 80 — confirm this extractor is meant to feed them.
+feat_extractor: !name:matcha.utils.audio.mel_spectrogram
+    n_fft: 1024
+    num_mels: 128
+    sampling_rate: !ref <sample_rate>
+    hop_size: 256
+    win_size: 1024
+    fmin: 0
+    fmax: 24000
+    center: False
+# Wraps the extractor above. NOTE(review): neither `compute_fbank` nor
+# `parse_embedding` is listed in `data_pipeline`.
+compute_fbank: !name:inspiremusic.dataset.processor.compute_fbank
+    feat_extractor: !ref <feat_extractor>
+parse_embedding: !name:inspiremusic.dataset.processor.parse_embedding
+    normalize: True
+# Shuffle, sort, batch and padding processors, each bound with its arguments.
+shuffle: !name:inspiremusic.dataset.processor.shuffle
+    shuffle_size: 1000
+sort: !name:inspiremusic.dataset.processor.sort
+    sort_size: 500  # sort_size should be less than shuffle_size
+# 'dynamic' batching capped by max_frames_in_batch; the commented-out lines
+# keep a 'static' alternative with a fixed batch_size.
+batch: !name:inspiremusic.dataset.processor.batch
+    batch_type: 'dynamic'
+    max_frames_in_batch: 15500 # llm 12000
+    # batch_type: 'static'
+    # batch_size: 2 # llm 12000
+padding: !name:inspiremusic.dataset.processor.padding
+    mode: 'train'
+# dataset processor pipeline
+# Ordered list of the processor callables defined above, one per entry.
+data_pipeline:
+    - !ref <parquet_opener>
+    - !ref <tokenize>
+    - !ref <shuffle>
+    - !ref <sort>
+    - !ref <filter>
+    - !ref <batch>
+    - !ref <padding>
+# train conf
+# Plain nested mapping (no tags); presumably read by the training script.
+train_conf:
+    optim: adam
+    optim_conf:
+        lr: 0.0001 # change to 0.001 if you want to train flow from scratch
+    scheduler: warmuplr
+    scheduler_conf:
+        warmup_steps: 500
+    max_epoch: 200
+    grad_clip: 5
+    accum_grad: 2
+    log_interval: 100
+    save_per_step: 500
--- a/examples/music_generation/conf/inspiremusic_1.5b.yaml
+++ b/examples/music_generation/conf/inspiremusic_1.5b.yaml
+# set random seed, so that you may reproduce your result.
+# `!apply:` calls each function while the file is loaded, passing 1024 as the
+# single positional argument; presumably the `__set_seedN` keys exist only to
+# trigger these calls.
+__set_seed1: !apply:random.seed [1024]
+__set_seed2: !apply:numpy.random.seed [1024]
+__set_seed3: !apply:torch.manual_seed [1024]
+__set_seed4: !apply:torch.cuda.manual_seed_all [1024]
+# fixed params
+# Scalars below are referenced elsewhere in this file via `!ref <name>`.
+sample_rate: 24000
+text_encoder_input_size: 512
+llm_input_size: 1536
+llm_output_size: 1536
+# Relative paths; used by `get_tokenizer`, `llm.llm` and `flow` below.
+basemodel_path: '../../pretrained_models/InspireMusic-1.5B/'
+generator_path: '../../pretrained_models/InspireMusic-1.5B/music_tokenizer'
+# model params
+# for every class/function included in this repo, we use `!name:` or `!new:` for initialization, so that users can find all corresponding classes/functions from this single yaml.
+# for system/third_party class/function, we do not require this.
+# Builds the `llm` object: `!new:` instantiates LLM with the keys below as
+# constructor keyword arguments; sizes come from the fixed params above.
+llm: !new:inspiremusic.llm.llm.LLM
+    text_encoder_input_size: !ref <text_encoder_input_size>
+    llm_input_size: !ref <llm_input_size>
+    llm_output_size: !ref <llm_output_size>
+    audio_token_size: 4096
+    length_normalized_loss: True
+    lsm_weight: 0
+    text_encoder_conf:
+        name: "none"
+    # Nested object: QwenEmbeddingEncoder loaded from `basemodel_path`.
+    llm: !new:inspiremusic.transformer.qwen_encoder.QwenEmbeddingEncoder
+        input_size: !ref <text_encoder_input_size>
+        pretrain_path: !ref <basemodel_path>
+    # Callable (not an instance): topk_sampling with top_k bound to 350.
+    sampling: !name:inspiremusic.utils.common.topk_sampling
+        top_k: 350
+    # Presumably classifier-free-guidance ratios for training/inference — confirm.
+    train_cfg_ratio: 0.2
+    infer_cfg_ratio: 3.0
+# Builds the `flow` object: a MaskedDiff composed of three nested objects
+# (encoder, length_regulator, decoder), each constructed via `!new:`.
+flow: !new:inspiremusic.flow.flow.MaskedDiff
+    input_size: 256
+    output_size: 80
+    output_type: 'mel'
+    # Same value as `llm.audio_token_size` (4096).
+    vocab_size: 4096
+    input_frame_rate: 75
+    only_mask_loss: True
+    # encoder.input_size (256) equals flow.input_size; encoder.output_size
+    # (512) equals length_regulator.channels.
+    encoder: !new:inspiremusic.transformer.encoder.ConformerEncoder
+        output_size: 512
+        attention_heads: 4
+        linear_units: 1024
+        num_blocks: 3
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        attention_dropout_rate: 0.1
+        normalize_before: True
+        input_layer: 'linear'
+        pos_enc_layer_type: 'rel_pos_espnet'
+        selfattention_layer_type: 'rel_selfattn'
+        input_size: 256
+        use_cnn_module: False
+        macaron_style: False
+    length_regulator: !new:inspiremusic.flow.length_regulator.InterpolateRegulator
+        channels: 512
+        sampling_ratios: [1, 1, 1, 1]
+    decoder: !new:inspiremusic.flow.flow_matching.ConditionalCFM
+        in_channels: 240
+        # cfm_params is an omegaconf.DictConfig built from the `content` mapping.
+        cfm_params: !new:omegaconf.DictConfig
+            content:
+                sigma_min: 1e-06
+                solver: 'euler'
+                t_scheduler: 'cosine'
+                training_cfg_rate: 0.2
+                inference_cfg_rate: 0.7
+                reg_loss_type: 'l1'
+        # NOTE(review): estimator channels (1024 in / 512 out) differ from
+        # decoder.in_channels (240) and flow.output_size (80) — confirm intended.
+        estimator: !new:inspiremusic.flow.decoder.ConditionalDecoder
+            in_channels: 1024
+            out_channels: 512
+            channels: [256, 256]
+            dropout: 0.0
+            attention_head_dim: 64
+            n_blocks: 4
+            num_mid_blocks: 8
+            num_heads: 8
+            act_fn: 'gelu'
+    # Resolves to the `generator_path` fixed param.
+    generator_model_dir: !ref <generator_path>
+# Builds the `hift` object: `!new:` instantiates HiFTGenerator with the keys
+# below passed as constructor keyword arguments.
+hift: !new:inspiremusic.hifigan.generator.HiFTGenerator
+    # Same value as `flow.output_size` (80).
+    in_channels: 80
+    base_channels: 512
+    nb_harmonics: 8
+    # Resolves to the `sample_rate` fixed param (24000).
+    sampling_rate: !ref <sample_rate>
+    nsf_alpha: 0.1
+    nsf_sigma: 0.003
+    nsf_voiced_threshold: 10
+    # Two entries each (presumably one kernel size per upsampling stage).
+    upsample_rates: [8, 8]
+    upsample_kernel_sizes: [16, 16]
+    istft_params:
+        n_fft: 16
+        hop_len: 4
+    # Three kernel sizes and three dilation lists (presumably paired one-to-one).
+    resblock_kernel_sizes: [3, 7, 11]
+    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+    # Two kernel sizes and two dilation lists (presumably paired one-to-one).
+    source_resblock_kernel_sizes: [7, 11]
+    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
+    lrelu_slope: 0.1
+    audio_limit: 0.99
+    # Nested object: a ConvRNNF0Predictor built from the three keys below.
+    f0_predictor: !new:inspiremusic.hifigan.f0_predictor.ConvRNNF0Predictor
+        num_class: 1
+        in_channels: 80
+        cond_channels: 512
+# NOTE(review): `wavtokenizer` instantiates the same HiFTGenerator class as
+# `hift`, but with no arguments, so only the constructor defaults apply.
+# The key name suggests a different class may have been intended — confirm.
+wavtokenizer: !new:inspiremusic.hifigan.generator.HiFTGenerator
+# processor functions
+# `!name:` entries produce callables with the listed keyword arguments bound;
+# they are not called when the file is loaded.
+parquet_opener: !name:inspiremusic.dataset.processor.parquet_opener
+# Tokenizer factory; `tokenizer_path` resolves to `basemodel_path`.
+get_tokenizer: !name:inspiremusic.text.tokenizer.get_tokenizer
+    tokenizer_path: !ref <basemodel_path>
+    tokenizer_name: "qwen-2.5"
+allowed_special: 'all'
+# Receives the `get_tokenizer` callable and the `allowed_special` value above.
+tokenize: !name:inspiremusic.dataset.processor.tokenize
+    get_tokenizer: !ref <get_tokenizer>
+    allowed_special: !ref <allowed_special>
+# Length bounds handed to the filter processor (units are not stated here;
+# presumably frames for *_length and tokens for token_*_length — confirm).
+filter: !name:inspiremusic.dataset.processor.filter
+    max_length: 28000
+    min_length: 0
+    token_max_length: 200
+    token_min_length: 1
+# NOTE(review): `resample` is defined but not listed in `data_pipeline`.
+resample: !name:inspiremusic.dataset.processor.resample
+    resample_rate: !ref <sample_rate>
+# Mel-spectrogram settings bound onto matcha.utils.audio.mel_spectrogram.
+# NOTE(review): fmax (24000) equals `sample_rate` (24000), i.e. twice
+# sample_rate / 2 = 12000 — confirm whether 12000 was intended.
+# NOTE(review): num_mels is 128 while `flow.output_size` and
+# `hift.in_channels` are 80 — confirm this extractor is meant to feed them.
+feat_extractor: !name:matcha.utils.audio.mel_spectrogram
+    n_fft: 1024
+    num_mels: 128
+    sampling_rate: !ref <sample_rate>
+    hop_size: 256
+    win_size: 1024
+    fmin: 0
+    fmax: 24000
+    center: False
+# Wraps the extractor above. NOTE(review): neither `compute_fbank` nor
+# `parse_embedding` is listed in `data_pipeline`.
+compute_fbank: !name:inspiremusic.dataset.processor.compute_fbank
+    feat_extractor: !ref <feat_extractor>
+parse_embedding: !name:inspiremusic.dataset.processor.parse_embedding
+    normalize: True
+# Shuffle, sort, batch and padding processors, each bound with its arguments.
+shuffle: !name:inspiremusic.dataset.processor.shuffle
+    shuffle_size: 1000
+sort: !name:inspiremusic.dataset.processor.sort
+    sort_size: 500  # sort_size should be less than shuffle_size
+# 'dynamic' batching capped by max_frames_in_batch.
+batch: !name:inspiremusic.dataset.processor.batch
+    batch_type: 'dynamic'
+    max_frames_in_batch: 10000 # llm 12000
+# Bound with no extra arguments.
+padding: !name:inspiremusic.dataset.processor.padding
+# dataset processor pipeline
+# Ordered list of the processor callables defined above, one per entry.
+data_pipeline:
+    - !ref <parquet_opener>
+    - !ref <tokenize>
+    - !ref <shuffle>
+    - !ref <sort>
+    - !ref <filter>
+    - !ref <batch>
+    - !ref <padding>
+# train conf
+# Plain nested mapping (no tags); presumably read by the training script.
+train_conf:
+    optim: adam
+    optim_conf:
+        lr: 0.0001 # change to 0.001 if you want to train flow from scratch
+    scheduler: warmuplr
+    scheduler_conf:
+        warmup_steps: 5000
+    max_epoch: 200
+    grad_clip: 5
+    accum_grad: 2
+    log_interval: 100
+    save_per_step: 500