#!/bin/bash
# Copyright 2024 Alibaba Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script just shows an example of how to build your own music generation model.
# You may need to prepare your own dataset to fine-tune or to train from scratch.
# Here we take the MusicCaps [1] dataset as an example.
# Download MusicCaps from: https://huggingface.co/datasets/google/MusicCaps

# Reference:
# 1. Agostinelli, A., Denk, T. I., Borsos, Z., Engel, J., Verzetti, M., Caillon, A., Huang, Q., Jansen, A., Roberts, A., Tagliasacchi, M., Sharifi, M., Zeghidour, N., & Frank, C. (2023). MusicLM: Generating music from text. Google Research. https://doi.org/10.48550/arXiv.2301.11325

. ./path.sh || exit 1;

stage=1
stop_stage=5
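# Each stage N below runs iff stage <= N <= stop_stage; with the defaults
# above, stage 0 (data preparation) is skipped. Edit stage/stop_stage to
# select stages, e.g. stage=4 and stop_stage=4 runs inference only.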

model_name=InspireMusic-Base
pretrained_model_dir=../../pretrained_models/${model_name}
dataset_name=musiccaps

# data preparation
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
  echo "Download dataset and prepare wav.scp/text."
  # Here you may need to download MusicCaps dataset
  for x in ${dataset_name}_train ${dataset_name}_dev; do
    [ -d data/$x/ ] || mkdir -p data/$x/
  done
fi
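# A minimal sketch of the expected file layout (an assumption based on the
# Kaldi-style convention; adapt to however you store MusicCaps locally).
# Each split needs two files:
#   data/<split>/wav.scp : one "utt_id /abs/path/to/audio.wav" per line
#   data/<split>/text    : one "utt_id caption text" per line
# For example, given wavs under a hypothetical downloads/musiccaps/ directory:
#   for wav in downloads/musiccaps/*.wav; do
#     utt=$(basename "$wav" .wav)
#     echo "$utt $(readlink -f "$wav")"
#   done | sort > data/musiccaps_train/wav.scp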

export CUDA_VISIBLE_DEVICES="0"
# extract acoustic tokens
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
  echo "Extract acoustic token, you should have prepared acoustic tokenizer model, wav.scp/text"
  for x in ${dataset_name}_dev ${dataset_name}_train; do
    echo "$x"
    tools/extract_acoustic_token.py --dir data/$x \
      --ckpt_path ${pretrained_model_dir}/music_tokenizer/model.pt \
      --config_path ${pretrained_model_dir}/music_tokenizer/config.json
  done
fi
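# Stage 1 is expected to write data/$x/utt2acoustic_token.pt for each split;
# the parquet-packing stage below requires it.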

# extract semantic tokens
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
  echo "Extract semantic token, you should have prepared semantic tokenizer model, wav.scp/text"
  for x in ${dataset_name}_dev ${dataset_name}_train; do
    echo "$x"
    tools/extract_semantic_token.py --dir data/$x \
      --ckpt_path ${pretrained_model_dir}/wavtokenizer/model.pt \
      --config_path ${pretrained_model_dir}/wavtokenizer/config.yaml
  done
fi
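# Likewise, stage 2 should leave data/$x/utt2semantic_token.pt in place for
# stage 3 to consume.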

# data packing
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
  echo "Prepare required parquet format data, you should have prepared wav.scp/text/utt2acoustic_token.pt/utt2semantic_token.pt"
  for x in ${dataset_name}_train ${dataset_name}_dev; do
    echo "$x"
    [ -d data/$x/parquet ] || mkdir -p data/$x/parquet
    tools/make_parquet_list.py --num_utts_per_parquet 10000 \
      --num_processes 10 \
      --semantic_token_dir `pwd`/data/$x/ \
      --acoustic_token_dir `pwd`/data/$x/ \
      --des_dir `pwd`/data/$x/parquet \
      --src_dir `pwd`/data/$x/
  done
fi
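# make_parquet_list.py writes the parquet shards plus a data.list index under
# data/$x/parquet/; the inference and training stages below read that list.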

# inference
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
  test_set=${dataset_name}_dev
  echo "Run inference."
  expr_name="${test_set}"
  for task in 'text-to-music' 'continuation'; do
    [ -d `pwd`/exp/${model_name}/${task}_${expr_name} ] && rm -rf `pwd`/exp/${model_name}/${task}_${expr_name}
    echo `pwd`/exp/${model_name}/${task}_${expr_name}
    python inspiremusic/bin/inference.py --task $task \
        --gpu 0 \
        --config conf/inspiremusic.yaml \
        --prompt_data data/${test_set}/parquet/data.list \
        --flow_model $pretrained_model_dir/flow.pt \
        --llm_model $pretrained_model_dir/llm.pt \
        --music_tokenizer $pretrained_model_dir/music_tokenizer \
        --wavtokenizer $pretrained_model_dir/wavtokenizer \
        --result_dir `pwd`/exp/${model_name}/${task}_${expr_name}
  done
fi


# train llm and flow models fp16
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
  export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
  num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
  job_id=1024
  dist_backend="nccl"
  num_workers=8
  prefetch=100
  train_engine=torch_ddp
  expr_name="InspireMusic-Base-musiccaps-ft"

  echo "Run model training. We support llm and flow traning."

  if [ $train_engine == 'deepspeed' ]; then
    echo "Notice deepspeed has its own optimizer config. Modify conf/ds_stage2.json if necessary"
  fi
  cat data/${dataset_name}_train/parquet/data.list > data/${dataset_name}_train.data.list
  cat data/${dataset_name}_dev/parquet/data.list > data/${dataset_name}_dev.data.list
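  # Single-node DDP via torchrun across all GPUs in CUDA_VISIBLE_DEVICES; with
  # the c10d rendezvous backend, endpoint "localhost:0" lets torchrun pick a
  # free port. Set train_engine=deepspeed above to train with the optimizer
  # settings in conf/ds_stage2.json instead of torch DDP.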

  # train the llm; fp16 training is supported
  model="llm"
  torchrun --nnodes=1 --nproc_per_node=$num_gpus \
        --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" \
      inspiremusic/bin/train.py \
      --train_engine $train_engine \
      --config conf/inspiremusic.yaml \
      --train_data data/${dataset_name}_train.data.list \
      --cv_data data/${dataset_name}_dev.data.list \
      --model $model \
      --model_dir `pwd`/exp/${expr_name}/$model/$train_engine \
      --tensorboard_dir `pwd`/tensorboard/${expr_name}/$model/$train_engine \
      --ddp.dist_backend $dist_backend \
      --num_workers ${num_workers} \
      --prefetch ${prefetch} \
      --pin_memory \
      --deepspeed_config ./conf/ds_stage2.json \
      --deepspeed.save_states model+optimizer \
      --fp16 \
      --checkpoint ../../pretrained_models/InspireMusic-Base/llm.pt
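
  # Note: --checkpoint warm-starts the llm from the pretrained llm.pt for
  # fine-tuning; the flow run below omits it, so the flow model presumably
  # trains from the initialization defined in conf/inspiremusic.yaml.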

  # train the flow matching model; only fp32 training is supported
  model="flow"
  torchrun --nnodes=1 --nproc_per_node=$num_gpus \
        --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" \
      inspiremusic/bin/train.py \
      --train_engine $train_engine \
      --config conf/inspiremusic.yaml \
      --train_data data/${dataset_name}_train.data.list \
      --cv_data data/${dataset_name}_dev.data.list \
      --model $model \
      --model_dir `pwd`/exp/${expr_name}/$model/$train_engine \
      --tensorboard_dir `pwd`/tensorboard/${expr_name}/$model/$train_engine \
      --ddp.dist_backend $dist_backend \
      --num_workers ${num_workers} \
      --prefetch ${prefetch} \
      --pin_memory \
      --deepspeed_config ./conf/ds_stage2.json \
      --deepspeed.save_states model+optimizer
fi