Commit 8304167c authored by hungchiayu1's avatar hungchiayu1
Browse files

update

parent 282138af
...@@ -2,17 +2,45 @@ ...@@ -2,17 +2,45 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 9,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"data = [{\"captions\": \"Rhythmic wooden table tapping overlaid with steady water pouring sound\", \"location\": \"test.wav\", \"duration\": 10.0} for _ in range(10)]" "data = [{\"captions\": \"Rhythmic wooden table tapping overlaid with steady water pouring sound\", \"location\": \"data/test.wav\", \"duration\": 10.0} for _ in range(10)]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from src.utils import read_wav_file\n",
"\n",
"wav = read_wav_file('data/test.wav',30)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 4,
"metadata": {}, "metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([1, 1323000])\n"
]
}
],
"source": [
"print(wav.shape)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"import json\n", "import json\n",
...@@ -102,7 +130,7 @@ ...@@ -102,7 +130,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.11.9" "version": "3.11.10"
} }
}, },
"nbformat": 4, "nbformat": 4,
{
"compute_environment": "LOCAL_MACHINE",
"distributed_type": "MULTI_GPU",
"main_process_port": 29512,
"downcast_bf16": false,
"machine_rank": 0,
"gpu_ids": "0,1",
"main_training_function": "main",
"mixed_precision": "no",
"num_machines": 1,
"num_processes": 2,
"rdzv_backend": "static",
"same_network": true,
"tpu_use_cluster": false,
"tpu_use_sudo": false,
"use_cpu": false
}
\ No newline at end of file
...@@ -9,7 +9,7 @@ paths: ...@@ -9,7 +9,7 @@ paths:
# Training-related parameters # Training-related parameters
training: training:
per_device_batch_size: 16 per_device_batch_size: 4
learning_rate: 5e-4 learning_rate: 5e-4
gradient_accumulation_steps: 1 gradient_accumulation_steps: 1
num_train_epochs: 80 num_train_epochs: 80
......
[{"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}] [{"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic 
wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}]
\ No newline at end of file \ No newline at end of file
[{"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}] [{"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic 
wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}]
\ No newline at end of file \ No newline at end of file
torch==2.4.0 torch==2.4.0
torchaudio==2.4.0 torchaudio==2.4.0
torchlibrosa==0.1.0 torchlibrosa==0.1.0
torchvision==0.19.1 torchvision==0.19.0
transformers==4.44.0 transformers==4.44.0
diffusers==0.30.0 diffusers==0.30.0
accelerate==0.34.2 accelerate==0.34.2
......
...@@ -60,7 +60,7 @@ def parse_args(): ...@@ -60,7 +60,7 @@ def parse_args():
help="Config file defining the model size as well as other hyper parameter.", help="Config file defining the model size as well as other hyper parameter.",
) )
parser.add_argument( parser.add_argument(
"--prefix", type=str, default=None, "--prefix", type=str, default='',
help="Add prefix in text prompts.", help="Add prefix in text prompts.",
) )
...@@ -119,10 +119,7 @@ def parse_args(): ...@@ -119,10 +119,7 @@ def parse_args():
help="Whether to continue training from a model weight", help="Whether to continue training from a model weight",
) )
parser.add_argument(
"--stereo", action='store_true', default=False,
help="Whether data is in stereo format",
)
args = parser.parse_args() args = parser.parse_args()
...@@ -348,13 +345,14 @@ def main(): ...@@ -348,13 +345,14 @@ def main():
for audio_path in audios: for audio_path in audios:
wav = read_wav_file(audio_path,length,stereo=args.stereo) ## Only read the first 30 seconds of audio wav = read_wav_file(audio_path,length) ## Only read the first 30 seconds of audio
if wav.shape[0] == 1 : ## If this audio is mono, we repeat the channel so it becomes "fake stereo"
wav = wav.repeat(2,1)
audio_list.append(wav) audio_list.append(wav)
if not args.stereo:
audio_list = [wav.repeat(2,1) for wav in audio_list] ## Our vae expects stereo data, so we have to repeat the channel.
audio_input = torch.stack(audio_list,dim=0) audio_input = torch.stack(audio_list,dim=0)
...@@ -426,12 +424,13 @@ def main(): ...@@ -426,12 +424,13 @@ def main():
audio_list = [] audio_list = []
for audio_path in audios: for audio_path in audios:
wav = read_wav_file(audio_path,length,stereo=args.stereo) ## make sure none of audio exceed 30 sec wav = read_wav_file(audio_path,length) ## make sure none of audio exceed 30 sec
if wav.shape[0] == 1 : ## If this audio is mono, we repeat the channel so it becomes "fake stereo"
wav = wav.repeat(2,1)
audio_list.append(wav) audio_list.append(wav)
if not args.stereo:
audio_list = [wav.repeat(2,1) for wav in audio_list] ## Repeat for stereo
audio_input = torch.stack(audio_list,dim=0) audio_input = torch.stack(audio_list,dim=0)
audio_input = audio_input.to(device) audio_input = audio_input.to(device)
......
...@@ -26,25 +26,27 @@ def pad_wav(waveform, segment_length): ...@@ -26,25 +26,27 @@ def pad_wav(waveform, segment_length):
elif waveform_length > segment_length: elif waveform_length > segment_length:
return waveform[:segment_length] return waveform[:segment_length]
else: else:
pad_wav = torch.zeros(segment_length - waveform_length).to(waveform.device) padded_wav = torch.zeros(segment_length - waveform_length).to(waveform.device)
waveform = torch.cat([waveform, pad_wav]) waveform = torch.cat([waveform, padded_wav])
return waveform return waveform
def read_wav_file(filename, duration_sec,stereo=False): def read_wav_file(filename, duration_sec):
info = torchaudio.info(filename) info = torchaudio.info(filename)
sample_rate = info.sample_rate sample_rate = info.sample_rate
# Calculate the number of frames corresponding to the desired duration # Calculate the number of frames corresponding to the desired duration
num_frames = int(sample_rate * duration_sec) num_frames = int(sample_rate * duration_sec)
waveform, sr = torchaudio.load(filename,num_frames=num_frames) # Faster!!! waveform, sr = torchaudio.load(filename,num_frames=num_frames) # Faster!!!
if stereo : ## Stereo audio if waveform.shape[0] == 2 : ## Stereo audio
resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=44100) resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=44100)
resampled_waveform = resampler(waveform) resampled_waveform = resampler(waveform)
#print(resampled_waveform.shape)
padded_left = pad_wav(resampled_waveform[0], int(44100*duration_sec)) ## We pad left and right separately padded_left = pad_wav(resampled_waveform[0], int(44100*duration_sec)) ## We pad left and right separately
padded_right = pad_wav(resampled_waveform[1], int(44100*duration_sec)) padded_right = pad_wav(resampled_waveform[1], int(44100*duration_sec))
......
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 accelerate launch --config_file='configs/accelerator_config.yaml' src.train.py --report_to='wandb' --checkpointing_steps="best" --save_every=5 --config='tangoflux_config.yaml' CUDA_VISIBLE_DEVICES=0,1 accelerate launch --config_file='configs/accelerator_config.yaml' src/train.py --checkpointing_steps="best" --save_every=5 --config='configs/tangoflux_config.yaml'
\ No newline at end of file \ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment