Commit 8304167c authored by hungchiayu1's avatar hungchiayu1
Browse files

update

parent 282138af
......@@ -2,17 +2,45 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"data = [{\"captions\": \"Rhythmic wooden table tapping overlaid with steady water pouring sound\", \"location\": \"test.wav\", \"duration\": 10.0} for _ in range(10)]"
"data = [{\"captions\": \"Rhythmic wooden table tapping overlaid with steady water pouring sound\", \"location\": \"data/test.wav\", \"duration\": 10.0} for _ in range(10)]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from src.utils import read_wav_file\n",
"\n",
"wav = read_wav_file('data/test.wav',30)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([1, 1323000])\n"
]
}
],
"source": [
"print(wav.shape)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
......@@ -102,7 +130,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
"version": "3.11.10"
}
},
"nbformat": 4,
{
"compute_environment": "LOCAL_MACHINE",
"distributed_type": "MULTI_GPU",
"main_process_port": 29512,
"downcast_bf16": false,
"machine_rank": 0,
"gpu_ids": "0,1",
"main_training_function": "main",
"mixed_precision": "no",
"num_machines": 1,
"num_processes": 2,
"rdzv_backend": "static",
"same_network": true,
"tpu_use_cluster": false,
"tpu_use_sudo": false,
"use_cpu": false
}
\ No newline at end of file
......@@ -9,7 +9,7 @@ paths:
# Training-related parameters
training:
per_device_batch_size: 16
per_device_batch_size: 4
learning_rate: 5e-4
gradient_accumulation_steps: 1
num_train_epochs: 80
......
[{"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}]
\ No newline at end of file
[{"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}]
\ No newline at end of file
[{"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}]
\ No newline at end of file
[{"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}]
\ No newline at end of file
......@@ -60,7 +60,7 @@ def parse_args():
help="Config file defining the model size as well as other hyper parameter.",
)
parser.add_argument(
"--prefix", type=str, default=None,
"--prefix", type=str, default='',
help="Add prefix in text prompts.",
)
......@@ -119,10 +119,7 @@ def parse_args():
help="Whether to continue training from a model weight",
)
parser.add_argument(
"--stereo", action='store_true', default=False,
help="Whether data is in stereo format",
)
args = parser.parse_args()
......@@ -348,13 +345,14 @@ def main():
for audio_path in audios:
wav = read_wav_file(audio_path,length,stereo=args.stereo) ## Only read the first 30 seconds of audio
wav = read_wav_file(audio_path,length) ## Only read the first 30 seconds of audio
if wav.shape[0] == 1 : ## If this audio is mono, we repeat the channel so it becomes "fake stereo"
wav = wav.repeat(2,1)
audio_list.append(wav)
if not args.stereo:
audio_list = [wav.repeat(2,1) for wav in audio_list] ## Our vae expects stereo data, so we have to repeat the channel.
audio_input = torch.stack(audio_list,dim=0)
......@@ -426,12 +424,13 @@ def main():
audio_list = []
for audio_path in audios:
wav = read_wav_file(audio_path,length,stereo=args.stereo) ## make sure none of audio exceed 30 sec
wav = read_wav_file(audio_path,length) ## make sure none of audio exceed 30 sec
if wav.shape[0] == 1 : ## If this audio is mono, we repeat the channel so it becomes "fake stereo"
wav = wav.repeat(2,1)
audio_list.append(wav)
if not args.stereo:
audio_list = [wav.repeat(2,1) for wav in audio_list] ## Repeat for stereo
audio_input = torch.stack(audio_list,dim=0)
audio_input = audio_input.to(device)
......
......@@ -26,25 +26,27 @@ def pad_wav(waveform, segment_length):
elif waveform_length > segment_length:
return waveform[:segment_length]
else:
pad_wav = torch.zeros(segment_length - waveform_length).to(waveform.device)
waveform = torch.cat([waveform, pad_wav])
padded_wav = torch.zeros(segment_length - waveform_length).to(waveform.device)
waveform = torch.cat([waveform, padded_wav])
return waveform
def read_wav_file(filename, duration_sec,stereo=False):
def read_wav_file(filename, duration_sec):
info = torchaudio.info(filename)
sample_rate = info.sample_rate
# Calculate the number of frames corresponding to the desired duration
num_frames = int(sample_rate * duration_sec)
waveform, sr = torchaudio.load(filename,num_frames=num_frames) # Faster!!!
if stereo : ## Stereo audio
if waveform.shape[0] == 2 : ## Stereo audio
resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=44100)
resampled_waveform = resampler(waveform)
#print(resampled_waveform.shape)
padded_left = pad_wav(resampled_waveform[0], int(44100*duration_sec)) ## We pad left and right separately
padded_right = pad_wav(resampled_waveform[1], int(44100*duration_sec))
......
CUDA_VISISBLE_DEVICES=0,1,2,3,4,5 accelerate launch --config_file='configs/accelerator_config.yaml' src.train.py --report_to='wandb' --checkpointing_steps="best" --save_every=5 --config='tangoflux_config.yaml'
\ No newline at end of file
# Restrict the run to GPUs 0 and 1, then launch src/train.py through accelerate.
# The variable must be spelled CUDA_VISIBLE_DEVICES; a misspelled name is silently
# ignored, leaving every GPU on the machine visible to the process.
CUDA_VISIBLE_DEVICES=0,1 accelerate launch --config_file='configs/accelerator_config.yaml' src/train.py --checkpointing_steps="best" --save_every=5 --config='configs/tangoflux_config.yaml'
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment