update

8304167c · hungchiayu1 · 282138af · 8304167c · 8304167c · 8304167c
Commit 8304167c authored Dec 30, 2024 by hungchiayu1
9 changed files
--- a/Inference.ipynb
+++ b/Inference.ipynb
@@ -2,17 +2,45 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
-    "data = [{\"captions\": \"Rhythmic wooden table tapping overlaid with steady water pouring sound\", \"location\": \"test.wav\", \"duration\": 10.0} for _ in range(10)]"
+    "data = [{\"captions\": \"Rhythmic wooden table tapping overlaid with steady water pouring sound\", \"location\": \"data/test.wav\", \"duration\": 10.0} for _ in range(10)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from src.utils import read_wav_file\n",
+    "\n",
+    "wav = read_wav_file('data/test.wav',30)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "torch.Size([1, 1323000])\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(wav.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
@@ -102,7 +130,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.11.9"
+   "version": "3.11.10"
  }
 },
 "nbformat": 4,
--- a/configs/accelerator_config.yaml
+++ b/configs/accelerator_config.yaml
+{
+  "compute_environment": "LOCAL_MACHINE",
+  "distributed_type": "MULTI_GPU",
+  "main_process_port": 29512,
+  "downcast_bf16": false,
+  "machine_rank": 0,
+  "gpu_ids": "0,1",
+  "main_training_function": "main",
+  "mixed_precision": "no",
+  "num_machines": 1,
+  "num_processes": 2,
+  "rdzv_backend": "static",
+  "same_network": true,
+  "tpu_use_cluster": false,
+  "tpu_use_sudo": false,
+  "use_cpu": false
+}
\ No newline at end of file
--- a/configs/tangoflux_config.yaml
+++ b/configs/tangoflux_config.yaml
@@ -9,7 +9,7 @@ paths:

 # Training-related parameters
 training:
-  per_device_batch_size: 16
+  per_device_batch_size: 4
  learning_rate: 5e-4
  gradient_accumulation_steps: 1
  num_train_epochs: 80

--- a/data/train.json
+++ b/data/train.json
-[{"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}]
\ No newline at end of file
+[{"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}]
\ No newline at end of file
--- a/data/val.json
+++ b/data/val.json
-[{"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}]
\ No newline at end of file
+[{"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "data/test.wav", "duration": 10.0}]
\ No newline at end of file
--- a/requirements.txt
+++ b/requirements.txt
 torch==2.4.0
 torchaudio==2.4.0
 torchlibrosa==0.1.0
-torchvision==0.19.1
+torchvision==0.19.0
 transformers==4.44.0
 diffusers==0.30.0
 accelerate==0.34.2

--- a/src/train.py
+++ b/src/train.py
@@ -60,7 +60,7 @@ def parse_args():
        help="Config file defining the model size as well as other hyper parameter.",
    )
    parser.add_argument(
-        "--prefix", type=str, default=None,
+        "--prefix", type=str, default='',
        help="Add prefix in text prompts.",
    )
   
@@ -119,10 +119,7 @@ def parse_args():
        help="Whether to continue training from a model weight",
    )

-    parser.add_argument(
-        "--stereo", action='store_true', default=False,
-        help="Whether data is in stereo format",
-    )
+    
    
    args = parser.parse_args()

@@ -348,13 +345,14 @@ def main():
                   
                    for audio_path in audios:
                        
-                        wav = read_wav_file(audio_path,length,stereo=args.stereo) ## Only read the first 30 seconds of audio
+                        wav = read_wav_file(audio_path,length) ## Only read the first 30 seconds of audio
+                        if wav.shape[0] == 1 : ## If this audio is mono, we repeat the channel so it becomes "fake stereo"
+                            wav = wav.repeat(2,1)
                        audio_list.append(wav)
                    
                    

-                    if not args.stereo:
-                        audio_list = [wav.repeat(2,1) for wav in audio_list] ## Our vae expects stereo data, so we have to repeat the channel.
+
                    
                            
                    audio_input = torch.stack(audio_list,dim=0)
@@ -426,12 +424,13 @@ def main():
                audio_list = []
                for audio_path in audios:
                    
-                    wav = read_wav_file(audio_path,length,stereo=args.stereo) ## make sure none of audio exceed 30 sec
+                    wav = read_wav_file(audio_path,length) ## make sure none of audio exceed 30 sec
+                    if wav.shape[0] == 1 : ## If this audio is mono, we repeat the channel so it becomes "fake stereo"
+                        wav = wav.repeat(2,1)
                    audio_list.append(wav)
    
                
-                if not args.stereo:
-                    audio_list = [wav.repeat(2,1) for wav in audio_list] ## Repeat for stereo
+

                audio_input = torch.stack(audio_list,dim=0)
                audio_input = audio_input.to(device)

--- a/src/utils.py
+++ b/src/utils.py
@@ -26,25 +26,27 @@ def pad_wav(waveform, segment_length):
    elif waveform_length > segment_length:
        return waveform[:segment_length]
    else:
-        pad_wav = torch.zeros(segment_length - waveform_length).to(waveform.device)
-        waveform = torch.cat([waveform, pad_wav])
+        padded_wav = torch.zeros(segment_length - waveform_length).to(waveform.device)
+        waveform = torch.cat([waveform, padded_wav])
        return waveform
    
    


-def read_wav_file(filename, duration_sec,stereo=False):
+def read_wav_file(filename, duration_sec):
    info = torchaudio.info(filename)
    sample_rate = info.sample_rate
    
    # Calculate the number of frames corresponding to the desired duration
    num_frames = int(sample_rate * duration_sec)
+
    waveform, sr = torchaudio.load(filename,num_frames=num_frames)  # Faster!!!
    

-    if stereo : ## Stereo audio
+    if waveform.shape[0] == 2 : ## Stereo audio
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=44100)
        resampled_waveform = resampler(waveform)
+        #print(resampled_waveform.shape)
        padded_left = pad_wav(resampled_waveform[0], int(44100*duration_sec)) ## We pad left and right separately
        padded_right = pad_wav(resampled_waveform[1], int(44100*duration_sec))


--- a/train.sh
+++ b/train.sh

-CUDA_VISISBLE_DEVICES=0,1,2,3,4,5 accelerate launch --config_file='configs/accelerator_config.yaml' src.train.py --report_to='wandb'  --checkpointing_steps="best" --save_every=5 --config='tangoflux_config.yaml' 
\ No newline at end of file
+CUDA_VISIBLE_DEVICES=0,1 accelerate launch --config_file='configs/accelerator_config.yaml' src/train.py   --checkpointing_steps="best" --save_every=5 --config='configs/tangoflux_config.yaml' 
\ No newline at end of file