upload files

4998f0d5 · hungchiayu1 · d16bd9c0 · 4998f0d5 · 4998f0d5 · 4998f0d5
Commit 4998f0d5 authored Dec 30, 2024 by hungchiayu1
8 changed files
--- a/Inference.ipynb
+++ b/Inference.ipynb
--- a/data/test.wav
+++ b/data/test.wav
--- a/data/train.json
+++ b/data/train.json
+[{"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}]
\ No newline at end of file
--- a/data/val.json
+++ b/data/val.json
+[{"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}, {"captions": "Rhythmic wooden table tapping overlaid with steady water pouring sound", "location": "test.wav", "duration": 10.0}]
\ No newline at end of file
--- a/src/model.py
+++ b/src/model.py
@@ -430,7 +430,7 @@ class TangoFlux(nn.Module):
                        1,
                    )
            loss = loss.mean()
-            raw_model_loss, raw_ref_loss,implicit_acc,epsilon_diff = 0,0,0,0 ## default this to 0 if doing sft
+            raw_model_loss, raw_ref_loss,implicit_acc = 0,0,0 ## default this to 0 if doing sft

        else:
            encoder_hidden_states = encoder_hidden_states.repeat(2, 1, 1)
@@ -494,8 +494,7 @@ class TangoFlux(nn.Module):
                
                
            
-            epsilon_diff = torch.max(torch.zeros_like(model_losses_w), 
-                                      ref_losses_w-model_losses_w).mean()
+            
            
            

@@ -504,8 +503,8 @@ class TangoFlux(nn.Module):
            implicit_acc = (scale_term * (model_diff - ref_diff)  > 0).sum().float() / inside_term.size(0)
            loss = -1 * F.logsigmoid(inside_term).mean()  + model_losses_w.mean() 
        
-        
-        return loss, raw_model_loss, raw_ref_loss, implicit_acc,epsilon_diff
+        ## raw_model_loss, raw_ref_loss and implicit_acc are returned to help analyze DPO behaviour.
+        return loss, raw_model_loss, raw_ref_loss, implicit_acc
        

    
\ No newline at end of file
--- a/src/train.py
+++ b/src/train.py
@@ -367,7 +367,7 @@ def main():
                    audio_latent = unwrapped_vae.encode(audio_input).latent_dist.sample()
                    audio_latent = audio_latent.transpose(1,2) ## Tranpose  to (bsz, seq_len, channel)

-                loss, _, _,_, _  = model(audio_latent, text ,duration=duration)
+                loss, _, _,_  = model(audio_latent, text ,duration=duration)
                total_loss += loss.detach().float()
                accelerator.backward(loss)
                
@@ -413,8 +413,8 @@ def main():
                        output_dir = os.path.join(output_dir, output_dir)
                    accelerator.save_state(output_dir)

-            if completed_steps >= args.max_train_steps:
-                break
+        if completed_steps >= args.max_train_steps:
+            break

        model.eval()
        eval_progress_bar = tqdm(range(len(eval_dataloader)), disable=not accelerator.is_local_main_process)
@@ -441,7 +441,7 @@ def main():
                audio_latent = audio_latent.transpose(1,2) ## Tranpose  to (bsz, seq_len, channel)
                
    
-                val_loss,_, _,_, _  = model(audio_latent, text , duration=duration)
+                val_loss,_, _,_  = model(audio_latent, text , duration=duration)
                
                total_val_loss += val_loss.detach().float()
                eval_progress_bar.update(1)

--- a/src/utils.py
+++ b/src/utils.py
+import torch
+from torch.utils.data import Dataset, DataLoader
+import numpy as np
+import pandas as pd
+
+import torchaudio
+import random
+import itertools
+import numpy as np
+
+
+import numpy as np
+
+    
+def normalize_wav(waveform):
+    """Center a waveform on zero mean and scale its peak magnitude to 0.5.
+
+    Args:
+        waveform: Tensor of audio samples.
+
+    Returns:
+        A new tensor: the input minus its mean, divided by its largest
+        absolute value, then halved.
+    """
+    centered = waveform - torch.mean(waveform)
+    peak = torch.max(torch.abs(centered))
+    # The small epsilon keeps the division finite when the signal is constant.
+    scaled = centered / (peak + 1e-8)
+    return scaled * 0.5
+
+
+def pad_wav(waveform, segment_length):
+    """Trim or zero-pad a 1-D waveform to exactly ``segment_length`` samples.
+
+    Args:
+        waveform: 1-D tensor of audio samples.
+        segment_length: Target number of samples; ``None`` disables both
+            trimming and padding.
+
+    Returns:
+        The input itself when it already has the requested length (or when
+        ``segment_length`` is ``None``), a leading slice when it is too long,
+        or a new tensor with trailing zeros appended when it is too short.
+    """
+    if segment_length is None:
+        return waveform
+
+    waveform_length = len(waveform)
+    if waveform_length == segment_length:
+        return waveform
+    if waveform_length > segment_length:
+        return waveform[:segment_length]
+
+    # new_zeros inherits dtype and device from the input, so the padding never
+    # silently promotes e.g. float16 audio to float32 during concatenation.
+    padding = waveform.new_zeros(segment_length - waveform_length)
+    return torch.cat([waveform, padding])
+    
+    
+
+
+def read_wav_file(filename, duration_sec, stereo=False, target_sr=44100):
+    """Load the start of an audio file, resample it and fix its length.
+
+    Only the first ``duration_sec`` seconds are read from disk. The audio is
+    resampled to ``target_sr`` Hz and each returned channel is trimmed or
+    zero-padded to ``int(target_sr * duration_sec)`` samples.
+
+    Args:
+        filename: Path of the audio file, as accepted by ``torchaudio``.
+        duration_sec: Number of seconds to keep.
+        stereo: When true, return two channels; otherwise return only the
+            first channel.
+        target_sr: Output sample rate in Hz (defaults to 44100).
+
+    Returns:
+        A tensor of shape ``(2, samples)`` when ``stereo`` is true, otherwise
+        ``(1, samples)``.
+    """
+    info = torchaudio.info(filename)
+    # Read only the frames that are needed instead of decoding the whole file.
+    num_frames = int(info.sample_rate * duration_sec)
+    waveform, sr = torchaudio.load(filename, num_frames=num_frames)
+
+    target_length = int(target_sr * duration_sec)
+    resampled = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=target_sr)
+
+    if not stereo:
+        # Mono output keeps only the first channel of the file.
+        return pad_wav(resampled[0], target_length).unsqueeze(0)
+
+    left = resampled[0]
+    # A mono file has no second channel; reuse the first one instead of
+    # failing with an IndexError.
+    right = resampled[1] if resampled.shape[0] > 1 else resampled[0]
+
+    # Left and right are padded separately, then stacked back into two channels.
+    return torch.stack([pad_wav(left, target_length), pad_wav(right, target_length)])
+
+
+
+
+
+class DPOText2AudioDataset(Dataset):
+    """Dataset of prompts paired with two audio entries (``w`` and ``l`` columns).
+
+    Args:
+        dataset: Object indexable by column name (``dataset[column]``) whose
+            columns can be turned into lists.
+        prefix: String prepended to every text prompt.
+        text_column: Name of the column holding the prompts.
+        audio_w_column: Name of the column stored in ``audios_w``.
+        audio_l_column: Name of the column stored in ``audios_l``.
+        duration: Name of the column holding the durations.
+        num_examples: Keep only the first ``num_examples`` rows; ``-1`` keeps
+            everything.
+    """
+
+    def __init__(self, dataset, prefix, text_column, audio_w_column, audio_l_column, duration, num_examples=-1):
+        raw_texts = list(dataset[text_column])
+        prompts = [prefix + text for text in raw_texts]
+        winners = list(dataset[audio_w_column])
+        losers = list(dataset[audio_l_column])
+        lengths = list(dataset[duration])
+        positions = list(range(len(prompts)))
+
+        # The lookup table covers the full dataset (it is built before any
+        # truncation) and stores the text without the prefix.
+        self.mapper = {
+            idx: [win, lose, length, text]
+            for idx, win, lose, length, text in zip(positions, winners, losers, lengths, raw_texts)
+        }
+
+        if num_examples != -1:
+            prompts = prompts[:num_examples]
+            winners = winners[:num_examples]
+            losers = losers[:num_examples]
+            lengths = lengths[:num_examples]
+            positions = positions[:num_examples]
+
+        self.inputs = prompts
+        self.audios_w = winners
+        self.audios_l = losers
+        self.durations = lengths
+        self.indices = positions
+
+    def __len__(self):
+        """Return the number of (possibly truncated) examples."""
+        return len(self.inputs)
+
+    def get_num_instances(self):
+        """Return the number of (possibly truncated) examples."""
+        return len(self.inputs)
+
+    def __getitem__(self, index):
+        """Return ``(prompt, audio_w, audio_l, duration, index)`` for one row."""
+        return (
+            self.inputs[index],
+            self.audios_w[index],
+            self.audios_l[index],
+            self.durations[index],
+            self.indices[index],
+        )
+
+    def collate_fn(self, data):
+        """Transpose a batch of row tuples into one list per field."""
+        frame = pd.DataFrame(data)
+        return [frame[column].tolist() for column in frame.columns]
+
+class Text2AudioDataset(Dataset):
+    """Dataset of text prompts paired with one audio entry and a duration.
+
+    Args:
+        dataset: Object indexable by column name (``dataset[column]``) whose
+            columns can be turned into lists.
+        prefix: String prepended to every text prompt.
+        text_column: Name of the column holding the prompts.
+        audio_column: Name of the column holding the audio entries.
+        duration: Name of the column holding the durations.
+        num_examples: Keep only the first ``num_examples`` rows; ``-1`` keeps
+            everything.
+    """
+
+    def __init__(self, dataset, prefix, text_column, audio_column, duration, num_examples=-1):
+        raw_texts = list(dataset[text_column])
+        prompts = [prefix + text for text in raw_texts]
+        clips = list(dataset[audio_column])
+        lengths = list(dataset[duration])
+        positions = list(range(len(prompts)))
+
+        # The lookup table covers the full dataset (it is built before any
+        # truncation) and stores the text without the prefix.
+        self.mapper = {
+            idx: [clip, text, length]
+            for idx, clip, length, text in zip(positions, clips, lengths, raw_texts)
+        }
+
+        if num_examples != -1:
+            prompts = prompts[:num_examples]
+            clips = clips[:num_examples]
+            lengths = lengths[:num_examples]
+            positions = positions[:num_examples]
+
+        self.inputs = prompts
+        self.audios = clips
+        self.durations = lengths
+        self.indices = positions
+
+    def __len__(self):
+        """Return the number of (possibly truncated) examples."""
+        return len(self.inputs)
+
+    def get_num_instances(self):
+        """Return the number of (possibly truncated) examples."""
+        return len(self.inputs)
+
+    def __getitem__(self, index):
+        """Return ``(prompt, audio, duration, index)`` for one row."""
+        return (
+            self.inputs[index],
+            self.audios[index],
+            self.durations[index],
+            self.indices[index],
+        )
+
+    def collate_fn(self, data):
+        """Transpose a batch of row tuples into one list per field."""
+        frame = pd.DataFrame(data)
+        return [frame[column].tolist() for column in frame.columns]
--- a/train.sh
+++ b/train.sh
+
# Launch training through Accelerate, restricted to GPUs 0-5.
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 accelerate launch --config_file='configs/accelerator_config.yaml' src/train.py --report_to='wandb'  --checkpointing_steps="best" --save_every=5 --config='tangoflux_config.yaml' 
\ No newline at end of file