finish GradTTS pipeline

1d2551d7 · patil-suraj · 80073936 · 1d2551d7
Commit 1d2551d7 authored Jun 16, 2022 by patil-suraj
Show whitespace changes
Inline Side-by-side

Showing with 21 additions and 6 deletions

src/diffusers/pipelines/pipeline_grad_tts.py src/diffusers/pipelines/pipeline_grad_tts.py +21 -6

No files found.
--- a/src/diffusers/pipelines/pipeline_grad_tts.py
+++ b/src/diffusers/pipelines/pipeline_grad_tts.py
@@ -4,12 +4,13 @@ import math
 import torch
 from torch import nn
+import tqdm
 from diffusers.configuration_utils import ConfigMixin
 from diffusers.modeling_utils import ModelMixin
 from diffusers import DiffusionPipeline
-from .grad_tts_utils import text_to_sequence
+from .grad_tts_utils import GradTTSTokenizer # flake8: noqa
 def sequence_mask(length, max_length=None):
@@ -385,24 +386,29 @@ class TextEncoder(ModelMixin, ConfigMixin):
        x_dp = torch.detach(x)
        logw = self.proj_w(x_dp, x_mask)
-        return mu, logw, x_mask, spk
+        return mu, logw, x_mask
 class GradTTS(DiffusionPipeline):
    def __init__(self, unet, text_encoder, noise_scheduler, tokenizer):
        super().__init__()
        noise_scheduler = noise_scheduler.set_format("pt")
-        self.register_modules(diffwave=unet, text_encoder=text_encoder, noise_scheduler=noise_scheduler, tokenizer=tokenizer)
+        self.register_modules(unet=unet, text_encoder=text_encoder, noise_scheduler=noise_scheduler, tokenizer=tokenizer)
    @torch.no_grad()
    def __call__(self, text, num_inference_steps, generator, temperature, length_scale, speaker_id=None, torch_device=None):
        if torch_device is None:
            torch_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+        self.unet.to(torch_device)
+        self.text_encoder.to(torch_device)
        x, x_lengths = self.tokenizer(text)
+        x = x.to(torch_device)
+        x_lengths = x_lengths.to(torch_device)
        if speaker_id is not None:
-            speaker_id= torch.longTensor([speaker_id])
+            speaker_id= torch.LongTensor([speaker_id]).to(torch_device)
        # Get encoder_outputs `mu_x` and log-scaled token durations `logw`
        mu_x, logw, x_mask = self.text_encoder(x, x_lengths)
@@ -426,6 +432,15 @@ class GradTTS(DiffusionPipeline):
        # Sample latent representation from terminal distribution N(mu_y, I)
        z = mu_y + torch.randn_like(mu_y, device=mu_y.device) / temperature
+        xt = z * y_mask
+        h = 1.0 / num_inference_steps
+        for t in tqdm.tqdm(range(num_inference_steps), total=num_inference_steps):
+            t = (1.0 - (t + 0.5)*h) * torch.ones(z.shape[0], dtype=z.dtype, device=z.device)
+            time = t.unsqueeze(-1).unsqueeze(-1)
+            residual = self.unet(xt, y_mask, mu_y, t, speaker_id)
+            xt = self.noise_scheduler.step(xt, residual, mu_y, h, time)
+            xt = xt * y_mask
+        return xt[:, :, :y_max_length]
\ No newline at end of file