"src/diffusers/models/attention.py" did not exist on "31d1f3c8c0c296bbdef9fa1651cfa7995cbed4b1"
Commit c26b38b2 authored by moto's avatar moto Committed by Facebook GitHub Bot
Browse files

Update forced alignment tutorial (#2544)

Summary:
1. Fix initialization.
Previously, the SOS token score was initialized to 0 across the time axis.
This biased the alignment toward delaying the start.
The proper way to delay the SOS is via the blank token.
The new initialization takes the cumulative sum of the blank scores.
2. Fill the end of the trellis with Inf
Similar to the start, at the end, where the number of remaining time frames is
less than the number of tokens, it is no longer possible to align the text, thus
we fill with Inf for better visualization.
3. Clean up asset management code.

Pull Request resolved: https://github.com/pytorch/audio/pull/2544

Reviewed By: nateanl

Differential Revision: D38276478

Pulled By: mthrok

fbshipit-source-id: 6d934cc850a0790b8c463a4f69f8f1143633d299
parent 67cb420d
......@@ -11,6 +11,16 @@ Recognition <https://arxiv.org/abs/2007.09127>`__.
"""
import torch
import torchaudio
print(torch.__version__)
print(torchaudio.__version__)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
######################################################################
# Overview
......@@ -37,32 +47,18 @@ Recognition <https://arxiv.org/abs/2007.09127>`__.
# %matplotlib inline
import os
from dataclasses import dataclass
import IPython
import matplotlib
import matplotlib.pyplot as plt
import requests
import torch
import torchaudio
matplotlib.rcParams["figure.figsize"] = [16.0, 4.8]
torch.random.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(torch.__version__)
print(torchaudio.__version__)
print(device)
SPEECH_URL = "https://download.pytorch.org/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
SPEECH_FILE = "_assets/speech.wav"
SPEECH_FILE = torchaudio.utils.download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")
if not os.path.exists(SPEECH_FILE):
os.makedirs("_assets", exist_ok=True)
with open(SPEECH_FILE, "wb") as file:
file.write(requests.get(SPEECH_URL).content)
######################################################################
# Generate frame-wise label probability
......@@ -156,8 +152,12 @@ def get_trellis(emission, tokens, blank_id=0):
# Trellis has extra dimensions for both time axis and tokens.
# The extra dim for tokens represents <SoS> (start-of-sentence)
# The extra dim for time axis is for simplification of the code.
trellis = torch.full((num_frame + 1, num_tokens + 1), -float("inf"))
trellis[:, 0] = 0
trellis = torch.empty((num_frame + 1, num_tokens + 1))
trellis[0, 0] = 0
trellis[1:, 0] = torch.cumsum(emission[:, 0], 0)
trellis[0, -num_tokens:] = -float("inf")
trellis[-num_tokens:, 0] = float("inf")
for t in range(num_frame):
trellis[t + 1, 1:] = torch.maximum(
# Score for staying at the same token
......@@ -250,7 +250,8 @@ def backtrack(trellis, emission, tokens, blank_id=0):
path = backtrack(trellis, emission, tokens)
print(path)
for p in path:
print(p)
################################################################################
......@@ -449,6 +450,8 @@ plot_alignments(
)
plt.show()
################################################################################
#
# A trick to embed the resulting audio to the generated file.
# `IPython.display.Audio` has to be the last call in a cell,
......@@ -458,10 +461,9 @@ def display_segment(i):
word = word_segments[i]
x0 = int(ratio * word.start)
x1 = int(ratio * word.end)
filename = f"_assets/{i}_{word.label}.wav"
torchaudio.save(filename, waveform[:, x0:x1], bundle.sample_rate)
print(f"{word.label} ({word.score:.2f}): {x0 / bundle.sample_rate:.3f} - {x1 / bundle.sample_rate:.3f} sec")
return IPython.display.Audio(filename)
segment = waveform[:, x0:x1]
return IPython.display.Audio(segment.numpy(), rate=bundle.sample_rate)
######################################################################
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment