Unverified Commit 34e69f97 authored by moto's avatar moto Committed by GitHub
Browse files

Embed audio samples in generated tutorials (#1985)

It turned out that generated tutorials can embed the audio if the following conditions are met.
This commit changes how audio samples are shown in tutorials so that they become playable in doc.

1. There is only one `IPython.display.Audio` call in a cell
2. An object of `IPython.display.Audio` is the last object the interpreter receives in the cell
3. Audio format is `wav`
   (`flac` can be embedded as well, but browsers (Chrome/Safari) won't play it)

Ref: https://stackoverflow.com/a/33109647
parent c670898c
...@@ -270,7 +270,7 @@ ax1.imshow(spec[0].cpu().detach()) ...@@ -270,7 +270,7 @@ ax1.imshow(spec[0].cpu().detach())
ax2.plot(waveforms[0].cpu().detach()) ax2.plot(waveforms[0].cpu().detach())
torchaudio.save("output_wavernn.wav", waveforms[0:1].cpu(), sample_rate=vocoder.sample_rate) torchaudio.save("output_wavernn.wav", waveforms[0:1].cpu(), sample_rate=vocoder.sample_rate)
IPython.display.display(IPython.display.Audio("output_wavernn.wav")) IPython.display.Audio("output_wavernn.wav")
###################################################################### ######################################################################
...@@ -299,7 +299,7 @@ ax1.imshow(spec[0].cpu().detach()) ...@@ -299,7 +299,7 @@ ax1.imshow(spec[0].cpu().detach())
ax2.plot(waveforms[0].cpu().detach()) ax2.plot(waveforms[0].cpu().detach())
torchaudio.save("output_griffinlim.wav", waveforms[0:1].cpu(), sample_rate=vocoder.sample_rate) torchaudio.save("output_griffinlim.wav", waveforms[0:1].cpu(), sample_rate=vocoder.sample_rate)
IPython.display.display(IPython.display.Audio("output_griffinlim.wav")) IPython.display.Audio("output_griffinlim.wav")
###################################################################### ######################################################################
...@@ -330,4 +330,4 @@ ax1.imshow(spec[0].cpu().detach()) ...@@ -330,4 +330,4 @@ ax1.imshow(spec[0].cpu().detach())
ax2.plot(waveforms[0].cpu().detach()) ax2.plot(waveforms[0].cpu().detach())
torchaudio.save("output_waveglow.wav", waveforms[0:1].cpu(), sample_rate=22050) torchaudio.save("output_waveglow.wav", waveforms[0:1].cpu(), sample_rate=22050)
IPython.display.display(IPython.display.Audio("output_waveglow.wav")) IPython.display.Audio("output_waveglow.wav")
...@@ -56,8 +56,8 @@ print(torch.__version__) ...@@ -56,8 +56,8 @@ print(torch.__version__)
print(torchaudio.__version__) print(torchaudio.__version__)
print(device) print(device)
SPEECH_URL = 'https://download.pytorch.org/torchaudio/test-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.flac' SPEECH_URL = 'https://download.pytorch.org/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav'
SPEECH_FILE = 'speech.flac' SPEECH_FILE = 'speech.wav'
if not os.path.exists(SPEECH_FILE): if not os.path.exists(SPEECH_FILE):
with open(SPEECH_FILE, 'wb') as file: with open(SPEECH_FILE, 'wb') as file:
...@@ -422,18 +422,71 @@ def plot_alignments(trellis, segments, word_segments, waveform): ...@@ -422,18 +422,71 @@ def plot_alignments(trellis, segments, word_segments, waveform):
plot_alignments(trellis, segments, word_segments, waveform[0],) plot_alignments(trellis, segments, word_segments, waveform[0],)
plt.show() plt.show()
# Generate the audio for each segment # A trick to embed the resulting audio to the generated file.
print(transcript) # `IPython.display.Audio` has to be the last call in a cell,
IPython.display.display(IPython.display.Audio(SPEECH_FILE)) # and there should be only one call per cell.
ratio = waveform.size(1) / (trellis.size(0) - 1) def display_segment(i):
for i, word in enumerate(word_segments): ratio = waveform.size(1) / (trellis.size(0) - 1)
word = word_segments[i]
x0 = int(ratio * word.start) x0 = int(ratio * word.start)
x1 = int(ratio * word.end) x1 = int(ratio * word.end)
filename = f"{i}_{word.label}.wav" filename = f"{i}_{word.label}.wav"
torchaudio.save(filename, waveform[:, x0:x1], bundle.sample_rate) torchaudio.save(filename, waveform[:, x0:x1], bundle.sample_rate)
print(f"{word.label}: {x0 / bundle.sample_rate:.3f} - {x1 / bundle.sample_rate:.3f}") print(f"{word.label} ({word.score:.2f}): {x0 / bundle.sample_rate:.3f} - {x1 / bundle.sample_rate:.3f} sec")
IPython.display.display(IPython.display.Audio(filename)) return IPython.display.Audio(filename)
######################################################################
#
# Generate the audio for each segment
print(transcript)
IPython.display.Audio(SPEECH_FILE)
######################################################################
#
display_segment(0)
######################################################################
#
display_segment(1)
######################################################################
#
display_segment(2)
######################################################################
#
display_segment(3)
######################################################################
#
display_segment(4)
######################################################################
#
display_segment(5)
######################################################################
#
display_segment(6)
######################################################################
#
display_segment(7)
######################################################################
#
display_segment(8)
###################################################################### ######################################################################
# Conclusion # Conclusion
......
...@@ -120,7 +120,7 @@ print(model.__class__) ...@@ -120,7 +120,7 @@ print(model.__class__)
# Creative Commons BY 4.0. # Creative Commons BY 4.0.
# #
IPython.display.display(IPython.display.Audio(SPEECH_FILE)) IPython.display.Audio(SPEECH_FILE)
###################################################################### ######################################################################
...@@ -273,7 +273,7 @@ transcript = decoder(emission[0]) ...@@ -273,7 +273,7 @@ transcript = decoder(emission[0])
# #
print(transcript) print(transcript)
IPython.display.display(IPython.display.Audio(SPEECH_FILE)) IPython.display.Audio(SPEECH_FILE)
###################################################################### ######################################################################
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment