Unverified Commit 34e69f97 authored by moto's avatar moto Committed by GitHub
Browse files

Embed audio samples in generated tutorials (#1985)

It turned out that generated tutorials can embed the audio if the following conditions are met.
This commit changes how audio samples are shown in tutorials so that they become playable in doc.

1. There is only one `IPython.display.Audio` call in a cell
2. An object of `IPython.display.Audio` is the last object the interpreter receives in the cell
3. Audio format is `wav`
   (`flac` can be embedded as well, but browsers (Chrome/Safari) won't play it)

Ref: https://stackoverflow.com/a/33109647
parent c670898c
...@@ -270,7 +270,7 @@ ax1.imshow(spec[0].cpu().detach()) ...@@ -270,7 +270,7 @@ ax1.imshow(spec[0].cpu().detach())
ax2.plot(waveforms[0].cpu().detach()) ax2.plot(waveforms[0].cpu().detach())
torchaudio.save("output_wavernn.wav", waveforms[0:1].cpu(), sample_rate=vocoder.sample_rate) torchaudio.save("output_wavernn.wav", waveforms[0:1].cpu(), sample_rate=vocoder.sample_rate)
IPython.display.display(IPython.display.Audio("output_wavernn.wav")) IPython.display.Audio("output_wavernn.wav")
###################################################################### ######################################################################
...@@ -299,7 +299,7 @@ ax1.imshow(spec[0].cpu().detach()) ...@@ -299,7 +299,7 @@ ax1.imshow(spec[0].cpu().detach())
ax2.plot(waveforms[0].cpu().detach()) ax2.plot(waveforms[0].cpu().detach())
torchaudio.save("output_griffinlim.wav", waveforms[0:1].cpu(), sample_rate=vocoder.sample_rate) torchaudio.save("output_griffinlim.wav", waveforms[0:1].cpu(), sample_rate=vocoder.sample_rate)
IPython.display.display(IPython.display.Audio("output_griffinlim.wav")) IPython.display.Audio("output_griffinlim.wav")
###################################################################### ######################################################################
...@@ -330,4 +330,4 @@ ax1.imshow(spec[0].cpu().detach()) ...@@ -330,4 +330,4 @@ ax1.imshow(spec[0].cpu().detach())
ax2.plot(waveforms[0].cpu().detach()) ax2.plot(waveforms[0].cpu().detach())
torchaudio.save("output_waveglow.wav", waveforms[0:1].cpu(), sample_rate=22050) torchaudio.save("output_waveglow.wav", waveforms[0:1].cpu(), sample_rate=22050)
IPython.display.display(IPython.display.Audio("output_waveglow.wav")) IPython.display.Audio("output_waveglow.wav")
...@@ -56,8 +56,8 @@ print(torch.__version__) ...@@ -56,8 +56,8 @@ print(torch.__version__)
print(torchaudio.__version__) print(torchaudio.__version__)
print(device) print(device)
SPEECH_URL = 'https://download.pytorch.org/torchaudio/test-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.flac' SPEECH_URL = 'https://download.pytorch.org/torchaudio/tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav'
SPEECH_FILE = 'speech.flac' SPEECH_FILE = 'speech.wav'
if not os.path.exists(SPEECH_FILE): if not os.path.exists(SPEECH_FILE):
with open(SPEECH_FILE, 'wb') as file: with open(SPEECH_FILE, 'wb') as file:
...@@ -422,18 +422,71 @@ def plot_alignments(trellis, segments, word_segments, waveform): ...@@ -422,18 +422,71 @@ def plot_alignments(trellis, segments, word_segments, waveform):
plot_alignments(trellis, segments, word_segments, waveform[0],) plot_alignments(trellis, segments, word_segments, waveform[0],)
plt.show() plt.show()
# Generate the audio for each segment # A trick to embed the resulting audio to the generated file.
print(transcript) # `IPython.display.Audio` has to be the last call in a cell,
IPython.display.display(IPython.display.Audio(SPEECH_FILE)) # and there should be only one call per cell.
ratio = waveform.size(1) / (trellis.size(0) - 1) def display_segment(i):
for i, word in enumerate(word_segments): ratio = waveform.size(1) / (trellis.size(0) - 1)
word = word_segments[i]
x0 = int(ratio * word.start) x0 = int(ratio * word.start)
x1 = int(ratio * word.end) x1 = int(ratio * word.end)
filename = f"{i}_{word.label}.wav" filename = f"{i}_{word.label}.wav"
torchaudio.save(filename, waveform[:, x0:x1], bundle.sample_rate) torchaudio.save(filename, waveform[:, x0:x1], bundle.sample_rate)
print(f"{word.label}: {x0 / bundle.sample_rate:.3f} - {x1 / bundle.sample_rate:.3f}") print(f"{word.label} ({word.score:.2f}): {x0 / bundle.sample_rate:.3f} - {x1 / bundle.sample_rate:.3f} sec")
IPython.display.display(IPython.display.Audio(filename)) return IPython.display.Audio(filename)
######################################################################
#
# Generate the audio for each segment
print(transcript)
IPython.display.Audio(SPEECH_FILE)
######################################################################
#
display_segment(0)
######################################################################
#
display_segment(1)
######################################################################
#
display_segment(2)
######################################################################
#
display_segment(3)
######################################################################
#
display_segment(4)
######################################################################
#
display_segment(5)
######################################################################
#
display_segment(6)
######################################################################
#
display_segment(7)
######################################################################
#
display_segment(8)
###################################################################### ######################################################################
# Conclusion # Conclusion
......
...@@ -120,7 +120,7 @@ print(model.__class__) ...@@ -120,7 +120,7 @@ print(model.__class__)
# Creative Commons BY 4.0. # Creative Commons BY 4.0.
# #
IPython.display.display(IPython.display.Audio(SPEECH_FILE)) IPython.display.Audio(SPEECH_FILE)
###################################################################### ######################################################################
...@@ -273,7 +273,7 @@ transcript = decoder(emission[0]) ...@@ -273,7 +273,7 @@ transcript = decoder(emission[0])
# #
print(transcript) print(transcript)
IPython.display.display(IPython.display.Audio(SPEECH_FILE)) IPython.display.Audio(SPEECH_FILE)
###################################################################### ######################################################################
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment