Commit 9bbd4600, authored and committed by hwangjeff
Browse files

Rename utterance to transcript in datasets (#1841)

parent 94027791
...@@ -18,7 +18,7 @@ def get_mock_dataset(root_dir): ...@@ -18,7 +18,7 @@ def get_mock_dataset(root_dir):
""" """
mocked_data = [] mocked_data = []
sample_rate = 16000 sample_rate = 16000
utterance = "This is a test utterance." transcript = "This is a test transcript."
base_dir = os.path.join(root_dir, "ARCTIC", "cmu_us_aew_arctic") base_dir = os.path.join(root_dir, "ARCTIC", "cmu_us_aew_arctic")
txt_dir = os.path.join(base_dir, "etc") txt_dir = os.path.join(base_dir, "etc")
...@@ -44,11 +44,11 @@ def get_mock_dataset(root_dir): ...@@ -44,11 +44,11 @@ def get_mock_dataset(root_dir):
sample = ( sample = (
normalize_wav(data), normalize_wav(data),
sample_rate, sample_rate,
utterance, transcript,
utterance_id.split("_")[1], utterance_id.split("_")[1],
) )
mocked_data.append(sample) mocked_data.append(sample)
txt.write(f'( {utterance_id} "{utterance}" )\n') txt.write(f'( {utterance_id} "{transcript}" )\n')
seed += 1 seed += 1
return mocked_data return mocked_data
...@@ -66,10 +66,10 @@ class TestCMUARCTIC(TempDirMixin, TorchaudioTestCase): ...@@ -66,10 +66,10 @@ class TestCMUARCTIC(TempDirMixin, TorchaudioTestCase):
def _test_cmuarctic(self, dataset): def _test_cmuarctic(self, dataset):
n_ite = 0 n_ite = 0
for i, (waveform, sample_rate, utterance, utterance_id) in enumerate(dataset): for i, (waveform, sample_rate, transcript, utterance_id) in enumerate(dataset):
expected_sample = self.samples[i] expected_sample = self.samples[i]
assert sample_rate == expected_sample[1] assert sample_rate == expected_sample[1]
assert utterance == expected_sample[2] assert transcript == expected_sample[2]
assert utterance_id == expected_sample[3] assert utterance_id == expected_sample[3]
self.assertEqual(expected_sample[0], waveform, atol=5e-5, rtol=1e-8) self.assertEqual(expected_sample[0], waveform, atol=5e-5, rtol=1e-8)
n_ite += 1 n_ite += 1
......
...@@ -11,7 +11,7 @@ from torchaudio_unittest.common_utils import ( ...@@ -11,7 +11,7 @@ from torchaudio_unittest.common_utils import (
from torchaudio.datasets import librispeech from torchaudio.datasets import librispeech
# Used to generate a unique utterance for each dummy audio file # Used to generate a unique transcript for each dummy audio file
_NUMBERS = [ _NUMBERS = [
'ZERO', 'ZERO',
'ONE', 'ONE',
...@@ -51,11 +51,11 @@ def get_mock_dataset(root_dir): ...@@ -51,11 +51,11 @@ def get_mock_dataset(root_dir):
filename = f'{speaker_id}-{chapter_id}-{utterance_id:04d}.wav' filename = f'{speaker_id}-{chapter_id}-{utterance_id:04d}.wav'
path = os.path.join(chapter_path, filename) path = os.path.join(chapter_path, filename)
utterance = ' '.join( transcript = ' '.join(
[_NUMBERS[x] for x in [speaker_id, chapter_id, utterance_id]] [_NUMBERS[x] for x in [speaker_id, chapter_id, utterance_id]]
) )
trans_content.append( trans_content.append(
f'{speaker_id}-{chapter_id}-{utterance_id:04d} {utterance}' f'{speaker_id}-{chapter_id}-{utterance_id:04d} {transcript}'
) )
data = get_whitenoise( data = get_whitenoise(
...@@ -69,7 +69,7 @@ def get_mock_dataset(root_dir): ...@@ -69,7 +69,7 @@ def get_mock_dataset(root_dir):
sample = ( sample = (
normalize_wav(data), normalize_wav(data),
sample_rate, sample_rate,
utterance, transcript,
speaker_id, speaker_id,
chapter_id, chapter_id,
utterance_id utterance_id
...@@ -104,11 +104,11 @@ class TestLibriSpeech(TempDirMixin, TorchaudioTestCase): ...@@ -104,11 +104,11 @@ class TestLibriSpeech(TempDirMixin, TorchaudioTestCase):
def _test_librispeech(self, dataset): def _test_librispeech(self, dataset):
num_samples = 0 num_samples = 0
for i, ( for i, (
data, sample_rate, utterance, speaker_id, chapter_id, utterance_id data, sample_rate, transcript, speaker_id, chapter_id, utterance_id
) in enumerate(dataset): ) in enumerate(dataset):
self.assertEqual(data, self.samples[i][0], atol=5e-5, rtol=1e-8) self.assertEqual(data, self.samples[i][0], atol=5e-5, rtol=1e-8)
assert sample_rate == self.samples[i][1] assert sample_rate == self.samples[i][1]
assert utterance == self.samples[i][2] assert transcript == self.samples[i][2]
assert speaker_id == self.samples[i][3] assert speaker_id == self.samples[i][3]
assert chapter_id == self.samples[i][4] assert chapter_id == self.samples[i][4]
assert utterance_id == self.samples[i][5] assert utterance_id == self.samples[i][5]
......
...@@ -11,8 +11,8 @@ from torchaudio_unittest.common_utils import ( ...@@ -11,8 +11,8 @@ from torchaudio_unittest.common_utils import (
normalize_wav, normalize_wav,
) )
# Used to generate a unique utterance for each dummy audio file # Used to generate a unique transcript for each dummy audio file
_UTTERANCE = [ _TRANSCRIPT = [
'Please call Stella', 'Please call Stella',
'Ask her to bring these things', 'Ask her to bring these things',
'with her from the store', 'with her from the store',
...@@ -59,14 +59,14 @@ def get_mock_dataset(root_dir): ...@@ -59,14 +59,14 @@ def get_mock_dataset(root_dir):
save_wav(audio_file_path, data, sample_rate) save_wav(audio_file_path, data, sample_rate)
txt_file_path = os.path.join(file_dir, filename[:-5] + '.txt') txt_file_path = os.path.join(file_dir, filename[:-5] + '.txt')
utterance = _UTTERANCE[utterance_id - 1] transcript = _TRANSCRIPT[utterance_id - 1]
with open(txt_file_path, 'w') as f: with open(txt_file_path, 'w') as f:
f.write(utterance) f.write(transcript)
sample = ( sample = (
normalize_wav(data), normalize_wav(data),
sample_rate, sample_rate,
utterance, transcript,
speaker_id, speaker_id,
utterance_id utterance_id
) )
...@@ -88,10 +88,10 @@ class TestVCTK(TempDirMixin, TorchaudioTestCase): ...@@ -88,10 +88,10 @@ class TestVCTK(TempDirMixin, TorchaudioTestCase):
def _test_vctk(self, dataset): def _test_vctk(self, dataset):
num_samples = 0 num_samples = 0
for i, (data, sample_rate, utterance, speaker_id, utterance_id) in enumerate(dataset): for i, (data, sample_rate, transcript, speaker_id, utterance_id) in enumerate(dataset):
self.assertEqual(data, self.samples[i][0], atol=5e-5, rtol=1e-8) self.assertEqual(data, self.samples[i][0], atol=5e-5, rtol=1e-8)
assert sample_rate == self.samples[i][1] assert sample_rate == self.samples[i][1]
assert utterance == self.samples[i][2] assert transcript == self.samples[i][2]
assert speaker_id == self.samples[i][3] assert speaker_id == self.samples[i][3]
assert int(utterance_id) == self.samples[i][4] assert int(utterance_id) == self.samples[i][4]
num_samples += 1 num_samples += 1
......
...@@ -58,10 +58,10 @@ def load_cmuarctic_item(line: str, ...@@ -58,10 +58,10 @@ def load_cmuarctic_item(line: str,
folder_audio: str, folder_audio: str,
ext_audio: str) -> Tuple[Tensor, int, str, str]: ext_audio: str) -> Tuple[Tensor, int, str, str]:
utterance_id, utterance = line[0].strip().split(" ", 2)[1:] utterance_id, transcript = line[0].strip().split(" ", 2)[1:]
# Remove space, double quote, and single parenthesis from utterance # Remove space, double quote, and single parenthesis from transcript
utterance = utterance[1:-3] transcript = transcript[1:-3]
file_audio = os.path.join(path, folder_audio, utterance_id + ext_audio) file_audio = os.path.join(path, folder_audio, utterance_id + ext_audio)
...@@ -71,7 +71,7 @@ def load_cmuarctic_item(line: str, ...@@ -71,7 +71,7 @@ def load_cmuarctic_item(line: str,
return ( return (
waveform, waveform,
sample_rate, sample_rate,
utterance, transcript,
utterance_id.split("_")[1] utterance_id.split("_")[1]
) )
...@@ -164,7 +164,7 @@ class CMUARCTIC(Dataset): ...@@ -164,7 +164,7 @@ class CMUARCTIC(Dataset):
n (int): The index of the sample to be loaded n (int): The index of the sample to be loaded
Returns: Returns:
tuple: ``(waveform, sample_rate, utterance, utterance_id)`` tuple: ``(waveform, sample_rate, transcript, utterance_id)``
""" """
line = self._walker[n] line = self._walker[n]
return load_cmuarctic_item(line, self._path, self._folder_audio, self._ext_audio) return load_cmuarctic_item(line, self._path, self._folder_audio, self._ext_audio)
......
...@@ -49,7 +49,7 @@ def load_librispeech_item(fileid: str, ...@@ -49,7 +49,7 @@ def load_librispeech_item(fileid: str,
# Load text # Load text
with open(file_text) as ft: with open(file_text) as ft:
for line in ft: for line in ft:
fileid_text, utterance = line.strip().split(" ", 1) fileid_text, transcript = line.strip().split(" ", 1)
if fileid_audio == fileid_text: if fileid_audio == fileid_text:
break break
else: else:
...@@ -59,7 +59,7 @@ def load_librispeech_item(fileid: str, ...@@ -59,7 +59,7 @@ def load_librispeech_item(fileid: str,
return ( return (
waveform, waveform,
sample_rate, sample_rate,
utterance, transcript,
int(speaker_id), int(speaker_id),
int(chapter_id), int(chapter_id),
int(utterance_id), int(utterance_id),
...@@ -133,7 +133,7 @@ class LIBRISPEECH(Dataset): ...@@ -133,7 +133,7 @@ class LIBRISPEECH(Dataset):
n (int): The index of the sample to be loaded n (int): The index of the sample to be loaded
Returns: Returns:
tuple: ``(waveform, sample_rate, utterance, speaker_id, chapter_id, utterance_id)`` tuple: ``(waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id)``
""" """
fileid = self._walker[n] fileid = self._walker[n]
return load_librispeech_item(fileid, self._path, self._ext_audio, self._ext_txt) return load_librispeech_item(fileid, self._path, self._ext_audio, self._ext_txt)
......
...@@ -241,7 +241,7 @@ class VCTK_092(Dataset): ...@@ -241,7 +241,7 @@ class VCTK_092(Dataset):
return torchaudio.load(file_path) return torchaudio.load(file_path)
def _load_sample(self, speaker_id: str, utterance_id: str, mic_id: str) -> SampleType: def _load_sample(self, speaker_id: str, utterance_id: str, mic_id: str) -> SampleType:
utterance_path = os.path.join( transcript_path = os.path.join(
self._txt_dir, speaker_id, f"{speaker_id}_{utterance_id}.txt" self._txt_dir, speaker_id, f"{speaker_id}_{utterance_id}.txt"
) )
audio_path = os.path.join( audio_path = os.path.join(
...@@ -251,12 +251,12 @@ class VCTK_092(Dataset): ...@@ -251,12 +251,12 @@ class VCTK_092(Dataset):
) )
# Reading text # Reading text
utterance = self._load_text(utterance_path) transcript = self._load_text(transcript_path)
# Reading FLAC # Reading FLAC
waveform, sample_rate = self._load_audio(audio_path) waveform, sample_rate = self._load_audio(audio_path)
return (waveform, sample_rate, utterance, speaker_id, utterance_id) return (waveform, sample_rate, transcript, speaker_id, utterance_id)
def __getitem__(self, n: int) -> SampleType: def __getitem__(self, n: int) -> SampleType:
"""Load the n-th sample from the dataset. """Load the n-th sample from the dataset.
...@@ -265,7 +265,7 @@ class VCTK_092(Dataset): ...@@ -265,7 +265,7 @@ class VCTK_092(Dataset):
n (int): The index of the sample to be loaded n (int): The index of the sample to be loaded
Returns: Returns:
tuple: ``(waveform, sample_rate, utterance, speaker_id, utterance_id)`` tuple: ``(waveform, sample_rate, transcript, speaker_id, utterance_id)``
""" """
speaker_id, utterance_id = self._sample_ids[n] speaker_id, utterance_id = self._sample_ids[n]
return self._load_sample(speaker_id, utterance_id, self._mic_id) return self._load_sample(speaker_id, utterance_id, self._mic_id)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment