Commit 0b4b1fd4 authored by Caroline Chen, committed by Facebook GitHub Bot

Add IEMOCAP dataset (#2732)

Summary: Pull Request resolved: https://github.com/pytorch/audio/pull/2732

Reviewed By: nateanl

Differential Revision: D40186996

Pulled By: nateanl

fbshipit-source-id: a0ad325b7153c9e580dad2c515730dadbe8840c4
parent 4609daf7
@@ -32,6 +32,7 @@ For example:
   DR_VCTK
   FluentSpeechCommands
   GTZAN
   IEMOCAP
   LibriMix
   LIBRISPEECH
   LibriLightLimited
...
@@ -423,3 +423,13 @@ author = {Roldano Cattoni and Mattia Antonino {Di Gangi} and Luisa Bentivogli an
keywords = {Spoken language translation, Multilingual corpus},
abstract = {End-to-end spoken language translation (SLT) has recently gained popularity thanks to the advancement of sequence to sequence learning in its two parent tasks: automatic speech recognition (ASR) and machine translation (MT). However, research in the field has to confront with the scarcity of publicly available corpora to train data-hungry neural networks. Indeed, while traditional cascade solutions can build on sizable ASR and MT training data for a variety of languages, the available SLT corpora suitable for end-to-end training are few, typically small and of limited language coverage. We contribute to fill this gap by presenting MuST-C, a large and freely available Multilingual Speech Translation Corpus built from English TED Talks. Its unique features include: i) language coverage and diversity (from English into 14 languages from different families), ii) size (at least 237 hours of transcribed recordings per language, 430 on average), iii) variety of topics and speakers, and iv) data quality. Besides describing the corpus creation methodology and discussing the outcomes of empirical and manual quality evaluations, we present baseline results computed with strong systems on each language direction covered by MuST-C.}
}
@article{iemocap,
author = {Busso, Carlos and Bulut, Murtaza and Lee, Chi-Chun and Kazemzadeh, Abe and Mower Provost, Emily and Kim, Samuel and Chang, Jeannette and Lee, Sungbok and Narayanan, Shrikanth},
year = {2008},
month = {12},
pages = {335-359},
title = {IEMOCAP: Interactive emotional dyadic motion capture database},
volume = {42},
journal = {Language Resources and Evaluation},
doi = {10.1007/s10579-008-9076-6}
}
import os
import random

from torchaudio.datasets import iemocap
from torchaudio_unittest.common_utils import get_whitenoise, save_wav, TempDirMixin, TorchaudioTestCase

LABELS = ["neu", "hap", "ang", "sad", "exc", "xxx"]
SAMPLE_RATE = 16000


def _save_wav(filepath: str, seed: int):
    wav = get_whitenoise(
        sample_rate=SAMPLE_RATE,
        duration=0.01,
        n_channels=1,
        seed=seed,
    )
    save_wav(filepath, wav, SAMPLE_RATE)
    return wav


def _save_label(label_folder: str, filename: str, wav_stem: str):
    # Pick a random label and append an evaluation line to the session's
    # EmoEvaluation file. "exc" (excited) is merged into "hap" (happy),
    # mirroring the dataset's parsing logic.
    label = random.choice(LABELS)
    if label == "exc":
        label = "hap"
    line = f"[xxx]\t{wav_stem}\t{label}\t[yyy]"
    filepath = os.path.join(label_folder, filename)
    with open(filepath, "a") as f:
        f.write(line + "\n")
    return label


def _get_samples(dataset_dir: str, session: int):
    session_folder = os.path.join(dataset_dir, f"Session{session}")
    os.makedirs(session_folder, exist_ok=True)

    wav_folder = os.path.join(session_folder, "sentences", "wav")
    label_folder = os.path.join(session_folder, "dialog", "EmoEvaluation")
    os.makedirs(wav_folder, exist_ok=True)
    os.makedirs(label_folder, exist_ok=True)

    samples = []
    wav_stems = []
    for i in range(5):
        for g in ["F", "M"]:
            speaker = f"Ses0{session}{g}"
            subfolder = f"{speaker}_impro0{i}"
            subfolder_path = os.path.join(wav_folder, subfolder)
            os.makedirs(subfolder_path, exist_ok=True)
            for j in range(5):
                wav_stem = f"{subfolder}_F00{j}"
                wav_stems.append(wav_stem)

    # The dataset yields samples in sorted file-name order, so the expected
    # samples are built in the same order here.
    wav_stems = sorted(wav_stems)
    for wav_stem in wav_stems:
        subfolder = wav_stem[:-5]
        speaker = subfolder.split("_")[0]
        wav_file = os.path.join(wav_folder, subfolder, wav_stem + ".wav")
        wav = _save_wav(wav_file, seed=0)
        label = _save_label(label_folder, subfolder + ".txt", wav_stem)
        # Utterances labeled "xxx" (no inter-annotator agreement) are
        # excluded by the dataset, so they are not expected samples.
        if label == "xxx":
            continue
        sample = (wav, SAMPLE_RATE, wav_stem, label, speaker)
        samples.append(sample)
    return samples


def get_mock_dataset(dataset_dir: str):
    os.makedirs(dataset_dir, exist_ok=True)
    samples = []
    for session in range(1, 4):
        samples += _get_samples(dataset_dir, session)
    return samples


class TestIemocap(TempDirMixin, TorchaudioTestCase):
    root_dir = None
    backend = "default"
    samples = []

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        dataset_dir = os.path.join(cls.root_dir, "IEMOCAP")
        cls.samples = get_mock_dataset(dataset_dir)

    def _testIEMOCAP(self, dataset, samples):
        num_samples = 0
        for i, data in enumerate(dataset):
            self.assertEqual(data, samples[i])
            num_samples += 1
        assert num_samples == len(samples)

    def testIEMOCAPDataset(self):
        dataset = iemocap.IEMOCAP(self.root_dir)
        self._testIEMOCAP(dataset, self.samples)
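Tests following this pattern are run with pytest from the repository root; the exact file path below is assumed from torchaudio's usual test layout, e.g.:

    pytest test/torchaudio_unittest/datasets/iemocap_test.py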
@@ -4,6 +4,7 @@ from .commonvoice import COMMONVOICE
from .dr_vctk import DR_VCTK
from .fluentcommands import FluentSpeechCommands
from .gtzan import GTZAN
from .iemocap import IEMOCAP
from .librilight_limited import LibriLightLimited
from .librimix import LibriMix
from .librispeech import LIBRISPEECH
@@ -38,4 +39,5 @@ __all__ = [
    "FluentSpeechCommands",
    "VoxCeleb1Identification",
    "VoxCeleb1Verification",
    "IEMOCAP",
]
import os
import re
from pathlib import Path
from typing import Tuple, Union

from torch import Tensor
from torch.utils.data import Dataset
from torchaudio.datasets.utils import _load_waveform

_SAMPLE_RATE = 16000


def _get_wavs_paths(data_dir):
    wav_dir = data_dir / "sentences" / "wav"
    wav_paths = sorted(str(p) for p in wav_dir.glob("*/*.wav"))
    relative_paths = []
    for wav_path in wav_paths:
        # Keep paths relative to the dataset root, starting at "SessionN".
        start = wav_path.find("Session")
        wav_path = wav_path[start:]
        relative_paths.append(wav_path)
    return relative_paths


class IEMOCAP(Dataset):
    """*IEMOCAP* :cite:`iemocap` dataset.

    Args:
        root (str or Path): Root directory where the dataset's top level directory is found.
        sessions (Tuple[int]): Tuple of sessions (1-5) to use. (Default: ``(1, 2, 3, 4, 5)``)
    """

    def __init__(
        self,
        root: Union[str, Path],
        sessions: Tuple[int] = (1, 2, 3, 4, 5),
    ):
        root = Path(root)
        self._path = root / "IEMOCAP"

        if not os.path.isdir(self._path):
            raise RuntimeError("Dataset not found.")

        all_data = []
        self.data = []
        self.mapping = {}

        for session in sessions:
            session_name = f"Session{session}"
            session_dir = self._path / session_name

            # get wav paths
            wav_paths = _get_wavs_paths(session_dir)
            for wav_path in wav_paths:
                wav_stem = str(Path(wav_path).stem)
                all_data.append(wav_stem)

            # add labels
            label_dir = session_dir / "dialog" / "EmoEvaluation"
            label_paths = label_dir.glob("*.txt")

            for label_path in label_paths:
                with open(label_path, "r") as f:
                    for line in f:
                        # Evaluation lines start with a bracketed timing field:
                        # "[start - end]\t<wav_stem>\t<label>\t[V, A, D]"
                        if not line.startswith("["):
                            continue
                        line = re.split("[\t\n]", line)
                        wav_stem = line[1]
                        label = line[2]
                        # Merge "excited" into "happy".
                        if label == "exc":
                            label = "hap"
                        if wav_stem not in all_data:
                            continue
                        # Keep only the four target emotion classes.
                        if label not in ["neu", "hap", "ang", "sad"]:
                            continue
                        self.mapping[wav_stem] = {}
                        self.mapping[wav_stem]["label"] = label

            for wav_path in wav_paths:
                wav_stem = str(Path(wav_path).stem)
                if wav_stem in self.mapping:
                    self.data.append(wav_stem)
                    self.mapping[wav_stem]["path"] = wav_path

    def get_metadata(self, n: int) -> Tuple[str, int, str, str, str]:
        """Get metadata for the n-th sample from the dataset. Returns filepath instead of waveform,
        but otherwise returns the same fields as :py:meth:`__getitem__`.

        Args:
            n (int): The index of the sample to be loaded

        Returns:
            Tuple of the following items;

            str:
                Path to audio
            int:
                Sample rate
            str:
                File name
            str:
                Label (one of ``"neu"``, ``"hap"``, ``"ang"``, ``"sad"``)
            str:
                Speaker
        """
        wav_stem = self.data[n]
        wav_path = self.mapping[wav_stem]["path"]
        label = self.mapping[wav_stem]["label"]
        speaker = wav_stem.split("_")[0]
        return (wav_path, _SAMPLE_RATE, wav_stem, label, speaker)

    def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str, str]:
        """Load the n-th sample from the dataset.

        Args:
            n (int): The index of the sample to be loaded

        Returns:
            Tuple of the following items;

            Tensor:
                Waveform
            int:
                Sample rate
            str:
                File name
            str:
                Label (one of ``"neu"``, ``"hap"``, ``"ang"``, ``"sad"``)
            str:
                Speaker
        """
        metadata = self.get_metadata(n)
        waveform = _load_waveform(self._path, metadata[0], metadata[1])
        return (waveform,) + metadata[1:]

    def __len__(self):
        return len(self.data)
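A minimal usage sketch of the new dataset API. The "./data" root below is a hypothetical local path: the IEMOCAP corpus must be obtained separately and extracted so that ./data/IEMOCAP exists, since torchaudio does not download it.

    from torchaudio.datasets import IEMOCAP

    # Hypothetical root; the constructor expects ./data/IEMOCAP to exist.
    dataset = IEMOCAP("./data", sessions=(1, 2))

    # Full sample: decoded waveform plus metadata.
    waveform, sample_rate, file_name, label, speaker = dataset[0]

    # Metadata only (file path instead of waveform), no audio decoding.
    path, sample_rate, file_name, label, speaker = dataset.get_metadata(0)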