Commit 0b4b1fd4 authored by Caroline Chen, committed by Facebook GitHub Bot

Add IEMOCAP dataset (#2732)

Summary: Pull Request resolved: https://github.com/pytorch/audio/pull/2732

Reviewed By: nateanl

Differential Revision: D40186996

Pulled By: nateanl

fbshipit-source-id: a0ad325b7153c9e580dad2c515730dadbe8840c4
parent 4609daf7
@@ -32,6 +32,7 @@ For example:
   DR_VCTK
   FluentSpeechCommands
   GTZAN
   IEMOCAP
   LibriMix
   LIBRISPEECH
   LibriLightLimited
...
@@ -423,3 +423,13 @@ author = {Roldano Cattoni and Mattia Antonino {Di Gangi} and Luisa Bentivogli an
keywords = {Spoken language translation, Multilingual corpus},
abstract = {End-to-end spoken language translation (SLT) has recently gained popularity thanks to the advancement of sequence to sequence learning in its two parent tasks: automatic speech recognition (ASR) and machine translation (MT). However, research in the field has to confront with the scarcity of publicly available corpora to train data-hungry neural networks. Indeed, while traditional cascade solutions can build on sizable ASR and MT training data for a variety of languages, the available SLT corpora suitable for end-to-end training are few, typically small and of limited language coverage. We contribute to fill this gap by presenting MuST-C, a large and freely available Multilingual Speech Translation Corpus built from English TED Talks. Its unique features include: i) language coverage and diversity (from English into 14 languages from different families), ii) size (at least 237 hours of transcribed recordings per language, 430 on average), iii) variety of topics and speakers, and iv) data quality. Besides describing the corpus creation methodology and discussing the outcomes of empirical and manual quality evaluations, we present baseline results computed with strong systems on each language direction covered by MuST-C.}
}
@article{iemocap,
author = {Busso, Carlos and Bulut, Murtaza and Lee, Chi-Chun and Kazemzadeh, Abe and Mower Provost, Emily and Kim, Samuel and Chang, Jeannette and Lee, Sungbok and Narayanan, Shrikanth},
year = {2008},
month = {12},
pages = {335-359},
title = {IEMOCAP: Interactive emotional dyadic motion capture database},
volume = {42},
journal = {Language Resources and Evaluation},
doi = {10.1007/s10579-008-9076-6}
}
import os
import random

from torchaudio.datasets import iemocap
from torchaudio_unittest.common_utils import get_whitenoise, save_wav, TempDirMixin, TorchaudioTestCase

LABELS = ["neu", "hap", "ang", "sad", "exc", "xxx"]
SAMPLE_RATE = 16000


def _save_wav(filepath: str, seed: int):
    wav = get_whitenoise(
        sample_rate=SAMPLE_RATE,
        duration=0.01,
        n_channels=1,
        seed=seed,
    )
    save_wav(filepath, wav, SAMPLE_RATE)
    return wav


def _save_label(label_folder: str, filename: str, wav_stem: str):
    # Pick a random label and append an evaluation line to the session's
    # EmoEvaluation file. "exc" (excited) is merged into "hap" (happy),
    # mirroring the dataset's parsing logic.
    label = random.choice(LABELS)
    if label == "exc":
        label = "hap"
    line = f"[xxx]\t{wav_stem}\t{label}\t[yyy]"
    filepath = os.path.join(label_folder, filename)
    with open(filepath, "a") as f:
        f.write(line + "\n")
    return label


def _get_samples(dataset_dir: str, session: int):
    session_folder = os.path.join(dataset_dir, f"Session{session}")
    os.makedirs(session_folder, exist_ok=True)

    wav_folder = os.path.join(session_folder, "sentences", "wav")
    label_folder = os.path.join(session_folder, "dialog", "EmoEvaluation")
    os.makedirs(wav_folder, exist_ok=True)
    os.makedirs(label_folder, exist_ok=True)

    samples = []
    wav_stems = []
    for i in range(5):
        for g in ["F", "M"]:
            speaker = f"Ses0{session}{g}"
            subfolder = f"{speaker}_impro0{i}"
            subfolder_path = os.path.join(wav_folder, subfolder)
            os.makedirs(subfolder_path, exist_ok=True)
            for j in range(5):
                wav_stem = f"{subfolder}_F00{j}"
                wav_stems.append(wav_stem)

    # The dataset yields samples in sorted file-name order, so the expected
    # samples are built in the same order here.
    wav_stems = sorted(wav_stems)
    for wav_stem in wav_stems:
        subfolder = wav_stem[:-5]
        speaker = subfolder.split("_")[0]
        wav_file = os.path.join(wav_folder, subfolder, wav_stem + ".wav")
        wav = _save_wav(wav_file, seed=0)
        label = _save_label(label_folder, subfolder + ".txt", wav_stem)
        # Utterances labeled "xxx" (no inter-annotator agreement) are
        # excluded by the dataset, so they are not expected samples.
        if label == "xxx":
            continue
        sample = (wav, SAMPLE_RATE, wav_stem, label, speaker)
        samples.append(sample)
    return samples


def get_mock_dataset(dataset_dir: str):
    os.makedirs(dataset_dir, exist_ok=True)
    samples = []
    for session in range(1, 4):
        samples += _get_samples(dataset_dir, session)
    return samples


class TestIemocap(TempDirMixin, TorchaudioTestCase):
    root_dir = None
    backend = "default"
    samples = []

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        dataset_dir = os.path.join(cls.root_dir, "IEMOCAP")
        cls.samples = get_mock_dataset(dataset_dir)

    def _testIEMOCAP(self, dataset, samples):
        num_samples = 0
        for i, data in enumerate(dataset):
            self.assertEqual(data, samples[i])
            num_samples += 1
        assert num_samples == len(samples)

    def testIEMOCAPDataset(self):
        dataset = iemocap.IEMOCAP(self.root_dir)
        self._testIEMOCAP(dataset, self.samples)
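Tests following this pattern are run with pytest from the repository root; the exact file path below is assumed from torchaudio's usual test layout, e.g.:

    pytest test/torchaudio_unittest/datasets/iemocap_test.py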
@@ -4,6 +4,7 @@ from .commonvoice import COMMONVOICE
from .dr_vctk import DR_VCTK
from .fluentcommands import FluentSpeechCommands
from .gtzan import GTZAN
from .iemocap import IEMOCAP
from .librilight_limited import LibriLightLimited
from .librimix import LibriMix
from .librispeech import LIBRISPEECH
@@ -38,4 +39,5 @@ __all__ = [
    "FluentSpeechCommands",
    "VoxCeleb1Identification",
    "VoxCeleb1Verification",
    "IEMOCAP",
]
import os
import re
from pathlib import Path
from typing import Tuple, Union

from torch import Tensor
from torch.utils.data import Dataset
from torchaudio.datasets.utils import _load_waveform

_SAMPLE_RATE = 16000


def _get_wavs_paths(data_dir):
    wav_dir = data_dir / "sentences" / "wav"
    wav_paths = sorted(str(p) for p in wav_dir.glob("*/*.wav"))
    relative_paths = []
    for wav_path in wav_paths:
        # Keep paths relative to the dataset root, starting at "SessionN".
        start = wav_path.find("Session")
        wav_path = wav_path[start:]
        relative_paths.append(wav_path)
    return relative_paths


class IEMOCAP(Dataset):
    """*IEMOCAP* :cite:`iemocap` dataset.

    Args:
        root (str or Path): Root directory where the dataset's top level directory is found.
        sessions (Tuple[int]): Tuple of sessions (1-5) to use. (Default: ``(1, 2, 3, 4, 5)``)
    """

    def __init__(
        self,
        root: Union[str, Path],
        sessions: Tuple[int] = (1, 2, 3, 4, 5),
    ):
        root = Path(root)
        self._path = root / "IEMOCAP"

        if not os.path.isdir(self._path):
            raise RuntimeError("Dataset not found.")

        all_data = []
        self.data = []
        self.mapping = {}

        for session in sessions:
            session_name = f"Session{session}"
            session_dir = self._path / session_name

            # get wav paths
            wav_paths = _get_wavs_paths(session_dir)
            for wav_path in wav_paths:
                wav_stem = str(Path(wav_path).stem)
                all_data.append(wav_stem)

            # add labels
            label_dir = session_dir / "dialog" / "EmoEvaluation"
            label_paths = label_dir.glob("*.txt")

            for label_path in label_paths:
                with open(label_path, "r") as f:
                    for line in f:
                        # Evaluation lines start with a bracketed timing field:
                        # "[start - end]\t<wav_stem>\t<label>\t[V, A, D]"
                        if not line.startswith("["):
                            continue
                        line = re.split("[\t\n]", line)
                        wav_stem = line[1]
                        label = line[2]
                        # Merge "excited" into "happy".
                        if label == "exc":
                            label = "hap"
                        if wav_stem not in all_data:
                            continue
                        # Keep only the four target emotion classes.
                        if label not in ["neu", "hap", "ang", "sad"]:
                            continue
                        self.mapping[wav_stem] = {}
                        self.mapping[wav_stem]["label"] = label

            for wav_path in wav_paths:
                wav_stem = str(Path(wav_path).stem)
                if wav_stem in self.mapping:
                    self.data.append(wav_stem)
                    self.mapping[wav_stem]["path"] = wav_path

    def get_metadata(self, n: int) -> Tuple[str, int, str, str, str]:
        """Get metadata for the n-th sample from the dataset. Returns filepath instead of waveform,
        but otherwise returns the same fields as :py:meth:`__getitem__`.

        Args:
            n (int): The index of the sample to be loaded

        Returns:
            Tuple of the following items;

            str:
                Path to audio
            int:
                Sample rate
            str:
                File name
            str:
                Label (one of ``"neu"``, ``"hap"``, ``"ang"``, ``"sad"``)
            str:
                Speaker
        """
        wav_stem = self.data[n]
        wav_path = self.mapping[wav_stem]["path"]
        label = self.mapping[wav_stem]["label"]
        speaker = wav_stem.split("_")[0]
        return (wav_path, _SAMPLE_RATE, wav_stem, label, speaker)

    def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str, str]:
        """Load the n-th sample from the dataset.

        Args:
            n (int): The index of the sample to be loaded

        Returns:
            Tuple of the following items;

            Tensor:
                Waveform
            int:
                Sample rate
            str:
                File name
            str:
                Label (one of ``"neu"``, ``"hap"``, ``"ang"``, ``"sad"``)
            str:
                Speaker
        """
        metadata = self.get_metadata(n)
        waveform = _load_waveform(self._path, metadata[0], metadata[1])
        return (waveform,) + metadata[1:]

    def __len__(self):
        return len(self.data)
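A minimal usage sketch of the new dataset API. The "./data" root below is a hypothetical local path: the IEMOCAP corpus must be obtained separately and extracted so that ./data/IEMOCAP exists, since torchaudio does not download it.

    from torchaudio.datasets import IEMOCAP

    # Hypothetical root; the constructor expects ./data/IEMOCAP to exist.
    dataset = IEMOCAP("./data", sessions=(1, 2))

    # Full sample: decoded waveform plus metadata.
    waveform, sample_rate, file_name, label, speaker = dataset[0]

    # Metadata only (file path instead of waveform), no audio decoding.
    path, sample_rate, file_name, label, speaker = dataset.get_metadata(0)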