Commit 34255386 authored by Caroline Chen's avatar Caroline Chen Committed by Facebook GitHub Bot
Browse files

Add iemocap variants (#2778)

Summary:
add ability to load only improvised or only scripted utterances.

Pull Request resolved: https://github.com/pytorch/audio/pull/2778

Reviewed By: nateanl

Differential Revision: D40511865

Pulled By: carolineechen

fbshipit-source-id: e1fe3908ac2aa306ad30c242ddd25762b2268539
parent 9135b544
......@@ -4,7 +4,7 @@ import random
from torchaudio.datasets import iemocap
from torchaudio_unittest.common_utils import get_whitenoise, save_wav, TempDirMixin, TorchaudioTestCase
LABELS = ["neu", "hap", "ang", "sad", "exc", "xxx"]
LABELS = ["neu", "hap", "ang", "sad", "exc", "fru", "xxx"]
SAMPLE_RATE = 16000
......@@ -21,8 +21,6 @@ def _save_wav(filepath: str, seed: int):
def _save_label(label_folder: str, filename: str, wav_stem: str):
label = random.choice(LABELS)
if label == "exc":
label = "hap"
line = f"[xxx]\t{wav_stem}\t{label}\t[yyy]"
filepath = os.path.join(label_folder, filename)
......@@ -40,12 +38,12 @@ def _get_samples(dataset_dir: str, session: int):
os.makedirs(wav_folder, exist_ok=True)
os.makedirs(label_folder, exist_ok=True)
samples = []
wav_stems = []
for i in range(5):
for g in ["F", "M"]:
for utt in ["impro", "script"]:
speaker = f"Ses0{session}{g}"
subfolder = f"{speaker}_impro0{i}"
subfolder = f"{speaker}_{utt}0{i}"
subfolder_path = os.path.join(wav_folder, subfolder)
os.makedirs(subfolder_path, exist_ok=True)
......@@ -53,6 +51,9 @@ def _get_samples(dataset_dir: str, session: int):
wav_stem = f"{subfolder}_F00{j}"
wav_stems.append(wav_stem)
all_samples = []
impro_samples = []
script_samples = []
wav_stems = sorted(wav_stems)
for wav_stem in wav_stems:
subfolder = wav_stem[:-5]
......@@ -64,31 +65,43 @@ def _get_samples(dataset_dir: str, session: int):
if label == "xxx":
continue
sample = (wav, SAMPLE_RATE, wav_stem, label, speaker)
samples.append(sample)
all_samples.append(sample)
return samples
if "impro" in subfolder:
impro_samples.append(sample)
else:
script_samples.append(sample)
return all_samples, script_samples, impro_samples
def get_mock_dataset(dataset_dir: str):
    """Generate a mock IEMOCAP directory tree and return the expected samples.

    Creates ``dataset_dir`` if needed, builds mock session data via
    ``_get_samples`` and aggregates the per-session sample lists.

    Args:
        dataset_dir (str): Path at which to create the mock dataset root.

    Returns:
        Tuple[list, list, list]: ``(all_samples, script_samples, impro_samples)``,
        each a list of ``(waveform, sample_rate, wav_stem, label, speaker)``
        tuples as produced by ``_get_samples``.
    """
    os.makedirs(dataset_dir, exist_ok=True)
    all_samples = []
    script_samples = []
    impro_samples = []
    # NOTE(review): only sessions 1-3 are mocked (range(1, 4)) even though the
    # real dataset has 5 sessions — presumably to keep the fixture small; confirm.
    for session in range(1, 4):
        samples = _get_samples(dataset_dir, session)
        all_samples += samples[0]
        script_samples += samples[1]
        impro_samples += samples[2]
    return all_samples, script_samples, impro_samples
class TestIemocap(TempDirMixin, TorchaudioTestCase):
root_dir = None
backend = "default"
samples = []
all_samples = []
script_samples = []
impro_samples = []
@classmethod
def setUpClass(cls):
    """Build the mock IEMOCAP dataset once and cache the expected samples.

    Populates ``cls.all_samples``, ``cls.script_samples`` and
    ``cls.impro_samples`` for use by the individual test methods.
    """
    cls.root_dir = cls.get_base_temp_dir()
    # The dataset loader expects an "IEMOCAP" directory under the given root.
    dataset_dir = os.path.join(cls.root_dir, "IEMOCAP")
    cls.all_samples, cls.script_samples, cls.impro_samples = get_mock_dataset(dataset_dir)
def _testIEMOCAP(self, dataset, samples):
num_samples = 0
......@@ -98,6 +111,14 @@ class TestIemocap(TempDirMixin, TorchaudioTestCase):
assert num_samples == len(samples)
def testIEMOCAPFullDataset(self):
    """With no utterance_type, the dataset yields both scripted and improvised samples."""
    dataset = iemocap.IEMOCAP(self.root_dir)
    self._testIEMOCAP(dataset, self.all_samples)
def testIEMOCAPScriptedDataset(self):
    """Restricting to utterance_type="scripted" yields exactly the scripted samples."""
    self._testIEMOCAP(
        iemocap.IEMOCAP(self.root_dir, utterance_type="scripted"),
        self.script_samples,
    )
def testIEMOCAPImprovisedDataset(self):
    """Restricting to utterance_type="improvised" yields exactly the improvised samples."""
    self._testIEMOCAP(
        iemocap.IEMOCAP(self.root_dir, utterance_type="improvised"),
        self.impro_samples,
    )
import os
import re
from pathlib import Path
from typing import Tuple, Union
from typing import Optional, Tuple, Union
from torch import Tensor
from torch.utils.data import Dataset
......@@ -28,12 +28,16 @@ class IEMOCAP(Dataset):
Args:
root (str or Path): Root directory where the dataset's top level directory is found
sessions (Tuple[int]): Tuple of sessions (1-5) to use. (Default: ``(1, 2, 3, 4, 5)``)
utterance_type (str or None, optional): Which type(s) of utterances to include in the dataset.
Options: ("scripted", "improvised", ``None``). If ``None``, both scripted and improvised
data are used.
"""
def __init__(
self,
root: Union[str, Path],
sessions: Tuple[str] = (1, 2, 3, 4, 5),
utterance_type: Optional[str] = None,
):
root = Path(root)
self._path = root / "IEMOCAP"
......@@ -41,6 +45,9 @@ class IEMOCAP(Dataset):
if not os.path.isdir(self._path):
raise RuntimeError("Dataset not found.")
if utterance_type not in ["scripted", "improvised", None]:
raise ValueError("utterance_type must be one of ['scripted', 'improvised', or None]")
all_data = []
self.data = []
self.mapping = {}
......@@ -57,7 +64,12 @@ class IEMOCAP(Dataset):
# add labels
label_dir = session_dir / "dialog" / "EmoEvaluation"
label_paths = label_dir.glob("*.txt")
query = "*.txt"
if utterance_type == "scripted":
query = "*script*.txt"
elif utterance_type == "improvised":
query = "*impro*.txt"
label_paths = label_dir.glob(query)
for label_path in label_paths:
with open(label_path, "r") as f:
......@@ -67,11 +79,9 @@ class IEMOCAP(Dataset):
line = re.split("[\t\n]", line)
wav_stem = line[1]
label = line[2]
if label == "exc":
label = "hap"
if wav_stem not in all_data:
continue
if label not in ["neu", "hap", "ang", "sad"]:
if label not in ["neu", "hap", "ang", "sad", "exc", "fru"]:
continue
self.mapping[wav_stem] = {}
self.mapping[wav_stem]["label"] = label
......@@ -99,7 +109,7 @@ class IEMOCAP(Dataset):
str:
File name
str:
Label (one of ``"neu"``, ``"hap"``, ``"ang"``, ``"sad"``)
Label (one of ``"neu"``, ``"hap"``, ``"ang"``, ``"sad"``, ``"exc"``, ``"fru"``)
str:
Speaker
"""
......@@ -125,7 +135,7 @@ class IEMOCAP(Dataset):
str:
File name
str:
Label (one of ``"neu"``, ``"hap"``, ``"ang"``, ``"sad"``)
Label (one of ``"neu"``, ``"hap"``, ``"ang"``, ``"sad"``, ``"exc"``, ``"fru"``)
str:
Speaker
"""
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment