"...text-generation-inference.git" did not exist on "4a7dd4085a3de06a1ceee376eedf0122f33e0027"
Commit 8d4e17a2 authored by Zhaoheng Ni's avatar Zhaoheng Ni Committed by Facebook GitHub Bot
Browse files

Add subset support for TEDLIUM release3 dataset (#2157)

Summary:
According to [the dataset description](https://paperswithcode.com/dataset/ted-lium-3), the ``dev`` and ``test`` subsets of the TEDLIUM v3 dataset are the same as in v2 (under the ``TEDLIUM_release-3/legacy`` directory). The ``train`` subset is under the ``TEDLIUM_release-3/data`` directory. This PR adds subset support for it.

This also aligns with [TensorFlow's tedlium/release3](https://www.tensorflow.org/datasets/catalog/tedlium#tedliumrelease3) dataset.

Pull Request resolved: https://github.com/pytorch/audio/pull/2157

Reviewed By: mthrok

Differential Revision: D33585211

Pulled By: nateanl

fbshipit-source-id: 87cfe0d02b3a4c2cf7e2da0ccb7443fff5c43689
parent 9588435c
...@@ -35,8 +35,8 @@ _RELEASE_CONFIGS = { ...@@ -35,8 +35,8 @@ _RELEASE_CONFIGS = {
"url": "http://www.openslr.org/resources/51/TEDLIUM_release-3.tgz", "url": "http://www.openslr.org/resources/51/TEDLIUM_release-3.tgz",
"checksum": "ad1e454d14d1ad550bc2564c462d87c7a7ec83d4dc2b9210f22ab4973b9eccdb", "checksum": "ad1e454d14d1ad550bc2564c462d87c7a7ec83d4dc2b9210f22ab4973b9eccdb",
"data_path": "data/", "data_path": "data/",
"subset": None, "subset": "train",
"supported_subsets": [None], "supported_subsets": ["train", "test", "dev"],
"dict": "TEDLIUM.152k.dic", "dict": "TEDLIUM.152k.dic",
}, },
} }
...@@ -52,17 +52,17 @@ class TEDLIUM(Dataset): ...@@ -52,17 +52,17 @@ class TEDLIUM(Dataset):
Allowed values are ``"release1"``, ``"release2"`` or ``"release3"``. Allowed values are ``"release1"``, ``"release2"`` or ``"release3"``.
(default: ``"release1"``). (default: ``"release1"``).
subset (str, optional): The subset of dataset to use. Valid options are ``"train"``, ``"dev"``, subset (str, optional): The subset of dataset to use. Valid options are ``"train"``, ``"dev"``,
and ``"test"`` for releases 1&2, ``None`` for release3. Defaults to ``"train"`` or ``None``. and ``"test"``. Defaults to ``"train"``.
download (bool, optional): download (bool, optional):
Whether to download the dataset if it is not found at root path. (default: ``False``). Whether to download the dataset if it is not found at root path. (default: ``False``).
audio_ext (str, optional): extension for audio file (default: ``"audio_ext"``) audio_ext (str, optional): extension for audio file (default: ``".sph"``)
""" """
def __init__( def __init__(
self, self,
root: Union[str, Path], root: Union[str, Path],
release: str = "release1", release: str = "release1",
subset: str = None, subset: str = "train",
download: bool = False, download: bool = False,
audio_ext: str = ".sph", audio_ext: str = ".sph",
) -> None: ) -> None:
...@@ -96,9 +96,13 @@ class TEDLIUM(Dataset): ...@@ -96,9 +96,13 @@ class TEDLIUM(Dataset):
basename = basename.split(".")[0] basename = basename.split(".")[0]
self._path = os.path.join(root, folder_in_archive, _RELEASE_CONFIGS[release]["data_path"]) if release == "release3":
if subset in ["train", "dev", "test"]: if subset == "train":
self._path = os.path.join(self._path, subset) self._path = os.path.join(root, folder_in_archive, _RELEASE_CONFIGS[release]["data_path"])
else:
self._path = os.path.join(root, folder_in_archive, "legacy", subset)
else:
self._path = os.path.join(root, folder_in_archive, _RELEASE_CONFIGS[release]["data_path"], subset)
if download: if download:
if not os.path.isdir(self._path): if not os.path.isdir(self._path):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment