Commit 8d4e17a2 authored by Zhaoheng Ni, committed by Facebook GitHub Bot
Browse files

Add subset support for TEDLIUM release3 dataset (#2157)

Summary:
According to [the dataset description](https://paperswithcode.com/dataset/ted-lium-3), the ``dev`` and ``test`` subsets of the TEDLIUM v3 dataset are the same as in v2 (they are located under the ``TEDLIUM_release-3/legacy`` directory). The ``train`` subset is under the ``TEDLIUM_release-3/data`` directory. This PR adds subset support for release 3.

This also aligns with [TensorFlow's tedlium/release3](https://www.tensorflow.org/datasets/catalog/tedlium#tedliumrelease3) dataset.

Pull Request resolved: https://github.com/pytorch/audio/pull/2157

Reviewed By: mthrok

Differential Revision: D33585211

Pulled By: nateanl

fbshipit-source-id: 87cfe0d02b3a4c2cf7e2da0ccb7443fff5c43689
parent 9588435c
......@@ -35,8 +35,8 @@ _RELEASE_CONFIGS = {
"url": "http://www.openslr.org/resources/51/TEDLIUM_release-3.tgz",
"checksum": "ad1e454d14d1ad550bc2564c462d87c7a7ec83d4dc2b9210f22ab4973b9eccdb",
"data_path": "data/",
"subset": None,
"supported_subsets": [None],
"subset": "train",
"supported_subsets": ["train", "test", "dev"],
"dict": "TEDLIUM.152k.dic",
},
}
......@@ -52,17 +52,17 @@ class TEDLIUM(Dataset):
Allowed values are ``"release1"``, ``"release2"`` or ``"release3"``.
(default: ``"release1"``).
subset (str, optional): The subset of dataset to use. Valid options are ``"train"``, ``"dev"``,
and ``"test"`` for releases 1&2, ``None`` for release3. Defaults to ``"train"`` or ``None``.
and ``"test"``. Defaults to ``"train"``.
download (bool, optional):
Whether to download the dataset if it is not found at root path. (default: ``False``).
audio_ext (str, optional): extension for audio file (default: ``"audio_ext"``)
audio_ext (str, optional): extension for audio file (default: ``".sph"``)
"""
def __init__(
self,
root: Union[str, Path],
release: str = "release1",
subset: str = None,
subset: str = "train",
download: bool = False,
audio_ext: str = ".sph",
) -> None:
......@@ -96,9 +96,13 @@ class TEDLIUM(Dataset):
basename = basename.split(".")[0]
if release == "release3":
if subset == "train":
self._path = os.path.join(root, folder_in_archive, _RELEASE_CONFIGS[release]["data_path"])
if subset in ["train", "dev", "test"]:
self._path = os.path.join(self._path, subset)
else:
self._path = os.path.join(root, folder_in_archive, "legacy", subset)
else:
self._path = os.path.join(root, folder_in_archive, _RELEASE_CONFIGS[release]["data_path"], subset)
if download:
if not os.path.isdir(self._path):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment