Unverified commit 366cef83, authored by moto and committed by GitHub
Browse files

Revert "no longer download CommonVoice directly (#1018)" (#1079)

This reverts commit 09a6fca1.
parent a2085b85
...@@ -54,7 +54,7 @@ class TestIterator(TorchaudioTestCase): ...@@ -54,7 +54,7 @@ class TestIterator(TorchaudioTestCase):
path = get_asset_path() path = get_asset_path()
def test_disckcache_iterator(self): def test_disckcache_iterator(self):
data = COMMONVOICE(self.path, version="cv-corpus-4-2019-12-10", language="tatar") data = COMMONVOICE(self.path, url="tatar")
data = dataset_utils.diskcache_iterator(data) data = dataset_utils.diskcache_iterator(data)
# Save # Save
data[0] data[0]
...@@ -62,7 +62,7 @@ class TestIterator(TorchaudioTestCase): ...@@ -62,7 +62,7 @@ class TestIterator(TorchaudioTestCase):
data[0] data[0]
def test_bg_iterator(self): def test_bg_iterator(self):
data = COMMONVOICE(self.path, version="cv-corpus-4-2019-12-10", language="tatar") data = COMMONVOICE(self.path, url="tatar")
data = dataset_utils.bg_iterator(data, 5) data = dataset_utils.bg_iterator(data, 5)
for _ in data: for _ in data:
pass pass
import os import os
import warnings
from pathlib import Path from pathlib import Path
from typing import List, Dict, Tuple, Optional, Union from typing import List, Dict, Tuple, Union
import torchaudio import torchaudio
from torchaudio.datasets.utils import extract_archive, unicode_csv_reader, validate_file from torchaudio.datasets.utils import download_url, extract_archive, unicode_csv_reader
from torch import Tensor from torch import Tensor
from torch.utils.data import Dataset from torch.utils.data import Dataset
...@@ -17,39 +16,68 @@ from torch.utils.data import Dataset ...@@ -17,39 +16,68 @@ from torch.utils.data import Dataset
# validated.tsv # validated.tsv
FOLDER_IN_ARCHIVE = "CommonVoice" FOLDER_IN_ARCHIVE = "CommonVoice"
LANGUAGE = "english" URL = "english"
VERSION = "cv-corpus-5.1-2020-06-22" VERSION = "cv-corpus-4-2019-12-10"
TSV = "train.tsv" TSV = "train.tsv"
_CHECKSUMS = { _CHECKSUMS = {
"cv-corpus-5.1-2020-06-22/tt.tar.gz": None, "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/tt.tar.gz":
"cv-corpus-5.1-2020-06-22/en.tar.gz": None, None,
"cv-corpus-5.1-2020-06-22/de.tar.gz": None, "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/en.tar.gz":
"cv-corpus-5.1-2020-06-22/fr.tar.gz": None, None,
"cv-corpus-5.1-2020-06-22/cy.tar.gz": None, "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/de.tar.gz":
"cv-corpus-5.1-2020-06-22/br.tar.gz": None, None,
"cv-corpus-5.1-2020-06-22/cv.tar.gz": None, "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/fr.tar.gz":
"cv-corpus-5.1-2020-06-22/tr.tar.gz": None, None,
"cv-corpus-5.1-2020-06-22/ky.tar.gz": None, "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cy.tar.gz":
"cv-corpus-5.1-2020-06-22/ga-IE.tar.gz": None, None,
"cv-corpus-5.1-2020-06-22/kab.tar.gz": None, "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/br.tar.gz":
"cv-corpus-5.1-2020-06-22/ca.tar.gz": None, None,
"cv-corpus-5.1-2020-06-22/zh-TW.tar.gz": None, "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cv.tar.gz":
"cv-corpus-5.1-2020-06-22/sl.tar.gz": None, None,
"cv-corpus-5.1-2020-06-22/it.tar.gz": None, "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/tr.tar.gz":
"cv-corpus-5.1-2020-06-22/nl.tar.gz": None, None,
"cv-corpus-5.1-2020-06-22/cnh.tar.gz": None, "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ky.tar.gz":
"cv-corpus-5.1-2020-06-22/eo.tar.gz": None, None,
"cv-corpus-5.1-2020-06-22/et.tar.gz": None, "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ga-IE.tar.gz":
"cv-corpus-5.1-2020-06-22/fa.tar.gz": None, None,
"cv-corpus-5.1-2020-06-22/eu.tar.gz": None, "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/kab.tar.gz":
"cv-corpus-5.1-2020-06-22/es.tar.gz": None, None,
"cv-corpus-5.1-2020-06-22/zh-CN.tar.gz": None, "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ca.tar.gz":
"cv-corpus-5.1-2020-06-22/mn.tar.gz": None, None,
"cv-corpus-5.1-2020-06-22/sah.tar.gz": None, "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/zh-TW.tar.gz":
"cv-corpus-5.1-2020-06-22/dv.tar.gz": None, None,
"cv-corpus-5.1-2020-06-22/rw.tar.gz": None, "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sl.tar.gz":
"cv-corpus-5.1-2020-06-22/sv-SE.tar.gz": None, None,
"cv-corpus-5.1-2020-06-22/ru.tar.gz": None, "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/it.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/nl.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cnh.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/eo.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/et.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/fa.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/eu.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/es.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/zh-CN.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/mn.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sah.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/dv.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/rw.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sv-SE.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ru.tar.gz":
None
} }
...@@ -73,18 +101,15 @@ def load_commonvoice_item(line: List[str], ...@@ -73,18 +101,15 @@ def load_commonvoice_item(line: List[str],
class COMMONVOICE(Dataset): class COMMONVOICE(Dataset):
"""Create a Dataset for `CommonVoice <https://commonvoice.mozilla.org/>`_. """Create a Dataset for CommonVoice.
Args: Args:
root (str or Path): Path to the directory where the dataset is found or downloaded. root (str or Path): Path to the directory where the dataset is found or downloaded.
tsv (str, optional): The name of the tsv file used to construct the metadata. tsv (str, optional): The name of the tsv file used to construct the metadata.
(default: ``"train.tsv"``) (default: ``"train.tsv"``)
url (str, optional): Deprecated. url (str, optional): The URL to download the dataset from, or the language of
folder_in_archive (str, optional): The top-level directory of the dataset. the dataset to download. (default: ``"english"``).
version (str): Version string. (default: ``"cv-corpus-5.1-2020-06-22"``) Allowed language values are ``"tatar"``, ``"english"``, ``"german"``,
language (str, optional): Language of the dataset. (default: None)
The following values are mapped to their corresponding shortened version:
``"tatar"``, ``"english"``, ``"german"``,
``"french"``, ``"welsh"``, ``"breton"``, ``"chuvash"``, ``"turkish"``, ``"kyrgyz"``, ``"french"``, ``"welsh"``, ``"breton"``, ``"chuvash"``, ``"turkish"``, ``"kyrgyz"``,
``"irish"``, ``"kabyle"``, ``"catalan"``, ``"taiwanese"``, ``"slovenian"``, ``"irish"``, ``"kabyle"``, ``"catalan"``, ``"taiwanese"``, ``"slovenian"``,
``"italian"``, ``"dutch"``, ``"hakha chin"``, ``"esperanto"``, ``"estonian"``, ``"italian"``, ``"dutch"``, ``"hakha chin"``, ``"esperanto"``, ``"estonian"``,
...@@ -93,8 +118,11 @@ class COMMONVOICE(Dataset): ...@@ -93,8 +118,11 @@ class COMMONVOICE(Dataset):
``"russian"``, ``"indonesian"``, ``"arabic"``, ``"tamil"``, ``"interlingua"``, ``"russian"``, ``"indonesian"``, ``"arabic"``, ``"tamil"``, ``"interlingua"``,
``"latvian"``, ``"japanese"``, ``"votic"``, ``"abkhaz"``, ``"cantonese"`` and ``"latvian"``, ``"japanese"``, ``"votic"``, ``"abkhaz"``, ``"cantonese"`` and
``"romansh sursilvan"``. ``"romansh sursilvan"``.
folder_in_archive (str, optional): The top-level directory of the dataset.
version (str): Version string. (default: ``"cv-corpus-4-2019-12-10"``)
For the other allowed values, Please checkout https://commonvoice.mozilla.org/en/datasets. For the other allowed values, Please checkout https://commonvoice.mozilla.org/en/datasets.
download (bool, optional): Deprecated. download (bool, optional):
Whether to download the dataset if it is not found at root path. (default: ``False``).
""" """
_ext_txt = ".txt" _ext_txt = ".txt"
...@@ -104,30 +132,10 @@ class COMMONVOICE(Dataset): ...@@ -104,30 +132,10 @@ class COMMONVOICE(Dataset):
def __init__(self, def __init__(self,
root: Union[str, Path], root: Union[str, Path],
tsv: str = TSV, tsv: str = TSV,
url: Optional[str] = None, url: str = URL,
folder_in_archive: str = FOLDER_IN_ARCHIVE, folder_in_archive: str = FOLDER_IN_ARCHIVE,
version: str = VERSION, version: str = VERSION,
language: str = LANGUAGE, download: bool = False) -> None:
download: Optional[bool] = False) -> None:
if download is True:
raise RuntimeError(
"The dataset is no longer publicly accessible. You need to "
"download the archives externally and place them in the root "
"directory."
)
elif download is False:
warnings.warn(
"The use of the download flag is deprecated, since the dataset "
"is no longer directly accessible.", RuntimeWarning
)
if url is not None:
warnings.warn(
"The use of the url flag is deprecated, since the dataset "
"is no longer publicly accessible. To specify the language of the dataset, "
"please use the language parameter instead.", RuntimeWarning
)
languages = { languages = {
"tatar": "tt", "tatar": "tt",
...@@ -172,22 +180,12 @@ class COMMONVOICE(Dataset): ...@@ -172,22 +180,12 @@ class COMMONVOICE(Dataset):
"romansh sursilvan": "rm-sursilv" "romansh sursilvan": "rm-sursilv"
} }
if language in languages: if url in languages:
ext_archive = ".tar.gz" ext_archive = ".tar.gz"
language = languages[language] language = languages[url]
url = os.path.join(version, language + ext_archive)
else: base_url = "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com"
raise ValueError( url = os.path.join(base_url, version, language + ext_archive)
'Allowed language values are "tatar", "english", "german",'
'"french", "welsh", "breton", "chuvash", "turkish", "kyrgyz",'
'"irish", "kabyle", "catalan", "taiwanese", "slovenian",'
'"italian", "dutch", "hakha chin", "esperanto", "estonian",'
'"persian", "portuguese", "basque", "spanish", "chinese",'
'"mongolian", "sakha", "dhivehi", "kinyarwanda", "swedish",'
'"russian", "indonesian", "arabic", "tamil", "interlingua",'
'"latvian", "japanese", "votic", "abkhaz", "cantonese" and'
'"romansh sursilvan".'
)
# Get string representation of 'root' in case Path object is passed # Get string representation of 'root' in case Path object is passed
root = os.fspath(root) root = os.fspath(root)
...@@ -200,23 +198,12 @@ class COMMONVOICE(Dataset): ...@@ -200,23 +198,12 @@ class COMMONVOICE(Dataset):
self._path = os.path.join(root, folder_in_archive) self._path = os.path.join(root, folder_in_archive)
if not os.path.isdir(self._path): if download:
if os.path.isfile(archive): if not os.path.isdir(self._path):
checksum = _CHECKSUMS.get(url, None) if not os.path.isfile(archive):
if checksum: checksum = _CHECKSUMS.get(url, None)
filepath = os.path.basename(url) download_url(url, root, hash_value=checksum)
with open(filepath, "rb") as file_obj:
if not validate_file(file_obj, checksum, "sha256"):
raise RuntimeError(
f"The hash of {filepath} does not match. Delete the file manually and retry."
)
extract_archive(archive) extract_archive(archive)
else:
raise RuntimeError(
"The dataset is no longer publicly accessible. You need to "
"download the archives externally and place them in the root "
"directory."
)
self._tsv = os.path.join(root, folder_in_archive, tsv) self._tsv = os.path.join(root, folder_in_archive, tsv)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment