Unverified commit 366cef83, authored by moto and committed by GitHub
Browse files

Revert "no longer download CommonVoice directly (#1018)" (#1079)

This reverts commit 09a6fca1.
parent a2085b85
......@@ -54,7 +54,7 @@ class TestIterator(TorchaudioTestCase):
path = get_asset_path()
def test_disckcache_iterator(self):
data = COMMONVOICE(self.path, version="cv-corpus-4-2019-12-10", language="tatar")
data = COMMONVOICE(self.path, url="tatar")
data = dataset_utils.diskcache_iterator(data)
# Save
data[0]
......@@ -62,7 +62,7 @@ class TestIterator(TorchaudioTestCase):
data[0]
def test_bg_iterator(self):
data = COMMONVOICE(self.path, version="cv-corpus-4-2019-12-10", language="tatar")
data = COMMONVOICE(self.path, url="tatar")
data = dataset_utils.bg_iterator(data, 5)
for _ in data:
pass
import os
import warnings
from pathlib import Path
from typing import List, Dict, Tuple, Optional, Union
from typing import List, Dict, Tuple, Union
import torchaudio
from torchaudio.datasets.utils import extract_archive, unicode_csv_reader, validate_file
from torchaudio.datasets.utils import download_url, extract_archive, unicode_csv_reader
from torch import Tensor
from torch.utils.data import Dataset
......@@ -17,39 +16,68 @@ from torch.utils.data import Dataset
# validated.tsv
FOLDER_IN_ARCHIVE = "CommonVoice"
LANGUAGE = "english"
VERSION = "cv-corpus-5.1-2020-06-22"
URL = "english"
VERSION = "cv-corpus-4-2019-12-10"
TSV = "train.tsv"
_CHECKSUMS = {
"cv-corpus-5.1-2020-06-22/tt.tar.gz": None,
"cv-corpus-5.1-2020-06-22/en.tar.gz": None,
"cv-corpus-5.1-2020-06-22/de.tar.gz": None,
"cv-corpus-5.1-2020-06-22/fr.tar.gz": None,
"cv-corpus-5.1-2020-06-22/cy.tar.gz": None,
"cv-corpus-5.1-2020-06-22/br.tar.gz": None,
"cv-corpus-5.1-2020-06-22/cv.tar.gz": None,
"cv-corpus-5.1-2020-06-22/tr.tar.gz": None,
"cv-corpus-5.1-2020-06-22/ky.tar.gz": None,
"cv-corpus-5.1-2020-06-22/ga-IE.tar.gz": None,
"cv-corpus-5.1-2020-06-22/kab.tar.gz": None,
"cv-corpus-5.1-2020-06-22/ca.tar.gz": None,
"cv-corpus-5.1-2020-06-22/zh-TW.tar.gz": None,
"cv-corpus-5.1-2020-06-22/sl.tar.gz": None,
"cv-corpus-5.1-2020-06-22/it.tar.gz": None,
"cv-corpus-5.1-2020-06-22/nl.tar.gz": None,
"cv-corpus-5.1-2020-06-22/cnh.tar.gz": None,
"cv-corpus-5.1-2020-06-22/eo.tar.gz": None,
"cv-corpus-5.1-2020-06-22/et.tar.gz": None,
"cv-corpus-5.1-2020-06-22/fa.tar.gz": None,
"cv-corpus-5.1-2020-06-22/eu.tar.gz": None,
"cv-corpus-5.1-2020-06-22/es.tar.gz": None,
"cv-corpus-5.1-2020-06-22/zh-CN.tar.gz": None,
"cv-corpus-5.1-2020-06-22/mn.tar.gz": None,
"cv-corpus-5.1-2020-06-22/sah.tar.gz": None,
"cv-corpus-5.1-2020-06-22/dv.tar.gz": None,
"cv-corpus-5.1-2020-06-22/rw.tar.gz": None,
"cv-corpus-5.1-2020-06-22/sv-SE.tar.gz": None,
"cv-corpus-5.1-2020-06-22/ru.tar.gz": None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/tt.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/en.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/de.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/fr.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cy.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/br.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cv.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/tr.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ky.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ga-IE.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/kab.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ca.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/zh-TW.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sl.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/it.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/nl.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cnh.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/eo.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/et.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/fa.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/eu.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/es.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/zh-CN.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/mn.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sah.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/dv.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/rw.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sv-SE.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ru.tar.gz":
None
}
......@@ -73,18 +101,15 @@ def load_commonvoice_item(line: List[str],
class COMMONVOICE(Dataset):
"""Create a Dataset for `CommonVoice <https://commonvoice.mozilla.org/>`_.
"""Create a Dataset for CommonVoice.
Args:
root (str or Path): Path to the directory where the dataset is found or downloaded.
tsv (str, optional): The name of the tsv file used to construct the metadata.
(default: ``"train.tsv"``)
url (str, optional): Deprecated.
folder_in_archive (str, optional): The top-level directory of the dataset.
version (str): Version string. (default: ``"cv-corpus-5.1-2020-06-22"``)
language (str, optional): Language of the dataset. (default: None)
The following values are mapped to their corresponding shortened version:
``"tatar"``, ``"english"``, ``"german"``,
url (str, optional): The URL to download the dataset from, or the language of
the dataset to download. (default: ``"english"``).
Allowed language values are ``"tatar"``, ``"english"``, ``"german"``,
``"french"``, ``"welsh"``, ``"breton"``, ``"chuvash"``, ``"turkish"``, ``"kyrgyz"``,
``"irish"``, ``"kabyle"``, ``"catalan"``, ``"taiwanese"``, ``"slovenian"``,
``"italian"``, ``"dutch"``, ``"hakha chin"``, ``"esperanto"``, ``"estonian"``,
......@@ -93,8 +118,11 @@ class COMMONVOICE(Dataset):
``"russian"``, ``"indonesian"``, ``"arabic"``, ``"tamil"``, ``"interlingua"``,
``"latvian"``, ``"japanese"``, ``"votic"``, ``"abkhaz"``, ``"cantonese"`` and
``"romansh sursilvan"``.
folder_in_archive (str, optional): The top-level directory of the dataset.
version (str): Version string. (default: ``"cv-corpus-4-2019-12-10"``)
For the other allowed values, please check out https://commonvoice.mozilla.org/en/datasets.
download (bool, optional): Deprecated.
download (bool, optional):
Whether to download the dataset if it is not found at root path. (default: ``False``).
"""
_ext_txt = ".txt"
......@@ -104,30 +132,10 @@ class COMMONVOICE(Dataset):
def __init__(self,
root: Union[str, Path],
tsv: str = TSV,
url: Optional[str] = None,
url: str = URL,
folder_in_archive: str = FOLDER_IN_ARCHIVE,
version: str = VERSION,
language: str = LANGUAGE,
download: Optional[bool] = False) -> None:
if download is True:
raise RuntimeError(
"The dataset is no longer publicly accessible. You need to "
"download the archives externally and place them in the root "
"directory."
)
elif download is False:
warnings.warn(
"The use of the download flag is deprecated, since the dataset "
"is no longer directly accessible.", RuntimeWarning
)
if url is not None:
warnings.warn(
"The use of the url flag is deprecated, since the dataset "
"is no longer publicly accessible. To specify the language of the dataset, "
"please use the language parameter instead.", RuntimeWarning
)
download: bool = False) -> None:
languages = {
"tatar": "tt",
......@@ -172,22 +180,12 @@ class COMMONVOICE(Dataset):
"romansh sursilvan": "rm-sursilv"
}
if language in languages:
if url in languages:
ext_archive = ".tar.gz"
language = languages[language]
url = os.path.join(version, language + ext_archive)
else:
raise ValueError(
'Allowed language values are "tatar", "english", "german",'
'"french", "welsh", "breton", "chuvash", "turkish", "kyrgyz",'
'"irish", "kabyle", "catalan", "taiwanese", "slovenian",'
'"italian", "dutch", "hakha chin", "esperanto", "estonian",'
'"persian", "portuguese", "basque", "spanish", "chinese",'
'"mongolian", "sakha", "dhivehi", "kinyarwanda", "swedish",'
'"russian", "indonesian", "arabic", "tamil", "interlingua",'
'"latvian", "japanese", "votic", "abkhaz", "cantonese" and'
'"romansh sursilvan".'
)
language = languages[url]
base_url = "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com"
url = os.path.join(base_url, version, language + ext_archive)
# Get string representation of 'root' in case Path object is passed
root = os.fspath(root)
......@@ -200,23 +198,12 @@ class COMMONVOICE(Dataset):
self._path = os.path.join(root, folder_in_archive)
if not os.path.isdir(self._path):
if os.path.isfile(archive):
checksum = _CHECKSUMS.get(url, None)
if checksum:
filepath = os.path.basename(url)
with open(filepath, "rb") as file_obj:
if not validate_file(file_obj, checksum, "sha256"):
raise RuntimeError(
f"The hash of {filepath} does not match. Delete the file manually and retry."
)
if download:
if not os.path.isdir(self._path):
if not os.path.isfile(archive):
checksum = _CHECKSUMS.get(url, None)
download_url(url, root, hash_value=checksum)
extract_archive(archive)
else:
raise RuntimeError(
"The dataset is no longer publicly accessible. You need to "
"download the archives externally and place them in the root "
"directory."
)
self._tsv = os.path.join(root, folder_in_archive, tsv)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment