[BC-Breaking] Remove download and subdir from CommonVoice (#1082)

* Removes the download logic.
* [BC-breaking] Changes the meaning of the `root` argument to the exact directory of the dataset.
* Deprecates the constructor arguments for download and subdirectory construction.

[BC-Breaking] Remove download and subdir from CommonVoice (#1082)
* Removes the download logic.
* [BC-breaking] Changes the meaning of the `root` argument to the exact directory of the dataset.
* Deprecates the constructor arguments for download and subdirectory construction.
6b810240 · moto · GitHub · 80c97e6a · 6b810240 · 6b810240
Unverified Commit 6b810240 authored Dec 18, 2020 by moto Committed by GitHub Dec 18, 2020
3 changed files
--- a/test/torchaudio_unittest/datasets/commonvoice_test.py
+++ b/test/torchaudio_unittest/datasets/commonvoice_test.py
 import os
 import csv
-import random
 from pathlib import Path

-from torchaudio.datasets import commonvoice
+from torchaudio.datasets import COMMONVOICE
 from torchaudio_unittest.common_utils import (
    TempDirMixin,
    TorchaudioTestCase,
@@ -31,29 +30,23 @@ class TestCommonVoice(TempDirMixin, TorchaudioTestCase):
            "common_voice_en_18607573.wav",
            "Caddy, show Miss Clare and Miss Summerson their rooms.", "2", "0", "twenties", "male", "canada"],
    ]
-    _folder_audio = "clips"
    sample_rate = 48000

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
-        # The path convention commonvoice uses
-        base_dir = os.path.join(cls.root_dir, commonvoice.FOLDER_IN_ARCHIVE, commonvoice.VERSION, "en")
-        os.makedirs(base_dir, exist_ok=True)
-
        # Tsv file name difference does not mean different subset, testing as a whole dataset here
-        tsv_filename = os.path.join(base_dir, commonvoice.TSV)
+        tsv_filename = os.path.join(cls.root_dir, "train.tsv")
+        audio_base_path = os.path.join(cls.root_dir, "clips")
+        os.makedirs(audio_base_path, exist_ok=True)
        with open(tsv_filename, "w", newline='') as tsv:
            writer = csv.writer(tsv, delimiter='\t')
            writer.writerow(cls._headers)
            for i, content in enumerate(cls._train_csv_contents):
-                audio_filename = audio_filename = content[1]
                writer.writerow(content)

                # Generate and store audio
-                audio_base_path = os.path.join(base_dir, cls._folder_audio)
-                os.makedirs(audio_base_path, exist_ok=True)
-                audio_path = os.path.join(audio_base_path, audio_filename)
+                audio_path = os.path.join(audio_base_path, content[1])
                data = get_whitenoise(sample_rate=cls.sample_rate, duration=1, n_channels=1, seed=i, dtype='float32')
                save_wav(audio_path, data, cls.sample_rate)

@@ -72,9 +65,9 @@ class TestCommonVoice(TempDirMixin, TorchaudioTestCase):
        assert n_ite == len(self.data)

    def test_commonvoice_str(self):
-        dataset = commonvoice.COMMONVOICE(self.root_dir)
+        dataset = COMMONVOICE(self.root_dir)
        self._test_commonvoice(dataset)

    def test_commonvoice_path(self):
-        dataset = commonvoice.COMMONVOICE(Path(self.root_dir))
+        dataset = COMMONVOICE(Path(self.root_dir))
        self._test_commonvoice(dataset)
--- a/test/torchaudio_unittest/datasets/utils_test.py
+++ b/test/torchaudio_unittest/datasets/utils_test.py
@@ -51,7 +51,7 @@ class TestWalkFiles(TempDirMixin, TorchaudioTestCase):

 class TestIterator(TorchaudioTestCase):
    backend = 'default'
-    path = get_asset_path()
+    path = get_asset_path('CommonVoice', 'cv-corpus-4-2019-12-10', 'tt')

    def test_disckcache_iterator(self):
        data = COMMONVOICE(self.path, url="tatar")

--- a/torchaudio/datasets/commonvoice.py
+++ b/torchaudio/datasets/commonvoice.py
 import os
+import warnings
 from pathlib import Path
-from typing import List, Dict, Tuple, Union
+from typing import List, Dict, Tuple, Union, Optional

 import torchaudio
-from torchaudio.datasets.utils import download_url, extract_archive, unicode_csv_reader
+from torchaudio.datasets.utils import unicode_csv_reader
 from torch import Tensor
 from torch.utils.data import Dataset

-# Default TSV should be one of
-# dev.tsv
-# invalidated.tsv
-# other.tsv
-# test.tsv
-# train.tsv
-# validated.tsv
-
-FOLDER_IN_ARCHIVE = "CommonVoice"
-URL = "english"
-VERSION = "cv-corpus-4-2019-12-10"
-TSV = "train.tsv"
-_CHECKSUMS = {
-    "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/tt.tar.gz":
-    None,
-    "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/en.tar.gz":
-    None,
-    "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/de.tar.gz":
-    None,
-    "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/fr.tar.gz":
-    None,
-    "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cy.tar.gz":
-    None,
-    "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/br.tar.gz":
-    None,
-    "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cv.tar.gz":
-    None,
-    "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/tr.tar.gz":
-    None,
-    "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ky.tar.gz":
-    None,
-    "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ga-IE.tar.gz":
-    None,
-    "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/kab.tar.gz":
-    None,
-    "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ca.tar.gz":
-    None,
-    "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/zh-TW.tar.gz":
-    None,
-    "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sl.tar.gz":
-    None,
-    "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/it.tar.gz":
-    None,
-    "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/nl.tar.gz":
-    None,
-    "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cnh.tar.gz":
-    None,
-    "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/eo.tar.gz":
-    None,
-    "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/et.tar.gz":
-    None,
-    "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/fa.tar.gz":
-    None,
-    "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/eu.tar.gz":
-    None,
-    "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/es.tar.gz":
-    None,
-    "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/zh-CN.tar.gz":
-    None,
-    "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/mn.tar.gz":
-    None,
-    "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sah.tar.gz":
-    None,
-    "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/dv.tar.gz":
-    None,
-    "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/rw.tar.gz":
-    None,
-    "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sv-SE.tar.gz":
-    None,
-    "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ru.tar.gz":
-    None
-}
-

 def load_commonvoice_item(line: List[str],
                          header: List[str],
@@ -104,25 +32,16 @@ class COMMONVOICE(Dataset):
    """Create a Dataset for CommonVoice.

    Args:
-        root (str or Path): Path to the directory where the dataset is found or downloaded.
-        tsv (str, optional): The name of the tsv file used to construct the metadata.
-            (default: ``"train.tsv"``)
-        url (str, optional): The URL to download the dataset from, or the language of
-            the dataset to download. (default: ``"english"``).
-            Allowed language values are ``"tatar"``, ``"english"``, ``"german"``,
-            ``"french"``, ``"welsh"``, ``"breton"``, ``"chuvash"``, ``"turkish"``, ``"kyrgyz"``,
-            ``"irish"``, ``"kabyle"``, ``"catalan"``, ``"taiwanese"``, ``"slovenian"``,
-            ``"italian"``, ``"dutch"``, ``"hakha chin"``, ``"esperanto"``, ``"estonian"``,
-            ``"persian"``, ``"portuguese"``, ``"basque"``, ``"spanish"``, ``"chinese"``,
-            ``"mongolian"``, ``"sakha"``, ``"dhivehi"``, ``"kinyarwanda"``, ``"swedish"``,
-            ``"russian"``, ``"indonesian"``, ``"arabic"``, ``"tamil"``, ``"interlingua"``,
-            ``"latvian"``, ``"japanese"``, ``"votic"``, ``"abkhaz"``, ``"cantonese"`` and
-            ``"romansh sursilvan"``.
-        folder_in_archive (str, optional): The top-level directory of the dataset.
-        version (str): Version string. (default: ``"cv-corpus-4-2019-12-10"``)
-            For the other allowed values, Please checkout https://commonvoice.mozilla.org/en/datasets.
-        download (bool, optional):
-            Whether to download the dataset if it is not found at root path. (default: ``False``).
+        root (str or Path): Path to the directory where the dataset is located.
+             (Where the ``tsv`` file is present.)
+        tsv (str, optional):
+            The name of the tsv file used to construct the metadata, such as
+            ``"train.tsv"``, ``"test.tsv"``, ``"dev.tsv"``, ``"invalidated.tsv"``,
+            ``"validated.tsv"`` and ``"other.tsv"``. (default: ``"train.tsv"``)
+        url (str, optional): Deprecated, not used.
+        folder_in_archive (str, optional): Deprecated, not used.
+        version (str): Deprecated, not used.
+        download (bool, optional): Deprecated, not used.
    """

    _ext_txt = ".txt"
@@ -131,93 +50,36 @@ class COMMONVOICE(Dataset):

    def __init__(self,
                 root: Union[str, Path],
-                 tsv: str = TSV,
-                 url: str = URL,
-                 folder_in_archive: str = FOLDER_IN_ARCHIVE,
-                 version: str = VERSION,
-                 download: bool = False) -> None:
-
-        languages = {
-            "tatar": "tt",
-            "english": "en",
-            "german": "de",
-            "french": "fr",
-            "welsh": "cy",
-            "breton": "br",
-            "chuvash": "cv",
-            "turkish": "tr",
-            "kyrgyz": "ky",
-            "irish": "ga-IE",
-            "kabyle": "kab",
-            "catalan": "ca",
-            "taiwanese": "zh-TW",
-            "slovenian": "sl",
-            "italian": "it",
-            "dutch": "nl",
-            "hakha chin": "cnh",
-            "esperanto": "eo",
-            "estonian": "et",
-            "persian": "fa",
-            "portuguese": "pt",
-            "basque": "eu",
-            "spanish": "es",
-            "chinese": "zh-CN",
-            "mongolian": "mn",
-            "sakha": "sah",
-            "dhivehi": "dv",
-            "kinyarwanda": "rw",
-            "swedish": "sv-SE",
-            "russian": "ru",
-            "indonesian": "id",
-            "arabic": "ar",
-            "tamil": "ta",
-            "interlingua": "ia",
-            "latvian": "lv",
-            "japanese": "ja",
-            "votic": "vot",
-            "abkhaz": "ab",
-            "cantonese": "zh-HK",
-            "romansh sursilvan": "rm-sursilv"
-        }
-
+                 tsv: str = "train.tsv",
+                 url: Optional[str] = None,
+                 folder_in_archive: Optional[str] = None,
+                 version: Optional[str] = None,
+                 download: Optional[bool] = None) -> None:
        if download:
            raise RuntimeError(
                "Common Voice dataset requires user agreement on the usage term, "
                "and torchaudio no longer provides the download feature. "
-                "Please download the dataset manually and extract it in the root directory, "
-                "then provide the target language to `url` argument.")
-        if url not in languages:
-            raise ValueError(f"`url` must be one of available languages: {languages.keys()}")
-
-        if url in languages:
-            ext_archive = ".tar.gz"
-            language = languages[url]
-
-            base_url = "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com"
-            url = os.path.join(base_url, version, language + ext_archive)
+                "Please download the dataset and extract it manually.")
+
+        deprecated = [
+            ('url', url),
+            ('folder_in_archive', folder_in_archive),
+            ('version', version),
+            ('download', download)
+        ]
+        for name, val in deprecated:
+            if val is not None:
+                warnings.warn(
+                    f"`{name}` argument is no longer used and deprecated. "
+                    "It will be removed in 0.9.0 releaase. "
+                    "Please remove it from the function call")

        # Get string representation of 'root' in case Path object is passed
-        root = os.fspath(root)
-
-        basename = os.path.basename(url)
-        archive = os.path.join(root, basename)
-
-        basename = basename.rsplit(".", 2)[0]
-        folder_in_archive = os.path.join(folder_in_archive, version, basename)
-
-        self._path = os.path.join(root, folder_in_archive)
-
-        if download:
-            if not os.path.isdir(self._path):
-                if not os.path.isfile(archive):
-                    checksum = _CHECKSUMS.get(url, None)
-                    download_url(url, root, hash_value=checksum)
-                extract_archive(archive)
-
-        self._tsv = os.path.join(root, folder_in_archive, tsv)
+        self._path = os.fspath(root)
+        self._tsv = os.path.join(self._path, tsv)

-        with open(self._tsv, "r") as tsv:
-            walker = unicode_csv_reader(tsv, delimiter="\t")
+        with open(self._tsv, "r") as tsv_:
+            walker = unicode_csv_reader(tsv_, delimiter="\t")
            self._header = next(walker)
            self._walker = list(walker)