Unverified Commit 6b810240 authored by moto's avatar moto Committed by GitHub
Browse files

[BC-Breaking] Remove download and subdir from CommonVoice (#1082)

* Removes code for the download logic
* [BC-breaking] Changes the meaning of the `root` argument to the exact directory of the dataset
* Deprecates the constructor arguments for download and subdirectory construction
parent 80c97e6a
import os
import csv
import random
from pathlib import Path
from torchaudio.datasets import commonvoice
from torchaudio.datasets import COMMONVOICE
from torchaudio_unittest.common_utils import (
TempDirMixin,
TorchaudioTestCase,
......@@ -31,29 +30,23 @@ class TestCommonVoice(TempDirMixin, TorchaudioTestCase):
"common_voice_en_18607573.wav",
"Caddy, show Miss Clare and Miss Summerson their rooms.", "2", "0", "twenties", "male", "canada"],
]
_folder_audio = "clips"
sample_rate = 48000
@classmethod
def setUpClass(cls):
cls.root_dir = cls.get_base_temp_dir()
# The path convention commonvoice uses
base_dir = os.path.join(cls.root_dir, commonvoice.FOLDER_IN_ARCHIVE, commonvoice.VERSION, "en")
os.makedirs(base_dir, exist_ok=True)
# Tsv file name difference does not mean different subset, testing as a whole dataset here
tsv_filename = os.path.join(base_dir, commonvoice.TSV)
tsv_filename = os.path.join(cls.root_dir, "train.tsv")
audio_base_path = os.path.join(cls.root_dir, "clips")
os.makedirs(audio_base_path, exist_ok=True)
with open(tsv_filename, "w", newline='') as tsv:
writer = csv.writer(tsv, delimiter='\t')
writer.writerow(cls._headers)
for i, content in enumerate(cls._train_csv_contents):
audio_filename = audio_filename = content[1]
writer.writerow(content)
# Generate and store audio
audio_base_path = os.path.join(base_dir, cls._folder_audio)
os.makedirs(audio_base_path, exist_ok=True)
audio_path = os.path.join(audio_base_path, audio_filename)
audio_path = os.path.join(audio_base_path, content[1])
data = get_whitenoise(sample_rate=cls.sample_rate, duration=1, n_channels=1, seed=i, dtype='float32')
save_wav(audio_path, data, cls.sample_rate)
......@@ -72,9 +65,9 @@ class TestCommonVoice(TempDirMixin, TorchaudioTestCase):
assert n_ite == len(self.data)
def test_commonvoice_str(self):
dataset = commonvoice.COMMONVOICE(self.root_dir)
dataset = COMMONVOICE(self.root_dir)
self._test_commonvoice(dataset)
def test_commonvoice_path(self):
dataset = commonvoice.COMMONVOICE(Path(self.root_dir))
dataset = COMMONVOICE(Path(self.root_dir))
self._test_commonvoice(dataset)
......@@ -51,7 +51,7 @@ class TestWalkFiles(TempDirMixin, TorchaudioTestCase):
class TestIterator(TorchaudioTestCase):
backend = 'default'
path = get_asset_path()
path = get_asset_path('CommonVoice', 'cv-corpus-4-2019-12-10', 'tt')
def test_disckcache_iterator(self):
data = COMMONVOICE(self.path, url="tatar")
......
import os
import warnings
from pathlib import Path
from typing import List, Dict, Tuple, Union
from typing import List, Dict, Tuple, Union, Optional
import torchaudio
from torchaudio.datasets.utils import download_url, extract_archive, unicode_csv_reader
from torchaudio.datasets.utils import unicode_csv_reader
from torch import Tensor
from torch.utils.data import Dataset
# Default TSV should be one of
# dev.tsv
# invalidated.tsv
# other.tsv
# test.tsv
# train.tsv
# validated.tsv
FOLDER_IN_ARCHIVE = "CommonVoice"
URL = "english"
VERSION = "cv-corpus-4-2019-12-10"
TSV = "train.tsv"
_CHECKSUMS = {
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/tt.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/en.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/de.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/fr.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cy.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/br.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cv.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/tr.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ky.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ga-IE.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/kab.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ca.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/zh-TW.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sl.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/it.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/nl.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cnh.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/eo.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/et.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/fa.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/eu.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/es.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/zh-CN.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/mn.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sah.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/dv.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/rw.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sv-SE.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ru.tar.gz":
None
}
def load_commonvoice_item(line: List[str],
header: List[str],
......@@ -104,25 +32,16 @@ class COMMONVOICE(Dataset):
"""Create a Dataset for CommonVoice.
Args:
root (str or Path): Path to the directory where the dataset is found or downloaded.
tsv (str, optional): The name of the tsv file used to construct the metadata.
(default: ``"train.tsv"``)
url (str, optional): The URL to download the dataset from, or the language of
the dataset to download. (default: ``"english"``).
Allowed language values are ``"tatar"``, ``"english"``, ``"german"``,
``"french"``, ``"welsh"``, ``"breton"``, ``"chuvash"``, ``"turkish"``, ``"kyrgyz"``,
``"irish"``, ``"kabyle"``, ``"catalan"``, ``"taiwanese"``, ``"slovenian"``,
``"italian"``, ``"dutch"``, ``"hakha chin"``, ``"esperanto"``, ``"estonian"``,
``"persian"``, ``"portuguese"``, ``"basque"``, ``"spanish"``, ``"chinese"``,
``"mongolian"``, ``"sakha"``, ``"dhivehi"``, ``"kinyarwanda"``, ``"swedish"``,
``"russian"``, ``"indonesian"``, ``"arabic"``, ``"tamil"``, ``"interlingua"``,
``"latvian"``, ``"japanese"``, ``"votic"``, ``"abkhaz"``, ``"cantonese"`` and
``"romansh sursilvan"``.
folder_in_archive (str, optional): The top-level directory of the dataset.
version (str): Version string. (default: ``"cv-corpus-4-2019-12-10"``)
For the other allowed values, Please checkout https://commonvoice.mozilla.org/en/datasets.
download (bool, optional):
Whether to download the dataset if it is not found at root path. (default: ``False``).
root (str or Path): Path to the directory where the dataset is located.
(Where the ``tsv`` file is present.)
tsv (str, optional):
The name of the tsv file used to construct the metadata, such as
``"train.tsv"``, ``"test.tsv"``, ``"dev.tsv"``, ``"invalidated.tsv"``,
``"validated.tsv"`` and ``"other.tsv"``. (default: ``"train.tsv"``)
url (str, optional): Deprecated, not used.
folder_in_archive (str, optional): Deprecated, not used.
version (str, optional): Deprecated, not used.
download (bool, optional): Deprecated, not used.
"""
_ext_txt = ".txt"
......@@ -131,93 +50,36 @@ class COMMONVOICE(Dataset):
def __init__(self,
root: Union[str, Path],
tsv: str = TSV,
url: str = URL,
folder_in_archive: str = FOLDER_IN_ARCHIVE,
version: str = VERSION,
download: bool = False) -> None:
languages = {
"tatar": "tt",
"english": "en",
"german": "de",
"french": "fr",
"welsh": "cy",
"breton": "br",
"chuvash": "cv",
"turkish": "tr",
"kyrgyz": "ky",
"irish": "ga-IE",
"kabyle": "kab",
"catalan": "ca",
"taiwanese": "zh-TW",
"slovenian": "sl",
"italian": "it",
"dutch": "nl",
"hakha chin": "cnh",
"esperanto": "eo",
"estonian": "et",
"persian": "fa",
"portuguese": "pt",
"basque": "eu",
"spanish": "es",
"chinese": "zh-CN",
"mongolian": "mn",
"sakha": "sah",
"dhivehi": "dv",
"kinyarwanda": "rw",
"swedish": "sv-SE",
"russian": "ru",
"indonesian": "id",
"arabic": "ar",
"tamil": "ta",
"interlingua": "ia",
"latvian": "lv",
"japanese": "ja",
"votic": "vot",
"abkhaz": "ab",
"cantonese": "zh-HK",
"romansh sursilvan": "rm-sursilv"
}
tsv: str = "train.tsv",
url: Optional[str] = None,
folder_in_archive: Optional[str] = None,
version: Optional[str] = None,
download: Optional[bool] = None) -> None:
if download:
raise RuntimeError(
"Common Voice dataset requires user agreement on the usage term, "
"and torchaudio no longer provides the download feature. "
"Please download the dataset manually and extract it in the root directory, "
"then provide the target language to `url` argument.")
if url not in languages:
raise ValueError(f"`url` must be one of available languages: {languages.keys()}")
if url in languages:
ext_archive = ".tar.gz"
language = languages[url]
base_url = "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com"
url = os.path.join(base_url, version, language + ext_archive)
"Please download the dataset and extract it manually.")
# These constructor arguments are accepted only for backward compatibility.
# Emit one warning for each argument the caller passed explicitly, i.e. any
# value other than the ``None`` default.
deprecated = [
    ('url', url),
    ('folder_in_archive', folder_in_archive),
    ('version', version),
    ('download', download)
]
for name, val in deprecated:
    if val is not None:
        # stacklevel=2 attributes the warning to the code that constructed
        # the dataset instead of to this library module.
        warnings.warn(
            f"`{name}` argument is no longer used and deprecated. "
            "It will be removed in 0.9.0 release. "
            "Please remove it from the function call",
            stacklevel=2)
# Get string representation of 'root' in case Path object is passed
root = os.fspath(root)
basename = os.path.basename(url)
archive = os.path.join(root, basename)
basename = basename.rsplit(".", 2)[0]
folder_in_archive = os.path.join(folder_in_archive, version, basename)
self._path = os.path.join(root, folder_in_archive)
if download:
if not os.path.isdir(self._path):
if not os.path.isfile(archive):
checksum = _CHECKSUMS.get(url, None)
download_url(url, root, hash_value=checksum)
extract_archive(archive)
self._tsv = os.path.join(root, folder_in_archive, tsv)
self._path = os.fspath(root)
self._tsv = os.path.join(self._path, tsv)
with open(self._tsv, "r") as tsv:
walker = unicode_csv_reader(tsv, delimiter="\t")
with open(self._tsv, "r") as tsv_:
walker = unicode_csv_reader(tsv_, delimiter="\t")
self._header = next(walker)
self._walker = list(walker)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment