"tests/vscode:/vscode.git/clone" did not exist on "f103bbf9ee76bb1cbca9f8876986318680e95053"
Unverified Commit 6b810240 authored by moto's avatar moto Committed by GitHub
Browse files

[BC-Breaking] Remove download and subdir from CommonVoice (#1082)

* Removes the dataset download logic.
* [BC-breaking] Changes the meaning of the `root` argument: it now refers to the exact directory of the dataset.
* Deprecates the constructor arguments that were used for download and subdirectory construction.
parent 80c97e6a
import os import os
import csv import csv
import random
from pathlib import Path from pathlib import Path
from torchaudio.datasets import commonvoice from torchaudio.datasets import COMMONVOICE
from torchaudio_unittest.common_utils import ( from torchaudio_unittest.common_utils import (
TempDirMixin, TempDirMixin,
TorchaudioTestCase, TorchaudioTestCase,
...@@ -31,29 +30,23 @@ class TestCommonVoice(TempDirMixin, TorchaudioTestCase): ...@@ -31,29 +30,23 @@ class TestCommonVoice(TempDirMixin, TorchaudioTestCase):
"common_voice_en_18607573.wav", "common_voice_en_18607573.wav",
"Caddy, show Miss Clare and Miss Summerson their rooms.", "2", "0", "twenties", "male", "canada"], "Caddy, show Miss Clare and Miss Summerson their rooms.", "2", "0", "twenties", "male", "canada"],
] ]
_folder_audio = "clips"
sample_rate = 48000 sample_rate = 48000
@classmethod @classmethod
def setUpClass(cls): def setUpClass(cls):
cls.root_dir = cls.get_base_temp_dir() cls.root_dir = cls.get_base_temp_dir()
# The path convention commonvoice uses
base_dir = os.path.join(cls.root_dir, commonvoice.FOLDER_IN_ARCHIVE, commonvoice.VERSION, "en")
os.makedirs(base_dir, exist_ok=True)
# Tsv file name difference does not mean different subset, testing as a whole dataset here # Tsv file name difference does not mean different subset, testing as a whole dataset here
tsv_filename = os.path.join(base_dir, commonvoice.TSV) tsv_filename = os.path.join(cls.root_dir, "train.tsv")
audio_base_path = os.path.join(cls.root_dir, "clips")
os.makedirs(audio_base_path, exist_ok=True)
with open(tsv_filename, "w", newline='') as tsv: with open(tsv_filename, "w", newline='') as tsv:
writer = csv.writer(tsv, delimiter='\t') writer = csv.writer(tsv, delimiter='\t')
writer.writerow(cls._headers) writer.writerow(cls._headers)
for i, content in enumerate(cls._train_csv_contents): for i, content in enumerate(cls._train_csv_contents):
audio_filename = audio_filename = content[1]
writer.writerow(content) writer.writerow(content)
# Generate and store audio # Generate and store audio
audio_base_path = os.path.join(base_dir, cls._folder_audio) audio_path = os.path.join(audio_base_path, content[1])
os.makedirs(audio_base_path, exist_ok=True)
audio_path = os.path.join(audio_base_path, audio_filename)
data = get_whitenoise(sample_rate=cls.sample_rate, duration=1, n_channels=1, seed=i, dtype='float32') data = get_whitenoise(sample_rate=cls.sample_rate, duration=1, n_channels=1, seed=i, dtype='float32')
save_wav(audio_path, data, cls.sample_rate) save_wav(audio_path, data, cls.sample_rate)
...@@ -72,9 +65,9 @@ class TestCommonVoice(TempDirMixin, TorchaudioTestCase): ...@@ -72,9 +65,9 @@ class TestCommonVoice(TempDirMixin, TorchaudioTestCase):
assert n_ite == len(self.data) assert n_ite == len(self.data)
def test_commonvoice_str(self): def test_commonvoice_str(self):
dataset = commonvoice.COMMONVOICE(self.root_dir) dataset = COMMONVOICE(self.root_dir)
self._test_commonvoice(dataset) self._test_commonvoice(dataset)
def test_commonvoice_path(self): def test_commonvoice_path(self):
dataset = commonvoice.COMMONVOICE(Path(self.root_dir)) dataset = COMMONVOICE(Path(self.root_dir))
self._test_commonvoice(dataset) self._test_commonvoice(dataset)
...@@ -51,7 +51,7 @@ class TestWalkFiles(TempDirMixin, TorchaudioTestCase): ...@@ -51,7 +51,7 @@ class TestWalkFiles(TempDirMixin, TorchaudioTestCase):
class TestIterator(TorchaudioTestCase): class TestIterator(TorchaudioTestCase):
backend = 'default' backend = 'default'
path = get_asset_path() path = get_asset_path('CommonVoice', 'cv-corpus-4-2019-12-10', 'tt')
def test_disckcache_iterator(self): def test_disckcache_iterator(self):
data = COMMONVOICE(self.path, url="tatar") data = COMMONVOICE(self.path, url="tatar")
......
import os import os
import warnings
from pathlib import Path from pathlib import Path
from typing import List, Dict, Tuple, Union from typing import List, Dict, Tuple, Union, Optional
import torchaudio import torchaudio
from torchaudio.datasets.utils import download_url, extract_archive, unicode_csv_reader from torchaudio.datasets.utils import unicode_csv_reader
from torch import Tensor from torch import Tensor
from torch.utils.data import Dataset from torch.utils.data import Dataset
# Default TSV should be one of
# dev.tsv
# invalidated.tsv
# other.tsv
# test.tsv
# train.tsv
# validated.tsv
FOLDER_IN_ARCHIVE = "CommonVoice"
URL = "english"
VERSION = "cv-corpus-4-2019-12-10"
TSV = "train.tsv"
_CHECKSUMS = {
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/tt.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/en.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/de.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/fr.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cy.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/br.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cv.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/tr.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ky.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ga-IE.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/kab.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ca.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/zh-TW.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sl.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/it.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/nl.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cnh.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/eo.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/et.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/fa.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/eu.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/es.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/zh-CN.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/mn.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sah.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/dv.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/rw.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sv-SE.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ru.tar.gz":
None
}
def load_commonvoice_item(line: List[str], def load_commonvoice_item(line: List[str],
header: List[str], header: List[str],
...@@ -104,25 +32,16 @@ class COMMONVOICE(Dataset): ...@@ -104,25 +32,16 @@ class COMMONVOICE(Dataset):
"""Create a Dataset for CommonVoice. """Create a Dataset for CommonVoice.
Args: Args:
root (str or Path): Path to the directory where the dataset is found or downloaded. root (str or Path): Path to the directory where the dataset is located.
tsv (str, optional): The name of the tsv file used to construct the metadata. (Where the ``tsv`` file is present.)
(default: ``"train.tsv"``) tsv (str, optional):
url (str, optional): The URL to download the dataset from, or the language of The name of the tsv file used to construct the metadata, such as
the dataset to download. (default: ``"english"``). ``"train.tsv"``, ``"test.tsv"``, ``"dev.tsv"``, ``"invalidated.tsv"``,
Allowed language values are ``"tatar"``, ``"english"``, ``"german"``, ``"validated.tsv"`` and ``"other.tsv"``. (default: ``"train.tsv"``)
``"french"``, ``"welsh"``, ``"breton"``, ``"chuvash"``, ``"turkish"``, ``"kyrgyz"``, url (str, optional): Deprecated, not used.
``"irish"``, ``"kabyle"``, ``"catalan"``, ``"taiwanese"``, ``"slovenian"``, folder_in_archive (str, optional): Deprecated, not used.
``"italian"``, ``"dutch"``, ``"hakha chin"``, ``"esperanto"``, ``"estonian"``, version (str): Deprecated, not used.
``"persian"``, ``"portuguese"``, ``"basque"``, ``"spanish"``, ``"chinese"``, download (bool, optional): Deprecated, not used.
``"mongolian"``, ``"sakha"``, ``"dhivehi"``, ``"kinyarwanda"``, ``"swedish"``,
``"russian"``, ``"indonesian"``, ``"arabic"``, ``"tamil"``, ``"interlingua"``,
``"latvian"``, ``"japanese"``, ``"votic"``, ``"abkhaz"``, ``"cantonese"`` and
``"romansh sursilvan"``.
folder_in_archive (str, optional): The top-level directory of the dataset.
version (str): Version string. (default: ``"cv-corpus-4-2019-12-10"``)
For the other allowed values, Please checkout https://commonvoice.mozilla.org/en/datasets.
download (bool, optional):
Whether to download the dataset if it is not found at root path. (default: ``False``).
""" """
_ext_txt = ".txt" _ext_txt = ".txt"
...@@ -131,93 +50,36 @@ class COMMONVOICE(Dataset): ...@@ -131,93 +50,36 @@ class COMMONVOICE(Dataset):
def __init__(self, def __init__(self,
root: Union[str, Path], root: Union[str, Path],
tsv: str = TSV, tsv: str = "train.tsv",
url: str = URL, url: Optional[str] = None,
folder_in_archive: str = FOLDER_IN_ARCHIVE, folder_in_archive: Optional[str] = None,
version: str = VERSION, version: Optional[str] = None,
download: bool = False) -> None: download: Optional[bool] = None) -> None:
languages = {
"tatar": "tt",
"english": "en",
"german": "de",
"french": "fr",
"welsh": "cy",
"breton": "br",
"chuvash": "cv",
"turkish": "tr",
"kyrgyz": "ky",
"irish": "ga-IE",
"kabyle": "kab",
"catalan": "ca",
"taiwanese": "zh-TW",
"slovenian": "sl",
"italian": "it",
"dutch": "nl",
"hakha chin": "cnh",
"esperanto": "eo",
"estonian": "et",
"persian": "fa",
"portuguese": "pt",
"basque": "eu",
"spanish": "es",
"chinese": "zh-CN",
"mongolian": "mn",
"sakha": "sah",
"dhivehi": "dv",
"kinyarwanda": "rw",
"swedish": "sv-SE",
"russian": "ru",
"indonesian": "id",
"arabic": "ar",
"tamil": "ta",
"interlingua": "ia",
"latvian": "lv",
"japanese": "ja",
"votic": "vot",
"abkhaz": "ab",
"cantonese": "zh-HK",
"romansh sursilvan": "rm-sursilv"
}
if download: if download:
raise RuntimeError( raise RuntimeError(
"Common Voice dataset requires user agreement on the usage term, " "Common Voice dataset requires user agreement on the usage term, "
"and torchaudio no longer provides the download feature. " "and torchaudio no longer provides the download feature. "
"Please download the dataset manually and extract it in the root directory, " "Please download the dataset and extract it manually.")
"then provide the target language to `url` argument.")
if url not in languages: deprecated = [
raise ValueError(f"`url` must be one of available languages: {languages.keys()}") ('url', url),
('folder_in_archive', folder_in_archive),
if url in languages: ('version', version),
ext_archive = ".tar.gz" ('download', download)
language = languages[url] ]
for name, val in deprecated:
base_url = "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com" if val is not None:
url = os.path.join(base_url, version, language + ext_archive) warnings.warn(
f"`{name}` argument is no longer used and deprecated. "
"It will be removed in 0.9.0 releaase. "
"Please remove it from the function call")
# Get string representation of 'root' in case Path object is passed # Get string representation of 'root' in case Path object is passed
root = os.fspath(root) self._path = os.fspath(root)
self._tsv = os.path.join(self._path, tsv)
basename = os.path.basename(url)
archive = os.path.join(root, basename)
basename = basename.rsplit(".", 2)[0]
folder_in_archive = os.path.join(folder_in_archive, version, basename)
self._path = os.path.join(root, folder_in_archive)
if download:
if not os.path.isdir(self._path):
if not os.path.isfile(archive):
checksum = _CHECKSUMS.get(url, None)
download_url(url, root, hash_value=checksum)
extract_archive(archive)
self._tsv = os.path.join(root, folder_in_archive, tsv)
with open(self._tsv, "r") as tsv: with open(self._tsv, "r") as tsv_:
walker = unicode_csv_reader(tsv, delimiter="\t") walker = unicode_csv_reader(tsv_, delimiter="\t")
self._header = next(walker) self._header = next(walker)
self._walker = list(walker) self._walker = list(walker)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment