Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Torchaudio
Commits
366cef83
Unverified
Commit
366cef83
authored
Dec 11, 2020
by
moto
Committed by
GitHub
Dec 11, 2020
Browse files
Revert "no longer download CommonVoice directly (#1018)" (#1079)
This reverts commit
09a6fca1
.
parent
a2085b85
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
84 additions
and
97 deletions
+84
-97
test/torchaudio_unittest/datasets/utils_test.py
test/torchaudio_unittest/datasets/utils_test.py
+2
-2
torchaudio/datasets/commonvoice.py
torchaudio/datasets/commonvoice.py
+82
-95
No files found.
test/torchaudio_unittest/datasets/utils_test.py
View file @
366cef83
...
@@ -54,7 +54,7 @@ class TestIterator(TorchaudioTestCase):
...
@@ -54,7 +54,7 @@ class TestIterator(TorchaudioTestCase):
path
=
get_asset_path
()
path
=
get_asset_path
()
def
test_disckcache_iterator
(
self
):
def
test_disckcache_iterator
(
self
):
data
=
COMMONVOICE
(
self
.
path
,
version
=
"cv-corpus-4-2019-12-10"
,
language
=
"tatar"
)
data
=
COMMONVOICE
(
self
.
path
,
url
=
"tatar"
)
data
=
dataset_utils
.
diskcache_iterator
(
data
)
data
=
dataset_utils
.
diskcache_iterator
(
data
)
# Save
# Save
data
[
0
]
data
[
0
]
...
@@ -62,7 +62,7 @@ class TestIterator(TorchaudioTestCase):
...
@@ -62,7 +62,7 @@ class TestIterator(TorchaudioTestCase):
data
[
0
]
data
[
0
]
def
test_bg_iterator
(
self
):
def
test_bg_iterator
(
self
):
data
=
COMMONVOICE
(
self
.
path
,
version
=
"cv-corpus-4-2019-12-10"
,
language
=
"tatar"
)
data
=
COMMONVOICE
(
self
.
path
,
url
=
"tatar"
)
data
=
dataset_utils
.
bg_iterator
(
data
,
5
)
data
=
dataset_utils
.
bg_iterator
(
data
,
5
)
for
_
in
data
:
for
_
in
data
:
pass
pass
torchaudio/datasets/commonvoice.py
View file @
366cef83
import
os
import
os
import
warnings
from
pathlib
import
Path
from
pathlib
import
Path
from
typing
import
List
,
Dict
,
Tuple
,
Optional
,
Union
from
typing
import
List
,
Dict
,
Tuple
,
Union
import
torchaudio
import
torchaudio
from
torchaudio.datasets.utils
import
extract_archive
,
unicode_csv_reader
,
validate_file
from
torchaudio.datasets.utils
import
download_url
,
extract_archive
,
unicode_csv_reader
from
torch
import
Tensor
from
torch
import
Tensor
from
torch.utils.data
import
Dataset
from
torch.utils.data
import
Dataset
...
@@ -17,39 +16,68 @@ from torch.utils.data import Dataset
...
@@ -17,39 +16,68 @@ from torch.utils.data import Dataset
# validated.tsv
# validated.tsv
FOLDER_IN_ARCHIVE
=
"CommonVoice"
FOLDER_IN_ARCHIVE
=
"CommonVoice"
LANGUAGE
=
"english"
URL
=
"english"
VERSION
=
"cv-corpus-
5.1-2020-06-22
"
VERSION
=
"cv-corpus-
4-2019-12-10
"
TSV
=
"train.tsv"
TSV
=
"train.tsv"
_CHECKSUMS
=
{
_CHECKSUMS
=
{
"cv-corpus-5.1-2020-06-22/tt.tar.gz"
:
None
,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/tt.tar.gz"
:
"cv-corpus-5.1-2020-06-22/en.tar.gz"
:
None
,
None
,
"cv-corpus-5.1-2020-06-22/de.tar.gz"
:
None
,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/en.tar.gz"
:
"cv-corpus-5.1-2020-06-22/fr.tar.gz"
:
None
,
None
,
"cv-corpus-5.1-2020-06-22/cy.tar.gz"
:
None
,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/de.tar.gz"
:
"cv-corpus-5.1-2020-06-22/br.tar.gz"
:
None
,
None
,
"cv-corpus-5.1-2020-06-22/cv.tar.gz"
:
None
,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/fr.tar.gz"
:
"cv-corpus-5.1-2020-06-22/tr.tar.gz"
:
None
,
None
,
"cv-corpus-5.1-2020-06-22/ky.tar.gz"
:
None
,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cy.tar.gz"
:
"cv-corpus-5.1-2020-06-22/ga-IE.tar.gz"
:
None
,
None
,
"cv-corpus-5.1-2020-06-22/kab.tar.gz"
:
None
,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/br.tar.gz"
:
"cv-corpus-5.1-2020-06-22/ca.tar.gz"
:
None
,
None
,
"cv-corpus-5.1-2020-06-22/zh-TW.tar.gz"
:
None
,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cv.tar.gz"
:
"cv-corpus-5.1-2020-06-22/sl.tar.gz"
:
None
,
None
,
"cv-corpus-5.1-2020-06-22/it.tar.gz"
:
None
,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/tr.tar.gz"
:
"cv-corpus-5.1-2020-06-22/nl.tar.gz"
:
None
,
None
,
"cv-corpus-5.1-2020-06-22/cnh.tar.gz"
:
None
,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ky.tar.gz"
:
"cv-corpus-5.1-2020-06-22/eo.tar.gz"
:
None
,
None
,
"cv-corpus-5.1-2020-06-22/et.tar.gz"
:
None
,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ga-IE.tar.gz"
:
"cv-corpus-5.1-2020-06-22/fa.tar.gz"
:
None
,
None
,
"cv-corpus-5.1-2020-06-22/eu.tar.gz"
:
None
,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/kab.tar.gz"
:
"cv-corpus-5.1-2020-06-22/es.tar.gz"
:
None
,
None
,
"cv-corpus-5.1-2020-06-22/zh-CN.tar.gz"
:
None
,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ca.tar.gz"
:
"cv-corpus-5.1-2020-06-22/mn.tar.gz"
:
None
,
None
,
"cv-corpus-5.1-2020-06-22/sah.tar.gz"
:
None
,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/zh-TW.tar.gz"
:
"cv-corpus-5.1-2020-06-22/dv.tar.gz"
:
None
,
None
,
"cv-corpus-5.1-2020-06-22/rw.tar.gz"
:
None
,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sl.tar.gz"
:
"cv-corpus-5.1-2020-06-22/sv-SE.tar.gz"
:
None
,
None
,
"cv-corpus-5.1-2020-06-22/ru.tar.gz"
:
None
,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/it.tar.gz"
:
None
,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/nl.tar.gz"
:
None
,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cnh.tar.gz"
:
None
,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/eo.tar.gz"
:
None
,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/et.tar.gz"
:
None
,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/fa.tar.gz"
:
None
,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/eu.tar.gz"
:
None
,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/es.tar.gz"
:
None
,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/zh-CN.tar.gz"
:
None
,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/mn.tar.gz"
:
None
,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sah.tar.gz"
:
None
,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/dv.tar.gz"
:
None
,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/rw.tar.gz"
:
None
,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sv-SE.tar.gz"
:
None
,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ru.tar.gz"
:
None
}
}
...
@@ -73,18 +101,15 @@ def load_commonvoice_item(line: List[str],
...
@@ -73,18 +101,15 @@ def load_commonvoice_item(line: List[str],
class
COMMONVOICE
(
Dataset
):
class
COMMONVOICE
(
Dataset
):
"""Create a Dataset for
`
CommonVoice
<https://commonvoice.mozilla.org/>`_
.
"""Create a Dataset for CommonVoice.
Args:
Args:
root (str or Path): Path to the directory where the dataset is found or downloaded.
root (str or Path): Path to the directory where the dataset is found or downloaded.
tsv (str, optional): The name of the tsv file used to construct the metadata.
tsv (str, optional): The name of the tsv file used to construct the metadata.
(default: ``"train.tsv"``)
(default: ``"train.tsv"``)
url (str, optional): Deprecated.
url (str, optional): The URL to download the dataset from, or the language of
folder_in_archive (str, optional): The top-level directory of the dataset.
the dataset to download. (default: ``"english"``).
version (str): Version string. (default: ``"cv-corpus-5.1-2020-06-22"``)
Allowed language values are ``"tatar"``, ``"english"``, ``"german"``,
language (str, optional): Language of the dataset. (default: None)
The following values are mapped to their corresponding shortened version:
``"tatar"``, ``"english"``, ``"german"``,
``"french"``, ``"welsh"``, ``"breton"``, ``"chuvash"``, ``"turkish"``, ``"kyrgyz"``,
``"french"``, ``"welsh"``, ``"breton"``, ``"chuvash"``, ``"turkish"``, ``"kyrgyz"``,
``"irish"``, ``"kabyle"``, ``"catalan"``, ``"taiwanese"``, ``"slovenian"``,
``"irish"``, ``"kabyle"``, ``"catalan"``, ``"taiwanese"``, ``"slovenian"``,
``"italian"``, ``"dutch"``, ``"hakha chin"``, ``"esperanto"``, ``"estonian"``,
``"italian"``, ``"dutch"``, ``"hakha chin"``, ``"esperanto"``, ``"estonian"``,
...
@@ -93,8 +118,11 @@ class COMMONVOICE(Dataset):
...
@@ -93,8 +118,11 @@ class COMMONVOICE(Dataset):
``"russian"``, ``"indonesian"``, ``"arabic"``, ``"tamil"``, ``"interlingua"``,
``"russian"``, ``"indonesian"``, ``"arabic"``, ``"tamil"``, ``"interlingua"``,
``"latvian"``, ``"japanese"``, ``"votic"``, ``"abkhaz"``, ``"cantonese"`` and
``"latvian"``, ``"japanese"``, ``"votic"``, ``"abkhaz"``, ``"cantonese"`` and
``"romansh sursilvan"``.
``"romansh sursilvan"``.
folder_in_archive (str, optional): The top-level directory of the dataset.
version (str): Version string. (default: ``"cv-corpus-4-2019-12-10"``)
For the other allowed values, Please checkout https://commonvoice.mozilla.org/en/datasets.
For the other allowed values, Please checkout https://commonvoice.mozilla.org/en/datasets.
download (bool, optional): Deprecated.
download (bool, optional):
Whether to download the dataset if it is not found at root path. (default: ``False``).
"""
"""
_ext_txt
=
".txt"
_ext_txt
=
".txt"
...
@@ -104,30 +132,10 @@ class COMMONVOICE(Dataset):
...
@@ -104,30 +132,10 @@ class COMMONVOICE(Dataset):
def
__init__
(
self
,
def
__init__
(
self
,
root
:
Union
[
str
,
Path
],
root
:
Union
[
str
,
Path
],
tsv
:
str
=
TSV
,
tsv
:
str
=
TSV
,
url
:
Optional
[
str
]
=
None
,
url
:
str
=
URL
,
folder_in_archive
:
str
=
FOLDER_IN_ARCHIVE
,
folder_in_archive
:
str
=
FOLDER_IN_ARCHIVE
,
version
:
str
=
VERSION
,
version
:
str
=
VERSION
,
language
:
str
=
LANGUAGE
,
download
:
bool
=
False
)
->
None
:
download
:
Optional
[
bool
]
=
False
)
->
None
:
if
download
is
True
:
raise
RuntimeError
(
"The dataset is no longer publicly accessible. You need to "
"download the archives externally and place them in the root "
"directory."
)
elif
download
is
False
:
warnings
.
warn
(
"The use of the download flag is deprecated, since the dataset "
"is no longer directly accessible."
,
RuntimeWarning
)
if
url
is
not
None
:
warnings
.
warn
(
"The use of the url flag is deprecated, since the dataset "
"is no longer publicly accessible. To specify the language of the dataset, "
"please use the language parameter instead."
,
RuntimeWarning
)
languages
=
{
languages
=
{
"tatar"
:
"tt"
,
"tatar"
:
"tt"
,
...
@@ -172,22 +180,12 @@ class COMMONVOICE(Dataset):
...
@@ -172,22 +180,12 @@ class COMMONVOICE(Dataset):
"romansh sursilvan"
:
"rm-sursilv"
"romansh sursilvan"
:
"rm-sursilv"
}
}
if
language
in
languages
:
if
url
in
languages
:
ext_archive
=
".tar.gz"
ext_archive
=
".tar.gz"
language
=
languages
[
language
]
language
=
languages
[
url
]
url
=
os
.
path
.
join
(
version
,
language
+
ext_archive
)
else
:
base_url
=
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com"
raise
ValueError
(
url
=
os
.
path
.
join
(
base_url
,
version
,
language
+
ext_archive
)
'Allowed language values are "tatar", "english", "german",'
'"french", "welsh", "breton", "chuvash", "turkish", "kyrgyz",'
'"irish", "kabyle", "catalan", "taiwanese", "slovenian",'
'"italian", "dutch", "hakha chin", "esperanto", "estonian",'
'"persian", "portuguese", "basque", "spanish", "chinese",'
'"mongolian", "sakha", "dhivehi", "kinyarwanda", "swedish",'
'"russian", "indonesian", "arabic", "tamil", "interlingua",'
'"latvian", "japanese", "votic", "abkhaz", "cantonese" and'
'"romansh sursilvan".'
)
# Get string representation of 'root' in case Path object is passed
# Get string representation of 'root' in case Path object is passed
root
=
os
.
fspath
(
root
)
root
=
os
.
fspath
(
root
)
...
@@ -200,23 +198,12 @@ class COMMONVOICE(Dataset):
...
@@ -200,23 +198,12 @@ class COMMONVOICE(Dataset):
self
.
_path
=
os
.
path
.
join
(
root
,
folder_in_archive
)
self
.
_path
=
os
.
path
.
join
(
root
,
folder_in_archive
)
if
not
os
.
path
.
isdir
(
self
.
_path
):
if
download
:
if
os
.
path
.
isfile
(
archive
):
if
not
os
.
path
.
isdir
(
self
.
_path
):
checksum
=
_CHECKSUMS
.
get
(
url
,
None
)
if
not
os
.
path
.
isfile
(
archive
):
if
checksum
:
checksum
=
_CHECKSUMS
.
get
(
url
,
None
)
filepath
=
os
.
path
.
basename
(
url
)
download_url
(
url
,
root
,
hash_value
=
checksum
)
with
open
(
filepath
,
"rb"
)
as
file_obj
:
if
not
validate_file
(
file_obj
,
checksum
,
"sha256"
):
raise
RuntimeError
(
f
"The hash of
{
filepath
}
does not match. Delete the file manually and retry."
)
extract_archive
(
archive
)
extract_archive
(
archive
)
else
:
raise
RuntimeError
(
"The dataset is no longer publicly accessible. You need to "
"download the archives externally and place them in the root "
"directory."
)
self
.
_tsv
=
os
.
path
.
join
(
root
,
folder_in_archive
,
tsv
)
self
.
_tsv
=
os
.
path
.
join
(
root
,
folder_in_archive
,
tsv
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment