Commit b18e583e authored by Joao Gomes, committed by Facebook GitHub Bot

Deprecating data utils (#2073)

Summary:
- Deprecates data utils (with a warning that they will be removed in the v0.12 release)
- replaces all usages of `torchaudio.datasets.utils.download_url` with `torch.hub.download_url_to_file` (migration pattern sketched below)
- replaces all MD5 hashes with SHA256 hashes (regeneration sketch below)
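
The call pattern that changes across the dataset classes is sketched below. This is illustrative only: `url`, `archive`, and `checksum` stand in for the per-dataset values in the diff, and the example URL is a placeholder.

```python
import os
from torch.hub import download_url_to_file
from torchaudio.datasets.utils import extract_archive

url = "http://example.com/dataset.tar.gz"  # placeholder, not a real dataset URL
archive = os.path.join("data", "dataset.tar.gz")
checksum = None  # the per-dataset SHA256 value from _CHECKSUMS, when available

# Before (deprecated): download_url(url, root, hash_value=checksum, hash_type="md5")
# After: torch.hub.download_url_to_file downloads to an explicit destination path
# and verifies that the SHA256 digest of the file starts with `hash_prefix`.
if not os.path.isfile(archive):
    download_url_to_file(url, archive, hash_prefix=checksum)
extract_archive(archive)
```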

Addresses https://github.com/pytorch/audio/issues/1883
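
For reference, a minimal sketch (not part of this change) of how the new SHA256 values can be regenerated locally with Python's standard `hashlib`, assuming the archive has already been downloaded:

```python
import hashlib

def sha256_of(path: str, chunk_size: int = 1024 * 1024) -> str:
    # Stream the file in chunks so large archives do not have to fit in memory.
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

# e.g. sha256_of("cmu_us_aew_arctic.tar.bz2") should match the value stored in _CHECKSUMS.
```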

Pull Request resolved: https://github.com/pytorch/audio/pull/2073

Reviewed By: mthrok

Differential Revision: D33241756

Pulled By: jdsgomes

fbshipit-source-id: 49388ec5965bfc91d9a1d8d0786eeafb2969f6cf
parent 575d221e
@@ -6,8 +6,8 @@ from typing import Tuple, Union
 import torchaudio
 from torch import Tensor
 from torch.utils.data import Dataset
+from torch.hub import download_url_to_file
 from torchaudio.datasets.utils import (
-    download_url,
     extract_archive,
 )
@@ -15,41 +15,41 @@ URL = "aew"
 FOLDER_IN_ARCHIVE = "ARCTIC"
 _CHECKSUMS = {
     "http://festvox.org/cmu_arctic/packed/cmu_us_aew_arctic.tar.bz2":
-    "4382b116efcc8339c37e01253cb56295",
+    "645cb33c0f0b2ce41384fdd8d3db2c3f5fc15c1e688baeb74d2e08cab18ab406",
     "http://festvox.org/cmu_arctic/packed/cmu_us_ahw_arctic.tar.bz2":
-    "b072d6e961e3f36a2473042d097d6da9",
+    "024664adeb892809d646a3efd043625b46b5bfa3e6189b3500b2d0d59dfab06c",
     "http://festvox.org/cmu_arctic/packed/cmu_us_aup_arctic.tar.bz2":
-    "5301c7aee8919d2abd632e2667adfa7f",
+    "2c55bc3050caa996758869126ad10cf42e1441212111db034b3a45189c18b6fc",
     "http://festvox.org/cmu_arctic/packed/cmu_us_awb_arctic.tar.bz2":
-    "280fdff1e9857119d9a2c57b50e12db7",
+    "d74a950c9739a65f7bfc4dfa6187f2730fa03de5b8eb3f2da97a51b74df64d3c",
     "http://festvox.org/cmu_arctic/packed/cmu_us_axb_arctic.tar.bz2":
-    "5e21cb26c6529c533df1d02ccde5a186",
+    "dd65c3d2907d1ee52f86e44f578319159e60f4bf722a9142be01161d84e330ff",
     "http://festvox.org/cmu_arctic/packed/cmu_us_bdl_arctic.tar.bz2":
-    "b2c3e558f656af2e0a65da0ac0c3377a",
+    "26b91aaf48b2799b2956792b4632c2f926cd0542f402b5452d5adecb60942904",
     "http://festvox.org/cmu_arctic/packed/cmu_us_clb_arctic.tar.bz2":
-    "3957c503748e3ce17a3b73c1b9861fb0",
+    "3f16dc3f3b97955ea22623efb33b444341013fc660677b2e170efdcc959fa7c6",
     "http://festvox.org/cmu_arctic/packed/cmu_us_eey_arctic.tar.bz2":
-    "59708e932d27664f9eda3e8e6859969b",
+    "8a0ee4e5acbd4b2f61a4fb947c1730ab3adcc9dc50b195981d99391d29928e8a",
     "http://festvox.org/cmu_arctic/packed/cmu_us_fem_arctic.tar.bz2":
-    "dba4f992ff023347c07c304bf72f4c73",
+    "3fcff629412b57233589cdb058f730594a62c4f3a75c20de14afe06621ef45e2",
     "http://festvox.org/cmu_arctic/packed/cmu_us_gka_arctic.tar.bz2":
-    "24a876ea7335c1b0ff21460e1241340f",
+    "dc82e7967cbd5eddbed33074b0699128dbd4482b41711916d58103707e38c67f",
     "http://festvox.org/cmu_arctic/packed/cmu_us_jmk_arctic.tar.bz2":
-    "afb69d95f02350537e8a28df5ab6004b",
+    "3a37c0e1dfc91e734fdbc88b562d9e2ebca621772402cdc693bbc9b09b211d73",
     "http://festvox.org/cmu_arctic/packed/cmu_us_ksp_arctic.tar.bz2":
-    "4ce5b3b91a0a54b6b685b1b05aa0b3be",
+    "8029cafce8296f9bed3022c44ef1e7953332b6bf6943c14b929f468122532717",
     "http://festvox.org/cmu_arctic/packed/cmu_us_ljm_arctic.tar.bz2":
-    "6f45a3b2c86a4ed0465b353be291f77d",
+    "b23993765cbf2b9e7bbc3c85b6c56eaf292ac81ee4bb887b638a24d104f921a0",
     "http://festvox.org/cmu_arctic/packed/cmu_us_lnh_arctic.tar.bz2":
-    "c6a15abad5c14d27f4ee856502f0232f",
+    "4faf34d71aa7112813252fb20c5433e2fdd9a9de55a00701ffcbf05f24a5991a",
     "http://festvox.org/cmu_arctic/packed/cmu_us_rms_arctic.tar.bz2":
-    "71072c983df1e590d9e9519e2a621f6e",
+    "c6dc11235629c58441c071a7ba8a2d067903dfefbaabc4056d87da35b72ecda4",
     "http://festvox.org/cmu_arctic/packed/cmu_us_rxr_arctic.tar.bz2":
-    "3771ff03a2f5b5c3b53aa0a68b9ad0d5",
+    "1fa4271c393e5998d200e56c102ff46fcfea169aaa2148ad9e9469616fbfdd9b",
     "http://festvox.org/cmu_arctic/packed/cmu_us_slp_arctic.tar.bz2":
-    "9cbf984a832ea01b5058ba9a96862850",
+    "54345ed55e45c23d419e9a823eef427f1cc93c83a710735ec667d068c916abf1",
     "http://festvox.org/cmu_arctic/packed/cmu_us_slt_arctic.tar.bz2":
-    "959eecb2cbbc4ac304c6b92269380c81",
+    "7c173297916acf3cc7fcab2713be4c60b27312316765a90934651d367226b4ea",
 }
@@ -148,7 +148,7 @@ class CMUARCTIC(Dataset):
         if not os.path.isdir(self._path):
             if not os.path.isfile(archive):
                 checksum = _CHECKSUMS.get(url, None)
-                download_url(url, root, hash_value=checksum, hash_type="md5")
+                download_url_to_file(url, archive, hash_prefix=checksum)
             extract_archive(archive)
         self._text = os.path.join(self._path, self._folder_text, self._file_text)
......
@@ -4,13 +4,13 @@ from pathlib import Path
 from typing import Iterable, Tuple, Union, List
 from torch.utils.data import Dataset
-from torchaudio.datasets.utils import download_url
+from torch.hub import download_url_to_file
 _CHECKSUMS = {
     "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b":
-    "825f4ebd9183f2417df9f067a9cabe86",
+    "209a8b4cd265013e96f4658632a9878103b0c5abf62b50d4ef3ae1be226b29e4",
     "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b.symbols":
-    "385e490aabc71b48e772118e3d02923e",
+    "408ccaae803641c6d7b626b6299949320c2dbca96b2220fd3fb17887b023b027",
 }
 _PUNCTUATIONS = set([
     "!EXCLAMATION-POINT",
@@ -144,14 +144,14 @@ class CMUDict(Dataset):
                     'The dictionary file is not found in the following location. '
                     f'Set `download=True` to download it. {dict_file}')
             checksum = _CHECKSUMS.get(url, None)
-            download_url(url, root, hash_value=checksum, hash_type="md5")
+            download_url_to_file(url, dict_file, checksum)
         if not os.path.exists(symbol_file):
             if not download:
                 raise RuntimeError(
                     'The symbol file is not found in the following location. '
                     f'Set `download=True` to download it. {symbol_file}')
             checksum = _CHECKSUMS.get(url_symbols, None)
-            download_url(url_symbols, root, hash_value=checksum, hash_type="md5")
+            download_url_to_file(url_symbols, symbol_file, checksum)
         with open(symbol_file, "r") as text:
             self._symbols = [line.strip() for line in text.readlines()]
......
@@ -3,17 +3,16 @@ from typing import Dict, Tuple, Union
 from torch import Tensor
 from torch.utils.data import Dataset
+from torch.hub import download_url_to_file
 import torchaudio
 from torchaudio.datasets.utils import (
-    download_url,
     extract_archive,
-    validate_file,
 )
 _URL = "https://datashare.ed.ac.uk/bitstream/handle/10283/3038/DR-VCTK.zip"
-_CHECKSUM = "29e93debeb0e779986542229a81ff29b"
+_CHECKSUM = "781f12f4406ed36ed27ae3bce55da47ba176e2d8bae67319e389e07b2c9bd769"
 _SUPPORTED_SUBSETS = {"train", "test"}
@@ -55,20 +54,12 @@ class DR_VCTK(Dataset):
         if not archive.is_file():
             if not download:
                 raise RuntimeError("Dataset not found. Please use `download=True` to download it.")
-            download_url(url, root)
-            self._validate_checksum(archive)
+            download_url_to_file(url, archive, hash_prefix=_CHECKSUM)
             extract_archive(archive, root)
         self._config = self._load_config(self._config_filepath)
         self._filename_list = sorted(self._config)
-    def _validate_checksum(self, archive):
-        with open(archive, "rb") as file_obj:
-            if not validate_file(file_obj, _CHECKSUM, "md5"):
-                raise RuntimeError(
-                    f"The hash of {str(archive)} does not match. Delete the file manually and retry."
-                )
     def _load_config(self, filepath: str) -> Dict[str, Tuple[str, int]]:
         # Skip header
         skip_rows = 2 if self._subset == "train" else 1
......
@@ -5,8 +5,8 @@ from typing import Tuple, Optional, Union
 import torchaudio
 from torch import Tensor
 from torch.utils.data import Dataset
+from torch.hub import download_url_to_file
 from torchaudio.datasets.utils import (
-    download_url,
     extract_archive,
 )
@@ -977,7 +977,7 @@ filtered_valid = [
 URL = "http://opihi.cs.uvic.ca/sound/genres.tar.gz"
 FOLDER_IN_ARCHIVE = "genres"
 _CHECKSUMS = {
-    "http://opihi.cs.uvic.ca/sound/genres.tar.gz": "5b3d6dddb579ab49814ab86dba69e7c7"
+    "http://opihi.cs.uvic.ca/sound/genres.tar.gz": "24347e0223d2ba798e0a558c4c172d9d4a19c00bb7963fe055d183dadb4ef2c6"
 }
@@ -1051,7 +1051,7 @@ class GTZAN(Dataset):
         if not os.path.isdir(self._path):
             if not os.path.isfile(archive):
                 checksum = _CHECKSUMS.get(url, None)
-                download_url(url, root, hash_value=checksum, hash_type="md5")
+                download_url_to_file(url, archive, hash_prefix=checksum)
             extract_archive(archive)
         if not os.path.isdir(self._path):
......
@@ -5,8 +5,9 @@ from pathlib import Path
 import torchaudio
 from torch import Tensor
 from torch.utils.data import Dataset
+from torch.hub import download_url_to_file
 from torchaudio.datasets.utils import (
-    download_url,
     extract_archive,
 )
@@ -121,7 +122,7 @@ class LIBRISPEECH(Dataset):
         if not os.path.isdir(self._path):
             if not os.path.isfile(archive):
                 checksum = _CHECKSUMS.get(url, None)
-                download_url(url, root, hash_value=checksum)
+                download_url_to_file(url, archive, hash_prefix=checksum)
             extract_archive(archive)
         self._walker = sorted(str(p.stem) for p in Path(self._path).glob('*/*/*' + self._ext_audio))
......
@@ -5,21 +5,28 @@ from pathlib import Path
 import torchaudio
 from torch import Tensor
 from torch.utils.data import Dataset
+from torch.hub import download_url_to_file
 from torchaudio.datasets.utils import (
-    download_url,
     extract_archive,
 )
 URL = "train-clean-100"
 FOLDER_IN_ARCHIVE = "LibriTTS"
 _CHECKSUMS = {
-    "http://www.openslr.org/resources/60/dev-clean.tar.gz": "0c3076c1e5245bb3f0af7d82087ee207",
-    "http://www.openslr.org/resources/60/dev-other.tar.gz": "815555d8d75995782ac3ccd7f047213d",
-    "http://www.openslr.org/resources/60/test-clean.tar.gz": "7bed3bdb047c4c197f1ad3bc412db59f",
-    "http://www.openslr.org/resources/60/test-other.tar.gz": "ae3258249472a13b5abef2a816f733e4",
-    "http://www.openslr.org/resources/60/train-clean-100.tar.gz": "4a8c202b78fe1bc0c47916a98f3a2ea8",
-    "http://www.openslr.org/resources/60/train-clean-360.tar.gz": "a84ef10ddade5fd25df69596a2767b2d",
-    "http://www.openslr.org/resources/60/train-other-500.tar.gz": "7b181dd5ace343a5f38427999684aa6f",
+    "http://www.openslr.org/resources/60/dev-clean.tar.gz":
+    "da0864e1bd26debed35da8a869dd5c04dfc27682921936de7cff9c8a254dbe1a",
+    "http://www.openslr.org/resources/60/dev-other.tar.gz":
+    "d413eda26f3a152ac7c9cf3658ef85504dfb1b625296e5fa83727f5186cca79c",
+    "http://www.openslr.org/resources/60/test-clean.tar.gz":
+    "234ea5b25859102a87024a4b9b86641f5b5aaaf1197335c95090cde04fe9a4f5",
+    "http://www.openslr.org/resources/60/test-other.tar.gz":
+    "33a5342094f3bba7ccc2e0500b9e72d558f72eb99328ac8debe1d9080402f10d",
+    "http://www.openslr.org/resources/60/train-clean-100.tar.gz":
+    "c5608bf1ef74bb621935382b8399c5cdd51cd3ee47cec51f00f885a64c6c7f6b",
+    "http://www.openslr.org/resources/60/train-clean-360.tar.gz":
+    "ce7cff44dcac46009d18379f37ef36551123a1dc4e5c8e4eb73ae57260de4886",
+    "http://www.openslr.org/resources/60/train-other-500.tar.gz":
+    "e35f7e34deeb2e2bdfe4403d88c8fdd5fbf64865cae41f027a185a6965f0a5df",
 }
@@ -122,7 +129,7 @@ class LIBRITTS(Dataset):
         if not os.path.isdir(self._path):
             if not os.path.isfile(archive):
                 checksum = _CHECKSUMS.get(url, None)
-                download_url(url, root, hash_value=checksum)
+                download_url_to_file(url, archive, hash_prefix=checksum)
             extract_archive(archive)
         self._walker = sorted(str(p.stem) for p in Path(self._path).glob('*/*/*' + self._ext_audio))
......
@@ -4,9 +4,11 @@ from typing import Tuple, Union
 from pathlib import Path
 import torchaudio
-from torchaudio.datasets.utils import download_url, extract_archive
+from torchaudio.datasets.utils import extract_archive
 from torch import Tensor
 from torch.utils.data import Dataset
+from torch.hub import download_url_to_file
 _RELEASE_CONFIGS = {
     "release1": {
@@ -54,7 +56,7 @@ class LJSPEECH(Dataset):
         if not os.path.isdir(self._path):
             if not os.path.isfile(archive):
                 checksum = _RELEASE_CONFIGS["release1"]["checksum"]
-                download_url(url, root, hash_value=checksum)
+                download_url_to_file(url, archive, hash_prefix=checksum)
             extract_archive(archive)
         with open(self._metadata_path, "r", newline='') as metadata:
......
@@ -5,8 +5,9 @@ from pathlib import Path
 import torchaudio
 from torch.utils.data import Dataset
 from torch import Tensor
+from torch.hub import download_url_to_file
 from torchaudio.datasets.utils import (
-    download_url,
     extract_archive,
 )
@@ -16,9 +17,9 @@ HASH_DIVIDER = "_nohash_"
 EXCEPT_FOLDER = "_background_noise_"
 _CHECKSUMS = {
     "https://storage.googleapis.com/download.tensorflow.org/data/speech_commands_v0.01.tar.gz":
-    "3cd23799cb2bbdec517f1cc028f8d43c",
+    "743935421bb51cccdb6bdd152e04c5c70274e935c82119ad7faeec31780d811d",
     "https://storage.googleapis.com/download.tensorflow.org/data/speech_commands_v0.02.tar.gz":
-    "6b74f3901214cb2c2934e98196829835",
+    "af14739ee7dc311471de98f5f9d2c9191b18aedfe957f4a6ff791c709868ff58",
 }
@@ -111,7 +112,7 @@ class SPEECHCOMMANDS(Dataset):
         if not os.path.isdir(self._path):
             if not os.path.isfile(archive):
                 checksum = _CHECKSUMS.get(url, None)
-                download_url(url, root, hash_value=checksum, hash_type="md5")
+                download_url_to_file(url, archive, hash_prefix=checksum)
             extract_archive(archive, self._path)
         if subset == "validation":
......
@@ -5,8 +5,9 @@ from pathlib import Path
 import torchaudio
 from torch import Tensor
 from torch.utils.data import Dataset
+from torch.hub import download_url_to_file
 from torchaudio.datasets.utils import (
-    download_url,
     extract_archive,
 )
@@ -101,7 +102,7 @@ class TEDLIUM(Dataset):
         if not os.path.isdir(self._path):
             if not os.path.isfile(archive):
                 checksum = _RELEASE_CONFIGS[release]["checksum"]
-                download_url(url, root, hash_value=checksum)
+                download_url_to_file(url, archive, hash_prefix=checksum)
             extract_archive(archive)
         # Create list for all samples
......
@@ -5,6 +5,7 @@ import tarfile
 import urllib
 import urllib.request
 import zipfile
+import warnings
 from typing import Any, Iterable, List, Optional
 from torch.utils.model_zoo import tqdm
@@ -71,7 +72,7 @@ def download_url(url: str,
         progress_bar (bool, optional): Display a progress bar (Default: ``True``).
         resume (bool, optional): Enable resuming download (Default: ``False``).
     """
+    warnings.warn("download_url is deprecated and will be removed in the v0.12 release.")
     req = urllib.request.Request(url, method="HEAD")
     req_info = urllib.request.urlopen(req).info()
......
@@ -3,16 +3,18 @@ from typing import Tuple
 from torch import Tensor
 from torch.utils.data import Dataset
+from torch.hub import download_url_to_file
 import torchaudio
 from torchaudio.datasets.utils import (
-    download_url,
     extract_archive,
 )
 URL = "https://datashare.is.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip"
 _CHECKSUMS = {
-    "https://datashare.is.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip": "8a6ba2946b36fcbef0212cad601f4bfa"
+    "https://datashare.is.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip":
+    "f96258be9fdc2cbff6559541aae7ea4f59df3fcaf5cf963aae5ca647357e359c"
 }
@@ -63,7 +65,7 @@ class VCTK_092(Dataset):
         if not os.path.isdir(self._path):
             if not os.path.isfile(archive):
                 checksum = _CHECKSUMS.get(url, None)
-                download_url(url, root, hash_value=checksum, hash_type="md5")
+                download_url_to_file(url, archive, hash_prefix=checksum)
             extract_archive(archive, self._path)
         if not os.path.isdir(self._path):
......
@@ -4,10 +4,10 @@ from typing import List, Tuple, Union
 from torch import Tensor
 from torch.utils.data import Dataset
+from torch.hub import download_url_to_file
 import torchaudio
 from torchaudio.datasets.utils import (
-    download_url,
     extract_archive,
 )
@@ -54,7 +54,7 @@ class YESNO(Dataset):
         if not os.path.isdir(self._path):
             if not os.path.isfile(archive):
                 checksum = _RELEASE_CONFIGS["release1"]["checksum"]
-                download_url(url, root, hash_value=checksum)
+                download_url_to_file(url, archive, hash_prefix=checksum)
             extract_archive(archive)
         if not os.path.isdir(self._path):
......