Commit b18e583e authored by Joao Gomes, committed by Facebook GitHub Bot

Deprecating data utils (#2073)

Summary:
- Deprecates data utils (with a warning that they will be removed in the v0.12 release)
- replaces all usages of `torchaudio.datasets.utils.download_url` with `torch.hub.download_url_to_file` (migration pattern sketched below)
- replaces all MD5 hashes with SHA256 hashes (regeneration sketch below)
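
The call pattern that changes across the dataset classes is sketched below. This is illustrative only: `url`, `archive`, and `checksum` stand in for the per-dataset values in the diff, and the example URL is a placeholder.

```python
import os
from torch.hub import download_url_to_file
from torchaudio.datasets.utils import extract_archive

url = "http://example.com/dataset.tar.gz"  # placeholder, not a real dataset URL
archive = os.path.join("data", "dataset.tar.gz")
checksum = None  # the per-dataset SHA256 value from _CHECKSUMS, when available

# Before (deprecated): download_url(url, root, hash_value=checksum, hash_type="md5")
# After: torch.hub.download_url_to_file downloads to an explicit destination path
# and verifies that the SHA256 digest of the file starts with `hash_prefix`.
if not os.path.isfile(archive):
    download_url_to_file(url, archive, hash_prefix=checksum)
extract_archive(archive)
```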

Addresses https://github.com/pytorch/audio/issues/1883
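
For reference, a minimal sketch (not part of this change) of how the new SHA256 values can be regenerated locally with Python's standard `hashlib`, assuming the archive has already been downloaded:

```python
import hashlib

def sha256_of(path: str, chunk_size: int = 1024 * 1024) -> str:
    # Stream the file in chunks so large archives do not have to fit in memory.
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

# e.g. sha256_of("cmu_us_aew_arctic.tar.bz2") should match the value stored in _CHECKSUMS.
```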

Pull Request resolved: https://github.com/pytorch/audio/pull/2073

Reviewed By: mthrok

Differential Revision: D33241756

Pulled By: jdsgomes

fbshipit-source-id: 49388ec5965bfc91d9a1d8d0786eeafb2969f6cf
parent 575d221e
@@ -6,8 +6,8 @@ from typing import Tuple, Union
 import torchaudio
 from torch import Tensor
 from torch.utils.data import Dataset
+from torch.hub import download_url_to_file
 from torchaudio.datasets.utils import (
-    download_url,
     extract_archive,
 )
@@ -15,41 +15,41 @@ URL = "aew"
 FOLDER_IN_ARCHIVE = "ARCTIC"
 _CHECKSUMS = {
     "http://festvox.org/cmu_arctic/packed/cmu_us_aew_arctic.tar.bz2":
-    "4382b116efcc8339c37e01253cb56295",
+    "645cb33c0f0b2ce41384fdd8d3db2c3f5fc15c1e688baeb74d2e08cab18ab406",
     "http://festvox.org/cmu_arctic/packed/cmu_us_ahw_arctic.tar.bz2":
-    "b072d6e961e3f36a2473042d097d6da9",
+    "024664adeb892809d646a3efd043625b46b5bfa3e6189b3500b2d0d59dfab06c",
     "http://festvox.org/cmu_arctic/packed/cmu_us_aup_arctic.tar.bz2":
-    "5301c7aee8919d2abd632e2667adfa7f",
+    "2c55bc3050caa996758869126ad10cf42e1441212111db034b3a45189c18b6fc",
     "http://festvox.org/cmu_arctic/packed/cmu_us_awb_arctic.tar.bz2":
-    "280fdff1e9857119d9a2c57b50e12db7",
+    "d74a950c9739a65f7bfc4dfa6187f2730fa03de5b8eb3f2da97a51b74df64d3c",
     "http://festvox.org/cmu_arctic/packed/cmu_us_axb_arctic.tar.bz2":
-    "5e21cb26c6529c533df1d02ccde5a186",
+    "dd65c3d2907d1ee52f86e44f578319159e60f4bf722a9142be01161d84e330ff",
     "http://festvox.org/cmu_arctic/packed/cmu_us_bdl_arctic.tar.bz2":
-    "b2c3e558f656af2e0a65da0ac0c3377a",
+    "26b91aaf48b2799b2956792b4632c2f926cd0542f402b5452d5adecb60942904",
     "http://festvox.org/cmu_arctic/packed/cmu_us_clb_arctic.tar.bz2":
-    "3957c503748e3ce17a3b73c1b9861fb0",
+    "3f16dc3f3b97955ea22623efb33b444341013fc660677b2e170efdcc959fa7c6",
     "http://festvox.org/cmu_arctic/packed/cmu_us_eey_arctic.tar.bz2":
-    "59708e932d27664f9eda3e8e6859969b",
+    "8a0ee4e5acbd4b2f61a4fb947c1730ab3adcc9dc50b195981d99391d29928e8a",
     "http://festvox.org/cmu_arctic/packed/cmu_us_fem_arctic.tar.bz2":
-    "dba4f992ff023347c07c304bf72f4c73",
+    "3fcff629412b57233589cdb058f730594a62c4f3a75c20de14afe06621ef45e2",
     "http://festvox.org/cmu_arctic/packed/cmu_us_gka_arctic.tar.bz2":
-    "24a876ea7335c1b0ff21460e1241340f",
+    "dc82e7967cbd5eddbed33074b0699128dbd4482b41711916d58103707e38c67f",
     "http://festvox.org/cmu_arctic/packed/cmu_us_jmk_arctic.tar.bz2":
-    "afb69d95f02350537e8a28df5ab6004b",
+    "3a37c0e1dfc91e734fdbc88b562d9e2ebca621772402cdc693bbc9b09b211d73",
     "http://festvox.org/cmu_arctic/packed/cmu_us_ksp_arctic.tar.bz2":
-    "4ce5b3b91a0a54b6b685b1b05aa0b3be",
+    "8029cafce8296f9bed3022c44ef1e7953332b6bf6943c14b929f468122532717",
     "http://festvox.org/cmu_arctic/packed/cmu_us_ljm_arctic.tar.bz2":
-    "6f45a3b2c86a4ed0465b353be291f77d",
+    "b23993765cbf2b9e7bbc3c85b6c56eaf292ac81ee4bb887b638a24d104f921a0",
     "http://festvox.org/cmu_arctic/packed/cmu_us_lnh_arctic.tar.bz2":
-    "c6a15abad5c14d27f4ee856502f0232f",
+    "4faf34d71aa7112813252fb20c5433e2fdd9a9de55a00701ffcbf05f24a5991a",
     "http://festvox.org/cmu_arctic/packed/cmu_us_rms_arctic.tar.bz2":
-    "71072c983df1e590d9e9519e2a621f6e",
+    "c6dc11235629c58441c071a7ba8a2d067903dfefbaabc4056d87da35b72ecda4",
     "http://festvox.org/cmu_arctic/packed/cmu_us_rxr_arctic.tar.bz2":
-    "3771ff03a2f5b5c3b53aa0a68b9ad0d5",
+    "1fa4271c393e5998d200e56c102ff46fcfea169aaa2148ad9e9469616fbfdd9b",
     "http://festvox.org/cmu_arctic/packed/cmu_us_slp_arctic.tar.bz2":
-    "9cbf984a832ea01b5058ba9a96862850",
+    "54345ed55e45c23d419e9a823eef427f1cc93c83a710735ec667d068c916abf1",
     "http://festvox.org/cmu_arctic/packed/cmu_us_slt_arctic.tar.bz2":
-    "959eecb2cbbc4ac304c6b92269380c81",
+    "7c173297916acf3cc7fcab2713be4c60b27312316765a90934651d367226b4ea",
 }
@@ -148,7 +148,7 @@ class CMUARCTIC(Dataset):
         if not os.path.isdir(self._path):
             if not os.path.isfile(archive):
                 checksum = _CHECKSUMS.get(url, None)
-                download_url(url, root, hash_value=checksum, hash_type="md5")
+                download_url_to_file(url, archive, hash_prefix=checksum)
             extract_archive(archive)
         self._text = os.path.join(self._path, self._folder_text, self._file_text)
......
@@ -4,13 +4,13 @@ from pathlib import Path
 from typing import Iterable, Tuple, Union, List
 from torch.utils.data import Dataset
-from torchaudio.datasets.utils import download_url
+from torch.hub import download_url_to_file
 _CHECKSUMS = {
     "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b":
-    "825f4ebd9183f2417df9f067a9cabe86",
+    "209a8b4cd265013e96f4658632a9878103b0c5abf62b50d4ef3ae1be226b29e4",
     "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b.symbols":
-    "385e490aabc71b48e772118e3d02923e",
+    "408ccaae803641c6d7b626b6299949320c2dbca96b2220fd3fb17887b023b027",
 }
 _PUNCTUATIONS = set([
     "!EXCLAMATION-POINT",
@@ -144,14 +144,14 @@ class CMUDict(Dataset):
                     'The dictionary file is not found in the following location. '
                     f'Set `download=True` to download it. {dict_file}')
             checksum = _CHECKSUMS.get(url, None)
-            download_url(url, root, hash_value=checksum, hash_type="md5")
+            download_url_to_file(url, dict_file, checksum)
         if not os.path.exists(symbol_file):
             if not download:
                 raise RuntimeError(
                     'The symbol file is not found in the following location. '
                     f'Set `download=True` to download it. {symbol_file}')
             checksum = _CHECKSUMS.get(url_symbols, None)
-            download_url(url_symbols, root, hash_value=checksum, hash_type="md5")
+            download_url_to_file(url_symbols, symbol_file, checksum)
         with open(symbol_file, "r") as text:
             self._symbols = [line.strip() for line in text.readlines()]
......
@@ -3,17 +3,16 @@ from typing import Dict, Tuple, Union
 from torch import Tensor
 from torch.utils.data import Dataset
+from torch.hub import download_url_to_file
 import torchaudio
 from torchaudio.datasets.utils import (
-    download_url,
     extract_archive,
-    validate_file,
 )
 _URL = "https://datashare.ed.ac.uk/bitstream/handle/10283/3038/DR-VCTK.zip"
-_CHECKSUM = "29e93debeb0e779986542229a81ff29b"
+_CHECKSUM = "781f12f4406ed36ed27ae3bce55da47ba176e2d8bae67319e389e07b2c9bd769"
 _SUPPORTED_SUBSETS = {"train", "test"}
@@ -55,20 +54,12 @@ class DR_VCTK(Dataset):
         if not archive.is_file():
             if not download:
                 raise RuntimeError("Dataset not found. Please use `download=True` to download it.")
-            download_url(url, root)
-            self._validate_checksum(archive)
+            download_url_to_file(url, archive, hash_prefix=_CHECKSUM)
             extract_archive(archive, root)
         self._config = self._load_config(self._config_filepath)
         self._filename_list = sorted(self._config)
-    def _validate_checksum(self, archive):
-        with open(archive, "rb") as file_obj:
-            if not validate_file(file_obj, _CHECKSUM, "md5"):
-                raise RuntimeError(
-                    f"The hash of {str(archive)} does not match. Delete the file manually and retry."
-                )
     def _load_config(self, filepath: str) -> Dict[str, Tuple[str, int]]:
         # Skip header
         skip_rows = 2 if self._subset == "train" else 1
......
@@ -5,8 +5,8 @@ from typing import Tuple, Optional, Union
 import torchaudio
 from torch import Tensor
 from torch.utils.data import Dataset
+from torch.hub import download_url_to_file
 from torchaudio.datasets.utils import (
-    download_url,
     extract_archive,
 )
@@ -977,7 +977,7 @@ filtered_valid = [
 URL = "http://opihi.cs.uvic.ca/sound/genres.tar.gz"
 FOLDER_IN_ARCHIVE = "genres"
 _CHECKSUMS = {
-    "http://opihi.cs.uvic.ca/sound/genres.tar.gz": "5b3d6dddb579ab49814ab86dba69e7c7"
+    "http://opihi.cs.uvic.ca/sound/genres.tar.gz": "24347e0223d2ba798e0a558c4c172d9d4a19c00bb7963fe055d183dadb4ef2c6"
 }
@@ -1051,7 +1051,7 @@ class GTZAN(Dataset):
         if not os.path.isdir(self._path):
             if not os.path.isfile(archive):
                 checksum = _CHECKSUMS.get(url, None)
-                download_url(url, root, hash_value=checksum, hash_type="md5")
+                download_url_to_file(url, archive, hash_prefix=checksum)
             extract_archive(archive)
         if not os.path.isdir(self._path):
......
@@ -5,8 +5,9 @@ from pathlib import Path
 import torchaudio
 from torch import Tensor
 from torch.utils.data import Dataset
+from torch.hub import download_url_to_file
 from torchaudio.datasets.utils import (
-    download_url,
     extract_archive,
 )
@@ -121,7 +122,7 @@ class LIBRISPEECH(Dataset):
         if not os.path.isdir(self._path):
             if not os.path.isfile(archive):
                 checksum = _CHECKSUMS.get(url, None)
-                download_url(url, root, hash_value=checksum)
+                download_url_to_file(url, archive, hash_prefix=checksum)
             extract_archive(archive)
         self._walker = sorted(str(p.stem) for p in Path(self._path).glob('*/*/*' + self._ext_audio))
......
@@ -5,21 +5,28 @@ from pathlib import Path
 import torchaudio
 from torch import Tensor
 from torch.utils.data import Dataset
+from torch.hub import download_url_to_file
 from torchaudio.datasets.utils import (
-    download_url,
     extract_archive,
 )
 URL = "train-clean-100"
 FOLDER_IN_ARCHIVE = "LibriTTS"
 _CHECKSUMS = {
-    "http://www.openslr.org/resources/60/dev-clean.tar.gz": "0c3076c1e5245bb3f0af7d82087ee207",
-    "http://www.openslr.org/resources/60/dev-other.tar.gz": "815555d8d75995782ac3ccd7f047213d",
-    "http://www.openslr.org/resources/60/test-clean.tar.gz": "7bed3bdb047c4c197f1ad3bc412db59f",
-    "http://www.openslr.org/resources/60/test-other.tar.gz": "ae3258249472a13b5abef2a816f733e4",
-    "http://www.openslr.org/resources/60/train-clean-100.tar.gz": "4a8c202b78fe1bc0c47916a98f3a2ea8",
-    "http://www.openslr.org/resources/60/train-clean-360.tar.gz": "a84ef10ddade5fd25df69596a2767b2d",
-    "http://www.openslr.org/resources/60/train-other-500.tar.gz": "7b181dd5ace343a5f38427999684aa6f",
+    "http://www.openslr.org/resources/60/dev-clean.tar.gz":
+    "da0864e1bd26debed35da8a869dd5c04dfc27682921936de7cff9c8a254dbe1a",
+    "http://www.openslr.org/resources/60/dev-other.tar.gz":
+    "d413eda26f3a152ac7c9cf3658ef85504dfb1b625296e5fa83727f5186cca79c",
+    "http://www.openslr.org/resources/60/test-clean.tar.gz":
+    "234ea5b25859102a87024a4b9b86641f5b5aaaf1197335c95090cde04fe9a4f5",
+    "http://www.openslr.org/resources/60/test-other.tar.gz":
+    "33a5342094f3bba7ccc2e0500b9e72d558f72eb99328ac8debe1d9080402f10d",
+    "http://www.openslr.org/resources/60/train-clean-100.tar.gz":
+    "c5608bf1ef74bb621935382b8399c5cdd51cd3ee47cec51f00f885a64c6c7f6b",
+    "http://www.openslr.org/resources/60/train-clean-360.tar.gz":
+    "ce7cff44dcac46009d18379f37ef36551123a1dc4e5c8e4eb73ae57260de4886",
+    "http://www.openslr.org/resources/60/train-other-500.tar.gz":
+    "e35f7e34deeb2e2bdfe4403d88c8fdd5fbf64865cae41f027a185a6965f0a5df",
 }
@@ -122,7 +129,7 @@ class LIBRITTS(Dataset):
         if not os.path.isdir(self._path):
             if not os.path.isfile(archive):
                 checksum = _CHECKSUMS.get(url, None)
-                download_url(url, root, hash_value=checksum)
+                download_url_to_file(url, archive, hash_prefix=checksum)
             extract_archive(archive)
         self._walker = sorted(str(p.stem) for p in Path(self._path).glob('*/*/*' + self._ext_audio))
......
@@ -4,9 +4,11 @@ from typing import Tuple, Union
 from pathlib import Path
 import torchaudio
-from torchaudio.datasets.utils import download_url, extract_archive
+from torchaudio.datasets.utils import extract_archive
 from torch import Tensor
 from torch.utils.data import Dataset
+from torch.hub import download_url_to_file
 _RELEASE_CONFIGS = {
     "release1": {
@@ -54,7 +56,7 @@ class LJSPEECH(Dataset):
         if not os.path.isdir(self._path):
             if not os.path.isfile(archive):
                 checksum = _RELEASE_CONFIGS["release1"]["checksum"]
-                download_url(url, root, hash_value=checksum)
+                download_url_to_file(url, archive, hash_prefix=checksum)
             extract_archive(archive)
         with open(self._metadata_path, "r", newline='') as metadata:
......
@@ -5,8 +5,9 @@ from pathlib import Path
 import torchaudio
 from torch.utils.data import Dataset
 from torch import Tensor
+from torch.hub import download_url_to_file
 from torchaudio.datasets.utils import (
-    download_url,
     extract_archive,
 )
@@ -16,9 +17,9 @@ HASH_DIVIDER = "_nohash_"
 EXCEPT_FOLDER = "_background_noise_"
 _CHECKSUMS = {
     "https://storage.googleapis.com/download.tensorflow.org/data/speech_commands_v0.01.tar.gz":
-    "3cd23799cb2bbdec517f1cc028f8d43c",
+    "743935421bb51cccdb6bdd152e04c5c70274e935c82119ad7faeec31780d811d",
     "https://storage.googleapis.com/download.tensorflow.org/data/speech_commands_v0.02.tar.gz":
-    "6b74f3901214cb2c2934e98196829835",
+    "af14739ee7dc311471de98f5f9d2c9191b18aedfe957f4a6ff791c709868ff58",
 }
@@ -111,7 +112,7 @@ class SPEECHCOMMANDS(Dataset):
         if not os.path.isdir(self._path):
             if not os.path.isfile(archive):
                 checksum = _CHECKSUMS.get(url, None)
-                download_url(url, root, hash_value=checksum, hash_type="md5")
+                download_url_to_file(url, archive, hash_prefix=checksum)
             extract_archive(archive, self._path)
         if subset == "validation":
......
@@ -5,8 +5,9 @@ from pathlib import Path
 import torchaudio
 from torch import Tensor
 from torch.utils.data import Dataset
+from torch.hub import download_url_to_file
 from torchaudio.datasets.utils import (
-    download_url,
     extract_archive,
 )
@@ -101,7 +102,7 @@ class TEDLIUM(Dataset):
         if not os.path.isdir(self._path):
             if not os.path.isfile(archive):
                 checksum = _RELEASE_CONFIGS[release]["checksum"]
-                download_url(url, root, hash_value=checksum)
+                download_url_to_file(url, archive, hash_prefix=checksum)
             extract_archive(archive)
         # Create list for all samples
......
@@ -5,6 +5,7 @@ import tarfile
 import urllib
 import urllib.request
 import zipfile
+import warnings
 from typing import Any, Iterable, List, Optional
 from torch.utils.model_zoo import tqdm
@@ -71,7 +72,7 @@ def download_url(url: str,
         progress_bar (bool, optional): Display a progress bar (Default: ``True``).
         resume (bool, optional): Enable resuming download (Default: ``False``).
     """
+    warnings.warn("download_url is deprecated and will be removed in the v0.12 release.")
     req = urllib.request.Request(url, method="HEAD")
     req_info = urllib.request.urlopen(req).info()
......
@@ -3,16 +3,18 @@ from typing import Tuple
 from torch import Tensor
 from torch.utils.data import Dataset
+from torch.hub import download_url_to_file
 import torchaudio
 from torchaudio.datasets.utils import (
-    download_url,
     extract_archive,
 )
 URL = "https://datashare.is.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip"
 _CHECKSUMS = {
-    "https://datashare.is.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip": "8a6ba2946b36fcbef0212cad601f4bfa"
+    "https://datashare.is.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip":
+    "f96258be9fdc2cbff6559541aae7ea4f59df3fcaf5cf963aae5ca647357e359c"
 }
@@ -63,7 +65,7 @@ class VCTK_092(Dataset):
         if not os.path.isdir(self._path):
             if not os.path.isfile(archive):
                 checksum = _CHECKSUMS.get(url, None)
-                download_url(url, root, hash_value=checksum, hash_type="md5")
+                download_url_to_file(url, archive, hash_prefix=checksum)
             extract_archive(archive, self._path)
         if not os.path.isdir(self._path):
......
@@ -4,10 +4,10 @@ from typing import List, Tuple, Union
 from torch import Tensor
 from torch.utils.data import Dataset
+from torch.hub import download_url_to_file
 import torchaudio
 from torchaudio.datasets.utils import (
-    download_url,
     extract_archive,
 )
@@ -54,7 +54,7 @@ class YESNO(Dataset):
         if not os.path.isdir(self._path):
             if not os.path.isfile(archive):
                 checksum = _RELEASE_CONFIGS["release1"]["checksum"]
-                download_url(url, root, hash_value=checksum)
+                download_url_to_file(url, archive, hash_prefix=checksum)
             extract_archive(archive)
         if not os.path.isdir(self._path):
......