Unverified commit c53ceb84, authored by Bhargav Kathivarapu, committed by GitHub
Browse files

Add datasets checksum (#499)

* add checksums

* checksum function changes

* function Docstring change

* checksums moved to Dataset Modules
parent 27a0f765
......@@ -18,6 +18,66 @@ FOLDER_IN_ARCHIVE = "CommonVoice"
URL = "english"
VERSION = "cv-corpus-4-2019-12-10"
TSV = "train.tsv"
# Lookup table from Common Voice archive URL to the expected digest of that
# archive. It is consulted with `_CHECKSUMS.get(url, None)` before downloading.
# Every entry is None, i.e. no digest is recorded for any of these archives.
# NOTE(review): the keys point at "cv-corpus-3" while VERSION in this module
# is "cv-corpus-4-2019-12-10" - confirm the lookup URL can match these keys.
_CHECKSUMS = dict.fromkeys(
    (
        "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/"
        + language_code
        + ".tar.gz"
        for language_code in (
            "tt",
            "en",
            "de",
            "fr",
            "cy",
            "br",
            "cv",
            "tr",
            "ky",
            "ga-IE",
            "kab",
            "ca",
            "zh-TW",
            "sl",
            "it",
            "nl",
            "cnh",
            "eo",
            "et",
            "fa",
            "eu",
            "es",
            "zh-CN",
            "mn",
            "sah",
            "dv",
            "rw",
            "sv-SE",
            "ru",
        )
    ),
    None,
)
def load_commonvoice_item(line: List[str],
......@@ -120,8 +180,9 @@ class COMMONVOICE(Dataset):
if download:
if not os.path.isdir(self._path):
if not os.path.isfile(archive):
download_url(url, root)
extract_archive(archive, self._path)
checksum = _CHECKSUMS.get(url, None)
download_url(url, root, hash_value=checksum)
extract_archive(archive)
self._tsv = os.path.join(root, folder_in_archive, tsv)
......
......@@ -12,6 +12,22 @@ from torchaudio.datasets.utils import (
URL = "train-clean-100"
FOLDER_IN_ARCHIVE = "LibriSpeech"
# Lookup table from LibriSpeech archive URL to the expected hex digest of the
# downloaded file (64 hex characters each). The value is handed to
# `download_url` as `hash_value` without an explicit `hash_type`.
_CHECKSUMS = {
    "http://www.openslr.org/resources/12/" + subset + ".tar.gz": digest
    for subset, digest in (
        ("dev-clean", "76f87d090650617fca0cac8f88b9416e0ebf80350acb97b343a85fa903728ab3"),
        ("dev-other", "12661c48e8c3fe1de2c1caa4c3e135193bfb1811584f11f569dd12645aa84365"),
        ("test-clean", "39fde525e59672dc6d1551919b1478f724438a95aa55f874b576be21967e6c23"),
        ("test-other", "d09c181bba5cf717b3dee7d4d592af11a3ee3a09e08ae025c5506f6ebe961c29"),
        ("train-clean-100", "d4ddd1d5a6ab303066f14971d768ee43278a5f2a0aa43dc716b0e64ecbbbf6e2"),
        ("train-clean-360", "146a56496217e96c14334a160df97fffedd6e0a04e66b9c5af0d40be3c792ecf"),
        ("train-other-500", "ddb22f27f96ec163645d53215559df6aa36515f26e01dd70798188350adcb6d2"),
    )
}
def load_librispeech_item(fileid: str,
......@@ -91,7 +107,8 @@ class LIBRISPEECH(Dataset):
if download:
if not os.path.isdir(self._path):
if not os.path.isfile(archive):
download_url(url, root)
checksum = _CHECKSUMS.get(url, None)
download_url(url, root, hash_value=checksum)
extract_archive(archive)
walker = walk_files(
......
......@@ -9,6 +9,10 @@ from torch.utils.data import Dataset
# Default download location of the LJSpeech-1.1 archive.
URL = "https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2"
FOLDER_IN_ARCHIVE = "wavs"
# Expected hex digest of the downloaded archive, keyed by its URL. Keying on
# URL keeps the table entry and the default download location in sync.
_CHECKSUMS = {
    URL: "be1a30453f28eb8dd26af4101ae40cbf2c50413b1bb21936cbcdc6fae3de8aa5",
}
def load_ljspeech_item(line: List[str], path: str, ext_audio: str) -> Tuple[Tensor, int, str, str]:
......@@ -55,7 +59,8 @@ class LJSPEECH(Dataset):
if download:
if not os.path.isdir(self._path):
if not os.path.isfile(archive):
download_url(url, root)
checksum = _CHECKSUMS.get(url, None)
download_url(url, root, hash_value=checksum)
extract_archive(archive)
with open(self._metadata_path, "r") as metadata:
......
......@@ -14,6 +14,12 @@ FOLDER_IN_ARCHIVE = "SpeechCommands"
URL = "speech_commands_v0.02"
HASH_DIVIDER = "_nohash_"
EXCEPT_FOLDER = "_background_noise_"
# Lookup table from SpeechCommands archive URL to the expected hex digest of
# the downloaded file (32 hex characters each). The call site passes the
# value to `download_url` together with hash_type="md5".
_CHECKSUMS = {
    "https://storage.googleapis.com/download.tensorflow.org/data/speech_commands_"
    + release
    + ".tar.gz": digest
    for release, digest in (
        ("v0.01", "3cd23799cb2bbdec517f1cc028f8d43c"),
        ("v0.02", "6b74f3901214cb2c2934e98196829835"),
    )
}
def load_speechcommands_item(filepath: str, path: str) -> Tuple[Tensor, int, str, str, int]:
......@@ -60,7 +66,8 @@ class SPEECHCOMMANDS(Dataset):
if download:
if not os.path.isdir(self._path):
if not os.path.isfile(archive):
download_url(url, root)
checksum = _CHECKSUMS.get(url, None)
download_url(url, root, hash_value=checksum, hash_type="md5")
extract_archive(archive, self._path)
walker = walk_files(self._path, suffix=".wav", prefix=True)
......
......@@ -5,10 +5,18 @@ from typing import Any, Tuple
import torchaudio
from torch import Tensor
from torch.utils.data import Dataset
from torchaudio.datasets.utils import download_url, extract_archive, walk_files
from torchaudio.datasets.utils import (
download_url,
extract_archive,
walk_files
)
# Default download location of the VCTK corpus archive.
URL = "http://homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz"
FOLDER_IN_ARCHIVE = "VCTK-Corpus"
# Expected hex digest of the downloaded archive, keyed by its URL. The call
# site passes the value to `download_url` together with hash_type="md5".
_CHECKSUMS = {
    URL: "45e8dede780278ef5541fde0b82ac292",
}
def load_vctk_item(fileid: str,
......@@ -90,7 +98,8 @@ class VCTK(Dataset):
if download:
if not os.path.isdir(self._path):
if not os.path.isfile(archive):
download_url(url, root)
checksum = _CHECKSUMS.get(url, None)
download_url(url, root, hash_value=checksum, hash_type="md5")
extract_archive(archive)
if not os.path.isdir(self._path):
......
......@@ -5,10 +5,18 @@ from typing import Any, List, Tuple
import torchaudio
from torch import Tensor
from torch.utils.data import Dataset
from torchaudio.datasets.utils import download_url, extract_archive, walk_files
from torchaudio.datasets.utils import (
download_url,
extract_archive,
walk_files
)
# Default download location of the YESNO archive.
URL = "http://www.openslr.org/resources/1/waves_yesno.tar.gz"
FOLDER_IN_ARCHIVE = "waves_yesno"
# Expected hex digest of the downloaded archive, keyed by its URL. The call
# site passes the value to `download_url` together with hash_type="md5".
_CHECKSUMS = {
    URL: "962ff6e904d2df1126132ecec6978786",
}
def load_yesno_item(fileid: str, path: str, ext_audio: str) -> Tuple[Tensor, int, List[int]]:
......@@ -55,7 +63,8 @@ class YESNO(Dataset):
if download:
if not os.path.isdir(self._path):
if not os.path.isfile(archive):
download_url(url, root)
checksum = _CHECKSUMS.get(url, None)
download_url(url, root, hash_value=checksum, hash_type="md5")
extract_archive(archive)
if not os.path.isdir(self._path):
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment