Unverified commit ed6931e7, authored by Leo Gao, committed by GitHub
Browse files

Merge pull request #250 from bigscience-workshop/thomas/fix_best_download_version

best_download has a backward compatibility issue
parents cc238121 78824d7f
...@@ -21,7 +21,7 @@ class Arithmetic(Task): ...@@ -21,7 +21,7 @@ class Arithmetic(Task):
url = 'https://raw.githubusercontent.com/openai/gpt-3/master/data/' + file_name url = 'https://raw.githubusercontent.com/openai/gpt-3/master/data/' + file_name
if not os.path.exists(self.directory): if not os.path.exists(self.directory):
os.makedirs(self.directory) os.makedirs(self.directory)
download_file(url, self.directory+file_name, checksum) download_file(url, local_file=self.directory+file_name, expected_checksum=checksum)
self.set_docs() self.set_docs()
@abc.abstractmethod @abc.abstractmethod
......
...@@ -16,8 +16,8 @@ class CoQA(Task): ...@@ -16,8 +16,8 @@ class CoQA(Task):
sh ("""mkdir -p data/coqa""") sh ("""mkdir -p data/coqa""")
download_file("http://downloads.cs.stanford.edu/nlp/data/coqa/coqa-train-v1.0.json", coqa_train_filepath, "b0fdb2bc1bd38dd3ca2ce5fa2ac3e02c6288ac914f241ac409a655ffb6619fa6") download_file("http://downloads.cs.stanford.edu/nlp/data/coqa/coqa-train-v1.0.json", local_file=coqa_train_filepath, expected_checksum="b0fdb2bc1bd38dd3ca2ce5fa2ac3e02c6288ac914f241ac409a655ffb6619fa6")
download_file("http://downloads.cs.stanford.edu/nlp/data/coqa/coqa-dev-v1.0.json", coqa_dev_filepath, "dfa367a9733ce53222918d0231d9b3bedc2b8ee831a2845f62dfc70701f2540a") download_file("http://downloads.cs.stanford.edu/nlp/data/coqa/coqa-dev-v1.0.json", local_file=coqa_dev_filepath, expected_checksum="dfa367a9733ce53222918d0231d9b3bedc2b8ee831a2845f62dfc70701f2540a")
def has_training_docs(self): def has_training_docs(self):
return True return True
......
...@@ -27,7 +27,7 @@ class DROP(Task): ...@@ -27,7 +27,7 @@ class DROP(Task):
url = "https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip" url = "https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip"
checksum = "39d2278a29fd729de301b111a45f434c24834f40df8f4ff116d864589e3249d6" checksum = "39d2278a29fd729de301b111a45f434c24834f40df8f4ff116d864589e3249d6"
zip_path = self.DATASET_PATH / "drop_dataset.zip" zip_path = self.DATASET_PATH / "drop_dataset.zip"
download_file(url, str(zip_path), checksum) download_file(url, local_file=str(zip_path), expected_checksum=checksum)
with ZipFile(zip_path, "r") as zip: with ZipFile(zip_path, "r") as zip:
zip.extractall(self.DATASET_PATH) zip.extractall(self.DATASET_PATH)
......
...@@ -20,7 +20,7 @@ class Ethics(Task): ...@@ -20,7 +20,7 @@ class Ethics(Task):
def download(self): def download(self):
if not os.path.exists('data/ethics/done'): if not os.path.exists('data/ethics/done'):
sh("mkdir -p data") sh("mkdir -p data")
download_file("https://people.eecs.berkeley.edu/~hendrycks/ethics.tar", "data/ethics.tar", "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333") download_file("https://people.eecs.berkeley.edu/~hendrycks/ethics.tar", local_file="data/ethics.tar", expected_checksum="40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333")
sh(""" sh("""
tar -xf data/ethics.tar -C data/ tar -xf data/ethics.tar -C data/
rm data/ethics.tar rm data/ethics.tar
......
...@@ -18,7 +18,7 @@ class Math(Task): ...@@ -18,7 +18,7 @@ class Math(Task):
def download(self): def download(self):
if not (self.DATASET_PATH / 'test').exists() or not (self.DATASET_PATH / 'done').exists(): if not (self.DATASET_PATH / 'test').exists() or not (self.DATASET_PATH / 'done').exists():
sh(f"mkdir -p {self.DATASET_PATH}") sh(f"mkdir -p {self.DATASET_PATH}")
download_file("https://people.eecs.berkeley.edu/~hendrycks/MATH.tar", f"{self.DATASET_PATH}.tar", "01256fd7cd5430596fdf07e6e6a5827111b5235b7ffed679c662a12f898932da") download_file("https://people.eecs.berkeley.edu/~hendrycks/MATH.tar", local_file=f"{self.DATASET_PATH}.tar", expected_checksum="01256fd7cd5430596fdf07e6e6a5827111b5235b7ffed679c662a12f898932da")
sh(f""" sh(f"""
tar -xf {self.DATASET_PATH}.tar -C data/ && touch {self.DATASET_PATH / 'done'} tar -xf {self.DATASET_PATH}.tar -C data/ && touch {self.DATASET_PATH / 'done'}
rm {self.DATASET_PATH}.tar rm {self.DATASET_PATH}.tar
......
...@@ -45,7 +45,7 @@ class GeneralHendrycksTest(MultipleChoiceTask): ...@@ -45,7 +45,7 @@ class GeneralHendrycksTest(MultipleChoiceTask):
def download(self): def download(self):
if not (self.DATASET_PATH / 'done').exists(): if not (self.DATASET_PATH / 'done').exists():
sh("mkdir -p data") sh("mkdir -p data")
download_file("https://people.eecs.berkeley.edu/~hendrycks/data.tar", "data/data.tar", "78a804365a59028188fb19bd1adcadc5e0c260b220a9d8b2e33a5ea7d5fbe3b4") download_file("https://people.eecs.berkeley.edu/~hendrycks/data.tar", local_file="data/data.tar", expected_checksum="78a804365a59028188fb19bd1adcadc5e0c260b220a9d8b2e33a5ea7d5fbe3b4")
sh(""" sh("""
tar -xf data/data.tar -C data/ tar -xf data/data.tar -C data/
rm data/data.tar rm data/data.tar
......
...@@ -14,8 +14,8 @@ class LAMBADA(Task): ...@@ -14,8 +14,8 @@ class LAMBADA(Task):
if not os.path.exists("data/lambada/lambada_test.jsonl"): if not os.path.exists("data/lambada/lambada_test.jsonl"):
download_file( download_file(
"http://eaidata.bmk.sh/data/lambada_test.jsonl", "http://eaidata.bmk.sh/data/lambada_test.jsonl",
"data/lambada/lambada_test.jsonl", local_file="data/lambada/lambada_test.jsonl",
"4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226" expected_checksum="4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226"
) )
except: except:
# fallback - for some reason best_download doesnt work all the time here # fallback - for some reason best_download doesnt work all the time here
......
...@@ -32,8 +32,8 @@ class MultilingualLAMBADA(lambada.LAMBADA): ...@@ -32,8 +32,8 @@ class MultilingualLAMBADA(lambada.LAMBADA):
if not os.path.exists(f): if not os.path.exists(f):
download_file( download_file(
url, url,
f, local_file=f,
CHECKSUMS[self.LANG] expected_checksum=CHECKSUMS[self.LANG]
) )
except: except:
# fallback - for some reason best_download doesnt work all the time here # fallback - for some reason best_download doesnt work all the time here
......
...@@ -19,7 +19,7 @@ class LogiQA(MultipleChoiceTask): ...@@ -19,7 +19,7 @@ class LogiQA(MultipleChoiceTask):
] ]
for split in splits: for split in splits:
file = self.DATASET_PATH / f"{split['name']}.txt" file = self.DATASET_PATH / f"{split['name']}.txt"
download_file(f"{base_url}/{split['name']}.txt", str(file), split["checksum"]) download_file(f"{base_url}/{split['name']}.txt", local_file=str(file), expected_checksum=split["checksum"])
def has_training_docs(self): def has_training_docs(self):
return True return True
......
...@@ -36,8 +36,8 @@ class MuTualBase(Task): ...@@ -36,8 +36,8 @@ class MuTualBase(Task):
master_zip = Path("data/master.zip") master_zip = Path("data/master.zip")
download_file( download_file(
"https://github.com/Nealcly/MuTual/archive/master.zip", "https://github.com/Nealcly/MuTual/archive/master.zip",
str(master_zip), local_file=str(master_zip),
"bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9") expected_checksum="bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9")
with zipfile.ZipFile(master_zip, 'r') as zip: with zipfile.ZipFile(master_zip, 'r') as zip:
zip.extractall("data") zip.extractall("data")
Path("data/MuTual-master/data").rename(str(self.BASE_PATH)) Path("data/MuTual-master/data").rename(str(self.BASE_PATH))
......
...@@ -18,12 +18,11 @@ class PilePerplexityTask(PerplexityTask, abc.ABC): ...@@ -18,12 +18,11 @@ class PilePerplexityTask(PerplexityTask, abc.ABC):
def download(self): def download(self):
# TODO: separate pile val/test out by component so we don't have to scan the entire file once per set # TODO: separate pile val/test out by component so we don't have to scan the entire file once per set
if not os.path.exists("data/pile/test.jsonl.zst"): if not os.path.exists("data/pile/test.jsonl.zst"):
# todo use new best_download fallback api # todo use new best_download fallback api
os.makedirs("data/pile/", exist_ok=True) os.makedirs("data/pile/", exist_ok=True)
download_file("http://eaidata.bmk.sh/data/pile/val.jsonl.zst", self.VAL_PATH, "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92") download_file("http://eaidata.bmk.sh/data/pile/val.jsonl.zst", local_file=self.VAL_PATH, expected_checksum="264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92")
download_file("http://eaidata.bmk.sh/data/pile/test.jsonl.zst", self.TEST_PATH, "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e") download_file("http://eaidata.bmk.sh/data/pile/test.jsonl.zst", local_file=self.TEST_PATH, expected_checksum="0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e")
def validation_docs(self): def validation_docs(self):
rdr = lm_dataformat.Reader(self.VAL_PATH) rdr = lm_dataformat.Reader(self.VAL_PATH)
......
...@@ -32,8 +32,8 @@ class QA4MRE(MultipleChoiceTask): ...@@ -32,8 +32,8 @@ class QA4MRE(MultipleChoiceTask):
if not os.path.isfile(f"data/qa4mre/QA4MRE-{year}-{lang}"): if not os.path.isfile(f"data/qa4mre/QA4MRE-{year}-{lang}"):
download_file( download_file(
url_path, url_path,
f"data/qa4mre/QA4MRE-{year}-{lang}_GS.xml", local_file=f"data/qa4mre/QA4MRE-{year}-{lang}_GS.xml",
sha256sums[year], expected_checksum=sha256sums[year],
) )
def has_training_docs(self): def has_training_docs(self):
......
...@@ -13,8 +13,8 @@ class SciQ(MultipleChoiceTask): ...@@ -13,8 +13,8 @@ class SciQ(MultipleChoiceTask):
os.makedirs('data/sciq', exist_ok=True) os.makedirs('data/sciq', exist_ok=True)
download_file( download_file(
'https://ai2-public-datasets.s3.amazonaws.com/sciq/SciQ.zip', 'https://ai2-public-datasets.s3.amazonaws.com/sciq/SciQ.zip',
'data/sciq/SciQ.zip', local_file='data/sciq/SciQ.zip',
'7f3312f6ac6b09970b32942d106a8c44ec0dad46a0369f17d635aff8e348a87c', expected_checksum='7f3312f6ac6b09970b32942d106a8c44ec0dad46a0369f17d635aff8e348a87c',
) )
with zipfile.ZipFile("data/sciq/SciQ.zip", "r") as zf: with zipfile.ZipFile("data/sciq/SciQ.zip", "r") as zf:
zf.extractall("data/sciq/") zf.extractall("data/sciq/")
......
...@@ -12,7 +12,7 @@ class TriviaQA(Task): ...@@ -12,7 +12,7 @@ class TriviaQA(Task):
def download(self): def download(self):
if not os.path.exists('data/triviaqa/unfiltered-web-train.jsonl'): if not os.path.exists('data/triviaqa/unfiltered-web-train.jsonl'):
os.makedirs("data/triviaqa/", exist_ok=True) os.makedirs("data/triviaqa/", exist_ok=True)
download_file("http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz", "data/triviaqa/triviaqa-unfiltered.tar.gz", "adc19b42769062d241a8fbe834c56e58598d9322eb6c614e9f33a68a2cf5523e") download_file("http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz", local_file="data/triviaqa/triviaqa-unfiltered.tar.gz", expected_checksum="adc19b42769062d241a8fbe834c56e58598d9322eb6c614e9f33a68a2cf5523e")
sh(""" sh("""
cd data/triviaqa/ cd data/triviaqa/
tar -xf triviaqa-unfiltered.tar.gz tar -xf triviaqa-unfiltered.tar.gz
......
...@@ -58,7 +58,7 @@ class TruthfulQAMultipleChoice(Task): ...@@ -58,7 +58,7 @@ class TruthfulQAMultipleChoice(Task):
Path.mkdir(self.DATASET_PATH, parents=True) Path.mkdir(self.DATASET_PATH, parents=True)
mc_url = "https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/data/mc_task.json" mc_url = "https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/data/mc_task.json"
checksum = "6eb4125d25750c0145c4be2dce00440736684ab6f74ce6bff2139571cc758954" checksum = "6eb4125d25750c0145c4be2dce00440736684ab6f74ce6bff2139571cc758954"
download_file(mc_url, str(self.DATASET_PATH / "mc_task.json"), checksum) download_file(mc_url, local_file=str(self.DATASET_PATH / "mc_task.json"), expected_checksum=checksum)
def has_training_docs(self): def has_training_docs(self):
return False return False
...@@ -168,7 +168,7 @@ class TruthfulQAGeneration(Task): ...@@ -168,7 +168,7 @@ class TruthfulQAGeneration(Task):
Path.mkdir(self.DATASET_PATH, parents=True) Path.mkdir(self.DATASET_PATH, parents=True)
url = "https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/TruthfulQA.csv" url = "https://raw.githubusercontent.com/sylinrl/TruthfulQA/013686a06be7a7bde5bf8223943e106c7250123c/TruthfulQA.csv"
checksum = "8d7dd15f033196140f032d97d30f037da7a7b1192c3f36f9937c1850925335a2" checksum = "8d7dd15f033196140f032d97d30f037da7a7b1192c3f36f9937c1850925335a2"
download_file(url, str(self.DATASET_PATH / "TruthfulQA.csv"), checksum) download_file(url, local_file=str(self.DATASET_PATH / "TruthfulQA.csv"), expected_checksum=checksum)
def has_training_docs(self): def has_training_docs(self):
return False return False
......
...@@ -29,7 +29,7 @@ class WordUnscrambleTask(Task): ...@@ -29,7 +29,7 @@ class WordUnscrambleTask(Task):
if not file.exists(): if not file.exists():
rawfile = file.parent / (file.name + ".gz") rawfile = file.parent / (file.name + ".gz")
base_url = "https://raw.githubusercontent.com/openai/gpt-3/master/data" base_url = "https://raw.githubusercontent.com/openai/gpt-3/master/data"
download_file(f"{base_url}/{self.FILENAME}.gz", str(rawfile), self.CHECKSUM) download_file(f"{base_url}/{self.FILENAME}.gz", local_file=str(rawfile), expected_checksum=self.CHECKSUM)
extract_gzip(gz=rawfile, to=file) extract_gzip(gz=rawfile, to=file)
def has_training_docs(self): def has_training_docs(self):
......
...@@ -46,7 +46,7 @@ class WikiText(PerplexityTask): ...@@ -46,7 +46,7 @@ class WikiText(PerplexityTask):
def download(self): def download(self):
if not os.path.exists('data/wikitext/wikitext-2-raw/wiki.valid.raw'): if not os.path.exists('data/wikitext/wikitext-2-raw/wiki.valid.raw'):
os.makedirs("data/wikitext/", exist_ok=True) os.makedirs("data/wikitext/", exist_ok=True)
download_file("https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip", "data/wikitext/wikitext-2-raw-v1.zip", "ef7edb566e3e2b2d31b29c1fdb0c89a4cc683597484c3dc2517919c615435a11") download_file("https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip", local_file="data/wikitext/wikitext-2-raw-v1.zip", expected_checksum="ef7edb566e3e2b2d31b29c1fdb0c89a4cc683597484c3dc2517919c615435a11")
sh("cd data/wikitext/ && unzip wikitext-2-raw-v1.zip") sh("cd data/wikitext/ && unzip wikitext-2-raw-v1.zip")
def has_validation_docs(self): def has_validation_docs(self):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment