Commit 40ed7172 authored by erenup

Merge remote-tracking branch 'refs/remotes/huggingface/master'

parents 86a63070 7296f101
''' Script for downloading all GLUE data.
Original source: https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e
Note: for legal reasons, we are unable to host MRPC.
You can either use the version hosted by the SentEval team, which is already tokenized,
or you can download the original data from https://download.microsoft.com/download/D/4/6/D46FF87A-F6B9-4252-AA8B-3604ED519838/MSRParaphraseCorpus.msi and extract it manually.
Windows users can run the .msi file directly. Mac and Linux users can use an external tool such as 'cabextract' (see the example below).
You should then rename and place the extracted files in a folder, for example:
mkdir MRPC
cabextract MSRParaphraseCorpus.msi -d MRPC
cat MRPC/_2DEC3DBE877E4DB192D17C0256E90F1D | tr -d $'\r' > MRPC/msr_paraphrase_train.txt
cat MRPC/_D7B391F9EAFF4B1B8BCE8F21B20B1B61 | tr -d $'\r' > MRPC/msr_paraphrase_test.txt
rm MRPC/_*
rm MSRParaphraseCorpus.msi
1/30/19: It looks like SentEval is no longer hosting their extracted and tokenized MRPC data, so you'll need to download the data from the original source for now.
2/11/19: It looks like SentEval actually *is* hosting the extracted data. Hooray!
'''
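# Example invocations (illustrative; the flags correspond to the argparse options defined in main() below):
#   python download_glue_data.py --data_dir glue_data --tasks all
#   python download_glue_data.py --data_dir glue_data --tasks MRPC --path_to_mrpc /path/to/extracted/MRPC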
import os
import sys
import shutil
import argparse
import tempfile
import urllib.request
import zipfile
TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "SNLI", "QNLI", "RTE", "WNLI", "diagnostic"]
TASK2PATH = {"CoLA":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FCoLA.zip?alt=media&token=46d5e637-3411-4188-bc44-5809b5bfb5f4',
"SST":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8',
"MRPC":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc',
"QQP":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQQP.zip?alt=media&token=700c6acf-160d-4d89-81d1-de4191d02cb5',
"STS":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSTS-B.zip?alt=media&token=bddb94a7-8706-4e0d-a694-1109e12273b5',
"MNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FMNLI.zip?alt=media&token=50329ea1-e339-40e2-809c-10c40afff3ce',
"SNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSNLI.zip?alt=media&token=4afcfbb2-ff0c-4b2d-a09a-dbf07926f4df',
"QNLI": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQNLIv2.zip?alt=media&token=6fdcf570-0fc5-4631-8456-9505272d1601',
"RTE":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FRTE.zip?alt=media&token=5efa7e85-a0bb-4f19-8ea2-9e1840f077fb',
"WNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FWNLI.zip?alt=media&token=068ad0a0-ded7-4bd7-99a5-5e00222e0faf',
"diagnostic":'https://storage.googleapis.com/mtl-sentence-representations.appspot.com/tsvsWithoutLabels%2FAX.tsv?GoogleAccessId=firebase-adminsdk-0khhl@mtl-sentence-representations.iam.gserviceaccount.com&Expires=2498860800&Signature=DuQ2CSPt2Yfre0C%2BiISrVYrIFaZH1Lc7hBVZDD4ZyR7fZYOMNOUGpi8QxBmTNOrNPjR3z1cggo7WXFfrgECP6FBJSsURv8Ybrue8Ypt%2FTPxbuJ0Xc2FhDi%2BarnecCBFO77RSbfuz%2Bs95hRrYhTnByqu3U%2FYZPaj3tZt5QdfpH2IUROY8LiBXoXS46LE%2FgOQc%2FKN%2BA9SoscRDYsnxHfG0IjXGwHN%2Bf88q6hOmAxeNPx6moDulUF6XMUAaXCSFU%2BnRO2RDL9CapWxj%2BDl7syNyHhB7987hZ80B%2FwFkQ3MEs8auvt5XW1%2Bd4aCU7ytgM69r8JDCwibfhZxpaa4gd50QXQ%3D%3D'}
MRPC_TRAIN = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt'
MRPC_TEST = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt'
def download_and_extract(task, data_dir):
print("Downloading and extracting %s..." % task)
data_file = "%s.zip" % task
urllib.request.urlretrieve(TASK2PATH[task], data_file)
with zipfile.ZipFile(data_file) as zip_ref:
zip_ref.extractall(data_dir)
os.remove(data_file)
print("\tCompleted!")
def format_mrpc(data_dir, path_to_data):
print("Processing MRPC...")
mrpc_dir = os.path.join(data_dir, "MRPC")
if not os.path.isdir(mrpc_dir):
os.mkdir(mrpc_dir)
if path_to_data:
mrpc_train_file = os.path.join(path_to_data, "msr_paraphrase_train.txt")
mrpc_test_file = os.path.join(path_to_data, "msr_paraphrase_test.txt")
else:
print("Local MRPC data not specified, downloading data from %s" % MRPC_TRAIN)
mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")
urllib.request.urlretrieve(MRPC_TRAIN, mrpc_train_file)
urllib.request.urlretrieve(MRPC_TEST, mrpc_test_file)
assert os.path.isfile(mrpc_train_file), "Train data not found at %s" % mrpc_train_file
assert os.path.isfile(mrpc_test_file), "Test data not found at %s" % mrpc_test_file
urllib.request.urlretrieve(TASK2PATH["MRPC"], os.path.join(mrpc_dir, "dev_ids.tsv"))
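# dev_ids.tsv lists the (id1, id2) pairs that belong to the development split; rows of the
# original MRPC training file whose ID pair appears in this list are written to dev.tsv below,
# and all remaining rows go to train.tsv.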
dev_ids = []
with open(os.path.join(mrpc_dir, "dev_ids.tsv"), encoding="utf8") as ids_fh:
for row in ids_fh:
dev_ids.append(row.strip().split('\t'))
with open(mrpc_train_file, encoding="utf8") as data_fh, \
open(os.path.join(mrpc_dir, "train.tsv"), 'w', encoding="utf8") as train_fh, \
open(os.path.join(mrpc_dir, "dev.tsv"), 'w', encoding="utf8") as dev_fh:
header = data_fh.readline()
train_fh.write(header)
dev_fh.write(header)
for row in data_fh:
label, id1, id2, s1, s2 = row.strip().split('\t')
if [id1, id2] in dev_ids:
dev_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
else:
train_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
with open(mrpc_test_file, encoding="utf8") as data_fh, \
open(os.path.join(mrpc_dir, "test.tsv"), 'w', encoding="utf8") as test_fh:
header = data_fh.readline()
test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n")
for idx, row in enumerate(data_fh):
label, id1, id2, s1, s2 = row.strip().split('\t')
test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2))
print("\tCompleted!")
def download_diagnostic(data_dir):
print("Downloading and extracting diagnostic...")
if not os.path.isdir(os.path.join(data_dir, "diagnostic")):
os.mkdir(os.path.join(data_dir, "diagnostic"))
data_file = os.path.join(data_dir, "diagnostic", "diagnostic.tsv")
urllib.request.urlretrieve(TASK2PATH["diagnostic"], data_file)
print("\tCompleted!")
return
def get_tasks(task_names):
task_names = task_names.split(',')
if "all" in task_names:
tasks = TASKS
else:
tasks = []
for task_name in task_names:
assert task_name in TASKS, "Task %s not found!" % task_name
tasks.append(task_name)
return tasks
def main(arguments):
parser = argparse.ArgumentParser()
parser.add_argument('--data_dir', help='directory to save data to', type=str, default='glue_data')
parser.add_argument('--tasks', help='tasks to download data for as a comma separated string',
type=str, default='all')
parser.add_argument('--path_to_mrpc', help='path to directory containing extracted MRPC data, msr_paraphrase_train.txt and msr_paraphrase_test.txt',
type=str, default='')
args = parser.parse_args(arguments)
if not os.path.isdir(args.data_dir):
os.mkdir(args.data_dir)
tasks = get_tasks(args.tasks)
for task in tasks:
if task == 'MRPC':
format_mrpc(args.data_dir, args.path_to_mrpc)
elif task == 'diagnostic':
download_diagnostic(args.data_dir)
else:
download_and_extract(task, args.data_dir)
if __name__ == '__main__':
sys.exit(main(sys.argv[1:]))
""" Link tester.
This little utility reads all the python files in the repository,
scans for links pointing to S3 and tests the links one by one. Raises an error
at the end of the scan if at least one link was reported broken.
"""
import os
import re
import sys
import requests
REGEXP_FIND_S3_LINKS = r"""([\"'])(https:\/\/s3)(.*)?\1"""
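# Illustrative example of what the pattern captures: a quoted literal such as
# "https://s3.amazonaws.com/some-bucket/some-model.bin" (hypothetical URL) yields the groups
# ('"', 'https://s3', '.amazonaws.com/some-bucket/some-model.bin'), which are re-joined into a
# full link in scan_code_for_links below.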
def list_python_files_in_repository():
""" List all python files in the repository.
This function assumes that the script is executed in the root folder.
"""
source_code_files = []
for path, subdirs, files in os.walk("."):
if "templates" in path:
continue
for name in files:
if ".py" in name and ".pyc" not in name:
path_to_files = os.path.join(path, name)
source_code_files.append(path_to_files)
return source_code_files
def find_all_links(file_paths):
links = []
for path in file_paths:
links += scan_code_for_links(path)
return links
def scan_code_for_links(source):
""" Scans the file to find links using a regular expression.
Returns a list of links.
"""
with open(source, 'r') as content:
content = content.read()
raw_links = re.findall(REGEXP_FIND_S3_LINKS, content)
links = [prefix + suffix for _, prefix, suffix in raw_links]
return links
def check_all_links(links):
""" Check that the provided links are valid.
Links are considered valid if a HEAD request to the server
returns a 200 status code.
"""
broken_links = []
for link in links:
head = requests.head(link)
if head.status_code != 200:
broken_links.append(link)
return broken_links
if __name__ == "__main__":
file_paths = list_python_files_in_repository()
links = find_all_links(file_paths)
broken_links = check_all_links(links)
print("Looking for broken links to pre-trained models/configs/tokenizers...")
if broken_links:
print("The following links did not respond:")
for link in broken_links:
print("- {}".format(link))
sys.exit(1)
print("All links are ok.")
---
- step:
name: Execute python examples/run_glue.py
image: pytorch/pytorch:nightly-devel-cuda10.0-cudnn7
command:
- python /valohai/repository/utils/download_glue_data.py --data_dir=/glue_data
- pip install -e .
- pip install -r examples/requirements.txt
- python examples/run_glue.py --do_train --data_dir=/glue_data/{parameter-value:task_name} {parameters}
parameters:
- name: model_type
pass-as: --model_type={v}
type: string
default: bert
- name: model_name_or_path
pass-as: --model_name_or_path={v}
type: string
default: bert-base-uncased
- name: task_name
pass-as: --task_name={v}
type: string
default: MRPC
- name: max_seq_length
pass-as: --max_seq_length={v}
description: The maximum total input sequence length after tokenization. Sequences longer than this will be truncated; sequences shorter will be padded.
type: integer
default: 128
- name: per_gpu_train_batch_size
pass-as: --per_gpu_train_batch_size={v}
description: Batch size per GPU/CPU for training.
type: integer
default: 8
- name: per_gpu_eval_batch_size
pass-as: --per_gpu_eval_batch_size={v}
description: Batch size per GPU/CPU for evaluation.
type: integer
default: 8
- name: gradient_accumulation_steps
pass-as: --gradient_accumulation_steps={v}
description: Number of update steps to accumulate before performing a backward/update pass.
type: integer
default: 1
- name: learning_rate
pass-as: --learning_rate={v}
description: The initial learning rate for Adam.
type: float
default: 0.00005
- name: adam_epsilon
pass-as: --adam_epsilon={v}
description: Epsilon for Adam optimizer.
type: float
default: 0.00000001
- name: max_grad_norm
pass-as: --max_grad_norm={v}
description: Max gradient norm.
type: float
default: 1.0
- name: num_train_epochs
pass-as: --num_train_epochs={v}
description: Total number of training epochs to perform.
type: integer
default: 3
- name: max_steps
pass-as: --max_steps={v}
description: If > 0, set the total number of training steps to perform. Overrides num_train_epochs.
type: integer
default: -1
- name: warmup_steps
pass-as: --warmup_steps={v}
description: Linear warmup over warmup_steps.
type: integer
default: -1
- name: logging_steps
pass-as: --logging_steps={v}
description: Log every X update steps.
type: integer
default: 25
- name: save_steps
pass-as: --save_steps={v}
description: Save a checkpoint every X update steps.
type: integer
default: -1
- name: output_dir
pass-as: --output_dir={v}
type: string
default: /valohai/outputs
- name: evaluate_during_training
description: Run evaluation during training at each logging step.
type: flag
default: true
- name: do_lower_case
description: Set this flag if you are using an uncased model.
type: flag
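# Illustrative expansion (not part of the Valohai schema): with the defaults above, the
# {parameters} placeholder in the command resolves to flags such as
#   --model_type=bert --model_name_or_path=bert-base-uncased --task_name=MRPC
#   --max_seq_length=128 --per_gpu_train_batch_size=8 --learning_rate=0.00005 ...
# so the final call resembles:
#   python examples/run_glue.py --do_train --data_dir=/glue_data/MRPC --model_type=bert ...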