Commit 1ab25c49 authored by thomwolf's avatar thomwolf
Browse files

Merge branch 'master' into pr/2115

parents df396112 18601c3b
......@@ -60,6 +60,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
padding_side = "left"
def __init__(self, vocab_file,
do_lower_case=False, remove_space=True, keep_accents=False,
......@@ -74,6 +75,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens
self._pad_token_type_id = 3
try:
import sentencepiece as spm
......
""" Link tester.
This little utility reads all the python files in the repository,
scans for links pointing to S3 and tests the links one by one. Raises an error
at the end of the scan if at least one link was reported broken.
"""
import os
import re
import sys
import requests
REGEXP_FIND_S3_LINKS = r"""([\"'])(https:\/\/s3)(.*)?\1"""
def list_python_files_in_repository():
""" List all python files in the repository.
This function assumes that the script is executed in the root folder.
"""
source_code_files = []
for path, subdirs, files in os.walk("."):
if "templates" in path:
continue
for name in files:
if ".py" in name and ".pyc" not in name:
path_to_files = os.path.join(path, name)
source_code_files.append(path_to_files)
return source_code_files
def find_all_links(file_paths):
links = []
for path in file_paths:
links += scan_code_for_links(path)
return links
def scan_code_for_links(source):
""" Scans the file to find links using a regular expression.
Returns a list of links.
"""
with open(source, 'r') as content:
content = content.read()
raw_links = re.findall(REGEXP_FIND_S3_LINKS, content)
links = [prefix + suffix for _, prefix, suffix in raw_links]
return links
def check_all_links(links):
""" Check that the provided links are valid.
Links are considered valid if a HEAD request to the server
returns a 200 status code.
"""
broken_links = []
for link in links:
head = requests.head(link)
if head.status_code != 200:
broken_links.append(link)
return broken_links
if __name__ == "__main__":
file_paths = list_python_files_in_repository()
links = find_all_links(file_paths)
broken_links = check_all_links(links)
print("Looking for broken links to pre-trained models/configs/tokenizers...")
if broken_links:
print("The following links did not respond:")
for link in broken_links:
print("- {}".format(link))
sys.exit(1)
print("All links are ok.")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment