Unverified Commit 137e20a8 authored by Thomas Wolf's avatar Thomas Wolf Committed by GitHub
Browse files

Merge pull request #2075 from huggingface/check-link-validity

Check link validity
parents 9c58b236 d5712f7c
...@@ -103,6 +103,16 @@ jobs: ...@@ -103,6 +103,16 @@ jobs:
- run: sudo pip install --progress-bar off -r docs/requirements.txt - run: sudo pip install --progress-bar off -r docs/requirements.txt
- run: sudo pip install --progress-bar off -r requirements.txt - run: sudo pip install --progress-bar off -r requirements.txt
- run: ./.circleci/deploy.sh - run: ./.circleci/deploy.sh
repository_consistency:
working_directory: ~/transformers
docker:
- image: circleci/python:3.5
resource_class: small
parallelism: 1
steps:
- checkout
- run: sudo pip install requests
- run: python ./utils/link_tester.py
workflow_filters: &workflow_filters workflow_filters: &workflow_filters
filters: filters:
branches: branches:
...@@ -112,6 +122,7 @@ workflows: ...@@ -112,6 +122,7 @@ workflows:
version: 2 version: 2
build_and_test: build_and_test:
jobs: jobs:
- repository_consistency
- build_py3_custom_tokenizers - build_py3_custom_tokenizers
- build_py2_custom_tokenizers - build_py2_custom_tokenizers
- build_py3_torch_and_tf - build_py3_torch_and_tf
......
""" Link tester.
This little utility reads all the python files in the repository,
scans for links pointing to S3 and tests the links one by one. Raises an error
at the end of the scan if at least one link was reported broken.
"""
import os
import re
import sys
import requests
REGEXP_FIND_S3_LINKS = r"""([\"'])(https:\/\/s3)(.*)?\1"""
def list_python_files_in_repository():
""" List all python files in the repository.
This function assumes that the script is executed in the root folder.
"""
source_code_files = []
for path, subdirs, files in os.walk("."):
if "templates" in path:
continue
for name in files:
if ".py" in name and ".pyc" not in name:
path_to_files = os.path.join(path, name)
source_code_files.append(path_to_files)
return source_code_files
def find_all_links(file_paths):
links = []
for path in file_paths:
links += scan_code_for_links(path)
return links
def scan_code_for_links(source):
""" Scans the file to find links using a regular expression.
Returns a list of links.
"""
with open(source, 'r') as content:
content = content.read()
raw_links = re.findall(REGEXP_FIND_S3_LINKS, content)
links = [prefix + suffix for _, prefix, suffix in raw_links]
return links
def check_all_links(links):
""" Check that the provided links are valid.
Links are considered valid if a HEAD request to the server
returns a 200 status code.
"""
broken_links = []
for link in links:
head = requests.head(link)
if head.status_code != 200:
broken_links.append(link)
return broken_links
if __name__ == "__main__":
file_paths = list_python_files_in_repository()
links = find_all_links(file_paths)
broken_links = check_all_links(links)
print("Looking for broken links to pre-trained models/configs/tokenizers...")
if broken_links:
print("The following links did not respond:")
for link in broken_links:
print("- {}".format(link))
sys.exit(1)
print("All links are ok.")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment