Merge pull request #2075 from huggingface/check-link-validity

Check link validity

Merge pull request #2075 from huggingface/check-link-validity
Check link validity
137e20a8 · Thomas Wolf · GitHub · 9c58b236 · d5712f7c · 137e20a8
Unverified Commit 137e20a8 authored Dec 12, 2019 by Thomas Wolf Committed by GitHub Dec 12, 2019
Hide whitespace changes
Inline Side-by-side

Showing with 90 additions and 0 deletions

.circleci/config.yml .circleci/config.yml +11 -0

utils/link_tester.py utils/link_tester.py +79 -0

No files found.
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -103,6 +103,16 @@ jobs:
            - run: sudo pip install --progress-bar off -r docs/requirements.txt
            - run: sudo pip install --progress-bar off -r requirements.txt
            - run: ./.circleci/deploy.sh
+    # Repository consistency check: runs utils/link_tester.py, which scans the
+    # repository's python files for S3 links and fails if any link is broken.
+    repository_consistency:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.5
+        resource_class: small
+        parallelism: 1
+        steps:
+            - checkout
+            # `requests` is the only third-party package link_tester.py imports.
+            - run: sudo pip install requests
+            - run: python ./utils/link_tester.py
 workflow_filters: &workflow_filters
    filters:
        branches:
@@ -112,6 +122,7 @@ workflows:
    version: 2
    build_and_test:
        jobs:
+            - repository_consistency
            - build_py3_custom_tokenizers
            - build_py2_custom_tokenizers
            - build_py3_torch_and_tf

--- a/utils/link_tester.py
+++ b/utils/link_tester.py
+""" Link tester.
+This little utility reads all the Python files in the repository,
+scans them for links pointing to S3 and tests the links one by one. Exits with
+a non-zero status at the end of the scan if at least one link was reported broken.
+"""
+import os
+import re
+import sys
+import requests
+# Matches a quoted string literal that starts with "https://s3".
+# Groups: (1) the opening quote, (2) the "https://s3" prefix, (3) the rest of
+# the URL. The rest is matched lazily so the link stops at the first closing
+# quote; a greedy match would run on to the last matching quote on the line
+# and merge several links into one.
+REGEXP_FIND_S3_LINKS = r"""([\"'])(https:\/\/s3)(.*?)\1"""
+def list_python_files_in_repository():
+    """ List all python files in the repository.
+
+    This function assumes that the script is executed in the root folder.
+    Any directory whose path contains "templates" is skipped.
+
+    Returns:
+        A list of paths (relative to the current directory) of every file
+        whose name ends in ".py".
+    """
+    source_code_files = []
+    for path, _, files in os.walk("."):
+        if "templates" in path:
+            continue
+        for name in files:
+            # Match on the extension: a substring test for ".py" would also
+            # pick up files such as "setup.py.orig", "stub.pyi" or ".pylintrc".
+            if name.endswith(".py"):
+                source_code_files.append(os.path.join(path, name))
+    return source_code_files
+def find_all_links(file_paths):
+    """ Gather the links found in every file of `file_paths`.
+
+    Each path is handed to `scan_code_for_links`; the per-file results are
+    concatenated, in order, into a single flat list.
+    """
+    return [
+        link
+        for file_path in file_paths
+        for link in scan_code_for_links(file_path)
+    ]
+def scan_code_for_links(source):
+    """ Scans the file to find links using a regular expression.
+
+    Args:
+        source: path of the file to scan.
+
+    Returns:
+        A list of links, one per match of REGEXP_FIND_S3_LINKS, with the
+        surrounding quotes removed.
+    """
+    # Read as UTF-8 explicitly: the default encoding follows the locale, and
+    # a C/POSIX locale would fail on any non-ASCII character in the sources.
+    with open(source, "r", encoding="utf-8") as source_file:
+        content = source_file.read()
+    raw_links = re.findall(REGEXP_FIND_S3_LINKS, content)
+    # Each match is (quote, prefix, suffix); the link is prefix + suffix.
+    return [prefix + suffix for _, prefix, suffix in raw_links]
+def check_all_links(links, timeout=10):
+    """ Check that the provided links are valid.
+
+    Links are considered valid if a HEAD request to the server
+    returns a 200 status code. A link whose request fails (connection
+    error, timeout, ...) is reported as broken instead of aborting the scan.
+
+    Args:
+        links: iterable of URLs to test.
+        timeout: seconds to wait for each server before giving up.
+
+    Returns:
+        The list of links that did not answer with a 200 status code.
+    """
+    broken_links = []
+    for link in links:
+        try:
+            # requests never times out on its own; without an explicit
+            # timeout one unresponsive server would hang the CI job.
+            head = requests.head(link, timeout=timeout)
+        except requests.RequestException:
+            broken_links.append(link)
+            continue
+        if head.status_code != 200:
+            broken_links.append(link)
+    return broken_links
+if __name__ == "__main__":
+    # Announce the scan before doing the (slow) network checks, so the
+    # message precedes the work it describes.
+    print("Looking for broken links to pre-trained models/configs/tokenizers...")
+    file_paths = list_python_files_in_repository()
+    links = find_all_links(file_paths)
+    broken_links = check_all_links(links)
+    if broken_links:
+        print("The following links did not respond:")
+        for link in broken_links:
+            print("- {}".format(link))
+        # Non-zero exit status makes the CI job fail.
+        sys.exit(1)
+    print("All links are ok.")