"model/__init__.py" did not exist on "caaeea8640d5a3505fabb74dab2fbfcb642265a4"
link_tester.py 2.11 KB
Newer Older
Rémi Louf's avatar
Rémi Louf committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
""" Link tester.

This little utility reads all the python files in the repository,
scans for links pointing to S3 and tests the links one by one. Raises an error
at the end of the scan if at least one link was reported broken.
"""
import os
import re
import sys

import requests


# Matches a quoted string literal that starts with "https://s3".
# Group 1 is the opening quote (the back-reference \1 requires the same quote
# to close the literal), group 2 is the "https://s3" prefix and group 3 is the
# remainder of the URL.
# The remainder is matched lazily so that several links written on the same
# line are reported one by one instead of being merged into a single string
# that runs up to the last quote of the line.
REGEXP_FIND_S3_LINKS = r"""([\"'])(https:\/\/s3)(.*?)\1"""


def list_python_files_in_repository():
    """ List all python files in the repository.

    This function assumes that the script is executed in the root folder.
    Directories whose path contains "templates" are ignored.

    Returns a list of paths (relative to the current directory) of all the
    files whose name ends with ".py".
    """
    source_code_files = []
    for path, _, files in os.walk("."):
        if "templates" in path:
            continue
        for name in files:
            # Test the extension, not a substring: `".py" in name` would also
            # select files such as ".pylintrc", "module.pyx" or "old.py.bak".
            if name.endswith(".py"):
                source_code_files.append(os.path.join(path, name))

    return source_code_files


def find_all_links(file_paths):
    """ Collect the links found in every file of `file_paths`.

    Each file is scanned with `scan_code_for_links`; the results are
    concatenated in the order of `file_paths` and returned as a single list.
    """
    return [
        link
        for file_path in file_paths
        for link in scan_code_for_links(file_path)
    ]


def scan_code_for_links(source):
    """ Scans the file to find links using a regular expression.

    `source` is the path of the file to read. The whole file is matched
    against `REGEXP_FIND_S3_LINKS`.

    Returns a list of links, each one rebuilt from the prefix and the suffix
    captured by the regular expression (the surrounding quote is dropped).
    """
    # Python source files are UTF-8 by default; being explicit keeps the scan
    # independent from the locale of the machine running it.
    with open(source, "r", encoding="utf-8") as source_file:
        content = source_file.read()

    raw_links = re.findall(REGEXP_FIND_S3_LINKS, content)
    links = [prefix + suffix for _, prefix, suffix in raw_links]

    return links


def check_all_links(links, timeout=10):
    """ Check that the provided links are valid.

    Links are considered valid if a HEAD request to the server
    returns a 200 status code. A link whose request fails (connection error,
    timeout, invalid URL...) is reported as broken instead of aborting the
    whole scan.

    `links` is an iterable of URLs and `timeout` is the number of seconds
    given to each request before giving up.

    Returns the list of broken links, in the order in which they appear in
    `links` (a broken link present several times is reported several times).
    """
    # Remember the verdict for each URL so that a link referenced from several
    # places is only requested once.
    is_valid = {}
    broken_links = []
    for link in links:
        if link not in is_valid:
            try:
                # Without a timeout an unresponsive server blocks forever.
                head = requests.head(link, timeout=timeout)
            except requests.RequestException:
                is_valid[link] = False
            else:
                is_valid[link] = head.status_code == 200
        if not is_valid[link]:
            broken_links.append(link)

    return broken_links


if __name__ == "__main__":
    # Announce the scan before starting it: checking the links is the slow
    # part, so the message must not show up only once the work is done.
    print("Looking for broken links to pre-trained models/configs/tokenizers...")
    file_paths = list_python_files_in_repository()
    links = find_all_links(file_paths)
    broken_links = check_all_links(links)
    if broken_links:
        print("The following links did not respond:")
        for link in broken_links:
            print("- {}".format(link))
        # A non-zero exit status signals the failure to the calling process.
        sys.exit(1)
    print("All links are ok.")