link_tester.py 2.82 KB
Newer Older
Sylvain Gugger's avatar
Sylvain Gugger committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Link tester.

This little utility reads all the Python files in the repository,
scans them for links pointing to S3 and tests the links one by one. It exits
with an error status at the end of the scan if at least one link was reported
broken.
"""
import os
import re
import sys

import requests


# Matches a quoted string literal that starts with "https://s3". Group 1 is the
# opening quote (single or double) and the trailing backreference requires the
# same quote to close the literal; groups 2 and 3 are the URL prefix and the
# rest of the URL. The rest is matched lazily so that several quoted links on
# the same line are captured separately instead of being merged into one.
REGEXP_FIND_S3_LINKS = r"""([\"'])(https:\/\/s3)(.*?)\1"""


# Links that are exactly equal to this prefix are excluded by find_all_links.
S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert"


Rémi Louf's avatar
Rémi Louf committed
34
def list_python_files_in_repository():
    """List all python files in the repository.

    This function assumes that the script is executed in the root folder:
    the current working directory is walked recursively, and every directory
    whose path contains "templates" is skipped.

    Returns:
        A list with the path (relative to the current directory) of every
        file whose name ends with ".py".
    """
    source_code_files = []
    for path, _subdirs, files in os.walk("."):
        if "templates" in path:
            continue
        # Test the extension rather than a substring: `".py" in name` would
        # also pick up files such as "notes.py.txt" or "module.pyx".
        source_code_files.extend(
            os.path.join(path, name) for name in files if name.endswith(".py")
        )

    return source_code_files


def find_all_links(file_paths):
    """Collect the links found in every file listed in ``file_paths``.

    Each path is scanned with ``scan_code_for_links`` and the results are
    concatenated in order. Links equal to ``S3_BUCKET_PREFIX`` are left out.
    """
    return [
        found_link
        for file_path in file_paths
        for found_link in scan_code_for_links(file_path)
        if found_link != S3_BUCKET_PREFIX
    ]
Rémi Louf's avatar
Rémi Louf committed
57
58
59


def scan_code_for_links(source):
    """Scans the file to find links using a regular expression.

    Args:
        source: Path of the file to scan.

    Returns a list of links.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # encoding, so that the scan behaves the same on every machine.
    with open(source, "r", encoding="utf-8") as source_file:
        content = source_file.read()

    # Each match is a (quote, prefix, suffix) tuple; the quote is only captured
    # for the regular expression's backreference and is not part of the link.
    raw_links = re.findall(REGEXP_FIND_S3_LINKS, content)
    return [prefix + suffix for _, prefix, suffix in raw_links]


def check_all_links(links, timeout=10):
    """Check that the provided links are valid.

    Links are considered valid if a HEAD request to the server
    returns a 200 status code. A link whose request fails (connection
    error, timeout, malformed URL, ...) is reported as broken instead of
    aborting the whole scan.

    Args:
        links: Iterable of URLs to check.
        timeout: Number of seconds to wait for each server before giving up.

    Returns:
        The list of broken links, in the order in which they were given.
    """
    broken_links = []
    for link in links:
        try:
            # Without a timeout a server that never answers would block the
            # scan forever.
            status_code = requests.head(link, timeout=timeout).status_code
        except requests.RequestException:
            # No usable answer from the server: handled like a bad status.
            status_code = None
        if status_code != 200:
            broken_links.append(link)

    return broken_links


if __name__ == "__main__":
    # Announce the scan before starting it: checking the links issues one
    # request per link, so the script may stay silent for a while.
    print("Looking for broken links to pre-trained models/configs/tokenizers...")
    file_paths = list_python_files_in_repository()
    links = find_all_links(file_paths)
    broken_links = check_all_links(links)
    if broken_links:
        print("The following links did not respond:")
        for link in broken_links:
            print("- {}".format(link))
        # A non-zero exit status signals the failure to the caller.
        sys.exit(1)
    print("All links are ok.")