check_copies.py 18.7 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import glob
import os
import re
20
21

import black
22
23
24
25
26


# All paths are set with the intent you should run this script from the root of the repo with the command
# python utils/check_copies.py
TRANSFORMERS_PATH = "src/transformers"
PATH_TO_DOCS = "docs/source"
REPO_PATH = "."
29

30
# Mapping for files that are full copies of others (keys are copies, values the file to keep them up to date with)
FULL_COPIES = {
    "examples/tensorflow/question-answering/utils_qa.py": "examples/pytorch/question-answering/utils_qa.py",
    "examples/flax/question-answering/utils_qa.py": "examples/pytorch/question-answering/utils_qa.py",
}
35

36

37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# Per-README settings: the line prefixes that delimit the model list (`start_prompt` / `end_prompt`) and the
# template (`format_model_list`) used to render a model entry in that README.
LOCALIZED_READMES = {
    # If the introduction or the conclusion of the list change, the prompts may need to be updated.
    "README.md": {
        "start_prompt": "🤗 Transformers currently provides the following architectures",
        "end_prompt": "1. Want to contribute a new model?",
        "format_model_list": "**[{title}]({model_link})** (from {paper_affiliations}) released with the paper {paper_title_link} by {paper_authors}.{supplements}",
    },
    "README_zh-hans.md": {
        "start_prompt": "🤗 Transformers 目前支持如下的架构",
        "end_prompt": "1. 想要贡献新的模型?",
        "format_model_list": "**[{title}]({model_link})** (来自 {paper_affiliations}) 伴随论文 {paper_title_link} 由 {paper_authors} 发布。{supplements}",
    },
    "README_zh-hant.md": {
        "start_prompt": "🤗 Transformers 目前支援以下的架構",
        "end_prompt": "1. 想要貢獻新的模型?",
        "format_model_list": "**[{title}]({model_link})** (from {paper_affiliations}) released with the paper {paper_title_link} by {paper_authors}.{supplements}",
    },
    "README_ko.md": {
        "start_prompt": "🤗 Transformers는 다음 모델들을 제공합니다",
        "end_prompt": "1. 새로운 모델을 올리고 싶나요?",
        "format_model_list": "**[{title}]({model_link})** (from {paper_affiliations}) released with the paper {paper_title_link} by {paper_authors}.{supplements}",
    },
}


62
def _should_continue(line, indent):
    """
    Tell whether `line` still belongs to the block currently being scanned.

    A line belongs to the block when it starts with `indent`, when it is empty (at most one character, i.e. just a
    newline), or when it is the closing parenthesis of a multi-line signature (`):` or `) -> ...:`).
    """
    return line.startswith(indent) or len(line) <= 1 or re.search(r"^\s*\)(\s*->.*:|:)\s*$", line) is not None
64
65


66
def find_code_in_transformers(object_name):
    """
    Find and return the source code of `object_name`.

    `object_name` is a dotted path relative to `TRANSFORMERS_PATH`: the leading parts name a module file and the
    remaining parts name nested classes / functions inside it. The returned text is the body that follows the
    matched `class` / `def` line (the header line itself is not included), with trailing empty lines removed.

    Raises:
        ValueError: if no module file matches the beginning of `object_name`, or if the class / function cannot be
            found in that module.
    """
    parts = object_name.split(".")
    i = 0

    # First let's find the module where our object lives.
    module = parts[i]
    while i < len(parts) and not os.path.isfile(os.path.join(TRANSFORMERS_PATH, f"{module}.py")):
        i += 1
        if i < len(parts):
            module = os.path.join(module, parts[i])
    if i >= len(parts):
        raise ValueError(
            f"`object_name` should begin with the name of a module of transformers but got {object_name}."
        )

    with open(os.path.join(TRANSFORMERS_PATH, f"{module}.py"), "r", encoding="utf-8", newline="\n") as f:
        lines = f.readlines()

    # Now let's find the class / func in the code!
    indent = ""
    line_index = 0
    for name in parts[i + 1 :]:
        # Escape the name so it is matched literally, and compile once per name instead of once per line.
        header = re.compile(fr"^{indent}(class|def)\s+{re.escape(name)}(\(|\:)")
        while line_index < len(lines) and header.search(lines[line_index]) is None:
            line_index += 1
        # Nested objects are searched one indentation level deeper, starting after the header just found.
        indent += "    "
        line_index += 1

    if line_index >= len(lines):
        raise ValueError(f" {object_name} does not match any function or class in {module}.")

    # We found the beginning of the class / func, now let's find the end (when the indent diminishes).
    start_index = line_index
    while line_index < len(lines) and _should_continue(lines[line_index], indent):
        line_index += 1
    # Clean up empty lines at the end (if any), without ever walking back past the start of the body.
    while line_index > start_index and len(lines[line_index - 1]) <= 1:
        line_index -= 1

    code_lines = lines[start_index:line_index]
    return "".join(code_lines)


# Matches a `# Copied from transformers.xxx.yyy` comment; groups: indentation, dotted object name, trailing options.
_re_copy_warning = re.compile(r"^(\s*)#\s*Copied from\s+transformers\.(\S+\.\S+)\s*($|\S.*$)")
# Matches one `old->new` replacement directive; groups: old name, new name, trailing option text.
_re_replace_pattern = re.compile(r"^\s*(\S+)->(\S+)(\s+.*|$)")
113
114


115
116
117
118
119
120
121
def get_indent(code):
    """
    Return the leading whitespace of the first line of `code` that contains a non-whitespace character.

    Empty and whitespace-only lines are skipped; an empty string is returned when no line has content.
    """
    for line in code.split("\n"):
        # A whitespace-only line has no `\S` to anchor on, so it yields no match and is skipped rather than
        # causing an attribute access on `None`.
        match = re.search(r"^(\s*)\S", line)
        if match is not None:
            return match.groups()[0]
    return ""


def blackify(code):
    """
    Format `code` with black, the same way the black part of our `make style` command does.

    Indented code is temporarily wrapped in a dummy class header so that it parses, and the header is removed from
    the formatted result.
    """
    wrapper = "class Bla:\n"
    needs_wrapper = len(get_indent(code)) > 0
    to_format = f"{wrapper}{code}" if needs_wrapper else code
    formatted = black.format_str(to_format, mode=black.FileMode([black.TargetVersion.PY35], line_length=119))
    if needs_wrapper:
        return formatted[len(wrapper) :]
    return formatted
134
135


136
137
138
139
140
141
def is_copy_consistent(filename, overwrite=False):
    """
    Check if the code commented as a copy in `filename` matches the original.

    Return the differences or overwrites the content depending on `overwrite`.

    Each `# Copied from transformers.xxx` comment is resolved with `find_code_in_transformers`, the optional
    `old->new` replacements that follow it are applied to the original, and the result is compared with the code
    found under the comment. The return value is a list of `[object_name, start_index]` pairs, one per mismatch.
    With `overwrite=True` the mismatching code is replaced in memory and the file is rewritten if anything changed.
    """
    with open(filename, "r", encoding="utf-8", newline="\n") as f:
        lines = f.readlines()
    diffs = []
    line_index = 0
    # Not a for loop cause `lines` is going to change (if `overwrite=True`).
    while line_index < len(lines):
        search = _re_copy_warning.search(lines[line_index])
        if search is None:
            line_index += 1
            continue

        # There is some copied code here, let's retrieve the original.
        indent, object_name, replace_pattern = search.groups()
        theoretical_code = find_code_in_transformers(object_name)
        theoretical_indent = get_indent(theoretical_code)

        # When the comment is indented like the body, the body starts right after it; otherwise the comment sits
        # above a header line (class / def) that has to be skipped as well.
        start_index = line_index + 1 if indent == theoretical_indent else line_index + 2
        indent = theoretical_indent
        line_index = start_index

        # Loop to check the observed code, stop when indentation diminishes or if we see a End copy comment.
        should_continue = True
        while line_index < len(lines) and should_continue:
            line_index += 1
            if line_index >= len(lines):
                break
            line = lines[line_index]
            should_continue = _should_continue(line, indent) and re.search(f"^{indent}# End copy", line) is None
        # Clean up empty lines at the end (if any).
        while len(lines[line_index - 1]) <= 1:
            line_index -= 1

        observed_code_lines = lines[start_index:line_index]
        observed_code = "".join(observed_code_lines)

        # Before comparing, use the `replace_pattern` on the original code.
        if len(replace_pattern) > 0:
            patterns = replace_pattern.replace("with", "").split(",")
            patterns = [_re_replace_pattern.search(p) for p in patterns]
            for pattern in patterns:
                if pattern is None:
                    continue
                obj1, obj2, option = pattern.groups()
                theoretical_code = re.sub(obj1, obj2, theoretical_code)
                if option.strip() == "all-casing":
                    theoretical_code = re.sub(obj1.lower(), obj2.lower(), theoretical_code)
                    theoretical_code = re.sub(obj1.upper(), obj2.upper(), theoretical_code)

            # Blackify after replacement. To be able to do that, we need the header (class or function definition)
            # from the previous line
            theoretical_code = blackify(lines[start_index - 1] + theoretical_code)
            theoretical_code = theoretical_code[len(lines[start_index - 1]) :]

        # Test for a diff and act accordingly.
        if observed_code != theoretical_code:
            diffs.append([object_name, start_index])
            if overwrite:
                # The replacement is inserted as a single list element, so scanning resumes right after it.
                lines = lines[:start_index] + [theoretical_code] + lines[line_index:]
                line_index = start_index + 1

    if overwrite and len(diffs) > 0:
        # Warn the user a file has been modified.
        print(f"Detected changes, rewriting {filename}.")
        with open(filename, "w", encoding="utf-8", newline="\n") as f:
            f.writelines(lines)
    return diffs
208
209
210
211
212
213


def check_copies(overwrite: bool = False):
    """
    Check every Python file under `TRANSFORMERS_PATH` for copies that diverged from their original, then check the
    model lists.

    With `overwrite=False`, an `Exception` listing all the inconsistencies is raised if any is found. With
    `overwrite=True`, the files are fixed in place by `is_copy_consistent`. In both cases the model list check
    (`check_model_list_copy`) runs afterwards with the same `overwrite` value.
    """
    all_files = glob.glob(os.path.join(TRANSFORMERS_PATH, "**/*.py"), recursive=True)
    diffs = []
    for filename in all_files:
        new_diffs = is_copy_consistent(filename, overwrite)
        diffs += [f"- {filename}: copy does not match {d[0]} at line {d[1]}" for d in new_diffs]
    if not overwrite and len(diffs) > 0:
        diff = "\n".join(diffs)
        raise Exception(
            "Found the following copy inconsistencies:\n"
            + diff
            + "\nRun `make fix-copies` or `python utils/check_copies.py --fix_and_overwrite` to fix them."
        )
    check_model_list_copy(overwrite=overwrite)


226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
def check_full_copies(overwrite: bool = False):
    """
    Compare each file listed as a key of `FULL_COPIES` with the file it is supposed to mirror.

    With `overwrite=True`, a diverging copy is rewritten with the content of its reference file. Otherwise all
    divergences are collected and reported through a single `Exception`.
    """
    mismatches = []
    for copy_path, reference_path in FULL_COPIES.items():
        with open(reference_path, "r", encoding="utf-8") as handle:
            reference_text = handle.read()
        with open(copy_path, "r", encoding="utf-8") as handle:
            copy_text = handle.read()

        if reference_text == copy_text:
            continue
        if not overwrite:
            mismatches.append(f"- {copy_path}: copy does not match {reference_path}.")
            continue
        with open(copy_path, "w", encoding="utf-8") as handle:
            print(f"Replacing the content of {copy_path} by the one of {reference_path}.")
            handle.write(reference_text)

    if overwrite or len(mismatches) == 0:
        return
    report = "\n".join(mismatches)
    raise Exception(
        "Found the following copy inconsistencies:\n"
        + report
        + "\nRun `make fix-copies` or `python utils/check_copies.py --fix_and_overwrite` to fix them."
    )


250
def get_model_list(filename, start_prompt, end_prompt):
    """
    Extracts the model list from the README.

    The list is made of the lines located after the first line starting with `start_prompt` and before the next
    line starting with `end_prompt`. Each entry begins on a line starting with `1.`; non-empty continuation lines
    are folded into the entry they follow, so every entry ends up on a single line of the returned string.

    Raises:
        ValueError: if `start_prompt` or `end_prompt` cannot be found in the file.
    """
    with open(os.path.join(REPO_PATH, filename), "r", encoding="utf-8", newline="\n") as f:
        lines = f.readlines()

    # Find the start of the list.
    start_index = 0
    while start_index < len(lines) and not lines[start_index].startswith(start_prompt):
        start_index += 1
    if start_index >= len(lines):
        raise ValueError(f"Could not find a line starting with {start_prompt!r} in {filename}.")
    start_index += 1

    result = []
    current_line = ""
    end_index = start_index

    while end_index < len(lines) and not lines[end_index].startswith(end_prompt):
        if lines[end_index].startswith("1."):
            # A new entry begins: flush the one accumulated so far.
            if len(current_line) > 1:
                result.append(current_line)
            current_line = lines[end_index]
        elif len(lines[end_index]) > 1:
            # Continuation line: drop the last character of the entry so far (its newline) and append the line.
            current_line = f"{current_line[:-1]} {lines[end_index].lstrip()}"
        end_index += 1
    if end_index >= len(lines):
        raise ValueError(f"Could not find a line starting with {end_prompt!r} in {filename}.")
    if len(current_line) > 1:
        result.append(current_line)

    return "".join(result)


def split_long_line_with_indent(line, max_per_line, indent):
    """
    Split the `line` so that it doesn't go over `max_per_line` and adds `indent` to new lines.

    The line is split on single spaces; a word is moved to a new line when appending it would make the current
    line longer than `max_per_line`. Every line after the first is prefixed with `indent` spaces. A single word
    longer than `max_per_line` is kept whole.
    """
    words = line.split(" ")
    lines = []
    current_line = words[0]
    for word in words[1:]:
        candidate = f"{current_line} {word}"
        if len(candidate) > max_per_line:
            lines.append(current_line)
            current_line = " " * indent + word
        else:
            current_line = candidate
    lines.append(current_line)
    return "\n".join(lines)


def convert_to_rst(model_list, max_per_line=None):
    """
    Convert `model_list` to rst format.

    Bold markdown links become `:doc:` references when they point to the released documentation and plain rst
    links otherwise, remaining markdown links become rst links, list items are renumbered from 1 and, when
    `max_per_line` is given, lines longer than that are wrapped with `split_long_line_with_indent`.
    """
    doc_prefix = "https://huggingface.co/transformers/"
    html_suffix = ".html"

    # Convert **[description](link)** to `description <link>`__
    def _rep_link(match):
        title, link = match.groups()
        # Keep hard links for the models not released yet
        if "master" in link or not link.startswith("https://huggingface.co/transformers"):
            return f"`{title} <{link}>`__"
        # Convert links to relative links otherwise
        link = link[len(doc_prefix) :]
        # Only strip the extension when it is really there, so a link without it is not truncated.
        if link.endswith(html_suffix):
            link = link[: -len(html_suffix)]
        return f":doc:`{title} <{link}>`"

    model_list = re.sub(r"\*\*\[([^\]]*)\]\(([^\)]*)\)\*\*", _rep_link, model_list)

    # Convert [description](link) to `description <link>`__
    model_list = re.sub(r"\[([^\]]*)\]\(([^\)]*)\)", r"`\1 <\2>`__", model_list)

    # Enumerate the lines properly
    lines = model_list.split("\n")
    result = []
    for i, line in enumerate(lines):
        line = re.sub(r"^\s*(\d+)\.", f"{i+1}.", line)
        # Split the lines that are too long
        if max_per_line is not None and len(line) > max_per_line:
            # Continuation lines are aligned with the text that follows the "N. " marker.
            prompt = re.search(r"^(\s*\d+\.\s+)\S", line)
            indent = len(prompt.groups()[0]) if prompt is not None else 0
            line = split_long_line_with_indent(line, max_per_line, indent)

        result.append(line)
    return "\n".join(result)


326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
def convert_to_localized_md(model_list, localized_model_list, format_str):
    """
    Convert `model_list` to each localized README.

    Args:
        model_list: the model list of the main README, one entry per line.
        localized_model_list: the model list currently in the localized README, one entry per line (may be empty).
        format_str: template used to render an entry missing from the localized list; it is formatted with
            `title`, `model_link`, `paper_affiliations`, `paper_title_link`, `paper_authors` and `supplements`.

    Returns:
        A tuple `(num_models_equal, text)`: `num_models_equal` is `False` when at least one model of `model_list`
        was missing from the localized list, and `text` is the updated localized list, sorted by lowercased title
        and ending with a newline.

    Raises:
        AttributeError: if a line of `localized_model_list` does not contain a recognizable model name.
    """

    def _rep(match):
        title, model_link, paper_affiliations, paper_title_link, paper_authors, supplements = match.groups()
        return format_str.format(
            title=title,
            model_link=model_link,
            paper_affiliations=paper_affiliations,
            paper_title_link=paper_title_link,
            paper_authors=paper_authors,
            supplements=" " + supplements.strip() if len(supplements) != 0 else "",
        )

    # This regex captures metadata from an English model description, including model title, model link,
    # affiliations of the paper, title of the paper, authors of the paper, and supplemental data (see DistilBERT for example).
    _re_capture_meta = re.compile(
        r"\*\*\[([^\]]*)\]\(([^\)]*)\)\*\* \(from ([^)]*)\)[^\[]*([^\)]*\)).*?by (.*?[A-Za-z\*]{2,}?)\. (.*)$"
    )
    # This regex is used to synchronize link.
    _re_capture_title_link = re.compile(r"\*\*\[([^\]]*)\]\(([^\)]*)\)\*\*")

    num_models_equal = True

    if len(localized_model_list) == 0:
        localized_model_index = {}
    else:
        try:
            localized_model_index = {
                re.search(r"\*\*\[([^\]]*)", line).groups()[0]: line
                for line in localized_model_list.strip().split("\n")
            }
        except AttributeError as err:
            raise AttributeError("A model name in localized READMEs cannot be recognized.") from err

    for model in model_list.strip().split("\n"):
        title, model_link = _re_capture_title_link.search(model).groups()
        if title not in localized_model_index:
            num_models_equal = False
            # Add an anchor white space behind a model description string for regex.
            # If metadata cannot be captured, the English version will be directly copied.
            localized_model_index[title] = _re_capture_meta.sub(_rep, model + " ")
        else:
            # Synchronize link. A callable replacement inserts the text literally, so a backslash in the title
            # or the link is not interpreted as a regex escape sequence.
            synced_link = f"**[{title}]({model_link})**"
            localized_model_index[title] = _re_capture_title_link.sub(
                lambda _match: synced_link, localized_model_index[title], count=1
            )

    sorted_index = sorted(localized_model_index.items(), key=lambda x: x[0].lower())

    return num_models_equal, "\n".join(map(lambda x: x[1], sorted_index)) + "\n"


Sylvain Gugger's avatar
Sylvain Gugger committed
379
380
381
382
383
384
def _find_text_in_file(filename, start_prompt, end_prompt):
    """
    Find the text in `filename` between a line beginning with `start_prompt` and before `end_prompt`, removing empty
    lines.

    Only the empty lines at the beginning and at the end of the selected span are removed.

    Returns:
        A tuple `(text, start_index, end_index, lines)` where `lines` is the whole file as a list of lines and
        `text` is `"".join(lines[start_index:end_index])`.

    Raises:
        ValueError: if `start_prompt` or `end_prompt` cannot be found in the file.
    """
    with open(filename, "r", encoding="utf-8", newline="\n") as f:
        lines = f.readlines()

    # Find the start prompt.
    start_index = 0
    while start_index < len(lines) and not lines[start_index].startswith(start_prompt):
        start_index += 1
    if start_index >= len(lines):
        raise ValueError(f"Could not find a line starting with {start_prompt!r} in {filename}.")
    start_index += 1

    end_index = start_index
    while end_index < len(lines) and not lines[end_index].startswith(end_prompt):
        end_index += 1
    if end_index >= len(lines):
        raise ValueError(f"Could not find a line starting with {end_prompt!r} in {filename}.")
    end_index -= 1

    # Trim empty lines on both sides; `end_index` is inclusive here and made exclusive right after.
    while len(lines[start_index]) <= 1:
        start_index += 1
    while len(lines[end_index]) <= 1:
        end_index -= 1
    end_index += 1
    return "".join(lines[start_index:end_index]), start_index, end_index, lines

404

Sylvain Gugger's avatar
Sylvain Gugger committed
405
def check_model_list_copy(overwrite=False, max_per_line=119):
    """
    Check the model lists in the README and index.rst are consistent and maybe `overwrite`.

    The model list of `README.md` is the reference. It is converted to rst (wrapped at `max_per_line`) and compared
    with the list found in `index.rst`, and converted for every entry of `LOCALIZED_READMES` to detect models
    missing from the localized READMEs.

    Raises:
        ValueError: when `overwrite` is `False` and the list in `index.rst` differs from the converted one, or a
            localized README is missing at least one model.
    """

    # If the introduction or the conclusion of the list change, the prompts may need to be updated.
    rst_list, start_index, end_index, lines = _find_text_in_file(
        filename=os.path.join(PATH_TO_DOCS, "index.rst"),
        start_prompt="    This list is updated automatically from the README",
        end_prompt="Supported frameworks",
    )
    md_list = get_model_list(
        filename="README.md",
        start_prompt=LOCALIZED_READMES["README.md"]["start_prompt"],
        end_prompt=LOCALIZED_READMES["README.md"]["end_prompt"],
    )

    converted_rst_list = convert_to_rst(md_list, max_per_line=max_per_line)

    converted_md_lists = []
    for filename, value in LOCALIZED_READMES.items():
        _start_prompt = value["start_prompt"]
        _end_prompt = value["end_prompt"]
        _format_model_list = value["format_model_list"]

        localized_md_list = get_model_list(filename, _start_prompt, _end_prompt)
        num_models_equal, converted_md_list = convert_to_localized_md(md_list, localized_md_list, _format_model_list)

        converted_md_lists.append((filename, num_models_equal, converted_md_list, _start_prompt, _end_prompt))

    if converted_rst_list != rst_list:
        if overwrite:
            with open(os.path.join(PATH_TO_DOCS, "index.rst"), "w", encoding="utf-8", newline="\n") as f:
                f.writelines(lines[:start_index] + [converted_rst_list] + lines[end_index:])
        else:
            raise ValueError(
                "The model list in the README changed and the list in `index.rst` has not been updated. Run "
                "`make fix-copies` to fix this."
            )

    for converted_md_list in converted_md_lists:
        filename, num_models_equal, converted_md, _start_prompt, _end_prompt = converted_md_list

        # The main README is the reference, there is nothing to synchronize for it.
        if filename == "README.md":
            continue
        if overwrite:
            _, start_index, end_index, lines = _find_text_in_file(
                filename=os.path.join(REPO_PATH, filename), start_prompt=_start_prompt, end_prompt=_end_prompt
            )
            with open(os.path.join(REPO_PATH, filename), "w", encoding="utf-8", newline="\n") as f:
                f.writelines(lines[:start_index] + [converted_md] + lines[end_index:])
        elif not num_models_equal:
            raise ValueError(
                f"The model list in the README changed and the list in `{filename}` has not been updated. Run "
                "`make fix-copies` to fix this."
            )
459

Sylvain Gugger's avatar
Sylvain Gugger committed
460

461
462
463
464
465
466
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--fix_and_overwrite", action="store_true", help="Whether to fix inconsistencies.")
    args = parser.parse_args()

    # Both checks raise on inconsistencies unless `--fix_and_overwrite` is passed, in which case they fix them.
    check_copies(args.fix_and_overwrite)
    check_full_copies(args.fix_and_overwrite)