Fix README localizer script (#17407)

bd908e9b · Sylvain Gugger · GitHub · 4d727bd2 · bd908e9b
Unverified commit bd908e9b, authored May 25, 2022 by Sylvain Gugger; committed by GitHub on May 25, 2022.
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 13 additions and 9 deletions

utils/check_copies.py utils/check_copies.py +13 -9

No files found.
--- a/utils/check_copies.py
+++ b/utils/check_copies.py
@@ -312,8 +312,6 @@ def convert_to_localized_md(model_list, localized_model_list, format_str):
    # This regex is used to synchronize link.
    _re_capture_title_link = re.compile(r"\*\*\[([^\]]*)\]\(([^\)]*)\)\*\*")

-    num_models_equal = True
-
    if len(localized_model_list) == 0:
        localized_model_index = {}
    else:
@@ -325,10 +323,16 @@ def convert_to_localized_md(model_list, localized_model_list, format_str):
        except AttributeError:
            raise AttributeError("A model name in localized READMEs cannot be recognized.")

+    model_keys = [re.search(r"\*\*\[([^\]]*)", line).groups()[0] for line in model_list.strip().split("\n")]
+
+    # We exclude keys in localized README not in the main one.
+    readmes_match = not any([k not in model_keys for k in localized_model_index])
+    localized_model_index = {k: v for k, v in localized_model_index.items() if k in model_keys}
+
    for model in model_list.strip().split("\n"):
        title, model_link = _re_capture_title_link.search(model).groups()
        if title not in localized_model_index:
-            num_models_equal = False
+            readmes_match = False
            # Add an anchor white space behind a model description string for regex.
            # If metadata cannot be captured, the English version will be directly copied.
            localized_model_index[title] = _re_capture_meta.sub(_rep, model + " ")
@@ -340,7 +344,7 @@ def convert_to_localized_md(model_list, localized_model_list, format_str):

    sorted_index = sorted(localized_model_index.items(), key=lambda x: x[0].lower())

-    return num_models_equal, "\n".join(map(lambda x: x[1], sorted_index)) + "\n"
+    return readmes_match, "\n".join(map(lambda x: x[1], sorted_index)) + "\n"


 def convert_readme_to_index(model_list):
@@ -380,7 +384,7 @@ def check_model_list_copy(overwrite=False, max_per_line=119):
    with open(os.path.join(REPO_PATH, "README.md"), "r", encoding="utf-8", newline="\n") as f:
        readme = f.read()
    new_readme = readme.replace("https://huggingface.co/transformers", "https://huggingface.co/docs/transformers")
-    new_readme = readme.replace(
+    new_readme = new_readme.replace(
        "https://huggingface.co/docs/main/transformers", "https://huggingface.co/docs/transformers/main"
    )
    if new_readme != readme:
@@ -412,9 +416,9 @@ def check_model_list_copy(overwrite=False, max_per_line=119):
        _format_model_list = value["format_model_list"]

        localized_md_list = get_model_list(filename, _start_prompt, _end_prompt)
-        num_models_equal, converted_md_list = convert_to_localized_md(md_list, localized_md_list, _format_model_list)
+        readmes_match, converted_md_list = convert_to_localized_md(md_list, localized_md_list, _format_model_list)

-        converted_md_lists.append((filename, num_models_equal, converted_md_list, _start_prompt, _end_prompt))
+        converted_md_lists.append((filename, readmes_match, converted_md_list, _start_prompt, _end_prompt))

    converted_md_list = convert_readme_to_index(md_list)
    if converted_md_list != index_list:
@@ -428,7 +432,7 @@ def check_model_list_copy(overwrite=False, max_per_line=119):
            )

    for converted_md_list in converted_md_lists:
-        filename, num_models_equal, converted_md, _start_prompt, _end_prompt = converted_md_list
+        filename, readmes_match, converted_md, _start_prompt, _end_prompt = converted_md_list

        if filename == "README.md":
            continue
@@ -438,7 +442,7 @@ def check_model_list_copy(overwrite=False, max_per_line=119):
            )
            with open(os.path.join(REPO_PATH, filename), "w", encoding="utf-8", newline="\n") as f:
                f.writelines(lines[:start_index] + [converted_md] + lines[end_index:])
-        elif not num_models_equal:
+        elif not readmes_match:
            raise ValueError(
                f"The model list in the README changed and the list in `{filename}` has not been updated. Run "
                "`make fix-copies` to fix this."