Unverified commit 7051b892, authored by Sam Hardwick, committed by GitHub

Update Tatoeba conversion (#13757)

* Update Tatoeba conversion
parent 12b4d66a
@@ -13,31 +13,32 @@
 # limitations under the License.
 import argparse
+import datetime
+import json
 import os
+import re
 from pathlib import Path
-from typing import List, Tuple
+from typing import Tuple
+
+from tqdm import tqdm
+import yaml
 
 from transformers.models.marian.convert_marian_to_pytorch import (
     FRONT_MATTER_TEMPLATE,
-    _parse_readme,
-    convert_all_sentencepiece_models,
+    convert,
+    convert_opus_name_to_hf_name,
+    download_and_unzip,
     get_system_metadata,
-    remove_prefix,
-    remove_suffix,
 )
 
-try:
-    import pandas as pd
-except ImportError:
-    pass
-
 DEFAULT_REPO = "Tatoeba-Challenge"
 DEFAULT_MODEL_DIR = os.path.join(DEFAULT_REPO, "models")
 LANG_CODE_URL = "https://datahub.io/core/language-codes/r/language-codes-3b2.csv"
 ISO_URL = "https://cdn-datasets.huggingface.co/language_codes/iso-639-3.csv"
 ISO_PATH = "lang_code_data/iso-639-3.csv"
 LANG_CODE_PATH = "lang_code_data/language-codes-3b2.csv"
+TATOEBA_MODELS_URL = "https://object.pouta.csc.fi/Tatoeba-MT-models"
 
 
 class TatoebaConverter:
@@ -46,194 +47,236 @@ class TatoebaConverter:
     Steps:
 
-        1. convert numpy state dict to hf format (same code as OPUS-MT-Train conversion).
-        2. rename opus model to huggingface format. This means replace each alpha3 code with an alpha2 code if a unique
+        1. Convert numpy state dict to hf format (same code as OPUS-MT-Train conversion).
+        2. Rename opus model to huggingface format. This means replace each alpha3 code with an alpha2 code if a unique
            one exists. e.g. aav-eng -> aav-en, heb-eng -> he-en
-        3. write a model card containing the original Tatoeba-Challenge/README.md and extra info about alpha3 group
-           members.
+        3. Select the best model for a particular pair, parse the yml for it and write a model card. By default the
+           best model is the one listed first in released-model-results, but it's also possible to specify the most
+           recent one.
     """
 
     def __init__(self, save_dir="marian_converted"):
         assert Path(DEFAULT_REPO).exists(), "need git clone git@github.com:Helsinki-NLP/Tatoeba-Challenge.git"
-        reg = self.make_tatoeba_registry()
-        self.download_metadata()
-        self.registry = reg
-        reg_df = pd.DataFrame(reg, columns=["id", "prepro", "url_model", "url_test_set"])
-        assert reg_df.id.value_counts().max() == 1
-        reg_df = reg_df.set_index("id")
-        reg_df["src"] = reg_df.reset_index().id.apply(lambda x: x.split("-")[0]).values
-        reg_df["tgt"] = reg_df.reset_index().id.apply(lambda x: x.split("-")[1]).values
-
-        released_cols = [
-            "url_base",
-            "pair",  # (ISO639-3/ISO639-5 codes),
-            "short_pair",  # (reduced codes),
-            "chrF2_score",
-            "bleu",
-            "brevity_penalty",
-            "ref_len",
-            "src_name",
-            "tgt_name",
-        ]
-        released = pd.read_csv("Tatoeba-Challenge/models/released-models.txt", sep="\t", header=None).iloc[:-1]
-        released.columns = released_cols
-        released["fname"] = released["url_base"].apply(
-            lambda x: remove_suffix(remove_prefix(x, "https://object.pouta.csc.fi/Tatoeba-Challenge/opus"), ".zip")
-        )
-        released["2m"] = released.fname.str.startswith("2m")
-        released["date"] = pd.to_datetime(
-            released["fname"].apply(lambda x: remove_prefix(remove_prefix(x, "2m-"), "-"))
-        )
-        released["base_ext"] = released.url_base.apply(lambda x: Path(x).name)
-        reg_df["base_ext"] = reg_df.url_model.apply(lambda x: Path(x).name)
-        metadata_new = reg_df.reset_index().merge(released.rename(columns={"pair": "id"}), on=["base_ext", "id"])
-        metadata_renamer = {"src": "src_alpha3", "tgt": "tgt_alpha3", "id": "long_pair", "date": "train_date"}
-        metadata_new = metadata_new.rename(columns=metadata_renamer)
-        metadata_new["src_alpha2"] = metadata_new.short_pair.apply(lambda x: x.split("-")[0])
-        metadata_new["tgt_alpha2"] = metadata_new.short_pair.apply(lambda x: x.split("-")[1])
-        DROP_COLS_BOTH = ["url_base", "base_ext", "fname"]
-        metadata_new = metadata_new.drop(DROP_COLS_BOTH, 1)
-        metadata_new["prefer_old"] = metadata_new.long_pair.isin([])
-        self.metadata = metadata_new
-        assert self.metadata.short_pair.value_counts().max() == 1, "Multiple metadata entries for a short pair"
-        self.metadata = self.metadata.set_index("short_pair")
-
-        # wget.download(LANG_CODE_URL)
-        mapper = pd.read_csv(LANG_CODE_PATH)
-        mapper.columns = ["a3", "a2", "ref"]
-        self.iso_table = pd.read_csv(ISO_PATH, sep="\t").rename(columns=lambda x: x.lower())
-        more_3_to_2 = self.iso_table.set_index("id").part1.dropna().to_dict()
-        more_3_to_2.update(mapper.set_index("a3").a2.to_dict())
-        self.alpha3_to_alpha2 = more_3_to_2
+        self.download_lang_info()
+        self.model_results = json.load(open("Tatoeba-Challenge/models/released-model-results.json"))
+        self.alpha3_to_alpha2 = {}
+        for line in open(ISO_PATH):
+            parts = line.split("\t")
+            if len(parts[0]) == 3 and len(parts[3]) == 2:
+                self.alpha3_to_alpha2[parts[0]] = parts[3]
+        for line in open(LANG_CODE_PATH):
+            parts = line.split(",")
+            if len(parts[0]) == 3 and len(parts[1]) == 2:
+                self.alpha3_to_alpha2[parts[0]] = parts[1]
         self.model_card_dir = Path(save_dir)
-        self.constituents = GROUP_MEMBERS
+        self.tag2name = {}
+        for key, value in GROUP_MEMBERS.items():
+            self.tag2name[key] = value[0]
 
     def convert_models(self, tatoeba_ids, dry_run=False):
-        entries_to_convert = [x for x in self.registry if x[0] in tatoeba_ids]
-        converted_paths = convert_all_sentencepiece_models(entries_to_convert, dest_dir=self.model_card_dir)
-
-        for path in converted_paths:
-            long_pair = remove_prefix(path.name, "opus-mt-").split("-")  # eg. heb-eng
-            assert len(long_pair) == 2
-            new_p_src = self.get_two_letter_code(long_pair[0])
-            new_p_tgt = self.get_two_letter_code(long_pair[1])
-            hf_model_id = f"opus-mt-{new_p_src}-{new_p_tgt}"
-            new_path = path.parent.joinpath(hf_model_id)  # opus-mt-he-en
-            os.rename(str(path), str(new_path))
-            self.write_model_card(hf_model_id, dry_run=dry_run)
-
-    def get_two_letter_code(self, three_letter_code):
-        return self.alpha3_to_alpha2.get(three_letter_code, three_letter_code)
+        models_to_convert = [self.parse_metadata(x) for x in tatoeba_ids]
+        save_dir = Path("marian_ckpt")
+        dest_dir = Path(self.model_card_dir)
+        dest_dir.mkdir(exist_ok=True)
+        for model in tqdm(models_to_convert):  # k, prepro, download, test_set_url in tqdm(model_list):
+            if "SentencePiece" not in model["pre-processing"]:
+                print(f"Skipping {model['release']} because it doesn't appear to use SentencePiece")
+                continue
+            if not os.path.exists(save_dir / model["_name"]):
+                download_and_unzip(f"{TATOEBA_MODELS_URL}/{model['release']}", save_dir / model["_name"])
+            # from convert_marian_to_pytorch
+            opus_language_groups_to_hf = convert_opus_name_to_hf_name
+            pair_name = opus_language_groups_to_hf(model["_name"])
+            convert(save_dir / model["_name"], dest_dir / f"opus-mt-{pair_name}")
+            self.write_model_card(model, dry_run=dry_run)
 
     def expand_group_to_two_letter_codes(self, grp_name):
-        return [self.get_two_letter_code(x) for x in self.constituents[grp_name]]
+        return [self.alpha3_to_alpha2.get(x, x) for x in GROUP_MEMBERS[grp_name][1]]
+
+    def is_group(self, code, name):
+        return "languages" in name or len(GROUP_MEMBERS.get(code, [])) > 1
 
-    def get_tags(self, code, ref_name):
+    def get_tags(self, code, name):
         if len(code) == 2:
-            assert "languages" not in ref_name, f"{code}: {ref_name}"
-            return [code], False
-        elif "languages" in ref_name or len(self.constituents.get(code, [])) > 1:
+            assert "languages" not in name, f"{code}: {name}"
+            return [code]
+        elif self.is_group(code, name):
             group = self.expand_group_to_two_letter_codes(code)
             group.append(code)
-            return group, True
+            return group
         else:  # zho-> zh
             print(f"Three letter monolingual code: {code}")
-            return [code], False
+            return [code]
 
-    def resolve_lang_code(self, r) -> Tuple[List[str], str, str]:
-        """R is a row in ported"""
-        short_pair = r.short_pair
-        src, tgt = short_pair.split("-")
-        src_tags, src_multilingual = self.get_tags(src, r.src_name)
-        assert isinstance(src_tags, list)
-        tgt_tags, tgt_multilingual = self.get_tags(tgt, r.tgt_name)
-        assert isinstance(tgt_tags, list)
-
-        return dedup(src_tags + tgt_tags), src_multilingual, tgt_multilingual
+    def resolve_lang_code(self, src, tgt) -> Tuple[str, str]:
+        src_tags = self.get_tags(src, self.tag2name[src])
+        tgt_tags = self.get_tags(tgt, self.tag2name[tgt])
+        return src_tags, tgt_tags
+
+    @staticmethod
+    def model_type_info_from_model_name(name):
+        info = {"_has_backtranslated_data": False}
+        if "1m" in name:
+            info["_data_per_pair"] = str(1e6)
+        if "2m" in name:
+            info["_data_per_pair"] = str(2e6)
+        if "4m" in name:
+            info["_data_per_pair"] = str(4e6)
+        if "+bt" in name:
+            info["_has_backtranslated_data"] = True
+        if "tuned4" in name:
+            info["_tuned"] = re.search(r"tuned4[^-]+", name).group()
+        return info
 
-    def write_model_card(
-        self,
-        hf_model_id: str,
-        repo_root=DEFAULT_REPO,
-        dry_run=False,
-    ) -> str:
+    def write_model_card(self, model_dict, dry_run=False) -> str:
         """
-        Copy the most recent model's readme section from opus, and add metadata. upload command: aws s3 sync
-        model_card_dir s3://models.huggingface.co/bert/Helsinki-NLP/ --dryrun
+        Construct card from data parsed from YAML and the model's name. upload command: aws s3 sync model_card_dir
+        s3://models.huggingface.co/bert/Helsinki-NLP/ --dryrun
         """
-        short_pair = remove_prefix(hf_model_id, "opus-mt-")
-        extra_metadata = self.metadata.loc[short_pair].drop("2m")
-        extra_metadata["short_pair"] = short_pair
-        lang_tags, src_multilingual, tgt_multilingual = self.resolve_lang_code(extra_metadata)
-        opus_name = f"{extra_metadata.src_alpha3}-{extra_metadata.tgt_alpha3}"
-        # opus_name: str = self.convert_hf_name_to_opus_name(hf_model_name)
-        assert repo_root in ("OPUS-MT-train", "Tatoeba-Challenge")
-        opus_readme_path = Path(repo_root).joinpath("models", opus_name, "README.md")
-        assert opus_readme_path.exists(), f"Readme file {opus_readme_path} not found"
-
-        opus_src, opus_tgt = [x.split("+") for x in opus_name.split("-")]
-
-        readme_url = f"https://github.com/Helsinki-NLP/{repo_root}/tree/master/models/{opus_name}/README.md"
-
-        s, t = ",".join(opus_src), ",".join(opus_tgt)
+        model_dir_url = f"{TATOEBA_MODELS_URL}/{model_dict['release']}"
+        long_pair = model_dict["_name"].split("-")
+        assert len(long_pair) == 2, f"got a translation pair {model_dict['_name']} that doesn't appear to be a pair"
+        short_src = self.alpha3_to_alpha2.get(long_pair[0], long_pair[0])
+        short_tgt = self.alpha3_to_alpha2.get(long_pair[1], long_pair[1])
+        model_dict["_hf_model_id"] = f"opus-mt-{short_src}-{short_tgt}"
+
+        a3_src, a3_tgt = model_dict["_name"].split("-")
+        # opus_src_tags, opus_tgt_tags = a3_src.split("+"), a3_tgt.split("+")
+
+        # This messy part tries to deal with language tags in multilingual models, possibly
+        # not all having three-letter codes
+        resolved_src_tags, resolved_tgt_tags = self.resolve_lang_code(a3_src, a3_tgt)
+        a2_src_tags, a2_tgt_tags = [], []
+        for tag in resolved_src_tags:
+            if tag not in self.alpha3_to_alpha2:
+                a2_src_tags.append(tag)
+        for tag in resolved_tgt_tags:
+            if tag not in self.alpha3_to_alpha2:
+                a2_tgt_tags.append(tag)
+
+        lang_tags = dedup(a2_src_tags + a2_tgt_tags)
+        src_multilingual, tgt_multilingual = (len(a2_src_tags) > 1), (len(a2_tgt_tags) > 1)
+        s, t = ",".join(a2_src_tags), ",".join(a2_tgt_tags)
 
         metadata = {
-            "hf_name": short_pair,
+            "hf_name": model_dict["_name"],
             "source_languages": s,
             "target_languages": t,
-            "opus_readme_url": readme_url,
-            "original_repo": repo_root,
+            "opus_readme_url": f"{model_dir_url}/README.md",
+            "original_repo": "Tatoeba-Challenge",
             "tags": ["translation"],
             "languages": lang_tags,
         }
         lang_tags = l2front_matter(lang_tags)
-        metadata["src_constituents"] = self.constituents[s]
-        metadata["tgt_constituents"] = self.constituents[t]
+
+        metadata["src_constituents"] = list(GROUP_MEMBERS[a3_src][1])
+        metadata["tgt_constituents"] = list(GROUP_MEMBERS[a3_tgt][1])
         metadata["src_multilingual"] = src_multilingual
         metadata["tgt_multilingual"] = tgt_multilingual
-        metadata.update(extra_metadata)
-        metadata.update(get_system_metadata(repo_root))
 
-        # combine with Tatoeba markdown
+        backtranslated_data = ""
+        if model_dict["_has_backtranslated_data"]:
+            backtranslated_data = " with backtranslations"
+
+        multilingual_data = ""
+        if "_data_per_pair" in model_dict:
+            multilingual_data = f"* data per pair in multilingual model: {model_dict['_data_per_pair']}\n"
+
+        tuned = ""
+        if "_tuned" in model_dict:
+            tuned = f"* multilingual model tuned for: {model_dict['_tuned']}\n"
+
+        model_base_filename = model_dict["release"].split("/")[-1]
+        download = f"* download original weights: [{model_base_filename}]({model_dir_url}/{model_dict['release']})\n"
+
+        langtoken = ""
+        if tgt_multilingual:
+            langtoken = (
+                "* a sentence-initial language token is required in the form of >>id<<"
+                "(id = valid, usually three-letter target language ID)\n"
+            )
+
+        metadata.update(get_system_metadata(DEFAULT_REPO))
+
+        scorestable = ""
+        for k, v in model_dict.items():
+            if "scores" in k:
+                this_score_table = f"* {k}\n|Test set|score|\n|---|---|\n"
+                pairs = sorted(v.items(), key=lambda x: x[1], reverse=True)
+                for pair in pairs:
+                    this_score_table += f"|{pair[0]}|{pair[1]}|\n"
+                scorestable += this_score_table
+
+        datainfo = ""
+        if "training-data" in model_dict:
+            datainfo += "* Training data: \n"
+            for k, v in model_dict["training-data"].items():
+                datainfo += f"  * {str(k)}: {str(v)}\n"
+        if "validation-data" in model_dict:
+            datainfo += "* Validation data: \n"
+            for k, v in model_dict["validation-data"].items():
+                datainfo += f"  * {str(k)}: {str(v)}\n"
+        if "test-data" in model_dict:
+            datainfo += "* Test data: \n"
+            for k, v in model_dict["test-data"].items():
+                datainfo += f"  * {str(k)}: {str(v)}\n"
 
-        extra_markdown = f"### {short_pair}\n\n* source group: {metadata['src_name']} \n* target group: {metadata['tgt_name']} \n* OPUS readme: [{opus_name}]({readme_url})\n"
-
-        content = opus_readme_path.open().read()
-        content = content.split("\n# ")[-1]  # Get the lowest level 1 header in the README -- the most recent model.
-        splat = content.split("*")[2:]
-
-        content = "*".join(splat)
-        # BETTER FRONT MATTER LOGIC
+        testsetfilename = model_dict["release"].replace(".zip", ".test.txt")
+        testscoresfilename = model_dict["release"].replace(".zip", ".eval.txt")
+        testset = f"* test set translations file: [test.txt]({model_dir_url}/{testsetfilename})\n"
+        testscores = f"* test set scores file: [eval.txt]({model_dir_url}/{testscoresfilename})\n"
 
+        # combine with Tatoeba markdown
+        readme_url = f"{TATOEBA_MODELS_URL}/{model_dict['_name']}/README.md"
+        extra_markdown = f"""
+### {model_dict['_name']}
+
+* source language name: {self.tag2name[a3_src]}
+* target language name: {self.tag2name[a3_tgt]}
+* OPUS readme: [README.md]({readme_url})
+"""
         content = (
-            FRONT_MATTER_TEMPLATE.format(lang_tags)
-            + extra_markdown
-            + "\n* "
-            + content.replace("download", "download original " "weights")
+            f"""
+* model: {model_dict['modeltype']}
+* source language code{src_multilingual*'s'}: {', '.join(a2_src_tags)}
+* target language code{tgt_multilingual*'s'}: {', '.join(a2_tgt_tags)}
+* dataset: opus {backtranslated_data}
+* release date: {model_dict['release-date']}
+* pre-processing: {model_dict['pre-processing']}
+"""
+            + multilingual_data
+            + tuned
+            + download
+            + langtoken
+            + datainfo
+            + testset
+            + testscores
+            + scorestable
         )
-        items = "\n\n".join([f"- {k}: {v}" for k, v in metadata.items()])
+        content = FRONT_MATTER_TEMPLATE.format(lang_tags) + extra_markdown + content
+
+        items = "\n".join([f"* {k}: {v}" for k, v in metadata.items()])
         sec3 = "\n### System Info: \n" + items
         content += sec3
         if dry_run:
-            return content, metadata
-        sub_dir = self.model_card_dir / hf_model_id
+            print("CONTENT:")
+            print(content)
+            print("METADATA:")
+            print(metadata)
+            return
+        sub_dir = self.model_card_dir / model_dict["_hf_model_id"]
         sub_dir.mkdir(exist_ok=True)
         dest = sub_dir / "README.md"
         dest.open("w").write(content)
-        pd.Series(metadata).to_json(sub_dir / "metadata.json")
-        return content, metadata
+        for k, v in metadata.items():
+            if isinstance(v, datetime.date):
+                metadata[k] = datetime.datetime.strftime(v, "%Y-%m-%d")
+        with open(sub_dir / "metadata.json", "w", encoding="utf-8") as writeobj:
+            json.dump(metadata, writeobj)
 
-    def download_metadata(self):
+    def download_lang_info(self):
         Path(LANG_CODE_PATH).parent.mkdir(exist_ok=True)
         import wget
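
A minimal driver sketch for the new flow, under stated assumptions: the import path `convert_marian_tatoeba_to_pytorch` is a guess (the script's own `__main__`/CLI handling is outside this diff), and the Tatoeba-Challenge repo must be cloned alongside it, as the constructor asserts. `parse_metadata` is defined in the next hunk.

# Hypothetical driver; the class API is taken from the diff, the module path is assumed.
from convert_marian_tatoeba_to_pytorch import TatoebaConverter

converter = TatoebaConverter(save_dir="marian_converted")

# method="best" takes the release listed first in released-model-results.json;
# method="newest" would instead sort the pair's .yml files by the date in their names.
meta = converter.parse_metadata("heb-eng", method="best")
print(meta["release"], meta.get("_data_per_pair"))

# Downloads and converts the checkpoint; dry_run=True prints the model card and
# metadata instead of writing marian_converted/opus-mt-he-en/README.md + metadata.json.
converter.convert_models(["heb-eng"], dry_run=True)
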
@@ -242,20 +285,35 @@ class TatoebaConverter:
         if not os.path.exists(LANG_CODE_PATH):
             wget.download(LANG_CODE_URL, LANG_CODE_PATH)
 
-    @staticmethod
-    def make_tatoeba_registry(repo_path=DEFAULT_MODEL_DIR):
-        if not (Path(repo_path) / "zho-eng" / "README.md").exists():
-            raise ValueError(
-                f"repo_path:{repo_path} does not exist: "
-                "You must run: git clone git@github.com:Helsinki-NLP/Tatoeba-Challenge.git before calling."
-            )
-        results = {}
-        for p in Path(repo_path).iterdir():
-            if len(p.name) != 7:
-                continue
-            lns = list(open(p / "README.md").readlines())
-            results[p.name] = _parse_readme(lns)
-        return [(k, v["pre-processing"], v["download"], v["download"][:-4] + ".test.txt") for k, v in results.items()]
+    def parse_metadata(self, model_name, repo_path=DEFAULT_MODEL_DIR, method="best"):
+        p = Path(repo_path) / model_name
+
+        def url_to_name(url):
+            return url.split("/")[-1].split(".")[0]
+
+        if model_name not in self.model_results:
+            # This is not a language pair, so model results are ambiguous, go by newest
+            method = "newest"
+
+        if method == "best":
+            # Sort by how early they appear in released-model-results
+            results = [url_to_name(model["download"]) for model in self.model_results[model_name]]
+            ymls = [f for f in os.listdir(p) if f.endswith(".yml") and f[:-4] in results]
+            ymls.sort(key=lambda x: results.index(x[:-4]))
+            metadata = yaml.safe_load(open(p / ymls[0]))
+            metadata.update(self.model_type_info_from_model_name(ymls[0][:-4]))
+        elif method == "newest":
+            ymls = [f for f in os.listdir(p) if f.endswith(".yml")]
+            # Sort by date
+            ymls.sort(
+                key=lambda x: datetime.datetime.strptime(re.search(r"\d\d\d\d-\d\d?-\d\d?", x).group(), "%Y-%m-%d")
+            )
+            metadata = yaml.safe_load(open(p / ymls[-1]))
+            metadata.update(self.model_type_info_from_model_name(ymls[-1][:-4]))
+        else:
+            raise NotImplementedError(f"Don't know argument method='{method}' to parse_metadata()")
+        metadata["_name"] = model_name
+        return metadata
 
 
 GROUP_MEMBERS = {
@@ -1248,9 +1306,7 @@ def dedup(lst):
     """Preserves order"""
     new_lst = []
     for item in lst:
-        if not item:
-            continue
-        elif item in new_lst:
+        if not item or item in new_lst:
             continue
         else:
             new_lst.append(item)
...
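
The "best" vs. "newest" policy in parse_metadata above is easy to sanity-check in isolation. A minimal sketch with invented .yml names (they only need to carry the YYYY-MM-DD date the regex looks for); the "best" ranking below stands in for the ordering in released-model-results.json:

import datetime
import re

# Invented .yml listing for one model directory.
ymls = ["opus-2020-07-03.yml", "opus+bt-2021-04-10.yml", "opus-2021-02-18.yml"]

# method="newest": sort by the date embedded in the filename, take the last.
newest = sorted(
    ymls, key=lambda x: datetime.datetime.strptime(re.search(r"\d\d\d\d-\d\d?-\d\d?", x).group(), "%Y-%m-%d")
)[-1]
assert newest == "opus+bt-2021-04-10.yml"

# method="best": order by position in released-model-results.json (ranking invented
# here), take the first. The "+bt" suffix is what model_type_info_from_model_name
# turns into _has_backtranslated_data=True.
results = ["opus-2021-02-18", "opus+bt-2021-04-10", "opus-2020-07-03"]
best = sorted((f for f in ymls if f[:-4] in results), key=lambda x: results.index(x[:-4]))[0]
assert best == "opus-2021-02-18.yml"
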