Unverified Commit 12d76241 authored by Sam Shleifer's avatar Sam Shleifer Committed by GitHub
Browse files

[marian] converter supports models from new Tatoeba project (#6342)

parent fb7330b3
MarianMT
----------------------------------------------------
**DISCLAIMER:** If you see something strange,
file a `Github Issue <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__ and assign
**Bugs:** If you see something strange,
file a `Github Issue <https://github.com/huggingface/transformers/issues/new?assignees=sshleifer&labels=&template=bug-report.md&title>`__ and assign
@sshleifer. Translations should be similar, but not identical to, output in the test set linked to in each model card.
Implementation Notes
~~~~~~~~~~~~~~~~~~~~
- Each model is about 298 MB on disk, there are 1,000+ models.
- The list of supported language pairs can be found `here <https://huggingface.co/Helsinki-NLP>`__.
- The 1,000+ models were originally trained by `Jörg Tiedemann <https://researchportal.helsinki.fi/en/persons/j%C3%B6rg-tiedemann>`__ using the `Marian <https://marian-nmt.github.io/>`_ C++ library, which supports fast training and translation.
- models were originally trained by `Jörg Tiedemann <https://researchportal.helsinki.fi/en/persons/j%C3%B6rg-tiedemann>`__ using the `Marian <https://marian-nmt.github.io/>`_ C++ library, which supports fast training and translation.
- All models are transformer encoder-decoders with 6 layers in each component. Each model's performance is documented in a model card.
- The 80 opus models that require BPE preprocessing are not supported.
- The modeling code is the same as ``BartForConditionalGeneration`` with a few minor modifications:
......
......@@ -2,9 +2,11 @@ import argparse
import json
import os
import shutil
import socket
import time
import warnings
from pathlib import Path
from typing import Dict, List, Union
from typing import Dict, List, Tuple, Union
from zipfile import ZipFile
import numpy as np
......@@ -15,6 +17,87 @@ from transformers import MarianConfig, MarianMTModel, MarianTokenizer
from transformers.hf_api import HfApi
def remove_suffix(text: str, suffix: str):
    """Return `text` with `suffix` removed from the end; `text` unchanged if absent.

    Guards against the empty suffix: the bare slice `text[: -len(suffix)]`
    evaluates to `text[:0] == ""` when `suffix == ""`, wrongly erasing the
    whole string.
    """
    if suffix and text.endswith(suffix):
        return text[: -len(suffix)]
    return text
def _process_benchmark_table_row(x):
    """Parse one markdown benchmark-table row into (testset_name, bleu, chr_f)."""
    cells = lmap(str.strip, x.replace("\t", "").split("|")[1:-1])
    assert len(cells) == 3
    name, bleu_str, chrf_str = cells
    return (name, float(bleu_str), float(chrf_str))
def process_last_benchmark_table(readme_path) -> List[Tuple[str, float, float]]:
    """Parse the final '## Benchmarks' table of an OPUS README into (testset, bleu, chr-f) rows."""
    md_content = Path(readme_path).open().read()
    # Keep only what follows the last "## Benchmarks" header, skipping the
    # table header and separator rows ([2:]).
    table_rows = md_content.split("## Benchmarks")[-1].strip().split("\n")[2:]
    return lmap(_process_benchmark_table_row, table_rows)
def check_if_models_are_dominated(old_repo_path="OPUS-MT-train/models", new_repo_path="Tatoeba-Challenge/models/"):
    """Make a blacklist for models where we have already ported the same language pair,
    and the ported model has higher BLEU score.

    Args:
        old_repo_path: path to the cloned OPUS-MT-train models directory.
        new_repo_path: path to the cloned Tatoeba-Challenge models directory.

    Returns:
        (dominated, blacklist): the comparison rows where the old model wins,
        and the corresponding list of long (3-letter) pair codes to skip.
    """
    import pandas as pd

    released_cols = [
        "url_base",
        "pair",  # (ISO639-3/ISO639-5 codes),
        "short_pair",  # (reduced codes),
        "chrF2_score",
        "bleu",
        "brevity_penalty",
        "ref_len",
        "src_name",
        "tgt_name",
    ]
    # iloc[:-1]: the last line of released-models.txt is not a data row.
    released = pd.read_csv(f"{new_repo_path}/released-models.txt", sep="\t", header=None).iloc[:-1]
    released.columns = released_cols
    old_reg = make_registry(repo_path=old_repo_path)
    old_reg = pd.DataFrame(old_reg, columns=["id", "prepro", "url_model", "url_test_set"])
    assert old_reg.id.value_counts().max() == 1  # one entry per language pair
    old_reg = old_reg.set_index("id")
    released["fname"] = released["url_base"].apply(
        lambda x: remove_suffix(remove_prefix(x, "https://object.pouta.csc.fi/Tatoeba-Challenge/opus"), ".zip")
    )
    released["2m"] = released.fname.str.startswith("2m")
    released["date"] = pd.to_datetime(released["fname"].apply(lambda x: remove_prefix(remove_prefix(x, "2m-"), "-")))
    # BUGFIX: DataFrame has no `dsort` method (would raise AttributeError unless
    # monkey-patched).  Sort newest-first so drop_duplicates(keep="first")
    # retains the most recent release per short_pair.
    newest_released = released.sort_values("date", ascending=False).drop_duplicates(["short_pair"], keep="first")
    short_to_new_bleu = newest_released.set_index("short_pair").bleu
    assert released.groupby("short_pair").pair.nunique().max() == 1  # short code maps to a unique long code
    short_to_long = released.groupby("short_pair").pair.first().to_dict()

    overlap_short = old_reg.index.intersection(released.short_pair.unique())
    overlap_long = [short_to_long[o] for o in overlap_short]
    new_reported_bleu = [short_to_new_bleu[o] for o in overlap_short]

    def get_old_bleu(o) -> float:
        # Read the old model's benchmark table; prefer its Tatoeba test-set BLEU.
        pat = old_repo_path + "/{}/README.md"
        bm_data = process_last_benchmark_table(pat.format(o))
        tab = pd.DataFrame(bm_data, columns=["testset", "bleu", "chr-f"])
        tato_bleu = tab.loc[lambda x: x.testset.str.startswith("Tato")].bleu
        if tato_bleu.shape[0] > 0:
            return tato_bleu.iloc[0]
        else:
            return np.nan  # no Tatoeba benchmark row

    old_bleu = [get_old_bleu(o) for o in overlap_short]
    # fillna(-1): a missing old score can never dominate a reported new score.
    cmp_df = pd.DataFrame(
        dict(short=overlap_short, long=overlap_long, old_bleu=old_bleu, new_bleu=new_reported_bleu)
    ).fillna(-1)

    dominated = cmp_df[cmp_df.old_bleu > cmp_df.new_bleu]
    blacklist = dominated.long.unique().tolist()  # 3 letter codes
    return dominated, blacklist
def remove_prefix(text: str, prefix: str):
    """Return `text` with a leading `prefix` removed; `text` unchanged otherwise."""
    if text.startswith(prefix):
        return text[len(prefix) :]
    # Fallback was missing: without it the function returned None whenever the
    # prefix was absent, breaking callers that chain it (e.g. fname parsing).
    return text
......@@ -149,37 +232,87 @@ def convert_hf_name_to_opus_name(hf_model_name):
return remove_prefix(opus_w_prefix, "opus-mt-")
def get_system_metadata(repo_root):
    """Collect provenance info for a port: git SHAs, host name, and timestamp."""
    import git

    helsinki_sha = git.Repo(path=repo_root, search_parent_directories=True).head.object.hexsha
    transformers_sha = git.Repo(path=".", search_parent_directories=True).head.object.hexsha
    return dict(
        helsinki_git_sha=helsinki_sha,
        transformers_git_sha=transformers_sha,
        port_machine=socket.gethostname(),
        port_time=time.strftime("%Y-%m-%d-%H:%M"),
    )
# YAML front-matter template for Hugging Face model cards.  The `{}` slot is
# filled with a language code (see the write_model_card usage:
# front_matter.format(metadata["src_alpha2"])).
front_matter = """---
language: {}
tags:
- translation
license: apache-2.0
---
"""
def write_model_card(
    hf_model_name: str,
    repo_root="OPUS-MT-train",
    save_dir=Path("marian_converted"),
    dry_run=False,
    extra_metadata=None,
) -> Tuple[str, dict]:
    """Copy the most recent model's readme section from opus, and add metadata.

    Args:
        hf_model_name: e.g. "opus-mt-en-de" (an ORG_NAME prefix is stripped if present).
        repo_root: which cloned Helsinki-NLP repo the readmes live under
            ("OPUS-MT-train" or "Tatoeba-Challenge").
        save_dir: parent directory for the written model-card sub-directory.
        dry_run: if True, build the card but write nothing to disk.
        extra_metadata: extra key/values merged into the card's metadata dict.

    Returns:
        (content, metadata): the rendered README text and the metadata dict.

    upload command: aws s3 sync model_card_dir s3://models.huggingface.co/bert/Helsinki-NLP/ --dryrun
    """
    import pandas as pd

    if extra_metadata is None:  # avoid the shared-mutable-default pitfall
        extra_metadata = {}
    hf_model_name = remove_prefix(hf_model_name, ORG_NAME)
    opus_name: str = convert_hf_name_to_opus_name(hf_model_name)
    assert repo_root in ("OPUS-MT-train", "Tatoeba-Challenge")
    opus_readme_path = Path(repo_root).joinpath("models", opus_name, "README.md")
    assert opus_readme_path.exists(), f"Readme file {opus_readme_path} not found"

    opus_src, opus_tgt = [x.split("+") for x in opus_name.split("-")]

    readme_url = f"https://github.com/Helsinki-NLP/{repo_root}/tree/master/models/{opus_name}/README.md"

    s, t = ",".join(opus_src), ",".join(opus_tgt)

    metadata = {
        "hf_name": hf_model_name,
        "source_languages": s,
        "target_languages": t,
        "opus_readme_url": readme_url,
        "original_repo": repo_root,
        "tags": ["translation"],
    }
    metadata.update(extra_metadata)
    metadata.update(get_system_metadata(repo_root))

    # combine with opus markdown
    # NOTE(review): src_name / tgt_name / src_alpha2 are not set above, so they
    # must arrive via extra_metadata (e.g. from the Tatoeba released-models
    # table) — confirm all callers supply them.
    extra_markdown = (
        f"### {hf_model_name}\n\n* source group: {metadata['src_name']} \n"
        f"* target group: {metadata['tgt_name']} \n* OPUS readme: [{opus_name}]({readme_url})\n"
    )

    content = opus_readme_path.open().read()
    content = content.split("\n# ")[-1]  # Get the lowest level 1 header in the README -- the most recent model.
    splat = content.split("*")[2:]
    print(splat[3])  # NOTE(review): looks like leftover debug output — confirm intended
    content = "*".join(splat)
    content = (
        front_matter.format(metadata["src_alpha2"])
        + extra_markdown
        + "\n* "
        + content.replace("download", "download original weights")
    )
    items = "\n\n".join([f"- {k}: {v}" for k, v in metadata.items()])
    sec3 = "\n### System Info: \n" + items
    content += sec3
    if dry_run:
        return content, metadata
    # Save the card and its metadata under save_dir/opus-mt-<name>/
    sub_dir = save_dir / f"opus-mt-{hf_model_name}"
    sub_dir.mkdir(exist_ok=True)
    dest = sub_dir / "README.md"
    dest.open("w").write(content)
    pd.Series(metadata).to_json(sub_dir / "metadata.json")
    return content, metadata
def get_clean_model_id_mapping(multiling_model_ids):
......@@ -193,7 +326,7 @@ def make_registry(repo_path="Opus-MT-train/models"):
"You must run: git clone git@github.com:Helsinki-NLP/Opus-MT-train.git before calling."
)
results = {}
for p in Path(repo_path).ls():
for p in Path(repo_path).iterdir():
n_dash = p.name.count("-")
if n_dash == 0:
continue
......@@ -203,6 +336,21 @@ def make_registry(repo_path="Opus-MT-train/models"):
return [(k, v["pre-processing"], v["download"], v["download"][:-4] + ".test.txt") for k, v in results.items()]
def make_tatoeba_registry(repo_path="Tatoeba-Challenge/models"):
    """Build a registry of Tatoeba-Challenge models from their READMEs.

    Args:
        repo_path: path to the cloned Tatoeba-Challenge models directory.

    Returns:
        List of (model_id, pre_processing, download_url, test_set_url) tuples.

    Raises:
        ValueError: if the repo has not been cloned at repo_path.
    """
    if not (Path(repo_path) / "zho-eng" / "README.md").exists():
        raise ValueError(
            f"repo_path:{repo_path} does not exist: "
            "You must run: git clone git@github.com:Helsinki-NLP/Tatoeba-Challenge.git before calling."
        )
    results = {}
    for p in Path(repo_path).iterdir():
        # Model directories are named like "zho-eng" (3 + 1 + 3 = 7 chars);
        # skip anything else (top-level files, etc.).
        if len(p.name) != 7:
            continue
        # BUGFIX: the original `list(open(...).readlines())` leaked the file
        # handle and wrapped an already-list result in list().
        with open(p / "README.md") as f:
            lns = f.readlines()
        results[p.name] = _parse_readme(lns)
    return [(k, v["pre-processing"], v["download"], v["download"][:-4] + ".test.txt") for k, v in results.items()]
def convert_all_sentencepiece_models(model_list=None, repo_path=None):
"""Requires 300GB"""
save_dir = Path("marian_ckpt")
......@@ -516,19 +664,6 @@ def convert(source_dir: Path, dest_dir):
model.from_pretrained(dest_dir) # sanity check
# NOTE(review): this CLI entry point appears a second time further down in this
# diff view (with an added bulk-conversion docstring) — diff residue; confirm
# only one copy survives in the final file.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument("--src", type=str, help="path to marian model dir", default="en-de")
    parser.add_argument("--dest", type=str, default=None, help="Path to the output PyTorch model.")
    args = parser.parse_args()

    source_dir = Path(args.src)
    assert source_dir.exists(), f"Source directory {source_dir} not found"
    # Default destination: converted-<model-dir-name> in the working directory.
    dest_dir = f"converted-{source_dir.name}" if args.dest is None else args.dest
    convert(source_dir, dest_dir)
def load_yaml(path):
import yaml
......@@ -544,3 +679,23 @@ def save_json(content: Union[Dict, List], path: str) -> None:
def unzip(zip_path: str, dest_dir: str) -> None:
    """Extract every member of the archive at `zip_path` into `dest_dir`."""
    archive = ZipFile(zip_path, "r")
    try:
        archive.extractall(dest_dir)
    finally:
        archive.close()
if __name__ == "__main__":
    """
    To bulk convert, run
    >>> from transformers.convert_marian_to_pytorch import make_tatoeba_registry, convert_all_sentencepiece_models
    >>> reg = make_tatoeba_registry()
    >>> convert_all_sentencepiece_models(model_list=reg)  # saves to marian_converted
    (bash) aws s3 sync marian_converted s3://models.huggingface.co/bert/Helsinki-NLP/ --dryrun
    """
    # CLI for converting a single Marian checkpoint directory.
    arg_parser = argparse.ArgumentParser()
    # Required parameters
    arg_parser.add_argument("--src", type=str, help="path to marian model dir", default="en-de")
    arg_parser.add_argument("--dest", type=str, default=None, help="Path to the output PyTorch model.")
    cli_args = arg_parser.parse_args()

    marian_source = Path(cli_args.src)
    assert marian_source.exists(), f"Source directory {marian_source} not found"
    if cli_args.dest is None:
        output_dir = f"converted-{marian_source.name}"
    else:
        output_dir = cli_args.dest
    convert(marian_source, output_dir)
......@@ -205,6 +205,17 @@ class TestMarian_MT_EN(MarianIntegrationTest):
self._assert_generated_batch_equal_expected()
class TestMarian_eng_zho(MarianIntegrationTest):
    """Integration test for the Helsinki-NLP Tatoeba eng->zho checkpoint."""

    # Source/target language codes and the fixture sentence with its expected
    # translation, consumed by the MarianIntegrationTest harness.
    src = "eng"
    tgt = "zho"
    src_text = ["My name is Wolfgang and I live in Berlin"]
    expected_text = ["我叫沃尔夫冈 我住在柏林"]

    @slow
    def test_batch_generation_eng_zho(self):
        # Delegates to the shared harness: translate src_text and compare
        # against expected_text.
        self._assert_generated_batch_equal_expected()
class TestMarian_en_ROMANCE(MarianIntegrationTest):
"""Multilingual on target side."""
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment