Commit f046d331 authored by Muennighoff

Add space tokenization for JA/ZH

parent 198ca732
@@ -3,6 +3,11 @@ from pprint import pprint
from sacrebleu import sacrebleu
from lm_eval import metrics
from lm_eval.base import Task, rf
from typing import List
import jieba
import nagisa
"""
This file implements translation tasks using datasets from WMT conferences, provided by sacrebleu.
@@ -25,6 +30,20 @@ def create_tasks_from_benchmarks(benchmark_dict):
for language_pair in language_pairs
}
########################################
# Language Specifics
########################################

def zh_split(zh_text: List[str]) -> List[str]:
    """Split Chinese text into space-separated words using jieba."""
    return [" ".join(jieba.cut(txt.strip())) for txt in zh_text]


def ja_split(ja_text: List[str]) -> List[str]:
    """Split Japanese text into space-separated words using nagisa."""
    return [" ".join(nagisa.tagging(txt.strip()).words) for txt in ja_text]


NO_SPACE_LANG = {"zh": zh_split, "ja": ja_split}
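For intuition, a minimal sketch of what the two helpers produce. The sample sentences and the exact segmentations shown are illustrative only; actual output depends on jieba's dictionary and nagisa's model:

# Illustrative only -- not part of the diff.
print(zh_split(["我喜欢机器翻译"]))  # e.g. ['我 喜欢 机器 翻译']
print(ja_split(["私は学生です"]))    # e.g. ['私 は 学生 です']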
########################################
# Tasks
########################################
@@ -102,6 +121,12 @@ class GeneralTranslationTask(Task):
        return rf.greedy_until(ctx, ["\n"])

    def process_results(self, doc, results):
        # Add spaces between words for BLEU calculation when the target
        # language (e.g. Chinese) does not use them
        tar_lang_code = self.sacrebleu_language_pair.split("-")[-1]
        if tar_lang_code in NO_SPACE_LANG:
            doc["ref"] = NO_SPACE_LANG[tar_lang_code]([doc["ref"]])[0]
            results = NO_SPACE_LANG[tar_lang_code](results)
        # These metrics are corpus-level, not sentence-level, so we'll hide the
        # results in this dict and compute the corpus score in the aggregate method
        ref_pred = (doc["ref"], results)
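Why pre-segmentation matters here (a sketch, assuming sacrebleu's default whitespace-based "13a" tokenization): without it, an unsegmented Chinese sentence counts as a single "word", so n-gram overlap with the reference is all-or-nothing.

# Illustrative comparison -- hypothetical sentence, not from the tasks.
hyp = "我喜欢机器翻译"
print(hyp.split())                 # ['我喜欢机器翻译'] -- one token
print(zh_split([hyp])[0].split())  # e.g. ['我', '喜欢', '机器', '翻译']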
@@ -39,6 +39,8 @@ setuptools.setup(
"zstandard==0.15.2",
"jsonlines==2.0.0",
"mock==4.0.3",
"openai==0.6.4"
"openai==0.6.4",
"jieba==0.42.1",
"nagisa==0.2.7"
]
)
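A hypothetical smoke test for the two new pinned dependencies; both do nontrivial work beyond import (jieba builds its dictionary on first use, nagisa loads its bundled model), so calling them once is a more meaningful check than a bare import:

# Hypothetical smoke test after installing -- not part of the diff.
import jieba
import nagisa

assert list(jieba.cut("机器翻译")), "jieba produced no tokens"
assert nagisa.tagging("日本語").words, "nagisa produced no tokens"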