Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
f046d331
Commit
f046d331
authored
Aug 15, 2021
by
Muennighoff
Browse files
Add space tokenization for JA/ZH
parent
198ca732
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
28 additions
and
1 deletion
+28
-1
lm_eval/tasks/translation.py
lm_eval/tasks/translation.py
+25
-0
setup.py
setup.py
+3
-1
No files found.
lm_eval/tasks/translation.py
View file @
f046d331
...
@@ -3,6 +3,11 @@ from pprint import pprint
...
@@ -3,6 +3,11 @@ from pprint import pprint
from
sacrebleu
import
sacrebleu
from
sacrebleu
import
sacrebleu
from
lm_eval
import
metrics
from
lm_eval
import
metrics
from
lm_eval.base
import
Task
,
rf
from
lm_eval.base
import
Task
,
rf
from
typing
import
List
import
jieba
import
nagisa
"""
"""
This file implements translation tasks using datasets from WMT conferences, provided by sacrebleu.
This file implements translation tasks using datasets from WMT conferences, provided by sacrebleu.
...
@@ -25,6 +30,20 @@ def create_tasks_from_benchmarks(benchmark_dict):
...
@@ -25,6 +30,20 @@ def create_tasks_from_benchmarks(benchmark_dict):
for
language_pair
in
language_pairs
for
language_pair
in
language_pairs
}
}
########################################
# Language Specifics
########################################
def zh_split(zh_text: List[str]) -> List[str]:
    """Insert spaces between the words of each Chinese string.

    Every entry of ``zh_text`` is stripped of surrounding whitespace,
    segmented with ``jieba.cut`` and the resulting pieces are glued back
    together with a single space between them.

    Args:
        zh_text: The strings to segment.

    Returns:
        A new list holding one space-delimited string per input string,
        in the same order.
    """
    segmented = []
    for sentence in zh_text:
        pieces = jieba.cut(sentence.strip())
        segmented.append(" ".join(pieces))
    return segmented
def ja_split(ja_text: List[str]) -> List[str]:
    """Insert spaces between the words of each Japanese string.

    Every entry of ``ja_text`` is stripped of surrounding whitespace and
    passed to ``nagisa.tagging``; the ``words`` of the tagging result are
    then joined with a single space between them.

    Args:
        ja_text: The strings to segment.

    Returns:
        A new list holding one space-delimited string per input string,
        in the same order.
    """
    segmented = []
    for sentence in ja_text:
        tagged = nagisa.tagging(sentence.strip())
        segmented.append(" ".join(tagged.words))
    return segmented
# Maps a language code to the function that inserts spaces between the
# words of strings written in that language.
NO_SPACE_LANG = dict(zh=zh_split, ja=ja_split)
########################################
########################################
# Tasks
# Tasks
########################################
########################################
...
@@ -102,6 +121,12 @@ class GeneralTranslationTask(Task):
...
@@ -102,6 +121,12 @@ class GeneralTranslationTask(Task):
return
rf
.
greedy_until
(
ctx
,
[
"
\n
"
])
return
rf
.
greedy_until
(
ctx
,
[
"
\n
"
])
def
process_results
(
self
,
doc
,
results
):
def
process_results
(
self
,
doc
,
results
):
# Add spaces between words for BLEU score calculation of target languages like Chinese
tar_lang_code
=
self
.
sacrebleu_language_pair
.
split
(
"-"
)[
-
1
]
if
tar_lang_code
in
NO_SPACE_LANG
:
doc
[
"ref"
]
=
NO_SPACE_LANG
[
tar_lang_code
]([
doc
[
"ref"
]])[
0
]
results
=
NO_SPACE_LANG
[
tar_lang_code
](
results
)
# These metrics are corpus-level not sentence level, so we'll hide the
# These metrics are corpus-level not sentence level, so we'll hide the
# results in this dict and compute the corpus score in the aggregate method
# results in this dict and compute the corpus score in the aggregate method
ref_pred
=
(
doc
[
"ref"
],
results
)
ref_pred
=
(
doc
[
"ref"
],
results
)
...
...
setup.py
View file @
f046d331
...
@@ -39,6 +39,8 @@ setuptools.setup(
...
@@ -39,6 +39,8 @@ setuptools.setup(
"zstandard==0.15.2"
,
"zstandard==0.15.2"
,
"jsonlines==2.0.0"
,
"jsonlines==2.0.0"
,
"mock==4.0.3"
,
"mock==4.0.3"
,
"openai==0.6.4"
"openai==0.6.4"
,
"jieba==0.42.1"
,
"nagisa==0.2.7"
]
]
)
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment