chenpangpang / transformers · Commit 7051b892

Authored Oct 05, 2021 by Sam Hardwick; committed by GitHub on Oct 05, 2021 (unverified signature)

Update Tatoeba conversion (#13757)

* Update Tatoeba conversion

parent 12b4d66a
Showing 1 changed file with 220 additions and 164 deletions.

src/transformers/models/marian/convert_marian_tatoeba_to_pytorch.py (+220, -164)
@@ -13,31 +13,32 @@
 # limitations under the License.
 import argparse
 import datetime
 import json
 import os
 import re
 from pathlib import Path
-from typing import List, Tuple
+from typing import Tuple

 from tqdm import tqdm

 import yaml
 from transformers.models.marian.convert_marian_to_pytorch import (
     FRONT_MATTER_TEMPLATE,
-    _parse_readme,
-    convert_all_sentencepiece_models,
+    convert,
     convert_opus_name_to_hf_name,
     download_and_unzip,
     get_system_metadata,
     remove_prefix,
     remove_suffix,
 )

-try:
-    import pandas as pd
-except ImportError:
-    pass

 DEFAULT_REPO = "Tatoeba-Challenge"
 DEFAULT_MODEL_DIR = os.path.join(DEFAULT_REPO, "models")
 LANG_CODE_URL = "https://datahub.io/core/language-codes/r/language-codes-3b2.csv"
 ISO_URL = "https://cdn-datasets.huggingface.co/language_codes/iso-639-3.csv"
 ISO_PATH = "lang_code_data/iso-639-3.csv"
 LANG_CODE_PATH = "lang_code_data/language-codes-3b2.csv"
 TATOEBA_MODELS_URL = "https://object.pouta.csc.fi/Tatoeba-MT-models"


 class TatoebaConverter:
@@ -46,194 +47,236 @@ class TatoebaConverter:
     Steps:

-        1. convert numpy state dict to hf format (same code as OPUS-MT-Train conversion).
-        2. rename opus model to huggingface format. This means replace each alpha3 code with an alpha2 code if a unique
+        1. Convert numpy state dict to hf format (same code as OPUS-MT-Train conversion).
+        2. Rename opus model to huggingface format. This means replace each alpha3 code with an alpha2 code if a unique
            one exists. e.g. aav-eng -> aav-en, heb-eng -> he-en
-        3. write a model card containing the original Tatoeba-Challenge/README.md and extra info about alpha3 group
-           members.
+        3. Select the best model for a particular pair, parse the yml for it and write a model card. By default the
+           best model is the one listed first in released-model-results, but it's also possible to specify the most
+           recent one.
     """
     def __init__(self, save_dir="marian_converted"):
         assert Path(DEFAULT_REPO).exists(), "need git clone git@github.com:Helsinki-NLP/Tatoeba-Challenge.git"
-        reg = self.make_tatoeba_registry()
-        self.download_metadata()
-        self.registry = reg
-        reg_df = pd.DataFrame(reg, columns=["id", "prepro", "url_model", "url_test_set"])
-        assert reg_df.id.value_counts().max() == 1
-        reg_df = reg_df.set_index("id")
-        reg_df["src"] = reg_df.reset_index().id.apply(lambda x: x.split("-")[0]).values
-        reg_df["tgt"] = reg_df.reset_index().id.apply(lambda x: x.split("-")[1]).values
-        released_cols = [
-            "url_base",
-            "pair",  # (ISO639-3/ISO639-5 codes),
-            "short_pair",  # (reduced codes),
-            "chrF2_score",
-            "bleu",
-            "brevity_penalty",
-            "ref_len",
-            "src_name",
-            "tgt_name",
-        ]
-        released = pd.read_csv("Tatoeba-Challenge/models/released-models.txt", sep="\t", header=None).iloc[:-1]
-        released.columns = released_cols
-        released["fname"] = released["url_base"].apply(
-            lambda x: remove_suffix(remove_prefix(x, "https://object.pouta.csc.fi/Tatoeba-Challenge/opus"), ".zip")
-        )
-        released["2m"] = released.fname.str.startswith("2m")
-        released["date"] = pd.to_datetime(released["fname"].apply(lambda x: remove_prefix(remove_prefix(x, "2m-"), "-")))
-        released["base_ext"] = released.url_base.apply(lambda x: Path(x).name)
-        reg_df["base_ext"] = reg_df.url_model.apply(lambda x: Path(x).name)
-        metadata_new = reg_df.reset_index().merge(released.rename(columns={"pair": "id"}), on=["base_ext", "id"])
-        metadata_renamer = {"src": "src_alpha3", "tgt": "tgt_alpha3", "id": "long_pair", "date": "train_date"}
-        metadata_new = metadata_new.rename(columns=metadata_renamer)
-        metadata_new["src_alpha2"] = metadata_new.short_pair.apply(lambda x: x.split("-")[0])
-        metadata_new["tgt_alpha2"] = metadata_new.short_pair.apply(lambda x: x.split("-")[1])
-        DROP_COLS_BOTH = ["url_base", "base_ext", "fname"]
-        metadata_new = metadata_new.drop(DROP_COLS_BOTH, 1)
-        metadata_new["prefer_old"] = metadata_new.long_pair.isin([])
-        self.metadata = metadata_new
-        assert self.metadata.short_pair.value_counts().max() == 1, "Multiple metadata entries for a short pair"
-        self.metadata = self.metadata.set_index("short_pair")
-        # wget.download(LANG_CODE_URL)
-        mapper = pd.read_csv(LANG_CODE_PATH)
-        mapper.columns = ["a3", "a2", "ref"]
-        self.iso_table = pd.read_csv(ISO_PATH, sep="\t").rename(columns=lambda x: x.lower())
-        more_3_to_2 = self.iso_table.set_index("id").part1.dropna().to_dict()
-        more_3_to_2.update(mapper.set_index("a3").a2.to_dict())
-        self.alpha3_to_alpha2 = more_3_to_2
+        self.download_lang_info()
+        self.model_results = json.load(open("Tatoeba-Challenge/models/released-model-results.json"))
+        self.alpha3_to_alpha2 = {}
+        for line in open(ISO_PATH):
+            parts = line.split("\t")
+            if len(parts[0]) == 3 and len(parts[3]) == 2:
+                self.alpha3_to_alpha2[parts[0]] = parts[3]
+        for line in open(LANG_CODE_PATH):
+            parts = line.split(",")
+            if len(parts[0]) == 3 and len(parts[1]) == 2:
+                self.alpha3_to_alpha2[parts[0]] = parts[1]
         self.model_card_dir = Path(save_dir)
-        self.constituents = GROUP_MEMBERS
+        self.tag2name = {}
+        for key, value in GROUP_MEMBERS.items():
+            self.tag2name[key] = value[0]
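A sketch of the input lines the two loops above expect; the column layout is inferred from the parsing code (tab-split column 3 for iso-639-3, comma-split column 1 for the 3b2 table), and the sample rows are illustrative:

    # iso-639-3.csv line (tab-separated; parts[0] = alpha3 ID, parts[3] = alpha2 "Part1" code):
    #   "heb\theb\theb\the\tI\tL\tHebrew\t"   -> alpha3_to_alpha2["heb"] = "he"
    # language-codes-3b2.csv line (comma-separated; parts[0] = alpha3, parts[1] = alpha2):
    #   "heb,he,Hebrew"                       -> alpha3_to_alpha2["heb"] = "he"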
     def convert_models(self, tatoeba_ids, dry_run=False):
-        entries_to_convert = [x for x in self.registry if x[0] in tatoeba_ids]
-        converted_paths = convert_all_sentencepiece_models(entries_to_convert, dest_dir=self.model_card_dir)
-        for path in converted_paths:
-            long_pair = remove_prefix(path.name, "opus-mt-").split("-")  # eg. heb-eng
-            assert len(long_pair) == 2
-            new_p_src = self.get_two_letter_code(long_pair[0])
-            new_p_tgt = self.get_two_letter_code(long_pair[1])
-            hf_model_id = f"opus-mt-{new_p_src}-{new_p_tgt}"
-            new_path = path.parent.joinpath(hf_model_id)  # opus-mt-he-en
-            os.rename(str(path), str(new_path))
-            self.write_model_card(hf_model_id, dry_run=dry_run)
-
-    def get_two_letter_code(self, three_letter_code):
-        return self.alpha3_to_alpha2.get(three_letter_code, three_letter_code)
+        models_to_convert = [self.parse_metadata(x) for x in tatoeba_ids]
+        save_dir = Path("marian_ckpt")
+        dest_dir = Path(self.model_card_dir)
+        dest_dir.mkdir(exist_ok=True)
+        for model in tqdm(models_to_convert):  # k, prepro, download, test_set_url in tqdm(model_list):
+            if "SentencePiece" not in model["pre-processing"]:
+                print(f"Skipping {model['release']} because it doesn't appear to use SentencePiece")
+                continue
+            if not os.path.exists(save_dir / model["_name"]):
+                download_and_unzip(f"{TATOEBA_MODELS_URL}/{model['release']}", save_dir / model["_name"])
+            # from convert_marian_to_pytorch
+            opus_language_groups_to_hf = convert_opus_name_to_hf_name
+            pair_name = opus_language_groups_to_hf(model["_name"])
+            convert(save_dir / model["_name"], dest_dir / f"opus-mt-{pair_name}")
+            self.write_model_card(model, dry_run=dry_run)
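A sketch of the model dict shape the loop above consumes; plain keys come from the model's yml file, and the underscore-prefixed keys are added by parse_metadata and model_type_info_from_model_name (all values below are illustrative):

    model = {
        "release": "heb-eng/opus-2021-02-19.zip",  # illustrative yml value
        "pre-processing": "normalization + SentencePiece (spm32k,spm32k)",
        "_name": "heb-eng",                        # added by parse_metadata
        "_has_backtranslated_data": False,         # added by model_type_info_from_model_name
    }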
     def expand_group_to_two_letter_codes(self, grp_name):
-        return [self.get_two_letter_code(x) for x in self.constituents[grp_name]]
+        return [self.alpha3_to_alpha2.get(x, x) for x in GROUP_MEMBERS[grp_name][1]]

+    def is_group(self, code, name):
+        return "languages" in name or len(GROUP_MEMBERS.get(code, [])) > 1
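Both helpers index into GROUP_MEMBERS, the large table collapsed near the end of this diff; its assumed shape is tag -> (display name, set of constituent alpha3 codes), along these lines:

    # Abbreviated, illustrative entry: value[0] feeds tag2name, value[1] the constituent expansion.
    GROUP_MEMBERS = {
        "gem": ("Germanic languages", {"eng", "deu", "nld", "swe"}),  # membership truncated
    }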
-    def get_tags(self, code, ref_name):
+    def get_tags(self, code, name):
         if len(code) == 2:
-            assert "languages" not in ref_name, f"{code}: {ref_name}"
-            return [code], False
-        elif "languages" in ref_name or len(self.constituents.get(code, [])) > 1:
+            assert "languages" not in name, f"{code}: {name}"
+            return [code]
+        elif self.is_group(code, name):
             group = self.expand_group_to_two_letter_codes(code)
             group.append(code)
-            return group, True
+            return group
         else:
             # zho-> zh
             print(f"Three letter monolingual code: {code}")
-            return [code], False
+            return [code]
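Illustrative behaviour of the rewritten get_tags, assuming "he" is a plain alpha2 code and "gem" a group code present in GROUP_MEMBERS:

    # given: converter = TatoebaConverter(...)
    converter.get_tags("he", "Hebrew")               # -> ["he"]
    converter.get_tags("gem", "Germanic languages")  # -> two-letter member codes plus "gem" itself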
-    def resolve_lang_code(self, r) -> Tuple[List[str], str, str]:
-        """R is a row in ported"""
-        short_pair = r.short_pair
-        src, tgt = short_pair.split("-")
-        src_tags, src_multilingual = self.get_tags(src, r.src_name)
-        assert isinstance(src_tags, list)
-        tgt_tags, tgt_multilingual = self.get_tags(tgt, r.tgt_name)
-        assert isinstance(tgt_tags, list)
-        return dedup(src_tags + tgt_tags), src_multilingual, tgt_multilingual
+    def resolve_lang_code(self, src, tgt) -> Tuple[str, str]:
+        src_tags = self.get_tags(src, self.tag2name[src])
+        tgt_tags = self.get_tags(tgt, self.tag2name[tgt])
+        return src_tags, tgt_tags
+    @staticmethod
+    def model_type_info_from_model_name(name):
+        info = {"_has_backtranslated_data": False}
+        if "1m" in name:
+            info["_data_per_pair"] = str(1e6)
+        if "2m" in name:
+            info["_data_per_pair"] = str(2e6)
+        if "4m" in name:
+            info["_data_per_pair"] = str(4e6)
+        if "+bt" in name:
+            info["_has_backtranslated_data"] = True
+        if "tuned4" in name:
+            info["_tuned"] = re.search(r"tuned4[^-]+", name).group()
+        return info
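Since this new staticmethod keys purely on substrings of the release name, its behaviour is easy to check; the names below are hypothetical:

    TatoebaConverter.model_type_info_from_model_name("opus+bt-2021-04-10")
    # -> {"_has_backtranslated_data": True}
    TatoebaConverter.model_type_info_from_model_name("opus-2m-2021-04-10")
    # -> {"_has_backtranslated_data": False, "_data_per_pair": "2000000.0"}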
-    def write_model_card(
-        self,
-        hf_model_id: str,
-        repo_root=DEFAULT_REPO,
-        dry_run=False,
-    ) -> str:
+    def write_model_card(self, model_dict, dry_run=False) -> str:
         """
-        Copy the most recent model's readme section from opus, and add metadata. upload command: aws s3 sync
-        model_card_dir s3://models.huggingface.co/bert/Helsinki-NLP/ --dryrun
+        Construct card from data parsed from YAML and the model's name. upload command: aws s3 sync model_card_dir
+        s3://models.huggingface.co/bert/Helsinki-NLP/ --dryrun
         """
-        short_pair = remove_prefix(hf_model_id, "opus-mt-")
-        extra_metadata = self.metadata.loc[short_pair].drop("2m")
-        extra_metadata["short_pair"] = short_pair
-        lang_tags, src_multilingual, tgt_multilingual = self.resolve_lang_code(extra_metadata)
-        opus_name = f"{extra_metadata.src_alpha3}-{extra_metadata.tgt_alpha3}"
-        # opus_name: str = self.convert_hf_name_to_opus_name(hf_model_name)
-        assert repo_root in ("OPUS-MT-train", "Tatoeba-Challenge")
-        opus_readme_path = Path(repo_root).joinpath("models", opus_name, "README.md")
-        assert opus_readme_path.exists(), f"Readme file {opus_readme_path} not found"
+        model_dir_url = f"{TATOEBA_MODELS_URL}/{model_dict['release']}"
+        long_pair = model_dict["_name"].split("-")
+        assert len(long_pair) == 2, f"got a translation pair {model_dict['_name']} that doesn't appear to be a pair"
+        short_src = self.alpha3_to_alpha2.get(long_pair[0], long_pair[0])
+        short_tgt = self.alpha3_to_alpha2.get(long_pair[1], long_pair[1])
+        model_dict["_hf_model_id"] = f"opus-mt-{short_src}-{short_tgt}"

-        opus_src, opus_tgt = [x.split("+") for x in opus_name.split("-")]
+        a3_src, a3_tgt = model_dict["_name"].split("-")
+        # opus_src_tags, opus_tgt_tags = a3_src.split("+"), a3_tgt.split("+")

-        readme_url = f"https://github.com/Helsinki-NLP/{repo_root}/tree/master/models/{opus_name}/README.md"
+        # This messy part tries to deal with language tags in multilingual models, possibly
+        # not all having three-letter codes
+        resolved_src_tags, resolved_tgt_tags = self.resolve_lang_code(a3_src, a3_tgt)
+        a2_src_tags, a2_tgt_tags = [], []
+        for tag in resolved_src_tags:
+            if tag not in self.alpha3_to_alpha2:
+                a2_src_tags.append(tag)
+        for tag in resolved_tgt_tags:
+            if tag not in self.alpha3_to_alpha2:
+                a2_tgt_tags.append(tag)

-        s, t = ",".join(opus_src), ",".join(opus_tgt)
+        lang_tags = dedup(a2_src_tags + a2_tgt_tags)
+        src_multilingual, tgt_multilingual = (len(a2_src_tags) > 1), (len(a2_tgt_tags) > 1)
+        s, t = ",".join(a2_src_tags), ",".join(a2_tgt_tags)

         metadata = {
-            "hf_name": short_pair,
+            "hf_name": model_dict["_name"],
             "source_languages": s,
             "target_languages": t,
-            "opus_readme_url": readme_url,
-            "original_repo": repo_root,
+            "opus_readme_url": f"{model_dir_url}/README.md",
+            "original_repo": "Tatoeba-Challenge",
             "tags": ["translation"],
             "languages": lang_tags,
         }
         lang_tags = l2front_matter(lang_tags)
-        metadata["src_constituents"] = self.constituents[s]
-        metadata["tgt_constituents"] = self.constituents[t]
+        metadata["src_constituents"] = list(GROUP_MEMBERS[a3_src][1])
+        metadata["tgt_constituents"] = list(GROUP_MEMBERS[a3_tgt][1])
         metadata["src_multilingual"] = src_multilingual
         metadata["tgt_multilingual"] = tgt_multilingual

-        metadata.update(extra_metadata)
-        metadata.update(get_system_metadata(repo_root))
-
-        # combine with Tatoeba markdown
+        backtranslated_data = ""
+        if model_dict["_has_backtranslated_data"]:
+            backtranslated_data = " with backtranslations"
+
+        multilingual_data = ""
+        if "_data_per_pair" in model_dict:
+            multilingual_data = f"* data per pair in multilingual model: {model_dict['_data_per_pair']}\n"
+
+        tuned = ""
+        if "_tuned" in model_dict:
+            tuned = f"* multilingual model tuned for: {model_dict['_tuned']}\n"
+
+        model_base_filename = model_dict["release"].split("/")[-1]
+        download = f"* download original weights: [{model_base_filename}]({model_dir_url}/{model_dict['release']})\n"
+
+        langtoken = ""
+        if tgt_multilingual:
+            langtoken = (
+                "* a sentence-initial language token is required in the form of >>id<<"
+                "(id = valid, usually three-letter target language ID)\n"
+            )
+
+        metadata.update(get_system_metadata(DEFAULT_REPO))
+
+        scorestable = ""
+        for k, v in model_dict.items():
+            if "scores" in k:
+                this_score_table = f"* {k}\n|Test set|score|\n|---|---|\n"
+                pairs = sorted(v.items(), key=lambda x: x[1], reverse=True)
+                for pair in pairs:
+                    this_score_table += f"|{pair[0]}|{pair[1]}|\n"
+                scorestable += this_score_table
+
+        datainfo = ""
+        if "training-data" in model_dict:
+            datainfo += "* Training data: \n"
+            for k, v in model_dict["training-data"].items():
+                datainfo += f"  * {str(k)}: {str(v)}\n"
+        if "validation-data" in model_dict:
+            datainfo += "* Validation data: \n"
+            for k, v in model_dict["validation-data"].items():
+                datainfo += f"  * {str(k)}: {str(v)}\n"
+        if "test-data" in model_dict:
+            datainfo += "* Test data: \n"
+            for k, v in model_dict["test-data"].items():
+                datainfo += f"  * {str(k)}: {str(v)}\n"

-        extra_markdown = f"### {short_pair}\n\n* source group: {metadata['src_name']}\n* target group: {metadata['tgt_name']}\n* OPUS readme: [{opus_name}]({readme_url})\n"
+        testsetfilename = model_dict["release"].replace(".zip", ".test.txt")
+        testscoresfilename = model_dict["release"].replace(".zip", ".eval.txt")
+        testset = f"* test set translations file: [test.txt]({model_dir_url}/{testsetfilename})\n"
+        testscores = f"* test set scores file: [eval.txt]({model_dir_url}/{testscoresfilename})\n"

-        content = opus_readme_path.open().read()
-        content = content.split("\n# ")[-1]  # Get the lowest level 1 header in the README -- the most recent model.
-        splat = content.split("*")[2:]
-
-        content = "*".join(splat)
-        # BETTER FRONT MATTER LOGIC
-        content = (
-            FRONT_MATTER_TEMPLATE.format(lang_tags)
-            + extra_markdown
-            + "\n* "
-            + content.replace("download", "download original " "weights")
-        )
-
-        items = "\n\n".join([f"- {k}: {v}" for k, v in metadata.items()])
+        # combine with Tatoeba markdown
+        readme_url = f"{TATOEBA_MODELS_URL}/{model_dict['_name']}/README.md"
+        extra_markdown = f"""
+### {model_dict['_name']}
+
+* source language name: {self.tag2name[a3_src]}
+* target language name: {self.tag2name[a3_tgt]}
+* OPUS readme: [README.md]({readme_url})
+"""
+
+        content = (
+            f"""
+* model: {model_dict['modeltype']}
+* source language code{src_multilingual * 's'}: {', '.join(a2_src_tags)}
+* target language code{tgt_multilingual * 's'}: {', '.join(a2_tgt_tags)}
+* dataset: opus {backtranslated_data}
+* release date: {model_dict['release-date']}
+* pre-processing: {model_dict['pre-processing']}
+"""
+            + multilingual_data
+            + tuned
+            + download
+            + langtoken
+            + datainfo
+            + testset
+            + testscores
+            + scorestable
+        )
+
+        content = FRONT_MATTER_TEMPLATE.format(lang_tags) + extra_markdown + content
+        items = "\n".join([f"* {k}: {v}" for k, v in metadata.items()])
         sec3 = "\n### System Info: \n" + items
         content += sec3
         if dry_run:
-            return content, metadata
-        sub_dir = self.model_card_dir / hf_model_id
+            print("CONTENT:")
+            print(content)
+            print("METADATA:")
+            print(metadata)
+            return
+        sub_dir = self.model_card_dir / model_dict["_hf_model_id"]
         sub_dir.mkdir(exist_ok=True)
         dest = sub_dir / "README.md"
         dest.open("w").write(content)
-        pd.Series(metadata).to_json(sub_dir / "metadata.json")
-        return content, metadata
+        for k, v in metadata.items():
+            if isinstance(v, datetime.date):
+                metadata[k] = datetime.datetime.strftime(v, "%Y-%m-%d")
+        with open(sub_dir / "metadata.json", "w", encoding="utf-8") as writeobj:
+            json.dump(metadata, writeobj)
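A dry-run sketch of the new card-writing path ("heb-eng" again illustrative, assuming heb -> he and eng -> en in the alpha3-to-alpha2 table):

    converter = TatoebaConverter(save_dir="marian_converted")
    model_dict = converter.parse_metadata("heb-eng", method="best")
    converter.write_model_card(model_dict, dry_run=True)  # prints CONTENT: and METADATA: instead of writing
    # without dry_run, the card lands in marian_converted/opus-mt-he-en/README.md plus metadata.json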
-    def download_metadata(self):
+    def download_lang_info(self):
         Path(LANG_CODE_PATH).parent.mkdir(exist_ok=True)
         import wget
@@ -242,20 +285,35 @@ class TatoebaConverter:
         if not os.path.exists(LANG_CODE_PATH):
             wget.download(LANG_CODE_URL, LANG_CODE_PATH)

-    @staticmethod
-    def make_tatoeba_registry(repo_path=DEFAULT_MODEL_DIR):
-        if not (Path(repo_path) / "zho-eng" / "README.md").exists():
-            raise ValueError(
-                f"repo_path:{repo_path} does not exist: "
-                "You must run: git clone git@github.com:Helsinki-NLP/Tatoeba-Challenge.git before calling."
-            )
-        results = {}
-        for p in Path(repo_path).iterdir():
-            if len(p.name) != 7:
-                continue
-            lns = list(open(p / "README.md").readlines())
-            results[p.name] = _parse_readme(lns)
-        return [(k, v["pre-processing"], v["download"], v["download"][:-4] + ".test.txt") for k, v in results.items()]
+    def parse_metadata(self, model_name, repo_path=DEFAULT_MODEL_DIR, method="best"):
+        p = Path(repo_path) / model_name
+
+        def url_to_name(url):
+            return url.split("/")[-1].split(".")[0]
+
+        if model_name not in self.model_results:
+            # This is not a language pair, so model results are ambiguous, go by newest
+            method = "newest"
+
+        if method == "best":
+            # Sort by how early they appear in released-models-results
+            results = [url_to_name(model["download"]) for model in self.model_results[model_name]]
+            ymls = [f for f in os.listdir(p) if f.endswith(".yml") and f[:-4] in results]
+            ymls.sort(key=lambda x: results.index(x[:-4]))
+            metadata = yaml.safe_load(open(p / ymls[0]))
+            metadata.update(self.model_type_info_from_model_name(ymls[0][:-4]))
+        elif method == "newest":
+            ymls = [f for f in os.listdir(p) if f.endswith(".yml")]
+            # Sort by date
+            ymls.sort(
+                key=lambda x: datetime.datetime.strptime(re.search(r"\d\d\d\d-\d\d?-\d\d?", x).group(), "%Y-%m-%d")
+            )
+            metadata = yaml.safe_load(open(p / ymls[-1]))
+            metadata.update(self.model_type_info_from_model_name(ymls[-1][:-4]))
+        else:
+            raise NotImplementedError(f"Don't know argument method='{method}' to parse_metadata()")
+        metadata["_name"] = model_name
+        return metadata
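The "newest" branch relies only on the date embedded in the yml file name; a self-contained check with made-up file names:

    import datetime
    import re

    ymls = ["opus-2021-02-19.yml", "opus-2020-07-26.yml"]  # hypothetical file names
    ymls.sort(key=lambda x: datetime.datetime.strptime(re.search(r"\d\d\d\d-\d\d?-\d\d?", x).group(), "%Y-%m-%d"))
    assert ymls[-1] == "opus-2021-02-19.yml"  # parse_metadata reads ymls[-1] under method="newest"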
GROUP_MEMBERS = {
...
@@ -1248,9 +1306,7 @@ def dedup(lst):
     """Preservers order"""
     new_lst = []
     for item in lst:
-        if not item:
-            continue
-        elif item in new_lst:
+        if not item or item in new_lst:
             continue
         else:
             new_lst.append(item)
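The simplified dedup behaves the same as before; for example:

    assert dedup(["he", "", "en", "he"]) == ["he", "en"]  # falsy items and repeats dropped, order preserved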