Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
ab108a0e
"vscode:/vscode.git/clone" did not exist on "69715eabfec7e2b56faa8d35a91981d7857421ac"
Unverified
Commit
ab108a0e
authored
Oct 25, 2022
by
Guillaume Klein
Committed by
GitHub
Oct 25, 2022
Browse files
Add missing lang tokens in M2M100Tokenizer.get_vocab (#18416)
parent
0bd6d934
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
5 additions
and
2 deletions
+5
-2
src/transformers/models/m2m_100/tokenization_m2m_100.py
src/transformers/models/m2m_100/tokenization_m2m_100.py
+1
-1
tests/models/m2m_100/test_tokenization_m2m_100.py
tests/models/m2m_100/test_tokenization_m2m_100.py
+4
-1
No files found.
src/transformers/models/m2m_100/tokenization_m2m_100.py
View file @
ab108a0e
...
@@ -280,7 +280,7 @@ class M2M100Tokenizer(PreTrainedTokenizer):
...
@@ -280,7 +280,7 @@ class M2M100Tokenizer(PreTrainedTokenizer):
return
self
.
prefix_tokens
+
token_ids_0
+
token_ids_1
+
self
.
suffix_tokens
return
self
.
prefix_tokens
+
token_ids_0
+
token_ids_1
+
self
.
suffix_tokens
def
get_vocab
(
self
)
->
Dict
:
def
get_vocab
(
self
)
->
Dict
:
vocab
=
self
.
encoder
.
copy
()
vocab
=
{
self
.
convert_ids_to_tokens
(
i
):
i
for
i
in
range
(
self
.
vocab_size
)}
vocab
.
update
(
self
.
added_tokens_encoder
)
vocab
.
update
(
self
.
added_tokens_encoder
)
return
vocab
return
vocab
...
...
tests/models/m2m_100/test_tokenization_m2m_100.py
View file @
ab108a0e
...
@@ -89,7 +89,7 @@ class M2M100TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
...
@@ -89,7 +89,7 @@ class M2M100TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self
.
assertEqual
(
vocab_keys
[
0
],
"</s>"
)
self
.
assertEqual
(
vocab_keys
[
0
],
"</s>"
)
self
.
assertEqual
(
vocab_keys
[
1
],
"<unk>"
)
self
.
assertEqual
(
vocab_keys
[
1
],
"<unk>"
)
self
.
assertEqual
(
vocab_keys
[
-
1
],
"<s>"
)
self
.
assertEqual
(
vocab_keys
[
-
1
],
"<s>"
)
self
.
assertEqual
(
len
(
vocab_keys
),
10
)
self
.
assertEqual
(
len
(
vocab_keys
),
1
10
)
def
test_vocab_size
(
self
):
def
test_vocab_size
(
self
):
self
.
assertEqual
(
self
.
get_tokenizer
().
vocab_size
,
117
)
self
.
assertEqual
(
self
.
get_tokenizer
().
vocab_size
,
117
)
...
@@ -160,6 +160,9 @@ class M2M100TokenizerIntegrationTest(unittest.TestCase):
...
@@ -160,6 +160,9 @@ class M2M100TokenizerIntegrationTest(unittest.TestCase):
self
.
assertEqual
(
self
.
tokenizer
.
get_lang_id
(
"ro"
),
128076
)
self
.
assertEqual
(
self
.
tokenizer
.
get_lang_id
(
"ro"
),
128076
)
self
.
assertEqual
(
self
.
tokenizer
.
get_lang_id
(
"mr"
),
128063
)
self
.
assertEqual
(
self
.
tokenizer
.
get_lang_id
(
"mr"
),
128063
)
def
test_get_vocab
(
self
):
self
.
assertIn
(
self
.
tokenizer
.
get_lang_token
(
"en"
),
self
.
tokenizer
.
get_vocab
())
def
test_tokenizer_batch_encode_plus
(
self
):
def
test_tokenizer_batch_encode_plus
(
self
):
self
.
tokenizer
.
src_lang
=
"en"
self
.
tokenizer
.
src_lang
=
"en"
ids
=
self
.
tokenizer
.
batch_encode_plus
(
self
.
src_text
).
input_ids
[
0
]
ids
=
self
.
tokenizer
.
batch_encode_plus
(
self
.
src_text
).
input_ids
[
0
]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment