Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
ca905ba2
"...composable_kernel_rocm.git" did not exist on "1d074e34dd11dffa39cf1f1e07016acc5a389c3a"
Unverified
Commit
ca905ba2
authored
Feb 08, 2023
by
Guillaume Klein
Committed by
GitHub
Feb 08, 2023
Browse files
Exclude the madeup words from M2M100Tokenizer.vocab_size (#20976)
parent
cc1d0685
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
8 additions
and
7 deletions
+8
-7
src/transformers/models/m2m_100/tokenization_m2m_100.py
src/transformers/models/m2m_100/tokenization_m2m_100.py
+1
-1
tests/models/m2m_100/test_tokenization_m2m_100.py
tests/models/m2m_100/test_tokenization_m2m_100.py
+7
-6
No files found.
src/transformers/models/m2m_100/tokenization_m2m_100.py
View file @
ca905ba2
...
@@ -193,7 +193,7 @@ class M2M100Tokenizer(PreTrainedTokenizer):
...
@@ -193,7 +193,7 @@ class M2M100Tokenizer(PreTrainedTokenizer):
@
property
@
property
def
vocab_size
(
self
)
->
int
:
def
vocab_size
(
self
)
->
int
:
return
len
(
self
.
encoder
)
+
len
(
self
.
lang_token_to_id
)
+
self
.
num_madeup_words
return
len
(
self
.
encoder
)
+
len
(
self
.
lang_token_to_id
)
@
property
@
property
def
src_lang
(
self
)
->
str
:
def
src_lang
(
self
)
->
str
:
...
...
tests/models/m2m_100/test_tokenization_m2m_100.py
View file @
ca905ba2
...
@@ -84,15 +84,13 @@ class M2M100TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
...
@@ -84,15 +84,13 @@ class M2M100TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self
.
assertEqual
(
self
.
get_tokenizer
().
_convert_id_to_token
(
token_id
),
token
)
self
.
assertEqual
(
self
.
get_tokenizer
().
_convert_id_to_token
(
token_id
),
token
)
def
test_get_vocab
(
self
):
def
test_get_vocab
(
self
):
vocab_keys
=
list
(
self
.
get_tokenizer
().
get_vocab
().
keys
())
tokenizer
=
self
.
get_tokenizer
()
vocab_keys
=
list
(
tokenizer
.
get_vocab
().
keys
())
self
.
assertEqual
(
vocab_keys
[
0
],
"</s>"
)
self
.
assertEqual
(
vocab_keys
[
0
],
"</s>"
)
self
.
assertEqual
(
vocab_keys
[
1
],
"<unk>"
)
self
.
assertEqual
(
vocab_keys
[
1
],
"<unk>"
)
self
.
assertEqual
(
vocab_keys
[
-
1
],
"<s>"
)
self
.
assertEqual
(
vocab_keys
[
-
1
],
"<s>"
)
self
.
assertEqual
(
len
(
vocab_keys
),
110
)
self
.
assertEqual
(
len
(
vocab_keys
),
tokenizer
.
vocab_size
+
len
(
tokenizer
.
get_added_vocab
()))
def
test_vocab_size
(
self
):
self
.
assertEqual
(
self
.
get_tokenizer
().
vocab_size
,
117
)
@
unittest
.
skip
(
"Skip this test while all models are still to be uploaded."
)
@
unittest
.
skip
(
"Skip this test while all models are still to be uploaded."
)
def
test_pretrained_model_lists
(
self
):
def
test_pretrained_model_lists
(
self
):
...
@@ -161,7 +159,10 @@ class M2M100TokenizerIntegrationTest(unittest.TestCase):
...
@@ -161,7 +159,10 @@ class M2M100TokenizerIntegrationTest(unittest.TestCase):
self
.
assertEqual
(
self
.
tokenizer
.
get_lang_id
(
"mr"
),
128063
)
self
.
assertEqual
(
self
.
tokenizer
.
get_lang_id
(
"mr"
),
128063
)
def
test_get_vocab
(
self
):
def
test_get_vocab
(
self
):
self
.
assertIn
(
self
.
tokenizer
.
get_lang_token
(
"en"
),
self
.
tokenizer
.
get_vocab
())
vocab
=
self
.
tokenizer
.
get_vocab
()
self
.
assertEqual
(
len
(
vocab
),
self
.
tokenizer
.
vocab_size
)
self
.
assertEqual
(
vocab
[
"<unk>"
],
3
)
self
.
assertIn
(
self
.
tokenizer
.
get_lang_token
(
"en"
),
vocab
)
def
test_tokenizer_batch_encode_plus
(
self
):
def
test_tokenizer_batch_encode_plus
(
self
):
self
.
tokenizer
.
src_lang
=
"en"
self
.
tokenizer
.
src_lang
=
"en"
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment