Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
814b9550
Unverified
Commit
814b9550
authored
Dec 01, 2020
by
Rodolfo Quispe
Committed by
GitHub
Dec 01, 2020
Browse files
Fix doc for language code (#8848)
parent
4a9e502a
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
6 additions
and
6 deletions
+6
-6
src/transformers/models/mbart/tokenization_mbart.py
src/transformers/models/mbart/tokenization_mbart.py
+3
-3
src/transformers/models/mbart/tokenization_mbart_fast.py
src/transformers/models/mbart/tokenization_mbart_fast.py
+3
-3
No files found.
src/transformers/models/mbart/tokenization_mbart.py
View file @
814b9550
...
@@ -153,7 +153,7 @@ class MBartTokenizer(XLMRobertaTokenizer):
...
@@ -153,7 +153,7 @@ class MBartTokenizer(XLMRobertaTokenizer):
adding special tokens. An MBART sequence has the following format, where ``X`` represents the sequence:
adding special tokens. An MBART sequence has the following format, where ``X`` represents the sequence:
- ``input_ids`` (for encoder) ``X [eos, src_lang_code]``
- ``input_ids`` (for encoder) ``X [eos, src_lang_code]``
- ``decoder_input_ids``: (for decoder) ``
[
tgt_lang_code]
X [eos]
``
- ``decoder_input_ids``: (for decoder) ``
X [eos,
tgt_lang_code]``
BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
separator.
separator.
...
@@ -220,13 +220,13 @@ class MBartTokenizer(XLMRobertaTokenizer):
...
@@ -220,13 +220,13 @@ class MBartTokenizer(XLMRobertaTokenizer):
return
model_inputs
return
model_inputs
def
set_src_lang_special_tokens
(
self
,
src_lang
)
->
None
:
def
set_src_lang_special_tokens
(
self
,
src_lang
)
->
None
:
"""Reset the special tokens to the source lang setting. No prefix and suffix=[eos,
cur
_lang_code]."""
"""Reset the special tokens to the source lang setting. No prefix and suffix=[eos,
src
_lang_code]."""
self
.
cur_lang_code
=
self
.
lang_code_to_id
[
src_lang
]
self
.
cur_lang_code
=
self
.
lang_code_to_id
[
src_lang
]
self
.
prefix_tokens
=
[]
self
.
prefix_tokens
=
[]
self
.
suffix_tokens
=
[
self
.
eos_token_id
,
self
.
cur_lang_code
]
self
.
suffix_tokens
=
[
self
.
eos_token_id
,
self
.
cur_lang_code
]
def
set_tgt_lang_special_tokens
(
self
,
lang
:
str
)
->
None
:
def
set_tgt_lang_special_tokens
(
self
,
lang
:
str
)
->
None
:
"""Reset the special tokens to the target language setting.
P
refix
[tgt_lang_code],
suffix
=[eos]."""
"""Reset the special tokens to the target language setting.
No p
refix
and
suffix=[eos
, tgt_lang_code
]."""
self
.
cur_lang_code
=
self
.
lang_code_to_id
[
lang
]
self
.
cur_lang_code
=
self
.
lang_code_to_id
[
lang
]
self
.
prefix_tokens
=
[]
self
.
prefix_tokens
=
[]
self
.
suffix_tokens
=
[
self
.
eos_token_id
,
self
.
cur_lang_code
]
self
.
suffix_tokens
=
[
self
.
eos_token_id
,
self
.
cur_lang_code
]
src/transformers/models/mbart/tokenization_mbart_fast.py
View file @
814b9550
...
@@ -152,7 +152,7 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast):
...
@@ -152,7 +152,7 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast):
An MBART sequence has the following format, where ``X`` represents the sequence:
An MBART sequence has the following format, where ``X`` represents the sequence:
- ``input_ids`` (for encoder) ``X [eos, src_lang_code]``
- ``input_ids`` (for encoder) ``X [eos, src_lang_code]``
- ``decoder_input_ids``: (for decoder) ``
[
tgt_lang_code]
X [eos]
``
- ``decoder_input_ids``: (for decoder) ``
X [eos,
tgt_lang_code]``
BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
separator.
separator.
...
@@ -218,7 +218,7 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast):
...
@@ -218,7 +218,7 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast):
return
model_inputs
return
model_inputs
def
set_src_lang_special_tokens
(
self
,
src_lang
)
->
None
:
def
set_src_lang_special_tokens
(
self
,
src_lang
)
->
None
:
"""Reset the special tokens to the source lang setting. No prefix and suffix=[eos,
cur
_lang_code]."""
"""Reset the special tokens to the source lang setting. No prefix and suffix=[eos,
src
_lang_code]."""
self
.
cur_lang_code
=
self
.
convert_tokens_to_ids
(
src_lang
)
self
.
cur_lang_code
=
self
.
convert_tokens_to_ids
(
src_lang
)
self
.
prefix_tokens
=
[]
self
.
prefix_tokens
=
[]
self
.
suffix_tokens
=
[
self
.
eos_token_id
,
self
.
cur_lang_code
]
self
.
suffix_tokens
=
[
self
.
eos_token_id
,
self
.
cur_lang_code
]
...
@@ -233,7 +233,7 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast):
...
@@ -233,7 +233,7 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast):
)
)
def
set_tgt_lang_special_tokens
(
self
,
lang
:
str
)
->
None
:
def
set_tgt_lang_special_tokens
(
self
,
lang
:
str
)
->
None
:
"""Reset the special tokens to the target language setting.
P
refix
[tgt_lang_code],
suffix
=[eos]."""
"""Reset the special tokens to the target language setting.
No p
refix
and
suffix=[eos
, tgt_lang_code
]."""
self
.
cur_lang_code
=
self
.
convert_tokens_to_ids
(
lang
)
self
.
cur_lang_code
=
self
.
convert_tokens_to_ids
(
lang
)
self
.
prefix_tokens
=
[]
self
.
prefix_tokens
=
[]
self
.
suffix_tokens
=
[
self
.
eos_token_id
,
self
.
cur_lang_code
]
self
.
suffix_tokens
=
[
self
.
eos_token_id
,
self
.
cur_lang_code
]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment