"magic_pdf/git@developer.sourcefind.cn:wangsen/mineru.git" did not exist on "9a3fe26ee4c54ea920de6b392c204082f16c58a9"
Unverified Commit 814b9550 authored by Rodolfo Quispe's avatar Rodolfo Quispe Committed by GitHub
Browse files

Fix doc for language code (#8848)

parent 4a9e502a
......@@ -153,7 +153,7 @@ class MBartTokenizer(XLMRobertaTokenizer):
adding special tokens. An MBART sequence has the following format, where ``X`` represents the sequence:
- ``input_ids`` (for encoder) ``X [eos, src_lang_code]``
- ``decoder_input_ids``: (for decoder) ``[tgt_lang_code] X [eos]``
- ``decoder_input_ids``: (for decoder) ``X [eos, tgt_lang_code]``
BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
separator.
......@@ -220,13 +220,13 @@ class MBartTokenizer(XLMRobertaTokenizer):
return model_inputs
def set_src_lang_special_tokens(self, src_lang) -> None:
"""Reset the special tokens to the source lang setting. No prefix and suffix=[eos, cur_lang_code]."""
"""Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""
self.cur_lang_code = self.lang_code_to_id[src_lang]
self.prefix_tokens = []
self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
def set_tgt_lang_special_tokens(self, lang: str) -> None:
"""Reset the special tokens to the target language setting. Prefix [tgt_lang_code], suffix =[eos]."""
"""Reset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code]."""
self.cur_lang_code = self.lang_code_to_id[lang]
self.prefix_tokens = []
self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
......@@ -152,7 +152,7 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast):
An MBART sequence has the following format, where ``X`` represents the sequence:
- ``input_ids`` (for encoder) ``X [eos, src_lang_code]``
- ``decoder_input_ids``: (for decoder) ``[tgt_lang_code] X [eos]``
- ``decoder_input_ids``: (for decoder) ``X [eos, tgt_lang_code]``
BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
separator.
......@@ -218,7 +218,7 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast):
return model_inputs
def set_src_lang_special_tokens(self, src_lang) -> None:
"""Reset the special tokens to the source lang setting. No prefix and suffix=[eos, cur_lang_code]."""
"""Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""
self.cur_lang_code = self.convert_tokens_to_ids(src_lang)
self.prefix_tokens = []
self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
......@@ -233,7 +233,7 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast):
)
def set_tgt_lang_special_tokens(self, lang: str) -> None:
"""Reset the special tokens to the target language setting. Prefix [tgt_lang_code], suffix =[eos]."""
"""Reset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code]."""
self.cur_lang_code = self.convert_tokens_to_ids(lang)
self.prefix_tokens = []
self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment