Unverified Commit 814b9550 authored by Rodolfo Quispe, committed by GitHub

Fix doc for language code (#8848)

parent 4a9e502a
@@ -153,7 +153,7 @@ class MBartTokenizer(XLMRobertaTokenizer):
         adding special tokens. An MBART sequence has the following format, where ``X`` represents the sequence:

         - ``input_ids`` (for encoder) ``X [eos, src_lang_code]``
-        - ``decoder_input_ids``: (for decoder) ``[tgt_lang_code] X [eos]``
+        - ``decoder_input_ids``: (for decoder) ``X [eos, tgt_lang_code]``

         BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
         separator.

@@ -220,13 +220,13 @@ class MBartTokenizer(XLMRobertaTokenizer):
         return model_inputs

     def set_src_lang_special_tokens(self, src_lang) -> None:
-        """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, cur_lang_code]."""
+        """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""
         self.cur_lang_code = self.lang_code_to_id[src_lang]
         self.prefix_tokens = []
         self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]

     def set_tgt_lang_special_tokens(self, lang: str) -> None:
-        """Reset the special tokens to the target language setting. Prefix [tgt_lang_code], suffix =[eos]."""
+        """Reset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code]."""
         self.cur_lang_code = self.lang_code_to_id[lang]
         self.prefix_tokens = []
         self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]

@@ -152,7 +152,7 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast):
         An MBART sequence has the following format, where ``X`` represents the sequence:

         - ``input_ids`` (for encoder) ``X [eos, src_lang_code]``
-        - ``decoder_input_ids``: (for decoder) ``[tgt_lang_code] X [eos]``
+        - ``decoder_input_ids``: (for decoder) ``X [eos, tgt_lang_code]``

         BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
         separator.

@@ -218,7 +218,7 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast):
         return model_inputs

     def set_src_lang_special_tokens(self, src_lang) -> None:
-        """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, cur_lang_code]."""
+        """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""
         self.cur_lang_code = self.convert_tokens_to_ids(src_lang)
         self.prefix_tokens = []
         self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]

@@ -233,7 +233,7 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast):
         )

     def set_tgt_lang_special_tokens(self, lang: str) -> None:
-        """Reset the special tokens to the target language setting. Prefix [tgt_lang_code], suffix =[eos]."""
+        """Reset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code]."""
         self.cur_lang_code = self.convert_tokens_to_ids(lang)
         self.prefix_tokens = []
         self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
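For reference, the corrected layout can be checked directly with the methods touched in this commit. The sketch below is illustrative and not part of the change; the checkpoint name facebook/mbart-large-en-ro and the sample sentences are only examples.

# Minimal sketch (not part of this commit): print the last two special tokens that
# MBartTokenizer appends, to confirm the suffix is [eos, lang_code] on both sides.
from transformers import MBartTokenizer

tok = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro")

# Encoder side: no prefix, suffix = [eos, src_lang_code].
tok.set_src_lang_special_tokens("en_XX")
src_ids = tok.convert_tokens_to_ids(tok.tokenize("UN Chief Says There Is No Military Solution in Syria"))
print(tok.convert_ids_to_tokens(tok.build_inputs_with_special_tokens(src_ids))[-2:])
# expected: ['</s>', 'en_XX']

# Decoder side: same shape, but the suffix carries the target language code.
tok.set_tgt_lang_special_tokens("ro_RO")
tgt_ids = tok.convert_tokens_to_ids(tok.tokenize("Şeful ONU declară că nu există o soluţie militară în Siria"))
print(tok.convert_ids_to_tokens(tok.build_inputs_with_special_tokens(tgt_ids))[-2:])
# expected: ['</s>', 'ro_RO']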