"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "700d48fb2deb0c24863b592513898f0f477822eb"
Unverified commit db94b746, authored by Yih-Dar and committed by GitHub

Fix `FlaubertTokenizer` (#19552)



* fix flaubert tokenizer

* update

* update

* Final cleanup
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent 62f28bc1
@@ -213,8 +213,6 @@ class FlaubertTokenizer(PreTrainedTokenizer):
             Dictionary mapping languages string identifiers to their IDs.
         id2lang (`Dict[int, str]`, *optional*):
             Dictionary mapping language IDs to their string identifiers.
-        do_lowercase_and_remove_accent (`bool`, *optional*, defaults to `True`):
-            Whether to lowercase and remove accents when tokenizing.
     """

     vocab_files_names = VOCAB_FILES_NAMES
@@ -247,9 +245,20 @@ class FlaubertTokenizer(PreTrainedTokenizer):
         ],
         lang2id=None,
         id2lang=None,
-        do_lowercase_and_remove_accent=True,
         **kwargs
     ):
+        do_lowercase_and_remove_accent = kwargs.pop("do_lowercase_and_remove_accent", None)
+        if do_lowercase_and_remove_accent is not None:
+            logger.warning(
+                "`do_lowercase_and_remove_accent` is passed as a keyword argument, but this won't do anything."
+                " `FlaubertTokenizer` will always set it to `False`."
+            )
+        # always `False`
+        self.do_lowercase_and_remove_accent = False
+
+        self.do_lowercase = do_lowercase
+
         super().__init__(
             unk_token=unk_token,
             bos_token=bos_token,
@@ -260,8 +269,6 @@ class FlaubertTokenizer(PreTrainedTokenizer):
             additional_special_tokens=additional_special_tokens,
             lang2id=lang2id,
             id2lang=id2lang,
-            do_lowercase_and_remove_accent=do_lowercase_and_remove_accent,
-            do_lowercase=do_lowercase,
             **kwargs,
         )
@@ -280,8 +287,6 @@ class FlaubertTokenizer(PreTrainedTokenizer):
         # cache of sm.MosesTokenizer instance
         self.cache_moses_tokenizer = dict()
         self.lang_with_custom_tokenizer = set(["zh", "th", "ja"])
-        # True for current supported model (v1.2.0), False for XLM-17 & 100
-        self.do_lowercase_and_remove_accent = False
         self.lang2id = lang2id
         self.id2lang = id2lang
         if lang2id is not None and id2lang is not None:
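For illustration only (not part of the commit), the sketch below shows what the change means for callers: `do_lowercase_and_remove_accent` is no longer a constructor parameter, so passing it as a keyword argument only logs a warning and the attribute is hard-coded to `False`. The checkpoint name used here is an assumed example.

# Minimal usage sketch, not part of this commit.
# "flaubert/flaubert_base_cased" is an example checkpoint; any FlauBERT
# checkpoint should behave the same way after this fix.
from transformers import FlaubertTokenizer

tokenizer = FlaubertTokenizer.from_pretrained(
    "flaubert/flaubert_base_cased",
    do_lowercase_and_remove_accent=True,  # logs a warning and is otherwise ignored
)

# The attribute is always forced to False, regardless of the kwarg above.
assert tokenizer.do_lowercase_and_remove_accent is False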