Unverified Commit 934e2799 authored by Li-Huai (Allan) Lin's avatar Li-Huai (Allan) Lin Committed by GitHub
Browse files

Fix mask token handling (#14364)

* Fix mask token handling

* Revert "Fix mask token handling"

This reverts commit daaa3f5291b1f71e5bc3604ca281c000000c4648.

* Fix FNet mask token tokenization
parent 4df7d05a
...@@ -113,8 +113,13 @@ class FNetTokenizer(PreTrainedTokenizer): ...@@ -113,8 +113,13 @@ class FNetTokenizer(PreTrainedTokenizer):
sp_model_kwargs: Optional[Dict[str, Any]] = None, sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs **kwargs
) -> None: ) -> None:
# Mask token behave like a normal word, i.e. include the space before it # Mask token behave like a normal word, i.e. include the space before it and
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token # is included in the raw text, there should be a match in a non-normalized sentence.
mask_token = (
AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
if isinstance(mask_token, str)
else mask_token
)
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
......
...@@ -107,8 +107,13 @@ class FNetTokenizerFast(PreTrainedTokenizerFast): ...@@ -107,8 +107,13 @@ class FNetTokenizerFast(PreTrainedTokenizerFast):
mask_token="[MASK]", mask_token="[MASK]",
**kwargs **kwargs
): ):
# Mask token behave like a normal word, i.e. include the space before it # Mask token behave like a normal word, i.e. include the space before it and
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token # is included in the raw text, there should be a match in a non-normalized sentence.
mask_token = (
AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
if isinstance(mask_token, str)
else mask_token
)
super().__init__( super().__init__(
vocab_file, vocab_file,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment