Unverified Commit c94c1b89 authored by SaulLu's avatar SaulLu Committed by GitHub
Browse files

update the arguments `add_prefix_space` and `trim_offsets` in...

update the arguments `add_prefix_space` and `trim_offsets` in `backend_tokenizer.post_processor` of `RobertaTokenizerFast` (#14752)

* add tests

* change post-processor, pre-tokenizer and decoder (can't update decoder)

* update test (remove decoder which doesn't depend on trim and add_prefix)

* just update the post_processor

* fix change

* `trim_offsets` has no influence on `pre_tokenizer`

* remove a test that need some input from the `tokenizers` lib maintainers

* format

* add new test offsets roberta

* polish comments
parent ec3567fe
...@@ -13,9 +13,11 @@ ...@@ -13,9 +13,11 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
"""Fast Tokenization classes for RoBERTa.""" """Fast Tokenization classes for RoBERTa."""
import json
from typing import List, Optional from typing import List, Optional
from tokenizers import processors
from ...tokenization_utils_base import AddedToken from ...tokenization_utils_base import AddedToken
from ...utils import logging from ...utils import logging
from ..gpt2.tokenization_gpt2_fast import GPT2TokenizerFast from ..gpt2.tokenization_gpt2_fast import GPT2TokenizerFast
...@@ -162,6 +164,7 @@ class RobertaTokenizerFast(GPT2TokenizerFast): ...@@ -162,6 +164,7 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
pad_token="<pad>", pad_token="<pad>",
mask_token="<mask>", mask_token="<mask>",
add_prefix_space=False, add_prefix_space=False,
trim_offsets=True,
**kwargs **kwargs
): ):
super().__init__( super().__init__(
...@@ -177,9 +180,37 @@ class RobertaTokenizerFast(GPT2TokenizerFast): ...@@ -177,9 +180,37 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
pad_token=pad_token, pad_token=pad_token,
mask_token=mask_token, mask_token=mask_token,
add_prefix_space=add_prefix_space, add_prefix_space=add_prefix_space,
trim_offsets=trim_offsets,
**kwargs, **kwargs,
) )
# the pre_tokenizer is already updated in the GPT2TokenizerFast `__init__`
tokenizer_component = "post_processor"
tokenizer_component_instance = getattr(self.backend_tokenizer, tokenizer_component, None)
if tokenizer_component_instance:
state = json.loads(tokenizer_component_instance.__getstate__())
# The lists 'sep' and 'cls' must be cased in tuples for the object `post_processor_class`
if "sep" in state:
state["sep"] = tuple(state["sep"])
if "cls" in state:
state["cls"] = tuple(state["cls"])
changes_to_apply = False
if state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
state["add_prefix_space"] = add_prefix_space
changes_to_apply = True
if state.get("trim_offsets", trim_offsets) != trim_offsets:
state["trim_offsets"] = trim_offsets
changes_to_apply = True
if changes_to_apply:
component_class = getattr(processors, state.pop("type"))
new_value = component_class(**state)
setattr(self.backend_tokenizer, tokenizer_component, new_value)
@property @property
def mask_token(self) -> str: def mask_token(self) -> str:
""" """
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
# limitations under the License. # limitations under the License.
import itertools
import json import json
import os import os
import unittest import unittest
...@@ -196,3 +197,107 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ...@@ -196,3 +197,107 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertSequenceEqual( self.assertSequenceEqual(
tokens_r_str, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"] tokens_r_str, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"]
) )
def test_change_add_prefix_space_and_trim_offsets_args(self):
for trim_offsets, add_prefix_space in itertools.product([True, False], repeat=2):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
self.tmpdirname, use_fast=True, add_prefix_space=add_prefix_space, trim_offsets=trim_offsets
)
pre_tokenizer_state = json.loads(tokenizer_r.backend_tokenizer.pre_tokenizer.__getstate__())
post_processor_state = json.loads(tokenizer_r.backend_tokenizer.post_processor.__getstate__())
self.assertEqual(pre_tokenizer_state["add_prefix_space"], add_prefix_space)
self.assertEqual(post_processor_state["add_prefix_space"], add_prefix_space)
self.assertEqual(post_processor_state["trim_offsets"], trim_offsets)
def test_offsets_mapping_with_different_add_prefix_space_and_trim_space_arguments(self):
# Test which aims to verify that the offsets are well adapted to the argument `add_prefix_space` and
# `trim_offsets`
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
text_of_1_token = "hello" # `hello` is a token in the vocabulary of `pretrained_name`
text = f"{text_of_1_token} {text_of_1_token}"
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=True
)
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
self.assertEqual(encoding.offset_mapping[0], (0, len(text_of_1_token)))
self.assertEqual(
encoding.offset_mapping[1],
(len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)),
)
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=True
)
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
self.assertEqual(encoding.offset_mapping[0], (0, len(text_of_1_token)))
self.assertEqual(
encoding.offset_mapping[1],
(len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)),
)
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=False
)
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
self.assertEqual(encoding.offset_mapping[0], (0, len(text_of_1_token)))
self.assertEqual(
encoding.offset_mapping[1],
(len(text_of_1_token), len(text_of_1_token) + 1 + len(text_of_1_token)),
)
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=False
)
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
self.assertEqual(encoding.offset_mapping[0], (0, len(text_of_1_token)))
self.assertEqual(
encoding.offset_mapping[1],
(len(text_of_1_token), len(text_of_1_token) + 1 + len(text_of_1_token)),
)
text = f" {text}"
# tokenizer_r = self.rust_tokenizer_class.from_pretrained(
# pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=True
# )
# encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
# self.assertEqual(encoding.offset_mapping[0], (1, 1 + len(text_of_1_token)))
# self.assertEqual(
# encoding.offset_mapping[1],
# (1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)),
# )
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=True
)
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
self.assertEqual(encoding.offset_mapping[0], (1, 1 + len(text_of_1_token)))
self.assertEqual(
encoding.offset_mapping[1],
(1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)),
)
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=False
)
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
self.assertEqual(encoding.offset_mapping[0], (0, 1 + len(text_of_1_token)))
self.assertEqual(
encoding.offset_mapping[1],
(1 + len(text_of_1_token), 1 + len(text_of_1_token) + 1 + len(text_of_1_token)),
)
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=False
)
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
self.assertEqual(encoding.offset_mapping[0], (0, 1 + len(text_of_1_token)))
self.assertEqual(
encoding.offset_mapping[1],
(1 + len(text_of_1_token), 1 + len(text_of_1_token) + 1 + len(text_of_1_token)),
)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment