Unverified Commit 9f4e0c23 authored by Lysandre Debut, committed by GitHub

Documentation about loading a fast tokenizer within Transformers (#11029)



* Documentation about loading a fast tokenizer within Transformers

* Apply suggestions from code review
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* style
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent 6c25f522
Using tokenizers from 🤗 Tokenizers
=======================================================================================================================
The :class:`~transformers.PreTrainedTokenizerFast` depends on the `tokenizers
<https://huggingface.co/docs/tokenizers>`__ library. The tokenizers obtained from the 🤗 Tokenizers library can be
loaded very simply into 🤗 Transformers.
Before getting into the specifics, let's first create a dummy tokenizer in a few lines:
.. code-block::

    >>> from tokenizers import Tokenizer
    >>> from tokenizers.models import BPE
    >>> from tokenizers.trainers import BpeTrainer
    >>> from tokenizers.pre_tokenizers import Whitespace

    >>> tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    >>> trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    >>> tokenizer.pre_tokenizer = Whitespace()
    >>> files = [...]
    >>> tokenizer.train(files, trainer)
We now have a tokenizer trained on the files we defined. We can either continue using it in that runtime, or save it to
a JSON file for future re-use.
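
For instance, we can already encode some text with it (a minimal sketch; the exact tokens produced depend entirely on
the training files used above):

.. code-block::

    >>> output = tokenizer.encode("Hello, y'all!")
    >>> output.tokens  # the learned subword tokens; values depend on the training data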
Loading directly from the tokenizer object
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Let's see how to leverage this tokenizer object in the 🤗 Transformers library. The
:class:`~transformers.PreTrainedTokenizerFast` class allows for easy instantiation by accepting the instantiated
:obj:`tokenizer` object as an argument:
.. code-block::

    >>> from transformers import PreTrainedTokenizerFast

    >>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
This object can now be used with all the methods shared by the 🤗 Transformers tokenizers! Head to :doc:`the tokenizer
page <main_classes/tokenizer>` for more information.
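
For example, calling the wrapped tokenizer works just like calling any other 🤗 Transformers tokenizer (a minimal
sketch; the resulting ids depend on the vocabulary trained above):

.. code-block::

    >>> encoded = fast_tokenizer("Hello, y'all!")
    >>> encoded["input_ids"]  # token ids from the trained vocabulary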
Loading from a JSON file
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
In order to load a tokenizer from a JSON file, let's first save our tokenizer:
.. code-block::

    >>> tokenizer.save("tokenizer.json")
The path to which we saved this file can be passed to the :class:`~transformers.PreTrainedTokenizerFast` initialization
method using the :obj:`tokenizer_file` parameter:
.. code-block::

    >>> from transformers import PreTrainedTokenizerFast

    >>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")
This object can now be used with all the methods shared by the 🤗 Transformers tokenizers! Head to :doc:`the tokenizer
page <main_classes/tokenizer>` for more information.
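
If you plan to reuse this tokenizer later on, you can also persist it in the usual 🤗 Transformers format (a minimal
sketch; ``my_tokenizer`` is a hypothetical directory name):

.. code-block::

    >>> fast_tokenizer.save_pretrained("my_tokenizer")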
@@ -384,6 +384,7 @@ TensorFlow and/or Flax.
     migration
     contributing
     add_new_model
+    fast_tokenizers
     testing
     serialization
@@ -62,6 +62,11 @@ PreTrainedTokenizer
 PreTrainedTokenizerFast
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+The :class:`~transformers.PreTrainedTokenizerFast` depends on the `tokenizers
+<https://huggingface.co/docs/tokenizers>`__ library. The tokenizers obtained from the 🤗 Tokenizers library can be
+loaded very simply into 🤗 Transformers. Take a look at the :doc:`Using tokenizers from 🤗 Tokenizers
+<../fast_tokenizers>` page to understand how this is done.
+
 .. autoclass:: transformers.PreTrainedTokenizerFast
     :special-members: __call__
     :members: batch_decode, convert_ids_to_tokens, convert_tokens_to_ids, convert_tokens_to_string, decode, encode,
@@ -54,6 +54,12 @@ TOKENIZER_CONFIG_FILE = "tokenizer_config.json"

 # Slow tokenizers have an additional added tokens file
 ADDED_TOKENS_FILE = "added_tokens.json"

+INIT_TOKENIZER_DOCSTRING += """
+        tokenizer_object (:class:`tokenizers.Tokenizer`):
+            A :class:`tokenizers.Tokenizer` object from 🤗 tokenizers to instantiate from. See :doc:`Using tokenizers
+            from 🤗 tokenizers <../fast_tokenizers>` for more information.
+"""
+
 @add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
 class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
@@ -72,6 +78,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
     slow_tokenizer_class: PreTrainedTokenizer = None

     def __init__(self, *args, **kwargs):
+        tokenizer_object = kwargs.pop("tokenizer_object", None)
         slow_tokenizer = kwargs.pop("__slow_tokenizer", None)
         fast_tokenizer_file = kwargs.pop("tokenizer_file", None)
         from_slow = kwargs.pop("from_slow", False)
@@ -82,7 +89,9 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
                 "have sentencepiece installed."
             )

-        if fast_tokenizer_file is not None and not from_slow:
+        if tokenizer_object is not None:
+            fast_tokenizer = tokenizer_object
+        elif fast_tokenizer_file is not None and not from_slow:
             # We have a serialization from tokenizers which lets us directly build the backend
             fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
         elif slow_tokenizer is not None:
@@ -94,10 +103,10 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
             fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
         else:
             raise ValueError(
-                "Couldn't instantiate the backend tokenizer from one of: "
-                "(1) a `tokenizers` library serialization file, "
-                "(2) a slow tokenizer instance to convert or "
-                "(3) an equivalent slow tokenizer class to instantiate and convert. "
+                "Couldn't instantiate the backend tokenizer from one of: \n"
+                "(1) a `tokenizers` library serialization file, \n"
+                "(2) a slow tokenizer instance to convert or \n"
+                "(3) an equivalent slow tokenizer class to instantiate and convert. \n"
                 "You need to have sentencepiece installed to convert a slow tokenizer to a fast one."
             )
@@ -12,18 +12,33 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import os
 import pickle
+import tempfile
 import unittest
 from typing import Callable, Optional

 import numpy as np

-from transformers import BatchEncoding, BertTokenizer, BertTokenizerFast, PreTrainedTokenizer, TensorType, TokenSpan
+from transformers import (
+    BatchEncoding,
+    BertTokenizer,
+    BertTokenizerFast,
+    PreTrainedTokenizer,
+    PreTrainedTokenizerFast,
+    TensorType,
+    TokenSpan,
+    is_tokenizers_available,
+)
 from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
 from transformers.testing_utils import CaptureStderr, require_flax, require_tf, require_tokenizers, require_torch, slow

+if is_tokenizers_available():
+    from tokenizers import Tokenizer
+    from tokenizers.models import WordPiece
+

 class TokenizerUtilsTest(unittest.TestCase):
     def check_tokenizer_from_pretrained(self, tokenizer_class):
         s3_models = list(tokenizer_class.max_model_input_sizes.keys())
@@ -253,3 +268,15 @@ class TokenizerUtilsTest(unittest.TestCase):
         batch = tokenizer.pad(features, padding=True, return_tensors="tf")
         self.assertTrue(isinstance(batch["input_ids"], tf.Tensor))
         self.assertEqual(batch["input_ids"].numpy().tolist(), [[0, 1, 2, tokenizer.pad_token_id], [0, 1, 2, 3]])
+
+    @require_tokenizers
+    def test_instantiation_from_tokenizers(self):
+        bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
+        PreTrainedTokenizerFast(tokenizer_object=bert_tokenizer)
+
+    @require_tokenizers
+    def test_instantiation_from_tokenizers_json_file(self):
+        bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            bert_tokenizer.save(os.path.join(tmpdirname, "tokenizer.json"))
+            PreTrainedTokenizerFast(tokenizer_file=os.path.join(tmpdirname, "tokenizer.json"))