chenpangpang / transformers

Commit b03b2a65
Authored Apr 26, 2021 by Sylvain Gugger

Style

parent ce11318e
Showing 3 changed files with 9 additions and 9 deletions (+9 / -9):

    src/transformers/models/tapas/tokenization_tapas.py    +3 -3
    src/transformers/tokenization_utils.py                  +3 -3
    src/transformers/tokenization_utils_base.py             +3 -3
src/transformers/models/tapas/tokenization_tapas.py

@@ -172,9 +172,9 @@ TAPAS_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
    ...
            length is required by one of the truncation/padding parameters. If the model has no specific maximum
            input length (like XLNet) truncation/padding to a maximum length will be deactivated.
        is_split_into_words (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not the input is already pre-tokenized (e.g., split into words). If set to :obj:`True`, the
            tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
            which it will tokenize. This is useful for NER or token classification.
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
            the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
    ...
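The `is_split_into_words` and `pad_to_multiple_of` keyword arguments documented in the hunk above are accepted by the tokenizer's `__call__`. A minimal usage sketch (not part of this commit; the `bert-base-uncased` checkpoint and the sample word lists are assumptions):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Inputs already split into words, e.g. for NER / token classification.
batch = [
    ["Hugging", "Face", "is", "based", "in", "New", "York"],
    ["Transformers", "has", "many", "tokenizers"],
]

encoding = tokenizer(
    batch,
    is_split_into_words=True,  # treat each inner list as pre-tokenized words
    padding=True,              # pad to the longest sequence in the batch ...
    pad_to_multiple_of=8,      # ... then round that length up to a multiple of 8
    return_tensors="pt",
)
print(encoding["input_ids"].shape)  # the sequence dimension is a multiple of 8
```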
src/transformers/tokenization_utils.py

@@ -643,9 +643,9 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
    ...
        text (:obj:`str`):
            The text to prepare.
        is_split_into_words (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not the input is already pre-tokenized (e.g., split into words). If set to :obj:`True`, the
            tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
            which it will tokenize. This is useful for NER or token classification.
        kwargs:
            Keyword arguments to use for the tokenization.
    ...
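The `text` / `is_split_into_words` / `kwargs` docstring above appears to belong to the slow tokenizer's string-preparation hook (`prepare_for_tokenization`). A rough sketch of how a custom subclass might override it; `MyWordTokenizer`, its lowercasing step, and the whitespace `_tokenize` are illustrative assumptions, not part of this commit:

```python
from typing import Any, Dict, List, Tuple

from transformers import PreTrainedTokenizer


class MyWordTokenizer(PreTrainedTokenizer):
    def prepare_for_tokenization(
        self, text: str, is_split_into_words: bool = False, **kwargs
    ) -> Tuple[str, Dict[str, Any]]:
        # String-level preprocessing applied before tokenization; kwargs that
        # are not consumed here are returned for the caller to inspect.
        if kwargs.pop("do_lower_case", False):
            text = text.lower()
        return (text, kwargs)

    def _tokenize(self, text: str, **kwargs) -> List[str]:
        # Trivial whitespace tokenization, purely for illustration.
        return text.split()
```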
src/transformers/tokenization_utils_base.py

@@ -1286,9 +1286,9 @@ ENCODE_KWARGS_DOCSTRING = r"""
    ...
            returned to provide some overlap between truncated and overflowing sequences. The value of this
            argument defines the number of overlapping tokens.
        is_split_into_words (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not the input is already pre-tokenized (e.g., split into words). If set to :obj:`True`, the
            tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
            which it will tokenize. This is useful for NER or token classification.
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
            the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
    ...
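The overlap described at the top of this hunk is controlled by the `stride` argument together with `return_overflowing_tokens`. A hedged sketch of the behaviour, assuming a fast `bert-base-uncased` tokenizer and an arbitrary `max_length`:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

text = "a fairly long sentence that does not fit into a tiny maximum length"
encoding = tokenizer(
    text,
    max_length=8,
    truncation=True,
    stride=2,                        # 2 tokens of overlap between chunks
    return_overflowing_tokens=True,  # keep the truncated part as extra chunks
)

# With a fast tokenizer, each entry of input_ids is one chunk; consecutive
# chunks share `stride` (non-special) tokens.
for ids in encoding["input_ids"]:
    print(tokenizer.decode(ids))
```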