Unverified Commit 601d4d69 authored by Thomas Wolf, committed by GitHub

[tokenizers] Updates data processors, docstring, examples and model cards to the new API (#5308)

* remove references to old API in docstring - update data processors

* style

* fix tests - better type checking error messages

* better type checking

* include awesome fix by @LysandreJik for #5310

* updated doc and examples
parent fd405e9a
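
Every hunk below applies the same migration: `encode_plus` and `batch_encode_plus` become a direct call to the tokenizer, and `pad_to_max_length=True` becomes the `padding` argument. A minimal before/after sketch, assuming a `bert-base-cased` checkpoint (any pretrained tokenizer behaves the same):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # assumed checkpoint

    # Old API, deprecated by this commit:
    #   tokenizer.encode_plus("one text", return_tensors="pt")
    #   tokenizer.batch_encode_plus(["text a", "text b"], pad_to_max_length=True)

    # New API: call the tokenizer directly; a list input is treated as a batch.
    single = tokenizer("one text", return_tensors="pt")
    batch = tokenizer(["text a", "text b"], padding=True, return_tensors="pt")
    print(single["input_ids"].shape)  # (1, sequence_length)
    print(batch["input_ids"].shape)   # (2, padded_length)
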
@@ -33,7 +33,7 @@ model = AutoModelForQuestionAnswering.from_pretrained("valhalla/longformer-base-
 text = "Huggingface has democratized NLP. Huge thanks to Huggingface for this."
 question = "What has Huggingface done ?"
-encoding = tokenizer.encode_plus(question, text, return_tensors="pt")
+encoding = tokenizer(question, text, return_tensors="pt")
 input_ids = encoding["input_ids"]
 # default is local attention everywhere
...
@@ -19,7 +19,7 @@ model = AutoModelWithLMHead.from_pretrained("valhalla/t5-base-squad")
 def get_answer(question, context):
     input_text = "question: %s context: %s </s>" % (question, context)
-    features = tokenizer.batch_encode_plus([input_text], return_tensors='pt')
+    features = tokenizer([input_text], return_tensors='pt')
     out = model.generate(input_ids=features['input_ids'],
                          attention_mask=features['attention_mask'])
...
@@ -255,7 +255,7 @@
 "# tokens_pt = torch.tensor([tokens_ids])\n",
 "\n",
 "# This code can be factored into one-line as follow\n",
-"tokens_pt2 = tokenizer.encode_plus(\"This is an input example\", return_tensors=\"pt\")\n",
+"tokens_pt2 = tokenizer(\"This is an input example\", return_tensors=\"pt\")\n",
 "\n",
 "for key, value in tokens_pt2.items():\n",
 "    print(\"{}:\\n\\t{}\".format(key, value))\n",
@@ -268,7 +268,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"As you can see above, the method `encode_plus` provides a convenient way to generate all the required parameters\n",
+"As you can see above, calling the tokenizer provides a convenient way to generate all the required parameters\n",
 "that will go through the model. \n",
 "\n",
 "Moreover, you might have noticed it generated some additional tensors: \n",
@@ -302,10 +302,10 @@
 ],
 "source": [
 "# Single segment input\n",
-"single_seg_input = tokenizer.encode_plus(\"This is a sample input\")\n",
+"single_seg_input = tokenizer(\"This is a sample input\")\n",
 "\n",
 "# Multiple segment input\n",
-"multi_seg_input = tokenizer.encode_plus(\"This is segment A\", \"This is segment B\")\n",
+"multi_seg_input = tokenizer(\"This is segment A\", \"This is segment B\")\n",
 "\n",
 "print(\"Single segment token (str): {}\".format(tokenizer.convert_ids_to_tokens(single_seg_input['input_ids'])))\n",
 "print(\"Single segment token (int): {}\".format(single_seg_input['input_ids']))\n",
@@ -344,9 +344,9 @@
 ],
 "source": [
 "# Padding highlight\n",
-"tokens = tokenizer.batch_encode_plus(\n",
+"tokens = tokenizer(\n",
 "    [\"This is a sample\", \"This is another longer sample text\"], \n",
-"    pad_to_max_length=True  # First sentence will have some PADDED tokens to match second sequence length\n",
+"    padding=True  # First sentence will have some PADDED tokens to match second sequence length\n",
 ")\n",
 "\n",
 "for i in range(2):\n",
@@ -405,8 +405,8 @@
 ],
 "source": [
 "# transformers generates a ready to use dictionary with all the required parameters for the specific framework.\n",
-"input_tf = tokenizer.encode_plus(\"This is a sample input\", return_tensors=\"tf\")\n",
-"input_pt = tokenizer.encode_plus(\"This is a sample input\", return_tensors=\"pt\")\n",
+"input_tf = tokenizer(\"This is a sample input\", return_tensors=\"tf\")\n",
+"input_pt = tokenizer(\"This is a sample input\", return_tensors=\"pt\")\n",
 "\n",
 "# Let's compare the outputs\n",
 "output_tf, output_pt = model_tf(input_tf), model_pt(**input_pt)\n",
@@ -464,7 +464,7 @@
 "from transformers import DistilBertModel\n",
 "\n",
 "bert_distil = DistilBertModel.from_pretrained('distilbert-base-cased')\n",
-"input_pt = tokenizer.encode_plus(\n",
+"input_pt = tokenizer(\n",
 "    'This is a sample input to demonstrate performance of distiled models especially inference time', \n",
 "    return_tensors=\"pt\"\n",
 ")\n",
@@ -514,7 +514,7 @@
 "de_bert = BertModel.from_pretrained(\"dbmdz/bert-base-german-cased\")\n",
 "de_tokenizer = BertTokenizer.from_pretrained(\"dbmdz/bert-base-german-cased\")\n",
 "\n",
-"de_input = de_tokenizer.encode_plus(\n",
+"de_input = de_tokenizer(\n",
 "    \"Hugging Face ist eine französische Firma mit Sitz in New-York.\",\n",
 "    return_tensors=\"pt\"\n",
 ")\n",
@@ -559,4 +559,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 4
-}
\ No newline at end of file
+}
...
@@ -248,7 +248,7 @@
 "cpu_model = create_model_for_provider(\"onnx/bert-base-cased.onnx\", \"CPUExecutionProvider\")\n",
 "\n",
 "# Inputs are provided through numpy array\n",
-"model_inputs = tokenizer.encode_plus(\"My name is Bert\", return_tensors=\"pt\")\n",
+"model_inputs = tokenizer(\"My name is Bert\", return_tensors=\"pt\")\n",
 "inputs_onnx = {k: v.cpu().detach().numpy() for k, v in model_inputs.items()}\n",
 "\n",
 "# Run the model (None = get all the outputs)\n",
...
@@ -86,7 +86,7 @@ def infer_shapes(nlp: Pipeline, framework: str) -> Tuple[List[str], List[str], D
         print("Found {} {} with shape: {}".format("input" if is_input else "output", name, axes))
         return axes

-    tokens = nlp.tokenizer.encode_plus("This is a sample output", return_tensors=framework)
+    tokens = nlp.tokenizer("This is a sample output", return_tensors=framework)
     seq_len = tokens.input_ids.shape[-1]
     outputs = nlp.model(**tokens) if framework == "pt" else nlp.model(tokens)
...
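`infer_shapes` can use `tokens.input_ids` and `nlp.model(**tokens)` because the call returns a `BatchEncoding`, which supports both attribute and dict-style access and unpacks like a dict. A sketch assuming `bert-base-cased` in place of the pipeline's model:

    from transformers import AutoModel, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # assumed checkpoint
    model = AutoModel.from_pretrained("bert-base-cased")

    tokens = tokenizer("This is a sample output", return_tensors="pt")
    seq_len = tokens.input_ids.shape[-1]  # attribute access, as in infer_shapes
    outputs = model(**tokens)             # BatchEncoding unpacks like a dict
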
@@ -91,7 +91,7 @@ class LineByLineTextDataset(Dataset):
         with open(file_path, encoding="utf-8") as f:
             lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]

-        batch_encoding = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)
+        batch_encoding = tokenizer(lines, add_special_tokens=True, truncation=True, max_length=block_size)
         self.examples = batch_encoding["input_ids"]

     def __len__(self):
...
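In the new API truncation is opt-in, hence the added `truncation=True`: passing `max_length` alone now emits a deprecation warning. A sketch of the dataset's call with toy lines and a small block size (both assumed values):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # assumed checkpoint
    lines = ["a short line", "a much longer line " * 50]
    block_size = 16  # toy value

    batch_encoding = tokenizer(lines, add_special_tokens=True, truncation=True, max_length=block_size)
    print([len(ids) for ids in batch_encoding["input_ids"]])  # every example capped at 16
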
@@ -137,8 +137,11 @@ def _glue_convert_examples_to_features(
     labels = [label_from_example(example) for example in examples]

-    batch_encoding = tokenizer.batch_encode_plus(
-        [(example.text_a, example.text_b) for example in examples], max_length=max_length, pad_to_max_length=True,
+    batch_encoding = tokenizer(
+        [(example.text_a, example.text_b) for example in examples],
+        max_length=max_length,
+        padding="max_length",
+        truncation=True,
     )

     features = []
...
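GLUE keeps fixed-size features, so `pad_to_max_length=True` maps to `padding="max_length"` rather than `padding=True`. A sketch of the difference between the two modes, assuming `bert-base-cased` and two toy sentence pairs:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # assumed checkpoint
    pairs = [("first sentence", "its pair"), ("second", "pair")]

    dynamic = tokenizer(pairs, padding=True, truncation=True, max_length=128)
    fixed = tokenizer(pairs, padding="max_length", truncation=True, max_length=128)
    print(len(dynamic["input_ids"][0]))  # length of the longest pair in the batch
    print(len(fixed["input_ids"][0]))    # always 128
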
@@ -120,7 +120,9 @@ def squad_convert_example_to_features(example, max_seq_length, doc_stride, max_q
     spans = []

-    truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length)
+    truncated_query = tokenizer.encode(
+        example.question_text, add_special_tokens=False, truncation=True, max_length=max_query_length
+    )
     sequence_added_tokens = (
         tokenizer.max_len - tokenizer.max_len_single_sentence + 1
         if "roberta" in str(type(tokenizer)) or "camembert" in str(type(tokenizer))
@@ -131,14 +133,14 @@ def squad_convert_example_to_features(example, max_seq_length, doc_stride, max_q
     span_doc_tokens = all_doc_tokens
     while len(spans) * doc_stride < len(all_doc_tokens):

-        encoded_dict = tokenizer.encode_plus(
+        encoded_dict = tokenizer.encode_plus(  # TODO(thom) update this logic
             truncated_query if tokenizer.padding_side == "right" else span_doc_tokens,
             span_doc_tokens if tokenizer.padding_side == "right" else truncated_query,
+            truncation="only_second" if tokenizer.padding_side == "right" else "only_first",
+            padding="max_length",
             max_length=max_seq_length,
             return_overflowing_tokens=True,
-            pad_to_max_length=True,
             stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
-            truncation_strategy="only_second" if tokenizer.padding_side == "right" else "only_first",
             return_token_type_ids=True,
         )
@@ -176,7 +178,9 @@ def squad_convert_example_to_features(example, max_seq_length, doc_stride, max_q
         spans.append(encoded_dict)

-        if "overflowing_tokens" not in encoded_dict:
+        if "overflowing_tokens" not in encoded_dict or (
+            "overflowing_tokens" in encoded_dict and len(encoded_dict["overflowing_tokens"]) == 0
+        ):
             break
         span_doc_tokens = encoded_dict["overflowing_tokens"]
...
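The windowing loop above depends on `return_overflowing_tokens`: with the slow tokenizer, the tokens that did not fit come back under the `overflowing_tokens` key and are fed into the next span. A sketch with a toy question and context (checkpoint name and lengths are assumptions):

    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")  # assumed checkpoint
    question = "Who was Jim Henson?"
    context = "Jim Henson was a nice puppet. " * 40  # long enough to overflow

    encoded_dict = tokenizer.encode_plus(
        question,
        context,
        truncation="only_second",  # only the context side may be truncated
        padding="max_length",
        max_length=64,
        stride=16,  # tokens of overlap carried into the overflow
        return_overflowing_tokens=True,
    )
    print(len(encoded_dict["input_ids"]))           # 64, the padded span
    print(len(encoded_dict["overflowing_tokens"]))  # what is left for the next span
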
@@ -278,7 +278,7 @@ PT_MULTIPLE_CHOICE_SAMPLE = r"""
        >>> choice1 = "It is eaten while held in the hand."
        >>> labels = torch.tensor(0).unsqueeze(0)  # choice0 is correct (according to Wikipedia ;)), batch size 1

-       >>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='pt', pad_to_max_length=True)
+       >>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='pt', padding=True)
        >>> outputs = model(**{{k: v.unsqueeze(0) for k,v in encoding.items()}}, labels=labels)  # batch size is 1

        >>> # the linear classifier still needs to be trained
@@ -391,7 +391,7 @@ TF_MULTIPLE_CHOICE_SAMPLE = r"""
        >>> choice0 = "It is eaten with a fork and a knife."
        >>> choice1 = "It is eaten while held in the hand."

-       >>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='tf', pad_to_max_length=True)
+       >>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='tf', padding=True)
        >>> inputs = {{k: tf.expand_dims(v, 0) for k, v in encoding.items()}}
        >>> outputs = model(inputs)  # batch size is 1
...
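These samples batch prompt/choice pairs through one call; with `padding=True` the pairs are padded together and then unsqueezed into a (batch, num_choices, seq_len) block. A PyTorch sketch, assuming `bert-base-cased` (the samples' own checkpoints behave identically):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # assumed checkpoint
    prompt = "In Italy, pizza served in formal settings is presented unsliced."
    choice0 = "It is eaten with a fork and a knife."
    choice1 = "It is eaten while held in the hand."

    # Each inner list is one (text, text_pair) example in the batch.
    encoding = tokenizer([[prompt, choice0], [prompt, choice1]], return_tensors="pt", padding=True)
    inputs = {k: v.unsqueeze(0) for k, v in encoding.items()}  # add the batch dimension
    print(inputs["input_ids"].shape)  # torch.Size([1, 2, seq_len])
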
@@ -402,7 +402,7 @@ ALBERT_INPUTS_DOCSTRING = r"""
             Indices can be obtained using :class:`transformers.AlbertTokenizer`.
             See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+            :func:`transformers.PreTrainedTokenizer.__call__` for details.

             `What are input IDs? <../glossary.html#input-ids>`__
         attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
...
@@ -579,7 +579,7 @@ BERT_INPUTS_DOCSTRING = r"""
             Indices can be obtained using :class:`transformers.BertTokenizer`.
             See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+            :func:`transformers.PreTrainedTokenizer.__call__` for details.

             `What are input IDs? <../glossary.html#input-ids>`__
         attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
...
@@ -251,7 +251,7 @@ CTRL_INPUTS_DOCSTRING = r"""
             Indices can be obtained using :class:`transformers.CTRLTokenizer`.
             See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+            :func:`transformers.PreTrainedTokenizer.__call__` for details.

             `What are input IDs? <../glossary.html#input-ids>`__
         past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
...
@@ -360,7 +360,7 @@ DISTILBERT_INPUTS_DOCSTRING = r"""
             Indices can be obtained using :class:`transformers.DistilBertTokenizer`.
             See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+            :func:`transformers.PreTrainedTokenizer.__call__` for details.

             `What are input IDs? <../glossary.html#input-ids>`__
         attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
...
@@ -893,7 +893,7 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
        >>> choice1 = "It is eaten while held in the hand."
        >>> labels = torch.tensor(0).unsqueeze(0)  # choice0 is correct (according to Wikipedia ;)), batch size 1

-       >>> encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
+       >>> encoding = tokenizer([[prompt, choice0], [prompt, choice1]], return_tensors='pt', padding=True)
        >>> outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels)  # batch size is 1

        >>> # the linear classifier still needs to be trained
...
@@ -186,7 +186,7 @@ ELECTRA_INPUTS_DOCSTRING = r"""
             Indices can be obtained using :class:`transformers.ElectraTokenizer`.
             See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+            :func:`transformers.PreTrainedTokenizer.__call__` for details.

             `What are input IDs? <../glossary.html#input-ids>`__
         attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
...
@@ -65,7 +65,7 @@ FLAUBERT_INPUTS_DOCSTRING = r"""
             Indices can be obtained using :class:`transformers.BertTokenizer`.
             See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+            :func:`transformers.PreTrainedTokenizer.__call__` for details.

             `What are input IDs? <../glossary.html#input-ids>`__
         attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
...
@@ -302,7 +302,7 @@ GPT2_INPUTS_DOCSTRING = r"""
             Indices can be obtained using :class:`transformers.GPT2Tokenizer`.
             See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+            :func:`transformers.PreTrainedTokenizer.__call__` for details.

             `What are input IDs? <../glossary.html#input-ids>`__
...
@@ -454,7 +454,7 @@ LONGFORMER_INPUTS_DOCSTRING = r"""
             Indices can be obtained using :class:`transformers.LongformerTokenizer`.
             See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+            :func:`transformers.PreTrainedTokenizer.__call__` for details.

             `What are input IDs? <../glossary.html#input-ids>`__
         attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
...
@@ -970,7 +970,7 @@ class LongformerForQuestionAnswering(BertPreTrainedModel):
        >>> model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")

        >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-       >>> encoding = tokenizer.encode_plus(question, text, return_tensors="pt")
+       >>> encoding = tokenizer(question, text, return_tensors="pt")
        >>> input_ids = encoding["input_ids"]

        >>> # default is local attention everywhere
...
@@ -678,7 +678,7 @@ MOBILEBERT_INPUTS_DOCSTRING = r"""
             Indices can be obtained using :class:`transformers.MobileBertTokenizer`.
             See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+            :func:`transformers.PreTrainedTokenizer.__call__` for details.

             `What are input IDs? <../glossary.html#input-ids>`__
         attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
...
@@ -296,7 +296,7 @@ OPENAI_GPT_INPUTS_DOCSTRING = r"""
             Indices can be obtained using :class:`transformers.OpenAIGPTTokenizer`.
             See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+            :func:`transformers.PreTrainedTokenizer.__call__` for details.

             `What are input IDs? <../glossary.html#input-ids>`__
         attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
...
@@ -1487,7 +1487,7 @@ REFORMER_INPUTS_DOCSTRING = r"""
             Indices can be obtained using :class:`transformers.ReformerTokenizer`.
             See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+            :func:`transformers.PreTrainedTokenizer.__call__` for details.

             `What are input IDs? <../glossary.html#input-ids>`__
         attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
...