"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "1609a436eca115853b5a4cfd80b9ec2302bb9fcc"
Unverified Commit b5e2b183, authored by Sylvain Gugger and committed by GitHub

Doc styler examples (#14953)

* Fix bad examples

* Add black formatting to style_doc

* Use first nonempty line

* Put it at the right place

* Don't add spaces to empty lines

* Better templates

* Deal with triple quotes in docstrings

* Result of style_doc

* Enable mdx treatment and fix code examples in MDXs

* Result of doc styler on doc source files

* Last fixes

* Break copy from
Parent commit: e13f72fb
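The heart of the change is that documentation code examples are now run through black. A minimal sketch of the idea, assuming black's public `format_str` API and a 119-character line length (the helper below is illustrative, not the repository's actual `style_doc` implementation):

```python
import black


def format_doc_example(code: str, max_len: int = 119) -> str:
    """Format a single doc code example with black; the 119-char line length is an assumption."""
    return black.format_str(code, mode=black.Mode(line_length=max_len)).rstrip("\n")


# Single quotes become double quotes and long calls get wrapped, as in the hunks below.
print(format_doc_example("tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')"))
# tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
```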
@@ -58,11 +58,13 @@ predicted token ids.
>>> model = SpeechEncoderDecoderModel.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
>>> processor = Speech2Text2Processor.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
>>> def map_to_array(batch):
...     speech, _ = sf.read(batch["file"])
...     batch["speech"] = speech
...     return batch
>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.map(map_to_array)

@@ -81,7 +83,11 @@ predicted token ids.
>>> from transformers import pipeline
>>> librispeech_en = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
->>> asr = pipeline("automatic-speech-recognition", model="facebook/s2t-wav2vec2-large-en-de", feature_extractor="facebook/s2t-wav2vec2-large-en-de")
+>>> asr = pipeline(
+...     "automatic-speech-recognition",
+...     model="facebook/s2t-wav2vec2-large-en-de",
+...     feature_extractor="facebook/s2t-wav2vec2-large-en-de",
+... )
>>> translation_de = asr(librispeech_en[0]["file"])
```

...
@@ -98,8 +98,8 @@ language modeling head on top of the decoder.
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
-input_ids = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt').input_ids
-labels = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>', return_tensors='pt').input_ids
+input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
+labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
# the forward function automatically creates the correct decoder_input_ids
loss = model(input_ids=input_ids, labels=labels).loss
```

@@ -120,8 +120,8 @@ language modeling head on top of the decoder.
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
-input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
-labels = tokenizer('Das Haus ist wunderbar.', return_tensors='pt').input_ids
+input_ids = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt").input_ids
+labels = tokenizer("Das Haus ist wunderbar.", return_tensors="pt").input_ids
# the forward function automatically creates the correct decoder_input_ids
loss = model(input_ids=input_ids, labels=labels).loss
```

@@ -148,7 +148,7 @@ language modeling head on top of the decoder.
ignored. The code example below illustrates all of this.
```python
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
tokenizer = T5Tokenizer.from_pretrained("t5-small")

@@ -168,18 +168,19 @@ language modeling head on top of the decoder.
# encode the inputs
task_prefix = "translate English to French: "
input_sequences = [input_sequence_1, input_sequence_2]
-encoding = tokenizer([task_prefix + sequence for sequence in input_sequences],
-                     padding='longest',
-                     max_length=max_source_length,
-                     truncation=True,
-                     return_tensors="pt")
+encoding = tokenizer(
+    [task_prefix + sequence for sequence in input_sequences],
+    padding="longest",
+    max_length=max_source_length,
+    truncation=True,
+    return_tensors="pt",
+)
input_ids, attention_mask = encoding.input_ids, encoding.attention_mask
# encode the targets
-target_encoding = tokenizer([output_sequence_1, output_sequence_2],
-                            padding='longest',
-                            max_length=max_target_length,
-                            truncation=True)
+target_encoding = tokenizer(
+    [output_sequence_1, output_sequence_2], padding="longest", max_length=max_target_length, truncation=True
+)
labels = target_encoding.input_ids
# replace padding token id's of the labels by -100

@@ -218,12 +219,12 @@ There's also [this blog post](https://huggingface.co/blog/encoder-decoder#encode
generation works in general in encoder-decoder models.
```python
from transformers import T5Tokenizer, T5ForConditionalGeneration
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
-input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
+input_ids = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt").input_ids
outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
# Das Haus ist wunderbar.

@@ -242,17 +243,17 @@ model = T5ForConditionalGeneration.from_pretrained("t5-small")
# when generating, we will use the logits of right-most token to predict the next token
# so the padding should be on the left
tokenizer.padding_side = "left"
-tokenizer.pad_token = tokenizer.eos_token # to avoid an error
-task_prefix = 'translate English to German: '
-sentences = ['The house is wonderful.', 'I like to work in NYC.'] # use different length sentences to test batching
+tokenizer.pad_token = tokenizer.eos_token  # to avoid an error
+task_prefix = "translate English to German: "
+sentences = ["The house is wonderful.", "I like to work in NYC."]  # use different length sentences to test batching
inputs = tokenizer([task_prefix + sentence for sentence in sentences], return_tensors="pt", padding=True)
output_sequences = model.generate(
-    input_ids=inputs['input_ids'],
-    attention_mask=inputs['attention_mask'],
-    do_sample=False, # disable sampling to test if batching affects output
+    input_ids=inputs["input_ids"],
+    attention_mask=inputs["attention_mask"],
+    do_sample=False,  # disable sampling to test if batching affects output
)
print(tokenizer.batch_decode(output_sequences, skip_special_tokens=True))

...

@@ -22,7 +22,7 @@ One can directly plug in the weights of T5v1.1 into a T5 model, like so:
```python
from transformers import T5ForConditionalGeneration
-model = T5ForConditionalGeneration.from_pretrained('google/t5-v1_1-base')
+model = T5ForConditionalGeneration.from_pretrained("google/t5-v1_1-base")
```
T5 Version 1.1 includes the following improvements compared to the original T5 model:

...
@@ -77,11 +77,13 @@ The following example shows how to get the last hidden state using [`VisualBertM
>>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
>>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
->>> inputs.update({
-...     "visual_embeds": visual_embeds,
-...     "visual_token_type_ids": visual_token_type_ids,
-...     "visual_attention_mask": visual_attention_mask
-... })
+>>> inputs.update(
+...     {
+...         "visual_embeds": visual_embeds,
+...         "visual_token_type_ids": visual_token_type_ids,
+...         "visual_attention_mask": visual_attention_mask,
+...     }
+... )
>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
```

...
@@ -50,9 +50,8 @@ For instance:
```python
>>> model = AutoModel.from_pretrained(
->>>     "julien-c/EsperBERTo-small",
->>>     revision="v2.0.1" # tag name, or branch name, or commit hash
->>> )
+...     "julien-c/EsperBERTo-small", revision="v2.0.1" # tag name, or branch name, or commit hash
+... )
```

## Push your model from Python

@@ -344,9 +343,8 @@ You may specify a revision by using the `revision` flag in the `from_pretrained`
```python
>>> tokenizer = AutoTokenizer.from_pretrained(
->>>     "julien-c/EsperBERTo-small",
->>>     revision="v2.0.1" # tag name, or branch name, or commit hash
->>> )
+...     "julien-c/EsperBERTo-small", revision="v2.0.1" # tag name, or branch name, or commit hash
+... )
```

## Workflow in a Colab notebook

...
@@ -62,18 +62,18 @@ The different languages this model/tokenizer handles, as well as the ids of thes
These ids should be used when passing a language parameter during a model pass. Let's define our inputs:
```py
->>> input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1
+>>> input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")])  # batch size of 1
```
We should now define the language embedding by using the previously defined language id. We want to create a tensor
filled with the appropriate language ids, of the same size as input_ids. For english, the id is 0:
```py
->>> language_id = tokenizer.lang2id['en'] # 0
->>> langs = torch.tensor([language_id] * input_ids.shape[1]) # torch.tensor([0, 0, 0, ..., 0])
+>>> language_id = tokenizer.lang2id["en"]  # 0
+>>> langs = torch.tensor([language_id] * input_ids.shape[1])  # torch.tensor([0, 0, 0, ..., 0])
>>> # We reshape it to be of size (batch_size, sequence_length)
->>> langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1)
+>>> langs = langs.view(1, -1)  # is now of shape [1, sequence_length] (we have a batch size of 1)
```
You can then feed it all as input to your model:

...
@@ -69,8 +69,9 @@ Let's demonstrate this process with GPT-2.
```python
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
-device = 'cuda'
-model_id = 'gpt2-large'
+
+device = "cuda"
+model_id = "gpt2-large"
model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
```

@@ -81,8 +82,9 @@ dataset in memory.
```python
from datasets import load_dataset
-test = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
-encodings = tokenizer('\n\n'.join(test['text']), return_tensors='pt')
+
+test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
+encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt")
```
With 🤗 Transformers, we can simply pass the `input_ids` as the `labels` to our model, and the average negative

@@ -104,10 +106,10 @@ nlls = []
for i in tqdm(range(0, encodings.input_ids.size(1), stride)):
    begin_loc = max(i + stride - max_length, 0)
    end_loc = min(i + stride, encodings.input_ids.size(1))
-    trg_len = end_loc - i # may be different from stride on last loop
-    input_ids = encodings.input_ids[:,begin_loc:end_loc].to(device)
+    trg_len = end_loc - i  # may be different from stride on last loop
+    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
-    target_ids[:,:-trg_len] = -100
+    target_ids[:, :-trg_len] = -100

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)
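The hunk is cut off inside the `torch.no_grad()` block. For readers of this excerpt, a hedged sketch of how the sliding-window computation is typically finished; it continues the loop above (reusing `outputs`, `trg_len`, `nlls`, and `end_loc`), and scaling the mean loss back to a sum is an approximation rather than the guide's verbatim code:

```python
        # outputs.loss is the mean negative log-likelihood over the trg_len target
        # tokens; scale it back to a sum so windows of different lengths add up correctly.
        neg_log_likelihood = outputs.loss * trg_len

    nlls.append(neg_log_likelihood)

# Perplexity is the exponentiated average negative log-likelihood per token.
ppl = torch.exp(torch.stack(nlls).sum() / end_loc)
```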
...

@@ -36,7 +36,8 @@ To automatically download the vocab used during pretraining or fine-tuning a giv
```py
from transformers import AutoTokenizer
-tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
+
+tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
```

## Base use

@@ -75,9 +76,7 @@ If you have several sentences you want to process, you can do this efficiently b
tokenizer:
```py
->>> batch_sentences = ["Hello I'm a single sentence",
-...                    "And another sentence",
-...                    "And the very very last one"]
+>>> batch_sentences = ["Hello I'm a single sentence", "And another sentence", "And the very very last one"]
>>> encoded_inputs = tokenizer(batch_sentences)
>>> print(encoded_inputs)
{'input_ids': [[101, 8667, 146, 112, 182, 170, 1423, 5650, 102],

@@ -174,12 +173,12 @@ If you have a list of pairs of sequences you want to process, you should feed th
list of first sentences and the list of second sentences:
```py
->>> batch_sentences = ["Hello I'm a single sentence",
-...                    "And another sentence",
-...                    "And the very very last one"]
->>> batch_of_second_sentences = ["I'm a sentence that goes with the first sentence",
-...                              "And I should be encoded with the second sentence",
-...                              "And I go with the very last one"]
+>>> batch_sentences = ["Hello I'm a single sentence", "And another sentence", "And the very very last one"]
+>>> batch_of_second_sentences = [
+...     "I'm a sentence that goes with the first sentence",
+...     "And I should be encoded with the second sentence",
+...     "And I go with the very last one",
+... ]
>>> encoded_inputs = tokenizer(batch_sentences, batch_of_second_sentences)
>>> print(encoded_inputs)
{'input_ids': [[101, 8667, 146, 112, 182, 170, 1423, 5650, 102, 146, 112, 182, 170, 5650, 1115, 2947, 1114, 1103, 1148, 5650, 102],

@@ -199,7 +198,7 @@ To double-check what is fed to the model, we can decode each list in _input_ids_
```py
>>> for ids in encoded_inputs["input_ids"]:
->>>     print(tokenizer.decode(ids))
+...     print(tokenizer.decode(ids))
[CLS] Hello I'm a single sentence [SEP] I'm a sentence that goes with the first sentence [SEP]
[CLS] And another sentence [SEP] And I should be encoded with the second sentence [SEP]
[CLS] And the very very last one [SEP] And I go with the very last one [SEP]

@@ -307,35 +306,43 @@ This works exactly as before for batch of sentences or batch of pairs of sentenc
like this:
```py
-batch_sentences = [["Hello", "I'm", "a", "single", "sentence"],
-                   ["And", "another", "sentence"],
-                   ["And", "the", "very", "very", "last", "one"]]
+batch_sentences = [
+    ["Hello", "I'm", "a", "single", "sentence"],
+    ["And", "another", "sentence"],
+    ["And", "the", "very", "very", "last", "one"],
+]
encoded_inputs = tokenizer(batch_sentences, is_split_into_words=True)
```
or a batch of pair sentences like this:
```py
-batch_of_second_sentences = [["I'm", "a", "sentence", "that", "goes", "with", "the", "first", "sentence"],
-                             ["And", "I", "should", "be", "encoded", "with", "the", "second", "sentence"],
-                             ["And", "I", "go", "with", "the", "very", "last", "one"]]
+batch_of_second_sentences = [
+    ["I'm", "a", "sentence", "that", "goes", "with", "the", "first", "sentence"],
+    ["And", "I", "should", "be", "encoded", "with", "the", "second", "sentence"],
+    ["And", "I", "go", "with", "the", "very", "last", "one"],
+]
encoded_inputs = tokenizer(batch_sentences, batch_of_second_sentences, is_split_into_words=True)
```
And you can add padding, truncation as well as directly return tensors like before:
```py
-batch = tokenizer(batch_sentences,
-                  batch_of_second_sentences,
-                  is_split_into_words=True,
-                  padding=True,
-                  truncation=True,
-                  return_tensors="pt")
+batch = tokenizer(
+    batch_sentences,
+    batch_of_second_sentences,
+    is_split_into_words=True,
+    padding=True,
+    truncation=True,
+    return_tensors="pt",
+)
===PT-TF-SPLIT===
-batch = tokenizer(batch_sentences,
-                  batch_of_second_sentences,
-                  is_split_into_words=True,
-                  padding=True,
-                  truncation=True,
-                  return_tensors="tf")
+batch = tokenizer(
+    batch_sentences,
+    batch_of_second_sentences,
+    is_split_into_words=True,
+    padding=True,
+    truncation=True,
+    return_tensors="tf",
+)
```
@@ -57,7 +57,8 @@ pip install tensorflow
```py
>>> from transformers import pipeline
->>> classifier = pipeline('sentiment-analysis')
+
+>>> classifier = pipeline("sentiment-analysis")
```
When typing this command for the first time, a pretrained model and its tokenizer are downloaded and cached. We will

@@ -67,7 +68,7 @@ make them readable. For instance:
```py
->>> classifier('We are very happy to show you the 🤗 Transformers library.')
+>>> classifier("We are very happy to show you the 🤗 Transformers library.")
[{'label': 'POSITIVE', 'score': 0.9998}]
```

@@ -75,8 +76,7 @@ That's encouraging! You can use it on a list of sentences, which will be preproc
a list of dictionaries like this one:
```py
->>> results = classifier(["We are very happy to show you the 🤗 Transformers library.",
-...                       "We hope you don't hate it."])
+>>> results = classifier(["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."])
>>> for result in results:
...     print(f"label: {result['label']}, with score: {round(result['score'], 4)}")
label: POSITIVE, with score: 0.9998

@@ -102,7 +102,7 @@ see how we can use it.
You can directly pass the name of the model to use to [`pipeline`]:
```py
->>> classifier = pipeline('sentiment-analysis', model="nlptown/bert-base-multilingual-uncased-sentiment")
+>>> classifier = pipeline("sentiment-analysis", model="nlptown/bert-base-multilingual-uncased-sentiment")
```
This classifier can now deal with texts in English, French, but also Dutch, German, Italian and Spanish! You can also

@@ -125,13 +125,13 @@ any other model from the model hub):
>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
>>> model = AutoModelForSequenceClassification.from_pretrained(model_name)
>>> tokenizer = AutoTokenizer.from_pretrained(model_name)
->>> classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
+>>> classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
===PT-TF-SPLIT===
>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
>>> # This model only exists in PyTorch, so we use the _from_pt_ flag to import that model in TensorFlow.
>>> model = TFAutoModelForSequenceClassification.from_pretrained(model_name, from_pt=True)
>>> tokenizer = AutoTokenizer.from_pretrained(model_name)
->>> classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
+>>> classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
```
If you don't find a model that has been pretrained on some data similar to yours, you will need to fine-tune a

@@ -150,11 +150,13 @@ As we saw, the model and tokenizer are created using the `from_pretrained` metho
```py
>>> from transformers import AutoTokenizer, AutoModelForSequenceClassification
>>> model_name = "distilbert-base-uncased-finetuned-sst-2-english"
>>> pt_model = AutoModelForSequenceClassification.from_pretrained(model_name)
>>> tokenizer = AutoTokenizer.from_pretrained(model_name)
===PT-TF-SPLIT===
>>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
>>> model_name = "distilbert-base-uncased-finetuned-sst-2-english"
>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
>>> tokenizer = AutoTokenizer.from_pretrained(model_name)

@@ -199,7 +201,7 @@ and get tensors back. You can specify all of that to the tokenizer:
...     padding=True,
...     truncation=True,
...     max_length=512,
-...     return_tensors="pt"
+...     return_tensors="pt",
... )
===PT-TF-SPLIT===
>>> tf_batch = tokenizer(

@@ -207,7 +209,7 @@ and get tensors back. You can specify all of that to the tokenizer:
...     padding=True,
...     truncation=True,
...     max_length=512,
-...     return_tensors="tf"
+...     return_tensors="tf",
... )
```

@@ -267,9 +269,11 @@ Let's apply the SoftMax activation to get predictions.
```py
>>> from torch import nn
>>> pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1)
===PT-TF-SPLIT===
>>> import tensorflow as tf
>>> tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1)
```

@@ -291,13 +295,15 @@ attribute:
```py
>>> import torch
->>> pt_outputs = pt_model(**pt_batch, labels = torch.tensor([1, 0]))
+
+>>> pt_outputs = pt_model(**pt_batch, labels=torch.tensor([1, 0]))
>>> print(pt_outputs)
SequenceClassifierOutput(loss=tensor(0.3167, grad_fn=<NllLossBackward>), logits=tensor([[-4.0833, 4.3364],
[ 0.0818, -0.0418]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
===PT-TF-SPLIT===
>>> import tensorflow as tf
->>> tf_outputs = tf_model(tf_batch, labels = tf.constant([1, 0]))
+
+>>> tf_outputs = tf_model(tf_batch, labels=tf.constant([1, 0]))
>>> print(tf_outputs)
TFSequenceClassifierOutput(loss=<tf.Tensor: shape=(2,), dtype=float32, numpy=array([2.2051e-04, 6.3326e-01], dtype=float32)>, logits=<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[-4.0833 , 4.3364 ],

@@ -317,11 +323,11 @@ case the attributes not set (that have `None` values) are ignored.
Once your model is fine-tuned, you can save it with its tokenizer in the following way:
```py
->>> pt_save_directory = './pt_save_pretrained'
+>>> pt_save_directory = "./pt_save_pretrained"
>>> tokenizer.save_pretrained(pt_save_directory)
>>> pt_model.save_pretrained(pt_save_directory)
===PT-TF-SPLIT===
->>> tf_save_directory = './tf_save_pretrained'
+>>> tf_save_directory = "./tf_save_pretrained"
>>> tokenizer.save_pretrained(tf_save_directory)
>>> tf_model.save_pretrained(tf_save_directory)
```

@@ -343,10 +349,12 @@ Then, use the corresponding Auto class to load it like this:
```py
>>> from transformers import AutoModel
>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory)
>>> pt_model = AutoModel.from_pretrained(tf_save_directory, from_tf=True)
===PT-TF-SPLIT===
>>> from transformers import TFAutoModel
>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory)
>>> tf_model = TFAutoModel.from_pretrained(pt_save_directory, from_pt=True)
```

@@ -356,11 +364,11 @@ Lastly, you can also ask the model to return all hidden states and all attention
```py
>>> pt_outputs = pt_model(**pt_batch, output_hidden_states=True, output_attentions=True)
>>> all_hidden_states = pt_outputs.hidden_states
>>> all_attentions = pt_outputs.attentions
===PT-TF-SPLIT===
>>> tf_outputs = tf_model(tf_batch, output_hidden_states=True, output_attentions=True)
>>> all_hidden_states = tf_outputs.hidden_states
>>> all_attentions = tf_outputs.attentions
```

@@ -376,11 +384,13 @@ directly instantiate model and tokenizer without the auto magic:
```py
>>> from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
>>> model_name = "distilbert-base-uncased-finetuned-sst-2-english"
>>> model = DistilBertForSequenceClassification.from_pretrained(model_name)
>>> tokenizer = DistilBertTokenizer.from_pretrained(model_name)
===PT-TF-SPLIT===
>>> from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
>>> model_name = "distilbert-base-uncased-finetuned-sst-2-english"
>>> model = TFDistilBertForSequenceClassification.from_pretrained(model_name)
>>> tokenizer = DistilBertTokenizer.from_pretrained(model_name)

@@ -401,13 +411,15 @@ the model from scratch. Therefore, we instantiate the model from a configuration
```py
>>> from transformers import DistilBertConfig, DistilBertTokenizer, DistilBertForSequenceClassification
->>> config = DistilBertConfig(n_heads=8, dim=512, hidden_dim=4*512)
->>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+
+>>> config = DistilBertConfig(n_heads=8, dim=512, hidden_dim=4 * 512)
+>>> tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
>>> model = DistilBertForSequenceClassification(config)
===PT-TF-SPLIT===
>>> from transformers import DistilBertConfig, DistilBertTokenizer, TFDistilBertForSequenceClassification
->>> config = DistilBertConfig(n_heads=8, dim=512, hidden_dim=4*512)
->>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+
+>>> config = DistilBertConfig(n_heads=8, dim=512, hidden_dim=4 * 512)
+>>> tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
>>> model = TFDistilBertForSequenceClassification(config)
```

@@ -419,11 +431,13 @@ configuration appropriately:
```py
>>> from transformers import DistilBertConfig, DistilBertTokenizer, DistilBertForSequenceClassification
>>> model_name = "distilbert-base-uncased"
>>> model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=10)
>>> tokenizer = DistilBertTokenizer.from_pretrained(model_name)
===PT-TF-SPLIT===
>>> from transformers import DistilBertConfig, DistilBertTokenizer, TFDistilBertForSequenceClassification
>>> model_name = "distilbert-base-uncased"
>>> model = TFDistilBertForSequenceClassification.from_pretrained(model_name, num_labels=10)
>>> tokenizer = DistilBertTokenizer.from_pretrained(model_name)

...
@@ -109,6 +109,7 @@ This export can now be used in the ONNX inference runtime:
import onnxruntime as ort
from transformers import BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
ort_session = ort.InferenceSession("onnx/bert-base-cased/model.onnx")
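The hunk stops right after the session is created. As a hedged illustration for readers of this excerpt (not part of the diff), running the exported model could look like the sketch below; treating the first graph output as the last hidden state is an assumption that may not hold for other exports:

```python
# Tokenize with NumPy tensors so the encoding can be fed to ONNX Runtime as-is.
inputs = tokenizer("Using BERT in ONNX!", return_tensors="np")

# ort_session.run(output_names, input_feed): passing None returns every graph output.
outputs = ort_session.run(None, dict(inputs))
last_hidden_state = outputs[0]  # assumption: first output of this particular export
print(last_hidden_state.shape)
```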
@@ -382,7 +383,7 @@ tokenized_text = enc.tokenize(text)
# Masking one of the input tokens
masked_index = 8
-tokenized_text[masked_index] = '[MASK]'
+tokenized_text[masked_index] = "[MASK]"
indexed_tokens = enc.convert_tokens_to_ids(tokenized_text)
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]

@@ -393,8 +394,14 @@ dummy_input = [tokens_tensor, segments_tensors]
# Initializing the model with the torchscript flag
# Flag set to True even though it is not necessary as this model does not have an LM Head.
-config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-    num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072, torchscript=True)
+config = BertConfig(
+    vocab_size_or_config_json_file=32000,
+    hidden_size=768,
+    num_hidden_layers=12,
+    num_attention_heads=12,
+    intermediate_size=3072,
+    torchscript=True,
+)
# Instantiating the model
model = BertModel(config)
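The hunk ends at model instantiation; the surrounding guide goes on to trace the model. A minimal, hedged sketch of that next step (not part of the diff; `tokens_tensor` and `segments_tensors` are the dummy tensors referenced in the hunk header above):

```python
import torch

# Trace the model with the dummy inputs so the graph can run without the Python class.
model.eval()
traced_model = torch.jit.trace(model, (tokens_tensor, segments_tensors))

# The traced module can be serialized and reloaded later.
torch.jit.save(traced_model, "traced_bert.pt")
loaded_model = torch.jit.load("traced_bert.pt")
```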
...

@@ -188,11 +188,15 @@ positions of the extracted answer in the text.
```py
>>> result = question_answerer(question="What is extractive question answering?", context=context)
->>> print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")
+>>> print(
+...     f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}"
+... )
Answer: 'the task of extracting an answer from a text given a question', score: 0.6177, start: 34, end: 95
>>> result = question_answerer(question="What is a good example of a question answering dataset?", context=context)
->>> print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")
+>>> print(
+...     f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}"
+... )
Answer: 'SQuAD dataset', score: 0.5152, start: 147, end: 160
```

@@ -232,18 +236,20 @@ Here is an example of question answering using a model and a tokenizer. The proc
>>> for question in questions:
...     inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="pt")
...     input_ids = inputs["input_ids"].tolist()[0]
...
...     outputs = model(**inputs)
...     answer_start_scores = outputs.start_logits
...     answer_end_scores = outputs.end_logits
...
...     # Get the most likely beginning of answer with the argmax of the score
...     answer_start = torch.argmax(answer_start_scores)
...     # Get the most likely end of answer with the argmax of the score
...     answer_end = torch.argmax(answer_end_scores) + 1
...
-...     answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
+...     answer = tokenizer.convert_tokens_to_string(
+...         tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
+...     )
...     print(f"Question: {question}")
...     print(f"Answer: {answer}")
Question: How many pretrained models are available in 🤗 Transformers?

@@ -275,18 +281,20 @@ Answer: tensorflow 2. 0 and pytorch
>>> for question in questions:
...     inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="tf")
...     input_ids = inputs["input_ids"].numpy()[0]
...
...     outputs = model(inputs)
...     answer_start_scores = outputs.start_logits
...     answer_end_scores = outputs.end_logits
...
...     # Get the most likely beginning of answer with the argmax of the score
...     answer_start = tf.argmax(answer_start_scores, axis=1).numpy()[0]
...     # Get the most likely end of answer with the argmax of the score
...     answer_end = tf.argmax(answer_end_scores, axis=1).numpy()[0] + 1
...
-...     answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
+...     answer = tokenizer.convert_tokens_to_string(
+...         tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
+...     )
...     print(f"Question: {question}")
...     print(f"Answer: {answer}")
Question: How many pretrained models are available in 🤗 Transformers?

@@ -327,7 +335,12 @@ This outputs the sequences with the mask filled, the confidence score, and the t
```py
>>> from pprint import pprint
->>> pprint(unmasker(f"HuggingFace is creating a {unmasker.tokenizer.mask_token} that the community uses to solve NLP tasks."))
+
+>>> pprint(
+...     unmasker(
+...         f"HuggingFace is creating a {unmasker.tokenizer.mask_token} that the community uses to solve NLP tasks."
+...     )
+... )
[{'score': 0.1793,
  'sequence': 'HuggingFace is creating a tool that the community uses to solve '
              'NLP tasks.',

@@ -374,8 +387,10 @@ Here is an example of doing masked language modeling using a model and a tokeniz
>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
>>> model = AutoModelForMaskedLM.from_pretrained("distilbert-base-cased")
->>> sequence = "Distilled models are smaller than the models they mimic. Using them instead of the large " \
-...     f"versions would help {tokenizer.mask_token} our carbon footprint."
+>>> sequence = (
+...     "Distilled models are smaller than the models they mimic. Using them instead of the large "
+...     f"versions would help {tokenizer.mask_token} our carbon footprint."
+... )
>>> inputs = tokenizer(sequence, return_tensors="pt")
>>> mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]

@@ -399,8 +414,10 @@ Distilled models are smaller than the models they mimic. Using them instead of t
>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
>>> model = TFAutoModelForMaskedLM.from_pretrained("distilbert-base-cased")
->>> sequence = "Distilled models are smaller than the models they mimic. Using them instead of the large " \
-...     f"versions would help {tokenizer.mask_token} our carbon footprint."
+>>> sequence = (
+...     "Distilled models are smaller than the models they mimic. Using them instead of the large "
+...     f"versions would help {tokenizer.mask_token} our carbon footprint."
+... )
>>> inputs = tokenizer(sequence, return_tensors="tf")
>>> mask_token_index = tf.where(inputs["input_ids"] == tokenizer.mask_token_id)[0, 1]

@@ -544,7 +561,7 @@ Below is an example of text generation using `XLNet` and its tokenizer, which in
>>> prompt_length = len(tokenizer.decode(inputs[0]))
>>> outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60)
->>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length+1:]
+>>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length + 1 :]
>>> print(generated)
Today the weather is really nice and I am planning ...

@@ -571,7 +588,7 @@ Today the weather is really nice and I am planning ...
>>> prompt_length = len(tokenizer.decode(inputs[0]))
>>> outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60)
->>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length+1:]
+>>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length + 1 :]
>>> print(generated)
Today the weather is really nice and I am planning ...

@@ -660,8 +677,10 @@ Here is an example of doing named entity recognition, using a model and a tokeni
>>> model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
->>> sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, " \
-...     "therefore very close to the Manhattan Bridge."
+>>> sequence = (
+...     "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, "
+...     "therefore very close to the Manhattan Bridge."
+... )
>>> inputs = tokenizer(sequence, return_tensors="pt")
>>> tokens = inputs.tokens()

@@ -675,8 +694,10 @@ Here is an example of doing named entity recognition, using a model and a tokeni
>>> model = TFAutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
->>> sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, " \
-...     "therefore very close to the Manhattan Bridge."
+>>> sequence = (
+...     "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, "
+...     "therefore very close to the Manhattan Bridge."
+... )
>>> inputs = tokenizer(sequence, return_tensors="tf")
>>> tokens = inputs.tokens()

@@ -863,7 +884,7 @@ Here is an example of doing translation using a model and a tokenizer. The proce
>>> inputs = tokenizer(
...     "translate English to German: Hugging Face is a technology company based in New York and Paris",
-...     return_tensors="pt"
+...     return_tensors="pt",
... )
>>> outputs = model.generate(inputs["input_ids"], max_length=40, num_beams=4, early_stopping=True)

@@ -877,7 +898,7 @@ Here is an example of doing translation using a model and a tokenizer. The proce
>>> inputs = tokenizer(
...     "translate English to German: Hugging Face is a technology company based in New York and Paris",
-...     return_tensors="tf"
+...     return_tensors="tf",
... )
>>> outputs = model.generate(inputs["input_ids"], max_length=40, num_beams=4, early_stopping=True)

...
...@@ -422,14 +422,14 @@ Let's depict the GPU requirements in the following table: ...@@ -422,14 +422,14 @@ Let's depict the GPU requirements in the following table:
For example, here is a test that must be run only when there are 2 or more GPUs available and pytorch is installed: For example, here is a test that must be run only when there are 2 or more GPUs available and pytorch is installed:
```python ```python no-style
@require_torch_multi_gpu @require_torch_multi_gpu
def test_example_with_multi_gpu(): def test_example_with_multi_gpu():
``` ```
If a test requires `tensorflow` use the `require_tf` decorator. For example: If a test requires `tensorflow` use the `require_tf` decorator. For example:
```python ```python no-style
@require_tf @require_tf
def test_tf_thing_with_tensorflow(): def test_tf_thing_with_tensorflow():
``` ```
...@@ -437,7 +437,7 @@ def test_tf_thing_with_tensorflow(): ...@@ -437,7 +437,7 @@ def test_tf_thing_with_tensorflow():
These decorators can be stacked. For example, if a test is slow and requires at least one GPU under pytorch, here is These decorators can be stacked. For example, if a test is slow and requires at least one GPU under pytorch, here is
how to set it up: how to set it up:
```python ```python no-style
@require_torch_gpu @require_torch_gpu
@slow @slow
def test_example_slow_on_gpu(): def test_example_slow_on_gpu():
...@@ -446,7 +446,7 @@ def test_example_slow_on_gpu(): ...@@ -446,7 +446,7 @@ def test_example_slow_on_gpu():
Some decorators like `@parametrized` rewrite test names, therefore `@require_*` skip decorators have to be listed Some decorators like `@parametrized` rewrite test names, therefore `@require_*` skip decorators have to be listed
last for them to work correctly. Here is an example of the correct usage: last for them to work correctly. Here is an example of the correct usage:
```python ```python no-style
@parameterized.expand(...) @parameterized.expand(...)
@require_torch_multi_gpu @require_torch_multi_gpu
def test_integration_foo(): def test_integration_foo():
...@@ -461,7 +461,8 @@ Inside tests: ...@@ -461,7 +461,8 @@ Inside tests:
```python ```python
from transformers.testing_utils import get_gpu_count from transformers.testing_utils import get_gpu_count
n_gpu = get_gpu_count() # works with torch and tf
n_gpu = get_gpu_count() # works with torch and tf
``` ```
### Distributed training ### Distributed training
...@@ -544,12 +545,16 @@ the test, but then there is no way of running that test for just one set of argu ...@@ -544,12 +545,16 @@ the test, but then there is no way of running that test for just one set of argu
# test_this1.py # test_this1.py
import unittest import unittest
from parameterized import parameterized from parameterized import parameterized
class TestMathUnitTest(unittest.TestCase): class TestMathUnitTest(unittest.TestCase):
@parameterized.expand([ @parameterized.expand(
("negative", -1.5, -2.0), [
("integer", 1, 1.0), ("negative", -1.5, -2.0),
("large fraction", 1.6, 1), ("integer", 1, 1.0),
]) ("large fraction", 1.6, 1),
]
)
def test_floor(self, name, input, expected): def test_floor(self, name, input, expected):
assert_equal(math.floor(input), expected) assert_equal(math.floor(input), expected)
``` ```
...@@ -601,6 +606,8 @@ Here is the same example, this time using `pytest`'s `parametrize` marker: ...@@ -601,6 +606,8 @@ Here is the same example, this time using `pytest`'s `parametrize` marker:
```python ```python
# test_this2.py # test_this2.py
import pytest import pytest
@pytest.mark.parametrize( @pytest.mark.parametrize(
"name, input, expected", "name, input, expected",
[ [
...@@ -669,6 +676,8 @@ To start using those all you need is to make sure that the test resides in a sub ...@@ -669,6 +676,8 @@ To start using those all you need is to make sure that the test resides in a sub
```python ```python
from transformers.testing_utils import TestCasePlus from transformers.testing_utils import TestCasePlus
class PathExampleTest(TestCasePlus): class PathExampleTest(TestCasePlus):
def test_something_involving_local_locations(self): def test_something_involving_local_locations(self):
data_dir = self.tests_dir / "fixtures/tests_samples/wmt_en_ro" data_dir = self.tests_dir / "fixtures/tests_samples/wmt_en_ro"
...@@ -679,6 +688,8 @@ If you don't need to manipulate paths via `pathlib` or you just need a path as a ...@@ -679,6 +688,8 @@ If you don't need to manipulate paths via `pathlib` or you just need a path as a
```python ```python
from transformers.testing_utils import TestCasePlus from transformers.testing_utils import TestCasePlus
class PathExampleTest(TestCasePlus): class PathExampleTest(TestCasePlus):
def test_something_involving_stringified_locations(self): def test_something_involving_stringified_locations(self):
examples_dir = self.examples_dir_str examples_dir = self.examples_dir_str
...@@ -700,6 +711,8 @@ Here is an example of its usage: ...@@ -700,6 +711,8 @@ Here is an example of its usage:
```python ```python
from transformers.testing_utils import TestCasePlus from transformers.testing_utils import TestCasePlus
class ExamplesTests(TestCasePlus): class ExamplesTests(TestCasePlus):
def test_whatever(self): def test_whatever(self):
tmp_dir = self.get_auto_remove_tmp_dir() tmp_dir = self.get_auto_remove_tmp_dir()
...@@ -759,6 +772,7 @@ If you need to temporarily override `sys.path` to import from another test for exa ...@@ -759,6 +772,7 @@
```python ```python
import os import os
from transformers.testing_utils import ExtendSysPath from transformers.testing_utils import ExtendSysPath
bindir = os.path.abspath(os.path.dirname(__file__)) bindir = os.path.abspath(os.path.dirname(__file__))
with ExtendSysPath(f"{bindir}/.."): with ExtendSysPath(f"{bindir}/.."):
from test_trainer import TrainerIntegrationCommon # noqa from test_trainer import TrainerIntegrationCommon # noqa
...@@ -786,20 +800,20 @@ code that's buggy causes some bad state that will affect other tests, do not use ...@@ -786,20 +800,20 @@ code that's buggy causes some bad state that will affect other tests, do not use
- Here is how to skip a whole test unconditionally: - Here is how to skip a whole test unconditionally:
```python ```python no-style
@unittest.skip("this bug needs to be fixed") @unittest.skip("this bug needs to be fixed")
def test_feature_x(): def test_feature_x():
``` ```
or via pytest: or via pytest:
```python ```python no-style
@pytest.mark.skip(reason="this bug needs to be fixed") @pytest.mark.skip(reason="this bug needs to be fixed")
``` ```
or the `xfail` way: or the `xfail` way:
```python ```python no-style
@pytest.mark.xfail @pytest.mark.xfail
def test_feature_x(): def test_feature_x():
``` ```
...@@ -816,6 +830,7 @@ or the whole module: ...@@ -816,6 +830,7 @@ or the whole module:
```python ```python
import pytest import pytest
if not pytest.config.getoption("--custom-flag"): if not pytest.config.getoption("--custom-flag"):
pytest.skip("--custom-flag is missing, skipping tests", allow_module_level=True) pytest.skip("--custom-flag is missing, skipping tests", allow_module_level=True)
``` ```
...@@ -835,21 +850,21 @@ docutils = pytest.importorskip("docutils", minversion="0.3") ...@@ -835,21 +850,21 @@ docutils = pytest.importorskip("docutils", minversion="0.3")
- Skip a test based on a condition: - Skip a test based on a condition:
```python ```python no-style
@pytest.mark.skipif(sys.version_info < (3,6), reason="requires python3.6 or higher") @pytest.mark.skipif(sys.version_info < (3,6), reason="requires python3.6 or higher")
def test_feature_x(): def test_feature_x():
``` ```
or: or:
```python ```python no-style
@unittest.skipIf(torch_device == "cpu", "Can't do half precision") @unittest.skipIf(torch_device == "cpu", "Can't do half precision")
def test_feature_x(): def test_feature_x():
``` ```
or skip the whole module: or skip the whole module:
```python ```python no-style
@pytest.mark.skipif(sys.platform == 'win32', reason="does not run on windows") @pytest.mark.skipif(sys.platform == 'win32', reason="does not run on windows")
class TestClass(): class TestClass():
def test_feature_x(self): def test_feature_x(self):
...@@ -863,7 +878,7 @@ The library of tests is ever-growing, and some of the tests take minutes to run, ...@@ -863,7 +878,7 @@ The library of tests is ever-growing, and some of the tests take minutes to run,
an hour for the test suite to complete on CI. Therefore, with some exceptions for essential tests, slow tests should be an hour for the test suite to complete on CI. Therefore, with some exceptions for essential tests, slow tests should be
marked as shown in the example below: marked as shown in the example below:
```python ```python no-style
from transformers.testing_utils import slow from transformers.testing_utils import slow
@slow @slow
def test_integration_foo(): def test_integration_foo():
...@@ -878,8 +893,8 @@ RUN_SLOW=1 pytest tests ...@@ -878,8 +893,8 @@ RUN_SLOW=1 pytest tests
Some decorators like `@parameterized` rewrite test names, therefore `@slow` and the rest of the skip decorators Some decorators like `@parameterized` rewrite test names, therefore `@slow` and the rest of the skip decorators
`@require_*` have to be listed last for them to work correctly. Here is an example of the correct usage: `@require_*` have to be listed last for them to work correctly. Here is an example of the correct usage:
```python ```python no-style
@parameterized.expand(...) @parameterized.expand(...)
@slow @slow
def test_integration_foo(): def test_integration_foo():
``` ```
...@@ -935,13 +950,21 @@ In order to test functions that write to `stdout` and/or `stderr`, the test can ...@@ -935,13 +950,21 @@ In order to test functions that write to `stdout` and/or `stderr`, the test can
```python ```python
import sys import sys
def print_to_stdout(s): print(s)
def print_to_stderr(s): sys.stderr.write(s)
def print_to_stdout(s):
print(s)
def print_to_stderr(s):
sys.stderr.write(s)
def test_result_and_stdout(capsys): def test_result_and_stdout(capsys):
msg = "Hello" msg = "Hello"
print_to_stdout(msg) print_to_stdout(msg)
print_to_stderr(msg) print_to_stderr(msg)
out, err = capsys.readouterr() # consume the captured output streams out, err = capsys.readouterr() # consume the captured output streams
# optional: if you want to replay the consumed streams: # optional: if you want to replay the consumed streams:
sys.stdout.write(out) sys.stdout.write(out)
sys.stderr.write(err) sys.stderr.write(err)
...@@ -954,10 +977,13 @@ And, of course, most of the time, `stderr` will come as a part of an exception, ...@@ -954,10 +977,13 @@ And, of course, most of the time, `stderr` will come as a part of an exception,
a case: a case:
```python ```python
def raise_exception(msg): raise ValueError(msg) def raise_exception(msg):
raise ValueError(msg)
def test_something_exception(): def test_something_exception():
msg = "Not a good value" msg = "Not a good value"
error = '' error = ""
try: try:
raise_exception(msg) raise_exception(msg)
except Exception as e: except Exception as e:
...@@ -970,7 +996,12 @@ Another approach to capturing stdout is via `contextlib.redirect_stdout`: ...@@ -970,7 +996,12 @@ Another approach to capturing stdout is via `contextlib.redirect_stdout`:
```python ```python
from io import StringIO from io import StringIO
from contextlib import redirect_stdout from contextlib import redirect_stdout
def print_to_stdout(s): print(s)
def print_to_stdout(s):
print(s)
def test_result_and_stdout(): def test_result_and_stdout():
msg = "Hello" msg = "Hello"
buffer = StringIO() buffer = StringIO()
...@@ -993,6 +1024,7 @@ some `\r`'s in it or not, so it's as simple as: ...@@ -993,6 +1024,7 @@
```python ```python
from transformers.testing_utils import CaptureStdout from transformers.testing_utils import CaptureStdout
with CaptureStdout() as cs: with CaptureStdout() as cs:
function_that_writes_to_stdout() function_that_writes_to_stdout()
print(cs.out) print(cs.out)
...@@ -1002,17 +1034,19 @@ Here is a full test example: ...@@ -1002,17 +1034,19 @@ Here is a full test example:
```python ```python
from transformers.testing_utils import CaptureStdout from transformers.testing_utils import CaptureStdout
msg = "Secret message\r" msg = "Secret message\r"
final = "Hello World" final = "Hello World"
with CaptureStdout() as cs: with CaptureStdout() as cs:
print(msg + final) print(msg + final)
assert cs.out == final+"\n", f"captured: {cs.out}, expecting {final}" assert cs.out == final + "\n", f"captured: {cs.out}, expecting {final}"
``` ```
If you'd like to capture `stderr`, use the `CaptureStderr` class instead: If you'd like to capture `stderr`, use the `CaptureStderr` class instead:
```python ```python
from transformers.testing_utils import CaptureStderr from transformers.testing_utils import CaptureStderr
with CaptureStderr() as cs: with CaptureStderr() as cs:
function_that_writes_to_stderr() function_that_writes_to_stderr()
print(cs.err) print(cs.err)
...@@ -1022,6 +1056,7 @@ If you need to capture both streams at once, use the parent `CaptureStd` class: ...@@ -1022,6 +1056,7 @@ If you need to capture both streams at once, use the parent `CaptureStd` class:
```python ```python
from transformers.testing_utils import CaptureStd from transformers.testing_utils import CaptureStd
with CaptureStd() as cs: with CaptureStd() as cs:
function_that_writes_to_stdout_and_stderr() function_that_writes_to_stdout_and_stderr()
print(cs.err, cs.out) print(cs.err, cs.out)
...@@ -1044,7 +1079,7 @@ logging.set_verbosity_info() ...@@ -1044,7 +1079,7 @@ logging.set_verbosity_info()
logger = logging.get_logger("transformers.models.bart.tokenization_bart") logger = logging.get_logger("transformers.models.bart.tokenization_bart")
with CaptureLogger(logger) as cl: with CaptureLogger(logger) as cl:
logger.info(msg) logger.info(msg)
assert cl.out == msg+"\n" assert cl.out == msg + "\n"
``` ```
### Testing with environment variables ### Testing with environment variables
...@@ -1054,6 +1089,8 @@ If you want to test the impact of environment variables for a specific test you ...@@ -1054,6 +1089,8 @@ If you want to test the impact of environment variables for a specific test you
```python ```python
from transformers.testing_utils import mockenv from transformers.testing_utils import mockenv
class HfArgumentParserTest(unittest.TestCase): class HfArgumentParserTest(unittest.TestCase):
@mockenv(TRANSFORMERS_VERBOSITY="error") @mockenv(TRANSFORMERS_VERBOSITY="error")
def test_env_override(self): def test_env_override(self):
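    # e.g. (assumed body): assert os.getenv("TRANSFORMERS_VERBOSITY") == "error"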
...@@ -1065,6 +1102,8 @@ multiple local paths. A helper class `transformers.test_utils.TestCasePlus` come ...@@ -1065,6 +1102,8 @@ multiple local paths. A helper class `transformers.test_utils.TestCasePlus` come
```python ```python
from transformers.testing_utils import TestCasePlus from transformers.testing_utils import TestCasePlus
class EnvExampleTest(TestCasePlus): class EnvExampleTest(TestCasePlus):
def test_external_prog(self): def test_external_prog(self):
env = self.get_env() env = self.get_env()
...@@ -1089,16 +1128,20 @@ seed = 42 ...@@ -1089,16 +1128,20 @@ seed = 42
# python RNG # python RNG
import random import random
random.seed(seed) random.seed(seed)
# pytorch RNGs # pytorch RNGs
import torch import torch
torch.manual_seed(seed) torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True torch.backends.cudnn.deterministic = True
if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed) if torch.cuda.is_available():
torch.cuda.manual_seed_all(seed)
# numpy RNG # numpy RNG
import numpy as np import numpy as np
np.random.seed(seed) np.random.seed(seed)
# tf RNG # tf RNG
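# a common choice for TF 2.x (assumption):
#   import tensorflow as tf
#   tf.random.set_seed(seed)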
......
...@@ -104,6 +104,7 @@ seen before, by decomposing them into known subwords. For instance, the [`~trans ...@@ -104,6 +104,7 @@ seen before, by decomposing them into known subwords. For instance, the [`~trans
```py ```py
>>> from transformers import BertTokenizer >>> from transformers import BertTokenizer
>>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
>>> tokenizer.tokenize("I have a new GPU!") >>> tokenizer.tokenize("I have a new GPU!")
["i", "have", "a", "new", "gp", "##u", "!"] ["i", "have", "a", "new", "gp", "##u", "!"]
...@@ -117,6 +118,7 @@ As another example, [`~transformers.XLNetTokenizer`] tokenizes our previously ex ...@@ -117,6 +118,7 @@ As another example, [`~transformers.XLNetTokenizer`] tokenizes our previously ex
```py ```py
>>> from transformers import XLNetTokenizer >>> from transformers import XLNetTokenizer
>>> tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased") >>> tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
>>> tokenizer.tokenize("Don't you love 🤗 Transformers? We sure do.") >>> tokenizer.tokenize("Don't you love 🤗 Transformers? We sure do.")
["▁Don", "'", "t", "▁you", "▁love", "▁", "🤗", "▁", "Transform", "ers", "?", "▁We", "▁sure", "▁do", "."] ["▁Don", "'", "t", "▁you", "▁love", "▁", "🤗", "▁", "Transform", "ers", "?", "▁We", "▁sure", "▁do", "."]
......
...@@ -74,6 +74,7 @@ However, we can instead apply these preprocessing steps to all the splits of our ...@@ -74,6 +74,7 @@ However, we can instead apply these preprocessing steps to all the splits of our
def tokenize_function(examples): def tokenize_function(examples):
return tokenizer(examples["text"], padding="max_length", truncation=True) return tokenizer(examples["text"], padding="max_length", truncation=True)
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True) tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
``` ```
...@@ -82,8 +83,8 @@ You can learn more about the map method or the other ways to preprocess the data ...@@ -82,8 +83,8 @@ You can learn more about the map method or the other ways to preprocess the data
Next we will generate a small subset of the training and validation set, to enable faster training: Next we will generate a small subset of the training and validation set, to enable faster training:
```python ```python
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
full_train_dataset = tokenized_datasets["train"] full_train_dataset = tokenized_datasets["train"]
full_eval_dataset = tokenized_datasets["test"] full_eval_dataset = tokenized_datasets["test"]
``` ```
...@@ -130,9 +131,7 @@ Then we can instantiate a [`Trainer`] like this: ...@@ -130,9 +131,7 @@ Then we can instantiate a [`Trainer`] like this:
```python ```python
from transformers import Trainer from transformers import Trainer
trainer = Trainer( trainer = Trainer(model=model, args=training_args, train_dataset=small_train_dataset, eval_dataset=small_eval_dataset)
model=model, args=training_args, train_dataset=small_train_dataset, eval_dataset=small_eval_dataset
)
``` ```
To fine-tune our model, we just need to call To fine-tune our model, we just need to call
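A minimal sketch of that call, using the `trainer` instance created above:

```python
# launch fine-tuning with the Trainer API
trainer.train()
```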
...@@ -160,6 +159,7 @@ from datasets import load_metric ...@@ -160,6 +159,7 @@ from datasets import load_metric
metric = load_metric("accuracy") metric = load_metric("accuracy")
def compute_metrics(eval_pred): def compute_metrics(eval_pred):
logits, labels = eval_pred logits, labels = eval_pred
predictions = np.argmax(logits, axis=-1) predictions = np.argmax(logits, axis=-1)
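    # (typically the function then returns metric.compute(predictions=predictions, references=labels))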
...@@ -322,12 +322,7 @@ from transformers import get_scheduler ...@@ -322,12 +322,7 @@ from transformers import get_scheduler
num_epochs = 3 num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader) num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler( lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
"linear",
optimizer=optimizer,
num_warmup_steps=0,
num_training_steps=num_training_steps
)
``` ```
One last thing, we will want to use the GPU if we have access to one (otherwise training might take several hours One last thing, we will want to use the GPU if we have access to one (otherwise training might take several hours
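A minimal sketch of that device placement, assuming standard PyTorch and the `model` instantiated earlier:

```python
import torch

# prefer a GPU when one is available, otherwise fall back to CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
```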
...@@ -372,7 +367,7 @@ use a metric from the datasets library. Here we accumulate the predictions at ea ...@@ -372,7 +367,7 @@ use a metric from the datasets library. Here we accumulate the predictions at ea
result when the loop is finished. result when the loop is finished.
```python ```python
metric= load_metric("accuracy") metric = load_metric("accuracy")
model.eval() model.eval()
for batch in eval_dataloader: for batch in eval_dataloader:
batch = {k: v.to(device) for k, v in batch.items()} batch = {k: v.to(device) for k, v in batch.items()}
......
...@@ -488,15 +488,20 @@ class PretrainedConfig(PushToHubMixin): ...@@ -488,15 +488,20 @@ class PretrainedConfig(PushToHubMixin):
```python ```python
# We can't directly instantiate the base class *PretrainedConfig* so let's show the examples on a # We can't directly instantiate the base class *PretrainedConfig* so let's show the examples on a
# derived class: BertConfig # derived class: BertConfig
config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from huggingface.co and cache. config = BertConfig.from_pretrained(
config = BertConfig.from_pretrained('./test/saved_model/') # E.g. config (or model) was saved using *save_pretrained('./test/saved_model/')* "bert-base-uncased"
config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json') ) # Download configuration from huggingface.co and cache.
config = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True, foo=False) config = BertConfig.from_pretrained(
"./test/saved_model/"
) # E.g. config (or model) was saved using *save_pretrained('./test/saved_model/')*
config = BertConfig.from_pretrained("./test/saved_model/my_configuration.json")
config = BertConfig.from_pretrained("bert-base-uncased", output_attentions=True, foo=False)
assert config.output_attentions == True assert config.output_attentions == True
config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True, config, unused_kwargs = BertConfig.from_pretrained(
foo=False, return_unused_kwargs=True) "bert-base-uncased", output_attentions=True, foo=False, return_unused_kwargs=True
)
assert config.output_attentions == True assert config.output_attentions == True
assert unused_kwargs == {'foo': False} assert unused_kwargs == {"foo": False}
```""" ```"""
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
......
...@@ -588,6 +588,7 @@ class SquadProcessor(DataProcessor): ...@@ -588,6 +588,7 @@ class SquadProcessor(DataProcessor):
```python ```python
>>> import tensorflow_datasets as tfds >>> import tensorflow_datasets as tfds
>>> dataset = tfds.load("squad") >>> dataset = tfds.load("squad")
>>> training_examples = get_examples_from_dataset(dataset, evaluate=False) >>> training_examples = get_examples_from_dataset(dataset, evaluate=False)
......
...@@ -107,7 +107,7 @@ class DebugUnderflowOverflow: ...@@ -107,7 +107,7 @@ class DebugUnderflowOverflow:
given batch, and only do that for batches 1 and 3. Then you instantiate this class as: given batch, and only do that for batches 1 and 3. Then you instantiate this class as:
```python ```python
debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1,3]) debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3])
``` ```
And now full batches 1 and 3 will be traced using the same format as explained above. Batches are 0-indexed. And now full batches 1 and 3 will be traced using the same format as explained above. Batches are 0-indexed.
...@@ -121,7 +121,7 @@ class DebugUnderflowOverflow: ...@@ -121,7 +121,7 @@ class DebugUnderflowOverflow:
You can also specify the batch number after which to stop the training, with: You can also specify the batch number after which to stop the training, with:
```python ```python
debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1,3], abort_after_batch_num=3) debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3], abort_after_batch_num=3)
``` ```
This feature is mainly useful in the tracing mode, but you can use it for any mode. This feature is mainly useful in the tracing mode, but you can use it for any mode.
......
...@@ -273,15 +273,22 @@ class FeatureExtractionMixin: ...@@ -273,15 +273,22 @@ class FeatureExtractionMixin:
```python ```python
# We can't directly instantiate the base class *FeatureExtractionMixin* nor *SequenceFeatureExtractor* so let's show the examples on a # We can't directly instantiate the base class *FeatureExtractionMixin* nor *SequenceFeatureExtractor* so let's show the examples on a
# derived class: *Wav2Vec2FeatureExtractor* # derived class: *Wav2Vec2FeatureExtractor*
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h') # Download feature_extraction_config from huggingface.co and cache. feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('./test/saved_model/') # E.g. feature_extractor (or model) was saved using *save_pretrained('./test/saved_model/')* "facebook/wav2vec2-base-960h"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('./test/saved_model/preprocessor_config.json') ) # Download feature_extraction_config from huggingface.co and cache.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h', return_attention_mask=False, foo=False) feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
"./test/saved_model/"
) # E.g. feature_extractor (or model) was saved using *save_pretrained('./test/saved_model/')*
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("./test/saved_model/preprocessor_config.json")
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
"facebook/wav2vec2-base-960h", return_attention_mask=False, foo=False
)
assert feature_extractor.return_attention_mask is False assert feature_extractor.return_attention_mask is False
feature_extractor, unused_kwargs = Wav2Vec2FeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h', return_attention_mask=False, feature_extractor, unused_kwargs = Wav2Vec2FeatureExtractor.from_pretrained(
foo=False, return_unused_kwargs=True) "facebook/wav2vec2-base-960h", return_attention_mask=False, foo=False, return_unused_kwargs=True
)
assert feature_extractor.return_attention_mask is False assert feature_extractor.return_attention_mask is False
assert unused_kwargs == {'foo': False} assert unused_kwargs == {"foo": False}
```""" ```"""
feature_extractor_dict, kwargs = cls.get_feature_extractor_dict(pretrained_model_name_or_path, **kwargs) feature_extractor_dict, kwargs = cls.get_feature_extractor_dict(pretrained_model_name_or_path, **kwargs)
......
...@@ -956,11 +956,11 @@ PT_TOKEN_CLASSIFICATION_SAMPLE = r""" ...@@ -956,11 +956,11 @@ PT_TOKEN_CLASSIFICATION_SAMPLE = r"""
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> import torch >>> import torch
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> labels = torch.tensor([1] * inputs["input_ids"].size(1)).unsqueeze(0) # Batch size 1 >>> labels = torch.tensor([1] * inputs["input_ids"].size(1)).unsqueeze(0) # Batch size 1
>>> outputs = model(**inputs, labels=labels) >>> outputs = model(**inputs, labels=labels)
>>> loss = outputs.loss >>> loss = outputs.loss
...@@ -975,11 +975,11 @@ PT_QUESTION_ANSWERING_SAMPLE = r""" ...@@ -975,11 +975,11 @@ PT_QUESTION_ANSWERING_SAMPLE = r"""
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> import torch >>> import torch
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
>>> inputs = tokenizer(question, text, return_tensors='pt') >>> inputs = tokenizer(question, text, return_tensors="pt")
>>> start_positions = torch.tensor([1]) >>> start_positions = torch.tensor([1])
>>> end_positions = torch.tensor([3]) >>> end_positions = torch.tensor([3])
...@@ -997,11 +997,11 @@ PT_SEQUENCE_CLASSIFICATION_SAMPLE = r""" ...@@ -997,11 +997,11 @@ PT_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> import torch >>> import torch
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 >>> labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
>>> outputs = model(**inputs, labels=labels) >>> outputs = model(**inputs, labels=labels)
>>> loss = outputs.loss >>> loss = outputs.loss
>>> logits = outputs.logits >>> logits = outputs.logits
...@@ -1013,11 +1013,11 @@ PT_SEQUENCE_CLASSIFICATION_SAMPLE = r""" ...@@ -1013,11 +1013,11 @@ PT_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> import torch >>> import torch
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}', problem_type="multi_label_classification") >>> model = {model_class}.from_pretrained("{checkpoint}", problem_type="multi_label_classification")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> labels = torch.tensor([[1, 1]], dtype=torch.float) # need dtype=float for BCEWithLogitsLoss >>> labels = torch.tensor([[1, 1]], dtype=torch.float) # need dtype=float for BCEWithLogitsLoss
>>> outputs = model(**inputs, labels=labels) >>> outputs = model(**inputs, labels=labels)
>>> loss = outputs.loss >>> loss = outputs.loss
>>> logits = outputs.logits >>> logits = outputs.logits
...@@ -1032,8 +1032,8 @@ PT_MASKED_LM_SAMPLE = r""" ...@@ -1032,8 +1032,8 @@ PT_MASKED_LM_SAMPLE = r"""
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> import torch >>> import torch
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="pt") >>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="pt")
>>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"] >>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"]
...@@ -1051,8 +1051,8 @@ PT_BASE_MODEL_SAMPLE = r""" ...@@ -1051,8 +1051,8 @@ PT_BASE_MODEL_SAMPLE = r"""
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> import torch >>> import torch
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs) >>> outputs = model(**inputs)
...@@ -1068,16 +1068,16 @@ PT_MULTIPLE_CHOICE_SAMPLE = r""" ...@@ -1068,16 +1068,16 @@ PT_MULTIPLE_CHOICE_SAMPLE = r"""
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> import torch >>> import torch
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> choice0 = "It is eaten with a fork and a knife." >>> choice0 = "It is eaten with a fork and a knife."
>>> choice1 = "It is eaten while held in the hand." >>> choice1 = "It is eaten while held in the hand."
>>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1 >>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
>>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors='pt', padding=True) >>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors="pt", padding=True)
>>> outputs = model(**{{k: v.unsqueeze(0) for k,v in encoding.items()}}, labels=labels) # batch size is 1 >>> outputs = model(**{{k: v.unsqueeze(0) for k, v in encoding.items()}}, labels=labels) # batch size is 1
>>> # the linear classifier still needs to be trained >>> # the linear classifier still needs to be trained
>>> loss = outputs.loss >>> loss = outputs.loss
...@@ -1092,8 +1092,8 @@ PT_CAUSAL_LM_SAMPLE = r""" ...@@ -1092,8 +1092,8 @@ PT_CAUSAL_LM_SAMPLE = r"""
>>> import torch >>> import torch
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs, labels=inputs["input_ids"]) >>> outputs = model(**inputs, labels=inputs["input_ids"])
...@@ -1112,8 +1112,8 @@ PT_SPEECH_BASE_MODEL_SAMPLE = r""" ...@@ -1112,8 +1112,8 @@ PT_SPEECH_BASE_MODEL_SAMPLE = r"""
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> sampling_rate = dataset.features["audio"].sampling_rate >>> sampling_rate = dataset.features["audio"].sampling_rate
>>> processor = {processor_class}.from_pretrained('{checkpoint}') >>> processor = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> # audio file is decoded on the fly >>> # audio file is decoded on the fly
>>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt") >>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
...@@ -1134,8 +1134,8 @@ PT_SPEECH_CTC_SAMPLE = r""" ...@@ -1134,8 +1134,8 @@ PT_SPEECH_CTC_SAMPLE = r"""
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> sampling_rate = dataset.features["audio"].sampling_rate >>> sampling_rate = dataset.features["audio"].sampling_rate
>>> processor = {processor_class}.from_pretrained('{checkpoint}') >>> processor = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> # audio file is decoded on the fly >>> # audio file is decoded on the fly
>>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt") >>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
...@@ -1164,8 +1164,8 @@ PT_SPEECH_SEQ_CLASS_SAMPLE = r""" ...@@ -1164,8 +1164,8 @@ PT_SPEECH_SEQ_CLASS_SAMPLE = r"""
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> sampling_rate = dataset.features["audio"].sampling_rate >>> sampling_rate = dataset.features["audio"].sampling_rate
>>> feature_extractor = {processor_class}.from_pretrained('{checkpoint}') >>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> # audio file is decoded on the fly >>> # audio file is decoded on the fly
>>> inputs = feature_extractor(dataset[0]["audio"]["array"], return_tensors="pt") >>> inputs = feature_extractor(dataset[0]["audio"]["array"], return_tensors="pt")
...@@ -1192,8 +1192,8 @@ PT_SPEECH_FRAME_CLASS_SAMPLE = r""" ...@@ -1192,8 +1192,8 @@ PT_SPEECH_FRAME_CLASS_SAMPLE = r"""
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> sampling_rate = dataset.features["audio"].sampling_rate >>> sampling_rate = dataset.features["audio"].sampling_rate
>>> feature_extractor = {processor_class}.from_pretrained('{checkpoint}') >>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> # audio file is decoded on the fly >>> # audio file is decoded on the fly
>>> inputs = feature_extractor(dataset[0]["audio"]["array"], return_tensors="pt") >>> inputs = feature_extractor(dataset[0]["audio"]["array"], return_tensors="pt")
...@@ -1216,8 +1216,8 @@ PT_SPEECH_XVECTOR_SAMPLE = r""" ...@@ -1216,8 +1216,8 @@ PT_SPEECH_XVECTOR_SAMPLE = r"""
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> sampling_rate = dataset.features["audio"].sampling_rate >>> sampling_rate = dataset.features["audio"].sampling_rate
>>> feature_extractor = {processor_class}.from_pretrained('{checkpoint}') >>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> # audio file is decoded on the fly >>> # audio file is decoded on the fly
>>> inputs = feature_extractor(dataset[:2]["audio"]["array"], return_tensors="pt") >>> inputs = feature_extractor(dataset[:2]["audio"]["array"], return_tensors="pt")
...@@ -1227,7 +1227,7 @@ PT_SPEECH_XVECTOR_SAMPLE = r""" ...@@ -1227,7 +1227,7 @@ PT_SPEECH_XVECTOR_SAMPLE = r"""
>>> # the resulting embeddings can be used for cosine similarity-based retrieval >>> # the resulting embeddings can be used for cosine similarity-based retrieval
>>> cosine_sim = torch.nn.CosineSimilarity(dim=-1) >>> cosine_sim = torch.nn.CosineSimilarity(dim=-1)
>>> similarity = cosine_sim(embeddings[0], embeddings[1]) >>> similarity = cosine_sim(embeddings[0], embeddings[1])
>>> threshold = 0.7 # the optimal threshold is dataset-dependent >>> threshold = 0.7 # the optimal threshold is dataset-dependent
>>> if similarity < threshold: >>> if similarity < threshold:
... print("Speakers are not the same!") ... print("Speakers are not the same!")
``` ```
...@@ -1256,12 +1256,14 @@ TF_TOKEN_CLASSIFICATION_SAMPLE = r""" ...@@ -1256,12 +1256,14 @@ TF_TOKEN_CLASSIFICATION_SAMPLE = r"""
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> import tensorflow as tf >>> import tensorflow as tf
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf") >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
>>> input_ids = inputs["input_ids"] >>> input_ids = inputs["input_ids"]
>>> inputs["labels"] = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1 >>> inputs["labels"] = tf.reshape(
... tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))
... ) # Batch size 1
>>> outputs = model(inputs) >>> outputs = model(inputs)
>>> loss = outputs.loss >>> loss = outputs.loss
...@@ -1276,17 +1278,17 @@ TF_QUESTION_ANSWERING_SAMPLE = r""" ...@@ -1276,17 +1278,17 @@ TF_QUESTION_ANSWERING_SAMPLE = r"""
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> import tensorflow as tf >>> import tensorflow as tf
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
>>> input_dict = tokenizer(question, text, return_tensors='tf') >>> input_dict = tokenizer(question, text, return_tensors="tf")
>>> outputs = model(input_dict) >>> outputs = model(input_dict)
>>> start_logits = outputs.start_logits >>> start_logits = outputs.start_logits
>>> end_logits = outputs.end_logits >>> end_logits = outputs.end_logits
>>> all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0]) >>> all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
>>> answer = ' '.join(all_tokens[tf.math.argmax(start_logits, 1)[0] : tf.math.argmax(end_logits, 1)[0]+1]) >>> answer = " ".join(all_tokens[tf.math.argmax(start_logits, 1)[0] : tf.math.argmax(end_logits, 1)[0] + 1])
``` ```
""" """
...@@ -1297,11 +1299,11 @@ TF_SEQUENCE_CLASSIFICATION_SAMPLE = r""" ...@@ -1297,11 +1299,11 @@ TF_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> import tensorflow as tf >>> import tensorflow as tf
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf") >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
>>> inputs["labels"] = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1 >>> inputs["labels"] = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
>>> outputs = model(inputs) >>> outputs = model(inputs)
>>> loss = outputs.loss >>> loss = outputs.loss
...@@ -1316,8 +1318,8 @@ TF_MASKED_LM_SAMPLE = r""" ...@@ -1316,8 +1318,8 @@ TF_MASKED_LM_SAMPLE = r"""
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> import tensorflow as tf >>> import tensorflow as tf
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="tf") >>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="tf")
>>> inputs["labels"] = tokenizer("The capital of France is Paris.", return_tensors="tf")["input_ids"] >>> inputs["labels"] = tokenizer("The capital of France is Paris.", return_tensors="tf")["input_ids"]
...@@ -1335,8 +1337,8 @@ TF_BASE_MODEL_SAMPLE = r""" ...@@ -1335,8 +1337,8 @@ TF_BASE_MODEL_SAMPLE = r"""
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> import tensorflow as tf >>> import tensorflow as tf
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf") >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
>>> outputs = model(inputs) >>> outputs = model(inputs)
...@@ -1352,16 +1354,16 @@ TF_MULTIPLE_CHOICE_SAMPLE = r""" ...@@ -1352,16 +1354,16 @@ TF_MULTIPLE_CHOICE_SAMPLE = r"""
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> import tensorflow as tf >>> import tensorflow as tf
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> choice0 = "It is eaten with a fork and a knife." >>> choice0 = "It is eaten with a fork and a knife."
>>> choice1 = "It is eaten while held in the hand." >>> choice1 = "It is eaten while held in the hand."
>>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors='tf', padding=True) >>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors="tf", padding=True)
>>> inputs = {{k: tf.expand_dims(v, 0) for k, v in encoding.items()}} >>> inputs = {{k: tf.expand_dims(v, 0) for k, v in encoding.items()}}
>>> outputs = model(inputs) # batch size is 1 >>> outputs = model(inputs) # batch size is 1
>>> # the linear classifier still needs to be trained >>> # the linear classifier still needs to be trained
>>> logits = outputs.logits >>> logits = outputs.logits
...@@ -1375,8 +1377,8 @@ TF_CAUSAL_LM_SAMPLE = r""" ...@@ -1375,8 +1377,8 @@ TF_CAUSAL_LM_SAMPLE = r"""
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> import tensorflow as tf >>> import tensorflow as tf
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf") >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
>>> outputs = model(inputs) >>> outputs = model(inputs)
...@@ -1401,10 +1403,10 @@ FLAX_TOKEN_CLASSIFICATION_SAMPLE = r""" ...@@ -1401,10 +1403,10 @@ FLAX_TOKEN_CLASSIFICATION_SAMPLE = r"""
```python ```python
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors='jax') >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="jax")
>>> outputs = model(**inputs) >>> outputs = model(**inputs)
>>> logits = outputs.logits >>> logits = outputs.logits
...@@ -1417,11 +1419,11 @@ FLAX_QUESTION_ANSWERING_SAMPLE = r""" ...@@ -1417,11 +1419,11 @@ FLAX_QUESTION_ANSWERING_SAMPLE = r"""
```python ```python
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
>>> inputs = tokenizer(question, text, return_tensors='jax') >>> inputs = tokenizer(question, text, return_tensors="jax")
>>> outputs = model(**inputs) >>> outputs = model(**inputs)
>>> start_scores = outputs.start_logits >>> start_scores = outputs.start_logits
...@@ -1435,10 +1437,10 @@ FLAX_SEQUENCE_CLASSIFICATION_SAMPLE = r""" ...@@ -1435,10 +1437,10 @@ FLAX_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
```python ```python
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors='jax') >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="jax")
>>> outputs = model(**inputs) >>> outputs = model(**inputs)
>>> logits = outputs.logits >>> logits = outputs.logits
...@@ -1451,10 +1453,10 @@ FLAX_MASKED_LM_SAMPLE = r""" ...@@ -1451,10 +1453,10 @@ FLAX_MASKED_LM_SAMPLE = r"""
```python ```python
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("The capital of France is {mask}.", return_tensors='jax') >>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="jax")
>>> outputs = model(**inputs) >>> outputs = model(**inputs)
>>> logits = outputs.logits >>> logits = outputs.logits
...@@ -1467,10 +1469,10 @@ FLAX_BASE_MODEL_SAMPLE = r""" ...@@ -1467,10 +1469,10 @@ FLAX_BASE_MODEL_SAMPLE = r"""
```python ```python
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors='jax') >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="jax")
>>> outputs = model(**inputs) >>> outputs = model(**inputs)
>>> last_hidden_states = outputs.last_hidden_state >>> last_hidden_states = outputs.last_hidden_state
...@@ -1483,15 +1485,15 @@ FLAX_MULTIPLE_CHOICE_SAMPLE = r""" ...@@ -1483,15 +1485,15 @@ FLAX_MULTIPLE_CHOICE_SAMPLE = r"""
```python ```python
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> choice0 = "It is eaten with a fork and a knife." >>> choice0 = "It is eaten with a fork and a knife."
>>> choice1 = "It is eaten while held in the hand." >>> choice1 = "It is eaten while held in the hand."
>>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors='jax', padding=True) >>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors="jax", padding=True)
>>> outputs = model(**{{k: v[None, :] for k,v in encoding.items()}}) >>> outputs = model(**{{k: v[None, :] for k, v in encoding.items()}})
>>> logits = outputs.logits >>> logits = outputs.logits
``` ```
...@@ -1503,8 +1505,8 @@ FLAX_CAUSAL_LM_SAMPLE = r""" ...@@ -1503,8 +1505,8 @@ FLAX_CAUSAL_LM_SAMPLE = r"""
```python ```python
>>> from transformers import {processor_class}, {model_class} >>> from transformers import {processor_class}, {model_class}
>>> tokenizer = {processor_class}.from_pretrained('{checkpoint}') >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained('{checkpoint}') >>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np") >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
>>> outputs = model(**inputs) >>> outputs = model(**inputs)
......